3
d?S              	   @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlm	Z	 dd ZG dd	 d	Z
d
d Zd/ddZe jdZd1ddZdd Zd3ddZdd Ze jde jZe jdZdd  Zd!d"d#d$d%d&d'd(d)g	dfd*d+Zd,d- Zed.k re  dS )4    N)accuracy)map_tag)	str2tuple)Treec             C   sF   g }g }x2|D ]*}| j |j }|t|7 }|t|7 }qW t||S )a|  
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    )parseflattentree2conlltags	_accuracy)ZchunkerZgoldZ	gold_tagsZ	test_tagsZ	gold_treeZ	test_tree r
   //tmp/pip-build-v9q4h5k9/nltk/nltk/chunk/util.pyr      s    
r   c               @   s   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	d ddZ
dd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )!
ChunkScorea;  
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, misssed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure)

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    c             K   s   t  | _t  | _t  | _t  | _t  | _|jdd| _|jdd| _|jdd| _	|jdd| _
d| _d| _d| _d| _d| _d| _d	| _d S )
NZmax_tp_examplesd   Zmax_fp_examplesZmax_fn_exampleschunk_labelz.*r   g        F)set_correct_guessed_tp_fp_fngetZ_max_tpZ_max_fpZ_max_fn_chunk_label_tp_num_fp_num_fn_num_count_tags_correct_tags_total_measuresNeedUpdate)selfkwargsr
   r
   r   __init__r   s     zChunkScore.__init__c             C   s^   | j rZ| j| j@ | _| j| j | _| j| j | _t| j| _t| j| _t| j| _	d| _ d S )NF)
r   r   r   r   r   r   lenr   r   r   )r   r
   r
   r   _updateMeasures   s    zChunkScore._updateMeasuresc             C   s   |  j t|| j| jO  _ |  jt|| j| jO  _|  jd7  _d| _yt|}t|}W n tk
rx   f  }}Y nX |  jt	|7  _|  j
tdd t||D 7  _
dS )aU  
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
           Tc             s   s   | ]\}}||krd V  qdS )r#   Nr
   ).0tgr
   r
   r   	<genexpr>   s    z#ChunkScore.score.<locals>.<genexpr>N)r   
_chunksetsr   r   r   r   r   
ValueErrorr   r!   r   sumzip)r   correctguessedZcorrect_tagsZguessed_tagsr
   r
   r   score   s    zChunkScore.scorec             C   s   | j dkrdS | j| j  S )z
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        r   r#   )r   r   )r   r
   r
   r   r      s    
zChunkScore.accuracyc             C   s.   | j   | j| j }|dkr dS | j| S dS )z
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        r   N)r"   r   r   )r   divr
   r
   r   	precision   s
    zChunkScore.precisionc             C   s.   | j   | j| j }|dkr dS | j| S dS )z
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        r   N)r"   r   r   )r   r/   r
   r
   r   recall   s
    zChunkScore.recall      ?c             C   sD   | j   | j }| j }|dks(|dkr,dS d|| d| |   S )a  
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        r   r#   )r"   r0   r1   )r   alphaprr
   r
   r   	f_measure   s    zChunkScore.f_measurec             C   s    | j   t| j}dd |D S )z
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        c             S   s   g | ]}|d  qS )r#   r
   )r$   cr
   r
   r   
<listcomp>   s    z%ChunkScore.missed.<locals>.<listcomp>)r"   listr   )r   chunksr
   r
   r   missed   s    
zChunkScore.missedc             C   s    | j   t| j}dd |D S )z
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        c             S   s   g | ]}|d  qS )r#   r
   )r$   r7   r
   r
   r   r8      s    z(ChunkScore.incorrect.<locals>.<listcomp>)r"   r9   r   )r   r:   r
   r
   r   	incorrect   s    
zChunkScore.incorrectc             C   s   t | j}dd |D S )z
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        c             S   s   g | ]}|d  qS )r#   r
   )r$   r7   r
   r
   r   r8     s    z&ChunkScore.correct.<locals>.<listcomp>)r9   r   )r   r:   r
   r
   r   r,      s    
zChunkScore.correctc             C   s   t | j}dd |D S )z
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        c             S   s   g | ]}|d  qS )r#   r
   )r$   r7   r
   r
   r   r8     s    z&ChunkScore.guessed.<locals>.<listcomp>)r9   r   )r   r:   r
   r
   r   r-     s    
zChunkScore.guessedc             C   s   | j   | j| j S )N)r"   r   r   )r   r
   r
   r   __len__  s    zChunkScore.__len__c             C   s   dt t|  d S )z`
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        z<ChunkScoring of z chunks>)reprr!   )r   r
   r
   r   __repr__  s    zChunkScore.__repr__c             C   s\   dd| j  d dd d| j d dd d| j d dd d| j d dd	 S )
a-  
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        zChunkParse score:
z    IOB Accuracy: r   z5.1fz%%
z    Precision:    z    Recall:       z    F-Measure:    z%%)r   r0   r1   r6   )r   r
   r
   r   __str__  s    zChunkScore.__str__N)r2   )__name__
__module____qualname____doc__r    r"   r.   r   r0   r1   r6   r;   r<   r,   r-   r=   r?   r@   r
   r
   r
   r   r   3   s   =



r   c             C   sh   d}g }xV| D ]N}t |trTtj||j rB|j||f|j f |t|j 7 }q|d7 }qW t	|S )Nr   r#   )

isinstancer   rematchlabelappendfreezer!   Zleavesr   )r%   countr   posr:   childr
   r
   r   r(   2  s    

r(   NPS/c             C   s.  t jd}t|g g}x|j| D ]}|j }	|	d dkr~t|dkrZtd|j dt|g }
|d j|
 |j|
 q"|	d dkrt|dkrtd	|j d|j	  q"|d
kr|d j|	 q"t
|	|\}}|r|rt|||}|d j||f q"W t|dkr&tdt| d|d S )aB  
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    z\[|\]|[^\[\]\s]+r   [r#   zUnexpected [ at char d]   zUnexpected ] at char NzExpected ] at char rU   rU   )rF   compiler   finditergroupr!   r)   startrI   popr   r   )sr   
root_labelsepZsource_tagsetZtarget_tagsetZWORD_OR_BRACKETstackrG   textchunkwordtagr
   r
   r   tagstr2tree?  s.    


rc   z(\S+)\s+(\S+)\s+([IOB])-?(\S+)?PPVPc             C   s   t |g g}xt| jdD ]\}}|j s.qtj|}|dkrPtd|d|j \}}}	}
|dk	rt|
|krtd}	|	dko|
|d j k}|	dks|rt	|d	kr|j
  |	d
ks|rt |
g }|d j| |j| |d j||f qW |d S )a*  
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    
NzError on line rR   OIr#   ZBOrT   Br   rU   rU   rU   )r   	enumeratesplitstrip_LINE_RErG   r)   groupsrH   r!   rZ   rI   )r[   chunk_typesr\   r^   linenolinerG   ra   rb   stateZ
chunk_typeZ
mismatch_Ir`   r
   r
   r   conllstr2treeu  s(    


rs   c             C   s   g }x| D ]}yP|j  }d}x>|D ]6}t|tr8td|j|d |d || f d}q"W W q
 tk
r   |j|d |d df Y q
X q
W |S )z
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    zB-z7Tree is too deeply nested to be printed in CoNLL formatr   r#   zI-rg   )rH   rE   r   r)   rI   AttributeError)r%   tagsrM   categoryprefixcontentsr
   r
   r   r     s    



"r   Fc             C   s  t |g }x| D ]\}}}|dkrD|r4tdn|j||f q|jdrn|jt |dd ||fg q|jdrt|dkst|d
 t  s|d j |dd kr|rtdq|jt |dd ||fg n|d j||f q|dkr|j||f qtd	|qW |S )z1
    Convert the CoNLL IOB format to a tree.
    NzBad conll tag sequencezB-rT   zI-r   r#   rg   zBad conll tag rU   rU   rU   )r   r)   rI   
startswithr!   rE   rH   )Zsentencero   r\   stricttreera   ZpostagZchunktagr
   r
   r   conlltags2tree  s(    


 

 
r|   c             C   s   dd t | D }dj|S )z
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    c             S   s   g | ]}d j |qS ) )join)r$   tokenr
   r
   r   r8     s    z!tree2conllstr.<locals>.<listcomp>rf   )r   r~   )r%   linesr
   r
   r   tree2conllstr  s    	r   a   <DOC>\s*(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?<BODY>\s*(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?<TEXT>(?P<text>.*?)</TEXT>\s*</BODY>\s*</DOC>\s*z#<b_\w+\s+[^>]*?type="(?P<type>\w+)"c             C   s
  t |g g}| d krg S xtjd| D ]}|j }yv|jdrtj|}|d krZtd| t |jdg }|d j| |j| n"|jdr|j	  n|d j| W q& t
tfk
r } ztd|j dd	|W Y d d }~X q&X q&W t|dkrtd
|d S )Nz<[^>]+>|[^\s<]+z<b_ZXXXXtyper#   z<e_z$Bad IEER string (error at character rR   )zBad IEER stringr   rU   rU   )r   rF   rW   rX   ry   _IEER_TYPE_RErG   printrI   rZ   
IndexErrorr)   rY   r!   )r[   r\   r^   Zpiece_mZpiecemr`   er
   r
   r   _ieer_read_text  s.    




r   ZLOCATIONZORGANIZATIONZPERSONZDURATIONZDATEZCARDINALPERCENTZMONEYZMEASUREc             C   sV   t j| }|rHt|jd||jd|jd|jdt|jd|dS t| |S dS )ap  
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
    r_   docnodoctype	date_timeheadline)r_   r   r   r   r   N)_IEER_DOC_RErG   r   rX   )r[   ro   r\   r   r
   r
   r   ieerstr2tree'  s    
r   c              C   sd   d} dd l }|jj| dd}|j  t  d} t| d	d}|j  td t|jj| t  d S )
Nzd[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./.r   rN   )r   av  
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
rd   )ro   zCoNLL output:)rN   rd   )nltkr`   rc   pprintr   rs   r   )r[   r   r%   Z
conll_treer
   r
   r   demoR  s    r   __main__)rN   rO   rP   NNrN   rd   re   )r   rO   rN   rd   re   )r   rO   F)rF   Znltk.metricsr   r	   Znltk.tag.mappingr   Znltk.tag.utilr   Z	nltk.treer   r   r(   rc   rV   rm   rs   r   r|   r   DOTALLr   r   r   r   r   rA   r
   r
   r
   r   <module>	   sB     
2

5
#
#0
