3
d+                 @   s   d Z ddlZddlZddlZddlmZ ddlmZm	Z	 yddl
mZ W n ek
r\   Y nX ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ G dd deZG dd deZdd Zdd Zdd Zd"ddZdd Zdd Zd#ddZe dkrdd l!mZ ed ed! dS )$z
Named entity chunker
    N)ElementTree)ClassifierBasedTaggerpos_tag)MaxentClassifier)ChunkParserI)
ChunkScore)find)word_tokenize)Treec               @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )NEChunkParserTaggerz2
    The IOB tagger used by the chunk parser.
    c             C   s   t j| || jd d S )N)trainZclassifier_builder)r   __init___classifier_builder)selfr    r   7/tmp/pip-build-v9q4h5k9/nltk/nltk/chunk/named_entity.pyr   $   s    zNEChunkParserTagger.__init__c             C   s   t j|ddddS )NZmegam      )	algorithmZgaussian_prior_sigmatrace)r   r   )r   r   r   r   r   r   )   s    z'NEChunkParserTagger._classifier_builderc             C   sF   y
| j }W n6 tk
r@   ddlm} t|jd| _ | j }Y nX |S )Nr   )wordszen-basic)Z_en_wordlistAttributeErrorZnltk.corpusr   set)r   Zwlr   r   r   r   _english_wordlist.   s    
z%NEChunkParserTagger._english_wordlistc             C   s0  || d }t || d }|dkrBd  }}d  }}	d  }
 }}n|dkr||d  d j }d }t ||d  d }d }	||d  d }d  }
}np||d  d j }||d  d j }t ||d  d }t ||d  d }	||d  }||d  }t|}
|t|d kr(d  }}d  }}n|t|d krl||d  d j }||d  d j }d }d }nP||d  d j }||d  d j }||d  d j }||d  d j }dt|t||d d j |dd  j |||| j k||||||j  d| | d| |
 d| d}|S )	Nr   r   r   T   +)ZbiasshapeZwordlenZprefix3Zsuffix3poswordzen-wordlistprevtagprevposnextposprevwordnextwordzword+nextposzpos+prevtagzshape+prevtag)simplify_poslowerr   lenr   )r   tokensindexhistoryr   r   r"   Zprevprevwordr    ZprevprevposZ	prevshaper   Zprevprevtagr#   Znextnextwordr!   Znextnextposfeaturesr   r   r   _feature_detector8   sb    


z%NEChunkParserTagger._feature_detectorN)__name__
__module____qualname____doc__r   r   r   r,   r   r   r   r   r      s
   
r   c               @   s<   e Zd ZdZdd Zdd Zdd Zdd	 Zed
d Z	dS )NEChunkParserz2
    Expected input: list of pos-tagged words
    c             C   s   | j | d S )N)_train)r   r   r   r   r   r   x   s    zNEChunkParser.__init__c             C   s   | j j|}| j|}|S )z8
        Each token should be a pos-tagged word
        )_taggertag_tagged_to_parse)r   r(   Ztaggedtreer   r   r   parse{   s    
zNEChunkParser.parsec                s"    fdd|D }t |d _d S )Nc                s   g | ]} j |qS r   )_parse_to_tagged).0s)r   r   r   
<listcomp>   s    z(NEChunkParser._train.<locals>.<listcomp>)r   )r   r3   )r   Zcorpusr   )r   r   r2      s    zNEChunkParser._trainc             C   s   t dg }x|D ]\}}|dkr,|j| q|jdrR|jt |dd |g q|jdr|rt|d t r|d	 j |dd kr|d
 j| q|jt |dd |g qW |S )zH
        Convert a list of tagged tokens to a chunk-parse tree.
        SOzB-r   NzI-r   r>   r>   )r
   append
startswith
isinstancelabel)r   Ztagged_tokenssenttokr4   r   r   r   r5      s    


*zNEChunkParser._tagged_to_parsec             C   s   g }x| D ]~}t |trzt|dkr.td q
|j|d d|j  f x<|dd D ]}|j|d|j  f qXW q
|j|df q
W |S )zH
        Convert a chunk-parse tree to a list of tagged tokens.
        r   z"Warning -- empty chunk in sentencezB-r   NzI-r=   )rA   r
   r'   printr?   rB   )rC   tokschildrD   r   r   r   r8      s    

zNEChunkParser._parse_to_taggedN)
r-   r.   r/   r0   r   r7   r2   r5   staticmethodr8   r   r   r   r   r1   s   s   r1   c             C   s^   t jd| t jrdS t jd| t jr(dS t jd| t jrV| j rDdS | j rPdS dS nd	S d S )
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$numberz\W+$punctz\w+$ZupcaseZdowncaseZ	mixedcaseother)rematchUNICODEistitleislower)r   r   r   r   r      s    r   c             C   s    | j drdS | jdd S d S )NV-r   )r@   split)r:   r   r   r   r%      s    
r%   c             C   s   | j  }dd t|D }tdg }xb| D ]Z}t|trr|jt|j g  x6|D ]}|d j|t|f qRW q*|j|t|f q*W |S )Nc             s   s   | ]\}}|V  qd S )Nr   )r9   r   r   r   r   r   	<genexpr>   s    zpostag_tree.<locals>.<genexpr>r<   r   r>   )leavesr   r
   rA   r?   rB   next)r6   r   Ztag_iterZnewtreerG   Zsubchildr   r   r   postag_tree   s    



rW   binaryTc             c   sn   xh| D ]`}xZt j|D ]L\}}}|jdr0|r0qx0|D ](}|jdr6tt jj|||E d H  q6W qW qW d S )NZbnewsz.sgm)oswalkendswithload_ace_filepathjoin)rootsfmtZ
skip_bnewsrootdirsfilesfr   r   r   load_ace_data   s    


re   c             c   s  t dtjj| d   | d }g }t|}tj|j }W d Q R X xv|jdD ]h}|j	dj
}xV|jdD ]H}|jddkrqpt|j	d	j
}	t|j	d
j
d }
|j|	|
|f qpW qTW t| }|j }W d Q R X tjdd|}dd }tjd||}tjdd|}tjdd|}tjdd|}dd |D }|dkrd}tdg }xjt|D ]^\}	}
}|	|k rr|}	|
|	krqZ|jt|||	  |jtd||	|
 j  |
}qZW |jt||d   |V  n|dkr|d}tdg }xjt|D ]^\}	}
}|	|k r|}	|
|	kr"q|jt|||	  |jt|||	|
 j  |
}qW |jt||d   |V  ntdd S )Nz  - r   z.tmx.rdc.xmlzdocument/entityZentity_typeZentity_mentionZTYPENAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+> c             S   s   d| j  | j  d  S )N    )endstart)mr   r   r   subfunc   s    zload_ace_file.<locals>.subfuncz[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" c             S   s   h | ]\}}}|qS r   r   )r9   r:   etypr   r   r   	<setcomp>  s    z load_ace_file.<locals>.<setcomp>rX   r   r<   ZNE
multiclasszbad fmt value)rE   rY   r]   rS   openETr7   getrootfindallr   textgetintr?   readrL   subr
   sortedextendr	   
ValueError)Ztextfiler`   Zannfileentitiesinfilexmlentityro   Zmentionr:   rn   rv   rm   Zentity_typesirF   r   r   r   r\      sb    











r\   c             C   s   t j| } t j|}d}xt| |D ]\\}}\}}||  koFdkn  r|std|dd|dd|  tdjddd d}q$d}td|dd|dd|  q$W d S )	NFr=   z  Z15rh   z  {:15} {:15} {2}z...T)r1   r8   ziprE   format)correctZguessedZellipsiswctgtr   r   r   
cmp_chunks'  s    

r   c             C   s  t d tdtdtdtdg}t|| }dd |D }t d t|}~t d	 td
g}t|| }dd |D }t d t }x@t|D ]4\}	}
|j|
j }|j|
| |	dk rt	|
| qW t | d|  d}t d| d t
|d}tj||d W d Q R X |S )NzLoading training data...zcorpora/ace_data/ace.devzcorpora/ace_data/ace.heldoutzcorpora/ace_data/bbn.devzcorpora/ace_data/muc.devc             S   s   g | ]}t |qS r   )rW   )r9   tr   r   r   r;   ?  s    zbuild_model.<locals>.<listcomp>zTraining...zLoading eval data...zcorpora/ace_data/ace.evalc             S   s   g | ]}t |qS r   )rW   )r9   r   r   r   r   r;   G  s    zEvaluating...r   z/tmp/ne_chunker_z.picklezSaving chunker to z...wbr   r>   )rE   r   re   r1   r   	enumerater7   rU   Zscorer   rr   pickledump)r`   Ztrain_pathsZtrain_treesZ
train_datacpZ
eval_pathsZ
eval_treesZ	eval_dataZ
chunkscorer   r   guessZoutfilenameoutfiler   r   r   build_model6  s6    



r   __main__)r   rq   )rX   T)rX   )"r0   rY   r   rL   Z	xml.etreer   rs   Znltk.tagr   r   Znltk.classifyr   ImportErrorZnltk.chunk.apir   Znltk.chunk.utilr   Z	nltk.datar   Znltk.tokenizer	   Z	nltk.treer
   r   r1   r   r%   rW   re   r\   r   r   r-   Znltk.chunk.named_entityr   r   r   r   <module>
   s6   T;

I
%
