3
d;                 @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
mZmZ d dlmZmZ dd Zdd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd>d$d%Zd&d' Zd?d(d)Zed@dAgZedBdCdDdEdFdGdHdIdJg	Z d;d< Z!e"d=kre  dS )K    N)treebank)BrillTaggerTrainerRegexpTaggerUnigramTagger)PosWord)Template
error_listc               C   s
   t   dS )z
    Run a demo with defaults. See source comments for details,
    or docstrings of any of the more specific demo_* functions.
    N)postag r   r   -/tmp/pip-build-v9q4h5k9/nltk/nltk/tbl/demo.pydemo   s    r   c               C   s   t dd dS )zN
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    repr)
ruleformatN)r
   r   r   r   r   demo_repr_rule_format   s    r   c               C   s   t dd dS )zN
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    str)r   N)r
   r   r   r   r   demo_str_rule_format$   s    r   c               C   s   t dd dS )z*
    Exemplify Rule.format("verbose")
    verbose)r   N)r
   r   r   r   r   demo_verbose_rule_format+   s    r   c               C   s   t ttdddggd dS )	a  
    The feature/s of a template takes a list of positions
    relative to the current word where the feature should be
    looked for, conceptually joined by logical OR. For instance,
    Pos([-1, 1]), given a value V, will hold whenever V is found
    one step to the left and/or one step to the right.

    For contiguous ranges, a 2-arg form giving inclusive end
    points can also be used: Pos(-3, -1) is the same as the arg
    below.
             )	templatesN)r
   r   r   r   r   r   r   demo_multiposition_feature2   s    r   c               C   s$   t ttdgtddggd dS )z8
    Templates can have more than a single feature.
    r   r   r   )r   Nr   r   )r
   r   r   r   r   r   r   r   demo_multifeature_templateA   s    r   c               C   s   t ddd dS )ah  
    Show aggregate statistics per template. Little used templates are
    candidates for deletion, much used templates may possibly be refined.

    Deleting unused templates is mostly about saving time and/or space:
    training is basically O(T) in the number of templates T
    (also in terms of memory usage, which often will be the limiting factor).
    T)incremental_statstemplate_statsN)r
   r   r   r   r   demo_template_statisticsH   s    	r    c              C   sp   t jdddgddgdd} tjddddgddgdd}ttj| |gdd}td	jt| t|ddd
 dS )a	  
    Template.expand and Feature.expand are class methods facilitating
    generating large amounts of templates. See their documentation for
    details.

    Note: training with 500 templates can easily fill all available
    even on relatively small corpora
    r   r   r   F)ZexcludezeroTr   )combinationsz8Generated {} templates for transformation-based learning)r   r   r   Nr   r   r   )r   r   )	r   expandr   listr   printformatlenr
   )ZwordtplsZtagtplsr   r   r   r   demo_generated_templatesT   s    	r'   c               C   s   t dddd dS )z
    Plot a learning curve -- the contribution on tagging accuracy of
    the individual rules.
    Note: requires matplotlib
    Tzlearningcurve.png)r   separate_baseline_datalearning_curve_outputN)r
   r   r   r   r   demo_learning_curveh   s    r*   c               C   s   t dd dS )zW
    Writes a file with context for each erroneous word after tagging testing data
    z
errors.txt)error_outputN)r
   r   r   r   r   demo_error_analysisu   s    r,   c               C   s   t dd dS )zm
    Serializes the learned tagger to a file in pickle format; reloads it
    and validates the process.
    z
tagger.pcl)serialize_outputN)r
   r   r   r   r   demo_serialize_tagger|   s    r.   c               C   s   t dddd dS )z
    Discard rules with low accuracy. This may hurt performance a bit,
    but will often produce rules which are more interesting read to a human.
    i  gQ?
   )	num_sentsmin_acc	min_scoreN)r
   r   r   r   r   demo_high_accuracy_rules   s    r3     ,  r   皙?Fr   c       &   &   C   s  |pt }| dkr&ddlm}m} | } t|||||\}}}}|rtjj|st||d}t	|d}t
j|| W dQ R X tdj| t	|}t
j|}td|  W dQ R X nt||d}td |rtd	j|j| tj }t|| ||	d
}td |j||||}tdtj | dd |rFtd|j|  |dkrtd x8t|j dD ]&\}}t|dd|j|	d qhW |
rtd |j||\} }!td |std |j }"|r|j|! |r$t||!|"|d td|  n td |j|} |r$|j  |dk	rt	|d4}#|#jd|  |#jdjt|| jdd  W dQ R X td|  |dk	r|j|} t	|d}t
j|| W dQ R X td|  t	|}t
j|}$W dQ R X td|  |j|}%| |%krtd  ntd! dS )"a
  
    Brill Tagger Demonstration
    :param templates: how many sentences of training and testing data to use
    :type templates: list of Template

    :param tagged_data: maximum number of rule instances to create
    :type tagged_data: C{int}

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum score for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the file where rules will be saved
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This
    is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    Nr   )brill24describe_template_sets)backoffwz)Trained baseline tagger, pickled it to {}zReloaded pickled tagger from zTrained baseline taggerz!    Accuracy on test set: {:0.4f})r   zTraining tbl tagger...zTrained tbl tagger in z0.2fz secondsz    Accuracy on test set: %.4fr   z
Learned rules: Z4d szJIncrementally tagging the test data, collecting individual rule statisticsz    Rule statistics collectedzbWARNING: train_stats asked for separate_baseline_data=True; the baseline will be artificially high)takez Wrote plot of learning curve to zTagging the test datazErrors for Brill Tagger %r


zutf-8z)Wrote tagger errors including context to zWrote pickled tagger to z4Reloaded tagger tried on test set, results identicalz;PROBLEM: Reloaded tagger gave different results on test set)REGEXP_TAGGERnltk.tag.brillr7   r8   _demo_prepare_dataospathexistsr   openpickledumpr$   r%   loadZaccuracytimer   train	enumeraterulesZbatch_tag_incrementalZtrain_statsZprint_template_statistics
_demo_plotZ	tag_sentswritejoinr	   encode)&r   tagged_datar0   Z	max_rulesr2   r1   rJ   trace	randomizer   r   r   r+   r-   r)   Zlearning_curve_takeZbaseline_backoff_taggerr(   Zcache_baseline_taggerr7   r8   training_databaseline_data	gold_datatesting_dataZbaseline_taggerZprint_rulesZtbrillZtrainerZbrill_taggerZrulenoZruleZ
taggedtest	teststats
trainstatsfZbrill_tagger_reloadedZtaggedtest_reloadedr   r   r   r
      s    X



"


*





r
   c             C   s2  | d krt d tj } |d ks,t| |kr4t| }|rPtjt|  tj|  t|| }| d | }| || }dd |D }|s|}	n&t|d }
|d |
 ||
d   }	}t|\}}t|\}}t|	\}}t d|dd|dd t d	|dd|dd t d
j	|||rdnd ||	||fS )Nz%Loading tagged data from treebank... c             S   s   g | ]}d d |D qS )c             S   s   g | ]}|d  qS )r   r   ).0tr   r   r   
<listcomp>a  s    z1_demo_prepare_data.<locals>.<listcomp>.<listcomp>r   )r[   sentr   r   r   r]   a  s    z&_demo_prepare_data.<locals>.<listcomp>r   zRead testing data (dz sents/z wds)zRead training data (z-Read baseline data ({:d} sents/{:d} wds) {:s} z[reused the training set])
r$   r   Ztagged_sentsr&   randomseedshuffleintcorpus_sizer%   )rQ   rJ   r0   rS   r(   cutoffrT   rV   rW   rU   Z	bl_cutoffZ	trainseqsZtraintokensZtestseqsZ
testtokensZbltrainseqsZbltraintokensr   r   r   rA   Q  s8    

rA   c       	         s    d g}x" d D ]}|j |d	 |  qW  fdd|d | D }d g}x"d D ]}|j |d
 |  q\W fdd|d | D }dd lj}ttt|}|j|||| |jd d d dg |j|  d S )NZinitialerrorsZ
rulescoresr   c                s   g | ]}d | d   qS )r   
tokencountr   )r[   x)rX   r   r   r]   }  s    z_demo_plot.<locals>.<listcomp>c                s   g | ]}d | d   qS )r   rg   r   )r[   rh   )rY   r   r   r]     s    r   g      ?r   r   )	appendZmatplotlib.pyplotZpyplotr#   ranger&   ZplotZaxisZsavefig)	r)   rX   rY   r=   Z	testcurveZ	rulescoreZ
traincurveZpltrr   )rX   rY   r   rM   y  s    


rM   ^-?[0-9]+(\.[0-9]+)?$CD.*NN(The|the|A|a|An|an)$AT.*able$JJ.*ness$.*ly$RB.*s$NNS.*ing$VBG.*ed$VBDc             C   s   t | tdd | D fS )Nc             s   s   | ]}t |V  qd S )N)r&   )r[   rh   r   r   r   	<genexpr>  s    zcorpus_size.<locals>.<genexpr>)r&   sum)Zseqsr   r   r   re     s    re   __main__)NNr4   r5   r   Nr6   r   Fr   FFNNNr5   NFN)NN)rl   rm   )rn   ro   )rl   rm   )rp   rq   )rr   rs   )rt   ro   )ru   rv   )rw   rx   )ry   rz   )r{   r|   )rn   ro   )#rB   rF   ra   rI   Znltk.corpusr   Znltk.tagr   r   r   r@   r   r   Znltk.tblr   r	   r   r   r   r   r   r   r    r'   r*   r,   r.   r3   r
   rA   rM   ZNN_CD_TAGGERr?   re   __name__r   r   r   r   <module>
   sn   	                  
 3(

