3
d0                 @   s   d Z ddlZddlZddlmZ dddZdd Zdd	 Zd
d Z	G dd dZ
dd Zdd ZefddZefddZi adddZdd ZdS )z0
Utility functions and classes for classifiers.
    N)LazyMapc                sF   |dkr|ot |d ttf}|r8 fdd}t||S t |S dS )a  
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``.  In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied.  If ``labeled=True``, then the list elements will be
        passed directly to ``feature_func()``.  If ``labeled=False``,
        then the list elements should be tuples ``(tok,label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``.  (Default:
        auto-detect based on types.)
    Nr   c                s    | d | d fS )Nr       )Zlabeled_token)feature_funcr   2/tmp/pip-build-v9q4h5k9/nltk/nltk/classify/util.py	lazy_func@   s    z!apply_features.<locals>.lazy_func)
isinstancetuplelistr   )r   toksZlabeledr   r   )r   r   apply_features   s    !
r   c             C   s   t dd | D S )a!  
    :return: A list of all labels that are attested in the given list
        of tokens.
    :rtype: list of (immutable)
    :param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form ``(token, label)``.
    :type tokens: list
    c             S   s   h | ]\}}|qS r   r   ).0toklabelr   r   r   	<setcomp>Q   s    z"attested_labels.<locals>.<setcomp>)r	   )tokensr   r   r   attested_labelsH   s    	r   c             C   s>   | j dd |D }dd t||D }tjt|t| S )Nc             S   s   g | ]\}}|qS r   r   )r   fslr   r   r   
<listcomp>U   s    z"log_likelihood.<locals>.<listcomp>c             S   s   g | ]\\}}}|j |qS r   )prob)r   r   r   pdistr   r   r   r   V   s    )prob_classify_manyzipmathlogsumlen)
classifiergoldresultsllr   r   r   log_likelihoodT   s    r"   c             C   sD   | j dd |D }dd t||D }|r<t|t| S dS d S )Nc             S   s   g | ]\}}|qS r   r   )r   r   r   r   r   r   r   [   s    zaccuracy.<locals>.<listcomp>c             S   s   g | ]\\}}}||kqS r   r   )r   r   r   rr   r   r   r   \   s    r   )Zclassify_manyr   r   r   )r   r   r    correctr   r   r   accuracyZ   s
    r%   c               @   s    e Zd ZdZdd Zdd ZdS )CutoffCheckerz
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    c             C   sR   |j  | _d|kr$t|d  |d< d|kr<t|d |d< d | _d | _d| _d S )Nmin_llmin_lldeltar   )copycutoffsabsr!   acciter)selfr*   r   r   r   __init__l   s    
zCutoffChecker.__init__c             C   s  | j }|  jd7  _d|kr.| j|d kr.dS tjjj||}tj|rLdS d|ks\d|krd|krt||d krtdS d|kr| jr|| j t	|d krdS || _d|ksd|krtjjj||}d|kr||d krdS d|ko| j
o|| j
 t	|d krdS || _
dS d S )	Nr   Zmax_iterTr'   r(   Zmax_accZmin_accdeltaF)r*   r-   nltkZclassifyutilr"   r   isnanr!   r+   r,   )r.   r   Z
train_toksr*   Znew_llZnew_accr   r   r   checkv   s2    
zCutoffChecker.checkN)__name__
__module____qualname____doc__r/   r3   r   r   r   r   r&   c   s   
r&   c             C   sh   i }d|d< | d j  |d< | d
 j  |d< x6dD ].}| j  j||d| < || j  k|d	| < q2W |S )NTalwaysonr   
startswithr   endswithabcdefghijklmnopqrstuvwxyzz	count(%s)zhas(%s))lowercount)namefeaturesletterr   r   r   names_demo_features   s    
rB   c             C   s   i }d|d< | d j  dk|d< | d j  dk|d< xfdD ]^}| j  j||d	| < || j  k|d
| < || d j  k|d| < || d j  k|d| < q:W |S )NTr8   r   Zaeiouyzstartswith(vowel)r   zendswith(vowel)r;   z	count(%s)zhas(%s)zstartswith(%s)zendswith(%s)r<   r<   )r=   r>   )r?   r@   rA   r   r   r   binary_names_demo_features   s    
rC   c                s  dd l }ddlm} dd |jdD dd |jdD  }|jd |j| |d d	 }|d	d
 }td |  fdd|D }td t| fdd|D }td|  y fdd|D }	|j|	}
dd t	||
D }tdt
|t|   t  tddd   xZtt	||
d d D ]@\\}}}|dkrFd}nd}t|||jd|jdf  q*W W n tk
r   Y nX |S )Nr   )namesc             S   s   g | ]}|d fqS )maler   )r   r?   r   r   r   r      s    znames_demo.<locals>.<listcomp>zmale.txtc             S   s   g | ]}|d fqS )femaler   )r   r?   r   r   r   r      s    z
female.txti@ i  i|  zTraining classifier...c                s   g | ]\}} ||fqS r   r   )r   ng)r@   r   r   r      s    zTesting classifier...c                s   g | ]\}} ||fqS r   r   )r   rG   rH   )r@   r   r   r      s    zAccuracy: %6.4fc                s   g | ]\}} |qS r   r   )r   rG   rH   )r@   r   r   r      s    c             S   s   g | ]\\}}}|j |qS r   )logprob)r   r?   r   r   r   r   r   r      s    zAvg. log likelihood: %6.4fz%Unseen Names      P(Male)  P(Female)
-(      rE   z  %-15s *%6.4f   %6.4fz  %-15s  %6.4f  *%6.4frF   )randomnltk.corpusrD   wordsseedshuffleprintr%   r   r   r   r   r
   r   NotImplementedError)trainerr@   rM   rD   namelisttraintestr   r,   test_featuresetspdistsr!   r?   Zgenderr   fmtr   )r@   r   
names_demo   s8    


$
(r[   c                s  dd l }ddlm} |jd}|jd}|jd |j| |j| t |d d }t |dd |d d  }d	d
 |dd D dd
 |dd D  }|j| td | ||}	td t|	 fdd
|D }
td|
  y fdd
|D }|	j	|}dd
 t
||D }tdt|t|   t  tddd   xVt
||d d D ]@\\}}}|dkrzd}nd}t|||jd|jdf  q^W W n tk
r   Y nX |	S )Nr   )rD   zmale.txtz
female.txti	 i  i	  i  c             S   s   g | ]}|d fqS )Tr   )r   r?   r   r   r   r      s    z&partial_names_demo.<locals>.<listcomp>i
  c             S   s   g | ]}|d fqS )Fr   )r   r?   r   r   r   r      s    i  zTraining classifier...zTesting classifier...c                s   g | ]\}} ||fqS r   r   )r   rG   m)r@   r   r   r     s    zAccuracy: %6.4fc                s   g | ]\}} |qS r   r   )r   rG   r\   )r@   r   r   r     s    c             S   s   g | ]\\}}}|j |qS r   )rI   )r   r?   r   r   r   r   r   r     s    zAvg. log likelihood: %6.4fz%Unseen Names      P(Male)  P(Female)
rJ   rK   rL   Tz  %-15s *%6.4f   %6.4fz  %-15s  %6.4f  *%6.4fF)rM   rN   rD   rO   rP   rQ   maprR   r%   r   r   r   r   r   rS   )rT   r@   rM   rD   Z
male_namesZfemale_namesZpositiveZ	unlabeledrW   r   r,   rX   rY   r!   r?   Zis_maler   rZ   r   )r@   r   partial_names_demo   s@    







 
(r^     c                sz  dd l }ddlm} td |tkr<dd |j|D t|< t| d d  }|t|kr`t|}tdd |D }tdd	j|  td
 |j	d |j
| |d td|  }|td| | }	td |  fdd|D }
td t|
 fdd|	D }td|  yL fdd|	D }|
j|}dd t|	|D }tdt|t|	   W n tk
rt   Y nX |
S )Nr   )sensevalzReading data...c             S   s   g | ]}||j d  fqS )r   )senses)r   ir   r   r   r   *  s    zwsd_demo.<locals>.<listcomp>c             S   s   h | ]\}}|qS r   r   )r   rb   r   r   r   r   r   .  s    zwsd_demo.<locals>.<setcomp>z
  Senses:  zSplitting into test & train...i@ g?zTraining classifier...c                s   g | ]\}} ||fqS r   r   )r   rb   r   )r@   r   r   r   :  s    zTesting classifier...c                s   g | ]\}} ||fqS r   r   )r   rb   r   )r@   r   r   r   >  s    zAccuracy: %6.4fc                s   g | ]\}} |qS r   r   )r   rb   rG   )r@   r   r   r   D  s    c             S   s   g | ]\\}}}|j |qS r   )rI   )r   r?   r   r   r   r   r   r   F  s    zAvg. log likelihood: %6.4f)rM   rN   r`   rR   _inst_cache	instancesr   r
   joinrP   rQ   intr%   r   r   r   rS   )rT   wordr@   rG   rM   r`   re   ra   rV   rW   r   r,   rX   rY   r!   r   )r@   r   wsd_demo!  s8    


ri   c              C   sB   yt  W n4 tk
r< }  ztd}t|| W Y dd} ~ X nX dS )z8
    Checks whether the MEGAM binary is configured.
    z\Please configure your megam binary first, e.g.
>>> nltk.config_megam('/usr/bin/local/megam')N)Z
_megam_bin	NameErrorstr)eerr_msgr   r   r   check_megam_configO  s    rn   )N)r_   )r7   r   Znltk.classify.utilr0   Z	nltk.utilr   r   r   r"   r%   r&   rB   rC   r[   r^   rd   ri   rn   r   r   r   r   <module>   s   

-	</8
.