3
d@                 @   s   d Z ddlZddlmZmZ ddlmZ dd Zej	Z
dd ZdZydd	lmZ W n ek
rn   d
d ZY nX dZdZdZG dd dedZG dd deZG dd deZG dd deZG dd dZdS )z
Provides scoring functions for a number of association measures through a
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
    N)ABCMetaabstractmethod)reducec             C   s
   t j| S )N)_mathlog2)x r   8/tmp/pip-build-v9q4h5k9/nltk/nltk/metrics/association.py<lambda>   s    r
   c             C   s   t dd | S )Nc             S   s   | | S )Nr   )r   yr   r   r	   r
      s    z<lambda>.<locals>.<lambda>)r   )sr   r   r	   r
      s    g#B;)fisher_exactc              O   s   t d S )N)NotImplementedError)_args_kwargsr   r   r	   r      s    r         c               @   s   e Zd ZdZdZeedd Zeedd Ze	dd Z
ed	d
 Ze	dd Ze	dd Zedd Ze	dd Ze	dd Ze	dd Ze	dd ZdS )NgramAssocMeasuresa  
    An abstract class defining a collection of generic association measures.
    Each public method returns a score, taking the following arguments::

        score_fn(count_of_ngram,
                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
                 ...,
                 (count_of_1gram_1, ..., count_of_1gram_n),
                 count_of_total_words)

    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``

    Inheriting classes should define a property _n, and a method _contingency
    which calculates contingency values from marginals in order for all
    association measures defined here to be usable.
    r   c              G   s   t ddS )z>Calculates values of a contingency table from marginal values.z?The contingency table is not availablein the general ngram caseN)r   )	marginalsr   r   r	   _contingencyB   s    zNgramAssocMeasures._contingencyc              G   s   t ddS )zACalculates values of contingency table marginals from its values.z?The contingency table is not availablein the general ngram caseN)r   )contingencyr   r   r	   
_marginalsJ   s    zNgramAssocMeasures._marginalsc             #   s`   t }dd t jD }x>ttD ].t fdd|D | jd   V  q*W dS )z3Calculates expected values for a contingency table.c             S   s   g | ]}d |> qS )r   r   ).0ir   r   r	   
<listcomp>V   s    z7NgramAssocMeasures._expected_values.<locals>.<listcomp>c             3   s2   | ]* t  fd dtdj D V  qdS )c             3   s&   | ]}|@ @ kr | V  qd S )Nr   )r   r   )contr   jr   r	   	<genexpr>]   s    z@NgramAssocMeasures._expected_values.<locals>.<genexpr>.<genexpr>r   N)sumrange_n)r   )clsr   r   )r   r	   r   ]   s   z6NgramAssocMeasures._expected_values.<locals>.<genexpr>r   N)r   r   r    len_product)r!   r   n_allbitsr   )r!   r   r   r	   _expected_valuesR   s    z#NgramAssocMeasures._expected_valuesc              G   s   | t  | t  S )z Scores ngrams by their frequency)NGRAMTOTAL)r   r   r   r	   raw_freqc   s    zNgramAssocMeasures.raw_freqc             G   s6   |t  t|t |t | jd    |t  t d  S )zScores ngrams using Student's t test with independence hypothesis
        for unigrams, as in Manning and Schutze 5.3.1.
        r   g      ?)r'   r#   UNIGRAMSr(   r    _SMALL)r!   r   r   r   r	   	student_th   s    zNgramAssocMeasures.student_tc             G   s,   | j | }| j|}tdd t||D S )zZScores ngrams using Pearson's chi-square as in Manning and Schutze
        5.3.3.
        c             s   s&   | ]\}}|| d  |t   V  qdS )r   N)r+   )r   obsexpr   r   r	   r   y   s    z,NgramAssocMeasures.chi_sq.<locals>.<genexpr>)r   r&   r   zip)r!   r   r   Zexpsr   r   r	   chi_sqr   s    

zNgramAssocMeasures.chi_sqc              O   s    | t  |jdd t| t  S )zScores ngrams using a variant of mutual information. The keyword
        argument power sets an exponent (default 3) for the numerator. No
        logarithm of the result is calculated.
        power   )r'   getr#   r*   )r   kwargsr   r   r	   mi_like{   s    zNgramAssocMeasures.mi_likec             G   s.   t |t |t | jd   t t|t  S )z^Scores ngrams by pointwise mutual information, as in Manning and
        Schutze 5.4.
        r   )_log2r'   r(   r    r#   r*   )r!   r   r   r   r	   pmi   s    zNgramAssocMeasures.pmic             G   s,   | j | }dtdd t|| j|D  S )zFScores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.r   c             s   s*   | ]"\}}|t ||t  t  V  qd S )N)_lnr+   )r   r-   r.   r   r   r	   r      s   z6NgramAssocMeasures.likelihood_ratio.<locals>.<genexpr>)r   r   r/   r&   )r!   r   r   r   r   r	   likelihood_ratio   s    
z#NgramAssocMeasures.likelihood_ratioc             G   s:   t |t |t | jd   }|t t|t | d  S )z1Scores ngrams using the Poisson-Stirling measure.r   )r#   r*   r(   r    r'   r6   )r!   r   r.   r   r   r	   poisson_stirling   s    z#NgramAssocMeasures.poisson_stirlingc             G   s"   | j | }|d t|dd  S )z&Scores ngrams using the Jaccard index.r   Nr   )r   r   )r!   r   r   r   r   r	   jaccard   s    
zNgramAssocMeasures.jaccardN)__name__
__module____qualname____doc__r    staticmethodr   r   r   classmethodr&   r)   r,   r0   r5   r7   r9   r:   r<   r   r   r   r	   r   -   s   
	
		r   )	metaclassc               @   sh   e Zd ZdZdZedd Zedd Zedd Ze	d	d
 Z
e	dd Ze	dd Zedd ZdS )BigramAssocMeasuresa  
    A collection of bigram association measures. Each association measure
    is provided as a function with three arguments::

        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_ii counts ``(w1, w2)``, i.e. the bigram being scored
    - n_ix counts ``(w1, *)``
    - n_xi counts ``(*, w2)``
    - n_xx counts ``(*, *)``, i.e. any bigram

    This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
    r   c             C   s0   |\}}||  }||  }| ||||  | | fS )zECalculates values of a bigram contingency table from marginal values.r   )n_iin_ix_xi_tuplen_xxn_ixn_xin_oin_ior   r   r	   r      s    z BigramAssocMeasures._contingencyc             C   s"   | ||  ||  f|| | |  fS )zACalculates values of contingency table marginals from its values.r   )rE   rJ   rK   n_oor   r   r	   r      s    zBigramAssocMeasures._marginalsc             c   sL   t | }x>tdD ]2}| | | |dA   | | | |dA    | V  qW dS )z3Calculates expected values for a contingency table.   r   r   N)r   r   )r   rG   r   r   r   r	   r&      s    z$BigramAssocMeasures._expected_valuesc             G   sF   | j | \}}}}|| ||  d || ||  ||  ||   S )zdScores bigrams using phi-square, the square of the Pearson correlation
        coefficient.
        r   )r   )r!   r   rE   rK   rJ   rL   r   r   r	   phi_sq   s    zBigramAssocMeasures.phi_sqc             C   s   |\}}|| j |||f| S )zScores bigrams using chi-square, i.e. phi-sq multiplied by the number
        of bigrams, as in Manning and Schutze 5.3.3.
        )rN   )r!   rE   rF   rG   rH   rI   r   r   r	   r0      s    zBigramAssocMeasures.chi_sqc             G   s2   | j | \}}}}t||g||ggdd\}}|S )zScores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        Zless)alternative)r   r   )r!   r   rE   rK   rJ   rL   ZoddsZpvaluer   r   r	   fisher   s    zBigramAssocMeasures.fisherc             C   s   |\}}d|  ||  S )z(Scores bigrams using Dice's coefficient.r   r   )rE   rF   rG   rH   rI   r   r   r	   dice   s    zBigramAssocMeasures.diceN)r=   r>   r?   r@   r    rA   r   r   r&   rB   rN   r0   rP   rQ   r   r   r   r	   rD      s   rD   c               @   s,   e Zd ZdZdZedd Zedd ZdS )TrigramAssocMeasuresa  
    A collection of trigram association measures. Each association measure
    is provided as a function with four arguments::

        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
    - n_ixx counts ``(w1, *, *)``
    - n_xxx counts ``(*, *, *)``, i.e. any trigram
    r2   c             C   s   |\}}}|\}}}	||  }
||  }||  }|	|  |
 | }||  |
 | }||  | | }||  |
 | | | | | }| |
||||||fS )zCalculates values of a trigram contingency table (or cube) from
        marginal values.
        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
        (1, 0, 0, 0, 0, 72, 0, 1927)
        r   )n_iiiZn_iix_tupleZn_ixx_tupleZn_xxxZn_iixZn_ixiZn_xiiZn_ixxZn_xixZn_xxin_oiin_ioin_iion_ooin_oion_ioon_ooor   r   r	   r     s    

 z!TrigramAssocMeasures._contingencyc        	      G   s`   | \}}}}}}}}||| || || f|| | | || | | || | | ft | fS )zCalculates values of contingency table marginals from its values.
        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
        (1, (1, 1, 1), (1, 73, 1), 2000)
        )r   )	r   rS   rT   rU   rW   rV   rX   rY   rZ   r   r   r	   r   &  s    zTrigramAssocMeasures._marginalsN)r=   r>   r?   r@   r    rA   r   r   r   r   r   r	   rR      s   rR   c               @   s,   e Zd ZdZdZedd Zedd ZdS )QuadgramAssocMeasuresaF  
    A collection of quadgram association measures. Each association measure
    is provided as a function with five arguments::

        trigram_score_fn(n_iiii,
                        (n_iiix, n_iixi, n_ixii, n_xiii),
                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
                        (n_ixxx, n_xixx, n_xxix, n_xxxi),
                        n_all)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored
    - n_ixxi counts ``(w1, *, *, w4)``
    - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram
    rM   c       "      C   s  |\}}}}|\}	}
}}}}|\}}}}||  }||  }||  }||  | | }||  | | }||  | | }||  | | | | | | }||  }||  | | }|
|  | | }||  | | | | | | }|	|  | | }||  | | | | | | }||  | | | | | | } ||  | | | | | | | | | | | | | |  }!| |||||||||||||| |!fS )zXCalculates values of a quadgram contingency table from
        marginal values.
        r   )"n_iiiiZn_iiix_tupleZn_iixx_tupleZn_ixxx_tupleZn_xxxxn_iiixn_iixin_ixiin_xiiin_iixxn_ixixn_ixxin_xixin_xxiin_xiixn_ixxxn_xixxn_xxixn_xxxin_oiiin_ioiin_iioin_ooiin_oioin_iooin_oooin_iiion_oiion_ioion_ooion_iioon_oioon_iooon_oooor   r   r	   r   P  sD        @z"QuadgramAssocMeasures._contingencyc               G   sV  | \}}}}}}}}}	}
}}}}}}||	 }|| }|| }|| }|| |	 | }|| |	 | }|| | | }|| | | }|| | | }|| |	 |
 }|| | |	 | | | | }|| | |	 | |
 | | }|| | |	 | | |
 | }|| | | | | | | }t | }|||||f||||||f||||f|fS )a  Calculates values of contingency table marginals from its values.
        QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
        )r   ) r   r\   rk   rl   rn   rm   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   r$   r   r   r	   r     s*    $    

z QuadgramAssocMeasures._marginalsN)r=   r>   r?   r@   r    rA   r   r   r   r   r   r	   r[   9  s   <r[   c               @   s$   e Zd ZdZdd Zedd ZdS )ContingencyMeasureszWraps NgramAssocMeasures classes such that the arguments of association
    measures are contingency table values rather than marginals.
    c             C   s^   d|j j | j _xHt|D ]<}|jdr*qt||}|jdsJ| j||}t| || qW dS )zAConstructs a ContingencyMeasures given a NgramAssocMeasures classZContingency___N)	__class__r=   dir
startswithgetattr_make_contingency_fnsetattr)selfmeasureskvr   r   r	   __init__  s    


zContingencyMeasures.__init__c                s"    fdd}j |_ j|_|S )zFrom an association measure function, produces a new function which
        accepts contingency table values as its arguments.
        c                 s    j |   S )N)r   )r   )r   old_fnr   r	   res  s    z5ContingencyMeasures._make_contingency_fn.<locals>.res)r@   r=   )r   r   r   r   )r   r   r	   r     s    z(ContingencyMeasures._make_contingency_fnN)r=   r>   r?   r@   r   rA   r   r   r   r   r	   rz     s   rz   r;   )r@   mathr   abcr   r   	functoolsr   r6   logr8   r#   r+   Zscipy.statsr   ImportErrorr'   r*   r(   r   rD   rR   r[   rz   r   r   r   r	   <module>   s(   wY< 	