3
­dÙ)  ã               @   s^   d Z ddlmZ ddlmZ ddlmZmZmZm	Z	 G dd„ deƒZ
dd„ Zed	krZeƒ  d
S )aê  
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                              P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                         P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
é    )Údefaultdict)ÚClassifierI)ÚDictionaryProbDistÚELEProbDistÚFreqDistÚsum_logsc               @   sT   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zddd„Zddd„Z	e
efdd„ƒZdS )ÚNaiveBayesClassifiera  
    A Naive Bayes classifier.  Naive Bayes classifiers are
    paramaterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given that the
        label (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    c             C   s   || _ || _t|jƒ ƒ| _dS )a=  
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        N)Ú_label_probdistÚ_feature_probdistÚlistÚsamplesÚ_labels)ÚselfÚlabel_probdistÚfeature_probdist© r   ú8/tmp/pip-build-v9q4h5k9/nltk/nltk/classify/naivebayes.pyÚ__init__@   s    zNaiveBayesClassifier.__init__c             C   s   | j S )N)r   )r   r   r   r   ÚlabelsU   s    zNaiveBayesClassifier.labelsc             C   s   | j |ƒjƒ S )N)Úprob_classifyÚmax)r   Ú
featuresetr   r   r   ÚclassifyX   s    zNaiveBayesClassifier.classifyc             C   sä   |j ƒ }x:t|jƒ ƒD ]*}x$| jD ]}||f| jkr"P q"W ||= qW i }x| jD ]}| jj|ƒ||< qPW xl| jD ]b}x\|jƒ D ]P\}}||f| jkrº| j||f }||  |j|ƒ7  < q~||  tg ƒ7  < q~W qpW t	|dddS )NT)Ú	normalizeÚlog)
Úcopyr   Úkeysr   r
   r	   ÚlogprobÚitemsr   r   )r   r   ÚfnameÚlabelr   ÚfvalZfeature_probsr   r   r   r   [   s     
z"NaiveBayesClassifier.prob_classifyé
   c          	      sô   | j ‰ tdƒ xà| j|ƒD ]Ò\‰‰‡ ‡‡fdd„‰t‡ ‡‡fdd„| jD ƒ‡fdd„dd	}t|ƒd
krjq|d }|d }ˆ |ˆf jˆƒdkr–d}n(dˆ |ˆf jˆƒˆ |ˆf jˆƒ  }tdˆˆd| d d… d| d d… |f ƒ qW d S )NzMost Informative Featuresc                s   ˆ | ˆf j ˆƒS )N)Úprob)Úl)Úcpdistr   r!   r   r   Ú	labelprobƒ   s    zFNaiveBayesClassifier.show_most_informative_features.<locals>.labelprobc             3   s&   | ]}ˆˆ |ˆf j ƒ kr|V  qd S )N)r   )Ú.0r$   )r%   r   r!   r   r   ú	<genexpr>‡   s    zFNaiveBayesClassifier.show_most_informative_features.<locals>.<genexpr>c                s   ˆ | ƒ | fS )Nr   )Úelement)r&   r   r   Ú<lambda>ˆ   s    zENaiveBayesClassifier.show_most_informative_features.<locals>.<lambda>T)ÚkeyÚreverseé   r   ZINFz%8.1fz"%24s = %-14r %6s : %-6s = %s : 1.0z%sé   éÿÿÿÿ)r
   ÚprintÚmost_informative_featuresÚsortedr   Úlenr#   )r   Únr   Zl0Úl1Zratior   )r%   r   r!   r&   r   Úshow_most_informative_features|   s&    
&z3NaiveBayesClassifier.show_most_informative_featureséd   c       	         sà   t | dƒr| jd|… S tƒ }tdd„ ƒ‰ tdd„ ƒ‰x‚| jjƒ D ]t\\}}}xf|jƒ D ]Z}||f}|j|ƒ |j|ƒ}t	|ˆ | ƒˆ |< t
|ˆ| ƒˆ|< ˆ| dkrX|j|ƒ qXW qBW t|‡ ‡fdd„d| _| jd|… S )	a—  
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        Ú_most_informative_featuresNc               S   s   dS )Ng        r   r   r   r   r   r*   «   s    z@NaiveBayesClassifier.most_informative_features.<locals>.<lambda>c               S   s   dS )Ng      ð?r   r   r   r   r   r*   ¬   s    r   c                s0   ˆ|  ˆ |   | d | d dkt | d ƒjƒ fS )Nr   r-   FT)NFT)ÚstrÚlower)Zfeature_)ÚmaxprobÚminprobr   r   r*   ¼   s    
)r+   )Úhasattrr8   Úsetr   r
   r   r   Úaddr#   r   ÚminÚdiscardr2   )	r   r4   Úfeaturesr    r   Úprobdistr!   ZfeatureÚpr   )r;   r<   r   r1   š   s$    



z.NaiveBayesClassifier.most_informative_featuresc             C   s@  t ƒ }tt ƒ}ttƒ}tƒ }xf|D ]^\}}||  d7  < xD|jƒ D ]8\}	}
|||	f |
  d7  < ||	 j|
ƒ |j|	ƒ qDW q"W xh|D ]`}|| }xR|D ]J}	|||	f jƒ }|| dkrœ|||	f d  || 7  < ||	 jdƒ qœW qŠW ||ƒ}i }x:|jƒ D ].\\}}	}||t||	 ƒd}||||	f< qW | ||ƒS )z‹
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        r-   r   N)Zbins)r   r   r>   r   r?   ÚNr3   )ÚclsZlabeled_featuresetsZ	estimatorZlabel_freqdistZfeature_freqdistZfeature_valuesÚfnamesr   r    r   r!   Znum_samplesÚcountr   r   ZfreqdistrC   r   r   r   ÚtrainÅ   s.    

zNaiveBayesClassifier.trainN)r"   )r7   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r   r6   r1   Úclassmethodr   rI   r   r   r   r   r   +   s   !

+r   c              C   s"   ddl m}  | tjƒ}|jƒ  d S )Nr   )Ú
names_demo)Znltk.classify.utilrO   r   rI   r6   )rO   Ú
classifierr   r   r   Údemoü   s    
rQ   Ú__main__N)rM   Úcollectionsr   Znltk.classify.apir   Znltk.probabilityr   r   r   r   r   rQ   rJ   r   r   r   r   Ú<module>   s    R