3
d                 @   s  d Z yddlZW n ek
r$   Y nX ddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZmZmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ dZG dd deZeZG dd dZG dd deZG dd deZ G dd de Z!G dd de Z"G dd deZ#d.ddZ$dd Z%dd  Z&d/d!d"Z'd#d$ Z(d%d& Z)d0d'd(Z*G d)d* d*eZ+d+d, Z,e-d-kre,  dS )1a  
A classifier model based on maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data; and chooses the
distribution with the highest entropy.  A probability distribution is
"empirically consistent" with a set of training data if its estimated
frequency with which a class and a feature vector value co-occur is
equal to the actual frequency in the data.

Terminology: 'feature'
======================
The term *feature* is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a ``'prevword'`` feature whose value is
the word preceding the target word.  However, in the context of
maxent modeling, the term *feature* is typically used to refer to a
property of a "labeled" token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

  - An "input-feature" is a property of an unlabeled token.
  - A "joint-feature" is a property of a labeled token.

In the rest of the ``nltk.classify`` module, the term "features" is
used to refer to what we will call "input-features" in this module.

In literature that describes and discusses maximum entropy models,
input-features are typically called "contexts", and joint-features
are simply referred to as "features".

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature ``input_feat`` is mapped to a
set of joint-features of the form:

|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val
|                              {      and label == some_label
|                              {
|                              { 0 otherwise

For all values of ``feat_val`` and ``some_label``.  This mapping is
performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
    N)defaultdict)ClassifierI)
call_megamparse_megam_weightswrite_megam_file)	call_tadmparse_tadm_weightswrite_tadm_file)CutoffCheckeraccuracylog_likelihood)gzip_open_unicode)DictionaryProbDist)OrderedDictz
epytext enc               @   s   e Zd ZdZd#ddZdd Zdd Zd	d
 Zdd Zdd Z	d$ddZ
d%ddZd&ddZdd ZddddgZed'd!d"ZdS )(MaxentClassifiera  
    A maximum entropy classifier (also known as a "conditional
    exponential classifier").  This classifier is parameterized by a
    set of "weights", which are used to combine the joint-features
    that are generated from a featureset by an "encoding".  In
    particular, the encoding maps each ``(featureset, label)`` pair to
    a vector.  The probability of each label is then computed using
    the following equation::

                                dotprod(weights, encode(fs,label))
      prob(fs|label) = ---------------------------------------------------
                       sum(dotprod(weights, encode(fs,l)) for l in labels)

    Where ``dotprod`` is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
    Tc             C   s*   || _ || _|| _|j t|ks&tdS )a{  
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the ``train()`` method.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: An encoding that is used to convert the
            featuresets that are given to the ``classify`` method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        :type weights: list of float
        :param weights:  The feature weight vector for this classifier.

        :type logarithmic: bool
        :param logarithmic: If false, then use non-logarithmic weights.
        N)	_encoding_weights_logarithmiclengthlenAssertionError)selfencodingweightsZlogarithmic r   4/tmp/pip-build-v9q4h5k9/nltk/nltk/classify/maxent.py__init__a   s    zMaxentClassifier.__init__c             C   s
   | j j S )N)r   labels)r   r   r   r   r   x   s    zMaxentClassifier.labelsc             C   s    || _ | jj t|kstdS )z
        Set the feature weight vector for this classifier.
        :param new_weights: The new feature weight vector.
        :type new_weights: list of float
        N)r   r   r   r   r   )r   Znew_weightsr   r   r   set_weights{   s    zMaxentClassifier.set_weightsc             C   s   | j S )zg
        :return: The feature weight vector for this classifier.
        :rtype: list of float
        )r   )r   r   r   r   r      s    zMaxentClassifier.weightsc             C   s   | j |j S )N)prob_classifymax)r   
featuresetr   r   r   classify   s    zMaxentClassifier.classifyc       	      C   s   i }x| j j D ]z}| j j||}| jrZd}x"|D ]\}}|| j| | 7 }q2W |||< qd}x"|D ]\}}|| j| | 9 }qdW |||< qW t|| jddS )Ng        g      ?T)log	normalize)r   r   encoder   r   r   )	r   r!   Z	prob_dictlabelfeature_vectortotalf_idf_valprodr   r   r   r      s    
zMaxentClassifier.prob_classify   c                s  d}dt |d  d }j| t j  jdd}|d| }tdj|d	jd
d |D   tdd|d dt|     t	t
xt|D ]\}}jj||}|jfdddd x|D ]\}	}
jrj|	 |
 }nj|	 |
 }jj|	}|jdd }|d|
 7 }t|dkr8|dd d }t|||d d |f  |  |7  < qW qW tdd|d dt|     tdj|d	jfdd|D   tdj|d	j fdd|D   dS )z
        Print a table showing the effect of each of the features in
        the given feature set, and how they combine to determine the
        probabilities of each label for that featureset.
        2   z  %-   zs%s%8.3fT)keyreverseNz	  Feature c             s   s"   | ]}d d| dd  V  qdS )z%8sz%sN   r   ).0lr   r   r   	<genexpr>   s    z+MaxentClassifier.explain.<locals>.<genexpr>z  -   c                s   t  j| d  S )Nr   )absr   )Zfid__)r   r   r   <lambda>   s    z*MaxentClassifier.explain.<locals>.<lambda>z and label is r   z (%s)/   ,   z...    z  TOTAL:c             3   s   | ]}d  |  V  qdS )z%8.3fNr   )r3   r4   )sumsr   r   r5      s    z  PROBS:c             3   s   | ]}d  j | V  qdS )z%8.3fN)prob)r3   r4   )pdistr   r   r5      s    )strr   sortedsamplesr?   printljustjoinr   r   int	enumerater   r%   sortr   r   describesplit)r   r!   columnsZdescr_widthTEMPLATEr   ir&   r'   r)   r*   Zscoredescrr   )r@   r   r>   r   explain   s>    
  $zMaxentClassifier.explain
   c                sP   t  dr jd| S tttt j fdddd _ jd| S dS )zW
        Generates the ranked list of informative features from most to least.
        _most_informative_featuresNc                s   t  j|  S )N)r8   r   )fid)r   r   r   r9      s    z<MaxentClassifier.most_informative_features.<locals>.<lambda>T)r/   r0   )hasattrrR   rB   listranger   r   )r   nr   )r   r   most_informative_features   s    


z*MaxentClassifier.most_informative_featuresallc                s|    j d}|dkr& fdd|D }n|dkr@ fdd|D }x6|d| D ]&}t j| dd jj|  qNW dS )	z
        :param show: all, neg, or pos (for negative-only or positive-only)
        :type show: str
        :param n: The no. of top features
        :type n: int
        Nposc                s   g | ]} j | d kr|qS )r   )r   )r3   rS   )r   r   r   
<listcomp>   s    zCMaxentClassifier.show_most_informative_features.<locals>.<listcomp>negc                s   g | ]} j | d k r|qS )r   )r   )r3   rS   )r   r   r   r[      s    z8.3fr<   )rX   rD   r   r   rJ   )r   rW   showZfidsrS   r   )r   r   show_most_informative_features   s    
z/MaxentClassifier.show_most_informative_featuresc             C   s   dt | jj | jj f S )Nz:<ConditionalExponentialClassifier: %d labels, %d features>)r   r   r   r   )r   r   r   r   __repr__   s    zMaxentClassifier.__repr__ZGISZIISZMEGAMZTADMN   r   c       
      K   s   |dkrd}x |D ]}|dkrt d| qW |j }|dkrPt||||f|S |dkrjt||||f|S |dkrt|||||f|S |dkr|}	||	d< ||	d< ||	d< ||	d< tj|f|	S td| dS )a	  
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.

        :rtype: MaxentClassifier
        :return: The new maxent classifier

        :type train_toks: list
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        :type algorithm: str
        :param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.

            - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
              Improved Iterative Scaling (``'IIS'``)
            - External Libraries (requiring megam):
              LM-BFGS algorithm, with training performed by Megam (``'megam'``)

            The default algorithm is ``'IIS'``.

        :type trace: int
        :param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.
        :type encoding: MaxentFeatureEncodingI
        :param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            ``BinaryMaxentFeatureEncoding`` will be built based on the
            features that are attested in the training corpus.
        :type labels: list(str)
        :param labels: The set of possible labels.  If none is given, then
            the set of all labels attested in the training data will be
            used instead.
        :param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            ``megam``. For other algorithms, its value is ignored.
        :param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)

            - ``max_iter=v``: Terminate after ``v`` iterations.
            - ``min_ll=v``: Terminate after the negative average
              log-likelihood drops under ``v``.
            - ``min_lldelta=v``: Terminate if a single iteration improves
              log likelihood by less than ``v``.
        NZiismax_itermin_llmin_lldeltamax_accmin_accdeltacount_cutoffnormexplicit	bernoullizUnexpected keyword arg %rZgisZmegamZtadmtracer   r   gaussian_prior_sigmazUnknown algorithm %s)	ra   rb   rc   rd   re   rf   rg   rh   ri   )	TypeErrorlower train_maxent_classifier_with_iis train_maxent_classifier_with_gis"train_maxent_classifier_with_megamTadmMaxentClassifiertrain
ValueError)
cls
train_toks	algorithmrj   r   r   rk   cutoffsr/   kwargsr   r   r   rr      s@    >
        zMaxentClassifier.train)T)r,   )rQ   )rQ   rY   )Nr`   NNr   )__name__
__module____qualname____doc__r   r   r   r   r"   r   rP   rX   r^   r_   Z
ALGORITHMSclassmethodrr   r   r   r   r   r   N   s$   
	
,

    r   c               @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )MaxentFeatureEncodingIa  
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    ``MaxentFeatureEncodingI`` are typically very sparse, they are
    represented as a list of ``(index, value)`` tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the ``train()``
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    c             C   s
   t  dS )aC  
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of ``(index, value)`` tuples, specifying the value of
        each non-zero joint-feature.

        :type featureset: dict
        :rtype: list(tuple(int, int))
        N)NotImplementedError)r   r!   r&   r   r   r   r%   {  s    
zMaxentFeatureEncodingI.encodec             C   s
   t  dS )z
        :return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        :rtype: int
        N)r   )r   r   r   r   r     s    zMaxentFeatureEncodingI.lengthc             C   s
   t  dS )z
        :return: A list of the "known labels" -- i.e., all labels
            ``l`` such that ``self.encode(fs,l)`` can be a nonzero
            joint-feature vector for some value of ``fs``.
        :rtype: list
        N)r   )r   r   r   r   r     s    zMaxentFeatureEncodingI.labelsc             C   s
   t  dS )z
        :return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is ``fid``.
        :rtype: str
        N)r   )r   rS   r   r   r   rJ     s    zMaxentFeatureEncodingI.describec             C   s
   t  dS )ao  
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        N)r   )rt   ru   r   r   r   rr     s    
zMaxentFeatureEncodingI.trainN)	ry   rz   r{   r|   r%   r   r   rJ   rr   r   r   r   r   r~   d  s   	r~   c               @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )#FunctionBackedMaxentFeatureEncodingz
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    c             C   s   || _ || _|| _dS )ag  
        Construct a new feature encoding based on the given function.

        :type func: (callable)
        :param func: A function that takes two arguments, a featureset
             and a label, and returns the sparse joint feature vector
             that encodes them::

                 func(featureset, label) -> feature_vector

             This sparse joint feature vector (``feature_vector``) is a
             list of ``(index,value)`` tuples.

        :type length: int
        :param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        :type labels: list
        :param labels: A list of the "known labels" for this
            encoding -- i.e., all labels ``l`` such that
            ``self.encode(fs,l)`` can be a nonzero joint-feature vector
            for some value of ``fs``.
        N)_length_func_labels)r   funcr   r   r   r   r   r     s    z,FunctionBackedMaxentFeatureEncoding.__init__c             C   s   | j ||S )N)r   )r   r!   r&   r   r   r   r%     s    z*FunctionBackedMaxentFeatureEncoding.encodec             C   s   | j S )N)r   )r   r   r   r   r     s    z*FunctionBackedMaxentFeatureEncoding.lengthc             C   s   | j S )N)r   )r   r   r   r   r     s    z*FunctionBackedMaxentFeatureEncoding.labelsc             C   s   dS )Nzno description availabler   )r   rS   r   r   r   rJ     s    z,FunctionBackedMaxentFeatureEncoding.describeN)	ry   rz   r{   r|   r   r%   r   r   rJ   r   r   r   r   r     s   r   c               @   sH   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Ze	dddZ
dS )BinaryMaxentFeatureEncodinga  
    A feature encoding that generates vectors containing a binary
    joint-features of the form:

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.  This method will create one
    feature for each combination of ``fname``, ``fval``, and ``label``
    that occurs at least once in the training corpus.

    The ``unseen_features`` parameter can be used to add "unseen-value
    features", which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form::

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    Fc                s   t |j t tt|kr$tdt| _| _t| _d _	d _
|rz fddt|D  _	  jt j	7  _|rdd |D } fddt|D  _
  jt|7  _dS )a  
        :param labels: A list of the "known labels" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
           features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
           features in the generated joint-feature vectors.
        zHMapping values must be exactly the set of integers from 0...len(mapping)Nc                s   i | ]\}}| j  |qS r   )r   )r3   rN   r&   )r   r   r   
<dictcomp>,  s   z8BinaryMaxentFeatureEncoding.__init__.<locals>.<dictcomp>c             S   s   h | ]\}}}|qS r   r   )r3   fnamefvalr&   r   r   r   	<setcomp>2  s    z7BinaryMaxentFeatureEncoding.__init__.<locals>.<setcomp>c                s   i | ]\}}| j  |qS r   )r   )r3   rN   r   )r   r   r   r   3  s    )setvaluesrV   r   rs   rU   r   _mappingr   	_alwayson_unseenrH   )r   r   mappingunseen_featuresalwayson_featuresfnamesr   )r   r   r     s     


z$BinaryMaxentFeatureEncoding.__init__c             C   s   g }x|j  D ]z\}}|||f| jkrB|j| j|||f df q| jrx>| jD ]}|||f| jkrPP qPW || jkr|j| j| df qW | jr|| jkr|j| j| df |S )Nr=   )itemsr   appendr   r   r   )r   r!   r&   r   r   r   label2r   r   r   r%   6  s    
z"BinaryMaxentFeatureEncoding.encodec             C   s"  t |tstdy
| j W nH tk
rd   dgt| j | _x | jj D ]\}}|| j|< qJW Y nX |t| jk r| j| \}}}| d|d|S | jr|| jj	 krxp| jj D ]\}}||krd| S qW nH| j
o|| j
j	 krx0| j
j D ]\}}||krd| S qW ntdd S )	Nzdescribe() expected an intr=   z==z and label is zlabel is %rz%s is unseenzBad feature id)
isinstancerG   rl   _inv_mappingAttributeErrorr   r   r   r   r   r   rs   )r   r)   inforN   r   r   r&   f_id2r   r   r   rJ   Q  s(    

z$BinaryMaxentFeatureEncoding.describec             C   s   | j S )N)r   )r   r   r   r   r   j  s    z"BinaryMaxentFeatureEncoding.labelsc             C   s   | j S )N)r   )r   r   r   r   r   n  s    z"BinaryMaxentFeatureEncoding.lengthr   Nc             K   s   i }t  }tt}x|D ]\}}	|r8|	|kr8td|	 |j|	 xX|j D ]L\}
}||
|f  d7  < ||
|f |krL|
||	f|krLt|||
||	f< qLW qW |dkr|}| ||f|S )a  
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``BinaryMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature's value is 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        zUnexpected label %sr=   N)r   r   rG   rs   addr   r   )rt   ru   rf   r   optionsr   seen_labelscounttokr&   r   r   r   r   r   rr   r  s    
z!BinaryMaxentFeatureEncoding.train)FF)r   N)ry   rz   r{   r|   r   r%   rJ   r   r   r}   rr   r   r   r   r   r     s   '
1r   c               @   s>   e Zd ZdZdddZedd Zdd	 Zd
d Zdd Z	dS )GISEncodinga  
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by ``BinaryMaxentFeatureEncoding``: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:

      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
    FNc             C   s:   t j| |||| |dkr0tdd |D d }|| _dS )a	  
        :param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            ``C - sum([v for (f,v) in encoding])``.
        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
        Nc             S   s   h | ]\}}}|qS r   r   )r3   r   r   r&   r   r   r   r     s    z'GISEncoding.__init__.<locals>.<setcomp>r=   )r   r   r   _C)r   r   r   r   r   Cr   r   r   r     s
    	zGISEncoding.__init__c             C   s   | j S )zOThe non-negative constant that all encoded feature vectors
        will sum to.)r   )r   r   r   r   r     s    zGISEncoding.Cc             C   sT   t j| ||}t j| }tdd |D }|| jkr<td|j|| j| f |S )Nc             s   s   | ]\}}|V  qd S )Nr   )r3   fvr   r   r   r5     s    z%GISEncoding.encode.<locals>.<genexpr>z&Correction feature is not high enough!)r   r%   r   sumr   rs   r   )r   r!   r&   r   Zbase_lengthr(   r   r   r   r%     s    

zGISEncoding.encodec             C   s   t j| d S )Nr=   )r   r   )r   r   r   r   r     s    zGISEncoding.lengthc             C   s(   |t j| krd| j S t j| |S d S )NzCorrection feature (%s))r   r   r   rJ   )r   r)   r   r   r   rJ     s    
zGISEncoding.describe)FFN)
ry   rz   r{   r|   r   propertyr   r%   r   rJ   r   r   r   r   r     s   
r   c               @   sD   e Zd ZdddZdd Zdd Zdd	 Zd
d ZedddZ	dS )TadmEventMaxentFeatureEncodingFc             C   s*   t || _t  | _tj| || j|| d S )N)r   r   _label_mappingr   r   )r   r   r   r   r   r   r   r   r     s    
z'TadmEventMaxentFeatureEncoding.__init__c             C   s   g }x|j  D ]x\}}||f| jkr8t| j| j||f< || jkrht|ts^t| j| j|< n
|| j|< |j| j||f | j| f qW |S )N)r   r   r   r   r   rG   r   )r   r!   r&   r   featurevaluer   r   r   r%     s    


z%TadmEventMaxentFeatureEncoding.encodec             C   s   | j S )N)r   )r   r   r   r   r     s    z%TadmEventMaxentFeatureEncoding.labelsc             C   s2   x,| j D ]"\}}| j ||f |kr||fS qW d S )N)r   )r   rS   r   r&   r   r   r   rJ     s    z'TadmEventMaxentFeatureEncoding.describec             C   s
   t | jS )N)r   r   )r   r   r   r   r     s    z%TadmEventMaxentFeatureEncoding.lengthr   Nc       	      K   s   t  }|sg }t|}x"|D ]\}}||kr|j| qW xH|D ]@\}}x6|D ].}x(|D ] }||f|krXt||||f< qXW qNW q@W | ||f|S )N)r   rU   r   r   )	rt   ru   rf   r   r   r   r!   r&   r   r   r   r   rr     s    

z$TadmEventMaxentFeatureEncoding.train)FF)r   N)
ry   rz   r{   r   r%   r   rJ   r   r}   rr   r   r   r   r   r     s   
r   c               @   sH   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Ze	dddZ
dS )TypedMaxentFeatureEncodingaZ  
    A feature encoding that generates vectors containing integer,
    float and binary joint-features of the form:

    Binary (for string and boolean features):

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Value (for integer and float features):

    |  joint_feat(fs, l) = { fval if     (fs[fname] == type(fval))
    |                      {         and (l == label)
    |                      {
    |                      { not encoded otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.

    For string and boolean features [type(fval) not in (int, float)]
    this method will create one feature for each combination of
    ``fname``, ``fval``, and ``label`` that occurs at least once in the
    training corpus.

    For integer and float features [type(fval) in (int, float)] this
    method will create one feature for each combination of ``fname``
    and ``label`` that occurs at least once in the training corpus.

    For binary features the ``unseen_features`` parameter can be used
    to add "unseen-value features", which are used whenever an input
    feature has a value that was not encountered in the training
    corpus.  These features have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    Fc                s   t |j t tt|kr$tdt| _| _t| _d _	d _
|rz fddt|D  _	  jt j	7  _|rdd |D } fddt|D  _
  jt|7  _dS )a  
        :param labels: A list of the "known labels" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode({..., fname:fval, ...``, label)[id]} is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
           features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
           features in the generated joint-feature vectors.
        zHMapping values must be exactly the set of integers from 0...len(mapping)Nc                s   i | ]\}}| j  |qS r   )r   )r3   rN   r&   )r   r   r   r   |  s   z7TypedMaxentFeatureEncoding.__init__.<locals>.<dictcomp>c             S   s   h | ]\}}}|qS r   r   )r3   r   r   r&   r   r   r   r     s    z6TypedMaxentFeatureEncoding.__init__.<locals>.<setcomp>c                s   i | ]\}}| j  |qS r   )r   )r3   rN   r   )r   r   r   r     s    )r   r   rV   r   rs   rU   r   r   r   r   r   rH   )r   r   r   r   r   r   r   )r   r   r   U  s     


z#TypedMaxentFeatureEncoding.__init__c             C   s   g }x|j  D ]\}}t|ttfrX|t||f| jkr|j| j|t||f |f q|||f| jkr|j| j|||f df q| jrx>| jD ]}|||f| jkrP qW || jkr|j| j| df qW | j	r|| j	kr|j| j	| df |S )Nr=   )
r   r   rG   floattyper   r   r   r   r   )r   r!   r&   r   r   r   r   r   r   r   r%     s      
z!TypedMaxentFeatureEncoding.encodec             C   s"  t |tstdy
| j W nH tk
rd   dgt| j | _x | jj D ]\}}|| j|< qJW Y nX |t| jk r| j| \}}}| d|d|S | jr|| jj	 krxp| jj D ]\}}||krd| S qW nH| j
o|| j
j	 krx0| j
j D ]\}}||krd| S qW ntdd S )	Nzdescribe() expected an intr=   z==z and label is zlabel is %rz%s is unseenzBad feature idr   )r   rG   rl   r   r   r   r   r   r   r   r   rs   )r   r)   r   rN   r   r   r&   r   r   r   r   rJ     s(    

z#TypedMaxentFeatureEncoding.describec             C   s   | j S )N)r   )r   r   r   r   r     s    z!TypedMaxentFeatureEncoding.labelsc             C   s   | j S )N)r   )r   r   r   r   r     s    z!TypedMaxentFeatureEncoding.lengthr   Nc             K   s   i }t  }tt}x|D ]\}}	|r8|	|kr8td|	 |j|	 xp|j D ]d\}
}t|ttfkrlt|}||
|f  d7  < ||
|f |krL|
||	f|krLt|||
||	f< qLW qW |dkr|}| ||f|S )a)  
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``TypedMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        Note: recognized feature values types are (int, float), over
        types are interpreted as regular binary features.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature's value is 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        zUnexpected label %sr=   N)	r   r   rG   rs   r   r   r   r   r   )rt   ru   rf   r   r   r   r   r   r   r&   r   r   r   r   r   rr     s"    
z TypedMaxentFeatureEncoding.train)FF)r   N)ry   rz   r{   r|   r   r%   rJ   r   r   r}   rr   r   r   r   r   r     s   7
1 r   r`   c             K   s  |j dd t|}|dkr*tj| |d}t|ds<tdd|j }t| |}tt	j
|dkd }t	jt|d	}	x|D ]}
t	j|	|
< q|W t||	}t	j|}~|dkrtd
|d   |dkrt  td td yx|dkr"|jpt|| }|jpt|| }|j}td|||f  t|| |}x|D ]}
||
  d7  < q4W t	j|}~|j }	|	|| | 7 }	|j|	 |j|| rP qW W n* tk
r   td Y n    Y nX |dkrt|| }t|| }td|dd|d |S )a  
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This ``ConditionalExponentialClassifier`` will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    ra   d   N)r   r   zJThe GIS algorithm requires an encoding that defines C (e.g., GISEncoding).g      ?r   dz  ==> Training (%d iterations)r.   z-      Iteration    Log Likelihood    Accuracyz-      ---------------------------------------z     %9d    %14.5f    %9.3fr=   z*      Training stopped: keyboard interruptz         Final    z14.5fz    z9.3f)
setdefaultr
   r   rr   rT   rl   r   calculate_empirical_fcountr   numpynonzerozerosr   NINF ConditionalExponentialClassifierlog2rD   llr   accr   itercalculate_estimated_fcountr   r   checkKeyboardInterrupt)ru   rj   r   r   rw   cutoffcheckerZCinvZempirical_fcount
unattestedr   rS   
classifierZlog_empirical_fcountr   r   iternumZestimated_fcountZlog_estimated_fcountr   r   r   ro     sb    














ro   c             C   sP   t j|j d}x:| D ]2\}}x(|j||D ]\}}||  |7  < q,W qW |S )Nr   )r   r   r   r%   )ru   r   fcountr   r&   indexvalr   r   r   r   g  s
    r   c       
      C   sz   t j|j d}xd|D ]\\}}| j|}xH|j D ]<}|j|}x,|j||D ]\}}	||  ||	 7  < qNW q2W qW |S )Nr   )r   r   r   r   rC   r?   r%   )
r   ru   r   r   r   r&   r@   r?   rS   r   r   r   r   r   q  s    

 r   c             K   s  |j dd t|}|dkr*tj| |d}t| |t|  }t| |}tjt	||j
dd}tj|t|df}	ttj|dkd }
tjt|d}x|
D ]}tj||< qW t||}|dkrtd	|d   |d
krt  td td yx|d
kr8|jpt|| }|jpt|| }|j}td|||f  t| ||
||||	|}|j }||7 }|j| |j|| rP qW W n* tk
r   td Y n    Y nX |d
krt|| }t|| }td|dd|d |S )a  
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This ``ConditionalExponentialClassifier`` will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    ra   r   N)r   )r/   r   r=   r   z  ==> Training (%d iterations)r.   z-      Iteration    Log Likelihood    Accuracyz-      ---------------------------------------z     %9d    %14.5f    %9.3fz*      Training stopped: keyboard interruptz         Final    z14.5fz    z9.3f)r   r
   r   rr   r   r   calculate_nfmapr   arrayrB   __getitem__Zreshaper   r   r   r   r   rD   r   r   r   r   r   calculate_deltasr   r   r   r   )ru   rj   r   r   rw   r   Zempirical_ffreqnfmapnfarraynftransposer   r   rS   r   r   r   r   deltasr   r   r   rn     sb    








rn   c             C   s\   t  }xB| D ]:\}}x0|j D ]$}|jtdd |j||D  qW qW dd t|D S )a  
    Construct a map that can be used to compress ``nf`` (which is
    typically sparse).

    *nf(feature_vector)* is the sum of the feature values for
    *feature_vector*.

    This represents the number of features that are active for a
    given labeled text.  This method finds all values of *nf(t)*
    that are attested for at least one token in the given list of
    training tokens; and constructs a dictionary mapping these
    attested values to a continuous range *0...N*.  For example,
    if the only values of *nf()* that were attested were 3, 5, and
    7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``.

    :return: A map that can be used to compress ``nf`` to a dense
        vector.
    :rtype: dict(int -> int)
    c             s   s   | ]\}}|V  qd S )Nr   )r3   idr   r   r   r   r5     s    z"calculate_nfmap.<locals>.<genexpr>c             S   s   i | ]\}}||qS r   r   )r3   rN   nfr   r   r   r     s    z#calculate_nfmap.<locals>.<dictcomp>)r   r   r   r   r%   rH   )ru   r   Znfsetr   _r&   r   r   r   r     s
    (r   c             C   sp  d}d}	t j|j d}
t jt||j fd}x~| D ]v\}}|j|}xb|j D ]V}|j||}tdd |D }x2|D ]*\}}||| |f  |j	|| 7  < qzW qRW q6W |t|  }xt
|	D ]}t j||
}d| }|| }t j|| dd}t j|| dd}x|D ]}||  d	7  < qW |
|| |  8 }
t jt|| t jt|
 }||k r|
S qW |
S )
a
  
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    ``delta`` that solves the equation::

      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
        - *(fs,l)* is a (featureset, label) tuple from ``train_toks``
        - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
        - *nf(vector)* = ``sum([val for (id,val) in vector])``

    This method uses Newton's method to solve this equation for
    *delta[i]*.  In particular, it starts with a guess of
    ``delta[i]`` = 1; and iteratively updates ``delta`` with:

    | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    until convergence, where *sum1* and *sum2* are defined as:

    |    sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
    |    sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))
    |    f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .
    |                        feature_vector(fs,l)[i] .
    |                        exp(delta[i] . nf(feature_vector(fs,l))))

    Note that *sum1* and *sum2* depend on ``delta``; so they need
    to be re-computed each iteration.

    The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are
    used to generate a dense encoding for *nf(ltext)*.  This
    allows ``_deltas`` to calculate *sum1* and *sum2* using
    matrices, which yields a significant performance improvement.

    :param train_toks: The set of training tokens.
    :type train_toks: list(tuple(dict, str))
    :param classifier: The current classifier.
    :type classifier: ClassifierI
    :param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The *i*\ th element of this
        array is the empirical frequency for feature *i*.
    :type ffreq_empirical: sequence of float
    :param unattested: An array that is 1 for features that are
        not attested in the training data; and 0 for features that
        are attested.  In other words, ``unattested[i]==0`` iff
        ``ffreq_empirical[i]==0``.
    :type unattested: sequence of int
    :param nfmap: A map that can be used to compress ``nf`` to a dense
        vector.
    :type nfmap: dict(int -> int)
    :param nfarray: An array that can be used to uncompress ``nf``
        from a dense vector.
    :type nfarray: array(float)
    :param nftranspose: The transpose of ``nfarray``
    :type nftranspose: array(float)
    g-q=i,  r   c             s   s   | ]\}}|V  qd S )Nr   )r3   r   r   r   r   r   r5   U  s    z#calculate_deltas.<locals>.<genexpr>r.   r   )Zaxisr=   )r   Zonesr   r   r   r   r   r%   r   r?   rV   outerr8   )ru   r   r   Zffreq_empiricalr   r   r   r   ZNEWTON_CONVERGEZ
MAX_NEWTONr   Ar   r&   distr'   r   r   r   ZrangenumZnf_deltaZexp_nf_deltaZnf_exp_nf_deltaZsum1Zsum2rS   Zn_errorr   r   r   r     s2    I
.

 r   c              K   s&  d}d}d|kr|d }d|kr(|d }|dkrP|j dd}tj| ||dd}n|dk	r`tdyFtjd	d
\}	}
t|
d}t| ||||d W dQ R X tj	|	 W n4 t
tfk
r } ztd| |W Y dd}~X nX g }|dddg7 }|r|dg7 }|s|dg7 }|r d|d  }nd}|dd| dg7 }|dk rJ|dg7 }d|krh|dd|d  g7 }d|kr|ddt|d  g7 }t|dr|d g7 }|d!|
g7 }t|}ytj|
 W n8 t
k
r } ztd"|
 d#|  W Y dd}~X nX t||j |}|tjtj9 }t||S )$a  
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the external ``megam`` library.  This
    ``ConditionalExponentialClassifier`` will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :see: ``nltk.classify.megam``
    Trh   ri   Nrf   r   )r   r   z$Specify encoding or labels, not bothznltk-)prefixw)rh   ri   z,Error while creating megam training file: %sz-nobiasz-repeat10z	-explicitz-fvalsg      ?r.   z-lambdaz%.2fz-tuner`   z-quietra   z-maxiz%sll_deltaz-dppZcostz-multilabelZ
multiclasszWarning: unable to delete z: )getr   rr   rs   tempfilemkstempopenr   oscloseOSErrorr8   rT   r   removerD   r   r   r   r   er   )ru   rj   r   r   rk   rx   rh   ri   rf   fdtrainfile_name	trainfiler   r   Zinv_variancestdoutr   r   r   r   rp     s^     






&rp   c               @   s   e Zd Zedd ZdS )rq   c          
   K   s  |j dd}|j dd}|j dd }|j dd }|j dd}|j d	d}|j d
}	|j d}
|sptj|||d}tjddd\}}tjdd\}}t|d}t||| |j  g }|jdg |jd|g |r|jdd|d  g |	r |jdd|	 g |
r|jddt	|
 g |jd|g |jd|g |dk rP|jdg n|jdg t
| t|}t|}W d Q R X tj| tj| |tjtj9 }| ||S )Nrv   Ztao_lmvmrj   r`   r   r   rk   r   rf   ra   rc   )r   znltk-tadm-events-z.gz)r   suffixznltk-tadm-weights-)r   r   z-monitorz-methodz-l2z%.6fr.   z-max_itz%dz-fatolz
-events_inz-params_outz2>&1z-summary)r   r   rr   r   r   r   r	   r   extendr8   r   r   r   r   r   r   r   r   )rt   ru   rx   rv   rj   r   r   sigmarf   ra   r   Ztrainfile_fdr   Zweightfile_fdZweightfile_namer   r   Z
weightfiler   r   r   r   rr     sL    






zTadmMaxentClassifier.trainN)ry   rz   r{   r}   rr   r   r   r   r   rq     s   rq   c              C   s   ddl m}  | tj}d S )Nr   )
names_demo)nltk.classify.utilr   r   rr   )r   r   r   r   r   demo  s    r   __main__)r`   NN)r`   NN)r`   NNr   ).r|   r   ImportErrorr   r   collectionsr   Znltk.classify.apir   Znltk.classify.megamr   r   r   Znltk.classify.tadmr   r   r	   r   r
   r   r   Z	nltk.datar   Znltk.probabilityr   Z	nltk.utilr   Z__docformat__r   r   r~   r   r   r   r   r   ro   r   r   rn   r   r   rp   rq   r   ry   r   r   r   r   <module>5   sN     I/ L=8 k
a

[ 
[=
