"""Smoothing algorithms for language modeling.

According to Chen & Goodman 1995 these should work with both Backoff and
Interpolation.
"""

from operator import methodcaller

from nltk import ConditionalFreqDist
from nltk.lm.api import Smoothing


def _count_values_gt_zero(distribution):
    """Count values that are greater than zero in a distribution.

    Assumes distribution is either a mapping with counts as values or
    an instance of `nltk.ConditionalFreqDist`.
    """
    as_count = (
        methodcaller("N")
        if isinstance(distribution, ConditionalFreqDist)
        else lambda count: count
    )
    return sum(
        1 for dist_or_count in distribution.values() if as_count(dist_or_count) > 0
    )
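

# A small illustration (a sketch added here, not part of the original module):
# for a plain mapping the helper above counts positive values; for a
# ConditionalFreqDist it calls N() on each condition's FreqDist, so only
# conditions with at least one observation are counted.
def _demo_count_values_gt_zero():
    plain = {"a": 2, "b": 0, "c": 1}
    conditional = ConditionalFreqDist([("ctx1", "a"), ("ctx1", "b"), ("ctx2", "a")])
    # 2: only "a" and "c" have counts > 0.
    # 2: both conditions ("ctx1", "ctx2") have at least one observation.
    return _count_values_gt_zero(plain), _count_values_gt_zero(conditional)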


class WittenBell(Smoothing):
    """Witten-Bell smoothing."""

    def __init__(self, vocabulary, counter, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)

    def alpha_gamma(self, word, context):
        alpha = self.counts[context].freq(word)
        gamma = self._gamma(context)
        return (1.0 - gamma) * alpha, gamma

    def _gamma(self, context):
        n_plus = _count_values_gt_zero(self.counts[context])
        return n_plus / (n_plus + self.counts[context].N())

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)
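

# Sketch (not part of the original module): the (alpha, gamma) pairs returned
# by the Smoothing subclasses in this file are meant to be combined recursively
# by an interpolated model, per Chen & Goodman. That combination lives in
# nltk.lm's model classes, not here; `_interpolated_score_sketch` is a
# hypothetical stand-alone illustration of the contract.
def _interpolated_score_sketch(smoothing, word, context):
    """P(word | context) = alpha + gamma * P(word | shorter context)."""
    if not context:
        return smoothing.unigram_score(word)
    alpha, gamma = smoothing.alpha_gamma(word, context)
    return alpha + gamma * _interpolated_score_sketch(smoothing, word, context[1:])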


class AbsoluteDiscounting(Smoothing):
    """Smoothing with absolute discount."""

    def __init__(self, vocabulary, counter, discount=0.75, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount

    def alpha_gamma(self, word, context):
        alpha = (
            max(self.counts[context][word] - self.discount, 0)
            / self.counts[context].N()
        )
        gamma = self._gamma(context)
        return alpha, gamma

    def _gamma(self, context):
        n_plus = _count_values_gt_zero(self.counts[context])
        return (self.discount * n_plus) / self.counts[context].N()

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)


class KneserNey(Smoothing):
    """Kneser-Ney Smoothing.

    This is an extension of smoothing with a discount.

    Resources:
    - https://pages.ucsd.edu/~rlevy/lign256/winter2008/kneser_ney_mini_example.pdf
    - https://www.youtube.com/watch?v=ody1ysUTD7o
    - https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8
    - https://www.cl.uni-heidelberg.de/courses/ss15/smt/scribe6.pdf
    - https://www-i6.informatik.rwth-aachen.de/publications/download/951/Kneser-ICASSP-1995.pdf
    """

    def __init__(self, vocabulary, counter, order, discount=0.1, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount
        self._order = order

    def unigram_score(self, word):
        word_continuation_count, total_count = self._continuation_counts(word)
        return word_continuation_count / total_count

    def alpha_gamma(self, word, context):
        prefix_counts = self.counts[context]
        word_continuation_count, total_count = (
            (prefix_counts[word], prefix_counts.N())
            if len(context) + 1 == self._order
            else self._continuation_counts(word, context)
        )
        alpha = max(word_continuation_count - self.discount, 0.0) / total_count
        gamma = self.discount * _count_values_gt_zero(prefix_counts) / total_count
        return alpha, gamma

    def _continuation_counts(self, word, context=tuple()):
        """Count continuations that end with context and word.

        Continuations track unique ngram "types", regardless of how many
        instances were observed for each "type".
        This is different than raw ngram counts which track number of instances.
        """
        higher_order_ngrams_with_context = (
            count
            for prefix_ngram, count in self.counts[len(context) + 2].items()
            if prefix_ngram[1:] == context
        )
        higher_order_ngrams_with_word_count, total = 0, 0
        for count in higher_order_ngrams_with_context:
            higher_order_ngrams_with_word_count += int(count[word] > 0)
            total += _count_values_gt_zero(count)
        return higher_order_ngrams_with_word_count, total
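

# A minimal usage sketch (not part of the original module): these Smoothing
# classes are normally driven by the interpolated models elsewhere in nltk.lm
# (e.g. KneserNeyInterpolated, WittenBellInterpolated) rather than used
# directly. The imports and calls below come from those other nltk.lm modules
# and are assumed here for illustration only.
if __name__ == "__main__":
    from nltk.lm import KneserNeyInterpolated
    from nltk.lm.preprocessing import padded_everygram_pipeline

    sentences = [["a", "b", "c"], ["a", "c", "b"]]
    train, vocab = padded_everygram_pipeline(2, sentences)
    lm = KneserNeyInterpolated(2)  # bigram model with Kneser-Ney smoothing
    lm.fit(train, vocab)
    print(lm.score("b", ["a"]))  # smoothed P(b | a)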