"""Language Model Interface."""

import random
from abc import ABCMeta, abstractmethod
from bisect import bisect
from itertools import accumulate

from nltk.lm.counter import NgramCounter
from nltk.lm.util import log_base2
from nltk.lm.vocabulary import Vocabulary


class Smoothing(metaclass=ABCMeta):
    """Ngram Smoothing Interface

    Implements Chen & Goodman 1995's idea that all smoothing algorithms have
    certain features in common. This should ideally allow smoothing algorithms to
    work both with Backoff and Interpolation.
    """

    def __init__(self, vocabulary, counter):
        """
        :param vocabulary: The Ngram vocabulary object.
        :type vocabulary: nltk.lm.vocab.Vocabulary
        :param counter: The counts of the vocabulary items.
        :type counter: nltk.lm.counter.NgramCounter
        """
        self.vocab = vocabulary
        self.counts = counter

    @abstractmethod
    def unigram_score(self, word):
        raise NotImplementedError()

    @abstractmethod
    def alpha_gamma(self, word, context):
        raise NotImplementedError()
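
# A minimal illustrative sketch (not part of this module's API) of what a
# concrete ``Smoothing`` subclass can look like. It assumes the convention used
# by interpolated models, where the final score is ``alpha + gamma *
# lower_order_score``; the class name and the Witten-Bell-style estimates are
# for demonstration only.
class _WittenBellSketch(Smoothing):
    def unigram_score(self, word):
        # Relative frequency of the word among all observed unigrams.
        return self.counts.unigrams.freq(word)

    def alpha_gamma(self, word, context):
        # Frequency distribution of the words that follow this context.
        conditional = self.counts[len(context) + 1][context]
        n_tokens = conditional.N()
        n_types = len(conditional)
        if n_tokens == 0:
            # No evidence for this context: defer entirely to the lower order.
            return 0.0, 1.0
        gamma = n_types / (n_tokens + n_types)
        alpha = (1.0 - gamma) * conditional[word] / n_tokens
        return alpha, gamma
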
def _mean(items):
    """Return average (aka mean) for sequence of items."""
    return sum(items) / len(items)


def _random_generator(seed_or_generator):
    if isinstance(seed_or_generator, random.Random):
        return seed_or_generator
    return random.Random(seed_or_generator)


def _weighted_choice(population, weights, random_generator=None):
    """Like random.choice, but with weights.

    Heavily inspired by python 3.6 `random.choices`.
    """
    if not population:
        raise ValueError("Can't choose from empty population")
    if len(population) != len(weights):
        raise ValueError("The number of weights does not match the population")
    cum_weights = list(accumulate(weights))
    total = cum_weights[-1]
    threshold = random_generator.random()
    return population[bisect(cum_weights, total * threshold)]
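
# Illustrative usage of the helper above: pick one item from a population with
# probability proportional to its weight (the weights need not sum to 1).
#
#     rng = _random_generator(3)
#     word = _weighted_choice(["a", "b", "c"], (0.1, 0.1, 0.8), rng)  # usually 'c'
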
ZdddZ	dd Z
dd Zdd ZdddZdS )LanguageModelzKABC for Language Models.

    Cannot be directly instantiated itself.

    Nc             C   s2   || _ |dkrt n|| _|dkr(t n|| _dS )as  Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type vocabulary: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned to ngram
            sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text are padded.
        :type pad_fn: function or None
        N)orderr   r
   r   r   )r   r0   r   r   r   r   r   r   O   s    zLanguageModel.__init__c                s@    j s"|dkrtd j j|  jj fdd|D  dS )zeTrains the model on a text.

        :param text: Training text as a sequence of sentences.

        Nz:Cannot fit without a vocabulary or text to create it from.c             3   s   | ]} j j|V  qd S )N)r
   lookup).0sent)r   r   r   	<genexpr>m   s    z$LanguageModel.fit.<locals>.<genexpr>)r
   r&   updater   )r   textZvocabulary_textr   )r   r   fita   s    zLanguageModel.fitc             C   s$   | j | jj||r| jj|ndS )zMasks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        N)unmasked_scorer
   r1   )r   r   r   r   r   r   scoreo   s    zLanguageModel.scorec             C   s
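
    # Illustrative note: ``score`` masks its arguments before delegating, so an
    # out-of-vocabulary word is scored as the vocabulary's unknown label
    # ("<UNK>" by default in ``nltk.lm.Vocabulary``):
    #
    #     lm.score("unseen-word")  # equivalent to lm.unmasked_score("<UNK>")
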
    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param context: Context the word is in.
            If `None`, compute unigram score.
        :type context: tuple(str) or None
        :rtype: float
        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )
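
    # Illustrative (assuming a trained bigram model ``lm``): with a context this
    # indexes the counter by ngram order; without one it falls back to unigrams.
    #
    #     lm.context_counts(("a",))  # FreqDist of words observed after "a"
    #     lm.context_counts(None)    # FreqDist over all unigrams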

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        """
        return pow(2.0, self.entropy(text_ngrams))
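
    # Illustrative relationship between the two methods above (the model setup
    # is an assumption; any fitted model works the same way):
    #
    #     from nltk.lm import MLE
    #     from nltk.lm.preprocessing import padded_everygram_pipeline
    #     train, vocab = padded_everygram_pipeline(2, [["a", "b", "c"]])
    #     lm = MLE(2)
    #     lm.fit(train, vocab)
    #     test = [("a", "b"), ("b", "c")]
    #     lm.perplexity(test) == pow(2.0, lm.entropy(test))  # True by definition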
|D ]}|jjd|| |d qW |S )a  Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        Nr$   c             3   s   | ]}j | V  qd S )N)r9   )r2   w)r   r   r   r   r4      s    z)LanguageModel.generate.<locals>.<genexpr>)	num_words	text_seedrandom_seed)r'   r#   r   r0   r;   r
   r1   sortedr.   tuplerangeappendgenerate)r   rB   rC   rD   r*   Zsamples	generated_r   )r   r   r   rI      s,    "zLanguageModel.generate)NN)N)N)N)N)r$   NN)r   r   r   r   r   r7   r9   r   r8   r:   r;   r>   r@   rI   r   r   r   r   r/   H   s   




r/   )N)r   r!   abcr   r   r   	itertoolsr   Znltk.lm.counterr   Znltk.lm.utilr   Znltk.lm.vocabularyr   r	   r   r#   r.   r/   r   r   r   r   <module>   s   
