3
d                 @   sZ   d dl mZ d dlmZ d dlmZmZ ejZeedddddZ	de	_
d	d
 Zdd ZdS )    )partial)chain)
everygramspad_sequenceTz<s>z</s>)Zpad_leftZleft_pad_symbolZ	pad_rightZright_pad_symbolzPads both ends of a sentence to length specified by ngram order.

    Following convention <s> pads the start of sentence </s> pads its end.
    c             C   s   t tt|| d| dS )zpHelper with some useful defaults.

    Applies pad_both_ends to sentence and follows it up with everygrams.
    )n)max_len)r   listpad_both_ends)orderZsentence r   5/tmp/pip-build-v9q4h5k9/nltk/nltk/lm/preprocessing.pypadded_everygrams   s    r   c                s.   t t d fdd|D tt|fS )a  Default preprocessing for a sequence of sentences.

    Creates two iterators:

    - sentences padded and turned into sequences of `nltk.util.everygrams`
    - sentences padded as above and chained together for a flat stream of words

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences.
    :type text: Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    )r   c             3   s"   | ]}t t| d V  qdS ))r   N)r   r   ).0sent)r
   
padding_fnr   r   	<genexpr>1   s    z,padded_everygram_pipeline.<locals>.<genexpr>)r   r	   flattenmap)r
   textr   )r
   r   r   padded_everygram_pipeline"   s    r   N)	functoolsr   	itertoolsr   Z	nltk.utilr   r   from_iterabler   r	   __doc__r   r   r   r   r   r   <module>   s   