r"""
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    >>> s.split()
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ')
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n')
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
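
For instance, a corpus reader can be told to treat whole lines as its
"words" along the following lines (a minimal sketch: the corpus root
and file pattern below are hypothetical placeholders):

    >>> from nltk.corpus.reader import PlaintextCorpusReader
    >>> from nltk.tokenize import LineTokenizer
    >>> reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt',
    ...                                word_tokenizer=LineTokenizer())  # doctest: +SKIP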

"""

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    _string = ' '


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = '\t'


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
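
    A quick illustration (the behavior follows directly from the two
    methods below):

        >>> from nltk.tokenize.simple import CharTokenizer
        >>> CharTokenizer().tokenize('abc')
        ['a', 'b', 'c']
        >>> list(CharTokenizer().span_tokenize('abc'))
        [(0, 1), (1, 2), (2, 3)]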
    """

    def tokenize(self, s):
        # A character tokenization is simply the list of characters.
        return list(s)

    def span_tokenize(self, s):
        # Each character occupies the span (i, i+1).
        yield from enumerate(range(1, len(s) + 1))


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
           A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
           a corresponding token ``''`` after that newline.
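
        For example, ``discard-eof`` keeps interior blank lines but drops a
        blank final line (the input string here is chosen just for
        illustration):

        >>> LineTokenizer(blanklines='discard-eof').tokenize('a\n\nb\n\n')
        ['a', '', 'b']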
    Údiscardc             C   s(   d}||krt ddj|ƒ ƒ‚|| _d S )Nr   Úkeepúdiscard-eofzBlank lines must be one of: %sr   )r   r   r   )Ú
ValueErrorÚjoinÚ_blanklines)r   Ú
blanklinesZvalid_blanklinesr   r   r   Ú__init__g   s
    zLineTokenizer.__init__c             C   sJ   |j ƒ }| jdkr"dd„ |D ƒ}n$| jdkrF|rF|d jƒ  rF|jƒ  |S )Nr   c             S   s   g | ]}|j ƒ r|‘qS r   )Úrstrip)Ú.0Úlr   r   r   ú
<listcomp>t   s    z*LineTokenizer.tokenize.<locals>.<listcomp>zdiscard-eofr   éÿÿÿÿ)Ú
splitlinesr!   ÚstripÚpop)r   r   Úlinesr   r   r   r   p   s    

zLineTokenizer.tokenizec             c   s0   | j dkrt|dƒE d H  nt|dƒE d H  d S )Nr   z\nz
\n(\s+\n)*)r!   r   r   )r   r   r   r   r   r   {   s    
zLineTokenizer.span_tokenizeN)r   )r   r	   r
   r   r#   r   r   r   r   r   r   r   P   s   
	r   r   c             C   s   t |ƒj| ƒS )N)r   r   )Útextr"   r   r   r   Úline_tokenizeˆ   s    r.   N)r   )r   Znltk.tokenize.apir   r   Znltk.tokenize.utilr   r   r   r   r   r   r.   r   r   r   r   Ú<module>$   s   8
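

# A short usage sketch for ``line_tokenize`` (the results follow directly
# from ``LineTokenizer`` above):
#
#     >>> from nltk.tokenize.simple import line_tokenize
#     >>> line_tokenize("one\n\ntwo\n")
#     ['one', 'two']
#     >>> line_tokenize("one\n\ntwo\n", blanklines='keep')
#     ['one', '', 'two']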