import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
    """
    List of contractions adapted from Robert MacIntyre's tokenizer.
    """

    CONTRACTIONS2 = [
        r"(?i)\b(can)(?#X)(not)\b",
        r"(?i)\b(d)(?#X)('ye)\b",
        r"(?i)\b(gim)(?#X)(me)\b",
        r"(?i)\b(gon)(?#X)(na)\b",
        r"(?i)\b(got)(?#X)(ta)\b",
        r"(?i)\b(lem)(?#X)(me)\b",
        r"(?i)\b(more)(?#X)('n)\b",
        r"(?i)\b(wan)(?#X)(na)(?=\s)",
    ]
    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]


class NLTKWordTokenizer(TokenizerI):
    """
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the tokenizer that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" such that the regexes applied will munge the
    input string to a state beyond re-construction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
    `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
    revert to the original string.
    u   ([«“‘„]|[`]+)z \1 z^\"z``z(``)z([ \(\[{<])(\"|\'{2})z\1 `` z$(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\bz\1 \2u   ([»”’])z''z '' "z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) u&   ([^\.])(\.)([\]\)}>"\'»”’ ]*)\s*$z	\1 \2 \3 z([:,])([^\d])z \1 \2z([:,])$z\.{2,}z \g<0> z[;@#$%&]z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[*]z[\]\[\(\)\{\}\<\>]z\(z-LRB-z\)z-RRB-z\[z-LSB-z\]z-RSB-z\{z-LCB-z\}z-RCB-z--z -- F)textconvert_parentheses
return_strreturnc             C   s  |rt jdtdd x| jD ]\}}|j||}qW x| jD ]\}}|j||}q<W | j\}}|j||}|rx| jD ]\}}|j||}qvW | j\}}|j||}d| d }x| j	D ]\}}|j||}qW x| j
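
    # Editor's note: an illustrative sketch, not part of the original module.
    # The tables above are applied in a fixed order inside ``tokenize``; for
    # example, PARENS_BRACKETS pads brackets with spaces before the optional
    # CONVERT_PARENTHESES pass rewrites them as PTB symbols:
    #
    #     >>> regexp, substitution = NLTKWordTokenizer.PARENS_BRACKETS
    #     >>> regexp.sub(substitution, "(York)")
    #     ' ( York ) '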
D ]}|jd|}qW x| jD ]}|jd|}qW |j S )ab  Return a tokenized copy of `text`.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> NLTKWordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True)
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, return_str=True)
        ' Good muffins cost  $ 3.88  ( roughly 3,36 euros ) \nin New York.  Please buy me\ntwo of them.\nThanks  .  '

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses with PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)

        # Optionally convert parentheses to PTB symbols.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Add extra space to make things easier.
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        return text.split()

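    # Editor's note: an illustrative sketch, not part of the original module.
    # ``tokenize`` normalizes double quotes to `` and '', e.g.:
    #
    #     >>> NLTKWordTokenizer().tokenize('He said, "hi".')
    #     ['He', 'said', ',', '``', 'hi', "''", '.']
    #
    # ``span_tokenize`` below therefore maps `` and '' back to the quote
    # characters actually present in ``text`` before aligning offsets.
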
zNLTKWordTokenizer.tokenize)r   r   c             #   s\   | j |}d|ksd|krDdd tjd|D   fdd|D }n|}t||E dH  dS )a}  
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        r   z''c             S   s   g | ]}|j  qS r   )group).0mr   r   r   
<listcomp>   s    z3NLTKWordTokenizer.span_tokenize.<locals>.<listcomp>z
``|'{2}|\"c                s"   g | ]}|dkr j dn|qS )r   ``''r   )r   r,   r-   )pop)r)   tok)matchedr   r   r+      s   N)r'   refinditerr   )r%   r   Z
raw_tokenstokensr   )r0   r   span_tokenize   s    


zNLTKWordTokenizer.span_tokenizeN)FF)r   r	   r
   r   r1   compileUr   r#   r   r    r!   r"   r   Z_contractionslistmapr   r   strboolr   r'   r   r   intr4   r   r   r   r   r   %   sJ   Gr   )r1   r   Ztypingr   r   r   Znltk.tokenize.apir   Znltk.tokenize.utilr   r   r   r   r   r   r   <module>
   s   
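

# Editor's note: a minimal usage sketch added for illustration; it is not part
# of the original module and simply mirrors the doctests above.
if __name__ == "__main__":
    _tokenizer = NLTKWordTokenizer()
    _s = "Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks."

    # Plain tokenization keeps parentheses as ( and ).
    print(_tokenizer.tokenize(_s))

    # With convert_parentheses=True they become PTB symbols -LRB- / -RRB-.
    print(_tokenizer.tokenize(_s, convert_parentheses=True))

    # span_tokenize yields (start, end) character offsets into _s.
    print(list(_tokenizer.span_tokenize(_s)))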