r"""

Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
    N)IteratorListTuple)
TokenizerI)MacIntyreContractions)align_tokensc               @   s  e Zd ZdZejddfejddfejddfgZejdd	fejd
dfejddfejddfejddfejddfejddfgZejddfZejddfejddfejddfejddfejddfejdd fgZ	ejd!d"fZ
ejd#d$fejd%d$fejd&d'fejd(d'fgZe ZeeejejZeeejejZd1eeeee d*d+d,Zeeeeef  d-d.d/Zd0S )2TreebankWordTokenizera	  
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
    >>> TreebankWordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
    >>> s = "They'll save and invest more."
    >>> TreebankWordTokenizer().tokenize(s)
    ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    >>> s = "hi, my name can't hello,"
    >>> TreebankWordTokenizer().tokenize(s)
    ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    z^\"z``z(``)z \1 z([ \(\[{<])(\"|\'{2})z\1 `` z([:,])([^\d])z \1 \2z([:,])$z\.\.\.z ... z[;@#$%&]z \g<0> z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[\]\[\(\)\{\}\<\>]z\(z-LRB-z\)z-RRB-z\[z-LSB-z\]z-RSB-z\{z-LCB-z\}z-RCB-z--z -- z''z '' "z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) F)textconvert_parentheses
return_strreturnc             C   s  |dk	rt jdtdd x| jD ]\}}|j||}q W x| jD ]\}}|j||}q@W | j\}}|j||}|rx| jD ]\}}|j||}qzW | j\}}|j||}d| d }x| j	D ]\}}|j||}qW x| j
D ]}|jd|}qW x| jD ]}|jd|}qW |j S )aq  Return a tokenized copy of `text`.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True)
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> TreebankWordTokenizer().tokenize(s, return_str=True)
        ' Good muffins cost  $ 3.88  ( roughly 3,36 euros ) \nin New York.  Please buy me\ntwo of them.\nThanks .  '

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str is not False:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses to PTB symbols.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Add extra space to make things easier.
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        return text if return_str else text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes.
        # Do this only if the original text contains double quotes or double
        # single-quotes (because '' might be transformed to `` when it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes.
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes.
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)


class TreebankWordDetokenizer(TokenizerI):
    r"""
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:

    - Additional assumptions are made when undoing the padding of the
      ``[;@#$%&]`` punctuation symbols, assumptions that are not presupposed
      by the TreebankTokenizer.
    - Additional regexes are used to reverse the parentheses tokenization,
      such as ``r'([\]\)\}\>])\s([:;,.])'``, which removes the extra right
      padding added to closing parentheses preceding ``[:;,.]``.
    - It is not possible to restore the original whitespace exactly, because
      there is no explicit record of where ``'\n'``, ``'\t'`` or ``'\s'``
      were removed by the ``text.split()`` operation.

    >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
    >>> d = TreebankWordDetokenizer()
    >>> t = TreebankWordTokenizer()
    >>> toks = t.tokenize(s)
    >>> d.detokenize(toks)
    'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
    parameter:

    >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
    True
    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
    True

    During tokenization it's safe to add extra spaces, but during
    detokenization simply undoing the padding doesn't really help:

    - During tokenization, ``[!?]`` is padded on both sides; when
      detokenizing, only the left pad needs to be removed.
      Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.

    - During tokenization ``[:,]`` is padded on both sides, but when
      detokenizing only the left pad is removed, and the right pad after a
      comma/colon is kept when the following string is a non-digit.
      Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.

    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
    >>> twd = TreebankWordDetokenizer()
    >>> twd.detokenize(toks)
    "hello, i can't feel my feet! Help!!"

    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
    >>> twd.detokenize(toks)
    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    """

    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = [
        re.compile(pattern.replace("(?#X)", r"\s"))
        for pattern in _contractions.CONTRACTIONS2
    ]
    CONTRACTIONS3 = [
        re.compile(pattern.replace("(?#X)", r"\s"))
        for pattern in _contractions.CONTRACTIONS3
    ]

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
        (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
        (re.compile(r"(\S)\s(\'\')"), r"\1\2"),
        (re.compile(r"(\'\')\s([.,:)\]>};%])"), r"\1\2"),
        (re.compile(r"''"), '"'),
    ]

    # Undo padding on double dashes.
    DOUBLE_DASHES = (re.compile(r" -- "), r"--")

    # Optionally converts PTB symbols back to parentheses and brackets.
    CONVERT_PARENTHESES = [
        (re.compile("-LRB-"), "("),
        (re.compile("-RRB-"), ")"),
        (re.compile("-LSB-"), "["),
        (re.compile("-RSB-"), "]"),
        (re.compile("-LCB-"), "{"),
        (re.compile("-RCB-"), "}"),
    ]

    # Undo padding on parentheses and brackets.
    PARENS_BRACKETS = [
        (re.compile(r"([\[\(\{\<])\s"), r"\g<1>"),
        (re.compile(r"\s([\]\)\}\>])"), r"\g<1>"),
        (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"),
    ]

    # punctuation
    PUNCTUATION = [
        (re.compile(r"([^'])\s'\s"), r"\1' "),
        (re.compile(r"\s([?!])"), r"\g<1>"),  # Strip left pad for [?!].
        (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"),
        # During tokenization, [;@#$%&] are padded on both sides regardless
        # of context; when detokenizing, the left and right pads are undone
        # separately.
        (re.compile(r"([#$])\s"), r"\g<1>"),  # Left pad.
        (re.compile(r"\s([;%])"), r"\g<1>"),  # Right pad.
        (re.compile(r"\s\.\.\.\s"), r"..."),
        (re.compile(r"\s([:,])"), r"\1"),  # Just remove the left pad.
    ]

    # starting quotes
    STARTING_QUOTES = [
        (re.compile(r"([ (\[{<])\s``"), r"\1``"),
        (re.compile(r"(``)\s"), r"\1"),
        (re.compile(r"``"), r'"'),
    ]

    def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
        """
        Treebank detokenizer, created by undoing the regexes from
        the TreebankWordTokenizer.tokenize.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: List[str]
        :param convert_parentheses: if True, replace PTB symbols with parentheses,
            e.g. `-LRB-` to `(`. Defaults to False.
        :type convert_parentheses: bool, optional
        :return: str
        """
        text = " ".join(tokens)

        # Reverse the contraction regexes.
        # Note: CONTRACTIONS4 are not used in tokenization.
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r"\1\2", text)
        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r"\1\2", text)

        # Reverse the regexes applied for ending quotes.
        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        # Undo the space padding.
        text = text.strip()

        # Reverse the padding on double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Optionally convert PTB symbols back to parentheses.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Reverse the padding regexes applied for parentheses and brackets.
        for regexp, substitution in self.PARENS_BRACKETS:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for punctuation.
        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for starting quotes.
        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        return text.strip()

    def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
        """Duck-typing the abstract *tokenize()*."""
        return self.tokenize(tokens, convert_parentheses)
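

# A minimal usage sketch (illustrative; not part of the original module):
# round-trips a sentence through the tokenizer and the detokenizer, and
# checks that span_tokenize() offsets point back into the original string.
if __name__ == "__main__":
    tokenizer = TreebankWordTokenizer()
    detokenizer = TreebankWordDetokenizer()

    sentence = "They'll save and invest more (in New York)."
    tokens = tokenizer.tokenize(sentence)
    print(tokens)
    # ['They', "'ll", 'save', 'and', 'invest', 'more',
    #  '(', 'in', 'New', 'York', ')', '.']

    # Each (start, end) span recovers the corresponding token verbatim.
    spans = list(tokenizer.span_tokenize(sentence))
    assert [sentence[start:end] for start, end in spans] == tokens

    # Detokenization undoes the padding (whitespace is normalized,
    # not restored).
    print(detokenizer.detokenize(tokens))
    # They'll save and invest more (in New York).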