"""
This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
    N)perluniprops)
TokenizerI)xml_unescapec               @   sf  e Zd ZdZejddfZejddfZejddfZejdd	fZ	ejd
dfZ
ejdd	fZee	e
egZedjeejdZedjeejdZedjeejdZejddeZejddeZejddeZejddfZejde de dd	fZejde de ddfZejde ddfZeeeegZdd ZdddZd ddZ dS )!NISTTokenizeruT  
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-v14.pl; the sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
    True

    international_tokenize() is the preferred function when tokenizing
    non-European text, e.g.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'阿里巴巴集团控股', u'有限公司', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'ˈæ', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'楽天株式会社', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for patching issue #1926
    >>> sent = u'this is a foo☄sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'☄', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
    """

    # Strip "skipped" tags.
    STRIP_SKIP = re.compile('<skipped>'), ''
    # Strip end-of-line hyphenation and join lines.
    STRIP_EOL_HYPHEN = re.compile('\u2028'), ' '
    # Tokenize punctuation.
    PUNCT = re.compile('([\{-\~\[-\` -\&\(-\+\:-\@\/])'), ' \\1 '
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile('([^0-9])([\.,])'), '\\1 \\2 '
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile('([\.,])([^0-9])'), ' \\1 \\2'
    # Tokenize dash when preceded by a digit.
    DASH_PRECEED_DIGIT = re.compile('([0-9])(-)'), '\\1 \\2 '

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]

    # Perluniprops character classes used by the NIST tokenizer.
    pup_number = str(''.join(set(perluniprops.chars('Number'))))  # i.e. \p{N}
    pup_punct = str(''.join(set(perluniprops.chars('Punctuation'))))  # i.e. \p{P}
    pup_symbol = str(''.join(set(perluniprops.chars('Symbol'))))  # i.e. \p{S}

    # Python regexes need to escape some special characters inside a
    # character class, i.e. ], ^, \ and -.
    number_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_number)
    punct_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_punct)
    symbol_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_symbol)

    # Pads non-ASCII strings with space.
    NONASCII = re.compile('([\x00-\x7f]+)'), r' \1 '
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = re.compile('([' + number_regex + '])([' + punct_regex + '])'), '\\1 \\2 '
    PUNCT_2 = re.compile('([' + punct_regex + '])([' + number_regex + '])'), ' \\1 \\2'
    # Tokenize symbols.
    SYMBOLS = re.compile('([' + symbol_regex + '])'), ' \\1 '

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
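    # A brief illustration (not from the original source) of two of the
    # language dependent regexes above; each entry is a
    # (compiled pattern, substitution) pair fed to regexp.sub():
    #
    #     >>> regexp, substitution = NISTTokenizer.PERIOD_COMMA_PRECEED
    #     >>> regexp.sub(substitution, 'hi.')
    #     'hi . '
    #     >>> regexp, substitution = NISTTokenizer.DASH_PRECEED_DIGIT
    #     >>> regexp.sub(substitution, '9-5')
    #     '9 - 5'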
    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # It would arguably be cleaner to unescape after STRIP_EOL_HYPHEN,
        # but this keeps the order of operations close to the original
        # NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text
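    # A minimal illustration (not part of the original module) of what
    # lang_independent_sub() does, assuming the perluniprops corpus needed
    # by the class attributes above is installed: the <skipped> tag is
    # stripped and XML entities are unescaped.
    #
    #     >>> NISTTokenizer().lang_independent_sub('foo &amp; bar<skipped>')
    #     'foo & bar'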
    def tokenize(self, text, lowercase=False, western_lang=True,
                 return_str=False):
        text = str(text)
        # Language independent regexes.
        text = self.lang_independent_sub(text)
        # Language dependent regexes.
        if western_lang:
            # Pad string with whitespace.
            text = ' ' + text + ' '
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespaces.
        text = ' '.join(text.split())
        # Finally, strip heading and trailing spaces.
        text = str(text.strip())
        return text if return_str else text.split()
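    # Illustrative example (not in the original source): with
    # western_lang=False only the language independent substitutions and
    # whitespace normalisation are applied, so no punctuation is split off:
    #
    #     >>> NISTTokenizer().tokenize('Hello, world!', western_lang=False)
    #     ['Hello,', 'world!']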
zNISTTokenizer.tokenizec             C   s   t |}| j\}}|j||}| j\}}|j||}t|}|rH|j }x| jD ]\}}|j||}qPW dj|j j	 }|r|S |j	 S )Nr   )
r   r	   r
   r   r   r   INTERNATIONAL_REGEXESr   r   r   )r   r   r   Zsplit_non_asciir   r   r   r   r   r   international_tokenize   s    

z$NISTTokenizer.international_tokenizeN)FTF)FTF)!__name__
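    # Illustrative example (not in the original source): return_str=True
    # returns the normalised string instead of a token list, matching the
    # issue #1926 doctest above:
    #
    #     >>> NISTTokenizer().international_tokenize(u'foo☄bar', return_str=True)
    #     'foo ☄ bar'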