# -*- coding: utf-8 -*-
"""
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only the final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
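
For example, only the line-final period is split off, while sentence-internal
periods (as in abbreviations or decimals) are left attached:

    >>> from nltk.tokenize.toktok import ToktokTokenizer
    >>> ToktokTokenizer().tokenize(u'Mr. Smith paid 9.5 dollars.')
    ['Mr.', 'Smith', 'paid', '9.5', 'dollars', '.']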
"""

import re

from nltk.tokenize.api import TokenizerI


class ToktokTokenizer(TokenizerI):
    """
    This is a Python port of the tok-tok.pl from
    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

    >>> toktok = ToktokTokenizer()
    >>> text = u'Is 9.5 or 525,600 my favorite number?'
    >>> print(toktok.tokenize(text, return_str=True))
    Is 9.5 or 525,600 my favorite number ?
    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
    >>> print(toktok.tokenize(text, return_str=True))
    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
    >>> text = u'¡This, is a sentence with weird» symbols… appearing everywhere¿'
    >>> expected = u'¡ This , is a sentence with weird » symbols … appearing everywhere ¿'
    >>> assert toktok.tokenize(text, return_str=True) == expected
    >>> toktok.tokenize(text) == [u'¡', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'»', u'symbols', u'…', u'appearing', u'everywhere', u'¿']
    True
    """

    # Replace non-breaking spaces with normal spaces.
    NON_BREAKING = re.compile(u"\u00A0"), " "

    # Pad some funky punctuation.
    FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
    # Pad more funky punctuation.
    FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r" \1 "
    # Pad en dash and em dash.
    EN_EM_DASHES = re.compile(u"([–—])"), r" \1 "

    # Replace problematic characters with numeric character references.
    AMPERCENT = re.compile("& "), "&amp; "
    TAB = re.compile("\t"), " &#9; "
    PIPE = re.compile(r"\|"), " &#124; "

    # Pad commas unless they sit inside a number (e.g. 525,600).
    COMMA_IN_NUM = re.compile(r"(?<!,)([,،])(?![,\d])"), r" \1 "

    # Pad problematic single quotes and backticks.
    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r" \1 "
    # Re-group split quote pairs into `` and '' tokens.
    STUPID_QUOTES_1 = re.compile(u" ` ` "), r" `` "
    STUPID_QUOTES_2 = re.compile(u" ' ' "), r" '' "

    # Don't tokenize a period unless it ends the line and isn't
    # preceded by another period (e.g. an ellipsis).
    FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
    # Likewise for a final period followed by a closing quote,
    # e.g. '... stuff."' -> '... stuff . "'
    FINAL_PERIOD_2 = re.compile(r'''(?<!\.)\.\s*(["'’»›”]) *$'''), r" . \1"

    # Treat continuous commas as one token, e.g. ",," -> " ,, ".
    MULTI_COMMAS = re.compile(r"(,{2,})"), r" \1 "
    # Treat continuous dashes as one token, e.g. "--" -> " -- ".
    MULTI_DASHES = re.compile(r"(-{2,})"), r" \1 "
    # Treat multiple periods (e.g. an ellipsis) as one token.
    MULTI_DOTS = re.compile(r"(\.{2,})"), r" \1 "

    # This is the \p{Open_Punctuation} from Perl's perluniprops.
    OPEN_PUNCT = str(u"([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝（［｛｟｢")
    # This is the \p{Close_Punctuation} from Perl's perluniprops.
    CLOSE_PUNCT = str(u")]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞）］｝｠｣")
    # This is the \p{Currency_Symbol} from Perl's perluniprops.
    CURRENCY_SYM = str(u"$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩＄￠￡￥￦")

    # Pad spaces after opening punctuations.
    OPEN_PUNCT_RE = re.compile(u"([" + OPEN_PUNCT + u"])"), r"\1 "
    # Pad spaces after closing punctuations.
    CLOSE_PUNCT_RE = re.compile(u"([" + CLOSE_PUNCT + u"])"), r"\1 "
    # Pad spaces after currency symbols.
    CURRENCY_SYM_RE = re.compile(u"([" + CURRENCY_SYM + u"])"), r"\1 "

    # Use for tokenizing URL-unfriendly characters: [:/?#]
    URL_FOE_1 = re.compile(r":(?!//)"), r" : "   # in Perl, s{:(?!//)}{ : }g;
    URL_FOE_2 = re.compile(r"\?(?!\S)"), r" ? "  # in Perl, s{\?(?!\S)}{ ? }g;
    # in Perl, m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
    URL_FOE_3 = re.compile(r"(:\/\/)[\S+\.\S+\/\S+][\/]"), " / "
    URL_FOE_4 = re.compile(r" /"), r" / "        # in Perl, s{ /}{ / }g;

    # Left/right strip, i.e. remove leading/trailing spaces.
    # These are superseded by the `strip()` call in `tokenize`.
    LSTRIP = re.compile(r"^ +"), ""
    RSTRIP = re.compile(r"\s+$"), "\n"
    # Merge multiple spaces.
    ONE_SPACE = re.compile(r" {2,}"), " "

    # The order matters; note that FINAL_PERIOD_2 is applied twice.
    TOKTOK_REGEXES = [
        NON_BREAKING, FUNKY_PUNCT_1,
        URL_FOE_1, URL_FOE_2, URL_FOE_3, URL_FOE_4,
        AMPERCENT, TAB, PIPE,
        OPEN_PUNCT_RE, CLOSE_PUNCT_RE,
        MULTI_COMMAS, COMMA_IN_NUM, FINAL_PERIOD_2,
        PROB_SINGLE_QUOTES, STUPID_QUOTES_1, STUPID_QUOTES_2,
        CURRENCY_SYM_RE, EN_EM_DASHES, MULTI_DASHES, MULTI_DOTS,
        FINAL_PERIOD_1, FINAL_PERIOD_2, ONE_SPACE,
    ]

    def tokenize(self, text, return_str=False):
        text = str(text)  # Ensure we are working with a string.
        # Apply each (regexp, substitution) pair in order.
        for regexp, substitution in self.TOKTOK_REGEXES:
            text = regexp.sub(substitution, text)
        # Finally, strip leading/trailing spaces.
        text = str(text.strip())
        return text if return_str else text.split()
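

# Minimal usage sketch (added illustration): each TOKTOK_REGEXES entry is a
# (compiled_pattern, substitution) pair, and ``tokenize`` simply threads the
# text through every pair in order.
if __name__ == "__main__":
    toktok = ToktokTokenizer()
    demo = u"¿Tok-tok costs $5.5, right?"
    # A single rule pair applied by hand pads just the currency symbol:
    regexp, substitution = ToktokTokenizer.CURRENCY_SYM_RE
    print(regexp.sub(substitution, demo))          # ¿Tok-tok costs $ 5.5, right?
    # The full cascade also pads the inverted question mark, the comma
    # (it is not inside a number), and the final question mark:
    print(toktok.tokenize(demo, return_str=True))  # ¿ Tok-tok costs $ 5.5 , right ?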