3
d                 @   st   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlmZ d dlmZ dZG dd deZdS )	    N)PIPE)_java_optionsconfig_javafind_jarjava)CoreNLPParser)
TokenizerIz1https://nlp.stanford.edu/software/tokenizer.shtmlc               @   s<   e Zd ZdZdZdddZed	d
 Zdd ZdddZ	dS )StanfordTokenizera$  
    Interface to the Stanford Tokenizer

    >>> from nltk.tokenize.stanford import StanfordTokenizer
    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    >>> StanfordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> s = "The colour of the wall is blue."
    >>> StanfordTokenizer(options={"americanize": True}).tokenize(s)
    ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
    zstanford-postagger.jarNutf8F-mx1000mc             C   sf   t jtdtdd t| j|d	f t|d| _|| _|| _	|d krDi n|}dj
dd |j D | _d S )
Nzz
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.'   )
stacklevelSTANFORD_POSTAGGER)Zenv_varsZ
searchpathurlverbose,c             s   s    | ]\}}| d | V  qdS )=N ).0keyvalr   r   6/tmp/pip-build-v9q4h5k9/nltk/nltk/tokenize/stanford.py	<genexpr>E   s    z-StanfordTokenizer.__init__.<locals>.<genexpr>)r   )warningswarnstrDeprecationWarningr   _JAR_stanford_url_stanford_jar	_encodingjava_optionsjoinitems_options_cmd)selfZpath_to_jarencodingoptionsr   r!   r   r   r   __init__%   s     	
zStanfordTokenizer.__init__c             C   s   | j  S )N)
splitlines)sr   r   r   _parse_tokenized_outputG   s    z)StanfordTokenizer._parse_tokenized_outputc             C   s   dg}| j | j||S )zW
        Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
        z%edu.stanford.nlp.process.PTBTokenizer)r+   _execute)r%   r*   cmdr   r   r   tokenizeK   s    zStanfordTokenizer.tokenizec       
      C   s   | j }|jd|g | j}|r.|jd| jg djt}t| j|d tjddd\}t	|t
rn|rn|j|}|j| |j  |j|j t|| jttd\}}	|j|}W d Q R X tj|j t|dd |S )	Nz-charsetz-options )r'   r   wbF)modedelete)Z	classpathstdoutstderr)r    extendr$   r"   r   r   r!   tempfileNamedTemporaryFile
isinstancer   encodewriteflushappendnamer   r   r   decodeosunlink)
r%   r-   Zinput_r   r&   r$   Zdefault_optionsZ
input_filer3   r4   r   r   r   r,   R   s&    


zStanfordTokenizer._execute)Nr
   NFr   )F)
__name__
__module____qualname____doc__r   r(   staticmethodr+   r.   r,   r   r   r   r   r	      s       
r	   )jsonr?   r6   r   
subprocessr   Znltk.internalsr   r   r   r   Znltk.parse.corenlpr   Znltk.tokenize.apir   r   r	   r   r   r   r   <module>	   s   