3
­d5	  ã               @   sd   d Z ddlmZmZ ddlmZmZmZ ddlm	Z	 ddl
mZ G dd„ deƒZG dd	„ d	eƒZd
S )z
Tokenizer Interface
é    )ÚABCÚabstractmethod)ÚIteratorÚListÚTuple)Ú
overridden)Ústring_span_tokenizec               @   s„   e Zd ZdZeeee dœdd„ƒZeee	e
e
f  dœdd„Zee eee  dœdd	„Zee eee	e
e
f   dœd
d„ZdS )Ú
TokenizerIz†
    A processing interface for tokenizing a string.
    Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
    )ÚsÚreturnc             C   s   t | jƒr| j|gƒd S dS )zL
        Return a tokenized copy of *s*.

        :rtype: List[str]
        r   N)r   Útokenize_sents)Úselfr
   © r   ú1/tmp/pip-build-v9q4h5k9/nltk/nltk/tokenize/api.pyÚtokenize   s    
zTokenizerI.tokenizec             C   s
   t ƒ ‚dS )z»
        Identify the tokens using integer offsets ``(start_i, end_i)``,
        where ``s[start_i:end_i]`` is the corresponding token.

        :rtype: Iterator[Tuple[int, int]]
        N)ÚNotImplementedError)r   r
   r   r   r   Úspan_tokenize$   s    zTokenizerI.span_tokenize)Ústringsr   c                s   ‡ fdd„|D ƒS )z«
        Apply ``self.tokenize()`` to each element of ``strings``.  I.e.:

            return [self.tokenize(s) for s in strings]

        :rtype: List[List[str]]
        c                s   g | ]}ˆ j |ƒ‘qS r   )r   )Ú.0r
   )r   r   r   ú
<listcomp>5   s    z-TokenizerI.tokenize_sents.<locals>.<listcomp>r   )r   r   r   )r   r   r   -   s    zTokenizerI.tokenize_sentsc             c   s"   x|D ]}t | j|ƒƒV  qW dS )z»
        Apply ``self.span_tokenize()`` to each element of ``strings``.  I.e.:

            return [self.span_tokenize(s) for s in strings]

        :yield: List[Tuple[int, int]]
        N)Úlistr   )r   r   r
   r   r   r   Úspan_tokenize_sents7   s    

zTokenizerI.span_tokenize_sentsN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   Ústrr   r   r   r   Úintr   r   r   r   r   r   r   r	      s   		r	   c               @   s0   e Zd ZdZeedd„ ƒƒZdd„ Zdd„ ZdS )	ÚStringTokenizerzxA tokenizer that divides a string into substrings by splitting
    on the specified string (defined in subclasses).
    c             C   s   t ‚d S )N)r   )r   r   r   r   Ú_stringJ   s    zStringTokenizer._stringc             C   s   |j | jƒS )N)Úsplitr   )r   r
   r   r   r   r   O   s    zStringTokenizer.tokenizec             c   s   t || jƒE d H  d S )N)r   r   )r   r
   r   r   r   r   R   s    zStringTokenizer.span_tokenizeN)	r   r   r   r   Úpropertyr   r   r   r   r   r   r   r   r   E   s
   r   N)r   Úabcr   r   Ztypingr   r   r   Znltk.internalsr   Znltk.tokenize.utilr   r	   r   r   r   r   r   Ú<module>   s   1