3
d                 @   sj   d Z ddlmZ ddlmZ yddlZW n ek
r@   dZY nX G dd dZdd Z	e
d	krfe	  dS )
a  
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
    )maxsize)trigramsNc               @   sP   e Zd ZdZi ZdZdZi Zdd Zdd Z	dd	 Z
d
d Zdd Zdd ZdS )TextCatN<>c             C   sB   t stdddlm} || _x| jj D ]}| jj| q*W d S )Nzclassify.textcat requires the regex module that supports unicode. Try '$ pip install regex' and see https://pypi.python.org/pypi/regex for further details.r   )crubadan)reOSErrornltk.corpusr   _corpuslangs	lang_freq)selfr   lang r   5/tmp/pip-build-v9q4h5k9/nltk/nltk/classify/textcat.py__init__8   s    zTextCat.__init__c             C   s   t jdd|S )z)Get rid of punctuation except apostrophesz[^\P{P}\']+ )r   sub)r   textr   r   r   remove_punctuationH   s    zTextCat.remove_punctuationc             C   s   ddl m}m} | j|}||}| }x^|D ]V}t| j| | j }dd |D }	x.|	D ]&}
|
|krx||
  d7  < qZd||
< qZW q.W |S )z'Create FreqDist of trigrams within textr   )FreqDistword_tokenizec             S   s   g | ]}d j |qS )r   )join).0Ztrir   r   r   
<listcomp>V   s    z#TextCat.profile.<locals>.<listcomp>   )Znltkr   r   r   r   _START_CHAR	_END_CHAR)r   r   r   r   Z
clean_texttokensfingerprinttZtoken_trigram_tuplesZtoken_trigramsZcur_trigramr   r   r   profileL   s    


zTextCat.profilec             C   sR   | j j|}d}||krJt|j j|}t|j j|}t|| }nt}|S )zgCalculate the "out-of-place" measure between the
        text and language profile for a single trigramr   )r   r   listkeysindexabsr   )r   r   trigramZtext_profileZlang_fddistZidx_lang_profileZidx_textr   r   r   	calc_dist`   s    zTextCat.calc_distc             C   sT   i }| j |}x@| jjj D ]0}d}x|D ]}|| j|||7 }q*W |||< qW |S )zOCalculate the "out-of-place" measure between
        the text and all languagesr   )r"   r   Z_all_lang_freqr$   r)   )r   r   Z	distancesr"   r   Z	lang_distr'   r   r   r   
lang_distsu   s    

zTextCat.lang_distsc             C   s   | j || _t| j| jjdS )zYFind the language with the min distance
        to the text and return its ISO 639-3 code)key)r*   last_distancesminget)r   r   r   r   r   guess_language   s    zTextCat.guess_language)__name__
__module____qualname__r   Zfingerprintsr   r   r,   r   r   r"   r)   r*   r/   r   r   r   r   r   /   s   r   c           
   C   s  ddl m}  ddddddd	d
dg	}dddddddddd	}t }x|D ]}| j|}t|d }ttt|}d}xJtd|D ]<}	d}
x*td||	 D ]}|
d||	 |  7 }
qW ||
7 }q~W td|dd  d  |j	|}td| d||  d tdd  qFW d S ) Nr   )udhrzKurdish-UTF8zAbkhaz-UTF8zFarsi_Persian-UTF8z
Hindi-UTF8zHawaiian-UTF8zRussian-UTF8zVietnamese-UTF8zSerbian_Srpski-UTF8zEsperanto-UTF8zNorthern KurdishZ	AbkhazianzIranian PersianZHindiZHawaiianRussianZ
VietnameseZSerbianZ	Esperanto)	ZkmrZabkZpesZhinZhawZrusZvieZsrpZepor   r    zLanguage snippet:    z...zLanguage detection: z ()#)
r
   r3   r   Zsentslenr#   maprangeprintr/   )r3   r   ZfriendlyZtcZcur_langZraw_sentencesZrowscolssampleiZcur_sentjguessr   r   r   demo   sD    


rB   __main__)__doc__sysr   Z	nltk.utilr   regexr   ImportErrorr   rB   r0   r   r   r   r   <module>   s   
a4