3
d+1                 @   s,   d Z ddlZddlmZ G dd deZdS )z
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
    N)StemmerIc            s   @   s`   e Zd ZdZdZddwdxZddydzZd{d| Zd}d~ Zdd Z	dd Z
dd Zdd Zdd ZduS )LancasterStemmera/  
    Lancaster Stemmer

        >>> from nltk.stem.lancaster import LancasterStemmer
        >>> st = LancasterStemmer()
        >>> st.stem('maximum')     # Remove "-um" when word is intact
        'maxim'
        >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
        'presum'
        >>> st.stem('multiply')    # No action taken if word ends with "-ply"
        'multiply'
        >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
        'provid'
        >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
        'ow'
        >>> st.stem('ear')         # ditto
        'ear'
        >>> st.stem('saying')      # Words starting with consonant must contain at least 3
        'say'
        >>> st.stem('crying')      #     letters and one of those letters must be a vowel
        'cry'
        >>> st.stem('string')      # ditto
        'string'
        >>> st.stem('meant')       # ditto
        'meant'
        >>> st.stem('cement')      # ditto
        'cem'
        >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
        >>> st_pre.stem('kilometer') # Test Prefix
        'met'
        >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
        >>> st_custom.stem("ness") # Change s to t
        'nest'
    ai*2.a*1.bb1.city3s.ci2>cn1t>dd1.dei3y>deec2ss.dee1.de2>dooh4>e1>feil1v.fi2>gni3>gai3y.ga2>gg1.ht*2.	hsiug5ct.hsi3>i*1.i1y>ji1d.juf1s.ju1d.jo1d.jeh1r.jrev1t.jsim2t.jn1d.j1s.lbaifi6.lbai4y.lba3>lbi3.lib2l>lc1.lufi4y.luf3>lu2.lai3>lau3>la2>ll1.mui3.mu*2.msi3>mm1.nois4j>noix4ct.noi3>nai3>na2>nee0.ne2>nn1.pihs4>pp1.re2>rae0.ra2.ro2>ru2>rr1.rt1>rei3y>sei3y>sis2.si2>ssen4>ss0.suo3>su*2.s*1>s0.	tacilp4y.ta2>tnem4>tne3>tna3>tpir2b.tpro2b.tcud1.tpmus2.tpec2iv.tulo2v.tsis0.tsi3>tt1.uqi3.ugo1.vis3j>vie0.vi2>ylb1>yli3y>ylp0.yl2>ygo1.yhp1.ymo1.ypo1.yti3>yte3>ytl2.yrtsi5.yra3>yro3>yfi3.ycn2t>yca3>zi2>zy1s.NFc             C   s    i | _ || _|r|n| j| _dS )z,Create an instance of the Lancaster stemmer.N)rule_dictionary_strip_prefixdefault_rule_tuple_rule_tuple)self
rule_tupleZstrip_prefix_flag r}   3/tmp/pip-build-v9q4h5k9/nltk/nltk/stem/lancaster.py__init__   s    zLancasterStemmer.__init__c             C   s~   |r|n| j }tjd}i | _xZ|D ]R}|j|sBtd| d|dd }|| jkrj| j| j| q$|g| j|< q$W dS )a(  Validate the set of rules used in this stemmer.

        If this function is called as an individual method, without using stem
        method, rule_tuple argument will be compiled into self.rule_dictionary.
        If this function is called within stem, self._rule_tuple will be used.

        z^[a-z]+\*?\d[a-z]*[>\.]?$z	The rule z is invalidr      N)rz   recompilerw   match
ValueErrorappend)r{   r|   
valid_ruleruleZfirst_letterr}   r}   r~   
parseRules   s    	



zLancasterStemmer.parseRulesc             C   s:   |j  }| jr| j|n|}|}| js.| j  | j||S )z(Stem a word using the Lancaster stemmer.)lowerrx   _LancasterStemmer__stripPrefixrw   r   _LancasterStemmer__doStemming)r{   wordintact_wordr}   r}   r~   stem   s    zLancasterStemmer.stemc             C   s  t jd}d}x|r| j|}|dk s6|| | jkr<d}qd}x| j||  D ]}|j|}|rP|j \}	}
}}}t|}|j|	ddd rP|
r||kr| j||r| j	|||}d}|dkrd}P qP| j||rP| j	|||}d}|dkrd}P qPW |dkrd}qW |S )	z Perform the actual word stemmingz#^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$Tr   FNr   .)
r   r    _LancasterStemmer__getLastLetterrw   r   groupsintendswith_LancasterStemmer__isAcceptable_LancasterStemmer__applyRule)r{   r   r   r   proceedZlast_letter_positionZrule_was_appliedr   Z
rule_matchZending_stringZintact_flagremove_totalappend_stringZ	cont_flagr}   r}   r~   Z__doStemming   sB    




zLancasterStemmer.__doStemmingc             C   s2   d}x(t t|D ]}|| j r(|}qP qW |S )zHGet the zero-based index of the last alphabetic character in this stringr   r   )rangelenisalpha)r{   r   Zlast_letterpositionr}   r}   r~   Z__getLastLetter  s    z LancasterStemmer.__getLastLetterc             C   s\   d}|d dkr&t || dkrXd}n2t || dkrX|d dkrHd}n|d dkrXd}|S )z1Determine if the word is acceptable for stemming.Fr   Zaeiouy   T   r   )r   )r{   r   r   Zword_is_acceptabler}   r}   r~   Z__isAcceptable%  s    zLancasterStemmer.__isAcceptablec             C   s(   t || }|d| }|r$||7 }|S )z#Apply the stemming rule to the wordr   )r   )r{   r   r   r   Znew_word_lengthr}   r}   r~   Z__applyRule6  s
    zLancasterStemmer.__applyRulec          	   C   s,   x&dD ]}|j |r|t|d
 S qW |S )zYRemove prefix from a word.

        This function originally taken from Whoosh.

        kilomicromilliintraultramegananopicopseudoN)	r   r   r   r   r   r   r   r   r   )
startswithr   )r{   r   prefixr}   r}   r~   Z__stripPrefixA  s            
zLancasterStemmer.__stripPrefixc             C   s   dS )Nz<LancasterStemmer>r}   )r{   r}   r}   r~   __repr__V  s    zLancasterStemmer.__repr__)sr   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   )NF)N)__name__
__module____qualname____doc__ry   r   r   r   r   r   r   r   r   r   r}   r}   r}   r~   r      s   "                                                                                                                  
	
@
r   )r   r   Znltk.stem.apir   r   r}   r}   r}   r~   <module>   s   