3
d"                 @   s   d Z ddlZyddlmZ ddlmZ W n, ek
rT   dd Zdd Zd	d
 ZY nX ej	dZ
G dd dZdd Zdd ZefddZefddZdd Zdd ZdS )z

A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
https://aclweb.org/anthology/J93-1004.pdf

    N)logsf)normc             C   s   t | }ddd|   }|tj| | d |d|d|d|d|d|d|d
|d|d                   }| dkr|S d| S dS )zComplementary error function.   g      ?gś??g5 ?g`yg?gƸ?gꪂI?g#v?g9)?gS?gޅ1O?gv(?g        g       @NgꪂIǿg9)gޅ1O)absmathexp)xztr r   :/tmp/pip-build-v9q4h5k9/nltk/nltk/translate/gale_church.pyerfcc   s(    4r   c             C   s   ddt | tjd   S )u>   Return the area under the normal distribution from M{-∞..x}.r   g      ?   )r   r   sqrt)r   r   r   r   norm_cdf@   s    r   c             C   s0   yt jdt|  S  tk
r*   tdS X d S )Nr   z-inf)r   logr   
ValueErrorfloat)r   r   r   r   
norm_logsfD   s    r   r   c               @   s0   e Zd Zd
dddddddddddiZdZdZd	S )LanguageIndependentr   r   gׁsF?g{Gz?r   gbX9ȶ?gI+?g333333@N)r   r   )r   r   )r   r   )r   r   )r   r   )r   r   )__name__
__module____qualname__PRIORSAVERAGE_CHARACTERSVARIANCE_CHARACTERSr   r   r   r   r   N   s   r   c       	      C   s   g }t |t |f}x|dkrtdd |D ry| | \}}W n. tk
rn   |d d |d d f}wY nX xHt|D ]<}x6t|D ]*}|j|d | d |d | d f qW qzW |d | |d | f}qW |ddd S )a  
    Traverse the alignment cost from the tracebacks and retrieves
    appropriate sentence pairs.

    :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS)
    :type backlinks: dict
    :param source_sents_lens: A list of target sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    r   c             s   s   | ]}|d kV  qdS )r   Nr   ).0pr   r   r   	<genexpr>n   s    ztrace.<locals>.<genexpr>r   N)r   r   )lenall	TypeErrorrangeappend)		backlinkssource_sents_lenstarget_sents_lenslinkspositionsr
   ijr   r   r   trace`   s    .r.   c       
         s   t  fddt|d D }t fddt|d D }y4|||j  d }||j | tj||j  }	W n tk
r   tdS X tt	t
|	 tj|j|   S )aP  Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
    being aligned with a specific C{alignment}.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.

    @returns: The log probability of a specific alignment between the two sentences, given the parameters.
    c             3   s   | ]} | d   V  qdS )r   Nr   )r   offset)r,   source_sentsr   r   r      s    z!align_log_prob.<locals>.<genexpr>r   c             3   s   | ]} | d   V  qdS )r   Nr   )r   r/   )r-   target_sentsr   r   r      s    r   r   z-inf)sumr$   r   r   r   r   ZeroDivisionErrorr   LOG2r   r   r   r   )
r,   r-   r0   r1   	alignmentparamsZl_sZl_tmdeltar   )r,   r-   r0   r1   r   align_log_prob|   s      
r9   c             C   s  t |jj }g g}i }xtt| d D ]}xtt|d D ]}td}d}	xj|D ]b}
d|
d  }||
d  }|t| k sV|dk rqV|| | t||| ||
| }||k rV|}|
}	qVW |tdkrd}|	|||f< |d j| q@W t|dkr|jd |jg  q*W t	|| |S )a  Return the sentence alignment of two text blocks (usually paragraphs).

        >>> align_blocks([5,5,5], [7,7,7])
        [(0, 0), (1, 1), (2, 2)]
        >>> align_blocks([10,5,5], [12,20])
        [(0, 0), (1, 1), (2, 1)]
        >>> align_blocks([12,20], [10,5,5])
        [(0, 0), (1, 1), (1, 2)]
        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    r   infNr   r   r    r    )
listr   keysr$   r!   r   r9   r%   popr.   )r'   r(   r6   Zalignment_typesDr&   r,   r-   Zmin_distZ	min_alignaZprev_iZprev_jr   r   r   r   align_blocks   s2    

r@   c                s0   t | t |krtd fddt| |D S )a  Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
    alignment links.

    Each block consists of a list that contains the lengths (in characters) of the sentences
    in this block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists
    z>Source and target texts do not have the same number of blocks.c                s   g | ]\}}t || qS r   )r@   )r   Zsource_blockZtarget_block)r6   r   r   
<listcomp>   s   zalign_texts.<locals>.<listcomp>)r!   r   zip)Zsource_blocksZtarget_blocksr6   r   )r6   r   align_texts   s
    
rC   c             #   s&    fdd}x| j  V  qW dS )zSplits an iterator C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    c             3   s$   | }x|kr|V   j  }qW d S )N)next)firstv)itsplit_valuer   r   _chunk_iterator   s    
z!split_at.<locals>._chunk_iteratorN)rD   )rG   rH   rI   r   )rG   rH   r   split_at   s    rJ   c                s    fddt | |D S )zParses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
    c                s    g | ]}d d t | D qS )c             S   s   g | ]}t d d |D qS )c             s   s   | ]}t |V  qd S )N)r!   )r   tokenr   r   r   r     s    z;parse_token_stream.<locals>.<listcomp>.<listcomp>.<genexpr>)r2   )r   Zsentence_itr   r   r   rA     s   z1parse_token_stream.<locals>.<listcomp>.<listcomp>)rJ   )r   Zblock_it)soft_delimiterr   r   rA     s   z&parse_token_stream.<locals>.<listcomp>)rJ   )streamrL   Zhard_delimiterr   )rL   r   parse_token_stream   s    
rN   )__doc__r   r   r   r   Zscipy.statsImportErrorr   r   r   r4   r   r.   r9   r@   rC   rJ   rN   r   r   r   r   <module>   s    '
6