3
­d
1  ã               @   s`   d Z ddlZddlmZ ddlmZmZmZmZ ddl	m
Z
 G dd„ deƒZG dd	„ d	e
ƒZdS )
a¸  
Lexical translation model that considers word order.

IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.

The EM algorithm used in Model 2 is:

:E step: In the training data, collect counts, weighted by prior
         probabilities.

         - (a) count how many times a source language word is translated
               into a target language word
         - (b) count how many times a particular position in the source
               sentence is aligned to a particular position in the target
               sentence

:M step: Estimate new probabilities based on the counts from the E step

Notations
---------

:i: Position in the source sentence
     Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
     Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language

References
----------

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
é    N)Údefaultdict)ÚAlignedSentÚ	AlignmentÚIBMModelÚ	IBMModel1)ÚCountsc                   sb   e Zd ZdZd‡ fdd„	Zdd„ Zdd„ Zd	d
„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Z‡  ZS )Ú	IBMModel2uY  
    Lexical translation model that considers word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groÃŸ'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm2 = IBMModel2(bitext, 5)

    >>> print(round(ibm2.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm2.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm2.translation_table['buch'][None], 3))
    0.0
    >>> print(round(ibm2.translation_table['ja'][None], 3))
    0.0

    >>> print(ibm2.alignment_table[1][1][2][2])
    0.938...
    >>> print(round(ibm2.alignment_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm2.alignment_table[2][2][4][5], 3))
    1.0

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    Nc                sv   t ƒ j|ƒ |dkr6t|d| ƒ}|j| _| j|ƒ n|d | _|d | _xtd|ƒD ]}| j|ƒ qVW | j|ƒ dS )a™  
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model and an alignment model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        Né   Útranslation_tableÚalignment_tabler   )	ÚsuperÚ__init__r   r
   Úset_uniform_probabilitiesr   ÚrangeÚtrainÚ	align_all)ÚselfÚsentence_aligned_corpusZ
iterationsZprobability_tablesZibm1Ún)Ú	__class__© ú3/tmp/pip-build-v9q4h5k9/nltk/nltk/translate/ibm2.pyr   c   s    

zIBMModel2.__init__c       	      C   s¸   t ƒ }x¬|D ]¤}t|jƒ}t|jƒ}||f|kr|j||fƒ d|d  }|tjk rjtjdt	|ƒ d ƒ xDt
d|d ƒD ]2}x,t
d|d ƒD ]}|| j| | | |< qŽW qzW qW d S )Né   zA source sentence is too long (z& words). Results may be less accurate.r   )ÚsetÚlenÚmotsÚwordsÚaddr   ÚMIN_PROBÚwarningsÚwarnÚstrr   r   )	r   r   Zl_m_combinationsÚaligned_sentenceÚlÚmZinitial_probÚiÚjr   r   r   r   Œ   s    



z#IBMModel2.set_uniform_probabilitiesc          	   C   sà   t ƒ }xÀ|D ]¸}d g|j }dg|j }t|jƒ}t|jƒ}| j||ƒ}xztd|d ƒD ]h}	||	 }
xZtd|d ƒD ]H}|| }| j||	||ƒ}|||
  }|j|||
ƒ |j|||	||ƒ qtW qXW qW | j	|ƒ | j
|ƒ d S )NZUNUSEDr   r   )ÚModel2Countsr   r   r   Úprob_all_alignmentsr   Úprob_alignment_pointÚupdate_lexical_translationÚupdate_alignmentZ*maximize_lexical_translation_probabilitiesÚ maximize_alignment_probabilities)r   Úparallel_corpusÚcountsr"   Úsrc_sentenceÚtrg_sentencer#   r$   Ztotal_countr&   Útr%   ÚsÚcountZnormalized_countr   r   r   r       s"    



zIBMModel2.trainc             C   s    t j}x”|jjƒ D ]†\}}x||jƒ D ]p\}}xf|jƒ D ]Z\}}xP|D ]H}	|j| | | |	 |j| | |	  }
t|
|ƒ| j| | | |	< qDW q6W q$W qW d S )N)r   r   Ú	alignmentÚitemsÚalignment_for_any_iÚmaxr   )r   r.   r   r%   Zj_sr&   Zsrc_sentence_lengthsr#   Ztrg_sentence_lengthsr$   Zestimater   r   r   r,   º   s    
z*IBMModel2.maximize_alignment_probabilitiesc          
   C   sd   t dd„ ƒ}xRtdt|ƒƒD ]@}|| }x2tdt|ƒƒD ] }||  | j||||ƒ7  < q8W qW |S )aï  
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        c               S   s   dS )Ng        r   r   r   r   r   Ú<lambda>Ô   s    z/IBMModel2.prob_all_alignments.<locals>.<lambda>r   r   )r   r   r   r)   )r   r/   r0   Zalignment_prob_for_tr&   r1   r%   r   r   r   r(   Æ   s    zIBMModel2.prob_all_alignmentsc       	      C   sL   t |ƒd }t |ƒd }|| }|| }| j| | | j| | | |  S )zz
        Probability that position j in ``trg_sentence`` is aligned to
        position i in the ``src_sentence``
        r   )r   r
   r   )	r   r%   r&   r/   r0   r#   r$   r2   r1   r   r   r   r)   Ý   s
    zIBMModel2.prob_alignment_pointc       	      C   sŠ   d}t |jƒd }t |jƒd }x\t|jƒD ]N\}}|dkr>q,|j| }|j| }|| j| | | j| | | |  9 }q,W t|tj	ƒS )zc
        Probability of target sentence and an alignment given the
        source sentence
        g      ð?r   r   )
r   r/   r0   Ú	enumerater4   r
   r   r7   r   r   )	r   Zalignment_infoZprobr#   r$   r&   r%   Útrg_wordÚsrc_wordr   r   r   Úprob_t_a_given_sè   s    

zIBMModel2.prob_t_a_given_sc             C   s   x|D ]}| j |ƒ qW d S )N)Úalign)r   r-   Úsentence_pairr   r   r   r   ý   s    
zIBMModel2.align_allc             C   sØ   g }t |jƒ}t |jƒ}x°t|jƒD ]¢\}}| j| d | jd |d  | |  }t|tjƒ}d}xRt|jƒD ]D\}	}
| j| |
 | j|	d  |d  | |  }||krp|}|	}qpW |j	||fƒ q$W t
|ƒ|_dS )a  
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        Nr   r   )r   r   r   r9   r
   r   r7   r   r   Úappendr   r4   )r   r>   Zbest_alignmentr#   r$   r&   r:   Z	best_probZbest_alignment_pointr%   r;   Z
align_probr   r   r   r=     s     

 zIBMModel2.align)N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r,   r(   r)   r<   r   r=   Ú__classcell__r   r   )r   r   r   ;   s   &)r   c                   s0   e Zd ZdZ‡ fdd„Zdd„ Zdd„ Z‡  ZS )r'   zo
    Data object to store counts of various parameters during training.
    Includes counts for alignment.
    c                s*   t ƒ jƒ  tdd„ ƒ| _tdd„ ƒ| _d S )Nc               S   s   t dd„ ƒS )Nc               S   s   t dd„ ƒS )Nc               S   s   t dd„ ƒS )Nc               S   s   dS )Ng        r   r   r   r   r   r8   3  s    z]Model2Counts.__init__.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>)r   r   r   r   r   r8   3  s    zKModel2Counts.__init__.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>)r   r   r   r   r   r8   3  s    z9Model2Counts.__init__.<locals>.<lambda>.<locals>.<lambda>)r   r   r   r   r   r8   3  s    z'Model2Counts.__init__.<locals>.<lambda>c               S   s   t dd„ ƒS )Nc               S   s   t dd„ ƒS )Nc               S   s   dS )Ng        r   r   r   r   r   r8   6  s    zKModel2Counts.__init__.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>)r   r   r   r   r   r8   6  s    z9Model2Counts.__init__.<locals>.<lambda>.<locals>.<lambda>)r   r   r   r   r   r8   6  s    )r   r   r   r4   r6   )r   )r   r   r   r   0  s
    
zModel2Counts.__init__c             C   s,   | j | |  |7  < | j|  |7  < d S )N)Z	t_given_sZany_t_given_s)r   r3   r2   r1   r   r   r   r*   9  s    z'Model2Counts.update_lexical_translationc             C   s<   | j | | | |  |7  < | j| | |  |7  < d S )N)r4   r6   )r   r3   r%   r&   r#   r$   r   r   r   r+   =  s    zModel2Counts.update_alignment)r@   rA   rB   rC   r   r*   r+   rD   r   r   )r   r   r'   *  s   	r'   )rC   r   Úcollectionsr   Znltk.translater   r   r   r   Znltk.translate.ibm_modelr   r   r'   r   r   r   r   Ú<module>2   s    p