标签:ibm model1 em 词对齐
RT,蛋疼的MT实验~
伪代码请参见《统计机器翻译》中文版61页。
ps:输入只是参考了书上的例子——三个对齐句对~保证和书上输出的概率相同
该代码还没有经过大规模平行语料的检验。。后续有待优化~
输入:data.e
the house the book a book
das Haus das Buch ein Buch
#coding=utf-8 import os #Read File to pairs fp_en = open('data.e','r') fp_cn = open('data.f','r') iters = 1 pairDic = {} #生成原始序对字典 countPair = 0 for line_cn,line_en in zip(fp_cn,fp_en): f = line_cn.split() e = line_en.split() for word1 in f: for word2 in e: pairDic[countPair] = (word1,word2) countPair += 1 iters += 1 fp_en.close() fp_cn.close() #先将序对字典一次性去重 lst = list(set(pairDic.values())) #print "lst=",lst NewpairDic = {} i = 0 foreign,english = [],[] for _tuple in lst: #生成新的序对字典 NewpairDic[i] = _tuple i += 1 #print "pairs=",NewpairDic # run ibm-model-1(EM) t = {} for key in NewpairDic.values(): t[key]=0.25 #initialize t(e|f) uniformly print "t0=",t K = 0 while K<=2: #while not converged fp_en = open('data.e','r') fp_cn = open('data.f','r') count,total = {},{} for key in NewpairDic.values(): count[key] = 0 for _tuple in NewpairDic.values(): total[_tuple[0]] = 0 s_total = {} for ee,ff in zip(fp_en,fp_cn): #compute normalization for e in ee.split(): s_total[e] = 0 for f in ff.split(): s_total[e] += t[(f,e)] #collect counts for e in ee.split(): for f in ff.split(): count[(f,e)] += t[(f,e)]/s_total[e] total[f] += t[(f,e)]/s_total[e] #estimate probabilities for f,e in NewpairDic.values(): t[(f,e)] = count[(f,e)]/total[f] #end of while K += 1 fp_en.close() fp_cn.close() print "t%d=" %K for it in t.items(): print it
标签:ibm model1 em 词对齐
原文地址:http://blog.csdn.net/messiandzcy/article/details/44813041