如题,这是NLP实验二:音字转换(拼音转汉字)。其中用到的思想比较基本,也比较老。
1.首先统计unigram和bigram的频数
2.以汉字(频繁单字)作为状态集,以拼音音节作为观测序列。
3.计算状态转移概率矩阵和发射概率矩阵,建立HMM模型
4.给定HMM模型和观测序列,采用viterbi算法动态规划解码。
viterbi.py
# -*- coding: cp936 -*-
"""
viterbi.py
author:messiandzcy
date:2014.12.11

Pinyin-to-character conversion with a hidden Markov model: frequent single
characters are the hidden states, pinyin syllables are the observations, and
the Viterbi algorithm decodes the best character sequence.

The data files are read as raw bytes so that the "two bytes == one
character" test used to pick the state set keeps working on both Python 2
and Python 3.
"""
import sys

# Encoding used to turn byte tokens into text for display (and to turn typed
# text into bytes) on Python 3.  NOTE(review): assumed from the source-file
# coding cookie and the two-byte single-character test -- confirm against the
# actual data files.
DATA_ENCODING = "cp936"


def matrix(rows, cols):
    """Return a rows x cols list-of-lists filled with 0 (rows are independent)."""
    return [[0] * cols for _ in range(rows)]


def LoadUnigram(path="unigramSorted.txt", min_freq=2700):
    """Load the unigram counts that make up the HMM state set.

    Each non-blank line of *path* must be ``<token> <count>``.  Only tokens
    that are exactly two bytes long and whose count is at least *min_freq*
    are kept.  The original author noted that a 43373 x 43373 transition
    matrix ran out of memory, which is why the state set is cut down.

    Returns a dict mapping token (bytes) -> count (int).
    """
    print("Loading Unigram...")
    uni = {}
    with open(path, "rb") as fp:
        for line in fp:
            fields = line.split()
            if not fields:  # tolerate blank lines
                continue
            w, freq = fields
            if len(w) != 2:  # keep single (two-byte) characters only
                continue
            count = int(freq)
            if count >= min_freq:
                uni[w] = count
    print("Loaded Unigram SUCCESS!")
    print(len(uni))
    return uni


def LoadBigram(path="bigramSorted.txt"):
    """Load every ``<token> <count>`` line of *path* into a dict (bytes -> int)."""
    print("Loading Bigram...")
    big = {}
    with open(path, "rb") as fp:
        for line in fp:
            fields = line.split()
            if not fields:  # tolerate blank lines
                continue
            w, freq = fields
            big[w] = int(freq)
    print("Loaded Bigram SUCCESS!")
    return big


def printf(A, m, n):
    """Debug helper: print the top-left m x n part of A, 8 decimals per value."""
    for i in range(m):
        sys.stdout.write(" ".join("%.8f" % A[i][j] for j in range(n)) + "\n")


def computeA(uni, big):
    """Build the state-transition matrix from unigram and bigram counts.

    With add-one (Laplace) smoothing, for states w_i and w_j:

        A[i][j] = ((big[w_i + w_j] + 1) / (sum_big + len(big)))
                  / ((uni[w_i] + 1) / (sum_uni + len(uni)))

    Returns ``(A, lst)`` where ``lst[i]`` is the state behind row/column i.
    The mapping must travel with A because dict order defines the indices.
    """
    num1, num2 = len(uni), len(big)
    sum_uni = sum(uni.values())
    sum_big = sum(big.values())
    lst = list(uni)  # index -> state
    A = matrix(num1, num1)
    for i, first in enumerate(lst):
        # The denominator depends only on the row, so compute it once.
        fenmu = (uni.get(first, 0) + 1.0) / (sum_uni + num1)
        row = A[i]
        for j, second in enumerate(lst):
            fenzi = (big.get(first + second, 0) + 1.0) / (sum_big + num2)
            row[j] = fenzi / fenmu
    return (A, lst)


def LoadLexicon(path="lexicon.txt"):
    """Load the pronunciation lexicon.

    Each non-blank line is ``<word> <reading> <reading> ...``.  The last byte
    of every reading is dropped (presumably a tone digit -- confirm against
    the data).  Readings of a word that appears on several lines are merged
    and de-duplicated.

    Returns a dict mapping word (bytes) -> list of readings (bytes).
    """
    print("Loading 'lexicon.txt'...")
    collected = {}
    with open(path, "rb") as fp:
        for line in fp:
            fields = line.split()
            if not fields:  # tolerate blank lines
                continue
            readings = collected.setdefault(fields[0], set())
            readings.update(field[:-1] for field in fields[1:])
    lex = dict((word, list(readings)) for word, readings in collected.items())
    print("Loaded 'lexicon.txt' SUCCESS!")
    return lex


def computeB(lst, lex):
    """Build the emission matrix (rows: states in *lst*, columns: syllables).

    A state found in *lex* spreads its probability uniformly over its
    readings; a state missing from *lex* keeps an all-zero row.

    Returns ``(B, yin)`` where ``yin[j]`` is the syllable behind column j.
    """
    syllables = set()
    for key in lex:
        syllables.update(lex[key])
    yin = sorted(syllables)  # sorted so the column order is reproducible
    column = dict((syllable, j) for j, syllable in enumerate(yin))
    B = matrix(len(lst), len(yin))
    for i, state in enumerate(lst):
        yinlst = lex.get(state)
        if not yinlst:
            continue
        share = 1.0 / len(yinlst)
        for syllable in yinlst:
            B[i][column[syllable]] = share
    return (B, yin)


def _to_text(token):
    """Return *token* as printable text.

    On Python 3 byte tokens are decoded with DATA_ENCODING.  On Python 2
    ``bytes`` is ``str``, so the token is returned untouched and printed as
    raw bytes, exactly as the original script did.
    """
    if isinstance(token, bytes) and not isinstance(token, str):
        return token.decode(DATA_ENCODING, "replace")
    return token


def viterbi(A, B, lst, yin, string):
    """Decode the most likely state sequence for the syllables in *string*.

    A[j][i] is the transition score from state j to state i, B[i][k] the
    emission probability of syllable ``yin[k]`` in state i, and the initial
    distribution is uniform.  Scores are plain products (not logs), so very
    long inputs can underflow to 0.

    Prints the best path score and the decoded string, and also returns the
    decoded string.  An empty *string* or an empty state set yields ``""``
    with nothing printed.

    Raises ValueError if a syllable of *string* is not in *yin*.
    """
    states = len(lst)
    steps = len(string)
    if states == 0 or steps == 0:
        return ""

    # Resolve every observation to its column once, instead of searching
    # `yin` again for each state at each time step.
    column = {}
    for k, syllable in enumerate(yin):
        column.setdefault(syllable, k)  # first occurrence, like list.index
    obs = []
    for syllable in string:
        if syllable not in column:
            raise ValueError("unknown syllable: %s" % _to_text(syllable))
        obs.append(column[syllable])

    delta = matrix(steps, states)  # best score of a path ending in state i at t
    fai = matrix(steps, states)    # back-pointers for path recovery

    prior = 1.0 / states
    for i in range(states):
        delta[0][i] = prior * B[i][obs[0]]

    for t in range(1, steps):
        previous = delta[t - 1]
        k = obs[t]
        for i in range(states):
            best, keep = 0, 0
            for j in range(states):
                score = previous[j] * A[j][i]
                if score > best:  # strict: the first best predecessor wins
                    best, keep = score, j
            delta[t][i] = best * B[i][k]
            fai[t][i] = keep

    # Termination: ">=" means the last state with the best score wins.
    P, end = 0, 0
    final = delta[steps - 1]
    for i in range(states):
        if final[i] >= P:
            P, end = final[i], i
    print(P)

    # Follow the back-pointers from the end state to the start.
    path = []
    for t in range(steps - 1, -1, -1):
        path.append(lst[end])
        end = fai[t][end]
    path.reverse()

    decoded = "".join(_to_text(state) for state in path)
    print(decoded)
    return decoded


def _read_syllables():
    """Read one line from stdin and return its whitespace-separated tokens.

    Tokens are returned as bytes so they compare equal to the syllables
    loaded from the lexicon.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    line = read_line()
    if not isinstance(line, bytes):
        line = line.encode(DATA_ENCODING)
    return line.split()


def main():
    """Load the model once, then decode lines typed by the user until EOF."""
    uni = LoadUnigram()          # unigram counts
    big = LoadBigram()           # bigram counts
    lex = LoadLexicon()          # character -> pinyin readings
    A, lst = computeA(uni, big)  # lst maps matrix index -> state
    B, yin = computeB(lst, lex)  # yin maps column index -> syllable

    while True:
        print("input a string(split by ' ')")
        try:
            string = _read_syllables()
        except EOFError:
            break
        try:
            viterbi(A, B, lst, yin, string)
        except ValueError as err:
            # A mistyped syllable should not throw away the loaded model.
            print(err)


if __name__ == "__main__":
    main()
原文地址:http://blog.csdn.net/messiandzcy/article/details/41909323