RT,NLP实验二。音字转换,其中用到的思想比较基本、比较老。
1.首先统计unigram和bigram的频数
2.词作为状态集,音作为观测序列。
3.计算转移矩阵概率和发射矩阵概率,建立HMM模型
4.给定HMM模型和观测序列,采用viterbi算法动态规划解码。
viterbi.py
# -*- coding: cp936 -*-
"""
viterbi.py
author:messiandzcy
date:2014.12.11
"""
import sys
#from numpy import *
# Allocate a zero matrix
def matrix(rows, cols):
    """Return a rows x cols matrix (list of lists) filled with int 0."""
    return [[0] * cols for _ in range(rows)]
# Load the unigram frequency table
def LoadUnigram(path="unigramSorted.txt"):
    """Load unigram frequencies from *path* (default kept for old callers).

    Each line is "<word> <frequency>".  Only frequent single characters
    are kept (len(w) == 2, i.e. one 2-byte GBK character, with
    frequency >= 2700) — this bounds the HMM state set and avoids the
    memory blow-up the original commented-out code worked around.

    Returns a dict mapping word -> int frequency.
    """
    print("Loading Unigram...")
    uni = {}
    with open(path, "r") as fp:
        for line in fp:
            w, freq = line.split()
            # keep only frequent single characters for the state set
            if len(w) == 2 and int(freq) >= 2700:
                uni[w] = int(freq)
    print("Loaded Unigram SUCCESS!")
    print(len(uni))
    return uni
# Load the bigram frequency table
def LoadBigram(path="bigramSorted.txt"):
    """Load bigram frequencies from *path* (default kept for old callers).

    Each line is "<pair> <frequency>"; every line is kept.

    Returns a dict mapping word pair -> int frequency.
    """
    print("Loading Bigram...")
    big = {}
    with open(path, "r") as fp:
        for line in fp:
            w, freq = line.split()
            big[w] = int(freq)
    print("Loaded Bigram SUCCESS!")
    return big
# Debug output helper
def printf(A, m, n):
    """Print the top-left m x n corner of matrix A, 8 decimals,
    values space-separated, one row per line."""
    for r in range(m):
        print(" ".join("%.8lf" % A[r][c] for c in range(n)))
# Compute the state-transition probability matrix A
def computeA(uni, big):
    """Build the HMM transition matrix A from n-gram counts.

    A[i][j] ~= P(w_j | w_i) = P(w_i w_j) / P(w_i), where both relative
    frequencies are add-one (Laplace) smoothed over their own totals.

    uni -- dict word -> unigram count (defines the state set)
    big -- dict word-pair -> bigram count (keys are lst[i] + lst[j])

    Returns (A, lst) where lst[i] is the word for state index i; lst
    must travel with A because it defines the row/column ordering.
    """
    num1, num2 = len(uni), len(big)
    sum_uni = sum(uni.values())  # total unigram count
    sum_big = sum(big.values())  # total bigram count
    lst = list(uni.keys())  # state index -> word (list() for Py3 indexing)
    A = [[0 for _ in range(num1)] for _ in range(num1)]
    for i in range(num1):
        # smoothed P(w_i); depends only on i, so hoisted out of the j loop
        fenmu = (uni.get(lst[i], 0) + 1.0) / (sum_uni + num1)
        for j in range(num1):
            # smoothed P(w_i w_j)
            fenzi = (big.get(lst[i] + lst[j], 0) + 1.0) / (sum_big + num2)
            A[i][j] = fenzi / fenmu
    return (A, lst)
# Load lexicon.txt
def LoadLexicon(path="lexicon.txt"):
    """Load the word -> pronunciations table from *path*.

    Each line is "<word> <pinyin1> <pinyin2> ...".  The last character
    of every pinyin token (its tone digit) is stripped.  Pronunciations
    from repeated lines for the same word are merged, and each word's
    list is de-duplicated at the end.

    Returns a dict mapping word -> list of unique pinyin strings.
    """
    print("Loading 'lexicon.txt'...")
    lex = {}  # dict: key is the word, value is its list of pinyin
    with open(path, "r") as fp:
        for line in fp:
            parts = line.split()
            word = parts[0]
            # setdefault merges repeated entries for the same word,
            # replacing the original duplicated if/else branches
            sounds = lex.setdefault(word, [])
            for yin in parts[1:]:
                sounds.append(yin[:-1])  # drop the trailing tone character
    # de-duplicate each pronunciation list
    for key in lex:
        lex[key] = list(set(lex[key]))
    print("Loaded 'lexicon.txt' SUCCESS!")
    return lex
# Compute the emission probability matrix B
def computeB(lst, lex):
    """Build the HMM emission matrix B.

    Rows are words (states, ordered as in lst); columns are
    pronunciations.  A word emits each of its pronunciations with
    uniform probability 1/k.  Words absent from lex keep an all-zero
    row (the original behavior).

    Returns (B, yin) where yin[j] is the pronunciation for column j;
    yin must travel with B because it defines the column ordering.
    """
    all_yin = []
    for key in lex:
        all_yin += lex[key]
    yin = list(set(all_yin))  # de-duplicate pronunciations
    num1, num2 = len(lst), len(yin)
    # pronunciation -> column index; avoids O(n) yin.index() per lookup
    col = {}
    for j in range(num2):
        col[yin[j]] = j
    B = [[0 for _ in range(num2)] for _ in range(num1)]
    for i in range(num1):  # for each word/state
        yinlst = lex.get(lst[i])  # lst[i] is always a single character
        if yinlst is None:
            continue
        prob = 1.0 / len(yinlst)  # uniform over this word's pronunciations
        for y in yinlst:
            B[i][col[y]] = prob
    return (B, yin)
# Given the HMM model, decode
def viterbi(A, B, lst, yin, string):
    """Viterbi-decode the most likely word sequence for a pinyin sequence.

    A      -- transition matrix, A[i][j] ~ P(state j | state i)
    B      -- emission matrix, B[i][k] = P(yin[k] | state i)
    lst    -- state index -> word (row ordering of A and B)
    yin    -- column index -> pinyin (column ordering of B)
    string -- list of pinyin tokens to decode

    Prints the best-path probability, then the decoded word string.
    NOTE(review): probabilities are not rescaled, so long inputs
    underflow toward 0; a pinyin absent from `yin` raises ValueError.
    """
    states = len(lst)  # number of states
    iters = len(string)  # number of time steps
    pi = [1.0 / states] * states  # uniform initial distribution
    fai = [[0] * states for _ in range(iters)]    # back-pointers (dp)
    delta = [[0] * states for _ in range(iters)]  # best path scores
    # initialization
    obs0 = yin.index(string[0])
    for i in range(states):
        delta[0][i] = pi[i] * B[i][obs0]
    # recursion
    for t in range(1, iters):
        obs = yin.index(string[t])  # hoisted: identical for every state i
        for i in range(states):
            maxx, keep = 0, 0
            for j in range(states):
                score = delta[t - 1][j] * A[j][i]
                if score > maxx:
                    maxx, keep = score, j
            delta[t][i] = maxx * B[i][obs]
            fai[t][i] = keep
    # termination: pick the best final state (end defaults to 0)
    P, end = 0, 0
    for i in range(states):
        if delta[iters - 1][i] >= P:
            P = delta[iters - 1][i]
            end = i
    print(P)
    # backtrack the optimal path from the final state
    res = []
    for k in range(iters - 1, -1, -1):
        res.append(lst[end])
        end = fai[k][end]
    print("".join(res[::-1]))
# Main script: build the HMM from the data files, then decode interactively.
uni=LoadUnigram() # unigram dictionary (frequent single characters = states)
big=LoadBigram() # bigram dictionary
lex=LoadLexicon() # word -> pinyin conversion table
A,lst=computeA(uni,big) # lst (state index -> word) must travel with A
B,yin=computeB(lst,lex) # yin (column index -> pinyin) must travel with B
while True: # interactive test loop; no exit condition (Ctrl-C to stop)
    print "input a string(split by ' ')"
    string=raw_input().split()
    #string=['yi','zhi','mei','li','de','xiao','hua']
    viterbi(A,B,lst,yin,string) #decode
    #print B[lst.index(m)][yin.index(n)]
# NOTE(review): the string below is unreachable dead code (the loop above
# never exits); kept as-is — consider deleting.
"""
#maxNum=7136 #申请43373的矩阵时会溢出
#matrix = arange(maxNum*maxNum)
#a = [0 for i in range(maxNum)]
#mat = a*maxNum
#matrix(maxNum,maxNum)
#print "hehe"
"""
原文地址:http://blog.csdn.net/messiandzcy/article/details/41909323