#!/usr/bin/python
# coding: utf-8
# Tags:
# 2015-11-04 21:23:17.230000
"""
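Adapted from: "A Python library for converting Chinese characters to Hanyu Pinyin" (jb51.net) - http://www.jb51.net/article/65496.htm
The lookup table was downloaded from here (the file must be converted to UTF-8): Chinese character code table - CSDN download channel - http://download.csdn.net/detail/slowwind9999/291213
Results can be verified with an online tool: Chinese pinyin / Wubi converter with tone marks - http://tool.lu/py5bconvert/
To convert Chinese characters to some other encoding, add a function modelled on hanzi2pinyin or hanzi2wubi, add one more dictionary attribute, and load its data in load_word.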
"""
import os.path
import sys

# Python 2 only: make UTF-8 the implicit str<->unicode codec for the process.
# reload(sys) is needed first because sys.setdefaultencoding is not reachable
# on the already-imported sys module. Neither name exists on Python 3, where
# text is unicode by default, so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf8')

__version__ = '0.9'
# Public API: must name a class that this module actually defines, otherwise
# "from <module> import *" raises AttributeError.
__all__ = ["Hanzi2code"]
class Hanzi2code(object):
    """Translate Chinese characters into pinyin or Wubi codes via a lookup table.

    The table is read from ``dict_file``: a UTF-8 text file whose first six
    lines are skipped and whose remaining lines hold whitespace-separated
    fields ``<character> <pinyin> <wubi> ...``.

    Attributes:
        word_dict: maps a character to its pinyin field.
        wubi_dict: maps a character to its Wubi field.
        dict_file: path of the table file the mappings are loaded from.
    """

    # Number of leading lines of the table file that are not data.
    _HEADER_LINES = 6

    def __init__(self, dict_file='code.txt'):
        """Remember the table path and load both mappings immediately.

        Args:
            dict_file: path of the UTF-8 encoded table file.

        Raises:
            IOError: if ``dict_file`` does not exist.
        """
        self.word_dict = {}
        self.wubi_dict = {}
        self.dict_file = dict_file
        self.load_word()

    def load_word(self):
        """Fill ``word_dict`` and ``wubi_dict`` from ``dict_file``.

        Lines that cannot be decoded or that have fewer than three fields are
        reported with ``err....`` on standard output and loading continues.
        A line with exactly two fields still gets its pinyin entry recorded.

        Raises:
            IOError: if ``dict_file`` does not exist.
        """
        if not os.path.exists(self.dict_file):
            raise IOError("NotFoundFile")
        # Read bytes and decode explicitly so the result does not depend on
        # the platform default encoding or on the Python major version.
        with open(self.dict_file, 'rb') as f_obj:
            raw_lines = f_obj.readlines()[self._HEADER_LINES:]
        for raw_line in raw_lines:
            try:
                fields = raw_line.decode('utf-8').split()
            except UnicodeDecodeError:
                fields = []
            if len(fields) >= 2:
                self.word_dict[fields[0]] = fields[1]
            if len(fields) >= 3:
                self.wubi_dict[fields[0]] = fields[2]
            else:
                print('err....')

    def hanzi2pinyin_split(self, string="", split=""):
        """Convert ``string`` to pinyin, optionally joined into one string.

        Args:
            string: text to convert (unicode text or UTF-8 bytes).
            split: separator placed between the per-character results.

        Returns:
            The list from ``hanzi2pinyin`` when ``split`` is the empty string,
            otherwise that list joined with ``split``.
        """
        result = self.hanzi2pinyin(string=string)
        if split == "":
            return result
        return split.join(result)

    def hanzi2code(self, string='', dic=None):
        """Look up every character of ``string`` in ``dic``.

        Args:
            string: text to convert; byte strings are decoded as UTF-8.
            dic: mapping from a character to its code. Keys may be unicode
                text or UTF-8 encoded bytes. ``None`` means an empty mapping.

        Returns:
            A list with one item per character: the first whitespace-separated
            token of the mapped code, lower-cased, or the lower-cased
            character itself when it has no usable code.
        """
        if dic is None:
            dic = {}
        if isinstance(string, bytes):
            string = string.decode("utf-8")
        result = []
        for char in string:
            code = dic.get(char)
            if code is None:
                # Fall back to UTF-8 byte keys, the key type of tables that
                # were built from undecoded file contents.
                code = dic.get(char.encode('utf-8'), char)
            tokens = code.split()
            # A whitespace character (or an all-blank code) has no tokens;
            # pass it through instead of indexing an empty list.
            result.append((tokens[0] if tokens else code).lower())
        return result

    def hanzi2wubi(self, string=''):
        """Return the list of Wubi codes for the characters of ``string``."""
        return self.hanzi2code(string, self.wubi_dict)

    def hanzi2pinyin(self, string=''):
        """Return the list of pinyin codes for the characters of ``string``."""
        return self.hanzi2code(string, self.word_dict)
if __name__ == "__main__":
    # Demo: load the default table (code.txt in the working directory) and
    # print the pinyin list, the "-"-joined pinyin and the Wubi list for a
    # sample sentence.
    test = Hanzi2code()
    string = "钓鱼岛是中国的"
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("in: %s" % string)
    print("out: %s" % str(test.hanzi2pinyin(string=string)))
    print("out: %s" % test.hanzi2pinyin_split(string=string, split="-"))
    print("out: %s" % str(test.hanzi2wubi(string=string)))
# Tags:
# Original article: http://www.cnblogs.com/QIAOXINGXING001/p/5023117.html