把上面得到的每个片段都当做一个词,采用逆向最大匹配方法进行分词(注意:这里虽然做了分词,但由于字典中的词并不能保证都是真正的词,所以单纯从分词的正确性上来说,分词结果往往是错的)。然后把所有词对各个类别的意见累加起来,取其中概率最大的类别作为分类结果。至此结束。为撑场面,我把分类程序的源码贴在下面。
# -*- coding: utf-8 -*-
# created by axuanwu 2015.1.25
# key word: hash count
import numpy as np
import math


def getseed(str1):
    """Return the hash fingerprint of a term.

    :param str1: the term, as a sequence of characters
    :return: non-negative integer fingerprint, folded to at most 256 bits
    """
    h = 0
    for x in str1:
        code = ord(x)
        # Characters above 256 get 12 bits of room, the others 6 bits.
        h <<= 12 if code > 256 else 6
        h += code
    # Fold the value back into 256 bits so it cannot grow without bound.
    # NOTE(review): the pasted original lost its indentation, so it is unclear
    # whether this fold ran inside the character loop or after it. Both
    # placements give the same result while the fingerprint is under 256 bits.
    mask = 2 ** 256 - 1
    while (h >> 256) > 0:
        h = (h & mask) ^ (h >> 256)
    return h


class MCard(object):
    """Hash-addressed table that assigns an integer id to every stored term.

    Each term is hashed to ``M_num`` slots of the ``MCARD`` array. When a term
    is inserted, every one of its slots that is still 0 receives the term's
    id. A lookup takes the largest id found in the term's slots and confirms
    it against the stored key list. Id 0 is reserved for "not found".

    ``set_card`` must be called before inserting or reading terms, because it
    is what allocates ``MCARD`` as a NumPy array.
    """

    def __init__(self):
        self.M_num = 8  # number of hash slots per term
        self.N_max = 16777216  # table length, always 2 ** nummax2
        self.nummax2 = 24  # number of bits used to address the table
        self.MCARD = [0]  # placeholder until set_card() allocates the table
        self.Opath = ""
        self.index = [0] * 8  # slot positions computed by the last getindex()
        self.__keys = ['first_NULL']  # id -> term; position 0 is a dummy entry
        self.i_key = 1  # id that the next inserted term will receive
        self.index2 = [0] * 8  # slot positions when getindex(for_up=True)

    def get_keys(self, iii=-1):
        """Return the term stored under id *iii*, or every term when iii is -1."""
        if iii == -1:
            return self.__keys[1:]
        else:
            return self.__keys[iii]

    def flush_key(self, iii):
        """Blank out the stored term text for id *iii*."""
        self.__keys[iii] = ""

    def getindex(self, str1, for_up=False):
        """Compute the ``M_num`` slot positions of a term.

        :param str1: the term to hash
        :param for_up: store the positions in ``index2`` instead of ``index``
        """
        seed = getseed(str1)
        for n in range(0, self.M_num):
            a = 0
            k = (n + 1)
            seed1 = seed
            # NOTE(review): getseed never returns a negative value, so this
            # branch cannot fire as written; kept unchanged pending review.
            if (seed >> 64) < 0:
                seed1 = seed * (n + 15048796327)
            while seed1 > 0:
                # Mix in the next nummax2-bit chunk of the fingerprint ...
                a ^= (seed1 & (self.N_max - 1)) + k
                # ... then rotate left by k bits within the table width.
                a = ((a << k) & (self.N_max - 1)) | (a >> (self.nummax2 - k))
                seed1 >>= self.nummax2
            if for_up:
                self.index2[n] = a
            else:
                self.index[n] = a

    def update_card(self, str1):
        """Insert a term and give it the next free id.

        The term is not checked against the terms already stored, so callers
        are expected to pass each term only once. Nothing is stored when no
        free slot is found within the probing limit of ``read_card``.

        :param str1: the term to insert
        """
        if self.read_card(str1, True) != 0:
            return
        # read_card left the positions of the usable probe in self.index.
        for pos in self.index:
            if self.MCARD[pos] == 0:
                self.MCARD[pos] = self.i_key
        if self.i_key % 10000 == 0:
            print(self.i_key)  # progress output
        self.i_key += 1
        self.__keys.append(str1)

    def read_card(self, str1, for_up=False):
        """Look up a term, or probe for a free slot when *for_up* is true.

        :param str1: the term to look up
        :param for_up: when true, search for a probe that still has an empty
            slot; returns 0 if one is found and -1 if all 10 probes are full
        :return: with ``for_up`` false, the id of the term, or 0 if absent
        """
        if for_up:
            for i in range(0, 10):  # try at most 10 probes
                if i > 5:
                    print(i)
                self.getindex(str1 + str(i))
                if self.MCARD[self.index].min() == 0:
                    return 0
            return -1
        for i in range(0, 10):  # follow at most 10 consecutive collisions
            self.getindex(str1 + str(i))
            aaa = int(self.MCARD[self.index].max())
            if aaa == 0:  # an untouched probe: the term was never stored
                return 0
            if aaa < self.N_max and str1 == self.__keys[aaa]:
                return aaa
        # Every probe was occupied by other terms (too few hash buckets).
        return 0

    def setbase(self, num1=16777216, num2=8):
        """Set the table geometry without allocating the table.

        :param num1: requested table length, rounded up to a power of two
        :param num2: number of hash slots per term
        :raises ValueError: if *num1* is smaller than 1
        """
        if num1 < 1:
            raise ValueError("table length must be at least 1")
        # bit_length gives an exact ceil(log2(num1)); the float expression
        # math.log(num1, 2) can round just above an integer for exact powers
        # of two, which would double the table size.
        self.nummax2 = (int(math.ceil(num1)) - 1).bit_length()
        self.N_max = 2 ** self.nummax2
        self.M_num = num2
        self.index = [0] * num2
        self.index2 = [0] * num2

    def set_card(self, kk=-1, dd=8):
        """Allocate (or reset) the hash table as an all-zero array.

        The stored keys and the id counter are left untouched.

        :param kk: table length; -1 keeps the geometry defined earlier
        :param dd: number of hash slots per term, used when *kk* is given
        :return: 0 when *kk* is -1, otherwise None
        """
        if -1 == kk:
            self.MCARD = np.zeros(self.N_max, dtype=int)
            return 0
        self.setbase(kk, dd)
        self.MCARD = np.zeros(self.N_max, dtype=int)

    def record_num(self):
        """Return the number of ids handed out so far."""
        return self.i_key - 1

    def card_test(self):
        """Print and return two hash-collision indicators.

        The first value is ``M_num * log10(used_slots / table_length)``; the
        second is ``M_num * log10((terms * M_num - used_slots) / used_slots)``.
        A value whose logarithm is undefined (argument <= 0) is reported as
        negative infinity.

        :return: tuple with the two indicators
        """
        aaa = self.record_num()
        bbb = self.N_max
        ccc = int((np.asarray(self.MCARD) > 0).sum())
        ddd = self.M_num

        def scaled_log10(value):
            if value > 0:
                return math.log(value, 10) * ddd
            return float("-inf")

        fill = scaled_log10(1.0 * ccc / bbb)
        if ccc > 0:
            overlap = scaled_log10((1.0 * aaa * ddd - ccc) / ccc)
        else:
            overlap = float("-inf")
        print("%s %s" % (fill, overlap))
        return fill, overlap
__author__ = 'axuanwu'
# coding=utf8
import io
import re
import sys
import os
import time
import math
import numpy as np
from myclass import *


class ReadClassify(object):
    """Text classifier that sums per-class counts of dictionary fragments.

    A document is cut into fragments, each known fragment contributes its row
    of ``word_count`` to ``class_score`` (at most once per document), and the
    class with the highest score is chosen.
    """

    def __init__(self):
        self.m_card = MCard()  # fragment -> row id lookup
        self.dict_class = {}  # class directory name -> class index
        # Row 0: predictions per class, row 1: correct ones, row 2: ratio.
        self.classify_tongji = np.zeros((3, 9))
        self.class_str = []  # class index -> class directory name
        self.m_card.set_card(2 ** 27, 6)
        self.mat_row = 3000000
        self.i_file = 0  # running number of the document being classified
        # Number of the last document in which each fragment was counted.
        self.class_tail = np.zeros(self.mat_row)
        # Holds the 3 million most common fragments, one row per fragment.
        self.word_count = np.zeros((3000000, 9), float)
        self.class_score = np.zeros(9)
        self.root_dir = ""
        self.max_word_length = 5
        self.re_ch = re.compile(u"[\u4E00-\u9FA5]+", re.U)
        self.re_eng = re.compile(u"[a-zA-Z0-9+\\._@]+", re.U)
        self.fazhi = 3

    def set_dict_class(self):
        """Number the class directories under ``<root_dir>/train``."""
        file_list = os.listdir(os.path.join(self.root_dir, "train"))
        for i, i_dir in enumerate(file_list):
            self.dict_class[i_dir] = i
            self.class_str.append(i_dir)

    def set_fazhi(self):
        """Derive ``fazhi`` from the counts listed in ``canshu.txt``.

        The file is read from the current working directory, one integer per
        line. Counts are consumed from the last line backwards until the
        remaining budget of ``mat_row`` no longer exceeds the current count;
        ``fazhi`` becomes that line index, but never less than 2.
        """
        with open(os.path.join(os.getcwd(), "canshu.txt"), "r") as o_file:
            count_my = [int(line) for line in o_file if line.strip()]
        # Keep the historical minimum length of 200 entries.
        count_my.extend([0] * (200 - len(count_my)))
        i = len(count_my) - 1
        a = self.mat_row
        # Stop at the front of the list instead of wrapping to negative
        # indices when the counts never reach the budget.
        while i >= 0 and count_my[i] < a:
            a -= count_my[i]
            i -= 1
        self.fazhi = max(2, i)

    def set_root(self, path="C:\\Users\\01053185\\Desktop\\yuliao\\yuliao"):
        """Set the corpus root directory."""
        self.root_dir = path

    def load_dict(self):
        """Load fragment statistics from ``tong_ji2new.txt``.

        The file is read from the current working directory as UTF-8. Each
        line holds a fragment followed by tab-separated per-class numbers.
        Loading stops once the last row of ``word_count`` has been filled.
        """
        print("loading knowledge takes 1~2 min")
        max_rows, n_class = self.word_count.shape
        dict_file = os.path.join(os.getcwd(), "tong_ji2new.txt")
        with io.open(dict_file, "r", encoding="utf-8",
                     errors="ignore") as dict_path:
            for line in dict_path:
                line_s = line.strip().split("\t")
                word = line_s[0]
                if not word:
                    continue
                # Fresh row per line so that short lines do not inherit
                # values from the previous line; extra columns are ignored.
                counts = np.zeros(n_class, float)
                values = [float(v) for v in line_s[1:1 + n_class]]
                counts[:len(values)] = values
                self.m_card.update_card(word)
                aaa = self.m_card.read_card(word)
                if aaa == 0:
                    # The table could not store the fragment; row 0 is what
                    # unknown fragments map to, so it must stay empty.
                    continue
                if aaa >= max_rows:
                    break
                self.word_count[aaa] = counts
                if aaa == max_rows - 1:
                    break
        print("loading knowledge done")

    def _score_word(self, index_word):
        """Add a fragment's class counts to the score, once per document."""
        if index_word == 0:  # unknown fragment
            return
        if self.class_tail[index_word] == self.i_file:
            return  # already counted for the current document
        self.class_score += self.word_count[index_word]
        self.class_tail[index_word] = self.i_file

    def _score_english(self, sentence):
        """Score every run of Latin letters, digits and ``+._@``."""
        for blk in self.re_eng.findall(sentence):
            self._score_word(self.m_card.read_card(blk))

    def cut_classify2(self, sentence):
        """Score every known fragment of 2..max_word_length characters."""
        for blk in self.re_ch.findall(sentence):
            for end in range(len(blk), 1, -1):
                for width in range(self.max_word_length, 1, -1):
                    if width > end:
                        continue
                    fragment = blk[end - width:end]
                    self._score_word(self.m_card.read_card(fragment))
        self._score_english(sentence)

    def cut_classify3(self, sentence):
        """Score fragments found by forward maximum matching."""
        for blk in self.re_ch.findall(sentence):
            len_blk = len(blk)
            i = 0
            # "<=" so that the final two-character window is tested too.
            while i <= len_blk - 2:
                step = 1  # advance one character when nothing matches
                longest = min(self.max_word_length, len_blk - i)
                for j in range(longest, 1, -1):
                    index_word = self.m_card.read_card(blk[i:i + j])
                    if index_word != 0:
                        self._score_word(index_word)
                        step = j
                        break
                i += step
        self._score_english(sentence)

    def cut_classify(self, sentence):
        """Score fragments found by backward maximum matching."""
        for blk in self.re_ch.findall(sentence):
            i = len(blk)
            while i >= 2:
                step = 1  # retreat one character when nothing matches
                for j in range(min(self.max_word_length, i), 1, -1):
                    index_word = self.m_card.read_card(blk[i - j:i])
                    if index_word != 0:
                        self._score_word(index_word)
                        step = j
                        break
                i -= step
        self._score_english(sentence)

    def classify_read(self):
        """Classify every file under ``<root_dir>/train`` and record results.

        One line per file (path, predicted class, 1/0 for correct/incorrect)
        is written to ``class_result.txt`` in the current working directory.
        ``classify_tongji`` is updated with the per-class tallies; classes
        that were never predicted get ``nan`` in the ratio row.
        """
        class_result = os.path.join(os.getcwd(), "class_result.txt")
        train_root = os.path.join(self.root_dir, "train")
        n_class = self.word_count.shape[1]
        with open(class_result, "w") as o_file:
            for sdir in os.listdir(train_root):
                dir_path = os.path.join(train_root, sdir)
                for files in os.listdir(dir_path):
                    self.i_file += 1
                    file_path = os.path.join(dir_path, files)
                    self.class_score = np.zeros(n_class)
                    with io.open(file_path, "r", encoding="gbk",
                                 errors="replace") as i_file:
                        for line in i_file:
                            self.cut_classify3(line.strip())
                    # First class that reaches the maximum score wins.
                    best = int(np.argmax(self.class_score))
                    predicted = self.class_str[best]
                    column = self.dict_class[predicted]
                    hit = sdir == predicted
                    self.classify_tongji[0, column] += 1
                    if hit:
                        self.classify_tongji[1, column] += 1
                    flag = "1\n" if hit else "0\n"
                    o_file.write(file_path + "\t" + predicted + "\t" + flag)
        # NumPy signals 0/0 with a warning, not an exception; silence it.
        with np.errstate(divide="ignore", invalid="ignore"):
            self.classify_tongji[2] = (self.classify_tongji[1]
                                       / self.classify_tongji[0])


if __name__ == "__main__":
    my_classify = ReadClassify()
    my_classify.set_root()
    a = time.time()
    my_classify.set_dict_class()
    my_classify.load_dict()
    print("time is : %s s" % (time.time() - a))
    my_classify.classify_read()
    print("time is : %s s" % (time.time() - a))
    print(my_classify.classify_tongji)