标签:style blog color os ar for sp div on
1 #!/usr/bin/env python
2 #encoding=gbk
3 import os
4 import sys
5
6 G_ENCODING="gbk"
7 """
8 ===============================
9 中文分词
10 1. 机械分词
11 2. 统计分词
12 3. 理解分词
13 ===============================
14 基于树形结构分词策略(结合机械分词,统计分词)
15 例:笔记本电脑
16 dict = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9}
17 -------------------------------
18 |             <s>             |
19 -------------------------------
20   /          /            |            \
21 [笔]       [笔记]      [笔记本]     [笔记本电脑]
22   |          |          /    \
23 [记]       [本]       [电]  [电脑]
24   |         /  \        |
25 [本]     [电]  [电脑]   [脑]
26  /  \      |
27 [电] [电脑]  [脑]
28   |
29 [脑]
30 -------------------------------
31 path: 笔 记 本 电 脑 -- score: [0.32768]
32 path: 笔 记 本 电脑 -- score: [0.4608]
33 path: 笔记 本 电 脑 -- score: [0.4608]
34 path: 笔记 本 电脑 -- score: [0.648]
35 path: 笔记本 电 脑 -- score: [0.576]
36 path: 笔记本 电脑 -- score: [0.81]
37 path: 笔记本电脑 -- score: [0.9]
38
39 best path: 笔记本电脑 -- score: [0.9]
40
41 -------------------------------
42 1、路径加权(通过搜索引擎获取词语的词频,获得词语的权重)
43 2、最少切分、OOV、最少单字等策略
44 ==获取最佳分词路径
45 -------------------------------
46 Q1、如果句子过长,树非常大,遍历费时(需优化)
47 Q2、字典加载(需优化)
48 以下给出该思想的简单实现[python]:
49 """
50
class Stack(object):
    """Minimal LIFO stack backed by a growable Python list.

    Attributes:
        list: the stored elements, bottom first.
        top:  number of elements currently stored (0 means empty).
    """

    def __init__(self, volume = 0):
        """Create an empty stack.

        ``volume`` is accepted for backward compatibility only. The stack
        grows on demand, so no capacity has to be reserved up front and
        pushing can never overflow a fixed-size buffer.
        """
        self.list = []
        self.top = 0

    def push(self, element):
        """Put ``element`` on top of the stack."""
        self.list.append(element)
        self.top += 1

    def pop(self):
        """Remove and return the top element, or None when the stack is empty."""
        if self.top == 0:
            # Popping an empty stack must not disturb the counter, otherwise
            # empty() would stop reporting True afterwards.
            return None
        self.top -= 1
        return self.list.pop()

    def empty(self):
        """Return True when the stack holds no elements."""
        return self.top == 0
70
class Node(object):
    """One node of the segmentation tree.

    Attributes:
        data:   payload stored in the node (a token, or "<s>" for the root).
        next:   list of child nodes.
        prev:   parent node, or None.
        depth:  distance from the root; updated by Tree.append.
        wlen:   value supplied by the caller; Tokenizer.load stores the
                sentence offset at which this node's token ends.
        weight: score attached to the token.
    """

    def __init__(self, data, next = None, prev = None, depth = 0, wlen = 0, weight = 0.0):
        self.data = data
        # A fresh list per node: children must never be shared between nodes.
        self.next = [] if next is None else next
        self.prev = prev
        self.depth = depth
        self.wlen = wlen
        self.weight = weight

    def isLeaf(self):
        """Return True when the node has no children."""
        return not self.next
82
class Tree(object):
    """Rooted tree of nodes linked through their ``next`` / ``prev`` attributes."""

    def __init__(self, root = None):
        self.root = root

    def append(self, node, cnode):
        """Attach ``cnode`` as the last child of ``node``.

        Sets ``cnode.prev`` to ``node`` and ``cnode.depth`` to
        ``node.depth + 1``.

        Returns:
            0 on success, -1 when either argument is None.
        """
        if node is None or cnode is None:
            return -1
        node.next.append(cnode)
        cnode.prev = node
        cnode.depth = node.depth + 1
        return 0

    def depth_first_search(self, node):
        """Return the nodes of the subtree rooted at ``node`` in preorder.

        Children are visited in the order they were appended. Returns an
        empty list when ``node`` is None.
        """
        visited = []
        if node is None:
            return visited
        # A plain list serves as the stack, so the traversal is not limited
        # to a fixed number of pending nodes.
        pending = [node]
        while pending:
            current = pending.pop()
            visited.append(current)
            # Push children right-to-left so the leftmost child is popped first.
            pending.extend(reversed(current.next))
        return visited
107
class Tokenizer(object):
    """Tree-based word segmenter: expand every segmentation, then score the paths."""

    def load(self, tree, pnode, cache, dict):
        """Grow ``tree`` below ``pnode`` with every segmentation of the rest of ``cache``.

        Args:
            tree:  Tree being built.
            pnode: node to expand; its ``wlen`` is the offset in ``cache``
                   where its token ends.
            cache: the sentence as text (unicode on Python 2).
            dict:  mapping from token to weight.

        A candidate substring becomes a child node when it is a key of
        ``dict`` or when it is a single character, so there is always at
        least one way to continue and every leaf covers the whole sentence.

        Returns:
            0.
        """
        clen = len(cache)
        for node in tree.depth_first_search(pnode):
            if not node.isLeaf():
                continue
            start = node.wlen
            for end in range(start + 1, clen + 1):
                piece = cache[start:end]
                # Python 2 unicode slices are encoded with G_ENCODING before
                # the lookup (the original behaviour); native str slices are
                # used as they are.
                key = piece if isinstance(piece, str) else piece.encode(G_ENCODING)
                # The single-character fallback is decided on the character
                # span, not on len(key): a GBK-encoded Chinese character is
                # two bytes long and would never pass a byte-length test.
                if key in dict or end - start == 1:
                    tnode = Node(key, wlen = end, weight = dict.get(key))
                    tree.append(node, tnode)
                    self.load(tree, tnode, cache, dict)
        return 0

    def backtrance(self, node, list):
        """Append the ancestors of ``node`` to ``list``, nearest ancestor first.

        The walk stops before the first ancestor whose data is "<s>" (the
        root marker) or when there is no further parent.

        Returns:
            0.
        """
        parent = node.prev
        while parent is not None and parent.data != "<s>":
            list.append(parent)
            parent = parent.prev
        return 0

    def bestpath(self, tree):
        """Print every root-to-leaf path with its score and return the best one.

        A path's score is the product of the weights of its nodes. Nodes
        whose weight is missing (None) or not positive contribute a factor
        of 1. The path is rendered as the node data joined by single spaces.

        Returns:
            The rendered path with the highest score, or "" when the tree
            holds nothing but its root.
        """
        highest_score = 0
        best = ""
        for node in tree.depth_first_search(tree.root):
            # Only leaves end a path; a childless root means nothing was
            # segmented and is not a path.
            if node is tree.root or not node.isLeaf():
                continue
            path = [node]
            self.backtrance(node, path)
            path.reverse()
            score = 1.0
            for item in path:
                if item.weight is not None and item.weight > 0:
                    score *= item.weight
            text = " ".join([item.data for item in path])
            if score > highest_score:
                highest_score = score
                best = text
            print("path: %s -- score: [%s]" % (text, score))
        print("\nbest path: %s -- score: [%s]" % (best, highest_score))
        return best
def example():
    """Segment the sample sentence, print every candidate path and return the best one."""
    sent = "笔记本电脑"
    lexicon = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9}
    # On Python 2 the literal is a byte string and is decoded with G_ENCODING;
    # a text string is used as it is.
    cache = sent.decode(G_ENCODING) if isinstance(sent, bytes) else sent
    tokenizer = Tokenizer()
    tree = Tree(Node("<s>"))
    # Build the tree of all segmentations.
    tokenizer.load(tree, tree.root, cache, lexicon)
    # Walk back from every leaf and pick the best-scoring path.
    return tokenizer.bestpath(tree)


# The demo runs whenever the module is executed or imported, as in the original.
example()
标签:style blog color os ar for sp div on
原文地址:http://www.cnblogs.com/ariesblogs/p/4063106.html