【算法】基于树形结构分词

时间：2014-10-30 18:38:30 阅读：228 评论：0 收藏：0 [点我收藏+]
  1 #!/usr/bin/env python
  2 #encoding=gbk
  3 import os
  4 import sys
  5 
  6 G_ENCODING="gbk"
  7 """
  8 ===============================
  9 中文分词
 10 1. 机械分词
 11 2. 统计分词
 12 3. 理解分词
 13 ===============================
 14 基于树形结构分词策略(结合机械分词，统计分词)
 15 例：笔记本电脑 
 16     dict = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9}
 17          -------------------------------
 18         |              <s>              |
 19          -------------------------------
 20         /         /         \            \
 21       [笔]     [笔记]    [笔记本]    [笔记本电脑]
 22        /          /        /    \
 23      [记]       [本]     [电] [电脑]
 24       /         /   \
 25     [本]      [电] [电脑]
 26     /  \       /
 27  [电] [电脑] [脑]
 28   /
 29 [脑] 
 30 -------------------------------
 31 path: 笔 记 本 电 脑  -- score: [0.32768]
 32 path: 笔 记 本 电脑   -- score: [0.4608]
 33 path: 笔记 本 电 脑   -- score: [0.4608]
 34 path: 笔记 本 电脑    -- score: [0.648]
 35 path: 笔记本 电 脑    -- score: [0.576]
 36 path: 笔记本 电脑     -- score: [0.81]
 37 path: 笔记本电脑      -- score: [0.9]
 38 
 39 best path: 笔记本电脑 -- score: [0.9]
 40 
 41 -------------------------------
 42 1、路径加权（通过搜索引擎获取词语的词频，获得词语的权重）
 43 2、最少切分、OOV、最少单字等策略
 44 ==获取最佳分词路径
 45 -------------------------------
 46 Q1、如果句子过长，树非常大，遍历费时（需优化）
 47 Q2、字典加载（需优化）
 48 以下给出该思想的简单实现[python]:
 49 """
 50 
 51 class Stack():
 52     def __init__(self, volume = 0):
 53         self.list = [] if volume == 0 else [0 for i in range(0,volume)]
 54         self.top = 0
 55 
 56     def push(self, element):
 57         if self.list != None: 
 58             self.top += 1
 59             self.list[self.top] = element
 60 
 61     def pop(self):
 62         if self.list != None and self.top >= 0:
 63             ele = self.list[self.top]
 64             self.list[self.top] = None
 65             self.top -= 1
 66             return ele
 67         return None
 68     def empty(self):
 69         return self.top == 0
 70     
 71 class Node():
 72     def __init__(self, data, next = None, prev = None, depth = 0, wlen = 0, weight = 0.0):
 73         self.data = data
 74         self.next = next if next != None else []
 75         self.prev = prev
 76         self.depth = depth
 77         self.wlen = wlen
 78         self.weight = weight
 79 
 80     def isLeaf(self):
 81         return self.next == None or self.next == []
 82 
 83 class Tree():
 84     def __init__(self, root = None):
 85         self.root = root
 86     """append a child node to child"""
 87     def append(self, node, cnode):
 88         if node != None and cnode != None:
 89             node.next.append(cnode)
 90             cnode.prev = node
 91             cnode.depth = node.depth + 1
 92             return 0
 93         return -1
 94 
 95     """depth first search(binary preorder)"""  
 96     def depth_first_search(self, node):
 97         list = []
 98         if node != None:
 99             stack = Stack(30)
100             stack.push(node)
101             while not stack.empty():
102                 tmp = stack.pop()
103                 list.append(tmp)
104                 for i in range(len(tmp.next) - 1, -1, -1):
105                     stack.push(tmp.next[i])
106         return list
107 
class Tokenizer():
    """Tree-based word segmenter.

    ``load`` expands a tree with every way of cutting a sentence into
    dictionary words (plus single-character fallbacks); ``bestpath`` scores
    each root-to-leaf path and returns the highest-scoring one.
    """

    def load(self, tree, pnode, cache, dict):
        """Grow ``tree`` below ``pnode`` with all segmentations of ``cache``.

        Args:
            tree:  the Tree being built.
            pnode: node whose leaf descendants are expanded (usually the root).
            cache: the sentence as a character (unicode) string; it is sliced
                   per character and each slice is encoded with G_ENCODING
                   before the dictionary lookup.
            dict:  maps G_ENCODING-encoded words to their weight.

        Returns:
            Always 0.
        """
        clen = len(cache)
        for node in tree.depth_first_search(pnode):
            if not node.isLeaf():
                continue
            start = node.wlen
            for end in range(start + 1, clen + 1):
                word = cache[start:end].encode(G_ENCODING)
                # Accept dictionary words, and always accept a single
                # character so an unknown character cannot dead-end a path.
                # The length is measured in characters: the encoded form of
                # one Chinese character is two bytes, so testing len(word)
                # would reject it.
                if word in dict or end - start == 1:
                    # Unknown words get weight 0.0, which bestpath treats as
                    # a neutral factor.
                    child = Node(word, wlen=end, weight=dict.get(word, 0.0))
                    tree.append(node, child)
                    self.load(tree, child, cache, dict)
        return 0

    def backtrance(self, node, list):
        """Append the ancestors of ``node`` to ``list``, nearest first.

        Walks the ``prev`` links and stops before a node whose data is
        "<s>" (the root sentinel) or when there is no parent.

        Returns:
            Always 0; the result is delivered through ``list``.
        """
        ancestor = node.prev
        while ancestor is not None and ancestor.data != "<s>":
            list.append(ancestor)
            ancestor = ancestor.prev
        return 0

    def bestpath(self, tree):
        """Print every segmentation path with its score and return the best.

        A path's score is the product of its nodes' weights; a weight that
        is missing or not positive counts as 1.  Possible refinements noted
        by the author: weight words by search-engine frequency, prefer fewer
        cuts, handle OOV words, prefer fewer single characters.

        Returns:
            The words of the highest-scoring path joined by spaces, or ""
            when the tree has no words.
        """
        highestScore = 0
        bestpath = ""
        for node in tree.depth_first_search(tree.root):
            if not node.isLeaf():
                continue
            if node is tree.root:
                # Empty tree: the sentinel alone is not a segmentation.
                continue
            # Collect the path leaf -> root, then flip it into reading order.
            path = [node]
            self.backtrance(node, path)
            path.reverse()
            score = 1.0
            for step in path:
                weight = step.weight
                score *= weight if weight and weight > 0 else 1
            text = " ".join([step.data for step in path]).strip()
            if score > highestScore:
                highestScore = score
                bestpath = text
            print("path: %s -- score: [%s]" % (text, score))
        print("\nbest path: %s -- score: [%s]" % (bestpath, highestScore))
        return bestpath
def example():
    """Segment the sample phrase and print every candidate path and the best one."""
    sentence = "笔记本电脑"
    weights = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9}
    # Decode the byte-string literal so the tokenizer can slice it per character.
    text = unicode(sentence, G_ENCODING)
    # "<s>" is the sentinel root under which all segmentations hang.
    tree = Tree(Node("<s>"))
    segmenter = Tokenizer()
    # Build the full segmentation tree, then score its paths.
    segmenter.load(tree, tree.root, text, weights)
    segmenter.bestpath(tree)
example()
【算法】基于树形结构分词
标签：style blog color os ar for sp div on
原文地址：http://www.cnblogs.com/ariesblogs/p/4063106.html
踩
(0)
评论一句话评论（0）
分享档案
更多>
2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)
周排行