代码整理

时间：2017-06-03 20:59:38 阅读：254 评论：0 收藏：0 [点我收藏+]

标签：ignore 就会案例 return logs 神经网络 sla range main

20170512 051201.py 文档读入数据，分列读取，计数

 1 import re
 2 import sys
 3 import importlib, sys
 4 importlib.reload(sys)
 5 import pprint
 6 
 7 def translate(str): # 正则表达式，筛选剩余数字和中文
 8     line = str.strip() #.decode(‘utf-8‘, ‘ignore‘) #
 9     p2 = re.compile(‘[^-\d+\u4e00-\u9fa5]‘) #编译形成正则表达式，重复使用 .\d+ -\d+
10     zh = " ".join(p2.split(line)).strip()
11     #以非中文进行分解，以空格连接。若非中文字符在一起，join拼接默认去掉空字符串‘‘
12     #zh = ",".join(zh.split()) #以空白分隔之后以逗号连接
13     outStr = zh
14     return outStr.split()
15 
# Sample contents of data.txt: one review per line, a label (1 / -1)
# followed by already-segmented words.
#   1 发动机 无异 响 。
#   1 发动机 噪音 小 。
#   1 方向盘 握 感 不错 。
#   -1 刹车 和 起步 有 异 响 。
#   -1 并且 发动机 噪音 有点 大 。
#   -1 发动机 声音 响 , 的确 响 。
if __name__ == '__main__':
    # Raw string: the plain literal "D:\data.txt" only works because "\d"
    # happens to be an unrecognised escape sequence.
    with open(r"D:\data.txt", 'rt', encoding='utf8') as file:
        lines = file.readlines()

    labels = []      # first column of every line, e.g. ['1', '1', '1', '-1', ...]
    token_rows = []  # remaining tokens of every line, label excluded
    for line in lines:
        tokens = translate(line)
        if not tokens:
            # A blank or punctuation-only line has no label to read.
            continue
        labels.append(tokens[0])
        token_rows.append(tokens[1:])

    # Sorted, de-duplicated vocabulary.  Sorting fixes the column order, so
    # the matrix is reproducible from one run to the next.
    vocabulary = sorted({word for row in token_rows for word in row})

    # One row per input line, one column per vocabulary word; each cell is
    # the number of times that word occurs in that line.
    wordlines = [[row.count(word) for word in vocabulary] for row in token_rows]

    print(vocabulary)
    # Pretty-printed as one matrix row per line.  For the sample above the
    # first row has 1 in the columns of 发动机, 无异 and 响 and 0 elsewhere;
    # the last row has 2 in the column of 响.
    pprint.pprint(wordlines)

20170512 部分未整理，关于BP神经网络和神经网络最大熵。

20170518 部分未整理，关于LTP分词。

20170518 051803.py 051804.py 结巴分词的使用案例

051806.py 分词，去停用词，统计词频，以字典形式排序输出。

 1 import jieba
 2 import re
 3 def translate(str):
 4     line = str.strip() #.decode(‘utf-8‘, ‘ignore‘) #
 5     p2 = re.compile(‘[^-\d+\u4e00-\u9fa5]‘) #编译形成正则表达式，重复使用 .\d+ -\d+
 6     zh = " ".join(p2.split(line)).strip()
 7     #以非中文进行分解，以空格连接。若非中文字符在一起，join拼接默认去掉空字符串‘‘
 8     #zh = ",".join(zh.split()) #以空白分隔之后以逗号连接
 9     outStr = zh
10     return outStr.split()
11 
if __name__ == '__main__':
    from collections import Counter

    # Source text, split into clauses on the full-width comma.
    with open('flash.txt', 'rt', encoding='utf-8') as file:
        text = file.read()
    alist = text.split('，')

    # Stop words are whitespace-separated in synStop.txt.  They are kept as
    # a set of whole words: joining them back into one string and calling
    # set() on it would yield single characters, and multi-character stop
    # words would never be filtered out.
    with open('synStop.txt', 'rt', encoding='utf-8') as fstop:
        stopwords = set(fstop.read().split())

    # Single pass: segment each clause once and count every surviving token.
    # Tokens are not de-duplicated per clause, so a word occurring twice in
    # one clause counts twice.
    frequency = Counter()
    for index, line in enumerate(alist):
        print(index)  # progress indicator: index of the clause being processed
        seg_list = jieba.cut(line, cut_all=False)
        tokens = translate("  ".join(seg_list))
        frequency.update(word for word in tokens if word not in stopwords)

    # (word, count) pairs, highest count first; ties keep first-seen order.
    sort_all = frequency.most_common()

    # Written as UTF-8 to match the input files instead of relying on the
    # platform default encoding.
    with open('output4.txt', 'w', encoding='utf-8') as output:
        for entry in sort_all:
            print(entry)
            output.write(str(entry) + '\n')

051807.py n-gram，计数

 1 l = [1,2,3,1,2,3,4,5,6]
 3 def count(l,num):
 4     worddict = {}
 5     for i in range(len(l)-(num-1)):
 6         t = tuple(l[i:i + num],)
 7         if t in worddict.keys():
 8             worddict[t]+=1
 9         else:
10             worddict[t] = 1
11     return worddict
13 print([count(l,1),count(l,2),count(l,3)])
14 ‘‘‘
15 [{(1,): 2, (2,): 2, (3,): 2, (4,): 1, (5,): 1, (6,): 1}, {(1, 2): 2, (2, 3): 2, (3, 1): 1, (3, 4): 1, (4, 5): 1, (5, 6): 1}, {(1, 2, 3): 2, (2, 3, 1): 1, (3, 1, 2): 1, (2, 3, 4): 1, (3, 4, 5): 1, (4, 5, 6): 1}]
16 ‘‘‘

代码整理

标签：ignore 就会案例 return logs 神经网络 sla range main

原文地址：http://www.cnblogs.com/Joyce-song94/p/6938339.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行