文本挖掘(Text Mining)是从非结构化文本信息中获取用户感兴趣或者有用的模式的过程。
import sys
import os
import time
from lxml import etree, html
from sklearn.datasets.base import Bunch
import pickle
from time import time
# Base directory of the chapter's sample data.
root = 'D:/MLBook' + '/chapter02'
# Path of the .htm file to read.
path = root + "/1.htm"
with open(path, "rb") as fp:
    # Bytes are decoded with the default codec (UTF-8).
    content = fp.read().decode()
    page = html.document_fromstring(content)  # parse the HTML document
    text = page.text_content()  # drop every tag, keep only the text
print(text[:100])  # show the first 100 characters of the tag-free text
import jieba
# Default (accurate) mode, requested explicitly with cut_all=False.
seg_list = jieba.cut(
    "小明1995年毕业于北京清华大学",
    cut_all=False,
)
print("Default Mode:", " ".join(seg_list))

# Same call relying on the default value of cut_all.
seg_list = jieba.cut("小明1995年毕业于北京清华大学")
print(*seg_list, sep=" ")

# Full mode.
seg_list = jieba.cut(
    "小明1995年毕业于北京清华大学",
    cut_all=True,
)
print("Full Mode:", "/ ".join(seg_list))

# Search-engine mode; the two literals concatenate into one sentence.
seg_list = jieba.cut_for_search(
    "小明硕士毕业于中国科学院计算所,"
    "后在日本京都大学深造"
)
print(*seg_list, sep="/ ")
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\q7356\AppData\Local\Temp\jieba.cache
Loading model cost 0.992 seconds.
Prefix dict has been built succesfully.
Default Mode: 小明 1995 年 毕业 于 北京 清华大学
小明 1995 年 毕业 于 北京 清华大学
Full Mode: 小/ 明/ 1995/ 年/ 毕业/ 于/ 北京/ 清华/ 清华大学/ 华大/ 大学
小明/ 硕士/ 毕业/ 于/ 中国/ 科学/ 学院/ 科学院/ 中国科学院/ 计算/ 计算所/ ,/ 后/ 在/ 日本/ 京都/ 大学/ 日本京都大学/ 深造
# Default-mode segmentation of a sentence ending in punctuation.
seg_list = jieba.cut("小明终于在1995年从北京清华大学毕业了。")
print(*seg_list, sep=" ")
小明 终于 在 1995 年 从 北京 清华大学 毕业 了 。
# List the category folders of the segmented corpus.
os.listdir(root + '/train_corpus_seg')
class Segment:
    """Segment a categorised corpus with jieba and pack the result in a Bunch.

    Every path handed to the methods is appended to ``root`` by plain string
    concatenation, so ``root`` and the directory arguments are expected to
    end with ``/``.
    """

    def __init__(self, root):
        """Remember ``root``, the prefix prepended to every relative path."""
        self.root = root

    def save_file(self, save_path, content):
        """Write the string ``content`` to ``save_path`` as UTF-8 bytes."""
        with open(save_path, 'wb') as fp:
            fp.write(content.encode())

    def read_file(self, path):
        """Return the content of ``path`` decoded as UTF-8."""
        with open(path, 'rb') as fp:
            content = fp.read().decode()
        return content

    def get_seg(self, corpus_path, seg_path):
        """Segment every file of every category directory of the corpus.

        Args:
            corpus_path: directory of the raw corpus, relative to ``root``;
                each sub-directory is one category.
            seg_path: directory, relative to ``root``, that receives the
                segmented files under the same category/file names.
        """
        corpus_path = self.root + corpus_path
        seg_path = self.root + seg_path
        catelist = os.listdir(corpus_path)
        start = time()
        for i, mydir in enumerate(catelist):
            class_path = corpus_path + mydir + "/"  # category source directory
            seg_dir = seg_path + mydir + "/"  # category output directory
            # Create the output directory when it does not exist yet.
            os.makedirs(seg_dir, exist_ok=True)
            file_list = os.listdir(class_path)
            for k, file_path in enumerate(file_list):
                fullname = class_path + file_path
                content = self.read_file(fullname).strip()
                content = content.replace("\r\n", "")  # drop Windows line breaks
                content_seg = jieba.cut(content.strip())
                # Store the tokens separated by single spaces.
                self.save_file(seg_dir + file_path, " ".join(content_seg))
                if k == 0 and i == 0:
                    print('--' * 20)
            if i == 0:
                print(' 完成语料分词的类别依次为:')
            print('\t%i: %s'%(i, mydir))
        print('--' * 20)
        print("总计花费时间 %g 秒,中文语料分词结束!!!"%(time() - start))

    def get_bunch(self, wordbag_path, seg_path):
        """Collect the segmented corpus into a Bunch and pickle it.

        Args:
            wordbag_path: file, relative to ``root``, the Bunch is pickled to,
                e.g. ``"train_word_bag/train_set.dat"``.
            seg_path: directory of the segmented corpus, relative to ``root``;
                each sub-directory is one category.

        Returns:
            The Bunch with ``target_name`` (category names), and per file
            ``label``, ``filenames`` and ``contents``.
        """
        wordbag_path = self.root + wordbag_path
        seg_path = self.root + seg_path
        catelist = os.listdir(seg_path)
        bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
        bunch.target_name.extend(catelist)  # record the category names
        start = time()
        for i, mydir in enumerate(catelist):
            class_path = seg_path + mydir + "/"  # category directory
            file_list = os.listdir(class_path)
            for k, file_path in enumerate(file_list):
                fullname = class_path + file_path
                bunch.label.append(mydir)  # category label of this file
                bunch.filenames.append(fullname)  # full path of this file
                bunch.contents.append(self.read_file(fullname).strip())
                if k == 0 and i == 0:
                    print('--' * 20)
            if i == 0:
                print(' 文本对象构建的类别依次为:')
            print('\t%i: %s'%(i, mydir))
        print('--' * 20)
        # Persist the object.
        with open(wordbag_path, "wb") as file_obj:
            pickle.dump(bunch, file_obj)
        print("总计花费时间 %g 秒,构建文本对象结束!!!"%(time() - start))
        return bunch
# Segment only concatenates paths, so root must end with "/".
root = 'D:/MLBook/chapter02/'
corpus_path = "train_corpus_small/"  # raw (unsegmented) categorised corpus
seg_path = "train_corpus_seg/"  # destination of the segmented corpus
S = Segment(root)
S.get_seg(corpus_path, seg_path)
0: art
1: computer
2: economic
3: education
4: environment
5: medical
6: military
7: politics
8: sports
9: traffic
总计花费时间 19.8525 秒,中文语料分词结束!!!
在实际应用中,为了后续生成向量空间模型的方便,这些分词后的文本信息还要转换为文本向量信息并对象化。这里我们使用 Scikit-Learn 库的 Bunch 数据结构:
(key, value)
contents:分词后文件词向量形式
from sklearn.datasets.base import Bunch
将分好词的文本转换并持久化为 Bunch 类形式:
# File the pickled Bunch of the segmented corpus is written to.
wordbag_path = "train_word_bag/train_set.dat"
# Directory holding the segmented, categorised corpus.
seg_path = "train_corpus_seg/"
S = Segment(root)
bunch = S.get_bunch(wordbag_path=wordbag_path, seg_path=seg_path)
0: art
1: computer
2: economic
3: education
4: environment
5: medical
6: military
7: politics
8: sports
9: traffic
总计花费时间 0.891003 秒,构建文本对象结束!!!
这样就在目录下生成了一个 train_set.dat
文本1:My dog ate my homework
文本2:My cat ate the sandwich
文本3:A dolphin ate the homework
# Three sample texts; the trailing space lets them be concatenated
# (a + b + c) without fusing neighbouring words.
a = 'My dog ate my homework '
b = 'My cat ate the sandwich '
c = 'A dolphin ate the homework '
def str2list(s):
    """Lower-case ``s``, trim surrounding spaces and split on single spaces."""
    return s.lower().strip(' ').split(' ')
def wordbag(*args):
    """Return the set of distinct words found in all the given texts."""
    wb = set()
    for t in args:
        wb.update(str2list(t))
    return wb
def word2vec(wb, t):
    """Return a 0/1 list telling, for each word of ``wb``, whether ``t`` has it.

    The list follows the iteration order of ``wb``.
    """
    words = set(str2list(t))  # tokenise once instead of once per word
    return [1 if k in words else 0 for k in wb]
# Vocabulary shared by the three sample texts.
wb = wordbag(a, b, c)
{‘a‘, ‘ate‘, ‘cat‘, ‘dog‘, ‘dolphin‘, ‘homework‘, ‘my‘, ‘sandwich‘, ‘the‘}
# Presence vector of each sample text over the shared vocabulary.
a2 = word2vec(wb, a)
b2 = word2vec(wb, b)
c2 = word2vec(wb, c)
print('文本1:', a2)
print('文本2:', b2)
print('文本3:', c2)
文本1: [1, 1, 0, 0, 1, 0, 0, 1, 0]
文本2: [0, 1, 1, 0, 0, 0, 1, 1, 1]
文本3: [1, 0, 0, 1, 0, 1, 1, 1, 0]
def get_word_stat(w):
    """Return a dict mapping each word of the text ``w`` to its count."""
    d = {}
    for v in str2list(w):
        d[v] = d.get(v, 0) + 1
    return d
def word2vec(wb, t):
    """Return, for each word of ``wb``, how many times it occurs in ``t``.

    Words absent from ``t`` get 0; the list follows the iteration order
    of ``wb``.
    """
    d = get_word_stat(t)  # count once instead of once per word
    return [d.get(k, 0) for k in wb]
# Count vector of each sample text over the shared vocabulary.
a2 = word2vec(wb, a)  # 'my' occurs twice in the first text
b2 = word2vec(wb, b)
c2 = word2vec(wb, c)
print('文本1:', a2)
print('文本2:', b2)
print('文本3:', c2)
文本1: [1, 2, 0, 0, 1, 0, 0, 1, 0]
文本2: [0, 1, 1, 0, 0, 0, 1, 1, 1]
文本3: [1, 0, 0, 1, 0, 1, 1, 1, 0]
import numpy as np
def TF(t):
    """Return the counts ``t`` divided by their sum, as a NumPy array."""
    a3 = np.array(t)
    return a3 / a3.sum()
# Term-frequency vector of each sample text.
a3 = TF(a2)  # 'my' occurs twice in the first text
b3 = TF(b2)
c3 = TF(c2)
print('文本1:', a3)
print('文本2:', b3)
print('文本3:', c3)
文本1: [ 0.2 0.4 0. 0. 0.2 0. 0. 0.2 0. ]
文本2: [ 0. 0.2 0.2 0. 0. 0. 0.2 0.2 0.2]
文本3: [ 0.2 0. 0. 0.2 0. 0.2 0.2 0.2 0. ]
将词频信息变成了概率分布,这就是文档的 TF 信息。
def word2vec2(wb, t):
    """Return corpus-wide counts for the words of ``wb`` that occur in ``t``.

    The corpus is the module-level texts ``a + b + c``. A word of ``wb``
    that does not occur in ``t`` gets 0. The list follows the iteration
    order of ``wb``.
    """
    # Both tables are loop-invariant, so build them once.
    D = get_word_stat(a + b + c)
    d = get_word_stat(t)
    return [D.get(k, 0) if k in d else 0 for k in wb]
def IDF(wb, t):
    """Return ``log(3 / (e + 1))`` element-wise, ``e = word2vec2(wb, t)``.

    The constant 3 is hard-coded; presumably it is the number of sample
    texts (a, b, c) -- confirm before reusing with another corpus.
    """
    e = word2vec2(wb, t)
    g = 3 / (np.array(e) + 1)  # the +1 keeps the denominator non-zero
    return np.log(g)
# For each sample text: IDF vector, then its element-wise product with TF.
a4 = IDF(wb, a)
TFIDF1 = a3 * a4

b4 = IDF(wb, b)
TFIDF2 = b3 * b4

c4 = IDF(wb, c)
TFIDF3 = c3 * c4
array([ 0. , -0.11507283, 0. , 0. , 0.08109302,
0. , 0. , -0.05753641, 0. ])
下面我们使用 Scikit-Learn 包来实现 TF-IDF 算法:
def read_file(path):
    """Return the content of the file at ``path`` decoded as UTF-8."""
    with open(path, "rb") as fp:
        content = fp.read()
    return content.decode("utf-8")
class Word2Vector:
    """Build and persist a TF-IDF vector space from a pickled corpus Bunch.

    Every path handed to the methods is appended to ``root`` by plain string
    concatenation.
    """

    def __init__(self, root):
        """Remember ``root``, the prefix prepended to every relative path."""
        self.root = root

    def read_bunch(self, path):
        """Unpickle and return the object stored at ``root + path``.

        NOTE(review): pickle.load can run arbitrary code; only load files
        produced by this project.
        """
        with open(self.root + path, "rb") as file_obj:
            bunch = pickle.load(file_obj)
        return bunch

    def write_bunch(self, path, bunchobj):
        """Pickle ``bunchobj`` into the file ``root + path``."""
        with open(self.root + path, "wb") as file_obj:
            pickle.dump(bunchobj, file_obj)

    def read_stopword(self, stopword_path):
        """Return the lines of the stop-word file as a list of strings.

        Args:
            stopword_path: file relative to ``root``, e.g.
                ``"train_word_bag/hlt_stop_words.txt"``.
        """
        return read_file(self.root + stopword_path).splitlines()

    def tfidf(self, stopword_path, path, space_path):
        """Create the TF-IDF vector space of a corpus and pickle it.

        Args:
            stopword_path: stop-word file, relative to ``root``.
            path: pickled corpus Bunch to read, relative to ``root``.
            space_path: file the vector-space Bunch is written to,
                relative to ``root``.
        """
        start = time()
        # 1. Read the stop-word list.
        stpwrdlst = self.read_stopword(stopword_path)
        # 2. Load the Bunch holding the segmented corpus.
        bunch = self.read_bunch(path)
        # 3. Prepare the Bunch that will hold the TF-IDF vector space.
        tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                           filenames=bunch.filenames, tdm=[], vocabulary={})
        # 4. Configure the vectorizer.
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5)
        # Fit on the corpus; keep the resulting matrix and the vocabulary.
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_
        # Persist the vector space.
        self.write_bunch(space_path, tfidfspace)
        print("花费时间:%g 秒,TF-IDF 词向量空间创建成功!!!"%(time() - start))
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
WV = Word2Vector(root)
# Pickled Bunch of the segmented corpus (input).
path = "train_word_bag/train_set.dat"
# Stop-word list, one word per line.
stopword_path = "train_word_bag/hlt_stop_words.txt"
# File the TF-IDF vector space is written to (output).
space_path = "train_word_bag/tfdifspace.dat"
WV.tfidf(stopword_path=stopword_path, path=path, space_path=space_path)
花费时间:1.6075 秒,TF-IDF 词向量空间创建成功!!!
