Contents
Function 1: Get all files in a given folder
Function 2: Text segmentation
Function 3: Word frequency counting
Function 4: Data deduplication
Preface
I have been writing code for a long time now, and I have always felt I should do something of lasting value with it. Early on I believed the right move was to take on as many projects as possible and accumulate as much experience as possible, since that seemed the fastest way to raise my technical level in a short time. As the projects and the code piled up, I did feel that progress, but my focus gradually shifted from breadth to depth: once you have a base of knowledge and technique, the goal is no longer to chase every project and every topic, but to learn the things you will actually use now or later, and to build the projects you genuinely consider worthwhile. That, I think, is what I am trying to say here.

Beyond constantly learning and moving forward, though, there is another habit that matters: regularly reviewing and recording what you have done. This is the whole value of a blog and a code repository: the blog stores what we have learned, the repository stores what we have written, and together they let us retrieve our own work anytime, anywhere. My current problem is that I have accumulated a great deal of experience and code without organizing it into any system. I keep redoing repetitive tasks; having implemented something once does save time and effort later, but for various reasons I still waste time hunting for code I wrote before, and as the codebase grows without a good directory structure this becomes extremely inefficient. Building a wheel, or collecting one, and then being unable to find it, or finding it too slowly, when you need it again is not the outcome any of us wants. Hence this series of posts on commonly used Python functions.
Function 1: Get all files in a given folder

Approach 1: os.listdir + recursion

import os


def print_directory_contents(dir_path, file_list):
    """Take a folder path as input and collect into file_list the paths of
    all files in that folder and, recursively, in its subfolders."""
    for file in os.listdir(dir_path):
        file_path = os.path.join(dir_path, file)
        if os.path.isdir(file_path):
            print_directory_contents(file_path, file_list)
        else:
            file_list.append(file_path)


if __name__ == '__main__':
    file_list = []
    print_directory_contents('G:/programming/interview_question', file_list)
    print(file_list)
Approach 2: os.walk

import os


def print_directory_contents(dir_path):
    """Take a folder path as input and yield the paths of all files in that
    folder and, recursively, in its subfolders."""
    for base_path, folders, files in os.walk(dir_path):
        for file in files:
            file_path = os.path.join(base_path, file)
            yield file_path


if __name__ == '__main__':
    file_list = print_directory_contents('G:/programming/interview_question')
    for file in file_list:
        print(file)
When is os.listdir the right choice, and when os.walk?

Tip: both approaches get the job done. If you only need to read a single level of a directory, use os.listdir; if you need to walk nested directories, use os.walk.
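For reference, the same recursive traversal can also be written with pathlib; this is a minimal sketch of my own (iter_files is a made-up name, not part of the code above):

from pathlib import Path


def iter_files(dir_path):
    # rglob('*') walks the whole tree; keep only regular files
    for path in Path(dir_path).rglob('*'):
        if path.is_file():
            yield path


if __name__ == '__main__':
    for file in iter_files('G:/programming/interview_question'):
        print(file)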
Function 2: Text segmentation

Approach 1: jieba segmentation + stopwords + custom dictionary
# -*- coding: utf-8 -*-
"""
Datetime: 2020/06/25
Author: Zhang Yafei
Description: text segmentation
Input:  stopwords file path, dictionary file path, file to segment,
        sheet name (optional), column name, result column name, output file name
Output: segmentation result written to a file
"""
import os
import re

import jieba
import pandas as pd

if not os.path.exists('res'):
    os.mkdir('res')


class TextCut(object):
    def __init__(self, dictionary=None, stopwords=None):
        self.dictionary = dictionary
        self.word_list = None
        if self.dictionary:
            jieba.load_userdict(self.dictionary)
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as swf:
                self.stopwords = {line.strip() for line in swf}
        else:
            # empty set so the membership checks below still work without a stopword file
            self.stopwords = set()

    @staticmethod
    def clean_txt(raw):
        # keep only digits, ASCII letters and Chinese characters
        pattern = re.compile(r"[^0-9a-zA-Z\u4e00-\u9fa5]+")
        return pattern.sub(' ', raw)

    def cut(self, text):
        sentence = self.clean_txt(text.strip().replace('\n', ''))
        return ' '.join(i for i in jieba.cut(sentence)
                        if i.strip() and i not in self.stopwords and len(i) > 1)

    def cut2(self, text):
        # like cut(), but additionally keep only words found in the custom dictionary
        sentence = self.clean_txt(text.strip().replace('\n', ''))
        return ' '.join(i for i in jieba.cut(sentence)
                        if i.strip() and i not in self.stopwords
                        and len(i) > 1 and i in self.word_list)

    def run(self, file_path, col_name, new_col_name, to_file, sheet_name=None, word_in_dict=False):
        if sheet_name:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        else:
            df = pd.read_excel(file_path)
        if word_in_dict:
            with open(self.dictionary, encoding='utf-8') as f:
                self.word_list = [word.strip() for word in f]
            df[new_col_name] = df[col_name].apply(self.cut2)
        else:
            df[new_col_name] = df[col_name].apply(self.cut)
        df.to_excel(to_file, index=False)
        print('######### done ############')


if __name__ == "__main__":
    # 1. segmentation
    text_cut = TextCut(stopwords='data/stopwords.txt', dictionary='data/word_dict.txt')
    text_cut.run(file_path='data/山西政策.xlsx', sheet_name='1.21-2.20', col_name='全文',
                 new_col_name='全文分词', to_file='res/山西政策_分词.xlsx')
Approach 2: jieba segmentation + merging by information entropy
# -*- coding: utf-8 -*-
"""
Datetime: 2020/03/01
Author: Zhang Yafei
Description: merge adjacent segmented words based on information entropy
"""
from collections import Counter
from functools import reduce

from pandas import DataFrame, read_excel


class InfoEntropyMerge(object):
    def __init__(self, data, stopwords='data/stopwords.txt'):
        self.data = data
        self.words_freq_one = {}
        self.words_freq_two = {}
        self.entropy_words_dict = {}
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as f:
                self.stopwords = {line.strip() for line in f}
        else:
            self.stopwords = set()

    def count_word_freq_one(self, save_to_file=False, word_freq_file=None):
        """Count the frequency of each single word."""
        keywords = (word for word_list in self.data for word in word_list if word)
        self.words_freq_one = Counter(keywords)
        if save_to_file and word_freq_file:
            words = list(self.words_freq_one)
            freqs = [self.words_freq_one[word] for word in words]
            words_df = DataFrame(data={'word': words, 'freq': freqs})
            words_df.sort_values('freq', ascending=False, inplace=True)
            words_df.to_excel(word_freq_file, index=False)

    def count_freq(self, word1, word2):
        """Count one co-occurrence of two adjacent words (used as the reduce step)."""
        if (word1, word2) not in self.words_freq_two:
            self.words_freq_two[(word1, word2)] = 1
        else:
            self.words_freq_two[(word1, word2)] += 1
        return word2

    def count_word_freq_two(self, save_to_file=False, word_freq_file=None):
        """Count the frequency of every adjacent word pair."""
        for word_list in self.data:
            reduce(self.count_freq, word_list)
        if save_to_file and word_freq_file:
            words_list = list(self.words_freq_two)
            freqs = [self.words_freq_two[w1_w2] for w1_w2 in words_list]
            words_df = DataFrame(data={'word': words_list, 'freq': freqs})
            words_df.sort_values('freq', ascending=False, inplace=True)
            words_df.to_excel(word_freq_file, index=False)

    @staticmethod
    def is_chinese(word):
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                return True
        return False

    def calc_entropy(self, save_to_file=False, dict_path='data/entropy_dict.txt'):
        """
        Score each adjacent pair: E(w1, w2) = freq(w1, w2) / max(freq(w1), freq(w2)).
        (The original docstring stated the formula with min(...), but the code divides
        by max(...), the stricter criterion; the implemented behaviour is kept here.)
        :param save_to_file: whether to prepend pairs scoring above 0.5 to the dictionary file
        :param dict_path: dictionary file path (must already exist; it is opened in 'r+' mode)
        """
        for word1, word2 in self.words_freq_two:
            freq_two = self.words_freq_two[(word1, word2)]
            freq_one_max = max(self.words_freq_one[word1], self.words_freq_one[word2])
            w1_w2_entropy = freq_two / freq_one_max
            if w1_w2_entropy <= 0.5:
                continue
            if self.stopwords:
                # with a stopword list, additionally require both words to be
                # Chinese and neither to be a stopword
                if word1 in self.stopwords or word2 in self.stopwords:
                    continue
                if not (self.is_chinese(word1) and self.is_chinese(word2)):
                    continue
            self.entropy_words_dict[word1 + word2] = w1_w2_entropy
        print('word pairs with a score above 0.5:\n', self.entropy_words_dict)
        if save_to_file and dict_path:
            with open(dict_path, mode='r+', encoding='utf-8') as f:
                content = f.read()
                f.seek(0, 0)
                for word in self.entropy_words_dict:
                    f.write(word + '\n')
                f.write(content)
            print(f'saved the word pairs with a score above 0.5 to {dict_path}')


def data_read(path, col_name):
    df = read_excel(path)
    texts = df.loc[df[col_name].notna(), col_name].str.split()
    return texts


if __name__ == '__main__':
    text_list = data_read(path='res/国家政策_分词.xlsx', col_name='全文分词')
    info_entro = InfoEntropyMerge(data=text_list)
    info_entro.count_word_freq_one()
    info_entro.count_word_freq_two()
    info_entro.calc_entropy(save_to_file=False, dict_path='data/entropy_dict.txt')
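To make the score concrete: if word A occurs 40 times, word B occurs 50 times, and the pair appears adjacent 30 times, the score is 30 / max(40, 50) = 0.6 > 0.5, so AB would be written to the dictionary as a merged word (the numbers here are invented purely for illustration).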
Tip: if you have a good dictionary and stopword list, prefer Approach 1; otherwise use Approach 2.
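The two approaches can also feed each other: segment once without a curated dictionary, let the entropy scores generate one, then segment again with it. Below is a sketch under the assumption that the two scripts above are importable and that data/entropy_dict.txt already exists (calc_entropy opens it in 'r+' mode); the file names follow the examples above and are not prescriptive:

# pass 1: segment with stopwords only, no custom dictionary yet
text_cut = TextCut(stopwords='data/stopwords.txt')
text_cut.run(file_path='data/国家政策.xlsx', col_name='全文',
             new_col_name='全文分词', to_file='res/国家政策_分词.xlsx')

# score adjacent pairs and prepend the high-scoring ones to the dictionary file
text_list = data_read(path='res/国家政策_分词.xlsx', col_name='全文分词')
info_entro = InfoEntropyMerge(data=text_list)
info_entro.count_word_freq_one()
info_entro.count_word_freq_two()
info_entro.calc_entropy(save_to_file=True, dict_path='data/entropy_dict.txt')

# pass 2: segment again, now loading the generated dictionary
text_cut = TextCut(stopwords='data/stopwords.txt', dictionary='data/entropy_dict.txt')
text_cut.run(file_path='data/国家政策.xlsx', col_name='全文',
             new_col_name='全文分词', to_file='res/国家政策_分词2.xlsx')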
Function 3: Word frequency counting

# -*- coding: utf-8 -*-
"""
Datetime: 2020/06/25
Author: Zhang Yafei
Description: word frequency counting
Input:  file name, column name, separator
Output: word frequency counts written to a file
"""
from collections import Counter

import pandas as pd


def count_word_freq(file_path, col_name, to_file, sep='; ', multi_table=False):
    """
    Count word frequencies.
    :param file_path: input file path
    :param col_name: name of the column to count
    :param to_file: output file path
    :param sep: word separator
    :param multi_table: whether to read every sheet in the workbook
    """
    if multi_table:
        # sheet_name=None reads all sheets into a dict of DataFrames
        datas = pd.read_excel(file_path, sheet_name=None)
        with pd.ExcelWriter(path=to_file) as writer:
            for sheet_name, df in datas.items():
                keywords = (word
                            for word_list in df.loc[df[col_name].notna(), col_name].str.split(sep)
                            for word in word_list if word)
                words_freq = Counter(keywords)
                words = list(words_freq)
                freqs = [words_freq[word] for word in words]
                words_df = pd.DataFrame(data={'word': words, 'freq': freqs})
                words_df.sort_values('freq', ascending=False, inplace=True)
                words_df.to_excel(excel_writer=writer, sheet_name=sheet_name, index=False)
    else:
        df = pd.read_excel(file_path)
        keywords = (word
                    for word_list in df.loc[df[col_name].notna(), col_name].str.split(sep)
                    for word in word_list if word)
        words_freq = Counter(keywords)
        words = list(words_freq)
        freqs = [words_freq[word] for word in words]
        words_df = pd.DataFrame(data={'word': words, 'freq': freqs})
        words_df.sort_values('freq', ascending=False, inplace=True)
        words_df.to_excel(to_file, index=False)


if __name__ == '__main__':
    # count word frequencies in the 'keyword' column of every sheet in data.xlsx,
    # splitting on the default '; ' separator, and save the result to res.xlsx
    count_word_freq(file_path='data.xlsx', col_name='keyword', to_file='res.xlsx', multi_table=True)
Tip: note that the input is an Excel file, since that is the format I handle most in my own study and work; you can take this and use it directly, which is very convenient.
In an earlier post I also covered several common ways to count word frequencies in Python, one for each kind of scenario. Link: https://www.cnblogs.com/zhangyafei/p/10653977.html
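Stripped of the Excel I/O, the core of count_word_freq above is just collections.Counter; a toy example (the data is made up):

from collections import Counter

words = '政策; 数据; 政策; 分析'.split('; ')
freq = Counter(words)
print(freq.most_common())  # [('政策', 2), ('数据', 1), ('分析', 1)]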
Function 4: Data deduplication

# -*- coding: utf-8 -*-
"""
Datetime: 2020/06/29
Author: Zhang Yafei
Description: 04_data deduplication
"""
from pandas import read_csv, read_excel


def data_drop_duplicate(file: str, to_file: str, columns: list = None, keep: str = 'first'):
    """
    :param file: path of the file to deduplicate
    :param to_file: path of the deduplicated output file
    :param columns: columns on which rows count as duplicates (default: all columns)
    :param keep: which duplicate to keep, 'first' by default
    """
    if file.endswith('csv'):
        df = read_csv(file)
    else:
        df = read_excel(file)
    if columns:
        df.drop_duplicates(subset=columns, keep=keep, inplace=True)
    else:
        df.drop_duplicates(keep=keep, inplace=True)
    if to_file.endswith('csv'):
        df.to_csv(to_file, index=False)
    elif to_file.endswith('xlsx') or to_file.endswith('xls'):
        df.to_excel(to_file, index=False)


if __name__ == '__main__':
    # parameters: file = input file name, columns = columns to deduplicate on,
    # to_file = new file name after deduplication
    data_drop_duplicate(file='data.xlsx', columns=['id', 'title'], to_file='new_data.xlsx')
Tip: Python makes this extremely convenient; the key step is essentially a single line of code. I used to think the feature was too trivial to write down, just a small step inside bigger jobs, until classmates kept coming to me asking for exactly this on its own. That convinced me it was worth recording, and it sums up why this series exists: record what you have done, don't rebuild wheels, and keep the wheels you have built where you can always find them, for your convenience and for mine.
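To show just how small that key step is, here it is on its own, applied to a tiny made-up DataFrame:

import pandas as pd

df = pd.DataFrame({'id': [1, 1, 2], 'title': ['a', 'a', 'b']})
df = df.drop_duplicates(subset=['id', 'title'], keep='first')  # the one line that matters
print(df)  # the duplicate of row (1, 'a') is gone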
Original post: https://www.cnblogs.com/zhangyafei/p/13233205.html