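This post shares a small Baidu Baike (百度百科) scraper: given a person's name, it requests the entry page, skips names that land on a disambiguation list, and otherwise extracts the summary, the biography sections, the listed person-to-person relationships, and the infobox fields, then writes everything to a JSON file named after the person.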
import json
import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq


class BaiDuPerson:
    def __init__(self, name):
        self.temp_url = 'https://baike.baidu.com/search/word?word='
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
        }
        self.response = ''
        self.save_path = r'E:\百度json文件'
        self.name = name
        self.run()

    def get_response(self):
        """Fetch the Baidu Baike page for the name and cache the decoded HTML."""
        url = self.temp_url + quote(self.name)
        response = requests.get(url=url, headers=self.headers)
        self.response = response.content.decode('utf8')

    def check_ambiguity(self):
        """Check whether the name is ambiguous, i.e. refers to several people."""
        doc = pq(self.response)
        ul = doc('.polysemantList-wrapper.cmn-clearfix')
        return bool(ul)

    def get_introduction(self):
        """Get the entry's summary paragraph."""
        soup = BeautifulSoup(self.response, "lxml")
        try:
            result = soup.select(".lemma-summary")[0].text
        except IndexError:  # page has no summary block
            result = ''
        return result

    def get_person_lifetime(self):
        """Get the biography sections, keyed by <h2> section title."""
        res = self.response.split('<h2 class="title-text"')
        h2_dict = {}
        if len(res) == 1:
            # No section headings: collect all paragraph text under one key.
            doc = pq(self.response)
            content = doc('.para').text()
            h2_dict['生平'] = content
        else:
            for h2 in res[1:]:
                # Cut off trailing page chrome (album list, references, footer widgets).
                for marker in ('<div class="album-list">',
                               '<dt class="reference-title"',
                               '<div class="rs-container-foot"',
                               '<div class="tashuo-bottom"',
                               '<div class="go-auth-box"',
                               '<div class="side-content">'):
                    if marker in h2:
                        h2 = h2.split(marker)[0]
                h2 = '<h2 class="title-text"' + h2
                soup = BeautifulSoup(h2, "lxml")
                # Section headings are prefixed with the person's name; strip it.
                h2_key = soup.find("h2").get_text().replace(self.name, '').strip()
                h3_dict = {}
                if "<h3" in h2:
                    for h3 in h2.split("<h3")[1:]:
                        h3 = "<h3" + h3
                        soup = BeautifulSoup(h3, "lxml")
                        replace = soup.find("h3").get_text()
                        h3_title = replace.replace(self.name, '').strip()
                        if "<ul" in h3:
                            ul_dict = {}
                            for ul in h3.split("<ul")[1:]:
                                ul = "<ul" + ul
                                soup = BeautifulSoup(ul, "lxml")
                                ul_title = soup.find("ul").get_text().replace(self.name, '').strip()
                                tmp1 = {}
                                for item in ul.split("</ul>")[1:]:
                                    v_list = []  # holds the values listed under this <ul>
                                    soup = BeautifulSoup(item, "lxml")
                                    for div in soup.find_all("div"):
                                        ul_v = div.get_text().replace("\xa0", '')
                                        # Strip citation markers such as [1].
                                        for shangbiao in re.findall(r"\[\d+\]", ul_v):
                                            ul_v = ul_v.replace(shangbiao, "")
                                        if ul_v:
                                            v_list.append(ul_v)
                                    tmp1[ul_title] = v_list
                                ul_dict.update(tmp1)
                            h3_dict.update(ul_dict)
                        else:
                            h3_v = soup.get_text().replace(replace, "").replace("\xa0", '')
                            for shangbiao in re.findall(r"\[\d+\]", h3_v):
                                h3_v = h3_v.replace(shangbiao, "")
                            h3_dict[h3_title] = [h3_v]
                    h2_dict[h2_key] = h3_dict
                else:
                    h2_v = soup.get_text().replace(soup.find("h2").get_text(), "").replace("\xa0", '')
                    for shangbiao in re.findall(r"\[\d+\]", h2_v):
                        h2_v = h2_v.replace(shangbiao, "")
                    # Drop empty lines and the "编辑" (edit) link text.
                    h2_v_list = [item for item in h2_v.split("\n") if item and item != '编辑']
                    h2_dict[h2_key] = h2_v_list
        return h2_dict

    def get_relationship(self):
        """Get relationships as [person, relation, related person] triples."""
        relationship = []
        soup = BeautifulSoup(self.response, "lxml")
        res_ship = soup.select(".info .name")
        res_value = soup.select(".info .title")
        for ship, value in zip(res_ship, res_value):
            relationship.append([self.name, ship.string, value.string])
        return relationship

    def get_person_details(self):
        """Get the infobox (basic-info) key/value fields."""
        doc = pq(self.response)
        person_detail_key_list = []
        for key_doc in doc('.basic-info.cmn-clearfix dt').items():
            person_detail_key_list.append(key_doc.text().replace(' ', ''))
        person_detail_value_list = []
        for value_doc in doc('.basic-info.cmn-clearfix dd').items():
            person_detail_value_list.append(value_doc.text().replace(' ', ''))
        return dict(zip(person_detail_key_list, person_detail_value_list))

    def get_name(self):
        """Get the person's name from the page's <h1>."""
        soup = BeautifulSoup(self.response, "lxml")
        try:
            name = soup.find("h1").text
        except AttributeError:  # no <h1> on the page
            name = ''
        return name

    def run(self):
        self.get_response()
        if self.check_ambiguity():
            # Ambiguous names are logged for manual review instead of being parsed.
            with open('有歧义.txt', 'a', encoding='utf8') as f:
                f.write(self.name + '\n')
        else:
            introduction = self.get_introduction()
            person_name = self.get_name()
            relationship = self.get_relationship()
            person_lifetime = self.get_person_lifetime()
            person_detail = self.get_person_details()
            person_information = dict()
            person_information['Introduction'] = introduction
            person_information['Rel'] = relationship
            person_information['Details'] = person_detail
            person_information.update(person_lifetime)
            with open(self.save_path + '\\' + person_name + '.json', 'w', encoding='utf8') as f:
                f.write(json.dumps(person_information, ensure_ascii=False))


if __name__ == '__main__':
    name = '裴寂'
    BaiDuPerson(name)
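The resulting JSON carries the fixed keys Introduction, Rel, and Details, plus one key per biography section heading. As a minimal batch-usage sketch (the name list and the exception handling here are illustrative additions, not part of the original script):

    # Hypothetical batch driver built on the class above.
    names = ['裴寂', '刘文静']  # illustrative input list
    for person in names:
        try:
            BaiDuPerson(person)  # run() fires in __init__ and writes <name>.json
        except requests.RequestException as exc:
            print('request failed for %s: %s' % (person, exc))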
Original post: https://www.cnblogs.com/lqn404/p/13827435.html