
Baidu Baike: Person Data Collection
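This post walks through a small scraper that pulls structured person data from Baidu Baike. Given a name, the script fetches the matching entry, sets names that hit a disambiguation page aside in 有歧义.txt, and otherwise extracts the lead summary, the basic-info box, the listed relationships, and the biography sections, writing the result to one JSON file per person.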



import json
import re

import requests
from urllib.parse import quote

from bs4 import BeautifulSoup
from pyquery import PyQuery as pq


class BaiDuPerson:
    def __init__(self, name):
        # Baidu Baike resolves this search URL to the entry page for the name.
        self.temp_url = "https://baike.baidu.com/search/word?word="
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
        }
        self.response = ""
        self.save_path = r"E:\百度json文件"  # output directory for the JSON files
        self.name = name
        self.run()

    def get_response(self):
        """Fetch the entry page for the name."""
        url = self.temp_url + quote(self.name)
        response = requests.get(url=url, headers=self.headers)
        self.response = response.content.decode("utf8")
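        # Note: quote() percent-encodes the UTF-8 bytes of the name, e.g.
        # quote("裴寂") == "%E8%A3%B4%E5%AF%82", so the request URL becomes
        # https://baike.baidu.com/search/word?word=%E8%A3%B4%E5%AF%82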

    def check_ambiguity(self):
        """Check whether the name is ambiguous, i.e. refers to multiple people."""
        doc = pq(self.response)
        # The disambiguation list only appears on polysemous entries.
        ul = doc(".polysemantList-wrapper.cmn-clearfix")
        return bool(ul)

    def get_introduction(self):
        """
        Get the summary paragraph at the top of the entry.
        """
        soup = BeautifulSoup(self.response, "lxml")
        try:
            result = soup.select(".lemma-summary")[0].text
        except IndexError:
            result = ""
        return result

    def get_person_lifetime(self):
        """
        Get the biography data: a dict keyed by <h2> section title, with
        <h3>/<ul> subsections nested underneath where they exist.
        """
        res = self.response.split('<h2 class="title-text')
        h2_dict = {}
        if len(res) == 1:
            # No section headings: take all paragraph text as a single block.
            doc = pq(self.response)
            content = doc(".para").text()
            h2_dict["生平"] = content  # "生平" = "life story"
        else:
            for h2 in res[1:]:
                tmp2 = {}
                # Cut off trailing page chrome (albums, references, footer widgets).
                if '<div class="album-list">' in h2:
                    h2 = h2.split('<div class="album-list">')[0]
                if '<dt class="reference-title"' in h2:
                    h2 = h2.split('<dt class="reference-title"')[0]
                if '<div class="rs-container-foot"' in h2:
                    h2 = h2.split('<div class="rs-container-foot"')[0]
                if '<div class="tashuo-bottom"' in h2:
                    h2 = h2.split('<div class="tashuo-bottom"')[0]
                if '<div class="go-auth-box"' in h2:
                    h2 = h2.split('<div class="go-auth-box"')[0]
                if '<div class="side-content">' in h2:
                    h2 = h2.split('<div class="side-content">')[0]
                h2 = '<h2 class="title-text' + h2
                soup = BeautifulSoup(h2, "lxml")
                h2_key = soup.find("h2").get_text().replace(self.name, "").strip()
                h3_dict = {}
                if "<h3" in h2:
                    for h3 in h2.split("<h3")[1:]:
                        tmp3 = {}
                        h3 = "<h3" + h3
                        soup = BeautifulSoup(h3, "lxml")
                        replace = soup.find("h3").get_text()
                        h3_title = replace.replace(self.name, "").strip()
                        if "<ul" in h3:
                            res = h3.split("<ul")
                            ul_dict = {}
                            for ul in res[1:]:
                                ul = "<ul" + ul
                                soup = BeautifulSoup(ul, "lxml")
                                ul_title = soup.find("ul").get_text().replace(self.name, "").strip()
                                tmp1 = {}
                                for item in ul.split("</ul>")[1:]:
                                    v_list = []  # holds multiple values under one heading
                                    soup = BeautifulSoup(item, "lxml")
                                    ul_vlist = soup.find_all("div")
                                    for i in ul_vlist:
                                        ul_v = i.get_text().replace("\xa0", "")
                                        # Strip citation markers such as [1].
                                        for shangbiao in re.findall(r"\[\d+\]", ul_v):
                                            ul_v = ul_v.replace(shangbiao, "")
                                        if ul_v == "":
                                            continue
                                        v_list.append(ul_v)
                                    tmp1[ul_title] = v_list
                                ul_dict.update(tmp1)
                            h3_dict.update(ul_dict)
                        else:
                            h3_v = soup.get_text().replace(replace, "").replace("\xa0", "")
                            for shangbiao in re.findall(r"\[\d+\]", h3_v):
                                h3_v = h3_v.replace(shangbiao, "")
                            tmp3[h3_title] = [h3_v]
                            h3_dict.update(tmp3)
                        tmp2 = {h2_key: h3_dict}
                    h2_dict.update(tmp2)
                else:
                    h2_v = soup.get_text().replace(soup.find("h2").get_text(), "").replace("\xa0", "")
                    for shangbiao in re.findall(r"\[\d+\]", h2_v):
                        h2_v = h2_v.replace(shangbiao, "")
                    h2_v = h2_v.split("\n")
                    h2_v_list = []
                    for item in h2_v:
                        if item and item != "编辑":  # drop the "edit" link label
                            h2_v_list.append(item)
                    tmp = {h2_key: h2_v_list}
                    h2_dict.update(tmp)
        return h2_dict

    def get_relationship(self):
        """
        Get the person's relationships from the relations widget.
        """
        relationship = []
        soup = BeautifulSoup(self.response, "lxml")
        res_ship = soup.select(".info .name")
        res_value = soup.select(".info .title")
        # Each triple: [this person, relation label, related person's name].
        for ship, value in zip(res_ship, res_value):
            relationship.append([self.name, ship.string, value.string])
        return relationship

    def get_person_details(self):
        """Get the field/value pairs from the basic-info box."""
        doc = pq(self.response)
        person_detail_key_doc_list = doc(".basic-info.cmn-clearfix dt").items()
        person_detail_key_list = []
        for key_doc in person_detail_key_doc_list:
            person_detail_key = key_doc.text().replace(" ", "")
            person_detail_key_list.append(person_detail_key)
        person_detail_value_doc_list = doc(".basic-info.cmn-clearfix dd").items()
        person_detail_value_list = []
        for value_doc in person_detail_value_doc_list:
            person_detail_value = value_doc.text().replace(" ", "")
            person_detail_value_list.append(person_detail_value)
        # Pair <dt> field names with <dd> values.
        person_detail_dict = dict(zip(person_detail_key_list, person_detail_value_list))
        return person_detail_dict

    def get_name(self):
        """Get the person's name from the page's <h1>."""
        soup = BeautifulSoup(self.response, "lxml")
        try:
            name = soup.find("h1").text
        except AttributeError:
            name = ""
        return name

    def run(self):
        self.get_response()
        if self.check_ambiguity():
            # Log ambiguous names for manual review instead of scraping them.
            with open("有歧义.txt", "a", encoding="utf8") as f:  # "有歧义" = "ambiguous"
                f.write(self.name + "\n")
        else:
            introduction = self.get_introduction()
            person_name = self.get_name()
            relationship = self.get_relationship()
            person_lifetime = self.get_person_lifetime()
            person_detail = self.get_person_details()
            person_information = dict()
            person_information["Introduction"] = introduction
            person_information["Rel"] = relationship
            person_information["Details"] = person_detail
            person_information.update(person_lifetime)
            with open(self.save_path + "\\" + person_name + ".json", "w", encoding="utf8") as f:
                f.write(json.dumps(person_information, ensure_ascii=False))


if __name__ == "__main__":
    name = "裴寂"
    BaiDuPerson(name)
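
To collect several people at once, the class can simply be instantiated in a loop, since __init__() calls run() directly. A minimal batch sketch (the extra names here are hypothetical examples, and the save directory E:\百度json文件 must already exist):

names = ["裴寂", "刘文静", "李靖"]  # hypothetical batch of names to look up
for person in names:
    try:
        BaiDuPerson(person)  # writes <name>.json, or logs to 有歧义.txt if ambiguous
    except Exception as exc:
        print(person, "failed:", exc)

Each output file contains the keys Introduction, Rel, and Details, plus one key per biography section heading.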



Original post: https://www.cnblogs.com/lqn404/p/13827435.html
