标签:百度 写入 词条 根据 compile nbsp alt soup lin
今天主要完成了以下工作:根据之前爬取的txt文档,从百度百科的信息科学类分类中爬取词条信息,并写入CSV格式文件。
txt格式文件如图:
代码如下:
import csv
import io
import re

import requests
from bs4 import BeautifulSoup

# Base URL of a Baidu Baike entry page; the entry name is appended to it.
url = "https://baike.baidu.com/item/"
# Sequence number of the next row written to the CSV file. The name shadows
# the builtin ``id`` but is kept so existing references keep working.
id = 1

# Default output CSV file and default input word list (one entry per line).
CSV_PATH = 'E:/bdbk.csv'
WORD_LIST_PATH = "E:/word4.txt"

# Seconds to wait for the server before giving up on a single request, so
# one stalled connection cannot block the whole crawl.
REQUEST_TIMEOUT = 10

# An entry is recorded only when its tag text matches one of these categories.
patton = re.compile(r'.*信息科学分类.*|.*软件.*|.*科技产品.*|.*公司.*|.*互联网人物.*|.*互联网.*|.*科技术语.*|.*技术.*|.*网站.*')


def Head(path=CSV_PATH):
    """Create (or truncate) the CSV file at *path* and write the header row."""
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["序号", "名称", "属性", "内容", "网址"])


def _strip_parens(text):
    """Return *text* without parentheses and surrounding whitespace."""
    return text.replace("(", "").replace(")", "").strip()


def Href(url):
    """Fetch one entry page and append it to the CSV file if it qualifies.

    The page at *url* is downloaded and parsed for its tag text, title (first
    ``h1``), attribute (first ``h2``) and summary markup. The entry is
    written, and the global sequence number ``id`` incremented, only when the
    title is not the error-page title and the tag text matches ``patton``.

    If the request fails or an expected element is missing, an error message
    is printed and the page is skipped, so that no partially extracted or
    previously extracted values are ever written for this URL.
    """
    global id
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")
        print(url)
        # Indexing with [0] raises IndexError when the element is absent.
        tag_text = soup.find_all("dd", {"id": "open-tag-item"})[0].get_text()
        tag = _strip_parens(tag_text).replace("\n", "")
        name = soup.find_all("h1")[0].get_text().strip()
        nature = _strip_parens(soup.find_all("h2")[0].get_text())
        if nature == '目录':
            # The first h2 is only the table-of-contents heading here, so
            # fall back to the tag text as the attribute.
            nature = tag
        # str() of the result list is wrapped in "[...]"; drop the brackets
        # and turn site-relative entry links into absolute ones.
        summary = str(soup.find_all("div", {"class": "lemma-summary"}))
        content = summary.replace(
            "/item", "https://baike.baidu.com/item"
        ).strip().rstrip("]").lstrip("[")
    except (requests.RequestException, IndexError):
        print("出错!")
        return

    if name == "百度百科错误页" or nature == "目录":
        return
    if patton.search(tag) is None:
        return

    print("序号:" + str(id))
    print("名称:" + name)
    print("属性:" + nature)
    print("内容:" + content)
    print("网址:" + url)
    write(id, name, nature, content, url)
    id += 1


def read(path=WORD_LIST_PATH):
    """Process every entry name listed in the text file at *path*.

    Each non-blank line is appended to the module-level base ``url`` and the
    resulting address is passed to ``Href``. Blank lines are skipped so the
    bare base URL is never requested.
    """
    with open(path, 'r', encoding="utf-8") as word_file:
        for line in word_file:
            word = line.strip()
            if not word:
                continue
            Href(url + word)


def write(id, name, nature, content, url, path=CSV_PATH):
    """Append one entry as a row to the CSV file at *path*."""
    with open(path, 'a', encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile).writerow([str(id), name, nature, content, url])


if __name__ == "__main__":
    Head()
    read()
假期学习【十一】Python爬取百度词条写入csv格式 python 2020.2.10
标签:百度 写入 词条 根据 compile nbsp alt soup lin
原文地址:https://www.cnblogs.com/zlc364624/p/12292892.html