标签:nali 一个 ESS rom baidu bs4 split html 自己
今天根据昨天爬取到的网址进行了二次爬取,爬取内容为每个信件的内容。本应该是一项很简单的任务,但是奈何数据是真的‘脏’,所以今天只是对所有的三万个网址进行了信件内容的爬取。
使用的是BeautifulSoup进行爬取,在爬取的同时对数据进行了简单的处理,完善了一些bug。之后将按照下一步对爬取到的数据进行清洗,之后导入数据库,再进行下一步的操作。
爬取信件内容源代码:
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 28 15:14:59 2020

Scrape the detail page of each letter (citizen message) from the Beijing
government "hudong" site.  Input is the 'url' file produced by the previous
crawl (one "id,url" line per letter); output is one tab-separated record per
letter appended to data2.csv.

@author: 陈欢
"""
import requests
from bs4 import BeautifulSoup


def ReadFile():
    """Read the 'url' file (CSV: "id,url" per line) and return the URL column.

    Each returned entry still carries its trailing newline; the caller
    strips it before requesting the page.
    """
    # utf-8-sig strips a possible BOM left by the previous crawl step.
    # NOTE: a line without a comma raises IndexError here, same as before.
    with open('url', 'r', encoding='utf-8-sig') as f:
        return [line.split(',', 1)[1] for line in f]


def WriteFile(data, path='data2.csv'):
    """Append one record to *path*: fields tab-separated, newline-terminated.

    *path* defaults to the historical output file, so existing callers
    are unaffected.
    """
    if not data:
        return  # nothing to write; avoid emitting a bare newline
    with open(path, 'a+', encoding='utf-8') as f:
        f.write('\t'.join(data) + '\n')


# Substrings scrubbed from every field so each record stays one clean TSV row:
# line breaks / tabs / NBSP, plus boilerplate labels the site embeds in the text.
_CLEAN_PATTERNS = ('\r', '\n', '\t', '\xa0',
                   '来信人:', '[官方回答]:', '答复时间:', ' ')


def _clean(text):
    """Remove whitespace and boilerplate label prefixes from a scraped field."""
    for pattern in _CLEAN_PATTERNS:
        text = text.replace(pattern, '')
    return text


def main():
    """Crawl every letter URL and append the parsed record to data2.csv."""
    headers = {
        # Pretend to be a regular browser.
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.75 Chrome/73.0.3683.75 Safari/537.36',
        # Session cookie captured from a browser visit to the site.
        'cookie': 'HDJLJSID=39DBD6D5E12B9F0F8834E297FAFC973B; __jsluid_h=e6e550159f01ae9aceff30d191b09911; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216f9edc47471cb-0059c45dfa78d6-c383f64-1049088-16f9edc474895%22%7D; _gscu_564121711=80128103kc5dx617; X-LB=1.1.44.637df82f; _va_ref=%5B%22%22%2C%22%22%2C1580462724%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DM-f5ankfbAnnYIH43aTQ0bvcFij9-hVxwm64pCc6rhCu5DYwg6xEVis-OVjqGinh%26wd%3D%26eqid%3Dd6b151bf000cfb36000000025e1c5d84%22%5D; _va_ses=*; route=74cee48a71a9ef78636a55b3fa493f67; _va_id=b24752d801da28d7.1578917255.10.1580462811.1580450943.',
    }
    session = requests.Session()
    error = []   # 1-based indices of the URLs whose page could not be parsed
    count = 1    # progress counter ("time" renamed: it shadowed the stdlib module)
    for raw in ReadFile():
        try:
            print(count)
            count += 1
            print(raw)
            url = raw[:-1]  # drop the trailing newline kept by ReadFile
            print(url)
            response = session.get(url, headers=headers)
            soup = BeautifulSoup(response.text, 'lxml')
            # The page is located purely by Bootstrap class names; a layout
            # change raises IndexError below and the record is skipped.
            letter_person = soup.find_all('div', class_="col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted")    # sender
            letter_body = soup.find_all('div', class_="col-xs-12 col-md-12 column p-2 text-muted mx-2")       # letter content
            answer_department = soup.find_all('div', class_="col-xs-9 col-sm-7 col-md-5 o-font4 my-2")        # answering org(s)
            answer_body = soup.find_all('div', class_="col-xs-12 col-md-12 column p-4 text-muted my-3")       # answer content(s)
            answer_time = soup.find_all('div', class_="col-xs-12 col-sm-3 col-md-3 my-2")                     # answer time(s)
            is_pending = soup.find_all('span', class_="font14 offic blod")                                    # "not yet answered" marker
            y_praise = soup.find_all('a', class_="dex_yes font12")                                            # upvotes
            n_praise = soup.find_all('a', class_="dex_no font12")                                             # downvotes
            print(letter_person)
            letter_id = url.split('=', 1)[1]  # originalId query parameter
            if is_pending:
                # No answer yet: fill the answer columns with placeholders.
                data = [letter_id, letter_person[0].text, letter_body[0].text,
                        "0", "0", "null", "null", "null", "false"]
            else:
                data = [letter_id, letter_person[0].text, letter_body[0].text,
                        y_praise[0].text, n_praise[0].text]
                # Index loop (not zip) on purpose: a mismatch between the
                # three lists raises IndexError and the dirty record is skipped.
                for j in range(len(answer_department)):
                    data.append(answer_department[j].text)
                    data.append(answer_body[j].text)
                    data.append(answer_time[j].text)
                data.append("true")
            WriteFile([_clean(field) for field in data])
        except IndexError:
            # Page layout did not match (dirty data); remember which URL.
            error.append(count - 1)
            continue
    print(error)


if __name__ == "__main__":
    main()
标签:nali 一个 ESS rom baidu bs4 split html 自己
原文地址:https://www.cnblogs.com/huan-ch/p/12252838.html