
Holiday Study [8]: 首都之窗 (Capital Window) Citizen Letters Scraper (Complete Version), 2020.2.6, Python



Date: 2020.2.6

Today I finished the 首都之窗 (Capital Window) citizen-letters scraper that I had left half-done yesterday.

The source code is as follows:

import requests
import io
from bs4 import BeautifulSoup
# 信1705-1 赵路仓 (author's class and name)
kv = {'user-agent': 'Mozilla/5.0'}
id = 'AH20010700179'  # sample letter ID

def read():
    f = open('E://list.txt', 'r')
    for line in f:
        # use the loop variable (calling readline() here would skip every other ID)
        # and strip the trailing newline so the ID can be appended to a URL
        id = line.rstrip('\n')
        print(id)
        url1 = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=" + id  # consultation (咨询)
        url2 = "http://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId=" + id  # suggestion (建议)
        url3 = "http://www.beijing.gov.cn/hudong/hdjl/com.web.complain.complainDetail.flow?originalId=" + id  # complaint (投诉)
        parser(url1)
        parser2(url2)
        parser3(url3)
    f.close()

def write(contents):
    f = open('E://result.txt', 'a+')
    f.write(contents)
    print(contents, '写入成功!')  # "written successfully"
    f.close()

def parser(url):  # consultation (咨询) letters
    try:
        r = requests.get(url, headers=kv)
        print(r.status_code)
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        # print(soup.prettify())
        '''print("标题:", soup.find("strong").get_text().lstrip().rstrip())
        print("来信人:", soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip())
        print("时间:", soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:'))
        print("网友同问:", soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友同问:").lstrip().rstrip())
        print("问题:", soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip())
        print("官方:", soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text())
        print("回答时间:", soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:'))
        print("回答:", soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip())'''
        # consultation pages carry a "网友同问" counter; use it to tag the letter type
        if soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().find('网友同问') != -1:
            write("咨询" + "||")
        write(soup.find("strong").get_text().lstrip().rstrip() + "||")  # title
        write(soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip() + "||")  # sender
        write(soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:') + "||")  # question time
        write(soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友同问:").lstrip().rstrip() + "||")  # netizen counter
        write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip() + "||")  # question content
        write(soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text().lstrip().rstrip() + "||")  # responding agency
        write(soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:') + "||")  # reply time
        write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")  # reply content (separator added so the fields stay aligned)
        write(soup.find_all("a", {"class": "dex_yes font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")  # likes
        write(soup.find_all("a", {"class": "dex_no font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")  # dislikes
        write('\r\n')
    except:
        print("咨询爬取失败!")  # consultation scrape failed

def parser2(url):  # suggestion (建议) letters
    try:
        r = requests.get(url, headers=kv)
        print(r.status_code)
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        # print(soup.prettify())
        '''print("标题:", soup.find("strong").get_text().lstrip().rstrip())
        print("来信人:", soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip())
        print("时间:", soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:'))
        print("网友同问:", soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友同问:").lstrip().rstrip())
        print("问题:", soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip())
        print("官方:", soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text())
        print("回答时间:", soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:'))
        print("回答:", soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip())'''
        # suggestion pages carry a "网友支持" counter; use it to tag the letter type
        if soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().find('网友支持') != -1:
            write("建议" + "||")
        write(soup.find("strong").get_text().lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:') + "||")
        write(soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友支持:").lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text().lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:') + "||")
        write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
        write(soup.find_all("a", {"class": "dex_yes font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
        write(soup.find_all("a", {"class": "dex_no font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
        write('\r\n')
    except:
        print("建议爬取失败!")  # suggestion scrape failed

def parser3(url):  # complaint (投诉) letters
    try:
        r = requests.get(url, headers=kv)
        print(r.status_code)
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        # print(soup.prettify())
        # complaint pages carry a "网友评价" counter; use it to tag the letter type
        if soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().find('网友评价') != -1:
            write("投诉" + "||")
        write(soup.find("strong").get_text().lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:') + "||")
        write(soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友评价数:").lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text().lstrip().rstrip() + "||")
        write(soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:') + "||")
        write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
        write(soup.find_all("a", {"class": "dex_yes font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
        write(soup.find_all("a", {"class": "dex_no font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
        write('\r\n')
    except:
        print("投诉爬取失败!")  # complaint scrape failed

if __name__ == "__main__":
    read()

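One thing worth noting in the listing above: the three parser functions call requests.get without a timeout and swallow every exception with a bare except, so a hung or failed request only shows up as a "爬取失败!" message. A minimal sketch of a shared download helper (not part of the original script) that fails fast and surfaces HTTP errors might look like this:

import requests

KV = {'user-agent': 'Mozilla/5.0'}

def fetch(url, timeout=10):
    # Sketch only: fail fast on slow responses and raise on HTTP errors
    # instead of silently parsing an error page as if it were a letter.
    r = requests.get(url, headers=KV, timeout=timeout)
    r.raise_for_status()
    r.encoding = r.apparent_encoding  # the pages are Chinese; let requests guess the charset
    return r.text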
Problems encountered:

  I overlooked the trailing \n that read() and readline() keep, which caused constant errors. At the start I also missed the complaint-type and suggestion-type letters and only added them later.
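For reference, here is a minimal sketch of the ID-reading step that sidesteps the trailing-newline problem, assuming the same one-ID-per-line E://list.txt layout:

def read_ids(path='E://list.txt'):
    # strip() removes the trailing '\n' (and stray spaces) so the ID
    # can be concatenated into a URL without breaking the request
    with open(path, 'r') as f:
        return [line.strip() for line in f if line.strip()]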

The scraped text is separated by ||, giving 11 attributes per record: letter type, title, sender, question time, netizen reaction count, question content, responding agency, reply time, reply content, likes, and dislikes. The records are saved to E://result.txt.
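Since each record is a single ||-separated line, result.txt can be loaded back into dictionaries along these lines; the English field names below are only illustrative labels for the 11 attributes, not anything produced by the script:

FIELDS = ['type', 'title', 'sender', 'ask_time', 'reaction_count', 'question',
          'responder', 'reply_time', 'answer', 'likes', 'dislikes']

def load_results(path='E://result.txt'):
    records = []
    with open(path, 'r') as f:  # use the same default encoding the writer used
        for line in f:
            parts = line.rstrip('\r\n').split('||')
            if len(parts) >= len(FIELDS):
                records.append(dict(zip(FIELDS, parts)))
    return records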

The page IDs (txt format) are attached on Baidu Netdisk: https://pan.baidu.com/s/1GvF8Kllvv-vqBblgWnA-LQ

For installing and using BeautifulSoup, see my blog post: https://www.cnblogs.com/zlc364624/p/12264070.html
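If you only need a quick reminder: installation is pip install beautifulsoup4, and a minimal parse looks like this (a toy HTML snippet, not the letter pages themselves):

from bs4 import BeautifulSoup

html = '<div><strong> 标题示例 </strong></div>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('strong').get_text().strip())  # -> 标题示例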

 


Original post: https://www.cnblogs.com/zlc364624/p/12271490.html
