
Scraping Qiushibaike with BeautifulSoup

Posted: 2017-10-03 23:31:24

Tags: win, author, article, comment, for, art, lib, color, get

# _*_ coding:utf-8 _*_
# Python 2 script: scrape the hot pages of qiushibaike.com with urllib2 + BeautifulSoup
import urllib2
from bs4 import BeautifulSoup

user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
headers = {"User-Agent": user_agent}
url = "https://www.qiushibaike.com"

# Fetch an article URL and print the text inside its .content element
def getContent(article_url, headers):
    request = urllib2.Request(article_url, data=None, headers=headers)
    response = urllib2.urlopen(request, timeout=60)
    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    contents = soup.select(".content")[0].strings
    print u"Content:"
    for content in contents:
        print u"%s" % content.strip()
    print "\n"

# ----------------------------
# For each page under "https://www.qiushibaike.com/hot/page/<n>/", collect the
# author, vote count, comment count and article URL of every item
def getData(url, headers, pages=1):
    for page in range(1, pages + 1):
        page_url = url + "/hot/page/" + str(page)
        print "Scraping page %s +++" % page
        request = urllib2.Request(page_url, data=None, headers=headers)
        response = urllib2.urlopen(request, timeout=60)
        html = response.read().decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")

        authors = soup.select("h2")
        smile_nums = soup.select(".stats-vote > .number")
        comment_nums = soup.select(".stats-comments > .qiushi_comments > .number")
        article_urls = soup.select(".contentHerf")
        for i in range(25):
            print "Scraping item %s of page %s ---" % (i + 1, page)
            author = authors[i].string.strip()
            print u"Author: %s" % author
            funny_num = smile_nums[i].string
            comment_num = comment_nums[i].string
            print u"Votes: %s" % funny_num
            print u"Comments: %s" % comment_num
            article_url = article_urls[i]["href"]
            article_url = url + article_url
            getContent(article_url, headers)

# ---------------------------------
getData(url, headers, pages=10)



Original: http://www.cnblogs.com/stonelovy/p/7624685.html
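The script above targets Python 2 (urllib2, print statements). Under Python 3 the same request setup lives in urllib.request; the sketch below only builds the request with the custom User-Agent header, leaving the actual network fetch commented out:

```python
import urllib.request

# Same idea as the Python 2 script, but with urllib.request (Python 3)
user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36")
headers = {"User-Agent": user_agent}
url = "https://www.qiushibaike.com/hot/page/1"

# urllib.request normalizes header names, storing this one as 'User-agent'
req = urllib.request.Request(url, data=None, headers=headers)
print(req.get_header("User-agent"))

# The actual fetch (requires network access) would then be:
# html = urllib.request.urlopen(req, timeout=60).read().decode("utf-8")
```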
