标签:blank 头像 import url 部分 页面 2.0 read 输入
爬取糗事百科段子,页面的URL是 http://www.qiushibaike.com/8hr/page/
使用requests获取页面信息,用XPath 做数据提取
获取每个帖子里的用户头像链接、用户姓名、段子内容、点赞次数和评论次数
# -*- coding:utf-8 -*-
"""Print Qiushibaike posts fetched with requests and extracted with XPath."""
import requests
from lxml import etree

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3: input() returns the typed line unchanged

# Seconds to wait for the server before giving up on a page; without a
# timeout requests.get() may block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def _element_text(node, path, index=0):
    """Return the text of the index-th element matching path, or None.

    None is returned when fewer than ``index + 1`` elements match, and also
    when the matched element has no leading text.
    """
    matches = node.xpath(path)
    if index < len(matches):
        return matches[index].text
    return None


def loadPage(url):
    """Fetch one listing page and print one line per post found on it.

    Each printed line holds, separated by spaces: avatar URL, user name,
    post content, vote count and comment count. A field that cannot be
    located in a post is printed as ``None`` instead of aborting the page.

    url : full URL of the page to download.

    Download and parse errors are printed and the page is skipped; nothing
    is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'}
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        html = etree.HTML(response.text)
    except (requests.RequestException, etree.LxmlError, ValueError) as e:
        # Best effort: report the failure and move on to the next page.
        print(e)
        return
    if html is None:
        # Nothing parseable came back.
        return

    for site in html.xpath('//div[contains(@id,"qiushi_tag")]'):
        avatars = site.xpath('./div/a/img/@src')
        imgUrl = avatars[0] if avatars else None
        username = _element_text(site, './/h2')
        content = _element_text(site, './/div[@class="content"]/span')
        if content is not None:
            content = content.strip()
        # The first <i> under the post is read as the vote count and the
        # second one as the comment count.
        vote = _element_text(site, './/i', 0)
        comments = _element_text(site, './/i', 1)
        # A single formatted argument keeps this line valid both as a
        # Python 2 print statement and as a Python 3 print() call.
        print("%s %s %s %s %s" % (imgUrl, username, content, vote, comments))


def qiushiSpider(url, beginPage, endPage):
    """Crawl every page from beginPage to endPage, both inclusive.

    url       : leading part of the page URL; the page number is appended.
    beginPage : first page number to fetch.
    endPage   : last page number to fetch; nothing is fetched when it is
                smaller than beginPage.
    """
    for page in range(beginPage, endPage + 1):
        loadPage(url + str(page))


if __name__ == "__main__":
    beginPage = int(read_line("请输入起始页:"))
    endPage = int(read_line("请输入结束页:"))
    url = 'http://www.qiushibaike.com/8hr/page/'
    qiushiSpider(url, beginPage, endPage)
保存到 json 文件内
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Scrape Qiushibaike posts and append them to qiushi.json, one JSON object per line."""

import io
import json

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 provides Request and urlopen in urllib.request.
    import urllib.request as urllib2

from lxml import etree

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3: input() returns the typed line unchanged


def _first(values, index=0):
    """Return values[index], or None when the list is too short."""
    return values[index] if index < len(values) else None


def _element_text(node, path, index=0):
    """Return the text of the index-th element matching path, or None."""
    element = _first(node.xpath(path), index)
    return None if element is None else element.text


def loadPage(url):
    """Download one listing page and append its posts to qiushi.json.

    One JSON object per post is written on its own line, with the keys
    "username", "image", "content", "zan" and "comments". "image" is the
    list of every ``src`` found under the post's ``thumb`` div (possibly
    empty); any other field missing from a post is stored as null.

    url : full URL of the page to download.

    Errors raised by urlopen are not caught here and reach the caller.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    # The response body is a string; parse it into an HTML DOM tree.
    text = etree.HTML(html)
    if text is None:
        return
    # Locate every post node. contains() is a fuzzy match: the first
    # argument is the attribute to test, the second a fragment of its value.
    node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')
    if not node_list:
        # Do not create or touch the output file for a page without posts.
        return

    # Open the output once per page instead of once per post.
    with io.open("qiushi.json", "a", encoding="utf-8") as f:
        for node in node_list:
            item = {
                # User name, taken from the alt text of the first image.
                "username": _first(node.xpath('.//img/@alt')),
                # Image links: kept as the full list of matches.
                "image": node.xpath('.//div[@class="thumb"]//@src'),
                # Post content: text of the span inside the content div.
                "content": _element_text(node, './/div[@class="content"]/span'),
                # Like count: text of the first <i> under the post.
                "zan": _element_text(node, './/i', 0),
                # Comment count: text of the second <i> under the post.
                "comments": _element_text(node, './/i', 1),
            }
            line = json.dumps(item, ensure_ascii=False)
            if isinstance(line, bytes):
                # Python 2 returns a byte string when everything is ASCII;
                # the text-mode file only accepts unicode.
                line = line.decode("utf-8")
            f.write(line + u"\n")


def qiushiSpider(url, beginPage, endPage):
    """Crawl every page from beginPage to endPage, both inclusive.

    url       : leading part of the page URL; the page number is appended.
    beginPage : first page number to fetch.
    endPage   : last page number to fetch; nothing is fetched when it is
                smaller than beginPage.
    """
    for page in range(beginPage, endPage + 1):
        loadPage(url + str(page))


if __name__ == "__main__":
    beginPage = int(read_line("请输入起始页:"))
    endPage = int(read_line("请输入结束页:"))
    url = 'http://www.qiushibaike.com/8hr/page/'
    qiushiSpider(url, beginPage, endPage)
标签:blank 头像 import url 部分 页面 2.0 read 输入
原文地址:https://www.cnblogs.com/wanglinjie/p/9193460.html