码迷,mamicode.com
首页 > 其他好文 > 详细

糗百热点爬虫

时间:2018-12-04 22:21:20      阅读:172      评论:0      收藏:0      [点我收藏+]

标签:range   list   object   windows   with open   format   from   with   http   

 1 # -*- coding:utf-8 -*-
 2 # Author:Sure Feng
 3 
 4 import requests
 5 from lxml import etree
 6 import json
 7 
 8 
 9 class QiubaiSpider(object):
10     def __init__(self):
11         self.tempt_url = "https://www.qiushibaike.com/8hr/page/{}/"
12         self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
13 
14     def parse_url(self, url):
15         """发送请求,获取响应"""
16         respond = requests.get(url, self.headers)
17         return respond.content.decode()
18 
19     def get_content(self, html_str, num):
20         """提取数据"""
21         html = etree.HTML(html_str)
22         div_list = html.xpath("//div[@id=‘content-left‘]/div")  # 分组
23         content_list = []
24         for div in div_list:
25             item = {}
26             item["page"] = num
27             item["content"] = div.xpath(".//div[@class=‘content‘]/span/text()")
28             item["content"] = [i.replace("\n","") for i in item["content"]]
29             item["author_gender"] = div.xpath(".//div[contains(@class, ‘articleGender‘)]/@class")
30             item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon", "") if len(item["author_gender"])>0 else None
31             item["age"] = div.xpath(".//div[contains(@class, ‘articleGender‘)]/text()")
32             item["age"] = item["age"][0] if len(item["age"])>0 else None
33             item["content_img"] = div.xpath(".//div[@class=‘thumb‘]/a/img/@src")
34             item["content_img"] = "https" + item["content_img"][0] if len(item["content_img"])>0 else None
35             item["author_img"] = div.xpath(".//div[@class=‘author clearfix‘]//img/@src")
36             item["author_img"] = "https:" + item["author_img"][0] if len(item["author_img"])>0 else None
37             item["stats_vote"] = div.xpath(".//span[@class=‘stats-vote‘]/i/text()")
38             item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None
39             content_list.append(item)
40         return content_list
41 
42     def save_conten(self, content_list):
43         """保存"""
44         with open("qiubai.txt", "a", encoding="utf-8") as f:
45             for content in content_list:
46                 f.write(json.dumps(content, ensure_ascii=False, indent=4))
47                 f.write("\n")
48         print("保存成功")
49 
50     def run(self):  # 实现主要逻辑
51         # 获取URL列表,遍历列表
52         start_url = [self.tempt_url.format(i) for i in range(1, 14)]
53         num = 1
54         for url in start_url:
55             # 发送请求,获取响应
56             html_str = self.parse_url(url)
57             # 提取数据
58             content_list = self.get_content(html_str, num)
59             # 保存
60             self.save_conten(content_list)
61             num += 1
62 
63 
# Run the spider only when this file is executed as a script.
# "__main__" has to be a string literal; the bare name raises NameError.
if __name__ == "__main__":
    qiubai_spider = QiubaiSpider()
    qiubai_spider.run()

 

糗百热点爬虫

标签:range   list   object   windows   with open   format   from   with   http   

原文地址:https://www.cnblogs.com/sure-feng/p/10066845.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!