标签:port name header chrome 方法 spider 逻辑 write return
下载图片
# Download one image over HTTP and save it to the working directory as 51gis.jpg.
import requests

response = requests.get(
    "http://www.51gis.com.cn/static/upload/3e223daf9df6216f/f3e187dfc0e4143a.jpg"
)
# Fail loudly on a 4xx/5xx instead of silently writing an error page to disk.
response.raise_for_status()

# Binary mode: image bytes must not go through text encoding.
with open("51gis.jpg", "wb") as f:
    f.write(response.content)
==================================
import requests


class TiebaSpider(object):
    """Crawl the first 10 listing pages of a Baidu Tieba forum and save each page to disk."""

    def __init__(self, tieba_name):
        # Forum name; also embedded in the saved file names.
        self.tieba_name = tieba_name
        # {} is later filled with the pagination offset (50 posts per page).
        self.url_temp = "http://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        # Browser-like UA so the request is not served a degraded/blocked page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
        }

    def parse_url(self, url):
        """Fetch *url* and return the response body as text."""
        response = requests.get(url, headers=self.headers)
        return response.text

    def save_html(self, url_html, page_num):
        """Save one page's HTML under the name 《forum》-第N页.htm."""
        file_path = "《{}》-第{}页".format(self.tieba_name, page_num)
        # Explicit utf-8: the pages are utf-8 and relying on the platform
        # default (gbk on Chinese Windows) raises UnicodeEncodeError.
        with open(file_path + '.htm', 'w', encoding='utf-8') as f:
            f.write(url_html)

    def get_url_list(self):
        """Build the URLs of the first 10 pages (pn = 0, 50, ..., 450)."""
        return [self.url_temp.format(i * 50) for i in range(10)]

    def run(self):
        """Main flow: build the URL list, fetch each page, save it."""
        url_list = self.get_url_list()
        # enumerate instead of url_list.index(url): O(1) per page and still
        # correct if the list ever contains duplicate URLs.
        for page_num, url in enumerate(url_list, start=1):
            url_html = self.parse_url(url)
            self.save_html(url_html, page_num)


if __name__ == '__main__':
    name = input('请输入你想要爬取的论坛名称:')
    tb_spider = TiebaSpider(name)
    tb_spider.run()
标签:port name header chrome 方法 spider 逻辑 write return
原文地址:https://www.cnblogs.com/gisoracle/p/12286371.html