标签:源码 name dict orm div utf8 ext format init
说明:无意滋生事端,仅学习分享,如有侵权,立即删除
用到的模块:json、lxml的etree、time.ctime、requests
源码如下:
import json
import time

import requests
from lxml import etree


class BiLiSpider:
    """Scrape the bilibili dance-section ranking page and save it to disk.

    One run downloads the ranking page, stores the raw HTML, extracts the
    title, score, link and author of every ranked entry, and writes the
    extracted data to a JSON file. Both files are created in the current
    working directory.
    """

    def __init__(self):
        # Ranking page to fetch and the browser-like headers sent with it.
        self.url = 'https://www.bilibili.com/ranking/all/129/0/3?spm_id_from=333.851.b_62696c695f7265706f72745f64616e6365.39'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
        }

    @staticmethod
    def _first(node, path):
        """Return the first result of evaluating *path* on *node*, or None."""
        # Evaluate the XPath once instead of once for the length check and
        # once more for the value.
        matches = node.xpath(path)
        return matches[0] if matches else None

    @staticmethod
    def _timestamp():
        """Return ``time.ctime()`` with ':' replaced so it is file-name safe."""
        # ':' is not allowed in Windows file names, so the raw ctime string
        # cannot be embedded in a path there.
        return time.ctime().replace(":", "-")

    def get_response(self):
        """Download ``self.url`` and return the decoded response body.

        Returns:
            The page content decoded as UTF-8 text.

        Raises:
            requests.RequestException: on connection failure, timeout, or a
                4xx/5xx HTTP status.
        """
        # A timeout keeps the script from hanging forever on a stalled server.
        ret = requests.get(self.url, headers=self.headers, timeout=10)
        # Fail loudly instead of saving and parsing an error page.
        ret.raise_for_status()
        return ret.content.decode()

    def make_content(self, ret1):
        """Extract ranking data from the grouped list-item elements.

        Args:
            ret1: iterable of elements exposing ``xpath()``, one per ranked
                entry, in ranking order.

        Returns:
            A list with one single-key dict per entry, in input order. The key
            is the author name formatted as a string (``"None"`` when no
            author was found) and the value is a dict with the keys
            ``"title"``, ``"hot_score"`` and ``"title_href"``; a field that is
            missing from the page is ``None``.
        """
        all_item_list = []  # keeps entries in ranking order
        for ret_ in ret1:
            # Title, score and link of the current entry.
            item = {
                "title": self._first(ret_, './/div[@class="info"]/a/text()'),
                'hot_score': self._first(ret_, './/div[@class="pts"]/div/text()'),
                "title_href": self._first(ret_, './/div[@class="info"]/a/@href'),
            }
            author = self._first(ret_, './/div[@class="detail"]/a/span/text()')
            # Associate the author with the entry's data.
            all_item_list.append({"{}".format(author): item})
        return all_item_list

    def save_file_response(self, ret):
        """Write the raw page text *ret* to a timestamped HTML file."""
        file_name = "spider_bilibil({}).html".format(self._timestamp())
        with open(file_name, 'w', encoding="utf8") as f:
            f.write(ret)
        print("保存响应内容成功")

    def save_file(self, ret):
        """Write the extracted data *ret* to a timestamped JSON file."""
        file_name = "哔哩哔哩舞蹈区前100名内容({}).json".format(self._timestamp())
        with open(file_name, 'w', encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(ret, ensure_ascii=False, indent=2))
        print("保存哔哩哔哩舞蹈区前100名内容成功")

    def run(self):
        """Fetch the page, save the raw HTML, extract and save the ranking."""
        # Fetch the response text.
        ret = self.get_response()
        # Save the raw response.
        self.save_file_response(ret)
        # Parse the text into an element tree.
        html = etree.HTML(ret)
        # Group the tree into one element per ranked entry.
        ret1 = html.xpath("//ul[@class='rank-list']/li[@class='rank-item']")
        # Extract the ranking data.
        all_item = self.make_content(ret1)
        # Save the extracted data.
        self.save_file(all_item)


if __name__ == '__main__':
    obj = BiLiSpider()
    obj.run()
标签:源码 name dict orm div utf8 ext format init
原文地址:https://www.cnblogs.com/nuochengze/p/12770055.html