标签:cli get 爬取 xpath [] enc item web 本地
import json
import os
import time

from selenium import webdriver


class Douyu:
    """Scrape the room list of a Douyu category page into ``douyu.json``.

    Creating an instance starts Chrome through the driver binary at
    ``../chromedriver.exe`` and loads the category page.  ``run`` then walks
    the paginated room list, appending every room it sees to the output file.

    NOTE(review): the ``find_elements_by_*`` helpers and the positional
    driver-path argument used here belong to the Selenium 3 API and were
    removed in Selenium 4 -- confirm the pinned Selenium version before
    upgrading.
    """

    def __init__(self):
        """Start the browser and request the category landing page."""
        self.driver = webdriver.Chrome('../chromedriver.exe')
        self.driver.get('https://www.douyu.com/g_yz')

    def get_content(self):
        """Collect the rooms shown on the currently loaded page.

        Returns:
            A list of dicts, one per room, with the keys ``主播`` (streamer),
            ``房间名`` (room title), ``热度`` (popularity) and ``封面``
            (cover image URL).
        """
        # Fixed delay so the (re)loaded room list has time to render.
        time.sleep(3)

        driver = self.driver
        rooms = driver.find_elements_by_xpath(
            "//*[@class='layout-Cover-list']/li[@class='layout-Cover-item']")
        # Query each field once per page instead of once per room: every
        # lookup is a round trip to the browser.
        streamers = driver.find_elements_by_xpath(
            '//h2[@class="DyListCover-user"]')
        titles = driver.find_elements_by_xpath(
            '//h3[@class="DyListCover-intro"]')
        heats = driver.find_elements_by_xpath(
            '//span[@class="DyListCover-hot"]')
        covers = driver.find_elements_by_class_name('DyImg-content')

        # zip() stops at the shortest list, so a field that matched fewer
        # elements than there are rooms shortens the result instead of
        # raising IndexError.  ``rooms`` caps the count at the number of
        # list items, as the original index loop did.
        return [
            {
                '主播': streamer.text,
                '房间名': title.get_attribute('title'),
                '热度': heat.text,
                '封面': cover.get_attribute('src'),
            }
            for _room, streamer, title, heat, cover
            in zip(rooms, streamers, titles, heats, covers)
        ]

    def save_content(self, contents):
        """Append each room dict to ``douyu.json`` as pretty-printed JSON.

        Objects are written one after another, separated by a newline, so the
        file is a sequence of JSON objects rather than one JSON document.

        Args:
            contents: Iterable of JSON-serialisable room dicts.
        """
        with open('douyu.json', 'a', encoding='utf-8') as f:
            for content in contents:
                json.dump(content, f, ensure_ascii=False, indent=2)
                # Text mode already translates "\n" to the platform line
                # ending; writing os.linesep here yields "\r\r\n" on Windows.
                f.write('\n')

    def _next_page_button(self):
        """Return the clickable "next page" control, or None on the last page.

        The second ``dy-Pagination-item-custom`` element is treated as the
        "next page" control, as in the original loop.
        """
        buttons = self.driver.find_elements_by_class_name(
            'dy-Pagination-item-custom')
        if len(buttons) < 2:
            return None

        button = buttons[1]
        # NOTE(review): assumes the last page is signalled by
        # aria-disabled="true" or a "...disabled" class on the control or on
        # its parent element -- confirm against the live page markup.
        parent = button.find_element_by_xpath('..')
        for element in (button, parent):
            if element.get_attribute('aria-disabled') == 'true':
                return None
            if 'disabled' in (element.get_attribute('class') or ''):
                return None
        return button

    def run(self):
        """Scrape the first page, then follow "next page" until it runs out."""
        # The landing page was already requested in __init__.
        self.save_content(self.get_content())

        # A WebElement is always truthy, so the loop needs an explicit
        # last-page check rather than testing the element itself.
        while True:
            button = self._next_page_button()
            if button is None:
                break
            button.click()
            self.save_content(self.get_content())


if __name__ == '__main__':
    douyu = Douyu()
    try:
        douyu.run()
    finally:
        # Always shut the browser down, even if scraping fails midway.
        douyu.driver.quit()
标签:cli get 爬取 xpath [] enc item web 本地
原文地址:https://www.cnblogs.com/crazyechoaoo/p/10682715.html