标签:抓取数据 append choice book exce proc threading 守护线程 线程
增加多线程抓取数据,增加 URL 判断:若数据已抓取,则不再重复抓取(可参考 URL 管理器的做法)
还需要再添加任务队列来限制并发线程数,否则会为每个章节同时开启一个线程
from lxml import etree import requests import time import os import random import urllib3 from multiprocessing import Pool import _thread import threading def getHeaders(): #随机获取一个headers user_agents = [‘Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1‘, ‘Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50‘, ‘Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11‘, ‘Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36‘, ‘Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36‘, ‘Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36‘, ‘Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36‘, ‘Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36‘, ‘Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36‘, ‘Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36‘, ‘Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6‘, ‘Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36‘, ‘Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36‘, ‘Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36‘ ] headers = {‘User-Agent‘: random.choice(user_agents),‘Connection‘:‘close‘} return headers """ request请求头 """ def getRequestHtml(target): req = "" try: req = requests.get(url = target,headers = getHeaders(),verify=False,proxies=None) req.encoding = "gb2312" requests.adapters.DEFAULT_RETRIES = 5 urllib3.disable_warnings() html = req.text return html except 
requests.exceptions.ConnectionError: req.status_code = "Connection refused" """ 获取章节列表和地址 """ def getContents(target,filePath): html = getRequestHtml(target) bookdata = etree.HTML(html) table_list = bookdata.xpath(‘//table[9]//tr[1]//td[2]//table[4]//tr[1]//td[1]//table[1]//a‘) return table_list """ 获取小说内容 """ def getContent(filePath, title,target): html = getRequestHtml(target) bookdata = etree.HTML(html) table_list = bookdata.xpath(‘//table[5]//tr[1]//td[2]//text()‘) saveData(filePath, title, table_list) """ 将小说内容写入到文件 """ def saveData(filepath, name, text): isExists = os.path.exists(filepath) if not isExists: os.makedirs(filepath) url = filepath+name+".txt" with open(url, mode="w", encoding="UTF-8") as f: f.writelines(text) f.write(‘\n\n‘) class myThread(threading.Thread): def __init__(self,filePath,title,url): threading.Thread.__init__(self) self.filePath = filePath self.title = title self.url = url def run(self): getContent(self.filePath, self.title, self.url) if __name__ == ‘__main__‘: #三国演义 目录地址 target = "https://www.kanunu8.com/files/old/2011/2447.html" filePath = "D:\\小说\\三国演义\\" #获取目录列表和地址列表 title_list = getContents(target,filePath) t_start = time.time() threadlist = [] for t in title_list: title = t.text url = "https://www.kanunu8.com/files/old/2011/"+t.get(‘href‘) print(title,url) #先判断是否已经抓取过了该页面 isEx = os.path.isfile(filePath+title+".txt") if not isEx: try: thread1 = myThread(filePath, title, url) thread1.setDaemon(True) # 设置守护线程,父线程会等待子线程执行完后再退出 thread1.start() threadlist.append(thread1) except: print("无法启动线程") else: print("该文件已经存在 不需要再次抓取") for tt in threadlist: tt.join() t_end = time.time() print(‘抓取本书耗时= %s‘ % (t_end - t_start))
Python XPath抓取小说《三国演义》 《三》 多线程简单实例
标签:抓取数据 append choice book exce proc threading 守护线程 线程
原文地址:https://www.cnblogs.com/dangzhengtao/p/12218897.html