
Python XPath Scraping the Novel Romance of the Three Kingdoms (Part 3): A Simple Multithreading Example


 

This installment adds multithreaded scraping and a URL check: chapters that have already been scraped are not fetched again (a URL manager can be used for this; a minimal sketch follows).
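For reference, here is a minimal URL-manager sketch. The class and method names are illustrative assumptions, not part of the original script, which instead checks whether a chapter's .txt file already exists on disk:

# Illustrative URL manager (not from the original post): one set of
# pending URLs and one set of already-crawled URLs.
class UrlManager:
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs that have already been crawled

    def add_new_url(self, url):
        # Ignore empty URLs and URLs we have already queued or crawled
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set and return it
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url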

A work queue still needs to be added; otherwise a thread is started for every chapter at once (see the worker-pool sketch below).
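One way to do that, sketched here as an assumption rather than taken from the original code, is a fixed-size pool of worker threads fed from a queue.Queue; getContent is the chapter-download function defined in the script below:

import queue
import threading

def worker(task_queue):
    # Each worker pulls (filePath, title, url) tuples until it sees the
    # None sentinel, then exits.
    while True:
        item = task_queue.get()
        if item is None:
            task_queue.task_done()
            break
        file_path, title, url = item
        try:
            getContent(file_path, title, url)   # download function from the script below
        finally:
            task_queue.task_done()

def run_with_pool(tasks, num_workers=4):
    # tasks is an iterable of (filePath, title, url) tuples
    task_queue = queue.Queue()
    threads = [threading.Thread(target=worker, args=(task_queue,), daemon=True)
               for _ in range(num_workers)]
    for t in threads:
        t.start()
    for task in tasks:
        task_queue.put(task)
    for _ in threads:
        task_queue.put(None)      # one sentinel per worker
    task_queue.join()             # block until every task has been processed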

from lxml import etree
import requests
import time
import os
import random
import urllib3
import threading


def getHeaders():
    # Pick a random User-Agent so requests look less uniform
    user_agents = ['Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
                   'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                   'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
                   'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
                   'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
                   'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
                   'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'
                   ]
    headers = {'User-Agent': random.choice(user_agents), 'Connection': 'close'}
    return headers

"""
request请求头
"""
def getRequestHtml(target):
    # Configure retries and silence certificate warnings before the request
    requests.adapters.DEFAULT_RETRIES = 5
    urllib3.disable_warnings()
    try:
        req = requests.get(url=target, headers=getHeaders(), verify=False, proxies=None)
        req.encoding = "gb2312"
        return req.text
    except requests.exceptions.ConnectionError:
        print("Connection refused:", target)
        return None


"""
获取章节列表和地址
"""
def getContents(target, filePath):
    html = getRequestHtml(target)
    bookdata = etree.HTML(html)
    table_list = bookdata.xpath('//table[9]//tr[1]//td[2]//table[4]//tr[1]//td[1]//table[1]//a')
    return table_list

"""
获取小说内容
"""
def getContent(filePath, title, target):
    html = getRequestHtml(target)
    bookdata = etree.HTML(html)
    table_list = bookdata.xpath('//table[5]//tr[1]//td[2]//text()')
    saveData(filePath, title, table_list)


"""
将小说内容写入到文件
"""


def saveData(filepath, name, text):
    # Create the output directory if it does not exist yet
    isExists = os.path.exists(filepath)
    if not isExists:
        os.makedirs(filepath)

    file_name = filepath + name + ".txt"
    with open(file_name, mode="w", encoding="UTF-8") as f:
        f.writelines(text)
        f.write('\n\n')



class myThread(threading.Thread):
    """Worker thread that downloads and saves a single chapter."""
    def __init__(self, filePath, title, url):
        threading.Thread.__init__(self)
        self.filePath = filePath
        self.title = title
        self.url = url

    def run(self):
        getContent(self.filePath, self.title, self.url)


if __name__ == '__main__':
    # Table-of-contents page for Romance of the Three Kingdoms
    target = "https://www.kanunu8.com/files/old/2011/2447.html"
    filePath = "D:\\小说\\三国演义\\"
    # Get the list of chapter titles and links
    title_list = getContents(target, filePath)

    t_start = time.time()
    threadlist = []
    for t in title_list:
        title = t.text
        url = "https://www.kanunu8.com/files/old/2011/" + t.get('href')
        print(title, url)

        # Skip chapters that have already been downloaded
        isEx = os.path.isfile(filePath + title + ".txt")
        if not isEx:
            try:
                thread1 = myThread(filePath, title, url)
                # Daemon threads do not keep the interpreter alive by themselves;
                # the join() loop below is what waits for every download to finish
                thread1.daemon = True
                thread1.start()
                threadlist.append(thread1)
            except Exception:
                print("Unable to start thread")
        else:
            print("File already exists, no need to scrape it again")

    for tt in threadlist:
        tt.join()

    t_end = time.time()
    print('Time to scrape the whole book = %s' % (t_end - t_start))
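One note on the threading code above: marking a thread as a daemon does not, by itself, make the main thread wait for it; the opposite is true, daemon threads are abandoned when the main thread exits, and it is the explicit join() loop that blocks until every chapter has been downloaded. A minimal standalone illustration (hypothetical, not part of the scraper):

import threading
import time

def slow_task():
    time.sleep(2)
    print("task finished")

t = threading.Thread(target=slow_task, daemon=True)
t.start()
# Without this join(), the program may exit before "task finished"
# is printed, because daemon threads are killed when the main thread ends.
t.join()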

  


Original article: https://www.cnblogs.com/dangzhengtao/p/12218897.html
