码迷,mamicode.com
首页 > 编程语言 > 详细

小说爬取 python + urllib + lxml

时间:2019-10-01 14:02:07      阅读:77      评论:0      收藏:0      [点我收藏+]

标签:com   lib   win64   ade   get   mat   arch   sea   class   

from urllib import parse
from urllib import request
from lxml import etree
import time

class Novel:
    """A novel assembled from downloaded text fragments.

    Positional arguments (kept as ``*args`` for backward compatibility):
        args[0]: Book name; also used as the output file name stem.
        args[1]: Mapping of fragment key -> fragment text.  Keys that are
            decimal strings or ints are ordered numerically, so ``"2"``
            comes before ``"10"``; any other keys follow, ordered as text.

    Attributes:
        name: The book name.
        dict: The fragment mapping that was passed in.
        txt: All fragments concatenated in key order.
    """

    def __init__(self, *args):
        self.name = args[0]
        self.dict = args[1]
        ordered_keys = sorted(self.dict, key=self._fragment_order)
        self.txt = ''.join(self.dict[key] for key in ordered_keys)

    @staticmethod
    def _fragment_order(key):
        """Return a sort key that orders numeric keys by value, not by text."""
        text = str(key)
        if text.isdecimal():
            return (0, int(text), text)
        return (1, 0, text)

    def write(self):
        """Write the full text to ``<name>.txt`` (UTF-8)."""
        # newline='' keeps the explicit '\r\n' separators already present in
        # the text from being translated a second time on Windows.
        with open(self.name + '.txt', 'w', encoding='utf-8', newline='') as f:
            f.write(self.txt)

def get_http_page(url, **kw):
    """Fetch *url* and return the response body decoded to ``str``.

    Args:
        url: Address of the page to download.
        **kw: Optional ``encoding`` keyword naming the charset used to decode
            the body.  Defaults to ``'gbk'`` when not given.

    Returns:
        The decoded page source.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid in the chosen encoding.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    }
    req = request.Request(url, headers=headers)
    # The context manager closes the connection even if read() raises.
    with request.urlopen(req) as response:
        page = response.read()
    # .get() keeps the 'gbk' default when other keywords are passed.
    return page.decode(kw.get('encoding', 'gbk'))

def get_comics_directory(url):
    """Return the URLs of every directory (table-of-contents) page of a book.

    The search-result page at *url* is fetched as UTF-8 and the first link
    matched by the result XPath is followed.  On that page the ``<option>``
    children of the page-selector ``<select>`` are read, and each option's
    ``value`` is turned into an absolute URL on ``https://m.wenxuemi6.com``.

    Args:
        url: Search URL for the book.

    Returns:
        A list of directory-page URLs; empty when the search has no result,
        the result link has no ``href``, or no selector is found.
    """
    url_list = []
    page = get_http_page(url, encoding='utf-8')
    html = etree.HTML(page)
    result = html.xpath('/html/body/div[2]/div/div[2]/h3/a')
    if not result:
        # No search hit: nothing to follow.
        return url_list
    href = result[0].get('href')
    if not href:
        return url_list
    # urljoin leaves absolute links untouched and resolves relative ones.
    book_url = parse.urljoin(url, href)
    page = get_http_page(book_url)
    html = etree.HTML(page)
    select_elements = html.xpath('/html/body/div[4]/div[9]/span[2]/select')
    if select_elements:
        for option_element in select_elements[0].findall('option'):
            value = option_element.get('value')
            if value:
                url_list.append('https://m.wenxuemi6.com{}'.format(value))
    return url_list

def downdload_txt(url_list, **kw):
    """Download the chapter text reachable from a range of directory pages.

    For each selected directory page, every chapter link is fetched; the
    chapter title and body text are collected, and when the chapter body
    contains exactly one ``<p>`` element the ``_2.html`` continuation page is
    fetched as well.  Every collected piece is terminated with ``'\\r\\n'``.

    Args:
        url_list: Directory-page URLs.
        **kw: Optional ``start`` (inclusive, default 0) and ``stop``
            (exclusive, default ``len(url_list)``) indices into *url_list*.
            Values are converted with ``int`` and clamped to the valid range.

    Returns:
        A dict mapping the directory-page index (as a string) to the
        combined text of all chapters linked from that page.  Pages without
        chapter links produce no entry.
    """
    base = 'https://m.wenxuemi6.com'
    total = len(url_list)
    # Clamp instead of rejecting, so stop == len(url_list) is accepted and an
    # out-of-range request can never leave the loop bounds undefined.
    start = max(int(kw.get('start', 0)), 0)
    stop = min(int(kw.get('stop', total)), total)
    print('正在爬取目录和章节地址,请稍等……')
    d = {}
    for count in range(start, stop):
        page = get_http_page(url_list[count])
        html = etree.HTML(page)
        chapter_links = html.xpath('/html/body/div[4]/ul[2]/li/a')
        pieces = []
        for link in chapter_links:
            href = link.get('href')
            if not href:
                continue
            url = '{}{}'.format(base, href)
            print('Download chapters by URL:{}'.format(url))
            html = etree.HTML(get_http_page(url))
            titles = html.xpath('//*[@id="nr_title"]/text()')
            # A title is used only when it is unambiguous.
            pieces.append(titles[0] if len(titles) == 1 else '')
            pieces.extend(html.xpath('//*[@id="nr1"]/text()'))
            if len(html.xpath('//*[@id="nr1"]/p')) == 1:
                # Continuation page: "<chapter>.html" -> "<chapter>_2.html".
                url = '{}{}'.format(base, href[:-5] + '_2.html')
                print('Download chapters by URL:{}'.format(url))
                html = etree.HTML(get_http_page(url))
                pieces.extend(html.xpath('//*[@id="nr1"]/text()'))
            # Pause between chapters to avoid hammering the site.
            time.sleep(1)
        if pieces:
            d['{}'.format(count)] = ''.join(
                piece + '\r\n' for piece in pieces
            )
    return d



if __name__ == '__main__':
    # Ask for a book title and look it up through the site's search page.
    txt_name = input("请输入要搜索的书名:")
    url = 'https://m.wenxuemi6.com/search.php?keyword={}'.format(parse.quote(txt_name))
    url_list = get_comics_directory(url)
    if not url_list:
        # Nothing to download; stop before writing an empty file.
        raise SystemExit('没有找到《{}》的目录,请检查书名'.format(txt_name))

    # Download only the chapters listed on the first directory page.
    d = downdload_txt(url_list, start=0, stop=1)
    n1 = Novel(txt_name, d)
    # Write [txt_name].txt into the current directory.
    n1.write()

    # Download the whole novel: no start/stop means every directory page.
    d2 = downdload_txt(url_list)
    n2 = Novel(txt_name, d2)
    # Write [txt_name].txt into the current directory, replacing the
    # first-page-only file written above.
    n2.write()

 

小说爬取 python + urllib + lxml

标签:com   lib   win64   ade   get   mat   arch   sea   class   

原文地址:https://www.cnblogs.com/Dmail/p/11615049.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!