
Novel Crawler



Original source: https://www.dazhuanlan.com/2019/08/26/5d62f6fd2023a/


The crawler code is below.


import requests
import threading
from bs4 import BeautifulSoup
import re
import os
import time
import sys

req_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'UM_distinctid=162afbabff819e-03f2f082776e95-b34356b-1fa400-162afbabff9294; CNZZDATA1259019190=1993576859-1523364262-https%253A%252F%252Fwww.baidu.com%252F%7C1523364262; bookid=124629; chapterid=6510968; chaptername=%25u7B2C1%25u7AE0%2520%25u6797%25u4E2D%25u9634%25u8C0B',
    'Host': 'www.uxiaoshuo.com',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'https://www.uxiaoshuo.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

req_url_base = 'http://www.uxiaoshuo.com'

def get_txt(txt_id):
    txt = {}
    txt['title'] = ''
    txt['id'] = str(txt_id)              # path of the chapter to start from
    _req_url = txt['id'].split('.')
    req_url = req_url_base + txt['id']   # build the full novel URL from the path
    print("Novel URL: " + req_url)
    try:
        res = requests.get(req_url, headers=req_header)   # fetch the first chapter
        soups = BeautifulSoup(res.text, "html.parser")    # parse with BeautifulSoup
        # get the novel title
        txt['title'] = soups.select('#webhtml .box_con .con_top a')[1].text
        # open the output file for this novel
        fo = open('{0}.txt'.format(txt['title']), "ab+")
        # loop, writing chapters until the "next" link is no longer an .html page
        while 1:
            if _req_url[-1] != 'html':
                print(txt['title'] + " downloaded completely!")
                break
            txt['c_title'] = soups.select('#webhtml .box_con .zhangjieming h1')[0].text   # chapter title
            txt['content'] = soups.select('#webhtml .box_con .zhangjieTXT')[0]
            for i in txt['content'].select("script"):   # strip useless nodes
                i.decompose()
            for i in txt['content'].select("div"):
                i.decompose()
            txt['content'] = re.sub(r'\s+', '\r\n\t', txt['content'].text).strip('\r\n')
            # write the chapter title as bytes
            fo.write(('\n' + txt['c_title'] + '\r\n').encode('UTF-8'))
            # write the chapter content as bytes
            fo.write(('\n' + txt['content'] + '\n').encode('UTF-8'))
            print(txt['c_title'])
            # print('Chapter title: ' + txt['c_title'])
            # print("Chapter content:\n" + txt['content'])
            req_url = soups.select('#webhtml .zhangjieming .bottem1 a')[3]['href']
            _req_url = req_url.split('.')
            req_url = req_url_base + req_url
            res = requests.get(req_url, headers=req_header)   # fetch the next chapter
            soups = BeautifulSoup(res.text, "html.parser")
    except Exception as e:
        print(e)
    finally:
        return

get_txt('/124/124629/7404934.html')
get_txt('/135/135169/7373986.html')
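
The script imports threading, but the two get_txt calls above run one after the other. As an illustration only (not part of the original code), a minimal sketch of running the same two downloads concurrently with the already-imported threading module could look like this; it works because each get_txt call writes to its own .txt file, so the threads share no state:

# Sketch: run both downloads in parallel threads instead of the two sequential calls above.
paths = ['/124/124629/7404934.html', '/135/135169/7373986.html']
threads = [threading.Thread(target=get_txt, args=(p,)) for p in paths]
for t in threads:
    t.start()
for t in threads:
    t.join()   # wait for both downloads to finish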


Original post: https://www.cnblogs.com/petewell/p/11410423.html
