码迷,mamicode.com
首页 > 其他好文 > 详细

第一个小爬虫(改进版)——下书网下载小说v2

时间:2019-04-16 16:33:51      阅读:219      评论:0      收藏:0      [点我收藏+]

标签:工作   decode   使用说明   spl   更新   www.   https   chapter   handler   

已知问题:

  1. 代理地址需要更新
  2. 中断只能重新开始

 

import http.client
import os
import random
import re
import string
import time
import urllib.request
from urllib import request

import requests

path = os.getcwd()  # current working directory; the book file is created here


def open_url(url, retries=5):
    """Fetch *url* through a randomly chosen proxy and return the decoded page.

    Each attempt picks a proxy at random, installs a global urllib opener
    that uses it together with a browser-like User-Agent, and downloads the
    page with a 10 second timeout.  Network, HTTP-protocol and decoding
    failures trigger another attempt with a freshly chosen proxy; any other
    exception (for example a malformed URL) propagates immediately.

    Args:
        url: Address of the page to download.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        ValueError: If *retries* is smaller than 1 or *url* is malformed.
        OSError, http.client.HTTPException, UnicodeDecodeError: The error of
            the last attempt once every attempt has failed.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    # NOTE(review): ProxyHandler keys are URL schemes, so the "http" entries
    # presumably do not apply to https:// URLs -- confirm the intended scheme.
    proxy_list = [
        {"http": "112.85.129.9:9999"},
        {"http": "113.105.202.7:3128"},
        {"http": "180.121.115.181:48184"},
        {"http": "123.162.168.192:40274"},
        {"http": "115.207.77.72:8118"},
        {"http": "112.85.129.9:9999"},
        {"http": "61.184.109.33:61320"},
        # NOTE(review): misspelled scheme and missing port, kept verbatim from
        # the original listing -- replace with a real proxy address.
        {"htpps": "58.218.201.188:"},
    ]
    headers = [(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    )]

    last_error = None
    for _ in range(retries):
        proxy = random.choice(proxy_list)
        print(str(proxy))
        opener = request.build_opener(request.ProxyHandler(proxy))
        opener.addheaders = headers
        # Kept from the original: the opener is installed process-wide.
        request.install_opener(opener)
        try:
            with request.urlopen(url, timeout=10) as page:
                return page.read().decode("utf-8")
        except (OSError, http.client.HTTPException, UnicodeDecodeError) as err:
            # Bounded retry instead of the original unbounded recursion.
            print("出错额!!!!!!!!!!!!!!!!!!!!!")
            last_error = err
    raise last_error


def get_txt(html):
    """Extract one chapter from *html* and append it to the book's text file.

    The chapter heading is cut out of the ``<head>...<title>`` region and the
    chapter text out of the ``<div id="tac">`` region.  Both are taken from
    the string form of the ``re.findall`` result list using fixed character
    offsets.

    NOTE(review): the offsets 51, 92 and -60 are presumably tuned to the
    markup of the target site's chapter pages -- confirm against a live page.
    The string literals were reconstructed from a listing that had lost its
    quotes; the six-space replacement in particular should be double-checked.

    Relies on the module-level names ``path`` (output directory) and ``l5``
    (book name, assigned in the ``__main__`` section).

    Args:
        html: Source text of a chapter page.
    """
    head_pattern = r'<head>[\s\S]*<title>.*</title>'
    body_pattern = r'<div id="tac">[\s\S]*<div class="info bottominfo">'

    # Heading: drop the fixed-length prefix, strip quote residue left by the
    # list-to-string conversion, and keep the text before the first comma.
    heading = str(re.findall(head_pattern, html))[51:]
    for residue in ("'", '"', '>]'):
        heading = heading.replace(residue, '')
    heading = heading.split(',')[0]

    # Body: the text is the str() of a list, so the raw six-character
    # sequence \u3000 is what gets replaced here, not the character itself.
    body = str(re.findall(body_pattern, html))[92:]
    body = body.replace(r'\u3000\u3000', '      ').replace('<br/><br/>', '\n')[:-60]
    body = body.replace('*', ' ')

    # Same path expression as the sibling functions so that all of them append
    # to one file; the context manager closes the handle the original leaked.
    with open(path + r'\\%s.txt' % (l5), 'a') as f:
        f.write(heading)
        f.write('\n\n')
        f.write(body)
        f.write('\n\n\n')
    print(heading + '→→→下载完成→→→')


def get_titlename(html):
    """Write the book title line at the top of the book's text file.

    The title is taken from the ``<head>...<title>`` region of *html*: a
    fixed-length prefix is skipped, the text before the first underscore is
    kept, ``txt下载`` is replaced by a line break plus a dash marker, and
    ASCII parentheses are removed.

    NOTE(review): the offset 43 is presumably tuned to the target site's index
    page markup -- confirm against a live page.

    Relies on the module-level names ``path`` (output directory) and ``l5``
    (book name, assigned in the ``__main__`` section).

    Args:
        html: Source text of the book's index page.
    """
    pattern = r'<head>[\s\S]*<title>.*</title>'
    title = str(re.findall(pattern, html))[43:].split('_')[0]
    title = title.replace('txt下载', '\n  ——').replace('(', '').replace(')', '')
    print(title + '→正在下载')

    # Context manager closes the handle that the original left open.
    with open(path + r'\\%s.txt' % (l5), 'a') as f:
        f.write(title)
        f.write('\n\n')
    print(title + '→→→titlename下载完成→→→')


def get_txtname(html):
    """Derive the book name from *html* and make sure its text file exists.

    The name is the part of the ``<head>...<title>`` region that follows a
    fixed-length prefix and precedes the first occurrence of ``txt``.

    NOTE(review): the offset 43 is presumably tuned to the target site's index
    page markup -- confirm against a live page.

    Relies on the module-level name ``path`` (output directory).

    Args:
        html: Source text of the book's index page.

    Returns:
        The book name, used as the stem of the output file name.
    """
    pattern = r'<head>[\s\S]*<title>.*</title>'
    name = str(re.findall(pattern, html))[43:].split('txt')[0]

    # Opening in append mode creates the file if it is missing.  The original
    # wrote ``f.close`` without parentheses, so the handle was never closed.
    with open(path + r'\\%s.txt' % (name), 'a'):
        pass
    return name


if __name__ == '__main__':
    # Interactive entry point: ask for the book's directory id and chapter
    # count, then download the title and every chapter into one text file.
    # NOTE(review): the usage banner was reconstructed from a listing that had
    # lost its quotes; it is assumed to be two adjacent string literals.
    print('\n使用说明:'
          '示例:《武道乾坤》,URL https://www.xiashu.la/2186/  ,该书目录为即为2186')
    url0 = 'https://www.xiashu.la'
    ml = input('请输入目录')
    url1 = url0 + '/' + ml + '/'
    print('你输入的目录为:%s' % url1)
    chapters = int(input('请输入总章节数(示例80页,则输入80):'))
    print("当前工作目录 : %s" % path)

    # Fetch the index page once and reuse it; the original downloaded it three
    # times and called get_txtname twice.
    index_html = open_url(url1)
    # ``l5`` must stay a module-level name: get_txt and get_titlename read it.
    l5 = get_txtname(index_html)
    get_titlename(index_html)

    for chapter in range(1, chapters + 1):
        url = url1 + 'read_' + str(chapter) + '.html'
        t = random.randint(1, 2)
        print(t)
        # Sleep for the random delay that is printed (seconds); the original
        # computed ``t`` but always slept for exactly one second.
        time.sleep(t)
        get_txt(open_url(url))

 

第一个小爬虫(改进版)——下书网下载小说v2

标签:工作   decode   使用说明   spl   更新   www.   https   chapter   handler   

原文地址:https://www.cnblogs.com/lasttime/p/10717649.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!