
python: scraping all the novels on one qidian listing page

import re
import urllib.request
from bs4 import BeautifulSoup
import time

url = input("URL of the first listing page: ")


def gethtml(url):                       # fetch a page and return it as parsed soup
    page = urllib.request.urlopen(url)
    html = page.read().decode("utf-8")  # html is the page source as a string
    soup = BeautifulSoup(html, "html.parser")
    return soup


def getbookurl(soup):                   # collect the link to every book on the page
    firsturl2 = []
    bookurl = soup.find_all("h4")
    bookurl1 = re.findall(r'<h4><a data-bid=".*?" data-eid=".*?" href="(.*?)" target="_blank"', str(bookurl))
    for i in range(0, len(bookurl1)):
        bookurl = "http:" + bookurl1[i]

        soup1 = gethtml(bookurl)        # fetch each book's page to find its first chapter
        time.sleep(0.2)
        firsturl = soup1.find_all("a", {"class": "red-btn J-getJumpUrl "})
        firsturl1 = re.findall(r'data-firstchapterjumpurl=".*?" href="(.*?)" id="readBtn">', str(firsturl))
        if not firsturl1 or firsturl1[0] == "":   # skip books without a usable jump URL
            continue
        firsturl2.append(firsturl1[0])
    return firsturl2


def getcontent(soup, load):             # extract one chapter and append it to the file
    content = soup.find_all("div", {"class": "read-content j_readContent"})
    content1 = re.compile(r'<p>([\s\S]*?)</p>')
    content2 = content1.findall(str(content))
    content3 = re.sub(r"</?\w+[^>]*>", "", content2[0])    # strip any remaining tags

    # The replacement string was mangled in the original post; the intent appears
    # to be starting a new paragraph after every Chinese full stop.
    content4 = content3.replace("。", "。\n\n")             # chapter text is now complete

    contentname = re.compile(r'<h3 class="j_chapterName">(.*?)</h3>')
    contentname1 = contentname.findall(str(soup))          # chapter title

    book = "-" * 64 + contentname1[0] + "-" * 60 + "\n\n\n" + content4

    with open(load, "a", encoding="utf-8") as f:
        f.write(book)


def nextcontent(soup):                  # return the URL of the next chapter
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    step = re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
    content1 = step.findall(str(content))
    if content1 == []:                  # the last chapter's button carries a different data-eid
        step1 = re.compile(r'<a data-eid="qd_R118" href="(.*?)" id="j_chapterNext">')
        content2 = step1.findall(str(content))
        url = "http:" + content2[0]
        return url
    else:
        url = "http:" + content1[0]
        return url


def panduan(soup):                      # "panduan" = check whether a next chapter exists
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    step = re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
    content1 = step.findall(str(content))
    return content1


soup2 = gethtml(url)
firsturl2 = getbookurl(soup2)

for j in range(0, len(firsturl2)):
    url = "http:" + firsturl2[j]
    soup1 = gethtml(url)
    bookname = re.findall(r'<h1>(.*?)</h1>', str(soup1))
    load = "d:/88/%s.txt" % bookname[0]   # output directory must already exist
    i = 0
    while True:
        soup = gethtml(url)
        getcontent(soup, load)
        url = nextcontent(soup)
        content1 = panduan(soup)
        i += 1
        print("Chapter %d downloaded" % i)

        if content1 == []:                # no next chapter: the book is finished
            break

        time.sleep(0.2)
    print("-------- Book %d downloaded --------" % (j + 1))
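Both nextcontent() and panduan() hinge on the hard-coded data-eid values (qd_R109 / qd_R118) that qidian happened to emit at the time, so they break as soon as the site changes its tracking attributes. A minimal regex-free sketch, assuming the next-chapter button keeps its id="j_chapterNext" attribute as in the markup above (nextcontent2 is a hypothetical name, not part of the original script):

def nextcontent2(soup):
    link = soup.find("a", id="j_chapterNext")   # look the button up by its id
    if link is None or not link.get("href"):
        return None                             # no next chapter: the book is finished
    return "http:" + link["href"]

Returning None also folds the separate panduan() check into the same call: the inner loop can simply break when nextcontent2(soup) returns None.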

Still learning!!!


Original post: http://www.cnblogs.com/jjj-fly/p/6903081.html
