最近老是写selenium的爬虫,想复习下requests + BeautifulSoup爬取网站内容。
先写一下思路:打开网站,获取网站首页显示的小说 → 根据输入的内容判断是否含有该小说,如果有,就对该小说进行访问 → 打开含有小说目录的网页,匹配章节名称和URL → 循环获取文本内容,并对内容进行清理,写入文本文档。
全部代码:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
import datetime
from bs4 import BeautifulSoup
import time

# Browser-like User-Agent sent with every request made by the helpers below.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
}

# Sentinel string returned by search_book() when the title is unknown.
_NOT_FOUND_MESSAGE = '对不起,您要查询的书籍没有找到。'


def _get_soup(url):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    Raises:
        requests.RequestException: on connection problems, timeouts
            (30 seconds) or an HTTP error status.
    """
    response = requests.get(url, timeout=30, headers=_HEADERS)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()
    # Decode the body with the charset detected from its content.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


def book_info():
    """Return the books listed on the site's home page.

    Two groups of links are collected: the ``a.msgBorder`` anchors (the
    "hot" books) and the ``a`` tags whose class is ``clearfix stitle``
    (the "wonderful" books).

    Returns:
        dict: book title -> book URL. When a title appears in both groups
        the URL from the second group wins.

    Raises:
        requests.RequestException: if the home page cannot be fetched.
        KeyError: if a matched link lacks a ``title`` or ``href`` attribute.
    """
    soup = _get_soup('http://www.quanshuwang.com/')
    # Short pause after hitting the home page, kept from the original script.
    time.sleep(2)

    hot_book = {link['title']: link['href'] for link in soup.select('a.msgBorder')}
    wonderful_book = {
        link['title']: link['href']
        for link in soup.find_all(name='a', attrs={'class': 'clearfix stitle'})
    }
    # Merge both groups into one dictionary; later entries override earlier ones.
    return {**hot_book, **wonderful_book}


def search_book(book_name, book_dict):
    """Look up the book the user asked for.

    Args:
        book_name: title typed by the user.
        book_dict: title -> URL mapping, as returned by ``book_info()``.

    Returns:
        The URL stored for *book_name*, or a fixed "not found" message
        string when the title is not a key of *book_dict*.
    """
    return book_dict.get(book_name, _NOT_FOUND_MESSAGE)


def down_book(url_1):
    """Return the chapter catalogue of the book whose page is *url_1*.

    The book page carries a "start reading" link (``a.reader``); that link
    has to be followed to reach the page that lists the chapters.

    Args:
        url_1: URL of the book's page.

    Returns:
        dict: chapter title -> chapter URL, in page order.

    Raises:
        requests.RequestException: if either page cannot be fetched.
        IndexError: if the book page has no ``a.reader`` link.
        KeyError: if a matched link lacks a ``title`` or ``href`` attribute.
    """
    book_page = _get_soup(url_1)
    read_url = book_page.select('a.reader')[0]['href']

    catalog_page = _get_soup(read_url)
    chapter_links = catalog_page.select('div[class="clearfix dirconone"] a')
    return {link['title']: link['href'] for link in chapter_links}
def write_book(book_name, dicts):
    """Download every chapter in *dicts* and write them to ``<book_name>.txt``.

    For each chapter the title is written as a heading, the chapter page is
    fetched, the text of every ``div.mainContenr`` element is cleaned
    (spaces and the ``style5();`` / ``style6();`` script residue removed),
    echoed to stdout and appended to the file.

    Args:
        book_name: used as the output file name (``.txt`` is appended).
        dicts: chapter title -> chapter URL mapping, iterated in order.

    Returns:
        None. The file is closed when the ``with`` block exits.

    Raises:
        requests.RequestException: if a chapter page cannot be fetched.
        OSError: if the output file cannot be opened or written.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
    # Explicit UTF-8 so the Chinese text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open('%s.txt' % book_name, 'w', encoding='utf-8') as w_b:
        for title, chapter_url in dicts.items():
            w_b.write('\n\n\n%s \n\n\n' % title)
            html4 = requests.get(chapter_url, timeout=30, headers=headers)
            # Decode the body with the charset detected from its content.
            html4.encoding = html4.apparent_encoding
            soup4 = BeautifulSoup(html4.text, 'html.parser')
            # NOTE(review): the first replace() removes ordinary spaces as the
            # listing shows; the author may have meant a non-breaking space
            # that was lost when the post was published - confirm.
            chapter_text = ''.join(
                node.text.replace(' ', '').replace('style5();', '').replace('style6();', '')
                for node in soup4.select('div.mainContenr')
            )
            print(chapter_text)
            w_b.write(chapter_text)


def main():
    """Ask for a book title, download the book if it is listed, report the run time."""
    start_time = datetime.datetime.now()
    book_name = input('请输入你要查询的书籍:')
    # Fetch the home page once and reuse the result; every book_info() call
    # costs a network round trip plus a two-second sleep.
    book_dict = book_info()
    if book_name in book_dict:
        catalog = down_book(search_book(book_name, book_dict))
        write_book(book_name, catalog)
    else:
        # search_book() returns the "not found" message for unknown titles.
        print(search_book(book_name, book_dict))
    end_time = datetime.datetime.now()
    # total_seconds() does not wrap around after a day the way .seconds does.
    cha = int((end_time - start_time).total_seconds())
    print('此次运行耗时%s秒.' % cha)


if __name__ == '__main__':
    main()
代码中都进行了注释,如果有不懂的地方,请在文章下方进行评论。
谢谢您的阅读!
----------------by sniper-huohuo -----------------
------------ 知耻而后勇 --------------