
Multithreaded full-site crawl of 笔趣阁 free novels

Posted: 2019-12-06 11:40:39


import threading
import re
import requests
import pymongo
from queue import Queue, Empty
from bs4 import BeautifulSoup as BP

# A MongoDB server is assumed to be running locally on the default port;
# the lxml package only needs to be installed for the 'lxml' parser below
client = pymongo.MongoClient(host='localhost', port=27017)
mg = client['biquge']
def get_fenlei():
    """
    Scrape every book category from the site's navigation bar.
    :return: list of absolute category URLs
    """
    collection = mg['biquge_info']
    url = 'http://www.xxbqg5200.com/'
    sp = requests.get(url=url, headers=headers, cookies=cookies)
    soup = BP(sp.text, 'lxml')
    fenlei_url = soup.select('#wrapper > div.nav > ul > li > a')
    list1 = []
    for i in fenlei_url:
        href_url = i['href']
        fenlei_name = i.get_text()
        # Only keep links of the form /sort/...; skip "home", "ranking" etc.
        if href_url.split('/')[1] != 'sort':
            continue
        fenlei_href_url = 'http://www.xxbqg5200.com' + href_url
        list1.append(fenlei_href_url)
        try:
            data = {'fenlei_name': fenlei_name, 'fenlei_url': fenlei_href_url}
            collection.insert_one(data)
            print('{} >>>>> stored successfully'.format(fenlei_name))
        except Exception:
            print('{} failed to store'.format(fenlei_name))
    return list1
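# Illustrative only -- a category document written by get_fenlei() has the shape
# {'fenlei_name': '<category name>', 'fenlei_url': 'http://www.xxbqg5200.com/sort/1/'}
# (the actual names and sort ids depend on the live site's navigation bar).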
def get_page():
    """
    Build the paginated listing URLs for every category.
    :return: list of listing-page URLs
    """
    list1_url = get_fenlei()
    list_page = []
    a = 0  # category index: matches the sort id used in the listing URL
    for i in list1_url:
        a += 1
        path = i.split('/')[3]  # the 'sort' path segment of the category URL
        page_href_url = 'http://www.xxbqg5200.com/' + path + '/' + str(a) + '_'
        # The page count per category is hard-coded to 189 pages
        for page in range(1, 190):
            page_url = page_href_url + str(page) + '/'
            list_page.append(page_url)
    return list_page
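# Illustrative only -- for the first category, get_page() produces listing URLs
# of the form http://www.xxbqg5200.com/sort/1_1/ through .../sort/1_189/;
# the 189-page upper bound above is assumed to cover every category.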
def get_tushu_url():
    """
    Worker: pull listing pages off the queue, then scrape every book link,
    every chapter link, and finally each chapter's text.
    """
    while True:
        try:
            # Queue is already thread-safe, so no extra lock is needed;
            # get_nowait() also avoids blocking forever once the queue drains
            url = q.get_nowait()
        except Empty:
            break
        print(url, '###################################')
        collection = mg['biquge_info']
        list1 = []
        sp = requests.get(url=url, headers=headers, cookies=cookies)
        soup = BP(sp.text, 'lxml')
        tushu_url = soup.select('#newscontent > div.l > ul > li > span.s2 > a')
        for tushu_href_url in tushu_url:
            tushu_name_url = tushu_href_url['href']
            tushu_name = tushu_href_url.get_text()
            list1.append(tushu_name_url)
            try:
                data = {'tushu_name': tushu_name, 'tushu_name_url': tushu_name_url}
                collection.insert_one(data)
                print('{} >>>>> stored successfully'.format(tushu_name))
            except Exception:
                print('{} failed to store'.format(tushu_name))
        # Scrape every chapter link of each book
        list2 = []
        for zhang_url in list1:
            response = requests.get(zhang_url, headers=headers, cookies=cookies)
            soup_zhang = BP(response.text, 'lxml')
            zhangjie_url = soup_zhang.select('#list > dl > dd > a')
            for zhang_href in zhangjie_url:
                zhangjie_href = zhang_href['href']
                zhangjie_name = zhang_href.get_text()
                content_url = 'http://www.xxbqg5200.com' + zhangjie_href
                list2.append(content_url)
                try:
                    data_zhangjie = {'zhangjie_name': zhangjie_name, 'zhangjie_href': zhangjie_href}
                    collection.insert_one(data_zhangjie)
                    print('{} >>>>> stored successfully'.format(zhangjie_name))
                except Exception:
                    print('{} failed to store'.format(zhangjie_name))
        # Scrape the text of every chapter
        content_sql = mg['tushu_content']
        for content_list_url in list2:
            response1 = requests.get(content_list_url, headers=headers, cookies=cookies)
            soup_content = BP(response1.text, 'lxml')
            content_nei = soup_content.select('#content')
            for text_content in content_nei:
                # Keep only CJK characters, letters and digits; re.findall needs
                # a string, so extract the tag's text first
                filter_content = re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+', text_content.get_text())
                filter_text_content = ''.join(filter_content)
                try:
                    content_sql.insert_one({'content': filter_text_content})
                    print('>>>>> stored successfully')
                except Exception:
                    print('failed to store')
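# Illustrative only -- the regex above keeps CJK characters, letters and digits,
# so a chapter heading like '第一章 风雪夜(修)' is stored as '第一章风雪夜修'
# once the matches are joined.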
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Referer': 'http://www.xxbqg5200.com/register.php?do=submit',
    }
    # Session cookies from a logged-in account; these expire and must be refreshed
    cookies = {
        'Cookie': 'Hm_lvt_bbb2110ecd75330bec79c7868b24e681=1575524043; PHPSESSID=03pt092b5nb8qsdl6pk425kh87; jieqiUserInfo=jieqiUserId%3D1912%2CjieqiUserName%3Dduanyibo%2CjieqiUserGroup%3D3%2CjieqiUserName_un%3Dduanyibo%2CjieqiUserLogin%3D1575524132; jieqiVisitInfo=jieqiUserLogin%3D1575524132%2CjieqiUserId%3D1912; Hm_lpvt_bbb2110ecd75330bec79c7868b24e681=1575524140',
    }
    q = Queue()
    list_url = get_page()
    for i in list_url:
        q.put(i)
    # Ten worker threads drain the queue concurrently
    threads = [threading.Thread(target=get_tushu_url) for _ in range(10)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
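To sanity-check what the crawler wrote, a minimal sketch along these lines can be run against the same local MongoDB instance (database 'biquge' and the two collection names are taken from the script above; the counts will vary with how far the crawl got):

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
mg = client['biquge']
print(mg['biquge_info'].count_documents({}))     # categories, books and chapter links
print(mg['tushu_content'].count_documents({}))   # chapter bodies
for doc in mg['biquge_info'].find().limit(3):    # peek at a few stored documents
    print(doc)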


Original source: https://www.cnblogs.com/duanlinxiao/p/11993911.html
