
Scraping Qisuu (www.qisuu.la) with lxml's CSS Selectors and Saving the Results to MongoDB
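This post walks through a small spider that uses lxml's CSS selector support to crawl the novel listings on www.qisuu.la and save each book's metadata into MongoDB with pymongo. One thing to be aware of: lxml's cssselect() method delegates to the separate cssselect package, so it has to be installed alongside the other dependencies (the package names below are the usual PyPI ones):

pip install requests lxml cssselect fake-useragent pymongo

Before the full spider, here is a minimal sketch of how cssselect() works on its own, run against a made-up HTML fragment (the real page structure may differ):

from lxml import etree

html = """
<ul class="listBox">
  <li><a href="/du/0/1/">Book One</a></li>
  <li><a href="/du/0/2/">Book Two</a></li>
</ul>
"""
tree = etree.HTML(html)
# cssselect() takes a CSS selector string and returns a list of
# matching elements, just like the XPath-based API
for a in tree.cssselect(".listBox li>a"):
    print(a.get("href"), a.text)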

Posted: 2018-12-10 21:57:19


import requests
from lxml import etree
from fake_useragent import UserAgent
import pymongo
class QiShuSpider(object):
    def __init__(self):
        self.base_url = "https://www.qisuu.la/soft/sort01/"
        self.headers = {
            "User-Agent": UserAgent().random,
            "Host": "www.qisuu.la",
            "Referer": "https://www.qisuu.la",
        }
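        # Note: UserAgent().random (from the fake-useragent package) picks a
        # random real-world User-Agent string each time it is accessed, which
        # helps the spider look less like a bot.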



    def get_index_code(self):
        # Track how many reconnection attempts have been made
        retry_link_count = 0
        while True:
            try:
                response = requests.get(self.base_url, headers=self.headers)
            except Exception as e:
                retry_link_count += 1
                print("Failed to connect to qisuu.la, reason:", e)
                print("Attempting reconnection #{}....".format(retry_link_count))
                if retry_link_count >= 5:
                    print("Already tried to connect five times; giving up")
                    # Return an empty list so the caller can still iterate
                    return []
            else:
                html_obj = etree.HTML(response.text)
                # Get the <option> tags of the page dropdown; each value
                # attribute holds the path of one listing page
                option_list = html_obj.cssselect("select>option")
                return option_list

    def get_every_page_code(self):
        option_list = self.get_index_code()
        for option in option_list:
            value = option.get("value")
            # Build the full URL of each listing page
            base_url = "https://www.qisuu.la" + value
            print("Crawling listing page {}".format(base_url))
            response = requests.get(base_url, headers=self.headers).text
            html_obj = etree.HTML(response)
            # Get the list of <a> tags, one per novel on this page
            a_list = html_obj.cssselect(".listBox li>a")
            for a in a_list:
                novel_href = a.get("href")
                # Build the full URL of the novel's detail page
                novel_url = "https://www.qisuu.la" + novel_href
                print("Crawling the novel at {}".format(novel_url))
                self.parse_every_novel(novel_url)

    def parse_every_novel(self, novel_url):
        response = requests.get(novel_url, headers=self.headers)
        response.encoding = "utf-8"
        html_obj = etree.HTML(response.text)
        # Each field sits at a fixed position inside the .detail_right block
        novel_name = html_obj.cssselect(".detail_right>h1")[0].text
        click_num = html_obj.cssselect(".detail_right>ul>li:nth-child(1)")[0].text
        novel_size = html_obj.cssselect(".detail_right>ul>li:nth-child(2)")[0].text
        novel_type = html_obj.cssselect(".detail_right>ul>li:nth-child(3)")[0].text
        update_time = html_obj.cssselect(".detail_right>ul>li:nth-child(4)")[0].text
        novel_status = html_obj.cssselect(".detail_right>ul>li:nth-child(5)")[0].text
        novel_author = html_obj.cssselect(".detail_right>ul>li:nth-child(6)")[0].text
        novel_run_envir = html_obj.cssselect(".detail_right>ul>li:nth-child(7)")[0].text
        novel_lasted_chapter = html_obj.cssselect(".detail_right>ul>li:nth-child(8)>a")[0].text
        dict_novel = {
            "name": novel_name,
            "clicks": click_num,
            "size": novel_size,
            "type": novel_type,
            "update_time": update_time,
            "status": novel_status,
            "author": novel_author,
            "run_environment": novel_run_envir,
            "latest_chapter": novel_lasted_chapter,
        }
        collection.insert_one(dict_novel)

    def start_spider(self):
        self.get_every_page_code()



if __name__ == "__main__":
    # Connect to the local MongoDB instance; parse_every_novel writes
    # into this module-level collection
    client = pymongo.MongoClient(host="localhost", port=27017)
    db = client.novel
    collection = db.novel
    spider = QiShuSpider()
    spider.start_spider()
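
Once the spider has run, it is easy to sanity-check what landed in MongoDB. A minimal sketch, assuming a local MongoDB on the default port and the novel database/collection names used above (count_documents needs pymongo 3.7+):

import pymongo

client = pymongo.MongoClient(host="localhost", port=27017)
collection = client.novel.novel
# How many novels were stored, and what one stored document looks like
print(collection.count_documents({}))
print(collection.find_one())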



Original article: https://www.cnblogs.com/chensang/p/10099088.html
