标签:.text collect clear href group parse response search elf
1.爬取目标网站:http://www.zhaoxiaoshuo.com/all.php?c=0&o=0&s=0&f=2&l=0&page=1
2.爬取目标网站信息:小说类型 小说书名 小说作者 小说字数 小说投票数 小说收藏数 小说状态
3.scrapy框架结构:
zhaoxiaoshuo/
    zhaoxiaoshuo/
        spiders/
            __init__.py
            zhaoxiaoshuo.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        __init__.py
    scrapy.cfg
(1)items.py
import scrapy


class ZhaoxiaoshuoItem(scrapy.Item):
    """Scrapy item holding the fields collected for a single novel."""

    # Novel category
    book_category = scrapy.Field()
    # Novel title
    book_name = scrapy.Field()
    # Novel author
    book_author = scrapy.Field()
    # Novel word count
    book_words = scrapy.Field()
    # Number of votes the novel received
    book_vote = scrapy.Field()
    # Number of times the novel was added to a collection
    book_collection = scrapy.Field()
    # Novel status
    book_status = scrapy.Field()
(2)spiders/zhaoxiaoshuo.py
import re

import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request

from zhaoxiaoshuo.items import ZhaoxiaoshuoItem


class ZhaoXiaoShuo(scrapy.Spider):
    """Crawl the zhaoxiaoshuo.com listing pages and emit one item per novel.

    Flow: ``start_requests`` fetches page 1 of each listing,
    ``get_max_page`` reads the pager and fans out to the remaining pages,
    ``parse`` extracts the novels on a listing page and follows each one to
    its detail page, and ``get_information`` builds the final item.
    """

    name = "zhaoxiaoshuo"
    allowed_domains = ['zhaoxiaoshuo.com']
    first_url = 'http://www.zhaoxiaoshuo.com'
    # The ``c`` query parameter is filled in per listing (presumably the
    # category id -- confirm against the site); the URL always ends in page=1.
    base_url = 'http://www.zhaoxiaoshuo.com/all.php?c={}&o=0&s=0&f=2&l=0&page=1'

    def start_requests(self):
        """Yield the page-1 request for listings 2..21 and then listing 0."""
        for i in range(2, 22):
            url = self.base_url.format(str(i))
            yield Request(url, self.get_max_page, meta={'url': url})
        url = self.base_url.format(str(0))
        yield Request(url, self.get_max_page, meta={'url': url})

    def get_max_page(self, response):
        """Handle page 1 of a listing and schedule its remaining pages.

        The page count is taken from the first run of digits in the text of
        the fifth ``.pages a`` link. If that link or number is missing, only
        the page already in hand is processed instead of failing the whole
        listing.
        """
        soup = BeautifulSoup(response.text, "lxml")
        pager_links = soup.select(".pages a")
        match = None
        if len(pager_links) > 4:
            match = re.search(r"\d+", pager_links[4].text)
        if match is None:
            self.logger.warning(
                "Could not read the page count from %s; crawling page 1 only",
                response.url,
            )
            max_page = 1
        else:
            max_page = int(match.group())

        # This response already is page 1. Re-requesting the same URL would be
        # discarded by Scrapy's duplicate filter, so parse it directly.
        yield from self.parse(response)

        # Always substitute into the untouched page-1 URL. Overwriting the
        # variable inside the loop would leave it stuck on page 2, because
        # "page=1" no longer occurs after the first real substitution.
        first_page_url = response.meta['url']
        for page in range(2, max_page + 1):
            page_url = first_page_url.replace("page=1", "page={}".format(page))
            yield Request(page_url, self.parse)

    def parse(self, response):
        """Yield one detail-page request per novel on a listing page.

        Title, status and author are read from the listing row and forwarded
        through ``meta``; the category is read later from the detail page.
        """
        soup = BeautifulSoup(response.text, "lxml")
        # The novel list is the third ``.clearfix`` element on the page.
        ul = soup.select(".clearfix")[2]
        for li in ul.select("li"):
            title_link = li.select(".green")[0]
            name = title_link.text.strip()
            status = li.select(".red")[0].text.strip()
            author = li.select(".width111")[0].text.strip()
            url = self.first_url + title_link['href']
            yield Request(url, self.get_information, meta={
                'name': name,
                'status': status,
                'author': author,
            })

    def get_information(self, response):
        """Build and return the item for one novel from its detail page."""
        item = ZhaoxiaoshuoItem()
        soup = BeautifulSoup(response.text, "lxml")
        stats = soup.select(".r420 p span")
        item['book_category'] = soup.select(".crumbswrap a")[1].text.strip()
        item['book_name'] = response.meta['name']
        item['book_author'] = response.meta['author']
        item['book_words'] = stats[1].text.strip()
        item['book_vote'] = stats[2].text.strip()
        # NOTE(review): this reads the same span as book_vote, which looks
        # like a copy-paste slip. Confirm the correct span index for the
        # collection count against the page markup before changing it.
        item['book_collection'] = stats[2].text.strip()
        item['book_status'] = response.meta['status']
        return item
(3)pipelines.py
因为并没有选择存储,所以没有编辑
(4)其它(默认处理)
标签:.text collect clear href group parse response search elf
原文地址:https://www.cnblogs.com/loveprogramme/p/9419539.html