标签:elf inter xhtml ini response add nbsp asc llb
settings.py:
# Scrapy project settings for the "wxapp" crawler.
from fake_useragent import UserAgent

BOT_NAME = 'wxapp'

# Packages in which Scrapy looks for (and generates new) spiders.
SPIDER_MODULES = ['wxapp.spiders']
NEWSPIDER_MODULE = 'wxapp.spiders'

# robots.txt rules are not honoured by this crawl.
ROBOTSTXT_OBEY = False

# Download delay applied between requests (see the Scrapy settings reference).
DOWNLOAD_DELAY = 1

# NOTE: UserAgent().random is evaluated once, when this settings module is
# imported, so a single randomly chosen User-Agent is sent with every request
# of a given run (it is not re-randomised per request).
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': str(UserAgent().random),
}

# Enabled item pipelines and their order value.
ITEM_PIPELINES = {
    'wxapp.pipelines.WxappPipeline': 300,
}
wxapp_spider.py:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxappItem


class WxappSpiderSpider(CrawlSpider):
    """Crawl the wxapp-union.com tutorial listing and scrape each article.

    Listing pages (``mod=list&catid=2&page=N``) are only followed to discover
    more links; article pages (``article-*.html``) are handed to
    :meth:`parse_detail` and are not followed any further.
    """

    name = 'wxapp_spider'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['https://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # Pagination links: follow them, no callback.
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        # Article detail links: parse them, do not follow links found inside.
        Rule(
            LinkExtractor(allow=r".+article-.+\.html"),
            callback="parse_detail",
            follow=False,
        ),
    )

    def parse_detail(self, response):
        """Extract title, author, publish time and body text from an article page.

        Args:
            response: The downloaded article page.

        Yields:
            A ``WxappItem`` with ``title``, ``author``, ``time`` and ``content``.
            ``title``, ``author`` and ``time`` are ``None`` when the matching
            node is absent; ``content`` is an empty string in that case.
        """
        # XPath string literals must use plain ASCII quotes; typographic
        # quotes make the expression invalid.
        title = response.xpath("//h1[@class='ph']/text()").get()
        author_p = response.xpath("//p[@class='authors']")
        author = author_p.xpath(".//a/text()").get()
        published = author_p.xpath(".//span[@class = 'time']/text()").get()

        # Join every text node below the article cell into one string.
        fragments = response.xpath("//td[@id='article_content']//text()").getall()
        article = "".join(fragments).strip()

        print(title, author, published)
        print(article)

        yield WxappItem(title=title, author=author, time=published, content=article)
items.py:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WxappItem(scrapy.Item):
    """Container for one scraped article."""

    # Article headline.
    title = scrapy.Field()
    # Author name as shown on the page.
    author = scrapy.Field()
    # Publish time string as shown on the page (not parsed).
    time = scrapy.Field()
    # Concatenated text of the article body.
    content = scrapy.Field()
pipelines.py:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exporters import JsonLinesItemExporter


class WxappPipeline:
    """Write every scraped item to ``wxjc.json`` as one JSON object per line."""

    def __init__(self):
        # The exporter writes encoded bytes, hence the binary mode.
        self.fp = open("wxjc.json", "wb")
        # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
        self.export = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf-8'
        )
        self.export.start_exporting()

    def process_item(self, item, spider):
        """Export ``item`` and pass it on unchanged to later pipelines."""
        self.export.export_item(item)
        return item

    def close_spider(self, spider):
        """Finalise the export and close the output file when the spider ends."""
        try:
            self.export.finish_exporting()
        finally:
            # Close the file even if finalising the exporter fails.
            self.fp.close()
标签:elf inter xhtml ini response add nbsp asc llb
原文地址:https://www.cnblogs.com/djwww/p/14957568.html