#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-11-08 22:33:55
# Project: qsbk

from pyspider.libs.base_handler import *
from lxml import html
from urlparse import urljoin  # Python 2 stdlib; on Python 3 this lives in urllib.parse
import datetime


class Handler(BaseHandler):
    crawl_config = {
    }

    def __init__(self):
        self.start_url = 'https://www.qiushibaike.com/'

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(self.start_url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Parse the listing page with lxml instead of pyspider's built-in PyQuery.
        root = html.fromstring(response.content.decode('utf-8'))
        content_left_node = root.xpath("//div[@id='content-left']")
        div_node_list = content_left_node[0].xpath("./div")
        tasks = []
        for div_node in div_node_list:
            # Author name, detail-page URL and joke text for each list entry.
            title_node = div_node.xpath(
                ".//div[@class='author clearfix']/a[contains(@onclick,'web-list-author-text')]/h2/text()")
            __content_url = div_node.xpath("./a[@class='contentHerf']/@href")
            content_url = urljoin(self.start_url, __content_url[0])
            content_node = div_node.xpath(".//div[@class='content']/span[1]")
            content = content_node[0].xpath('string(.)')
            name = title_node[0]
            info = ''.join(content)
            crawldate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item = {}
            item['name'] = name.strip() if name else name
            item['info'] = info.strip() if info else info
            item['crawldate'] = crawldate
            item['url'] = content_url
            tasks.append(item)
        return {'data': tasks}
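The dict returned by index_page is handed to pyspider's result pipeline and ends up in resultdb by default. If you also want the items in a local file, one common pattern is to override on_result on the handler. The sketch below is not part of the original post: the file name 'qsbk_results.jsonl' and the handling of the {'data': tasks} wrapper are my assumptions; the rest of the Handler class stays exactly as above.

# Hypothetical extension: append scraped items to a JSON Lines file.
import io
import json

class Handler(BaseHandler):
    # ... crawl_config, __init__, on_start and index_page unchanged from above ...

    def on_result(self, result):
        # pyspider calls on_result with whatever the callback returned;
        # index_page returns {'data': tasks}, other callbacks may return nothing.
        if not result or not result.get('data'):
            return
        with io.open('qsbk_results.jsonl', 'a', encoding='utf-8') as f:
            for item in result['data']:
                # ensure_ascii=False keeps the Chinese text readable in the file.
                f.write(json.dumps(item, ensure_ascii=False) + u'\n')
        # Still pass the result on to pyspider's default handling (resultdb).
        super(Handler, self).on_result(result)

With this in place, each run appends one JSON object per joke to the file while the web UI's result view keeps working as before.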
Original article: https://www.cnblogs.com/c-x-a/p/9932720.html