码迷,mamicode.com
首页 > 其他好文 > 详细

scrapy项目4

时间:2019-01-29 11:01:33      阅读:150      评论:0      收藏:0      [点我收藏+]

标签:pid   name   process   chrome   one   windows   title   字符   war   

# -*- coding: utf-8 -*-
import scrapy
from sun0769.items import Sun0769Item

class Sun07Spider(scrapy.Spider):
    """Crawl the wz.sun0769.com question listing and each linked detail page.

    ``parse`` walks the paginated listing and schedules one detail request
    per row; ``parse_detail`` completes the item with the detail page's text
    and image URLs and yields it to the pipelines.
    """

    name = "sun07"
    allowed_domains = ["wz.sun0769.com"]
    start_urls = ["http://wz.sun0769.com/index.php/question/questionType?type=4&page=0"]

    def parse(self, response):
        """Yield a detail-page request for every listing row, then the next page.

        Each request carries the partially filled item in ``meta["item"]`` so
        that ``parse_detail`` can finish it.
        """
        # All <tr> rows of the listing table.
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        print(tr_list)

        for tr in tr_list:
            item = Sun0769Item()
            item["title"] = tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
            # URL of the detail page.
            item["href"] = tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
            item["publish_date"] = tr.xpath("./td[last()]/text()").extract_first()

            if not item["href"]:
                # extract_first() returns None when the row has no matching
                # link; scrapy.Request rejects a non-string URL, which would
                # abort this generator before pagination is reached.
                continue

            yield scrapy.Request(
                # urljoin leaves absolute URLs untouched and resolves relative ones.
                response.urljoin(item["href"]),
                callback=self.parse_detail,
                meta={"item": item},
            )

        # Example of the pager link:
        # <a href="http://wz.sun0769.com/index.php/question/questionType?type=4&amp;page=100500">></a>
        # A ">" link with an href means there is a next page; without one this
        # is the last page.
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Fill in ``content`` and ``content_img`` on the item and yield it."""
        item = response.meta["item"]
        item["content"] = response.xpath("//td[@class='txt16_3']//text()").extract()
        # The src attributes are incomplete paths, so resolve them against the
        # page URL to obtain absolute image URLs.
        img_srcs = response.xpath("//td[@class='txt16_3']//img/@src").extract()
        item["content_img"] = [response.urljoin(src) for src in img_srcs]
        yield item

pipelines.py

import re
class Sun0769Pipeline(object):
    """Item pipeline that removes whitespace noise from an item's ``content``."""

    # Non-breaking space, any whitespace character, CRLF pair, or tab.
    _BLANKS = re.compile(r"\xa0|\s|\r\n|\t")

    def process_content(self, content):
        """Return ``content`` with blanks removed and empty strings dropped.

        ``content`` is an iterable of strings; every match of ``_BLANKS`` is
        deleted from each string, and strings that end up empty are omitted
        from the returned list.
        """
        stripped = (self._BLANKS.sub("", text) for text in content)
        return [text for text in stripped if text]

    def process_item(self, item, spider):
        """Clean ``item["content"]`` in place, print the item and return it."""
        cleaned = self.process_content(item["content"])
        item["content"] = cleaned
        print(item)
        return item

settings.py

# Only emit log records at WARNING level or above.
LOG_LEVEL = "WARNING"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
)

 项目地址:https://github.com/CH-chen/sun0769

scrapy项目4

标签:pid   name   process   chrome   one   windows   title   字符   war   

原文地址:https://www.cnblogs.com/chvv/p/10332463.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!