scrapy爬虫部分

时间：2020-05-05 23:33:05 阅读：56 评论：0 收藏：0 [点我收藏+]

标签：manage spl format HERE pre enc sel inf page

items.py部分
import scrapy

class App01Item(scrapy.Item):
    """Container for one record scraped by the ``app_01`` spider.

    The first three fields are taken from the listing page and the remaining
    four from the detail page that the listing links to.
    """

    # URL of the detail page the record was scraped from.
    original_url = scrapy.Field()
    # Title of the entry (the ``title`` attribute of the listing link).
    management_info = scrapy.Field()
    # Text of row 2 of the detail table (presumably the company name -- confirm).
    com_name = scrapy.Field()
    # Text of row 4 of the detail table (presumably the penalty number -- confirm).
    punish_num = scrapy.Field()
    # Text of row 8 of the detail table (presumably the handling result -- confirm).
    mana_results = scrapy.Field()
    # Text of row 9 of the detail table (presumably the enforcing department -- confirm).
    law_depart = scrapy.Field()
    # Date text shown next to the entry on the listing page.
    get_date = scrapy.Field()

app_01.py项目部分

# -*- coding: utf-8 -*-

import scrapy
import requests
from pyquery import PyQuery
from app01.items import *

class App01Spider(scrapy.Spider):
    """Scrape the paginated listing on gtghj.wuhan.gov.cn and its detail pages.

    Flow:
      1. ``parse`` reads the total page count from the first listing page and
         schedules every remaining listing page.
      2. ``parse_list`` extracts each entry's link, title and date and
         schedules the detail page.
      3. ``parse_info`` reads the detail table and yields an ``App01Item``.
    """

    name = 'app_01'
    # Scrapy expects bare domain names here. A full URL never matches, which
    # makes the off-site filter drop every follow-up request.
    allowed_domains = ['gtghj.wuhan.gov.cn']
    start_urls = ['http://gtghj.wuhan.gov.cn/pt-2256-7-1.html']

    # Listing pages share this URL pattern; page numbers start at 1.
    list_url_template = 'http://gtghj.wuhan.gov.cn/pt-2256-7-{}.html'
    # Listing pages are decoded explicitly with this encoding.
    list_encoding = 'gbk'

    def parse(self, response):
        """Handle the first listing page and fan out to the other pages.

        The pager text is split on ``/`` and the second part is used as the
        total number of pages. If it cannot be read, only the current page
        is scraped and a warning is logged.

        Yields:
            scrapy.Request: detail-page requests for page 1 (via
            ``parse_list``) followed by one request per remaining listing
            page.
        """
        pager_text = PyQuery(response.text)('#info > div > strong').text()
        try:
            total_pages = int(pager_text.split('/')[1])
        except (IndexError, ValueError):
            self.logger.warning(
                'Could not read the page count from %r on %s; '
                'scraping the first page only', pager_text, response.url)
            total_pages = 1

        # Page 1 is the response already in hand -- no need to download it again.
        yield from self.parse_list(response)

        # Let Scrapy's scheduler fetch the other pages instead of blocking the
        # crawler with synchronous HTTP calls.
        for page_number in range(2, total_pages + 1):
            yield scrapy.Request(
                url=self.list_url_template.format(page_number),
                callback=self.parse_list,
            )

    def parse_list(self, response):
        """Extract every entry of one listing page and request its detail page.

        Links (``#info > ul > li > a``) and dates (``#info > ul > li > span``)
        are paired positionally. The title, date and detail URL travel to
        ``parse_info`` through ``Request.meta``.

        Yields:
            scrapy.Request: one request per listing entry that has an ``href``.
        """
        listing = PyQuery(
            response.body.decode(self.list_encoding, errors='replace'))
        links = listing('#info > ul > li > a').items()
        dates = listing('#info > ul > li > span').items()
        for link, date_cell in zip(links, dates):
            href = link.attr('href')
            if not href:
                # An anchor without a target cannot lead to a detail page.
                continue
            original_url = 'http://gtghj.wuhan.gov.cn{}'.format(href)  # detail URL
            yield scrapy.Request(
                url=original_url,
                callback=self.parse_info,
                meta={
                    'management_info': link.attr('title'),  # title
                    'get_date': date_cell.text(),  # date
                    'original_url': original_url,
                },
            )

    def parse_info(self, response):
        """Build an ``App01Item`` from a detail page.

        Four cells are read from the table under ``#show``; the other three
        fields come from ``response.meta`` as set by ``parse_list``.

        Yields:
            App01Item: the populated item, handed to Scrapy's item pipelines
            and feed exports.
        """
        item = App01Item()
        detail_s = PyQuery(response.text)
        item['com_name'] = detail_s('#show > table:nth-child(2) > tr > td > div > table > tr:nth-child(2) > td:nth-child(2)').text()
        item['mana_results'] = detail_s('#show > table:nth-child(2) > tr > td > div > table> tr:nth-child(8) > td:nth-child(2)').text()
        item['punish_num'] = detail_s('#show > table:nth-child(2) > tr > td > div > table > tr:nth-child(4) > td:nth-child(2)').text()
        item['law_depart'] = detail_s('#show > table:nth-child(2) > tr > td > div > table> tr:nth-child(9) > td:nth-child(2)').text()
        item['management_info'] = response.meta['management_info']
        item['get_date'] = response.meta['get_date']
        item['original_url'] = response.meta['original_url']
        # Yield instead of print so the item actually reaches pipelines and
        # exporters; Scrapy already logs every scraped item at DEBUG level.
        yield item

main.py部分
from scrapy.cmdline import execute

# Launch the ``app_01`` spider the same way ``scrapy crawl app_01`` would from
# the shell. The guard keeps the crawl from starting when this module is
# merely imported.
if __name__ == '__main__':
    execute(['scrapy', 'crawl', 'app_01'])

scrapy爬虫部分

标签：manage spl format HERE pre enc sel inf page

原文地址：https://www.cnblogs.com/marier/p/12833486.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行