
Scraping Lagou with Scrapy


1. Using templates

When generating a spider, Scrapy also lets you specify which template to build it from.

The default command for creating a spider file:

scrapy genspider <spider_name> <domain>

You can view Scrapy's available templates with the scrapy genspider --list command:

$ scrapy genspider --list
Available templates:
  basic
  crawl
  csvfeed
  xmlfeed
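
In brief: basic generates a plain Spider; crawl generates a CrawlSpider that follows links according to rules; csvfeed and xmlfeed generate spiders for iterating over CSV rows and XML nodes, respectively.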

The crawl template is based on CrawlSpider, which crawls by following links that match a set of rules. Generate the Lagou spider from it:

$ scrapy genspider -t crawl lagou www.lagou.com

Created spider lagou using template crawl in module:
  ArticleSpider.spiders.lagou
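
For reference, the generated file is only a stub to be filled in; with the Scrapy versions current at the time of writing it looks roughly like this (the exact contents vary by version):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['http://www.lagou.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item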

 

2. Writing lagou.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import LagouJobItemLoader, LagouJobItem
from ..utils.common import get_md5
from datetime import datetime


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    rules = (
        # Rule(LinkExtractor(allow=r'zhaopin/.*'), follow=True),
        # Rule(LinkExtractor(allow=r'gongsi/j\d+.html'), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
    )

    # These two CrawlSpider hooks can be overridden here for extension
    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    def parse_job(self, response):
        # Parse a Lagou job posting page
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css('title', '.job-name::attr(title)')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('salary', '.job_request .salary::text')
        item_loader.add_xpath('job_city', '//*[@class="job_request"]/p/span[2]/text()')
        item_loader.add_xpath('work_years', '//*[@class="job_request"]/p/span[3]/text()')
        item_loader.add_xpath('degree_need', '//*[@class="job_request"]/p/span[4]/text()')
        item_loader.add_xpath('job_type', '//*[@class="job_request"]/p/span[5]/text()')

        item_loader.add_css('tags', '.position-label li::text')
        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_css('job_advantage', '.job-advantage p::text')
        item_loader.add_css('job_desc', '.job_bt div')
        item_loader.add_css('job_addr', '.work_addr')
        item_loader.add_css('company_name', '#job_company dt a img::attr(alt)')
        item_loader.add_css('company_url', '#job_company dt a::attr(href)')
        item_loader.add_value('crawl_time', datetime.now())

        job_item = item_loader.load_item()

        return job_item
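
The get_md5 helper imported from utils.common is never shown in the post. A minimal sketch of what such a helper usually looks like (the actual utils/common.py may differ):

import hashlib


def get_md5(url):
    # Hash the URL to a fixed-length hex digest, used as a stable unique ID
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()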

 

3. Writing items.py

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from w3lib.html import remove_tags
from ArticleSpider.settings import SQL_DATE_FORMAT, SQL_DATETIME_FORMAT


def remove_splash(value):
    # Strip the slash and surrounding whitespace from fields such as job city
    return value.replace("/", "").strip()


def handle_jobaddr(value):
    # Drop the "查看地图" ("view map") link text from the address block
    addr_list = value.split("\n")
    # addr = []
    # for item in addr_list:
    #     if item.strip() != "查看地图":
    #         addr.append(item.strip())
    # return ''.join(addr)
    # shorthand for the commented-out loop above
    addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"]
    return ''.join(addr_list)


class LagouJobItemLoader(ItemLoader):
    default_output_processor = TakeFirst()


class LagouJobItem(scrapy.Item):    # Lagou job posting item
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_city = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(
        input_processor=Join(",")
    )
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into lagou_job(title,url,url_object_id,salary,job_city,work_years,degree_need,
            job_type,publish_time,job_advantage,job_desc,job_addr,company_name,company_url,
            tags,crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        params = (
            self["title"], self["url"], self["url_object_id"], self["salary"], self["job_city"], self["work_years"], self["degree_need"],
            self["job_type"], self["publish_time"], self["job_advantage"], self["job_desc"], self["job_addr"], self["company_name"],
            self["company_url"], self["tags"], self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )

        return insert_sql, params
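
The lagou_job table itself is not shown in the post. For the ON DUPLICATE KEY UPDATE clause above to fire, url_object_id (the MD5 of the URL) must be a primary or unique key. A schema sketch consistent with the INSERT statement, with assumed column types:

CREATE TABLE lagou_job (
    title         VARCHAR(255) NOT NULL,
    url           VARCHAR(300) NOT NULL,
    url_object_id VARCHAR(50)  NOT NULL PRIMARY KEY,
    salary        VARCHAR(20),
    job_city      VARCHAR(20),
    work_years    VARCHAR(20),
    degree_need   VARCHAR(20),
    job_type      VARCHAR(20),
    publish_time  VARCHAR(50),
    job_advantage TEXT,
    job_desc      TEXT,
    job_addr      VARCHAR(255),
    company_name  VARCHAR(100),
    company_url   VARCHAR(300),
    tags          VARCHAR(255),
    crawl_time    DATETIME
) DEFAULT CHARSET=utf8;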

 

4. Writing pipelines.py

# -*- coding: utf-8 -*-

from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors
import json
import codecs   # codecs is similar to open(), but handles much of the encoding work for you
from scrapy.exporters import JsonItemExporter
from scrapy.pipelines.images import ImagesPipeline


class MysqlTwistedPipline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)

        return cls(dbpool)

    def process_item(self, item, spider):
        # Use Twisted to run the MySQL insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)   # handle insertion errors
        return item

    def handle_error(self, failure):
        # Handle exceptions from the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Perform the actual insert
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
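
One optional refinement, not in the original post: addErrback forwards extra positional arguments to the error handler, so the failing item and spider can be passed along for more useful diagnostics. A sketch:

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)   # extra args are forwarded
        return item

    def handle_error(self, failure, item, spider):
        # Log which item failed, not just the traceback
        spider.logger.error("MySQL insert failed for %s: %s", item.get("url"), failure)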

 

5. Editing settings.py

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlTwistedPipline': 300,
}


MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "123"

SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
SQL_DATE_FORMAT = "%Y-%m-%d"
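
With the spider, items, pipeline, and settings in place, start the crawl as usual:

$ scrapy crawl lagou

Note that Lagou is known to block naive crawlers, so in practice you may also need a browser-like USER_AGENT and a sensible DOWNLOAD_DELAY in settings.py.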

 



Original article: https://www.cnblogs.com/trunkslisa/p/9834578.html
