码迷,mamicode.com
首页 > 其他好文 > 详细

瓜子二手车爬虫源码

时间:2016-06-08 15:41:58      阅读:1459      评论:0      收藏:0      [点我收藏+]

标签:

/*使用JavaScript编写的爬虫源码,用于爬取瓜子二手车上的二手车信息。

代码粘贴到神箭手云爬虫平台(http://www.shenjianshou.cn/)上就可以直接跑了,

不需要安装编译环境。要爬取其他网站,更改源码即可。

 

代码执行具体步骤点这里

更多源码下载点这里

*/

 

 

// Entry URL of the crawl. The trailing "//@input(...)" annotation is kept
// verbatim: it appears to be a directive read by the crawler platform to
// expose this variable as a user-editable input (TODO confirm).
var scanUrl = "http://www.guazi.com/hz/buy/";//@input(scanUrl, 入口url, 请输入一个需爬取城市的url,格式为:“http://www.guazi.com/城市名称/buy/”)

/**
 * Extracts the city segment from an entry URL shaped like
 * "http://www.guazi.com/<city>/buy/".
 *
 * Both marker positions are computed on the trimmed string, so leading or
 * trailing whitespace in the input cannot shift the extracted range.
 *
 * @param {string} url - Entry URL; surrounding whitespace is ignored.
 * @returns {string} The text between ".com/" and "/buy/", or "" when the
 *     URL does not contain both markers in that order with a city between.
 */
function extractCity(url) {
    var trimmed = (url == null ? "" : String(url)).trim();
    var hostEnd = trimmed.indexOf(".com/");
    if (hostEnd === -1) {
        return "";
    }
    var start = hostEnd + 5; // first character after ".com/"
    // Search from the slash that ends ".com/" so that "…com/buy/" (no city)
    // is detected as an empty segment instead of matching a later "/buy/".
    var end = trimmed.indexOf("/buy/", start - 1);
    if (end < start) {
        return "";
    }
    return trimmed.substring(start, end);
}

// City path segment used to build the URL patterns; "" if it cannot be derived.
var city = extractCity(scanUrl);

// Crawler configuration object handed to `new Crawler(...)`.
// The XPath selectors use plain ASCII single quotes for their string
// literals; typographic quotes (U+2018) are not valid XPath delimiters.
// NOTE(review): the meaning of each option is defined by the crawler
// platform; the comments below are read off the names and values only.
var configs = {
    domains: ["guazi.com"],
    scanUrls: [scanUrl],
    // Detail pages: "<city>/<id>.htm" under the selected city.
    contentUrlRegexes: ["https?://www\\.guazi\\.com/" + city + "/\\w+\\.htm"],
    // Listing pages: "<city>/buy/" with an optional "o<N>/" suffix.
    helperUrlRegexes: ["https?://www\\.guazi\\.com/" + city + "/buy/(o\\d+/)?"],
    enableJS: false,
    interval: 10000, // presumably milliseconds between requests — TODO confirm
    fields: [
        {
            name: "car_name",
            selector: "//h1[contains(@class,'dt-titletype')]"
        },
        {
            name: "car_price",
            selector: "//span[contains(@class,'fc-org pricestype')]"
        },
        {
            name: "car_license",
            selector: "//li[contains(@class,'one')]/b"
        },
        {
            name: "car_mileage",
            selector: "//ul[contains(@class,'assort')]/li[2]/b"
        },
        {
            name: "car_gearbox",
            selector: "//ul[contains(@class,'assort')]/li[3]/b"
        },
        {
            name: "car_emission_standard",
            selector: "//li[contains(@class,'em-sta detailHoverTips')]/b"
        },
        {
            name: "car_license_location",
            selector: "//ul[contains(@class,'assort')]/li[5]/b"
        },
        {
            name: "car_owner",
            selector: "//li[contains(@class,'owner')]/text()[2]"
        },
        {
            name: "car_description",
            selector: "//*[@id='base']/p"
        }
    ]
};

/**
 * Post-processes one extracted field value.
 *
 * - "car_price": joins the content of the <b> element (with the "&yen;"
 *   HTML entity turned into the "¥" sign) and the remainder of the fragment.
 * - "car_owner": strips surrounding whitespace.
 * - "car_description": removes every empty "<em></em>" pair.
 * - any other field, or a null/undefined value: returned unchanged.
 *
 * `extract` and `exclude` are not defined in this file; they are assumed to
 * be globals supplied by the crawler platform (TODO confirm their contract).
 *
 * @param {string} fieldName - Name of the field, as declared in configs.fields.
 * @param {string} data - Raw extracted value for that field.
 * @param {*} page - Page context passed by the platform; unused here.
 * @returns {*} The cleaned value.
 */
configs.afterExtractField = function(fieldName, data, page) {
    // Nothing to clean when the selector produced no value.
    if (data == null) {
        return data;
    }
    // Normalise to a primitive string so strict comparison is safe however
    // the platform passes the name.
    var field = String(fieldName);
    if (field === "car_price") {
        // NOTE(review): the published source replaced "¥" with "¥" (a no-op),
        // which looks like an HTML-decoded "&yen;" — restored here.
        var price = extract(data, "//b").replace("&yen;", "¥");
        var coinUnit = exclude(data, "//b");
        return (price + coinUnit);
    }
    if (field === "car_owner") {
        return data.trim();
    }
    if (field === "car_description") {
        // A global pattern is needed: a string pattern removes only the first match.
        return data.replace(/<em><\/em>/g, "");
    }
    return data;
};

// Build the crawler from the configuration above and start it.
// `Crawler` is not defined in this file; presumably it is provided by the
// crawler platform this script is pasted into — TODO confirm.
var crawler = new Crawler(configs);
crawler.start();

 

瓜子二手车爬虫源码

标签:

原文地址:http://www.cnblogs.com/datafactory/p/5569850.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!