码迷,mamicode.com
首页 > 其他好文 > 详细

爬取京东商品信息和评价的爬虫实现源码

时间:2016-05-26 14:25:34      阅读:204      评论:0      收藏:0      [点我收藏+]

标签:

话不多说,直接上源码:

var keyword = "d3.js";//@input(keyword, 查询关键字, 爬取该关键字搜索出来的京东商品)
var comment_count = 100;//@input(comment_count, 爬取的评论数, 最多爬取多少条评论)

var page_count = comment_count / 10;
keyword = keyword.trim();
var scanUrls = [];
scanUrls.push("http://search.jd.com/Search?keyword="+keyword.replace(/ /g, "+")+"&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=1&s=1&click=0");
var helperUrlRegexes = [];
helperUrlRegexes.push("http://search\\.jd\\.com/Search\\?keyword="+keyword.replace(/ /g, "\\+").replace(/\./g, "\\.")+"&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=\\d+&s=1&click=0");

var configs = {
    domains: ["search.jd.com","item.jd.com","club.jd.com"],
    scanUrls: scanUrls,
    contentUrlRegexes: ["http://item\\.jd\\.com/\\d+.html"],
    helperUrlRegexes: helperUrlRegexes, 
    fields: [
        {
            // 第一个抽取项
            name: "title",
            selector: "//div[@id=‘name‘]/h1",
            required: true 
        },
        {
            // 第一个抽取项
            name: "productid",
            selector: "//div[contains(@class,‘fl‘)]/span[2]",
            required: true 
        },
        {
            name: "comments",
            selector: "//div[@id=‘comment-pages‘]/span",
            repeated: true,
            children: [
                {
                    name: "page",
                    selector: "//text()"
                },
                {
                    name: "comments",
                    sourceType: SourceType.AttachedUrl,
                    attachedUrl: "http://club.jd.com/productpage/p-{$.productid}-s-0-t-3-p-{page}.html",
                    selectorType: SelectorType.JsonPath,
                    selector: "$.comments",
                    repeated: true,
                    children:[
                        {
                            name: "com_content",
                            selectorType: SelectorType.JsonPath,
                            selector: "$.content"
                        },
                        {
                            name: "com_nickname",
                            selectorType: SelectorType.JsonPath,
                            selector: "$.nickname"
                        }
                    ]
                }
            ]
        }
    ]
};
configs.afterDownloadPage = function(page, site) {
    var matches = /item\.jd\.com\/(\d+)\.html/.exec(page.url);
    if (!matches) return page;
    var commentUrl = "http://club.jd.com/productpage/p-"+matches[1]+"-s-0-t-3-p-0.html";
    var result = site.requestUrl(commentUrl);
    var data = JSON.parse(result);
    var commentCount = data.productCommentSummary.commentCount;
    var pages = commentCount / 10;
    if (pages > page_count) pages = page_count;
    var pageHtml = "<div id=\"comment-pages\">";
    for (var i = 0; i < pages; i++) {
        pageHtml += "<span>" + i + "</span>";
    }
    pageHtml += "</div>";
    var index = page.raw.indexOf("</body>");
    page.raw = page.raw.substring(0, index) + pageHtml + page.raw.substring(index);
    return page;
};
configs.onProcessHelperUrl = function(url, content, site){
    if(!content.indexOf("抱歉,没有找到")){
        var currentPage = parseInt(url.substring(url.indexOf("&page=") + 6));
        if(currentPage === 0){
            currentPage = 1;
        }
        var page = currentPage + 2;
        var nextUrl = url.replace("&page=" + currentPage, "&page=" + page);
        site.addUrl(nextUrl);
    }
    return true;
};
configs.afterExtractPage = function(page, data) {
    if (data.comments === null || data.comments === undefined) return data;
    var comments = [];
    for (var i = 0; i < data.comments.length; i++) {
        var p = data.comments[i];
        for (var j = 0; j < p.comments.length; j++) {
            comments.push(p.comments[j]);
        }
    }
    data.comments = comments;
    return data;
};
var crawler = new Crawler(configs);
crawler.start();

 

代码怎么运行,看这个:https://github.com/ShenJianShou/crawler_samples/blob/master/%E5%A6%82%E4%BD%95%E6%89%A7%E8%A1%8C%E6%A0%B7%E4%BE%8B%E4%BB%A3%E7%A0%81.txt

爬取京东商品信息和评价的爬虫实现源码

标签:

原文地址:http://www.cnblogs.com/datafactory/p/5530829.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!