NodeJS 爬虫爬取LOL英雄联盟的英雄信息，superagent+cheerio+async

时间：2016-09-01 20:11:05 阅读：379 评论：0 收藏：0 [点我收藏+]

标签：

1.模块使用

（1）superagent：Nodejs中的http请求库（每个语言都有无数个，java的okhttp，ios的afnetworking）

（2）cheerio：Nodejs中的html解析库（几乎每个语言都有类似的库）

（3）async：Nodejs中的同/异步并发函数执行库（这个非常牛，其他语言同类型的不多）

2.爬取内容

多玩的英雄联盟英雄页面，通过解析页面内每个英雄的URL，然后并发请求英雄的详细数据，提取需要的数据得到结果

http://lol.duowan.com/hero/

这次主要是为了熟悉NodeJS爬虫，所以以1星级难度爬虫练手。

3.源码

// Third-party dependencies: HTTP client, HTML parser, and async flow-control helpers.
var superagent = require('superagent');
var cheerio = require('cheerio');
var async = require('async');

// Kick off the crawl as soon as the script is loaded.
start();

// Step 1: request the hero index page with superagent, collect the link of
// every hero on it, then fetch the detail pages with at most 5 in flight.
function start(){
    console.log('爬虫程序开始运行......');
    superagent
    .get('http://lol.duowan.com/hero/')
    .end(function(err, res){
        // A failed index request leaves nothing to parse; report it and stop
        // instead of crashing on `res.text`.
        if(err || !res){
            console.log("error is:"+err);
            return;
        }
        // Parse the returned page with cheerio.
        var $ = cheerio.load(res.text,{decodeEntities: false});
        // Collect every hero link's href; these are the URLs crawled next.
        var heroes = [];
        $("a.lol_champion").each(function(i, e) {
            heroes.push($(e).attr("href"));
        });

        // Crawl the hero pages concurrently, 5 at a time.
        // For a serial crawl, call async.mapSeries(heroes, iterator, done)
        // with the same iterator and completion callback instead.
        async.mapLimit(heroes, 5,
            function (heroUrl, callback) {
                // Per-hero work is delegated to fetchInfo.
                fetchInfo(heroUrl, callback);
            },
            function (err, result) {
                if(err){
                    console.log("error is:"+err);
                }
                // `result` is the array of values handed to each callback.
                // NOTE(review): it may be absent when the map reports an
                // error, so fall back to an empty list rather than throwing.
                var heroList = result || [];
                console.log("抓取结束，共计:"+heroList.length+"个");
                heroList.forEach(function(hero){
                    console.log(JSON.stringify(hero));
                });
            }
        );
    });
}

// Fetch the details of a single hero.
// Number of fetchInfo requests currently in flight; used only for logging.
var concurrencyCount = 0;
/**
 * Requests one hero detail page and extracts the hero's title, name and tags.
 *
 * @param {string} heroUrl - URL of the hero detail page.
 * @param {function} callback - Node-style callback. It is always invoked as
 *   callback(null, hero): `{succ: false}` when the request failed, otherwise
 *   `{succ: true, title, name, type}`.
 */
function fetchInfo(heroUrl, callback){
    concurrencyCount++;
    console.log("...正在抓取:"+ heroUrl + "...当前并发数记录：" + concurrencyCount);
    // Fetch and parse the detail page behind the URL.
    superagent
        .get(heroUrl)
        .end(function(err, res){
            // The request is finished whether it succeeded or not.
            concurrencyCount--;
            if(err){
                // Say which page failed and why instead of a bare "fail".
                console.log("fail:" + heroUrl + " " + err);
                // The error slot stays null on purpose: a non-null error
                // would abort the surrounding map for all remaining heroes.
                callback(null, {
                    succ:false
                });
                return;
            }
            // Parse the fetched hero detail page.
            var $ = cheerio.load(res.text,{decodeEntities: false});
            var heroTitle = $('.hero-title').first().text();
            var heroName = $('.hero-name').first().text();
            // The type is the first and the last tag, separated by a space.
            var heroTags = $('.hero-tag');
            var heroType = heroTags.first().text()+" "+heroTags.last().text();
            console.log('找到英雄:'+heroTitle+" "+heroName+"|"+heroType);
            // Calling back is what releases this concurrency slot.
            callback(null, {
                succ:true,
                title:heroTitle,
                name:heroName,
                type:heroType
            });
        });
}

4.工程demo

https://github.com/rayshen/lolcrawler

NodeJS 爬虫爬取LOL英雄联盟的英雄信息，superagent+cheerio+async

标签：

原文地址：http://www.cnblogs.com/rayshen/p/5830995.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行