nodejs爬虫案例笔记

时间：2016-06-13 23:35:42 阅读：405 评论：0 收藏：0 [点我收藏+]

标签：

用nodeJs制作一个简单的网页爬虫

主要分为三个步骤，向目标请求数据，处理数据，打印数据。需要用到的模块有http，cheerio。

1.准备步骤，引入要使用的模块（下文中的url为要抓取的课程页面地址）

var http=require("http");
var cheerio=require("cheerio");

2.向目标请求数据

// Request the target page, accumulate the response body, then parse and print it.
http.get(url,function(res){
    var html='';

    // Decode as UTF-8 up front so a multi-byte character that is split across
    // two chunks is not corrupted when the chunks are concatenated.
    res.setEncoding("utf8");

    // "data" fires once per received chunk.
    res.on("data",function(data){
        html+=data;
    });

    // "end" fires after the last chunk; the full page is now in html.
    res.on("end",function(){
        var couseData=filterChapter(html);// extract chapter data

        printcouse(couseData);// print it
    });
}).on("error",function(err){
    // Include the error so the cause of the failure is visible.
    console.log("something is error",err)
})

知识点：res（服务器响应）有两个事件：data事件在数据传输时触发，如果数据量比较大，数据会被分成一小段一小段地接收，每接收一段都会触发一次data事件；end事件在所有数据接收完毕时触发。

3.处理数据

/**
 * Parse a course page and collect its chapters together with their video entries.
 *
 * Every ".chapter" element yields one entry: its title is the text of the
 * contained <strong>, and its items are the ".studyvideo" links found in the
 * <li> children of ".video".
 *
 * @param {string} html - Markup of the course page.
 * @returns {Array<{chapterTil: string, chapterCon: Array<{title: string, id: string}>}>}
 *   One object per chapter, in document order.
 */
function filterChapter(html){
    var $=cheerio.load(html);
    var couseData=[];

    $(".chapter").each(function(){
        var chapter=$(this);
        var chapterData={
            chapterTil:chapter.find("strong").text(),
            chapterCon:[]
        };

        chapter.find(".video").children("li").each(function(){
            var chapterDetail=$(this).find(".studyvideo");
            var href=chapterDetail.attr("href");

            // attr() yields undefined when the <li> holds no ".studyvideo" link;
            // skip such items instead of crashing on undefined.split().
            if(typeof href!=="string"){
                return;
            }

            chapterData.chapterCon.push({
                title:chapterDetail.text(),
                // The id is whatever follows "video/" in the link target.
                id:href.split("video/")[1]
            });
        });

        couseData.push(chapterData);
    });

    return couseData;
}

知识点：cheerio模块几乎能够解析任何的 HTML 和 XML 文档，并可以用类似jQuery的语法来进行DOM操作。它是专为服务器端定制的，快速、灵活、精简的jQuery核心实现。

在这里是要找到课程章节名称及子栏目

4.打印数据

/**
 * Print every chapter title followed by its video entries.
 *
 * @param {Array<{chapterTil: string, chapterCon: Array<{title: string, id: string}>}>} couseData
 *   Chapters to print.
 * @returns {void}
 */
function printcouse(couseData){
    couseData.forEach(function(item){
        // Declared locally: assigning to an undeclared name would create an
        // implicit global (and throws in strict mode / ES modules).
        var chapterTil=item.chapterTil;

        console.log(chapterTil+"\n");

        item.chapterCon.forEach(function(chapterDetail){
            console.log(chapterDetail.id+chapterDetail.title+"\n")
        })
    })
}

以上只能抓取一个页面的课程数据，接下来是要在此基础上抓取多个页面的数据。

增加模块promise

1.对请求数据部分的处理，改成返回一个promise对象，方便后续的并发控制

/**
 * Fetch a page over HTTP and resolve with its body.
 *
 * @param {string} url - Address of the page to request.
 * @returns {Promise<string>} Resolves with the full response body once the
 *   response ends; rejects with the error if the request or the response
 *   stream fails.
 */
function getPageAsync(url){
    return new Promise(function(resolve,reject){
        console.log("正在爬取课程")

        http.get(url,function(res){
            var html='';

            // Decode as UTF-8 up front so a multi-byte character that is split
            // across two chunks is not corrupted when chunks are concatenated.
            res.setEncoding("utf8");

            res.on("data",function(data){
                html+=data;
            })

            res.on("end",function(){
                resolve(html)// hands the page body to the promise's then-callbacks
            })

            // Without this, a failure while reading the body would leave the
            // promise pending forever.
            res.on("error",reject)
        }).on("error",function(e){
            reject(e)
            console.log("something is error")
        })
    })
}

2.根据不同的url返回promise对象，将它们都放到数组里面

// Ids of the course pages to crawl.
var videoIds=[348,259,197,75];
// One promise per course page, in the same order as videoIds.
var fetchCouseArr=videoIds.map(function(id){
    return getPageAsync(baseUrl+id)
});

3.并发控制，同时抓取多个页面数据

// Wait for every page request, then parse, rank and print the courses.
Promise
.all(fetchCouseArr)// one promise that fulfils with all page bodies, in input order
.then(function(pages){// pages holds the HTML of every crawled page
    var cousesData=pages.map(function(page){
        return filterChapter(page)// parse a single page
    })

    // A sort comparator must return a number (negative / zero / positive);
    // returning a boolean does not order the array reliably.
    // Sorts by number, largest first.
    cousesData.sort(function(a,b){
        return b.number-a.number
    })

    printcouse(cousesData)// print the result
})
.catch(function(err){
    // Promise.all rejects as soon as any request fails; report it instead of
    // leaving an unhandled rejection.
    console.log("something is error",err)
})

知识点：promise的all方法接受一个由promise对象组成的数组，返回一个新的promise；只有当数组内所有promise都成功（resolve）后，才会执行then方法内的回调函数。上面请求数据时每个promise最后通过resolve(html)传出的html，会按原数组的顺序组成一个数组传给该回调函数，也就是这里的pages参数，pages中的每一项就是一个页面请求到的html数据。

4.数据处理和数据打印原理还是和最开始一样

/**
 * Parse a course page into its title, learner count and chapter list.
 *
 * @param {string} html - Markup of the course page.
 * @returns {{Title: string, videos: Array<{chapterTil: string, chapterCon: Array<{title: string, id: string}>}>, number: number}}
 *   Title is read from the 4th "#main .path>a" link, number from the 3rd
 *   ".meta-value" element (NaN when that text is not numeric), and videos
 *   holds one entry per ".chapter" element.
 */
function filterChapter(html){
    var $=cheerio.load(html);

    var couseData={
        Title:$("#main .path>a").eq(3).children("span").text(),
        videos:[],
        number:parseInt($(".meta-value").eq(2).children("strong").text(),10)
    };

    $(".chapter").each(function(){
        var chapter=$(this);
        var chapterData={
            chapterTil:chapter.find("strong").text(),
            chapterCon:[]
        };

        chapter.find(".video").children("li").each(function(){
            var chapterDetail=$(this).find(".studyvideo");
            var href=chapterDetail.attr("href");

            // attr() yields undefined when the <li> holds no ".studyvideo" link;
            // skip such items instead of crashing on undefined.split().
            if(typeof href!=="string"){
                return;
            }

            chapterData.chapterCon.push({
                title:chapterDetail.text(),
                // The id is whatever follows "video/" in the link target.
                id:href.split("video/")[1]
            });
        });

        couseData.videos.push(chapterData);
    });

    return couseData;
}

/**
 * Print a summary line per course, then each course's chapters and videos.
 *
 * @param {Array<{Title: string, number: number, videos: Array<{chapterTil: string, chapterCon: Array<{title: string, id: string}>}>}>} cousesData
 *   Courses to print, in the order given.
 * @returns {void}
 */
function printcouse(cousesData){
    // First pass: one summary line per course.
    cousesData.forEach(function(courseData){
        console.log(courseData.number+'人学过'+courseData.Title+'\n')
    })

    // Second pass: the chapter and video breakdown of every course.
    cousesData.forEach(function(courseData){
        console.log('###'+courseData.Title+'\n')

        courseData.videos.forEach(function(item){
            var chapterTil=item.chapterTil;

            console.log(chapterTil+"\n");

            item.chapterCon.forEach(function(chapterDetail){
                console.log(chapterDetail.id+chapterDetail.title+"\n")
            })
        })
    })
}

nodejs爬虫案例笔记

标签：

原文地址：http://www.cnblogs.com/scdisplay/p/5582440.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行