码迷,mamicode.com
首页 > 其他好文 > 详细

node爬虫 - imooc

时间:2016-01-29 19:52:26      阅读:186      评论:0      收藏:0      [点我收藏+]

标签:

// Node's built-in HTTP client, used to download the course page.
var http = require('http');
// cheerio gives a jQuery-like API for querying the downloaded HTML.
var cheerio = require('cheerio');
// Course page to crawl.
var url = 'http://www.imooc.com/learn/348';

// 过滤章节信息
/**
 * Extract the chapter/video outline from the HTML of a course page.
 *
 * Every element matching `.chapter` becomes one entry; its title is the text
 * of the `strong` elements inside it, and each `li` that contains a
 * `.studyvideo` link with an `href` becomes one video.
 *
 * @param {string} html - Markup of the course page, passed to `cheerio.load`.
 * @returns {Array<{title: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *     One object per chapter, in document order. A video's `id` is whatever
 *     follows `video/` in the link's `href` (undefined if the `href` does not
 *     contain `video/`).
 */
function filterChapters(html) {
    var $ = cheerio.load(html);
    var courseData = [];

    $('.chapter').each(function(index, item) {
        var chapter = $(item);
        var chapterData = {
            'title': chapter.find('strong').text(),
            'videos': []
        };

        chapter.find('li').each(function(index, item) {
            // Look the link up once and reuse it for both the title and the id.
            var link = $(item).find('.studyvideo');
            var href = link.attr('href');

            // An <li> without a video link has no href; skip it instead of
            // crashing on `undefined.split(...)`.
            if (typeof href !== 'string') {
                return;
            }

            chapterData.videos.push({
                'title': link.text(),
                'id': href.split('video/')[1]
            });
        });

        courseData.push(chapterData);
    });

    return courseData;
}
// 显示课程信息
/**
 * Print the course outline to standard output.
 *
 * For each chapter the title is logged, followed by one line per video in the
 * form `【id】title`, with the video title trimmed. Every logged string ends
 * with an extra newline, so entries are separated by a blank line.
 *
 * @param {Array<{title: string, videos: Array<{title: string, id: *}>}>} course
 *     Chapter list in the shape returned by `filterChapters`.
 * @returns {void}
 */
function printCourseInfo(course) {
    course.forEach(function(chapter) {
        console.log(chapter.title + '\n');

        chapter.videos.forEach(function(video) {
            console.log('【' + video.id + '】' + video.title.trim() + '\n');
        });
    });
}


// Download the course page, then parse it and print the chapter outline.
http.get(url, function(response) {
    // Anything other than 200 (redirect, error page, ...) is not the course
    // markup, so report it instead of parsing it.
    if (response.statusCode !== 200) {
        console.error('获取课程错误', 'HTTP ' + response.statusCode);
        response.resume(); // discard the body so the socket is released
        return;
    }

    // Decode inside the stream: appending raw Buffer chunks to a string can
    // corrupt a multi-byte UTF-8 character that is split across two chunks.
    response.setEncoding('utf8');

    var html = '';
    response.on('data', function(data) {
        html += data;
    });
    response.on('end', function() {
        var courseArr = filterChapters(html);
        printCourseInfo(courseArr);
    });
}).on('error', function(err) {
    // Include the underlying reason so a failed request can be diagnosed.
    console.error('获取课程错误', err.message);
});


// Logged immediately: the request above completes asynchronously.
console.log('crawling..');

 

感谢 Scott 老师带来的精彩教学,这是 HTTP 小爬虫的示例代码。

 

node爬虫 - imooc

标签:

原文地址:http://www.cnblogs.com/shinian007/p/5169518.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!