码迷,mamicode.com
首页 > Web开发 > 详细

简单的网页小爬虫

时间:2016-07-04 21:51:19      阅读:264      评论:0      收藏:0      [点我收藏+]

标签:


var http = require('http');
var Promise = require('bluebird'); // third-party Promise implementation
var cheerio = require('cheerio');  // HTML parsing / traversal for the scraped pages
var BufferHelper = require('bufferhelper'); // assembles response chunks into one Buffer
var iconv = require('iconv-lite'); // character-encoding conversion

var baseUrl = 'http://www.imooc.com/learn/';
var courseIds = [348, 637, 259, 75, 197]; // IDs of the courses to scrape

// Start every course-page download right away. Each entry is a pending
// promise (not the HTML itself) that settles with the page's HTML, in the
// same order as courseIds.
var pagesArr = courseIds.map(function (cid) {
    return grabPageAsync(baseUrl + cid);
});

/**
 * Downloads one page over HTTP and resolves with its body decoded as UTF-8.
 *
 * Progress ("fetching", "succeeded", "failed") is written to the console.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<string>} Resolves with the page HTML. Rejects with the
 *     error emitted by either the request or the response stream.
 */
function grabPageAsync(url) {
    return new Promise(function (resolve, reject) {
        console.log('正在爬取 ' + url);

        // Shared failure path for the request and the response stream, so a
        // connection that breaks mid-body settles the promise instead of
        // leaving it pending forever.
        function fail(err) {
            console.log('爬取 ' + url + ' 失败');
            reject(err);
        }

        http.get(url, function (res) {
            var bufferHelper = new BufferHelper();

            res.on('data', function (chunk) {
                bufferHelper.concat(chunk);
            });

            res.on('end', function () {
                console.log('爬取 ' + url + ' 成功');

                // Decode only after all chunks are joined so a multi-byte
                // character split across two chunks is not corrupted.
                // iconv.decode already returns a string.
                resolve(iconv.decode(bufferHelper.toBuffer(), 'UTF-8'));
            });

            res.on('error', fail);
        }).on('error', fail);
    });
}

// Once every page has been downloaded, extract each course's data and print it.
Promise
    .all(pagesArr)
    .then(function (pages) {
        // One course record per downloaded page, in the same order as pagesArr.
        var coursesData = pages.map(function (html) {
            return filterChapters(html);
        });

        printCourseInfo(coursesData);
    })
    .catch(function (err) {
        // Promise.all rejects as soon as any single download fails, and a
        // parsing error thrown above lands here as well. Report it instead
        // of leaving an unhandled rejection.
        console.error('获取课程信息失败: ' + (err && err.message ? err.message : err));
    });

/**
 * Extracts a course's title, its headline number and its chapter/video
 * outline from the HTML of a course page.
 *
 * @param {string} html - Full HTML source of a course page.
 * @returns {{title: string, number: number, videos: Array}} Course record.
 *     `videos` holds one `{chapterTitle, videos: [{title, id}]}` entry per
 *     `.chapter` element. `number` is NaN when the expected element is
 *     missing or not numeric.
 */
function filterChapters(html) {
    var $ = cheerio.load(html);
    var courseData = {
        title: $('.hd .l').text(),
        // Fourth ".meta-value strong" element, parsed as a base-10 integer.
        // NOTE(review): printed as the number of learners — confirm against
        // the live page layout.
        number: parseInt($($('.meta-value strong')[3]).text().trim(), 10),
        videos: []
    };

    $('.chapter').each(function () {
        var $chapter = $(this);
        var chapterData = {
            chapterTitle: $chapter.find('strong').text(),
            videos: []
        };

        $chapter.find('.video').children('li').each(function () {
            var $video = $(this).find('.studyvideo');
            // A list item without a .studyvideo link (or a link without an
            // href) yields undefined here. Guard it so one odd entry does
            // not abort the whole extraction with a TypeError.
            var href = $video.attr('href');

            chapterData.videos.push({
                title: $video.text(),
                // The video id is whatever follows "video/" in the link target.
                id: href ? href.split('video/')[1] : undefined
            });
        });

        courseData.videos.push(chapterData);
    });

    return courseData;
}

/**
 * Prints every course's headline, chapter titles and video titles to the
 * console.
 *
 * @param {Array} coursesData - Course records, each shaped like
 *     `{number, title, videos: [{chapterTitle, videos: [{title}]}]}`.
 *     Anything that is not a non-empty array prints a single
 *     "no course information" line instead.
 * @returns {void}
 */
function printCourseInfo(coursesData) {
    if (!Array.isArray(coursesData) || coursesData.length === 0) {
        console.log('暂无课程信息');
        return;
    }

    coursesData.forEach(function (courseData) {
        console.log('\n\n【' + courseData.number + '】人学过《' + courseData.title + '》');
        console.log('----------------------------------------------');

        courseData.videos.forEach(function (chapter) {
            console.log('\n' + chapter.chapterTitle);

            chapter.videos.forEach(function (video) {
                // Titles come straight from the page markup and carry
                // surrounding whitespace, so trim them before printing.
                console.log(' ' + video.title.trim());
            });
        });
    });
}

简单的网页小爬虫

标签:

原文地址:http://www.cnblogs.com/zbPlayer/p/5641594.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!