码迷,mamicode.com
首页 > Web开发 > 详细

Node.js(四)【HTTP小爬虫】

时间:2016-04-05 12:26:44      阅读:278      评论:0      收藏:0      [点我收藏+]

标签:

HTTP源码解读

HTTP性能测试

 /**
  * Minimal HTTP server: answers every request with a plain-text greeting.
  * Visiting http://localhost:2016 returns "Hello Nodejs".
  */
 var http = require('http');

 http
     .createServer(function (request, response) {
         response.writeHead(200, {'Content-type': 'text/plain'});
         response.write('Hello Nodejs');
         response.end();
     })
     .listen(2016, function () {
         // Report success only once the port is actually bound; logging right
         // after listen() returns would print even if the bind fails later.
         console.log('success');
     });

 

HTTP小爬虫

//安装cheerio
npm install cheerio

/**********************************/

 /**
  * Fetches the course page and prints its raw HTML to stdout.
  */
 var http = require('http');
 var url = 'http://www.imooc.com/learn/348';

 http.get(url, function (response) {
     var html = '';

     // Decode at the stream level: appending raw Buffer chunks converts each
     // chunk to a string on its own, which corrupts any multi-byte UTF-8
     // character that happens to be split across two chunks.
     response.setEncoding('utf8');

     response.on('data', function (data) {
         html += data;
     });

     response.on('end', function () {
         console.log(html);
     });
 }).on('error', function (error) {
     // Same message as before, followed by the underlying cause.
     console.log('获取课程数据出错', error.message);
 });

 

/**********************************/

 /**
  * Node.js crawler example: scrapes the chapter list of an imooc course page.
  */

 var http = require('http');
 // cheerio is a third-party package; install it first with `npm install cheerio`.
 var cheerio = require('cheerio');
 var url = 'http://www.imooc.com/learn/348';
 8 
 /**
  * Extracts the chapter/video outline from the course page markup.
  *
  * Every element matching `.chapter` becomes one entry. Its title is the text
  * of the `strong` elements inside it, and each `li` that is a direct child of
  * a `.video` element inside it contributes one video, read from the `a`
  * element(s) found in that `li`.
  *
  * @param {string} html Markup of the course page.
  * @returns {Array<{chapterTitle: string, videos: Array<{videoTitle: string, id: (string|undefined)}>}>}
  *     One entry per chapter, in document order. `id` holds the link's `href`
  *     attribute value as-is (undefined when the link has none).
  */
 function filterChapter(html) {
     var $ = cheerio.load(html);
     var courseData = [];

     $('.chapter').each(function () {
         // cheerio invokes the callback with `this` bound to the current element.
         var chapter = $(this);
         var chapterData = {
             chapterTitle: chapter.find('strong').text(),
             videos: []
         };

         chapter.find('.video').children('li').each(function () {
             var link = $(this).find('a');

             chapterData.videos.push({
                 videoTitle: link.text(),
                 id: link.attr('href')
             });
         });

         courseData.push(chapterData);
     });

     return courseData;
 }
59 
/**
 * Prints a course outline to stdout: every chapter title first (each followed
 * by a blank line), then one "[id]title" line per video.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{videoTitle: string, id: string}>}>} courseData
 *     Chapters as produced by filterChapter.
 */
function printCourseInfo(courseData) {
    courseData.forEach(function (item) {
        console.log(item.chapterTitle + '\n');
    });

    courseData.forEach(function (item) {
        // List every video rather than only videos[0], and tolerate chapters
        // without videos (indexing [0] on an empty list threw a TypeError).
        (item.videos || []).forEach(function (video) {
            console.log('[' + video.id + ']' + video.videoTitle);
        });
    });
}
75 
// Download the course page, extract its chapter outline and print it.
http.get(url, function (response) {
    var html = '';

    // Decode at the stream level: appending raw Buffer chunks converts each
    // chunk to a string on its own, which corrupts any multi-byte UTF-8
    // character that happens to be split across two chunks.
    response.setEncoding('utf8');

    response.on('data', function (data) {
        html += data;
    });

    response.on('end', function () {
        var courseData = filterChapter(html);

        printCourseInfo(courseData);
    });
}).on('error', function (error) {
    // Same message as before, followed by the underlying cause.
    console.log('获取课程数据出错', error.message);
});

 

Node.js(四)【HTTP小爬虫】

标签:

原文地址:http://www.cnblogs.com/lqcdsns/p/5354353.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!