标签:length 执行 终端 main war page write 写代码 关系
参考文章: nodejs 爬虫实战
1 Weizhens-Mac-mini:~ weizhen$ cd /Users/weizhen/Sites/nodejs/webcrawler
2 Weizhens-Mac-mini:webcrawler weizhen$ npm init
3 This utility will walk you through creating a package.json file.
4 It only covers the most common items, and tries to guess sensible defaults.
5
6 See `npm help json` for definitive documentation on these fields
7 and exactly what they do.
8
9 Use `npm install <pkg>` afterwards to install a package and
10 save it as a dependency in the package.json file.
11
12 Press ^C at any time to quit.
13 package name: (webcrawler)
14 version: (1.0.0)
15 description: webcrawler
16 entry point: (index.js)
17 test command: none
18 git repository: none
19 keywords: none
20 author: weizhen
21 license: (ISC)
22 About to write to /Users/weizhen/Sites/nodejs/webcrawler/package.json:
23
24 {
25 "name": "webcrawler",
26 "version": "1.0.0",
27 "description": "webcrawler",
28 "main": "index.js",
29 "scripts": {
30 "test": "none"
31 },
32 "repository": {
33 "type": "git",
34 "url": "none"
35 },
36 "keywords": [
37 "none"
38 ],
39 "author": "weizhen",
40 "license": "ISC"
41 }
42
43 Is this OK? (yes)
# --save (short form: -S) records the package under "dependencies" in package.json.
# Note: lowercase -s is npm's shorthand for --silent, not --save.
# express: used to stand up a simple HTTP server
npm install express --save
# superagent: used to request the remote page
npm install superagent --save
# cheerio: jQuery-like API for querying the fetched HTML
npm install cheerio --save
# xlsx: used to generate the Excel file
npm install xlsx --save
/**
 * Fetch a page with superagent, retrying up to 3 times, and hand the
 * response to a callback.
 *
 * @param {string} url - Address of the page to request.
 * @param {Function} [callback] - Called with the superagent response on success.
 * @param {Function} [onError] - Optional. Called with the error when the
 *   request ultimately fails, so the caller can react instead of waiting
 *   forever. When omitted, failures are only logged.
 */
const accessPage = (url, callback, onError) => {
  const superagent = require('superagent');
  superagent
    .get(url)
    .retry(3)
    .end((err, res) => {
      if (err) {
        console.log(`访问页面失败${err}`);
        if (typeof onError === 'function') {
          onError(err);
        }
        return;
      }
      if (typeof callback === 'function') {
        callback(res);
      }
    });
};
11
/**
 * Convert an array of flat record objects into an array-of-arrays table:
 * the first row holds the column names, each following row holds one
 * record's values in that column order.
 *
 * @param {Object[]} arr - Records to tabulate.
 * @returns {Array[]} Header row followed by one row per record, or an empty
 *   array when `arr` is empty or not an array.
 */
const formatData = (arr) => {
  if (!Array.isArray(arr) || arr.length === 0) {
    return [];
  }

  // Build the header from the union of keys across all records (first-seen
  // order) so a key that only appears in a later record is not dropped.
  const fields = [];
  const seen = new Set();
  for (const item of arr) {
    for (const key of Object.keys(item)) {
      if (!seen.has(key)) {
        seen.add(key);
        fields.push(key);
      }
    }
  }

  // A record lacking a column yields `undefined` in that cell.
  const rows = arr.map((item) => fields.map((field) => item[field]));
  return [fields, ...rows];
};
19
/**
 * Write an array-of-arrays table to an Excel workbook file containing a
 * single worksheet.
 *
 * @param {Array[]} sheet_data - Rows of cell values (array of arrays).
 * @param {string} fileName - Path of the workbook file to write.
 * @param {string} [sheetName='sheet1'] - Name given to the worksheet.
 */
const saveToExcel = (sheet_data, fileName, sheetName = 'sheet1') => {
  const XLSX = require('xlsx');
  const sheet = XLSX.utils.aoa_to_sheet(sheet_data); // rows -> worksheet
  const book = XLSX.utils.book_new(); // empty workbook
  XLSX.utils.book_append_sheet(book, sheet, sheetName); // attach worksheet
  XLSX.writeFile(book, fileName); // persist to disk
};
27
28 module.exports = {
29 accessPage,
30 formatData,
31 saveToExcel,
32 };
const express = require('express');
const app = express();
const util = require('./util.js');

// Start the HTTP server on port 3000 and log where it is reachable once it
// is listening.
const server = app.listen(3000, () => {
  // server.address() exposes the bound host under `address`; the previous
  // `adress` spelling destructured undefined and logged "http://undefined:3000".
  const { address, port } = server.address();
  console.log(`App is running at http://${address}:${port}`);
});
9
/**
 * GET / — fetch the Baidu News front page, extract the headline links,
 * save them to out.xlsx when any were found, and send the list back to
 * the client.
 */
app.get('/', (req, res, next) => {
  // TODO: this callback only runs when the fetch succeeds; if the fetch
  // fails, nothing here sends a response and the request stays open.
  util.accessPage('http://news.baidu.com/', (resdom) => {
    try {
      const hostNews = getPageInfo(resdom);
      if (hostNews && hostNews.length > 0) {
        const sheet_data = util.formatData(hostNews);
        util.saveToExcel(sheet_data, 'out.xlsx');
      }
      res.send(hostNews);
    } catch (err) {
      // Parsing and the Excel write run inside an asynchronous callback, so
      // a throw here would bypass Express; forward it to the error handler
      // so the client still receives a response.
      next(err);
    }
  });
});
20
/**
 * Extract the news links from a fetched page.
 *
 * @param {Object} resdom - Response object whose `text` property holds the
 *   page HTML.
 * @returns {{title: string, href: (string|undefined)}[]} One entry per
 *   anchor matched by `div#pane-news ul li a`, in document order.
 */
const getPageInfo = (resdom) => {
  const cheerio = require('cheerio');
  // cheerio.load() parses the HTML document and returns a jQuery-like
  // $(selector) function for querying page elements.
  const $ = cheerio.load(resdom.text);
  const hostNews = [];
  $('div#pane-news ul li a').each((idx, ele) => {
    const link = $(ele);
    hostNews.push({
      title: link.text(),
      href: link.attr('href'),
    });
  });
  return hostNews;
};
标签:length 执行 终端 main war page write 写代码 关系
原文地址:https://www.cnblogs.com/vision2015/p/11434289.html