node.js爬虫，牛刀小试

时间：2018-06-13 11:43:50 阅读：221 评论：0 收藏：0 [点我收藏+]

标签：cti 文章 share \n div lin author err nic

　　暂时未完成，预计端午节前搞完。

/**
 * Load dependencies: superagent issues the HTTP request, cheerio parses the
 * returned HTML, and fs writes the result to disk.
 */
const superagent = require('superagent');
const cheerio = require('cheerio');
const fs = require('fs');
// Alternative target addresses (disabled):
// const reptileUrl = "http://www.jianshu.com/";
// const reptileUrl = "http://www.imooc.com/learn/348";
/**
 * Address of the page to crawl.
 * @type {string}
 */
const reptileUrl = "http://itianti.sinaapp.com/index.php/gpu/";
/**
 * Strip all whitespace (spaces, tabs, line breaks, ...) from a string.
 * @param {string} text - raw text, e.g. the text content of an HTML node
 * @returns {string} the input with every whitespace character removed
 */
function replaceText(text) {
  // `\s` already covers `\n`, so one pass over whitespace runs is enough.
  const whitespaceRun = /\s+/g;
  return text.replace(whitespaceRun, "");
}
/**
 * Core workflow: request the page, extract one record per list entry from the
 * returned HTML, and write the collected records to data/article.json.
 */
superagent.get(reptileUrl).end(function (err, res) {
    // Abort on request failure. Rethrow the original error object so its
    // stack and fields are kept instead of being stringified into a new Error.
    if (err) {
        throw err;
    }
    // Debug output: dump the raw response body.
    console.log(res.text);
    // Parse the response body into a queryable document.
    const $ = cheerio.load(res.text);

    /**
     * Remove `pattern` from an attribute value.
     * Returns undefined (instead of throwing) when the attribute is missing,
     * so one incomplete list entry does not abort the whole crawl.
     */
    function stripPrefix(value, pattern) {
        return typeof value === 'string' ? value.replace(pattern, "") : undefined;
    }

    /**
     * Read the text of the parent of `iconSelector` inside `item`, strip
     * whitespace, and coerce it to a number.
     */
    function readCount(item, iconSelector) {
        return replaceText(item.find(iconSelector).parent().text()) * 1;
    }

    /**
     * Collected records, one per matched list entry.
     * @type {Array<Object>}
     */
    const data = [];
    // NOTE(review): these selectors and attributes (#list-container,
    // data-note-id, /p/, /u/) look like they target the jianshu.com list
    // markup rather than the currently active reptileUrl - confirm that they
    // match the page actually being requested.
    $('#list-container .note-list li').each(function (i, elem) {
        const item = $(elem);
        data.push({
            id: item.attr('data-note-id'),
            slug: stripPrefix(item.find('.title').attr('href'), /\/p\//),
            author: {
                slug: stripPrefix(item.find('.avatar').attr('href'), /\/u\//),
                avatar: item.find('.avatar img').attr('src'),
                nickname: replaceText(item.find('.blue-link').text()),
                sharedTime: item.find('.time').attr('data-shared-at')
            },
            title: replaceText(item.find('.title').text()),
            abstract: replaceText(item.find('.abstract').text()),
            thumbnails: item.find('.wrap-img img').attr('src'),
            collection_tag: replaceText(item.find('.collection-tag').text()),
            reads_count: readCount(item, '.ic-list-read'),
            comments_count: readCount(item, '.ic-list-comments'),
            likes_count: readCount(item, '.ic-list-like')
        });
    });

    // Serialize the result and write it out. writeFile creates a missing file
    // but not a missing directory, so make sure data/ exists first.
    const outputDir = `${__dirname}/data`;
    const payload = JSON.stringify({
        status: 0,
        data: data
    });
    fs.mkdir(outputDir, { recursive: true }, function (mkdirErr) {
        if (mkdirErr) throw mkdirErr;
        fs.writeFile(`${outputDir}/article.json`, payload, function (writeErr) {
            if (writeErr) throw writeErr;
            console.log('写入完成');
        });
    });
});

主要参考了这两篇文章：《10分钟教你撸一个nodejs爬虫系统》和《Node.js学习之网络爬虫（使用cheerio抓取网页数据）》。但由于历史原因，其中一些代码已经不可用了，于是根据这两篇文章的思路，自己重写了一个 Node.js 爬虫。

node.js爬虫，牛刀小试

标签：cti 文章 share \n div lin author err nic

原文地址：https://www.cnblogs.com/zhansu/p/9175948.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行