标签:javascript nodejs 爬虫 url filter
var http = require('http');
var https = require('https');
var fs = require('fs');
var path = require('path');
var iconv = require('iconv-lite');

/**
 * Pick the Node client module that matches the scheme of a URL.
 *
 * @param {string} url Absolute URL about to be requested.
 * @returns {Object} The `https` module for https URLs, otherwise `http`.
 */
function clientFor(url) {
  return /^https:/i.test(url) ? https : http;
}

/**
 * Return a fresh global copy of a pattern so that an `exec` loop over it
 * always terminates and never shares `lastIndex` with the caller's object.
 *
 * @param {RegExp|string} pattern Pattern supplied by the caller.
 * @returns {RegExp} Equivalent pattern that carries the `g` flag.
 */
function toGlobalRegex(pattern) {
  var source = pattern instanceof RegExp ? pattern : new RegExp(pattern);
  var flags = typeof source.flags === 'string'
    ? source.flags
    : (source.ignoreCase ? 'i' : '') + (source.multiline ? 'm' : '');
  if (flags.indexOf('g') < 0) {
    flags += 'g';
  }
  return new RegExp(source.source, flags);
}

/**
 * Small crawler: fetches one page (or a numbered series of pages), runs a
 * filter over the HTML and optionally downloads every matched file.
 *
 * @param {Object} argument Options.
 * @param {string} argument.url Address of the first page.
 * @param {string} [argument.outpath='g://temp/'] Directory for downloaded files.
 * @param {Object} [argument.filter] Ready-made filter; built from `regex`,
 *     `url` and `custom` when omitted.
 * @param {RegExp} [argument.regex] Pattern whose first capture group is the
 *     file URL; defaults to the `src` of `<img>` tags.
 * @param {Function} [argument.custom] `custom(html, getTrueFileUrl)`; replaces
 *     the regex filter when it is a function.
 * @param {Object|boolean} [argument.pagin=false] `{urlRule, start, end}` where
 *     `urlRule` contains the `{page}` placeholder; falsy means single page.
 * @param {boolean} [argument.download] Files are downloaded only when this is
 *     loosely equal to `true`; when omitted nothing is downloaded.
 * @param {string} [argument.charset='gbk'] Encoding used to decode pages.
 * @param {Object} [argument.page] Ready-made page handler.
 */
function webSpider(argument) {
  this.url = argument.url;
  this.outpath = argument.outpath || 'g://temp/';
  this.filter = argument.filter || new filter({
    regex: argument.regex,
    url: argument.url,
    custom: argument.custom
  });
  this.pagin = argument.pagin || false;
  this.download = argument.download;
  this.page = argument.page || new page({
    filter: this.filter,
    outpath: this.outpath,
    download: this.download,
    charset: argument.charset
  });
}

/**
 * Start crawling: a single page when no pagination is configured,
 * otherwise the whole page series.
 */
webSpider.prototype.start = function () {
  if (!this.pagin) {
    this.page.getHtml(this.url);
    return;
  }
  this.paginHandle();
};

/**
 * Request the first page plus every page from `pagin.start` to `pagin.end`
 * (inclusive). All requests are issued without waiting for earlier ones.
 */
webSpider.prototype.paginHandle = function () {
  var pagin = this.pagin;
  var handler = this.page;
  var last = pagin.end;
  var pageNo;

  // The first page has its own address and is not produced by urlRule.
  handler.getHtml(this.url, 1);

  for (pageNo = pagin.start; pageNo <= last; pageNo++) {
    handler.getHtml(pagin.urlRule.replace(/\{page\}/g, String(pageNo)), pageNo);
  }
};

/**
 * Extracts file references from a page, either with a regular expression
 * or with a caller-supplied function.
 *
 * @param {Object} argument Options.
 * @param {RegExp} [argument.regex] Pattern whose first capture group is the URL.
 * @param {Function} [argument.custom] Custom extractor.
 * @param {string} [argument.url] Page address; its origin is used to resolve
 *     relative file URLs.
 */
function filter(argument) {
  // Lazy prefix instead of a greedy `.*` so several <img> tags on one line
  // each produce a match; the whitespace before `src` excludes `data-src`.
  this.regex = argument.regex || /<img\s(?:[^>]*?\s)?src="(.*?)"/g;
  this.custom = argument.custom;
  this.url = argument.url;
  this.domainName = argument.url ? this.tools.getDomain(argument.url) : '';
}

filter.prototype = {
  tools: {
    /**
     * Reduce a URL to `scheme://host`.
     *
     * @param {string} url Absolute URL.
     * @returns {string} Scheme and host without path, query or fragment.
     */
    getDomain: function _getDomain(url) {
      var parsed = /^([a-z][a-z0-9+.\-]*:)\/\/([^\/?#]*)/i.exec(url);
      var parts;
      if (parsed) {
        return parsed[1] + '//' + parsed[2];
      }
      parts = url.split('/');
      return parts[0] + '//' + parts[2];
    },

    /**
     * Turn a file reference found in a page into an absolute URL.
     * Does not use `this`, so it can be handed to callbacks unbound.
     *
     * @param {string} fileUrl Reference as written in the page.
     * @param {string} domain Origin (`scheme://host`) to prefix relative
     *     references with; a trailing slash is tolerated.
     * @returns {string} Absolute URL. Relative references are resolved
     *     against the root of `domain`, not against the page directory.
     */
    getTrueFileUrl: function _getTrueFileUrl(fileUrl, domain) {
      var target = fileUrl == null ? '' : String(fileUrl);
      var base = (domain == null ? '' : String(domain)).replace(/\/+$/, '');
      var scheme;

      if (/^https?:\/\//i.test(target)) {
        return target;
      }
      if (target.indexOf('//') === 0) {
        // Protocol-relative reference: reuse the scheme of the base.
        scheme = /^[a-z][a-z0-9+.\-]*:/i.exec(base);
        return (scheme ? scheme[0] : 'http:') + target;
      }
      return base + '/' + target.replace(/^\/+/, '');
    }
  },

  /**
   * Run the configured extraction over a page.
   *
   * @param {string} html Decoded page source.
   * @returns {Array|undefined} Whatever the custom function returns, or the
   *     list produced by `byRegex`; `undefined` when `html` is empty.
   */
  execute: function _execute(html) {
    if (!html) {
      console.log('html is null');
      return;
    }
    if (typeof this.custom === 'function') {
      return this.custom(html, this.tools.getTrueFileUrl);
    }
    console.log('file -> regex');
    return this.byRegex(html);
  },

  /**
   * Collect every match of the configured pattern.
   *
   * @param {string} html Decoded page source.
   * @returns {Array<{src: string, id: number}>} Absolute file URLs numbered
   *     from 1 in order of appearance.
   */
  byRegex: function _byRegex(html) {
    var found = [];
    var pattern = toGlobalRegex(this.regex);
    var resolve = this.tools.getTrueFileUrl;
    var origin = this.domainName;
    var id = 1;
    var match;

    while ((match = pattern.exec(html)) !== null) {
      if (match.index === pattern.lastIndex) {
        // A zero-length match would never advance; step over it.
        pattern.lastIndex++;
      }
      console.log('>>:' + match[1]);
      found.push({ src: resolve(match[1], origin), id: id });
      id++;
    }
    return found;
  }
};

/**
 * Page handler: fetches the HTML of a page, passes it through the filter
 * and, when enabled, downloads the files the filter returned.
 *
 * @param {Object} argument Options.
 * @param {Object} argument.filter Object exposing `execute(html)`.
 * @param {string} argument.outpath Directory for downloaded files.
 * @param {boolean} [argument.download] Download the results when loosely `true`.
 * @param {string} [argument.charset='gbk'] Encoding used to decode pages.
 */
function page(argument) {
  this.filter = argument.filter;
  this.outpath = argument.outpath;
  this.download = argument.download;
  this.charset = argument.charset || 'gbk';
}

page.prototype = {
  /**
   * Fetch one page and hand its decoded HTML to the filter.
   *
   * @param {string} url Page address.
   * @param {number} [pagei] Page number, used as a prefix for file names.
   */
  getHtml: function _getHtml(url, pagei) {
    var self = this;
    var download = this.download;
    var charset = this.charset || 'gbk';
    var chunks = [];

    clientFor(url).get(url, function (res) {
      // Keep raw buffers and decode once at the end, so multi-byte
      // characters split across chunks are decoded correctly.
      res.on('data', function (chunk) {
        chunks.push(chunk);
      }).on('end', function () {
        var html = iconv.decode(Buffer.concat(chunks), charset);
        var arr = self.filter.execute(html);
        if (download == true) {
          self.downloadFiles(arr, pagei);
        }
      }).on('error', function (err) {
        console.log('getHtml is error: ' + url + ' ' + (err && err.message));
      });
    }).on('error', function (err) {
      console.log('getHtml is error: ' + url + ' ' + (err && err.message));
    });
  },

  /**
   * Download every entry of a result list.
   *
   * @param {Array<{src: string, id: *}>} arr Entries returned by the filter.
   * @param {number} [pagei] Page number, used as a prefix for file names.
   */
  downloadFiles: function _downloadFiles(arr, pagei) {
    var prefix = pagei || '';
    var index;
    var entry;

    if (!arr || arr.length === 0) {
      console.log('results is null');
      return;
    }
    for (index = 0; index < arr.length; index++) {
      entry = arr[index];
      this.downloadFile(entry.src, this.outpath, prefix + '_' + entry.id);
    }
  },

  /**
   * Download a single file into `outpath`, creating the directory if needed.
   * Downloads run concurrently; nothing waits for completion.
   *
   * @param {string} src Absolute URL of the file.
   * @param {string} outpath Target directory (trailing slash optional).
   * @param {string} _i Prefix that makes the file name unique.
   */
  downloadFile: function _downloadFile(src, outpath, _i) {
    // Drop query string and fragment so they do not end up in the file name.
    var cleanSrc = src.split(/[?#]/)[0];
    var filename = _i + '_' + cleanSrc.substring(cleanSrc.lastIndexOf('/') + 1);
    var target = path.join(outpath, filename);

    // mkdir reports EEXIST for an existing directory, which is fine here.
    fs.mkdir(outpath, parseInt('777', 8), function (mkdirErr) {
      if (mkdirErr && mkdirErr.code !== 'EEXIST') {
        console.log('####download Error:' + src + ' ' + mkdirErr.message);
        return;
      }
      clientFor(src).get(src, function (res) {
        // Open the file only once a response exists, so a failed request
        // does not leave an empty file behind.
        var writestream = fs.createWriteStream(target);
        writestream.on('finish', function () {
          console.log('download : ' + src);
        }).on('error', function () {
          console.log('####download Error:' + src);
        });
        res.pipe(writestream);
      }).on('error', function (e) {
        console.log('>>>>#######download error:' + e);
      });
    });
  }
};

module.exports = webSpider;
var fs = require('fs');
var cheerio = require('cheerio');
var webSpider = require('./webSpider');

// File that receives one line per discovered download link.
var OUTPUT_FILE = 'E:/webFile/多特apk.txt';

// Every record found so far, kept in memory in order of arrival.
var downloadZips = [];

/**
 * Crawl the game list pages and, for every linked detail page, start a
 * second crawl that extracts the download address.
 */
function getApk() {
  var ws = new webSpider({
    url: 'http://www.duote.com/android/game.html', // first list page
    pagin: {
      urlRule: 'http://www.duote.com/android/game_0_down_{page}.html',
      start: 2,
      end: 714
    },
    // Custom filter: follow every detail link on the list page.
    custom: function (html, getpathfun) {
      var $ = cheerio.load(html);
      $('.list_item .link').each(function (i) {
        var href = $(this).attr('href');
        if (!href) {
          return; // anchor without a target: nothing to follow
        }
        getPageLinks(getpathfun(href, 'http://www.duote.com'), i);
      });
    },
    download: false
  });
  ws.start();
}

/**
 * Crawl one detail page and record every `sUrl` download address in it.
 * Each record is appended to the output file as soon as it is found, so
 * no result depends on a timer and none is written twice.
 *
 * @param {string} url Absolute address of the detail page.
 * @param {number} i Index of the link on its list page (currently unused).
 */
function getPageLinks(url, i) {
  var ws = new webSpider({
    url: url,
    // Custom filter: pull the download address out of the inline script.
    custom: function (html, getpathfun) {
      var title = cheerio.load(html)('.tit_area h1').text();
      // Stop at the closing quote so two assignments sharing a line
      // are not merged into one capture.
      var pattern = /var sUrl = '([^']*)';/g;
      var match;
      var record;

      while ((match = pattern.exec(html)) !== null) {
        record = '\n' + title + '\t\tsrc:' + getpathfun(match[1], 'http://app.2345.cn');
        downloadZips.push(record);
        writeFile(OUTPUT_FILE, record);
      }
    }
  });
  ws.start();
}

/**
 * Append text to a file. `fs.appendFile` is used because `fs.writeFile`
 * would replace the existing content.
 *
 * @param {string} outpath File to append to.
 * @param {string} str Text to append.
 */
function writeFile(outpath, str) {
  fs.appendFile(outpath, str, function (err) {
    if (err) {
      console.log("fail " + err);
    } else {
      console.log("写入文件ok");
    }
  });
}

getApk();
版权声明:本文为博主原创文章,未经博主允许不得转载。
标签:javascript nodejs 爬虫 url filter
原文地址:http://blog.csdn.net/u013934914/article/details/47280281