标签:html解析器 lag message exist nav esc create pat utf-8
刚接触 Node.js,做个东西练练手:通过 Node.js 直接转发整站。原本想把内容全部翻译成英文,但 Google 会对流量进行审查,IP 被封了,所以就没什么用了。效果像这样:
/*
 * Mirroring proxy: every incoming request is forwarded to an upstream host
 * chosen from the Host header, the response body is optionally rewritten and
 * machine-translated (zh-CN -> en), cached on disk, and sent back.
 *
 * Known limitation (unchanged from the original): upstream bodies are decoded
 * as UTF-8 text and upstream status/headers are not forwarded, so binary
 * resources will not survive the round trip.
 */

// HTML parser (third-party)
var cheerio = require('cheerio');
// File-system and path helpers
var fs = require('fs');
var path = require('path');
// Project-local hashing helpers; only `md5` is used here
var cryptos = require("./cryptos");
// HTTP client and server
var http = require("http");

// Address and port the proxy listens on
var hostName = '127.0.0.1';
var port = 9000;

// Master switch for the on-disk response cache
var useCatch = true;

// Maps the Host header seen by this proxy to the upstream host to fetch from
var hostList = {"www.thiscnnews.com":"www.guancha.cn","localhost1:8080":"user.guancha.cn"};

// Request URLs that always bypass the cache
var noCatchList = ['/?s=dhshouye','/internation?s=dhguoji','/military-affairs?s=dhjunshi','/economy?s=dhcaijing','/industry-science?s=dhkeji','/TMT?s=dhtmt','/car?s=dhqiche','/thinktank?s=dhzhiku','/GuanWangKanPian?s=dhshipin'];

// Permission bits for cache directories. The original passed decimal 777
// (== 0o1411), which leaves the owner without write permission on POSIX.
var CACHE_DIR_MODE = parseInt('777', 8);

/**
 * Applies a sequence of mixing steps to a 32-bit value.
 *
 * `b` (the second parameter) is read three characters at a time:
 *   - char 0: "+" adds the shifted value (masked to 32 bits), anything else XORs it
 *   - char 1: "+" shifts right (unsigned), anything else shifts left
 *   - char 2: shift amount; "a" and above are taken as charCode - 87 (so "a" is 10)
 *
 * @param {number} a - value to mix
 * @param {string} b - operation string, e.g. "+-a^+6"
 * @returns {number} the mixed value
 */
var b = function (a, b) {
    for (var d = 0; d < b.length - 2; d += 3) {
        var amountChar = b.charAt(d + 2);
        var amount = "a" <= amountChar ? amountChar.charCodeAt(0) - 87 : Number(amountChar);
        var shifted = "+" === b.charAt(d + 1) ? a >>> amount : a << amount;
        a = "+" === b.charAt(d) ? (a + shifted) & 4294967295 : a ^ shifted;
    }
    return a;
};

/**
 * Computes the token that getTrans sends as the `tk` query parameter.
 *
 * The text is converted to UTF-8 bytes (surrogate pairs are combined into a
 * single four-byte sequence), each byte is folded into an accumulator seeded
 * with the first half of `TKK`, and the result is XORed with the second half.
 *
 * @param {string} a - text that will be translated
 * @param {string} TKK - seed in the form "<int>.<int>"
 * @returns {string} token in the form "<n>.<n xor seed>"
 */
var gettk = function (a, TKK) {
    var seedParts = TKK.split(".");
    var seed = Number(seedParts[0]) || 0;
    var bytes = [];

    for (var f = 0; f < a.length; f++) {
        var c = a.charCodeAt(f);
        if (c < 128) {
            bytes.push(c);
        } else if (c < 2048) {
            bytes.push((c >> 6) | 192);
            bytes.push((c & 63) | 128);
        } else {
            var isSurrogatePair = 55296 === (c & 64512) &&
                f + 1 < a.length &&
                56320 === (a.charCodeAt(f + 1) & 64512);
            if (isSurrogatePair) {
                // Combine high and low surrogate into one code point.
                c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023);
                bytes.push((c >> 18) | 240);
                bytes.push(((c >> 12) & 63) | 128);
            } else {
                bytes.push((c >> 12) | 224);
            }
            bytes.push(((c >> 6) & 63) | 128);
            bytes.push((c & 63) | 128);
        }
    }

    var acc = seed;
    for (var d = 0; d < bytes.length; d++) {
        acc += bytes[d];
        acc = b(acc, "+-a^+6");
    }
    acc = b(acc, "+-3^+b+-f");
    acc ^= Number(seedParts[1]) || 0;
    if (acc < 0) {
        // Reinterpret the signed 32-bit result as unsigned.
        acc = (acc & 2147483647) + 2147483648;
    }
    acc %= 1E6;
    return acc.toString() + "." + (acc ^ seed);
};

/**
 * Translates `text` in chunks of at most 4000 UTF-16 code units, one request
 * after another, appending each result to `allrs`.
 *
 * @param {string} text - remaining text to translate
 * @param {string} allrs - translation accumulated so far
 * @param {function(string)} callback - receives the full translation
 */
function getTransRecursive(text, allrs, callback) {
    console.log(text.length);
    if (text.length === 0) {
        // Nothing to translate: skip the network round trip, stay asynchronous.
        process.nextTick(function () {
            callback(allrs);
        });
        return;
    }
    var surplus = text.substring(4000);
    var chunk = text.substring(0, 4000);
    getTrans(chunk, function (rs) {
        var combined = allrs + rs;
        if (surplus.length > 0) {
            getTransRecursive(surplus, combined, callback);
        } else {
            callback(combined);
        }
    });
}

/**
 * Sends one chunk of text to the translation endpoint.
 *
 * On any failure (network error, unparsable response) the original text is
 * handed to the callback so the page can still be rendered. The callback is
 * invoked exactly once.
 *
 * @param {string} text - text to translate
 * @param {function(string)} callback - receives the translated text
 */
function getTrans(text, callback) {
    var tk = gettk(text, "424997.418814026");
    var url = '/translate_a/single?client=t&sl=zh-CN&tl=en&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&pc=1&otf=1&ssel=6&tsel=3&kc=0&tk=' + tk;
    // encodeURIComponent (not encodeURI) so "&", "=" and "+" inside the text
    // cannot break out of the `q` form field.
    var payload = "q=" + encodeURIComponent(text);
    var options = {
        hostname: 'translate.google.cn',
        port: 80,
        path: url,
        method: 'POST',
        headers: {
            // Required header; capture real browser traffic to see what is sent.
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
            "content-type": "application/x-www-form-urlencoded;charset=UTF-8",
            "content-length": Buffer.byteLength(payload)
        }
    };

    var finished = false;
    function finish(result) {
        // Guards against both 'end' and 'error' firing for the same request.
        if (finished) {
            return;
        }
        finished = true;
        callback(result);
    }

    var tbody = "";
    var req = http.request(options, function (remoteRes) {
        remoteRes.setEncoding('utf8');
        remoteRes.on('data', function (chunk) {
            tbody += chunk;
        });
        remoteRes.on("end", function () {
            var rstext;
            try {
                // FIXME(security): eval() executes whatever the remote server
                // (or anyone able to tamper with this plain-HTTP response)
                // returns. Kept because the endpoint's reply may not be strict
                // JSON; replace with a real parser once the format is confirmed.
                var parsed = eval(tbody);
                var segments = Array.isArray(parsed[0]) ? parsed[0] : [];
                rstext = "";
                for (var i = 0; i < segments.length; i++) {
                    // Skip entries without translated text so the string
                    // "null" is never appended.
                    if (segments[i] && typeof segments[i][0] === "string") {
                        rstext += segments[i][0];
                    }
                }
            } catch (err) {
                console.log("transErr1:");
                console.log(err);
                rstext = text;
            }
            // Called outside the try so an exception thrown by the callback
            // is not mistaken for a parse failure.
            finish(rstext);
        });
    });
    req.on('error', function (e) {
        console.log('transErr2:' + e.message);
        finish(text);
    });
    req.write(payload);
    req.end();
}

/**
 * Replaces hexadecimal character references (&#xHHHH;) with the characters
 * they denote and leaves everything else, including other semicolons, intact.
 *
 * @param {string} text
 * @returns {string}
 */
function decodeHexEntities(text) {
    return text.replace(/&#x([0-9a-fA-F]{1,6});/g, function (match, hex) {
        var codePoint = parseInt(hex, 16);
        return codePoint <= 0x10FFFF ? String.fromCodePoint(codePoint) : match;
    });
}

/**
 * Post-processes an upstream response body before it is sent to the client.
 *
 * Non-HTML bodies and the home page ("" or "/") are passed through untouched.
 * For other HTML pages, script tags are stripped, upstream host names are
 * rewritten to their mirror names, and, for www.guancha.cn, the article
 * body, headline, page title and navigation are translated.
 *
 * @param {string} newhost - upstream host the body came from
 * @param {string} url - request path
 * @param {string} str - response body
 * @param {boolean} isHtml - whether the upstream content type is text/html
 * @param {function(string)} callback - receives the processed body
 */
function handleStr(newhost, url, str, isHtml, callback) {
    if (!isHtml || url == "" || url == "/") {
        callback(str);
        return;
    }

    // Note: the pattern requires a space after "<script", so only script tags
    // that carry attributes are removed.
    var html = str.replace(/<script (([\s\S])*?)<\/script>/g, "");

    // Replace every occurrence of each upstream host with its mirror host.
    for (var key in hostList) {
        if (Object.prototype.hasOwnProperty.call(hostList, key)) {
            html = html.split(hostList[key]).join(key);
        }
    }

    // Translation is site-specific because it depends on the page's markup.
    switch (newhost) {
        case "www.guancha.cn":
            var $ = cheerio.load(html);
            var headTitle = decodeHexEntities($("head>title").text());
            // .html() yields null when the page has no .nav element.
            var nav = decodeHexEntities($(".nav").html() || "");
            var article = $('.all-txt').text();
            var title = $('.left-main').find('h3').text();

            getTransRecursive(article, "", function (articleEn) {
                $('.all-txt').text(articleEn);
                getTransRecursive(title, "", function (titleEn) {
                    $('.left-main').find('h3').text(titleEn);
                    getTransRecursive(headTitle, "", function (headTitleEn) {
                        // The original wrote the article headline here.
                        $("head>title").text(headTitleEn);
                        getTransRecursive(nav, "", function (navEn) {
                            $(".nav").html(navEn);
                            callback($.html());
                        });
                    });
                });
            });
            break;
        default:
            callback(html);
            break;
    }
}

/**
 * Reports whether `value` is an element of `arr`.
 *
 * @param {Array} arr
 * @param {*} value
 * @returns {boolean} false when `arr` has no usable indexOf
 */
function isInArray3(arr, value) {
    if (arr.indexOf && typeof (arr.indexOf) == 'function') {
        return arr.indexOf(value) >= 0;
    }
    return false;
}

/**
 * Resolves the upstream host for an incoming Host header.
 *
 * @param {string|undefined} host - value of the request's Host header
 * @returns {string} the mapped upstream host, or "www.guancha.cn" by default
 */
function getNewHost(host) {
    // Own-property check: the Host header is client-controlled, and a plain
    // `in` test would also match inherited names such as "constructor".
    if (Object.prototype.hasOwnProperty.call(hostList, host)) {
        return hostList[host];
    }
    return "www.guancha.cn";
}

/**
 * Fetches `url` from the upstream host, post-processes the body and sends it
 * to the client. When `file` is non-empty and the upstream answered 200, the
 * processed body is also stored at that path.
 *
 * @param {string} newhost - upstream host
 * @param {string} url - request path
 * @param {http.ServerResponse} res - response to the proxy's client
 * @param {string} file - cache file path, or '' to skip caching
 */
function getRemote(newhost, url, res, file) {
    var body = "";
    var options = {
        hostname: newhost,
        port: 80,
        path: url,
        method: 'GET'
    };
    var req = http.request(options, function (remoteRes) {
        remoteRes.setEncoding('utf8');
        remoteRes.on('data', function (chunk) {
            body += chunk;
        });
        remoteRes.on("end", function () {
            // The header may be absent; treat that as "not HTML".
            var contentType = remoteRes.headers['content-type'] || '';
            var isHtml = contentType.indexOf("text/html") != -1;
            handleStr(newhost, url, body, isHtml, function (rs) {
                // Only successful responses are cached, and the file is
                // overwritten rather than appended to, so concurrent misses
                // cannot duplicate the content.
                if (file != '' && remoteRes.statusCode === 200) {
                    fs.writeFile(file, rs, {flag: 'w'}, function (err) {
                        if (err) {
                            console.error(err);
                        }
                    });
                }
                res.end(rs);
            });
        });
    });
    req.on('error', function (e) {
        res.end(e.message);
        console.log('problem with request: ' + e.message);
    });
    req.end();
}

/**
 * Creates `dirpath` and any missing parent directories.
 *
 * @param {string} dirpath - directory to create
 * @param {number} mode - permission bits for newly created directories
 * @param {function} callback - called with `dirpath` when it already exists,
 *     otherwise with the error argument of the final fs.mkdir (null on success)
 */
var mkdirs = module.exports.mkdirs = function (dirpath, mode, callback) {
    fs.stat(dirpath, function (statErr) {
        if (!statErr) {
            callback(dirpath);
            return;
        }
        var parent = path.dirname(dirpath);
        if (parent === dirpath) {
            // Reached a root that does not exist; stop instead of recursing forever.
            callback(statErr);
            return;
        }
        // Create the parent chain first, then this directory.
        mkdirs(parent, mode, function () {
            fs.mkdir(dirpath, mode, callback);
        });
    });
};

/**
 * Looks up a cached response for `url` under <script dir>/tmp/<host>/<md5(url)>.
 *
 * @param {string} newhost - upstream host (first ":" removed for the directory name)
 * @param {string} url - request path used as the cache key
 * @param {function(boolean, string, string=)} callback - (hit, filePath, data);
 *     `data` is only passed on a hit
 */
function getCatch(newhost, url, callback) {
    var dir = __dirname + '/tmp/' + newhost.replace(':', '');
    mkdirs(dir, CACHE_DIR_MODE, function () {
        var file = dir + '/' + cryptos.md5(url);
        // A failed read (including "file does not exist") is a cache miss.
        fs.readFile(file, 'utf-8', function (err, data) {
            if (err) {
                callback(false, file);
            } else {
                callback(true, file, data);
            }
        });
    });
}

// Create the proxy server: serve from cache when allowed, otherwise fetch upstream.
var server = http.createServer(function (req, res) {
    var newhost = getNewHost(req.headers.host);
    var url = req.url;
    if (useCatch && !isInArray3(noCatchList, url)) {
        getCatch(newhost, url, function (hasData, file, data) {
            if (hasData) {
                res.end(data);
            } else {
                getRemote(newhost, url, res, file);
            }
        });
    } else {
        getRemote(newhost, url, res, '');
    }
});

server.listen(port, hostName, function () {
    console.log('run');
});
标签:html解析器 lag message exist nav esc create pat utf-8
原文地址:https://www.cnblogs.com/meieiem/p/9244751.html