标签:python phantomjs casperjs 安装 基础介绍
最近在使用Python爬取网页内容时,总是遇到JS临时加载、动态获取网页信息的困难,例如爬取CSDN下载资源评论、搜狐图片中的“原图”等。因此尝试学习PhantomJS和CasperJS来解决这个问题。这第一篇文章当然就是安装过程及入门介绍。 PhantomJS下载地址:http://phantomjs.org/
官网介绍:
PhantomJS is a headless WebKit scriptable with a JavaScript API. It has fast and native support for various
web standards: DOM handling, CSS selector, JSON, Canvas, and SVG.
Full web stack. No browser required.
PhantomJS是一个无界面(headless)的WebKit(开源的浏览器引擎),可以通过JavaScript API编写脚本对其进行控制。
它快速且原生地支持各种Web标准:DOM 处理、CSS 选择器、JSON、Canvas 和 SVG。PhantomJS可以用于页面自动化、网络监测、网页截屏以及无界面测试等。
下载PhantomJS解压后如下图所示:
console.log('Hello world!'); phantom.exit();通过Win+R打开CMD,调用phantomjs.exe执行该程序,输出如下图所示:
var system = require('system'); if (system.args.length === 1) { console.log('Try to pass some args when invoking this script!'); } else { system.args.forEach(function (arg, i) { console.log(i + ': ' + arg); }); } phantom.exit();运行程序及输出结果如下图所示:
var page = require('webpage').create(); page.open('http://www.baidu.com', function () { page.render('example.png'); phantom.exit(); });运行程序结果如下图所示:
var page = require('webpage').create(), system = require('system'), t, address; if (system.args.length === 1) { console.log('Usage: loadspeed.js <some URL>'); phantom.exit(1); } else { t = Date.now(); address = system.args[1]; page.open(address, function (status) { if (status !== 'success') { console.log('FAIL to load the address'); } else { t = Date.now() - t; console.log('Page title is ' + page.evaluate(function () { return document.title; })); console.log('Loading time ' + t + ' msec'); } phantom.exit(); }); }运行程序如下图所示:
var page = require('webpage').create(); page.open('http://www.csdn.net', function(status) { var title = page.evaluate(function() { return document.title; }); phantom.outputEncoding="gbk"; console.log('Page title is ' + title); phantom.exit(); });输出如下图所示:
var page = require('webpage').create(); phantom.outputEncoding="gbk"; page.onConsoleMessage = function(msg) { console.log('Page title is ' + msg); }; page.open('http://www.csdn.net', function(status) { page.evaluate(function() { console.log(document.title); }); phantom.exit(); });调用phantomjs gettile2.js即可。
var page = require('webpage').create(); console.log('The default user agent is ' + page.settings.userAgent); page.settings.userAgent = 'SpecialAgent'; page.open('http://www.httpuseragent.org', function (status) { if (status !== 'success') { console.log('Unable to access network'); } else { var ua = page.evaluate(function () { return document.getElementById('myagent').innerText; }); console.log(ua); } phantom.exit(); });输入如下指令,获取id=myagent元素的值:
var page = require('webpage').create(); page.open('http://www.sample.com', function() { page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() { page.evaluate(function() { $("button").click(); }); phantom.exit() }); });The above snippet will open up a web page, include the jQuery library into the page, and then click on all buttons using jQuery. It will then exit from the web page. Make sure to put the exit statement within the page.includeJs or else it may exit prematurely before the javascript code is included.
var page = require('webpage').create(), system = require('system'), address; if (system.args.length === 1) { console.log('Usage: netlog.js <some URL>'); phantom.exit(1); } else { address = system.args[1]; page.onResourceRequested = function (req) { console.log('requested: ' + JSON.stringify(req, undefined, 4)); }; page.onResourceReceived = function (res) { console.log('received: ' + JSON.stringify(res, undefined, 4)); }; page.open(address, function (status) { if (status !== 'success') { console.log('FAIL to load the address'); } phantom.exit(); }); }输入指令:
received: { "contentType": "text/javascript; charset=gbk", "headers": [ { "name": "Server", "value": "bfe/1.0.8.5" }, { "name": "Date", "value": "Tue, 18 Aug 2015 20:10:03 GMT" }, { "name": "Content-Type", "value": "text/javascript; charset=gbk" }, { "name": "Content-Length", "value": "88" }, { "name": "Connection", "value": "keep-alive" }, { "name": "Cache-Control", "value": "private" } ], "id": 13, "redirectURL": null, "stage": "end", "status": 200, "statusText": "OK", "time": "2015-08-18T20:09:38.085Z", "url": "https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=&json=1&p=3&sid=16486_16222_1421_16896_16738_12825_12868_16800_16659_16424_16514_15936_12073_13932_16866&csor=0&cb=jQuery110208203572703059763_1439928574608&_=1439928574609" }关于如何把该特性用于HAR输出以及基于YSlow的性能分析,更多信息请参阅网络监控页面:network monitoring
下载地址:http://casperjs.org/
官方文档:http://docs.casperjs.org/en/latest/
PS:CasperJS的安装及使用准备在下一篇文章中介绍。
版权声明:本文为博主原创文章,未经博主允许不得转载。
[Python学习] 在Windows下安装PhantomJS和CasperJS及入门介绍(上)
标签:python phantomjs casperjs 安装 基础介绍
原文地址:http://blog.csdn.net/eastmount/article/details/47023199