标签:爬虫
HttpClient 和 jsoup 都可以模拟浏览器抓取页面:前者擅长发送请求,后者在解析 HTML 标签方面更为强大。本例直接使用 jsoup 完成请求和解析。
package com.chongdong.log.test;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Jsoup;
import org.jsoup.helper.HttpConnection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;
/**
*
* 类名称:JsoupTest
* 类描述: jsoup 抓取 mitsuku聊天 信息
* 创建人:zk
* 创建时间:2015-7-20 下午3:52:06
* 修改人:zk
* 修改时间:2015-7-20 下午3:52:06
* 修改备注:
* 开发进度:
* @version 1.0
*
*/
public class JsoupTest {
public static void postMitSuKu(){
Map<String, String> map = new HashMap<String, String>();
/** 表单 提交的参数
* input:how old are you
botid:9fa364f2fe345a10
custid:c04f62ad1e044059
faq : http://www.pandorabots.com/botmaster/en/faq#h1
* **/
map.put("input", "hi");
map.put("botid", "9fa364f2fe345a10"); //开启 可在 跳转到聊天框界面 获取 标示
map.put("custid", "c04f62ad1e044059"); //当一个空请求 即可获取 相当于 标示
Connection conn = Jsoup.connect("http://fiddle.pandorabots.com/pandora/talk-xml");
/* conn.header("(Request-Line)", "POST /cgi-bin/login?lang=zh_CN HTTP/1.1");*/
conn.header("Accept", "*/*");
conn.header("Accept-Encoding", "gzip,deflate,sdch");
conn.header("Accept-Language", "zh-CN,zh;q=0.8");
/* conn.header("Content-Length", "58");*/
conn.header("Origin", "http://www.square-bear.co.uk");
conn.header("Pragma", "no-cache");
conn.header("Connection", "Keep-Alive");
//必须 填写 表单提交
conn.header("Content-Type", "application/x-www-form-urlencoded");
conn.header("Host", "fiddle.pandorabots.com");
conn.header("Referer", "http://www.square-bear.co.uk/mitsuku/mitsy_retro.swf");
conn.header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
try {
Response response = (Response) conn.ignoreContentType(true).method(Method.POST).data(map).execute();
//response.
String json=response.body();
System.out.println(json);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/////////////////////////////////////////////////////////////////////////////////////////
//方案2: 通过 开发抓包工具可知 表单的提交方式 应该为http post 此处为 get 方法 不合适
/**
* 请求英文对话的网页,抓取结果
* @param url
* @return
*/
private static String processLogic(String url){
String result = "";
try {
Document document = Jsoup.connect(url).ignoreContentType(true).ignoreHttpErrors(true)
.followRedirects(true).timeout(5000).userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0;)").get();
Elements elements = document.select("result that");
result = elements.text();
System.out.println(result);
} catch (Exception e) {
e.printStackTrace();
result = "ok";
}
return result;
}
public static void main(String[] args) {
for (int i = 0; i < 100; i++) {
// Thread thread=new Thread();
//thread.start();
postMitSuKu();
}
/*
通过 开发抓包工具可知 表单的提交方式 应该为http post 此处为 get 方法 不合适
String url = "http://fiddle.pandorabots.com/pandora/talk-xml?input=%s&botid=9fa364f2fe345a10&custid=bbbb30debe1bc7f7";
processLogic(url);
*/
}
}
版权声明:本文为博主原创文章,未经博主允许不得转载。
标签:爬虫
原文地址:http://blog.csdn.net/u011761678/article/details/47247301