码迷,mamicode.com
首页 > 其他好文 > 详细

物流轨迹抓取

时间:2017-06-23 22:05:29      阅读:364      评论:0      收藏:0      [点我收藏+]

标签:rri   exe   byte   osi   tde   continue   tac   timeout   arch   

package com.vanwell.module.util.express;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.UUID;

import org.elasticsearch.common.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.BeansException;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.vanwell.module.common.ErrorConstants;
import com.vanwell.module.common.Result;
import com.vanwell.module.common.util.CommonUtil;
import com.vanwell.module.util.exception.StackTraceUtil;
import com.vanwell.module.util.http.HttpUtils;
import com.vanwell.module.util.spring.ServiceFactory;
import com.vanwell.thirdparty.fedroad.api.QueryTrackApi;
import com.vanwell.thirdparty.fedroad.pojo.QueryTrackReq;
import com.vanwell.thirdparty.fedroad.pojo.QueryTrackRes;

/**
* Created by aixiaofeng on 17/2/6.
*/
public class FedroadSpider extends ExpressSpider {

private static final SimpleDateFormat FMT_COL_DATE = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
private static final SimpleDateFormat FMT_DATE = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

private static QueryTrackApi queryTrackApi;

@Override
public Result<String> doQuery(String express, String expressNo, String attr) {
String deliveryNo = null;
if (StringUtils.isNotBlank(attr) || !StringUtils.lowerCase(expressNo).startsWith("ec")) {
deliveryNo = queryDeliveryNoByApi(express, expressNo, attr);
}
return queryByPage(express, StringUtils.isNotBlank(deliveryNo) ? deliveryNo : expressNo, attr);
}

private String queryDeliveryNoByApi(String express, String expressNo, String attr) {
if (queryTrackApi == null) {
try {
queryTrackApi = ServiceFactory.getBean(QueryTrackApi.class);
} catch (BeansException e) {
queryTrackApi = new QueryTrackApi();
}
if (queryTrackApi == null) {
queryTrackApi = new QueryTrackApi();
}
}
QueryTrackReq reqTrack = new QueryTrackReq();
reqTrack.getParameters().setPackageNo(expressNo);
Result<QueryTrackRes> res = queryTrackApi.doRequest(reqTrack);
if (res.isSuccess() && res.getData() != null && res.getData().getTrackList() != null
&& CommonUtil.isNotEmpty(res.getData().getTrackList().getTrackList())) {
return res.getData().getTrackList().getTrackList().get(0).getDeliveryNo();
}
return null;
}

private Result<String> queryByPage(String express, String expressNo, String attr) {
Result<String> result = new Result<>();
String res = "";
String BOUNDARY = UUID.randomUUID().toString();
String urlStr = "https://www.fedroad.com";//访问页面
try {
StringBuilder strBuilder = new StringBuilder();
//请求链接,拿到document
HttpURLConnection conn = null;
Connection connection = HttpUtils.getConnection(urlStr);
Connection.Response response = connection.method(Connection.Method.GET).execute();
Document document = response.parse();
//定位到form表单
Elements formDocuments = document.select("#aspnetForm");

//获取conn连接
URL url = new URL(urlStr);
conn = (HttpURLConnection) url.openConnection();
conn.setConnectTimeout(5000);
conn.setReadTimeout(30000);
conn.setDoOutput(true);
conn.setDoInput(true);
conn.setUseCaches(false);
conn.setRequestMethod("POST");
conn.setRequestProperty("Connection", "Keep-Alive");
conn.setRequestProperty("User-Agent",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36");
conn.setRequestProperty("Content-Type", "multipart/form-data; boundary=" + BOUNDARY);

OutputStream out = new DataOutputStream(conn.getOutputStream());
//拼POST装请求参数
for (int i = 0; i < formDocuments.select("input").size(); i++) {
if (formDocuments.select("input").get(i).attr("class").contains("user_loginout")) {
continue;
}
strBuilder.append("--" + BOUNDARY + "\r\n");
strBuilder.append("Content-Disposition: form-data; name=\"" + formDocuments.select("input").get(i).attr("name") + "\"" + "\r\n\r\n");

if (formDocuments.select("input").get(i).attr("name").contains("search_shippingorder")) {
strBuilder.append(expressNo + "\r\n");
} else {
strBuilder.append(formDocuments.select("input").get(i).val() + "\r\n");
}
}
strBuilder.append("--" + BOUNDARY + "--");
out.write(strBuilder.toString().getBytes());
byte[] endData = ("\r\n--" + BOUNDARY + "--\r\n").getBytes();
out.write(endData);
out.flush();
out.close();

// 读取返回数据
strBuilder = new StringBuilder();
BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
String line;
while ((line = reader.readLine()) != null) {
strBuilder.append(line).append("\n");
}
res = strBuilder.toString();
//关闭
reader.close();
//获取返回的document(就是你需要的)
document = Jsoup.parse(res);
Elements trackinfo = document.select(".trackinfo tr");
JSONObject json = new JSONObject();
JSONArray arr = new JSONArray();
通过Jsoup 获取相应的字段 进行组装
for (Element trElement : trackinfo) {
if (trElement.select("td").attr("class").contains("title")) {
continue;
}
Elements tdElement = trElement.getElementsByTag("td");
JSONObject item = new JSONObject();
if (tdElement.get(0).text().trim().isEmpty()) {
continue;
} else {
item.put("time", FMT_DATE.format(FMT_COL_DATE.parse(tdElement.get(0).text().trim())));
}
item.put("context", tdElement.get(1).text().trim());
arr.add(item);
}
json.put("data", arr);
//成功返回
return result.setSuccess(true).setCode(ErrorConstants.SUCCESS).setData(json.toString());
} catch (Exception e) {
result.setCode(ErrorConstants.HTTP_ERR).setMessage(StackTraceUtil.getStackTrace(e));
LOGGER.error(" - doQuery error,express = " + express + "," + expressNo, e);
waitRandom();
}
//拿到抓取到的参数
return result;
}

// 测试
public static void main(String[] args) {
FedroadSpider spider = new FedroadSpider();
Result<String> ret = spider.doQuery(null, "EC000021436MY", null);
System.out.print(ret);
}
}

物流轨迹抓取

标签:rri   exe   byte   osi   tde   continue   tac   timeout   arch   

原文地址:http://www.cnblogs.com/dreammyone/p/7071659.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!