标签:
2016年一月,刚做完三个课程设计,C++网络版打地鼠,北山超市收银系统J2EE,JAVA聊天程序,累不堪言,置身奋斗之年承受这些是应该的,毕竟自己的技术还太菜了,没有一个开发者应有的底气。
-------------------------------------- 前记
在此之际,一同事介绍了一个项目,做一个教务信息记录抓取到自己的网页显示之。
--------------------------------------缘由
做这个东西首先也百度了下,网上的文章大多没什么营养,不过也基于网上的文章,整合出了实际可行的一条实现路径。
分析:要想抓取网页,抓网页的程序必须遵循HTTP协议的规则,换句话说,就是你的程序做的事情和浏览器差不多。
实现:首先抓取网页通过java的一些内置类,或者Apache的一些类来实现,例如httpclient。其次如果网页有验证措施,还需要使用相应的容器来存储验证对象(基于JAVA的开发),例如session,cookie。
到这里大概流程:模拟网络蜘蛛访问网页--->是否有验证(有/没有)----->(没有)------->直接抓取网页,在抓取网页时根据自己需要的数据做合适的抓取,没有特殊需求的话建议不要全盘抓取,这会影响抓取速度------>现在剩下的就是用土方法或者正则表达式处理抓到的数据。(有验证)-------->保存cookie,或者session,每次提交查询时构造头信息即可,下列代码直接复制运行可能会报错,自己修改之,现在直接上代码:
servlet执行类:
package src.servlet.cls;

import java.io.IOException;
import java.net.URLEncoder;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.tomcat.util.net.URL;

import src.docatch.cls.CatchDataByCookie;
import src.filter.cls.ChickLoad;
import src.filter.cls.ViewState;

/**
 * Logs in to the remote academic-affairs site on behalf of the user and copies
 * the cookies returned by that site onto the response sent to the browser, then
 * redirects to {@code tab.jsp}.
 *
 * <p>Expected request parameters: {@code tbYHM} (account) and {@code tbPSW}
 * (password). This servlet does not verify that the remote login actually
 * succeeded; it only forwards whatever cookies came back with an HTTP 200.
 */
@WebServlet("/loadingForCookie")
public class LoadingGetCookie extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** The remote form is submitted GBK-encoded (see the pre-encoded ddlSF value below). */
    private static final String REMOTE_CHARSET = "GBK";

    /**
     * @see HttpServlet#HttpServlet()
     */
    public LoadingGetCookie() {
        super();
    }

    /**
     * Handles GET exactly like POST.
     *
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doPost(request, response);
    }

    /**
     * Performs the remote login and forwards the resulting cookies to the browser.
     *
     * @param request  must carry the {@code tbYHM} and {@code tbPSW} parameters
     * @param response receives the forwarded cookies and the redirect, or a
     *                 400 error when a parameter is missing
     * @throws IOException if the remote request or the redirect fails
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        // Validate BEFORE dereferencing: getParameter returns null for an absent parameter.
        String rawAccount = request.getParameter("tbYHM");
        String rawPassword = request.getParameter("tbPSW");
        if (rawAccount == null || rawPassword == null) {
            System.out.println("参数丢失");
            response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Missing tbYHM or tbPSW parameter");
            return;
        }
        String account = rawAccount.trim();
        String password = rawPassword.trim();

        org.apache.catalina.util.URLEncoder ue = new org.apache.catalina.util.URLEncoder();
        String viewState = new ViewState().getViewState();

        // User-supplied values are percent-encoded so that characters such as '&', '#'
        // or spaces cannot break or extend the query string.
        // NOTE(review): the password travels in a GET query string over plain HTTP
        // because that is what the remote form expects here - confirm this is acceptable.
        String url = "http://jw.fjcc.edu.cn/Default3.aspx?__VIEWSTATE=" + ue.encode(viewState).toString()
                + "&tbYHM=" + URLEncoder.encode(account, REMOTE_CHARSET)
                + "&tbPSW=" + URLEncoder.encode(password, REMOTE_CHARSET)
                + "&ddlSF=%D1%A7%C9%FA&imgDL.x=17&imgDL.y=8";

        // The absolute URL given to GetMethod already identifies the target host,
        // so no explicit host configuration is set on the client.
        GetMethod getMethod = new GetMethod(url);
        HttpClient httpclient = new HttpClient();
        try {
            if (httpclient.executeMethod(getMethod) == 200) {
                Cookie[] remoteCookies = httpclient.getState().getCookies();
                for (int i = 0; i < remoteCookies.length; i++) {
                    String cookieName = remoteCookies[i].getName();
                    String cookieValue = remoteCookies[i].getValue();
                    System.out.println("cookiename==" + cookieName);
                    System.out.println("cookieValue==" + cookieValue);
                    // Re-issue the remote cookie to the browser so later requests can replay it.
                    response.addCookie(new javax.servlet.http.Cookie(cookieName, cookieValue));
                }
            }
        } finally {
            // Always hand the connection back, even when executeMethod throws.
            getMethod.releaseConnection();
        }

        // This class does not check whether the login was valid; the next stage handles that.
        // With the exact-match mapping above getPathInfo() is null, String.valueOf turns it
        // into "null", and stripping four characters leaves an empty prefix, i.e. the
        // redirect target is relative: "tab.jsp?user=...".
        String path = String.valueOf(request.getPathInfo());
        String prefix = path.substring(0, path.length() - 4);
        response.sendRedirect(prefix + "tab.jsp?user=" + URLEncoder.encode(account, "UTF-8"));

        /*
         * Alternative kept from the original: fetch the grades page directly instead of redirecting.
         *
         * response.setCharacterEncoding("GBK");
         * try {
         *     response.getWriter().println(
         *             new CatchDataByCookie("http://jw.fjcc.edu.cn/xscj.aspx?xh=" + Account, request).getDataByCookie());
         * } catch (Exception e) {
         *     e.printStackTrace();
         *     response.getWriter().println("请求超时!");
         * }
         */
    }
}
过滤规则接口:
package src.implement.cls; /* * 集合抓取方法集合 */ public interface catchFromWeb { //抓取方法 public void doCatch(); }
核心抓取器(cookie验证):
package src.docatch.cls;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import javax.servlet.http.Cookie;
import javax.servlet.http.HttpServletRequest;

import src.implement.cls.catchFromWeb;

/**
 * Fetches a page from a URL, replaying the cookies of an incoming servlet
 * request so the remote site sees the same session as the browser.
 *
 * <p>The response is decoded as GBK and accumulated line by line WITHOUT line
 * separators. An optional stop marker ends the read at the first line that
 * contains it (that line is not included).
 *
 * <p>The underlying connection is opened once in the constructor, so an
 * instance is meant for a single fetch.
 */
public class CatchDataByCookie implements catchFromWeb {
    // Example target: http://jw.fjcc.edu.cn/xskcxxcx.aspx?xh=131702237&type=xs
    private URL domainUrl;
    private URLConnection conn;

    /** When non-null, reading stops at the first line containing this text. */
    private String indexBreak;

    InputStreamReader isr;
    BufferedReader br;

    /** Page content collected by the last {@link #doCatch()} call. */
    private StringBuilder webContext;

    /** Incoming request whose cookies are replayed to the remote site. */
    HttpServletRequest req;

    /**
     * Opens (but does not yet connect) a connection to {@code url}.
     *
     * <p>A malformed URL or a failure to open the connection is only reported
     * via a stack trace; the instance then yields empty content.
     *
     * @param url     absolute URL of the page to fetch
     * @param request incoming request that supplies the cookies to replay
     */
    public CatchDataByCookie(String url, HttpServletRequest request) {
        try {
            domainUrl = new URL(url);
            conn = domainUrl.openConnection();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        this.req = request;
    }

    /**
     * Fetches the page.
     *
     * @return the collected content; empty if the fetch failed
     */
    public StringBuilder getDataByCookie() {
        doCatch();
        return webContext;
    }

    /**
     * Fetches the page up to (not including) the first line containing {@code flag}.
     *
     * @param flag stop marker; {@code null} reads the whole page
     * @return the collected content; empty if the fetch failed
     */
    public StringBuilder getDataByCookie(String flag) {
        indexBreak = flag;
        doCatch();
        return webContext;
    }

    /**
     * Sends the request with the replayed cookies and browser-like headers, then
     * reads the GBK-decoded response into the internal buffer. I/O failures are
     * reported via a stack trace and leave whatever was read so far in the buffer.
     */
    @Override
    public void doCatch() {
        webContext = new StringBuilder();
        isr = null;
        br = null;
        if (conn == null) {
            // The constructor could not open the connection; nothing to fetch.
            return;
        }

        // All cookies go into ONE header: setRequestProperty replaces any previous
        // value for the same key, so setting it per cookie would keep only the last one.
        String cookieHeader = buildCookieHeader();
        if (cookieHeader.length() > 0) {
            // e.g. ASP.NET_SessionId=nlselh45eorhjmv2kydb4k55
            conn.setRequestProperty("Cookie", cookieHeader);
        }
        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36");
        conn.setRequestProperty("Content-Type", "text/plain; charset=utf-8");
        conn.setRequestProperty("Connection", "Keep-Alive");

        try {
            isr = new InputStreamReader(conn.getInputStream(), "GBK");
            br = new BufferedReader(isr);
            String line;
            while ((line = br.readLine()) != null) {
                if (indexBreak != null && line.indexOf(indexBreak) != -1) {
                    // Stop marker found: end the read here.
                    break;
                }
                webContext.append(line);
            }
        } catch (IOException e) {
            // Also covers UnsupportedEncodingException, which is an IOException.
            e.printStackTrace();
        } finally {
            closeReaders();
        }
    }

    /** Joins the request's cookies as "name=value; name2=value2"; empty when there are none. */
    private String buildCookieHeader() {
        StringBuilder header = new StringBuilder();
        // getCookies() returns null when the request carries no cookies.
        Cookie[] cookies = (req == null) ? null : req.getCookies();
        if (cookies == null) {
            return header.toString();
        }
        for (int i = 0; i < cookies.length; i++) {
            if (cookies[i] == null) {
                // Failed to read a cookie; stop collecting.
                break;
            }
            if (header.length() > 0) {
                header.append("; ");
            }
            header.append(cookies[i].getName()).append('=').append(cookies[i].getValue());
        }
        return header.toString();
    }

    /** Closes whichever reader was opened; safe to call when none was. */
    private void closeReaders() {
        try {
            if (br != null) {
                // Closing the BufferedReader also closes the wrapped InputStreamReader.
                br.close();
            } else if (isr != null) {
                isr.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
实现了接口的过滤规则:
package src.filter.cls; import src.docatch.cls.CatchAchievements; public class ViewState { public String getViewState(){ CatchAchievements ca = new CatchAchievements("http://jw.fjcc.edu.cn/"); String result = String.valueOf(ca.getResult("justify"));//输出结果 //********************* result = result.substring(result.indexOf("__VIEWSTATE"), result.length()).trim(); result = result.substring(result.indexOf("value=\"")+7, result.indexOf("/>")-2); return result; } }
标签:
原文地址:http://www.cnblogs.com/homes/p/5169996.html