码迷,mamicode.com
首页 > Web开发 > 详细

记录一次网页抓取程序的实现

时间:2016-01-30 01:48:21      阅读:293      评论:0      收藏:0      [点我收藏+]

标签:

2016年一月,刚做完三个课程设计,C++网络版打地鼠,北山超市收银系统J2EE,JAVA聊天程序,累不堪言,置身奋斗之年承受这些是应该的,毕竟自己的技术还太菜了,没有一个开发者应有的底气。

-------------------------------------- 前记

在此之际,一同事介绍了一个项目,做一个教务信息记录抓取到自己的网页显示之。

--------------------------------------缘由

做这个东西首先也百度了下,网上的文章大多没什么营养,不过也基于网上的文章,整合出了实际可行的一条实现路径。

分析:要想抓取网页,抓取程序的运行规则必须遵循HTTP协议的法则,换句话说,就是你的程序做的事情和浏览器差不多。

实现:首先,抓取网页可以通过Java的一些内置类(例如URLConnection)或者Apache的一些类(例如HttpClient)来实现。其次,如果网页有验证措施,还需要使用相应的容器来存储验证对象(基于Java的开发),例如session、cookie。

到这里大概流程:模拟网络蜘蛛访问网页 ---> 判断是否有验证。(没有验证)---> 直接抓取网页,抓取时只取自己需要的数据,没有特殊需求的话建议不要全盘抓取,这会影响抓取速度 ---> 剩下的就是用土方法(字符串截取)或者正则表达式处理抓到的数据。(有验证)---> 先保存cookie或者session,之后每次提交查询时用它构造请求头信息即可。下列代码直接复制运行可能会报错,请自行修改,现在直接上代码:

servlet执行类:

package src.servlet.cls;

import java.io.IOException;
import java.net.URLEncoder;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.tomcat.util.net.URL;

import src.docatch.cls.CatchDataByCookie;
import src.filter.cls.ChickLoad;
import src.filter.cls.ViewState;

/**
 * Servlet implementation class redirecter
 */
@WebServlet("/loadingForCookie")
public class LoadingGetCookie extends HttpServlet {
	private static final long serialVersionUID = 1L;
       
    /**
     * @see HttpServlet#HttpServlet()
     */
    public LoadingGetCookie() {
        super();
        // TODO Auto-generated constructor stub
    }

	/**
	 * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
	 */
	protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
		// TODO Auto-generated method stub
		response.getWriter().append("Served at: ").append(request.getContextPath());
		doPost(request, response);
	}

	/**
	 * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
	 */
	protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {

		   org.apache.catalina.util.URLEncoder ue = new org.apache.catalina.util.URLEncoder();
	       String ViewState = new ViewState().getViewState();
	       String Account = request.getParameter("tbYHM").trim();
	       String Password = request.getParameter("tbPSW").trim();
	       
	       
	       String url = "http://jw.fjcc.edu.cn/Default3.aspx?__VIEWSTATE="+ue.encode(ViewState).toString()+"&tbYHM="+Account+"&tbPSW="+Password+"&ddlSF=%D1%A7%C9%FA&imgDL.x=17&imgDL.y=8";
	       if(Account == null || Password == null){
	    	   //url = "";
	    	   System.out.println("参数丢失");
	    	   return;
	       }
	       ///***************************登录成功
	       //javax.servlet.http.Cookie mycookie = new javax.servlet.http.Cookie();
	       Cookie[] cookie = null;
	       //----------------------------------
	       GetMethod getMethod = new GetMethod(url);//Get方法
	       HttpClient httpclient = new HttpClient();
	       httpclient.getHostConfiguration().setHost(url, 80);
	       if(httpclient.executeMethod(getMethod) == 200){
	    	   cookie = httpclient.getState().getCookies();//获取曲奇饼干
	    	   for (int i = 0; i < cookie.length; i++) {
	    		   System.out.println("cookiename=="+cookie[i].getName());
	    		   System.out.println("cookieValue=="+cookie[i].getValue());
	    		   //System.out.println("Domain=="+cookie[i].getDomain());
	    		   //System.out.println("Path=="+cookie[i].getPath());
	    		   //System.out.println("Version=="+cookie[i].getVersion());
	    		   response.addCookie(new javax.servlet.http.Cookie(cookie[i].getName(),cookie[i].getValue()));//重新构造参数
	    		   
	    	   }
	    	   
	       }
	       
	       String path;
	       ////由于request的不可刷新性,本类不负责检测账户是否是有效登录 
	       //转入下一级处理
	       response.sendRedirect((path = String.valueOf(request.getPathInfo())).substring(0, path.length()-4)+"tab.jsp?user="+Account);//转向

	       /*
	       response.setCharacterEncoding("GBK");
	       try{
	    	   //后续访问使用
	       response.getWriter().println(new CatchDataByCookie("http://jw.fjcc.edu.cn/xscj.aspx?xh="+Account, request).getDataByCookie());
	       }catch(Exception e){
	    	   e.printStackTrace();
	    	   response.getWriter().println("请求超时!");
	       }*/
	}

}

  过滤规则接口:

package src.implement.cls;

/*
 * 集合抓取方法集合
 */
public interface catchFromWeb {
    //抓取方法
    public void doCatch();

}

 

核心抓取器(cookie验证):

package src.docatch.cls;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import javax.servlet.http.HttpServletRequest;

import src.implement.cls.catchFromWeb;

public class CatchDataByCookie implements catchFromWeb{

    //http://jw.fjcc.edu.cn/xskcxxcx.aspx?xh=131702237&type=xs
    private URL domainUrl;
    private URLConnection conn;
    //选择加载
    private String indexBreak;//结束的条件
    //****************************
    InputStreamReader isr;//
    BufferedReader br;//
    //----------------------------
    private StringBuilder webContext;//
    //
    HttpServletRequest req;//
    //----------------------------
    public CatchDataByCookie(String url ,HttpServletRequest request){
        //////////////////////////
        try {
            domainUrl = new URL(url);
            conn = domainUrl.openConnection();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        //-------------------------
        this.req = request;
        
    }
    //-----------------------------
    public StringBuilder getDataByCookie(){
        doCatch();
        return webContext;
    }
    //-----------------------------
    public StringBuilder getDataByCookie(String flag){
        indexBreak = flag;
        doCatch();
        return webContext;
    }
    @Override
    public void doCatch() {
        for(int i = 0;i < req.getCookies().length;i++){
            if(req.getCookies()[i] == null){
                //System.out.println("锟斤拷取cookie失锟斤拷");
                break;
            }
            //ASP.NET_SessionId=nlselh45eorhjmv2kydb4k55;
            conn.setRequestProperty("Cookie", req.getCookies()[i].getName()+"="+req.getCookies()[i].getValue());//锟斤拷取cookie
            //System.out.println("注入cookie:"+req.getCookies()[i].getValue());
        }
        //----------------------------------------------
        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36");
        conn.setRequestProperty("Content-Type", "text/plain; charset=utf-8");
        conn.setRequestProperty("Connection", "Keep-Alive");
        //-----------------------------------------------
        String tmpStr = null;//
        webContext = new StringBuilder();
        try {
            isr = new InputStreamReader(conn.getInputStream(),"GBK");
            br = new BufferedReader(isr);
            while((tmpStr = br.readLine()) != null){
                      if(indexBreak != null && tmpStr.indexOf(indexBreak) != -1){//没有找到目标
                          break;
                      }
                      webContext.append(tmpStr);
                  }
                  System.out.println(tmpStr);

        } catch (UnsupportedEncodingException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }finally{
            try {
                br.close();
                isr.close();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }    
        }

        
    }
    
    
}

实现了接口的过滤规则:

package src.filter.cls;

import src.docatch.cls.CatchAchievements;

/**
 * Extracts the {@code __VIEWSTATE} hidden-field value from the remote site's start page.
 */
public class ViewState {

    private static final String FIELD_NAME = "__VIEWSTATE";
    private static final String VALUE_PREFIX = "value=\"";

    /**
     * Downloads the start page and returns the value of its {@code __VIEWSTATE} field.
     *
     * @return the raw (not URL-encoded) view-state value
     * @throws IllegalStateException if the page does not contain a parsable
     *                               {@code __VIEWSTATE} field
     */
    public String getViewState(){
        CatchAchievements ca = new CatchAchievements("http://jw.fjcc.edu.cn/");
        // NOTE(review): "justify" is presumably a stop marker for the fetch — confirm
        // against CatchAchievements.getResult.
        String page = String.valueOf(ca.getResult("justify"));
        return extractViewState(page);
    }

    /**
     * Returns the text between the quotes of the first {@code value="..."} attribute
     * that follows the {@code __VIEWSTATE} marker.
     * Locating the closing quote directly avoids depending on the exact spacing
     * before the tag's closing {@code />}.
     */
    private static String extractViewState(String page) {
        int fieldAt = page.indexOf(FIELD_NAME);
        if (fieldAt < 0) {
            throw new IllegalStateException("__VIEWSTATE field not found in page");
        }
        int prefixAt = page.indexOf(VALUE_PREFIX, fieldAt);
        if (prefixAt < 0) {
            throw new IllegalStateException("__VIEWSTATE field has no value attribute");
        }
        int valueStart = prefixAt + VALUE_PREFIX.length();
        int valueEnd = page.indexOf('"', valueStart);
        if (valueEnd < 0) {
            throw new IllegalStateException("__VIEWSTATE value attribute is not terminated");
        }
        return page.substring(valueStart, valueEnd);
    }
}

 

记录一次网页抓取程序的实现

标签:

原文地址:http://www.cnblogs.com/homes/p/5169996.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!