码迷,mamicode.com
首页 > 编程语言 > 详细

Java HttpClient(4.2) 爬虫代码

时间:2015-11-26 12:34:39      阅读:190      评论:0      收藏:0      [点我收藏+]

标签:

package spider;

import java.io.BufferedReader;

import java.io.ByteArrayOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.util.ArrayList;

import java.util.Collection;

import java.util.Date;

import java.util.zip.GZIPInputStream;



import javax.net.ssl.SSLHandshakeException;



import org.apache.commons.lang.StringUtils;

import org.apache.http.Header;

import org.apache.http.HttpEntity;

import org.apache.http.HttpEntityEnclosingRequest;

import org.apache.http.HttpHost;

import org.apache.http.HttpRequest;

import org.apache.http.HttpResponse;

import org.apache.http.HttpVersion;

import org.apache.http.NoHttpResponseException;

import org.apache.http.ParseException;

import org.apache.http.StatusLine;

import org.apache.http.client.ClientProtocolException;

import org.apache.http.client.HttpRequestRetryHandler;

import org.apache.http.client.entity.GzipDecompressingEntity;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.client.params.ClientPNames;

import org.apache.http.client.params.CookiePolicy;

import org.apache.http.conn.params.ConnManagerParams;

import org.apache.http.conn.params.ConnRoutePNames;

import org.apache.http.conn.routing.HttpRoute;

import org.apache.http.conn.scheme.PlainSocketFactory;

import org.apache.http.conn.scheme.Scheme;

import org.apache.http.conn.scheme.SchemeRegistry;

import org.apache.http.conn.ssl.SSLSocketFactory;

import org.apache.http.entity.ContentType;

import org.apache.http.impl.client.DefaultHttpClient;

import org.apache.http.impl.conn.PoolingClientConnectionManager;

import org.apache.http.message.BasicHeader;

import org.apache.http.params.BasicHttpParams;

import org.apache.http.params.CoreConnectionPNames;

import org.apache.http.params.CoreProtocolPNames;

import org.apache.http.params.HttpParams;

import org.apache.http.protocol.ExecutionContext;

import org.apache.http.protocol.HttpContext;

import org.apache.http.util.EntityUtils;





/**
 * Manages a shared, pooled Apache HttpClient (4.2) instance and uses it to
 * download pages through an HTTP proxy.
 *
 * <p>The connection pool, the request defaults (timeouts, cookie policy,
 * browser-like headers) and the retry policy are configured once in the
 * static initializer and shared by every instance of this class.
 *
 * @author lidongyang
 * @createtime Oct 18, 2012 1:55:18 PM
 *
 * @note basic test version
 */
public class HttpConnectionManager {

    /** Maximum number of connections held by the pool. */
    public static final int MAX_TOTAL_CONNECTIONS = 100;

    /** Default maximum number of connections per route. */
    public static final int MAX_ROUTE_CONNECTIONS = 50;

    /** Connect timeout, in milliseconds. */
    public static final int CONNECT_TIMEOUT = 50000;

    /** Socket (read) timeout, in milliseconds. */
    public static final int SOCKET_TIMEOUT = 50000;

    /** How long a request may block waiting for a pooled connection, in milliseconds. */
    public static final long CONN_MANAGER_TIMEOUT = 60000;

    /** Connection limit applied to the direct route to {@link #DEFAULT_TARGETHOST}. */
    private static final int DEFAULT_HOST_MAX_CONNECTIONS = 20;

    /** A request is attempted at most this many times before the retry handler gives up. */
    private static final int MAX_EXECUTION_COUNT = 5;

    /** Address of the proxy used by {@link #getHtml(String)}. */
    private static final String PROXY_ADDRESS = "211.142.236.137";

    /** Proxy port used for the first attempt of {@link #getHtml(String)}. */
    private static final int PRIMARY_PROXY_PORT = 8080;

    /** Proxy port used for the retries of {@link #getHtml(String)}. */
    private static final int FALLBACK_PROXY_PORT = 80;

    /** Number of retries through the fallback proxy after the first attempt came back empty. */
    private static final int MAX_PROXY_RETRIES = 4;

    /** Charset used when the response does not declare one; same default EntityUtils applies. */
    private static final String FALLBACK_CHARSET = "ISO-8859-1";

    /** Parameters shared by every request executed through {@link #httpClient}. */
    private static final HttpParams parentParams;

    /** Pooling connection manager backing {@link #httpClient}. */
    private static final PoolingClientConnectionManager cm;

    /** The shared HTTP client. */
    private static final DefaultHttpClient httpClient;

    /**
     * Default target host.
     *
     * <p>HttpHost takes a bare host name; the scheme must not be part of it,
     * otherwise the host (and any route built from it) never matches a real request.
     */
    private static final HttpHost DEFAULT_TARGETHOST = new HttpHost("www.qq.com", 80);

    /*
     * Builds the connection pool and the shared client: registers the http/https
     * schemes, sets pool limits, timeouts, redirect and cookie behaviour, the
     * browser-like default headers and the retry handler.
     */
    static {
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

        cm = new PoolingClientConnectionManager(schemeRegistry);
        cm.setMaxTotal(MAX_TOTAL_CONNECTIONS);
        cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS);
        // Limit for the direct (non-proxied) route to the default target host.
        cm.setMaxPerRoute(new HttpRoute(DEFAULT_TARGETHOST), DEFAULT_HOST_MAX_CONNECTIONS);

        parentParams = new BasicHttpParams();
        parentParams.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
        parentParams.setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST);
        parentParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);

        parentParams.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT, CONN_MANAGER_TIMEOUT);
        parentParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT);
        parentParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT);

        parentParams.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS, true);
        parentParams.setParameter(ClientPNames.HANDLE_REDIRECTS, true);

        // Default headers that make the client look like a browser.
        Collection<Header> defaultHeaders = new ArrayList<Header>();
        defaultHeaders.add(new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"));
        defaultHeaders.add(new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
        defaultHeaders.add(new BasicHeader("Accept-Language", "zh-cn,zh,en-US,en;q=0.5"));
        defaultHeaders.add(new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7"));
        // Only gzip is advertised because it is the only content encoding this class decodes.
        defaultHeaders.add(new BasicHeader("Accept-Encoding", "gzip"));
        parentParams.setParameter(ClientPNames.DEFAULT_HEADERS, defaultHeaders);

        httpClient = new DefaultHttpClient(cm, parentParams);
        httpClient.setHttpRequestRetryHandler(createRetryHandler());
    }

    /**
     * Creates the retry policy installed on the shared client.
     *
     * @return a handler that retries dropped connections and requests without a body,
     *         never retries SSL handshake failures, and stops once the execution
     *         count reaches {@link #MAX_EXECUTION_COUNT}
     */
    private static HttpRequestRetryHandler createRetryHandler() {
        return new HttpRequestRetryHandler() {
            @Override
            public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
                if (executionCount >= MAX_EXECUTION_COUNT) {
                    // Retry budget exhausted.
                    return false;
                }
                if (exception instanceof NoHttpResponseException) {
                    // The server dropped the connection: try again.
                    return true;
                }
                if (exception instanceof SSLHandshakeException) {
                    // An SSL handshake failure will not fix itself.
                    return false;
                }
                HttpRequest request = (HttpRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
                // Requests that carry no entity are treated as idempotent and therefore retried.
                return !(request instanceof HttpEntityEnclosingRequest);
            }
        };
    }

    /**
     * Downloads a page through the built-in proxy.
     *
     * <p>The first attempt uses the primary proxy port; while the result is empty
     * the request is repeated through the fallback port, up to
     * {@link #MAX_PROXY_RETRIES} times. The failure message is printed only when
     * every attempt came back empty.
     *
     * @param url URL of the page to download
     * @return the page content, or an empty string when every attempt failed
     */
    public String getHtml(String url) {
        String html = getHtml(url, new HttpHost(PROXY_ADDRESS, PRIMARY_PROXY_PORT));

        for (int retry = 0; retry < MAX_PROXY_RETRIES && StringUtils.isEmpty(html); retry++) {
            html = getHtml(url, new HttpHost(PROXY_ADDRESS, FALLBACK_PROXY_PORT));
        }

        if (StringUtils.isEmpty(html)) {
            System.out.println("抓取失败");
        }

        System.out.println(html.length());
        return html;
    }

    /**
     * Downloads a page through the given proxy.
     *
     * <p>I/O failures are printed to standard error and reported to the caller as
     * an empty result. The connection is always handed back to the pool.
     *
     * @param url       URL of the page to download
     * @param proxyHost proxy to route the request through
     * @return the page content, or an empty string when the status is not 200,
     *         the response has no entity, or an I/O error occurred
     */
    public String getHtml(String url, HttpHost proxyHost) {
        String html = "";
        HttpGet httpGet = new HttpGet(url);
        httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);

        try {
            HttpResponse httpResponse = httpClient.execute(httpGet);

            int statusCode = httpResponse.getStatusLine().getStatusCode();
            System.out.println(statusCode);
            if (200 != statusCode) {
                return html;
            }

            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity != null) {
                html = readHtmlContentFromEntity(httpEntity);
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so this covers both cases.
            e.printStackTrace();
        } finally {
            // Also covers the early return above, where the entity was never consumed.
            httpGet.releaseConnection();
        }

        return html;
    }

    /**
     * Reads the body of a response entity as text, decompressing gzip content.
     *
     * @param httpEntity the response entity
     * @return the decoded body
     * @throws ParseException if the Content-Type header cannot be parsed
     * @throws IOException    if reading or decompressing the body fails
     */
    private String readHtmlContentFromEntity(HttpEntity httpEntity) throws ParseException, IOException {
        boolean gzipped = isGzipEncoded(httpEntity);

        // EntityUtils cannot handle an entity whose declared length exceeds Integer.MAX_VALUE.
        if (httpEntity.getContentLength() < Integer.MAX_VALUE) {
            HttpEntity readable = gzipped ? new GzipDecompressingEntity(httpEntity) : httpEntity;
            return EntityUtils.toString(readable);
        }

        String charSet = charsetNameOf(httpEntity);
        InputStream in = httpEntity.getContent();
        if (in == null) {
            return "";
        }
        try {
            return gzipped ? unZip(in, charSet) : readInStreamToString(in, charSet);
        } finally {
            in.close();
        }
    }

    /**
     * Tells whether the entity declares a gzip content encoding.
     *
     * @param httpEntity the response entity
     * @return {@code true} when Content-Encoding is {@code gzip} or {@code x-gzip},
     *         compared case-insensitively
     */
    private static boolean isGzipEncoded(HttpEntity httpEntity) {
        Header encoding = httpEntity.getContentEncoding();
        if (encoding == null || encoding.getValue() == null) {
            return false;
        }
        String value = encoding.getValue().trim();
        return "gzip".equalsIgnoreCase(value) || "x-gzip".equalsIgnoreCase(value);
    }

    /**
     * Resolves the name of the charset to decode the entity with.
     *
     * @param httpEntity the response entity
     * @return the charset declared in Content-Type, or {@link #FALLBACK_CHARSET}
     *         when the header carries no charset
     */
    private static String charsetNameOf(HttpEntity httpEntity) {
        ContentType contentType = ContentType.getOrDefault(httpEntity);
        if (contentType.getCharset() == null) {
            return FALLBACK_CHARSET;
        }
        return contentType.getCharset().name();
    }

    /**
     * Checks whether a proxy can be used to download a page. (Much the same as
     * {@link #getHtml(String, HttpHost)}; kept separate for now to distinguish
     * the two purposes.)
     *
     * @param proxyHost address and port of the proxy to test
     * @param url       page used for the test
     * @return {@code true} when the page was fetched with status 200 and a
     *         non-empty body, {@code false} otherwise
     */
    public boolean isProxyUsable(HttpHost proxyHost, String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);

        try {
            HttpResponse httpResponse = httpClient.execute(httpGet);

            int statusCode = httpResponse.getStatusLine().getStatusCode();
            System.out.println(statusCode);
            if (200 != statusCode) {
                return false;
            }

            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity == null) {
                return false;
            }

            String html = readHtmlContentFromEntity(httpEntity);
            System.out.println(html.length());
            return !StringUtils.isEmpty(html);
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so this covers both cases.
            e.printStackTrace();
            return false;
        } finally {
            // Without this every probe would keep its pooled connection leased forever.
            httpGet.releaseConnection();
        }
    }

    /**
     * Decompresses a gzip stream and decodes the result as text.
     *
     * @param in      the gzip-compressed response stream; closed by this method
     * @param charSet name of the charset of the page content
     * @return the decompressed page content
     * @throws IOException if the stream is not valid gzip data or cannot be read
     */
    private String unZip(InputStream in, String charSet) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        GZIPInputStream gis = null;
        try {
            gis = new GZIPInputStream(in);
            byte[] chunk = new byte[1024];
            int len;
            while ((len = gis.read(chunk)) != -1) {
                buffer.write(chunk, 0, len);
            }
            return new String(buffer.toByteArray(), charSet);
        } finally {
            if (gis != null) {
                gis.close();
            }
        }
    }

    /**
     * Reads a stream line by line into a string; every line is terminated with
     * {@code "\n"} in the result.
     *
     * @param in      the stream to read; closed by this method
     * @param charSet name of the charset used to decode the stream
     * @return the text read from the stream
     * @throws IOException if the charset is unsupported or reading fails
     */
    private String readInStreamToString(InputStream in, String charSet) throws IOException {
        StringBuilder text = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, charSet));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
                text.append("\n");
            }
        } finally {
            reader.close();
        }
        return text.toString();
    }

    /**
     * Runnable that downloads one URL through the enclosing manager; for testing.
     *
     * @author lidongyang
     * @createtime Oct 18, 2012 2:35:09 PM
     */
    public class Test implements Runnable {

        String url;

        int threadNum;

        public Test() {
        }

        public Test(String url, int threadNum) {
            this.url = url;
            this.threadNum = threadNum;
        }

        @Override
        public void run() {
            getHtml(url);
        }
    }

    /**
     * Downloads one page and prints the elapsed time in seconds; for testing.
     *
     * @param args unused
     * @throws InterruptedException declared for callers; not thrown by the current body
     */
    public static void main(String[] args) throws InterruptedException {
        HttpConnectionManager httpConnectionManager = new HttpConnectionManager();
        long start = System.currentTimeMillis();
        httpConnectionManager.getHtml("http://www.qq.com");
        long end = System.currentTimeMillis();
        System.out.println((end - start) / 1000.0 + " 秒");
    }
}

 

Java HttpClient(4.2) 爬虫代码

标签:

原文地址:http://www.cnblogs.com/iamverygood/p/4996998.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!