码迷,mamicode.com
首页 > Web开发 > 详细

读取网页内容不再出现乱码

时间:2014-05-21 08:01:16      阅读:340      评论:0      收藏:0      [点我收藏+]

标签:blog   class   c   code   java   http   

有没有发现,每次读取网页内容时都要先去查网页的编码类型?这次整理出了一个公共方法,以后读取网页内容时就不会再出现乱码了.

package org.httpclient;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;

import org.apache.commons.lang3.ArrayUtils;

import com.auto.generate.log.LogManager;

/**
 * Minimal HTTP GET helper built on {@link HttpURLConnection}.
 *
 * <p>Fetches a page as text, decoding it with the charset declared in the
 * response's Content-Type header, and keeps the cookies returned by the server
 * so they are sent again on the next call to {@link #execute(String)}.
 *
 * <p>Only the Content-Type header is consulted for the charset; the body is not
 * inspected (e.g. for an HTML {@code <meta>} tag).
 */
public class HttpClient {

	/** Charset used to decode the body when the response declares no usable charset. */
	private static final String DEFAULT_CHARSET = "UTF-8";

	/** Content-Type parameter prefix that introduces the charset name. */
	private static final String CHARSET_PARAM = "charset=";

	/** Charset name used to decode the most recent response; null before the first request. */
	private String charset ;

	/** Cookies sent with each request; extended with the cookies set by responses. */
	public Cookie[]cookies ;

	/** Value sent in the User-agent request header. */
	public String userAgent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36" ;

	/**
	 * Line separator appended after every line of the returned content.
	 */
	public static final String LINE_SEPARATOR = System.getProperty("line.separator") ;

	/**
	 * Fetches the given URL and returns the response body as text.
	 *
	 * <p>Every line of the body is terminated with {@link #LINE_SEPARATOR}. Any
	 * failure (bad URL, I/O error, error status that makes the input stream
	 * unavailable) is logged through {@code LogManager.err} and is not rethrown;
	 * in that case whatever was read so far (possibly an empty string) is returned.
	 *
	 * @param url absolute URL to request
	 * @return the decoded body read so far, never null
	 */
	public String execute(String url){
		StringBuilder content = new StringBuilder();
		HttpURLConnection conn = null;
		BufferedReader reader = null;
		try {
			conn = (HttpURLConnection) new URL( url ).openConnection();
			conn.setRequestProperty("User-agent" , userAgent ) ;
			// Only send a Cookie header when there is something to send.
			String cookieHeader = buildCookieHeader();
			if(cookieHeader.length() > 0){
				conn.setRequestProperty("Cookie" , cookieHeader );
			}
			conn.connect( ) ;

			// The Content-Type may be missing or may carry no charset parameter.
			this.charset = resolveCharset( conn.getContentType() );

			InputStream is = conn.getInputStream() ;
			reader = new BufferedReader(new InputStreamReader( is , charset ));
			String line ;
			while( (line = reader.readLine()) != null ){
				content.append( line ).append( LINE_SEPARATOR ) ;
			}

			// A response can carry several Set-Cookie headers. Walk all header
			// fields by index so every one is seen and the name match is
			// case-insensitive. Index 0 may be the status line with a null key,
			// so the walk stops only when both key and value are null.
			for(int i = 0 ; ; i++){
				String key = conn.getHeaderFieldKey( i );
				String value = conn.getHeaderField( i );
				if(null == key && null == value){
					break;
				}
				if("Set-Cookie".equalsIgnoreCase( key )){
					splitCookies( value ) ;
				}
			}
		} catch (Exception e) {
			LogManager.err(  "http请求错误" , e ) ;
		} finally {
			if(null != reader){
				try {
					reader.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing fails; the content is already read.
				}
			}
			if(null != conn){
				conn.disconnect() ;
			}
		}
		return content.toString( ) ;
	}

	/**
	 * Joins the stored cookies into a Cookie request header value
	 * ({@code name=value; name2=value2}).
	 *
	 * @return the header value, or an empty string when there are no cookies
	 */
	private String buildCookieHeader(){
		StringBuilder header = new StringBuilder();
		if(null != cookies){
			for(Cookie cookie : cookies){
				if(null == cookie){
					continue;
				}
				if(header.length() > 0){
					header.append("; ") ;
				}
				header.append( cookie.getName() ).append("=").append( cookie.getValue() ) ;
			}
		}
		return header.toString();
	}

	/**
	 * Extracts the charset name from a Content-Type header value.
	 *
	 * @param contentType header value such as {@code text/html; charset=GBK}; may be null
	 * @return the declared charset name if the JVM supports it, otherwise {@code UTF-8}
	 */
	private static String resolveCharset(String contentType){
		if(null == contentType){
			return DEFAULT_CHARSET;
		}
		for(String param : contentType.split(";")){
			String trimmed = param.trim();
			if(!trimmed.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())){
				continue;
			}
			String name = trimmed.substring( CHARSET_PARAM.length() ).trim();
			// The parameter value may be quoted: charset="utf-8"
			if(name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")){
				name = name.substring(1, name.length() - 1).trim();
			}
			if(name.length() == 0){
				continue;
			}
			try {
				if(Charset.isSupported( name )){
					return name;
				}
			} catch (IllegalArgumentException ignored) {
				// Syntactically illegal charset name: fall through to the default.
			}
		}
		return DEFAULT_CHARSET;
	}

	/**
	 * Parses one Set-Cookie header value and stores its cookie.
	 *
	 * <p>Only the leading {@code name=value} pair is a cookie; everything after
	 * the first {@code ';'} (path, expires, domain, ...) is an attribute and is
	 * ignored. The value may itself contain {@code '='}.
	 *
	 * @param cookie a single Set-Cookie header value; null is ignored
	 */
	private void splitCookies(String cookie){
		if(null == cookie){
			return;
		}
		int attributesStart = cookie.indexOf(';');
		String pair = attributesStart < 0 ? cookie : cookie.substring(0, attributesStart);
		int separator = pair.indexOf('=');
		if(separator < 0){
			return;
		}
		String name = pair.substring(0, separator).trim();
		if(name.length() == 0){
			return;
		}
		storeCookie( name , pair.substring(separator + 1).trim() );
	}

	/**
	 * Adds a cookie, replacing any stored cookie with the same name.
	 * The array is copied before modification so an array handed in through
	 * {@link #setCookies(Cookie[])} is never changed in place.
	 */
	private void storeCookie(String name, String value){
		Cookie fresh = new Cookie( name , value );
		if(null != cookies){
			for(int i = 0 ; i < cookies.length ; i++){
				Cookie existing = cookies[i];
				if(null != existing && name.equals( existing.getName() )){
					Cookie[] updated = cookies.clone();
					updated[i] = fresh;
					cookies = updated;
					return;
				}
			}
		}
		cookies = ArrayUtils.add(cookies, fresh);
	}

	/**
	 * Returns the charset name used to decode the most recent response.
	 *
	 * @return the charset name, or null if no request has got as far as reading headers
	 */
	public String getCharset() {
		return charset;
	}

	/**
	 * Returns the cookies currently held by this client.
	 *
	 * @return the cookie array; may be null when no cookie has been set or received
	 */
	public Cookie[] getCookies() {
		return cookies;
	}

	/**
	 * Sets the User-agent header value. Null or blank values are ignored and
	 * the current value is kept.
	 *
	 * @param userAgent new User-agent value
	 */
	public void setUserAgent(String userAgent) {
		if(null == userAgent || "".equals( userAgent.trim() )){
			return ;
		}
		this.userAgent = userAgent ;
	}

	/**
	 * Replaces the cookies sent with subsequent requests.
	 *
	 * @param cookies cookies to send; may be null
	 */
	public void setCookies(Cookie[] cookies) {
		this.cookies = cookies;
	}
}


cookie类:

package org.httpclient;

/**
 * Mutable holder for one cookie, reduced to its name and its value.
 */
public class Cookie {

	/** Cookie name. */
	private String name;

	/** Cookie value. */
	private String value;

	/**
	 * Creates a cookie from the given name and value.
	 *
	 * @param name  cookie name
	 * @param value cookie value
	 */
	public Cookie(String name, String value) {
		this.value = value;
		this.name = name;
	}

	/** @return the cookie name */
	public String getName() {
		return this.name;
	}

	/** @return the cookie value */
	public String getValue() {
		return this.value;
	}

	/** @param name new cookie name */
	public void setName(String name) {
		this.name = name;
	}

	/** @param value new cookie value */
	public void setValue(String value) {
		this.value = value;
	}

}



读取网页内容不再出现乱码,布布扣,bubuko.com

读取网页内容不再出现乱码

标签:blog   class   c   code   java   http   

原文地址:http://blog.csdn.net/hfmbook/article/details/26371127

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!