读取网页内容不再出现乱码

时间：2014-05-21 08:01:16 阅读：340 评论：0 收藏：0 [点我收藏+]

有没有发现，每次读取网页内容时都要先去查找网页的编码类型？这次研究出了一个公共的方法：根据响应头 Content-Type 中的 charset 自动确定编码，下次读取网页内容时就不会再出现乱码了。

package package org.httpclient;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.ArrayUtils;

import com.auto.generate.log.LogManager;

/**
 * Small HTTP GET helper that decodes the response body with the charset
 * announced in the response's Content-Type header, so that pages are not
 * returned as garbled text.
 *
 * <p>Cookies received through Set-Cookie response headers are remembered in
 * {@link #cookies} and sent back on the next call to {@link #execute(String)}.
 */
public class HttpClient {

	/** Charset used when the response does not name a usable one. */
	private static final String DEFAULT_CHARSET = "UTF-8";

	/**
	 * Matches the {@code charset} parameter of a Content-Type value, e.g.
	 * {@code text/html; charset="gbk"; boundary=x}. Group 1 is the bare name
	 * without quotes or trailing parameters.
	 */
	private static final Pattern CHARSET_PATTERN =
			Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

	/** Charset that was used to decode the most recent response. */
	private String charset ;

	/** Cookies sent with every request; may be {@code null} when none are known. */
	public Cookie[]cookies ;

	/** Value of the User-agent request header. */
	public String userAgent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36" ;

	/**
	 * Line separator appended after every line of the returned body.
	 */
	public static final String LINE_SEPARATOR = System.getProperty("line.separator") ;

	/**
	 * Fetches {@code url} with a GET request and returns the body as text.
	 *
	 * <p>The body is read line by line and each line is terminated with
	 * {@link #LINE_SEPARATOR}. Any failure (malformed URL, I/O error, HTTP
	 * error status) is logged and whatever was read so far is returned, which
	 * is the empty string when nothing could be read.
	 *
	 * @param url absolute HTTP(S) URL to read
	 * @return the decoded response body, never {@code null}
	 */
	public String execute(String url){
		StringBuilder body = new StringBuilder();
		HttpURLConnection conn = null;
		BufferedReader reader = null;
		try {
			conn = (HttpURLConnection) new URL( url ).openConnection();
			conn.setRequestProperty("User-agent" , userAgent );
			String cookieHeader = buildCookieHeader();
			// Do not send an empty "Cookie:" header when there is nothing to send.
			if (cookieHeader.length() > 0) {
				conn.addRequestProperty("Cookie" , cookieHeader );
			}
			conn.connect();
			this.charset = resolveCharset( conn.getContentType() );
			InputStream is = conn.getInputStream();
			reader = new BufferedReader(new InputStreamReader( is , charset ));
			String line;
			while ((line = reader.readLine()) != null) {
				body.append( line ).append( LINE_SEPARATOR );
			}
			storeResponseCookies( conn );
		} catch (Exception e) {
			LogManager.err(  "http请求错误" , e ) ;
		} finally {
			// Release the stream and the connection on the failure path too.
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing fails.
				}
			}
			if (conn != null) {
				conn.disconnect();
			}
		}
		return body.toString();
	}

	/**
	 * Picks the charset to decode the body with.
	 *
	 * @param contentType Content-Type header value, may be {@code null}
	 * @return the charset named in {@code contentType} when this JVM supports
	 *         it, otherwise {@link #DEFAULT_CHARSET}
	 */
	private static String resolveCharset(String contentType) {
		if (contentType != null) {
			Matcher matcher = CHARSET_PATTERN.matcher( contentType );
			if (matcher.find()) {
				String name = matcher.group(1);
				try {
					if (Charset.isSupported( name )) {
						return name;
					}
				} catch (IllegalArgumentException ignored) {
					// Syntactically illegal charset name: fall back to the default.
				}
			}
		}
		return DEFAULT_CHARSET;
	}

	/**
	 * Joins the known cookies into a Cookie request header value of the form
	 * {@code name1=value1; name2=value2}. Returns the empty string when there
	 * are no cookies; {@code null} array entries are skipped.
	 */
	private String buildCookieHeader() {
		StringBuilder header = new StringBuilder();
		if (cookies == null) {
			return "";
		}
		for (Cookie cookie : cookies) {
			if (cookie == null) {
				continue;
			}
			if (header.length() > 0) {
				header.append("; ");
			}
			header.append( cookie.getName() ).append("=").append( cookie.getValue() );
		}
		return header.toString();
	}

	/**
	 * Records the cookie of every Set-Cookie response header. The header name
	 * is compared case-insensitively; the header map may contain a
	 * {@code null} key (status line), which never matches.
	 */
	private void storeResponseCookies(HttpURLConnection conn) {
		Map<String, List<String>> headers = conn.getHeaderFields();
		for (Map.Entry<String, List<String>> entry : headers.entrySet()) {
			if ("Set-Cookie".equalsIgnoreCase( entry.getKey() )) {
				for (String headerValue : entry.getValue()) {
					splitCookies( headerValue );
				}
			}
		}
	}

	/**
	 * Parses one Set-Cookie header value and stores its cookie.
	 *
	 * <p>Only the first {@code name=value} pair is the cookie itself; what
	 * follows the first {@code ;} are attributes (path, expires, ...) and is
	 * ignored. The pair is split at the first {@code =} so values containing
	 * {@code =} are kept intact.
	 *
	 * @param cookie raw Set-Cookie header value, may be {@code null}
	 */
	private void splitCookies(String cookie){
		if (cookie == null) {
			return;
		}
		int attributesStart = cookie.indexOf(';');
		String pair = attributesStart >= 0 ? cookie.substring(0, attributesStart) : cookie;
		int equalsAt = pair.indexOf('=');
		if (equalsAt < 0) {
			return;
		}
		String name = pair.substring(0, equalsAt).trim();
		if (name.length() == 0) {
			return;
		}
		putCookie( name, pair.substring(equalsAt + 1).trim() );
	}

	/**
	 * Stores a cookie, replacing an existing cookie of the same name so the
	 * list does not accumulate duplicates across requests.
	 */
	private void putCookie(String name, String value) {
		Cookie fresh = new Cookie( name, value );
		if (cookies != null) {
			for (int i = 0; i < cookies.length; i++) {
				if (cookies[i] != null && name.equals( cookies[i].getName() )) {
					// Replace in a copy so an array handed in through
					// setCookies is not modified in place.
					Cookie[] updated = cookies.clone();
					updated[i] = fresh;
					cookies = updated;
					return;
				}
			}
		}
		cookies = ArrayUtils.add(cookies, fresh);
	}

	/**
	 * Returns the charset used to decode the most recent response, or
	 * {@code null} if no response has been decoded yet.
	 */
	public String getCharset() {
		return charset;
	}

	/**
	 * Returns the cookies currently held by this client.
	 */
	public Cookie[] getCookies() {
		return cookies;
	}

	/**
	 * Sets the User-agent header value; {@code null} or blank input is ignored
	 * and the current value is kept.
	 */
	public void setUserAgent(String userAgent) {
		if(null == userAgent || "".equals( userAgent.trim() )){
			return ;
		}
		this.userAgent = userAgent ;
	}

	/**
	 * Replaces the cookies sent with subsequent requests.
	 */
	public void setCookies(Cookie[] cookies) {
		this.cookies = cookies;
	}
}

Cookie 类：

package org.httpclient;

/**
 * Mutable name/value pair representing a single HTTP cookie.
 */
public class Cookie {

	/** Cookie name. */
	private String name;

	/** Cookie value. */
	private String value;

	/**
	 * Creates a cookie holding the given name and value as-is.
	 *
	 * @param name  cookie name
	 * @param value cookie value
	 */
	public Cookie(String name, String value) {
		this.value = value;
		this.name = name;
	}

	/** @return the cookie name */
	public String getName() {
		return this.name;
	}

	/** @return the cookie value */
	public String getValue() {
		return this.value;
	}

	/** @param name new cookie name */
	public void setName(String name) {
		this.name = name;
	}

	/** @param value new cookie value */
	public void setValue(String value) {
		this.value = value;
	}

}

读取网页内容不再出现乱码,布布扣,bubuko.com

读取网页内容不再出现乱码

标签：blog class c code java http

原文地址：http://blog.csdn.net/hfmbook/article/details/26371127

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行