码迷,mamicode.com
首页 > Web开发 > 详细

使用jchardet1.1 判断文件或网页编码

时间:2014-08-27 18:41:28      阅读:208      评论:0      收藏:0      [点我收藏+]

标签:jchardet   java字符编码   字符编码   utf-8   gb2312   

package org.shefron.utils;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;

import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

/**
 * Static helpers that run the jchardet {@link nsDetector} over the bytes of a
 * local file or of a URL and collect the outcome in a {@link CharsetObj}.
 */
public final class FileCharsetDetector {

	/** Size in bytes of the chunks read from the stream and handed to the detector. */
	private static final int BUFFER_SIZE = 1024;

	/**
	 * Detects the charset of a local file.
	 *
	 * @param filePath the local filepath
	 * @param langFlag
	 *            language hint passed to {@link nsDetector}, e.g.
	 *            nsPSMDetector.CHINESE,
	 *            nsPSMDetector.SIMPLIFIED_CHINESE,
	 *            nsPSMDetector.TRADITIONAL_CHINESE,
	 *            nsPSMDetector.ALL ...;
	 *            values outside the range 1..6 are replaced by nsPSMDetector.ALL
	 * @return the detection result; never {@code null}
	 * @throws Exception if the file cannot be opened or read, or detection
	 *             fails; the original failure is attached as the cause
	 */
	public final static CharsetObj getPriorityCharset(String filePath, int langFlag) throws Exception {
		InputStream in = null;
		try {
			// Opened inside the try so that open failures are wrapped like read failures.
			in = new FileInputStream(filePath);
			return detect(in, langFlag);
		} catch (Exception e) {
			// Keep the historical message but chain the cause so the stack trace survives.
			throw new Exception("File handler Error:" + e.getMessage(), e);
		} finally {
			closeQuietly(in);
		}
	}

	/**
	 * Detects the charset of a local file using nsPSMDetector.ALL as language hint.
	 *
	 * @param filePath the local filepath
	 * @return the detection result; never {@code null}
	 * @throws Exception if the file cannot be opened or read, or detection fails
	 */
	public final static CharsetObj getPriorityCharset(String filePath) throws Exception {
		return getPriorityCharset(filePath, nsPSMDetector.ALL);
	}

	/**
	 * Detects the charset of the content served at a URL using
	 * nsPSMDetector.ALL as language hint.
	 *
	 * @param url the www url
	 * @return the detection result; never {@code null}
	 * @throws Exception if the URL cannot be opened or read, or detection fails
	 */
	public final static CharsetObj getPriorityCharset(URL url) throws Exception {
		return getPriorityCharset(url, nsPSMDetector.ALL);
	}

	/**
	 * Detects the charset of the content served at a URL.
	 *
	 * @param url the www url
	 * @param langFlag language hint passed to {@link nsDetector}; values
	 *            outside the range 1..6 are replaced by nsPSMDetector.ALL
	 * @return the detection result; never {@code null}
	 * @throws Exception if the URL cannot be opened or read, or detection
	 *             fails; the original failure is attached as the cause
	 */
	public static CharsetObj getPriorityCharset(URL url, int langFlag) throws Exception {
		InputStream in = null;
		try {
			// Opened inside the try so that open failures are wrapped like read failures.
			in = url.openStream();
			return detect(in, langFlag);
		} catch (Exception e) {
			// Message text kept as-is for backward compatibility; cause is now chained.
			throw new Exception("File handler Error:" + e.getMessage(), e);
		} finally {
			closeQuietly(in);
		}
	}

	/**
	 * Feeds the stream to a fresh {@link nsDetector} and gathers the result.
	 * The caller remains responsible for closing {@code in}.
	 *
	 * @param in the raw byte source
	 * @param langFlag language hint; values outside 1..6 fall back to nsPSMDetector.ALL
	 * @return the populated detection result
	 * @throws IOException if reading from {@code in} fails
	 */
	private static CharsetObj detect(InputStream in, int langFlag) throws IOException {
		if (langFlag < 1 || langFlag > 6) {
			langFlag = nsPSMDetector.ALL;
		}

		final CharsetObj charsetObj = new CharsetObj();

		nsDetector det = new nsDetector(langFlag);
		// The observer's Notify() is called when a matching charset is found.
		det.Init(new nsICharsetDetectionObserver() {
			public void Notify(String charset) {
				charsetObj.setFound(true);
				charsetObj.setCharset(charset);
			}
		});

		BufferedInputStream imp = new BufferedInputStream(in);
		byte[] buf = new byte[BUFFER_SIZE];
		int len;
		boolean isAscii = true;

		while ((len = imp.read(buf, 0, buf.length)) != -1) {
			// Keep checking for pure ASCII only while every chunk so far was ASCII.
			if (isAscii) {
				isAscii = det.isAscii(buf, len);
			}

			// Once the detector reports it is done, no later chunk would be
			// inspected anyway, so stop reading instead of draining the stream.
			if (!isAscii && det.DoIt(buf, len, false)) {
				break;
			}
		}
		det.DataEnd();

		// Record the flag in both directions: CharsetObj defaults to true, so
		// only ever setting true would leave non-ASCII input reported as ASCII.
		charsetObj.setAscii(isAscii);
		charsetObj.setProbableCharsets(det.getProbableCharsets());

		return charsetObj;
	}

	/**
	 * Closes the stream if it was opened, ignoring any failure on close so it
	 * cannot mask an exception already propagating from detection.
	 *
	 * @param in the stream to close; may be {@code null}
	 */
	private static void closeQuietly(InputStream in) {
		if (in == null) {
			return;
		}
		try {
			in.close();
		} catch (IOException ignored) {
			// Nothing useful can be done about a failed close of a read-only stream.
		}
	}

	/**
	 * Mutable holder for the outcome of one detection run.
	 */
	public final static class CharsetObj {
		/** Whether the detector notified its observer of a matching charset. */
		private boolean found = false;
		/** Candidate charsets reported by the detector; {@code null} until set. */
		private String[] probableCharsets = null;
		/** Charset passed to the observer; {@code null} until set. */
		private String charset = null;
		/** Whether every chunk examined was ASCII; {@code true} until changed via the setter. */
		private boolean isAscii = true;

		/**
		 * @return the charset reported to the observer, or {@code null} if none was reported
		 */
		public String getCharset() {
			return charset;
		}

		@Override
		public String toString() {
			return "CharsetObj [charset=" + this.getCharset() + ", found=" + this.isFound()
					+ ", isAscii=" + this.isAscii() + ", probableCharsets="
					+ Arrays.toString(this.getProbableCharsets()) + "]";
		}

		/**
		 * @param charset the charset reported to the observer
		 */
		public void setCharset(String charset) {
			this.charset = charset;
		}

		/**
		 * @return {@code true} if the observer was notified of a matching charset
		 */
		public boolean isFound() {
			return found;
		}

		/**
		 * @return the candidate charsets (the stored array itself, not a copy),
		 *         or {@code null} if never set
		 */
		public String[] getProbableCharsets() {
			return probableCharsets;
		}

		/**
		 * @return {@code true} if the examined content was ASCII only
		 */
		public boolean isAscii() {
			return isAscii;
		}

		/**
		 * @param found whether a matching charset was reported
		 */
		public void setFound(boolean found) {
			this.found = found;
		}

		/**
		 * @param probableCharsets the candidate charsets; stored without copying
		 */
		public void setProbableCharsets(String[] probableCharsets) {
			this.probableCharsets = probableCharsets;
		}

		/**
		 * @param isAscii whether the examined content was ASCII only
		 */
		public void setAscii(boolean isAscii) {
			this.isAscii = isAscii;
		}

	}

}

使用jchardet1.1 判断文件或网页编码

标签:jchardet   java字符编码   字符编码   utf-8   gb2312   

原文地址:http://blog.csdn.net/shefron/article/details/38874617

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!