您的位置:首页 > 编程语言 > Java开发

java检测文件编码——cpdetector

2015-05-17 22:51 369 查看
cpdetector是一个可以自动检测文本编码格式的项目。

detector按照“谁最先返回非空的探测结果,就以该结果为准”的原则返回探测到的字符集编码。

使用需要用到三个第三方JAR包:antlr.jar、chardet.jar和cpdetector.jar

cpDetector是基于统计学原理的,不保证完全正确。

以下是读取xxx.txt文件中的内容,并以html的方式返回给浏览器的简单servlet实例。在实现的过程中,遇到的最大问题是浏览器打开后中文乱码,原因是.txt文件保存时的编码不统一,所以在“out.println(new String(buffer, charset));”时charset不能写死,而应该通过某种途径获取.txt文件的编码格式。获取的方式网上主要有以下三种,亲测第三种解决了问题,第一、第二种方法都不完善。

package com.hwc.a.servlet;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

public class TxtToHtmlServlet extends HttpServlet {
private static final long serialVersionUID = 1L;

public void doGet(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
String path = request.getParameter("path");
if (path != null && !"".equals(path)) {
// TODO 开始下载
path = new String(path.getBytes("ISO-8859-1"), "utf-8");
InputStream fis = null;
PrintWriter out = null;
try {
// path是指欲下载的文件的路径。
// File file = new File(request.getRealPath("/")+"/"+path);
File file = new File(path);
// 取得文件名。
String filename = file.getName();
// 取得文件的后缀名。
filename = filename.substring(0, filename.lastIndexOf("."));

// 以流的形式下载文件。
fis = new BufferedInputStream(new FileInputStream(file));
byte[] buffer = new byte[fis.available()];
fis.read(buffer);
// 清空response
response.reset();

String charset = getFileEncode(path);
System.out.println("============getFileEncode charset:" + charset);
if (charset == null) {
charset = getCharset(path);
System.out.println("============getCharset charset:" + charset);
}

response.setHeader("Content-type", "text/html;charset="+ charset);
response.setContentType("text/html;charset=" + charset);
out = response.getWriter();
out.println(new String(buffer, charset));
out.flush();
} catch (IOException ex) {
ex.printStackTrace();
} finally {
if (fis != null) {
fis.close();
}
if (out != null) {
out.close();
}
}
}
}

/**
* 方法一: 仅作参考,不准确
* @param fileName
* @return
* @throws IOException
*/
private String getCharset(String fileName) throws IOException {

BufferedInputStream bin = new BufferedInputStream(new FileInputStream(
fileName));
int p = (bin.read() << 8) + bin.read();

String code = null;

switch (p) {
case 0xefbb:
code = "UTF-8";
break;
case 0xfffe:
code = "Unicode";
break;
case 0xfeff:
code = "UTF-16BE";
break;
default:
code = "GB2312";
}
return code;
}

/**
* 方法二: 仅作参考,不准确
* @param head
* @return
*/
private String codetype(byte[] head) {
byte[] codehead = new byte[4];
// 截取数组
System.arraycopy(head, 0, codehead, 0, 4);
String code = "";
if (head[0] == -1 && head[1] == -2) {
code = "UTF-16";
} else if (head[0] == -2 && head[1] == -1) {
code = "Unicode";
} else if (head[0] == -17 && head[1] == -69 && head[2] == -65)
code = "UTF-8";
else {
code = "gb2312";
}
return code;
}

/**
* 方法三:比较准确,解决了实际问题
* @param filePath
* @return
*/
public static String getFileEncode(String filePath) {
String charsetName = null;
try {
File file = new File(filePath);
CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
detector.add(new ParsingDetector(false));
detector.add(JChardetFacade.getInstance());
detector.add(ASCIIDetector.getInstance());
detector.add(UnicodeDetector.getInstance());
java.nio.charset.Charset charset = null;
charset = detector.detectCodepage(file.toURI().toURL());
if (charset != null) {
charsetName = charset.name();
} else {
charsetName = "UTF-8";
}
} catch (Exception ex) {
ex.printStackTrace();
return null;
}
return charsetName;
}

public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
doGet(request, response);
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: