识别文件编码
2017-08-15 15:33
741 查看
概述
项目中经常遇到上传文本文件,当含有中文时,由于编码问题就会出现乱码,其根本原因就是用户上传文件的编码与解析时使用的编码不一致.基本有两种解决方案:规定用户上传的文件的编码
自动识别文件编码
第一种解决方法简单粗暴,通常的做法是提供一个规定了默认编码的示例文件供用户下载,但是这种方式的不确定性因素比较大,因此考虑通用的自动识别也是有必要的.自动识别文件编码的工具包有很多,仅摘取几例学习.
自动识别编码工具包其基本原理就是取一串字节流,然后根据各个不同编码集的编码规则依次进行匹配判断.为了简化操作,不采用真实的web环境,直接使用本地文件测试(因为web传递的字节流,更简单的,直接使用字节数组测试).
example
识别工具类有很多,此处举例仅作参考测试主要以ansi,unicode,unicode big endian,utf-8,以文件流的形式进行测试
还有另一种简化操作,使用字节数组测试,为了取到与文件流相同的效果,将字节数组写入流中
/* 对于需要重复读取的流(判断编码取一次,获取内容取一次),需要使用支持reset的流. 注:有些解析器支持字节数组,但是处理字节数组与处理流是有区别的,可能会得到不同的结果*/ BufferedInputStream in = new BufferedInputStream(new ByteArrayInputStream(content.getBytes("GBK")));
tika
package charset;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.apache.tika.detect.AutoDetectReader;
import org.junit.Test;

/**
 * Charset auto-detection demo based on Apache Tika's AutoDetectReader.
 */
public class ParseCharset {

    public static String content = "中国";

    /**
     * Detects the charset of a local text file with AutoDetectReader and
     * prints every line together with the detected charset name.
     */
    @Test
    public void parseByTika() {
        AutoDetectReader detect = null;
        InputStream in = null;
        try {
            in = new FileInputStream("C:\\Users\\admin\\Desktop\\temp\\test.txt");
            detect = new AutoDetectReader(in);
            Charset charset = detect.getCharset();
            String row = null;
            while ((row = detect.readLine()) != null) {
                // non-UTF results were decoded with the guessed 8-bit charset;
                // re-encode as GBK so the Chinese sample text prints correctly
                if (!charset.name().startsWith("UTF"))
                    row = new String(row.getBytes(charset.name()), "GBK");
                System.out.println("charset : " + charset.name() + "; content : " + row);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the reader first (it wraps the stream) and guard against
            // nulls: the original called in.close() unconditionally, which
            // NPEs when the FileInputStream constructor throws, and skipped
            // detect.close() whenever in.close() failed.
            if (detect != null) {
                try {
                    detect.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        /*************** run results ****************/
        /*
        unicode big endian  charset : UTF-16BE; content : 中国
        ansi                charset : IBM855; content : 中国
        unicode             charset : UTF-16LE; content : 中国
        utf-8               charset : UTF-8; content : 中国
        Note: plain ANSI text usually cannot be identified; treat it as
        ISO-8859-1 (a byte-oriented encoding, so no data is lost).
        */
        /*************** dependencies ****************/
        /*
         * pom dependencies
        <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
            <version>1.16</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers</artifactId>
            <version>1.16</version>
        </dependency>
         * Tika parses basically all common file formats and returns formatted
         * information such as metadata and content: file format, file content,
         * file charset, text language, etc.
         */
    }
}
tika解析的核心源码:AutoDetectReader配置了三种解析器Icu4jEncodingDetector,UniversalEncodingDetector,HtmlEncodingDetector,轮询解析,以UniversalEncodingDetector为例
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.txt;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;

/**
 * Tika EncodingDetector backed by juniversalchardet: feeds up to 16 KB of the
 * stream to a UniversalEncodingListener and returns whatever charset the
 * listener settles on.
 */
public class UniversalEncodingDetector implements EncodingDetector {

    // size of each read from the stream
    private static final int BUFSIZE = 1024;

    // maximum number of bytes inspected; also used as the mark limit so the
    // caller's stream can be rewound afterwards
    private static final int LOOKAHEAD = 16 * BUFSIZE;

    public Charset detect(InputStream input, Metadata metadata)
            throws IOException {
        if (input == null) {
            return null;
        }

        // mark so the stream can be reset for the caller once detection is done
        input.mark(LOOKAHEAD);
        try {
            UniversalEncodingListener listener =
                    new UniversalEncodingListener(metadata);

            byte[] b = new byte[BUFSIZE];
            int n = 0; // total bytes consumed so far
            int m = input.read(b);
            // stop at EOF, at the lookahead limit, or as soon as the
            // listener has made up its mind
            while (m != -1 && n < LOOKAHEAD && !listener.isDone()) {
                n += m;
                listener.handleData(b, 0, m);
                m = input.read(b, 0, Math.min(b.length, LOOKAHEAD - n));
            }

            return listener.dataEnd();
        } catch (LinkageError e) {
            return null; // juniversalchardet is not available
        } finally {
            input.reset();
        }
    }
}
重点关注的listener.handleData
if (this.done) { return; } if (length > 0) { this.gotData = true; } if (this.start) { this.start = false; if (length > 3) { int b1 = buf[offset] & 0xFF; int b2 = buf[offset+1] & 0xFF; int b3 = buf[offset+2] & 0xFF; int b4 = buf[offset+3] & 0xFF; //判断规则 switch (b1) { case 0xEF: if (b2 == 0xBB && b3 == 0xBF) { this.detectedCharset = Constants.CHARSET_UTF_8; } break; case 0xFE: if (b2 == 0xFF && b3 == 0x00 && b4 == 0x00) { this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_3412; } else if (b2 == 0xFF) { this.detectedCharset = Constants.CHARSET_UTF_16BE; } break; case 0x00: if (b2 == 0x00 && b3 == 0xFE && b4 == 0xFF) { this.detectedCharset = Constants.CHARSET_UTF_32BE; } else if (b2 == 0x00 && b3 == 0xFF && b4 == 0xFE) { this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_2143; } break; case 0xFF: if (b2 == 0xFE && b3 == 0x00 && b4 == 0x00) { this.detectedCharset = Constants.CHARSET_UTF_32LE; } else if (b2 == 0xFE) { this.detectedCharset = Constants.CHARSET_UTF_16LE; } break; } // swich end if (this.detectedCharset != null) { this.done = true; return; } } } // if (start) end int maxPos = offset + length; for (int i=offset; i<maxPos; ++i) { int c = buf[i] & 0xFF; if ((c & 0x80) != 0 && c != 0xA0) { if (this.inputState != InputState.HIGHBYTE) { this.inputState = InputState.HIGHBYTE; if (this.escCharsetProber != null) { this.escCharsetProber = null; } if (this.probers[0] == null) { this.probers[0] = new MBCSGroupProber(); } if (this.probers[1] == null) { this.probers[1] = new SBCSGroupProber(); } if (this.probers[2] == null) { this.probers[2] = new Latin1Prober(); } } } else { if (this.inputState == InputState.PURE_ASCII && (c == 0x1B || (c == 0x7B && this.lastChar == 0x7E))) { this.inputState = InputState.ESC_ASCII; } this.lastChar = buf[i]; } } // for end CharsetProber.ProbingState st; if (this.inputState == InputState.ESC_ASCII) { if (this.escCharsetProber == null) { this.escCharsetProber = new EscCharsetProber(); } st = 
this.escCharsetProber.handleData(buf, offset, length); if (st == CharsetProber.ProbingState.FOUND_IT) { this.done = true; this.detectedCharset = this.escCharsetProber.getCharSetName(); } } else if (this.inputState == InputState.HIGHBYTE) { for (int i=0; i<this.probers.length; ++i) { st = this.probers[i].handleData(buf, offset, length); if (st == CharsetProber.ProbingState.FOUND_IT) { this.done = true; this.detectedCharset = this.probers[i].getCharSetName(); return; } } } else { // pure ascii // do nothing }其实各种不同解析器,基本都配备多个编码解析器,一个个进行匹配,都匹配不上,则返回默认(比如AutoDetectReader的ISO-8859-1,或者设置默认),因为更关注实现这种功能的思路,而并不是各种编码之间的区别,故对最底层的解析判断不进行深入研究.
cpdetector
cpdetector是一个开源的字符检测工具(主页)

/**
 * Detects the charset of a local file with cpdetector's CodepageDetectorProxy
 * (a chain of detectors polled in registration order), then reads one line
 * with the detected charset.
 */
public void parseByIo() {
    try {
        File file = new File("C:\\Users\\admin\\Desktop\\temp\\test.txt");
        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        // the individual detectors, tried in the order they are added
        detector.add(new ByteOrderMarkDetector());
        detector.add(JChardetFacade.getInstance());
        detector.add(new ParsingDetector(true));
        detector.add(ASCIIDetector.getInstance());
        detector.add(UnicodeDetector.getInstance());
        // detect the charset
        java.nio.charset.Charset charset = null;
        charset = detector.detectCodepage(file.toURI().toURL());
        // read the text content
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
        String content = reader.readLine();
        if (!charset.name().startsWith("UTF"))
            content = new String(content.getBytes(charset.name()), "GBK");
        System.out.println("charset : " + charset.name() +"; content : "+ content);
        reader.close();
        /*************** run results ****************/
        /*
        unicode big endian  charset : UTF-16BE; content : 中国
        ansi                charset : windows-1252; content : 中国
        unicode             charset : UTF-16LE; content : 中国
        utf-8               charset : UTF-8; content : 中国
        */
        /*************** dependencies ****************/
        /* https://sourceforge.net/projects/cpdetector/files/cpdetector/javadoc/
         * download these jars and add them to the project:
         * antlr-2.7.4.jar
         * chardet-1.0.jar
         * cpdetector-1.0.10.jar
         * jargs-1.0.jar
         */
    } catch (Exception e) {
        e.printStackTrace();
    }
}
这个源码没有,所以只能用,无法究竟其原理
TikaEncodingDetector
/**
 * Detects the charset with any23's TikaEncodingDetector, then re-opens the
 * file to read its content (FileInputStream does not support reset, so the
 * stream used for detection cannot be rewound and read again).
 */
public void parseByany23() {
    InputStream in = null;
    try {
        in = new FileInputStream("C:\\Users\\admin\\Desktop\\temp\\test.txt");
        TikaEncodingDetector detector = new TikaEncodingDetector();
        String guessEncoding = detector.guessEncoding(in);
        String preGuessEncoding = guessEncoding;
        // anything that is not UTF-x is read as GBK so the Chinese sample survives
        if (!guessEncoding.startsWith("UTF")) {
            guessEncoding = "GBK";
        }
        in.close();
        // read the text content from a freshly opened stream
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream("C:\\Users\\admin\\Desktop\\temp\\test.txt"), guessEncoding));
        String content = reader.readLine();
        System.out.println("charset : " + preGuessEncoding +"; content : "+ content);
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Guard against the FileInputStream constructor having failed: the
        // original called in.close() unconditionally here, which would throw
        // a NullPointerException in that case (and double-closed the stream
        // on the success path; close() is idempotent, so the guard suffices).
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    /*************** run results ****************/
    /*
    unicode big endian  charset : UTF-16BE; content : 中国
    ansi                charset : IBM420_ltr; content : 中国
    unicode             charset : UTF-16LE; content : 中国
    utf-8               charset : UTF-8; content : 中国
    Note: this approach only yields the charset; the detection stream cannot
    be reused to read the content, so the file has to be opened twice
    (FileInputStream does not support reset).
    */
    /*************** dependencies ****************/
    /*
     * pom dependency
    <!-- https://mvnrepository.com/artifact/org.apache.any23/apache-any23-encoding -->
    <dependency>
        <groupId>org.apache.any23</groupId>
        <artifactId>apache-any23-encoding</artifactId>
        <version>1.1</version>
    </dependency>
     */
}核心解析
/**
 * Return an array of all charsets that appear to be plausible
 * matches with the input data. The array is ordered with the
 * best quality match first.
 * <p>
 * Raise an exception if
 * <ul>
 * <li>no charsets appear to match the input data.</li>
 * <li>no input text has been provided</li>
 * </ul>
 *
 * @return An array of CharsetMatch objects representing possibly matching charsets.
 * @stable ICU 3.4
 */
public CharsetMatch[] detectAll() {
    CharsetRecognizer csr;
    int i;
    CharsetMatch charsetMatch;
    int confidence;
    ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();

    // Iterate over all possible charsets, remember all that
    // give a match quality > 0.
    for (i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
        csr = ALL_CS_RECOGNIZERS.get(i).recognizer;
        charsetMatch = csr.match(this);
        if (charsetMatch != null) {
            // low byte of the confidence only (0..255)
            confidence = charsetMatch.getConfidence() & 0x000000ff;
            if (confidence > 0) {
                // Just to be safe, constrain
                confidence = Math.min(confidence, MAX_CONFIDENCE);

                // Apply charset hint.
                if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
                    // Reduce lack of confidence (delta between "sure" and current) by 50%.
                    confidence += (MAX_CONFIDENCE - confidence) / 2;
                }

                CharsetMatch m = new CharsetMatch(this, csr, confidence,
                        charsetMatch.getName(), charsetMatch.getLanguage());
                matches.add(m);
            }
        }
    }
    Collections.sort(matches);    // CharsetMatch compares on confidence
    Collections.reverse(matches); // Put best match first.
    CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
    resultArray = matches.toArray(resultArray);
    return resultArray;
}
基于以上原理,tika还有一个解析类也是同样的用法
/**
 * Detects the charset with CharsetDetector/CharsetMatch; the input stream
 * must support mark/reset (hence the BufferedInputStream wrapper).
 */
@Test
public void parseByCharsetDetector () {
    try {
        BufferedInputStream in = new BufferedInputStream(new FileInputStream(new File("C:\\Users\\admin\\Desktop\\temp\\test.txt")));
        System.out.println(in.markSupported());
        CharsetDetector detector = new CharsetDetector();
        detector.setText(in);
        CharsetMatch cm = detector.detect();
        String charsetName = cm.getName();
        BufferedReader reader=null;
        if (!charsetName.startsWith("UTF")) {
            // re-read the (reset) stream as GBK instead of the detected name
            reader= new BufferedReader(new InputStreamReader(in, "GBK"));
        } else {
            reader = new BufferedReader(cm.getReader());
        }
        // read the text content
        String content = reader.readLine();
        System.out.println("charset : " + charsetName +"; content : "+ content);
        reader.close();
        in.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    /*************** run results ****************/
    /*
    unicode big endian  charset : UTF-16LE; content : 中国
    ansi                charset : IBM420_ltr; content : 中国
    unicode             charset : UTF-16BE; content : 中国
    utf-8               charset : UTF-8; content : 中国
    Note: this approach requires a stream that supports reset; the detected
    charset here is IBM420_ltr, which would fail when decoding, so the
    stream has to be read again (with GBK).
    */
    /*************** dependencies ****************/
    /*
     * pom dependency
    <!-- https://mvnrepository.com/artifact/org.codehaus.groovy/groovy-all -->
    <!-- https://mvnrepository.com/artifact/org.apache.any23/apache-any23-encoding -->
    <dependency>
        <groupId>org.apache.any23</groupId>
        <artifactId>apache-any23-encoding</artifactId>
        <version>1.1</version>
    </dependency>
     */
}
CharsetToolkit
/**
 * Detects the charset with Groovy's CharsetToolkit; anything it cannot
 * recognize falls back to the configured default charset (GBK here).
 */
@Test
public void parseByCharsetToolkit() {
    File file = new File("C:\\Users\\admin\\Desktop\\temp\\test.txt");
    try {
        CharsetToolkit detector = new CharsetToolkit(file);
        detector.setDefaultCharset(Charset.forName("GBK"));
        Charset charset = detector.getCharset();
        BufferedReader reader = detector.getReader();
        // read the text content
        String content = reader.readLine();
        if (!charset.name().startsWith("UTF"))
            content = new String(content.getBytes(charset.name()), "GBK");
        System.out.println("charset : " + charset.name() +"; content : "+ content);
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    /*************** run results ****************/
    /*
    unicode big endian  charset : UTF-16BE; content : 中国
    ansi                charset : GBK; content : 中国
    unicode             charset : UTF-16LE; content : 中国
    utf-8               charset : UTF-8; content : 中国
    Note: anything this toolkit cannot recognize is decoded with its default
    charset, so a sensible default must be configured.
    */
    /*************** dependencies ****************/
    /*
     * pom dependency
    <!-- https://mvnrepository.com/artifact/org.codehaus.groovy/groovy-all -->
    <dependency>
        <groupId>org.codehaus.groovy</groupId>
        <artifactId>groovy-all</artifactId>
        <version>2.4.12</version>
    </dependency>
     */
}核心源码
/**
 * Guess the encoding of the provided buffer.
 * If Byte Order Markers are encountered at the beginning of the buffer, we immediately
 * return the charset implied by this BOM. Otherwise, the file would not be a human
 * readable text file.
 * <p>
 * If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
 * If it is not UTF-8, we assume the encoding is the default system encoding
 * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).
 * <p>
 * It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.
 * <pre>
 * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0400 0000-7FFF FFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * </pre>
 * With UTF-8, 0xFE and 0xFF never appear.
 *
 * @return the Charset recognized.
 */
private Charset guessEncoding() {
    // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
    // otherwise, the file would not be human readable
    if (hasUTF8Bom())
        return Charset.forName("UTF-8");
    if (hasUTF16LEBom())
        return Charset.forName("UTF-16LE");
    if (hasUTF16BEBom())
        return Charset.forName("UTF-16BE");

    // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
    // otherwise, the file is in US-ASCII
    boolean highOrderBit = false;

    // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
    // if it's not the case, we can assume the encoding is the default encoding of the system
    boolean validU8Char = true;

    // TODO the buffer is not read up to the end, but up to length - 6
    int length = buffer.length;
    int i = 0;
    while (i < length - 6) {
        byte b0 = buffer[i];
        byte b1 = buffer[i + 1];
        byte b2 = buffer[i + 2];
        byte b3 = buffer[i + 3];
        byte b4 = buffer[i + 4];
        byte b5 = buffer[i + 5];
        if (b0 < 0) {
            // a high order bit was encountered, thus the encoding is not US-ASCII
            // it may be either an 8-bit encoding or UTF-8
            highOrderBit = true;
            // a two-bytes sequence was encountered
            if (isTwoBytesSequence(b0)) {
                // there must be one continuation byte of the form 10xxxxxx,
                // otherwise the following character is is not a valid UTF-8 construct
                if (!isContinuationChar(b1))
                    validU8Char = false;
                else
                    i++;
            }
            // a three-bytes sequence was encountered
            else if (isThreeBytesSequence(b0)) {
                // there must be two continuation bytes of the form 10xxxxxx,
                // otherwise the following character is is not a valid UTF-8 construct
                if (!(isContinuationChar(b1) && isContinuationChar(b2)))
                    validU8Char = false;
                else
                    i += 2;
            }
            // a four-bytes sequence was encountered
            else if (isFourBytesSequence(b0)) {
                // there must be three continuation bytes of the form 10xxxxxx,
                // otherwise the following character is is not a valid UTF-8 construct
                if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
                    validU8Char = false;
                else
                    i += 3;
            }
            // a five-bytes sequence was encountered
            else if (isFiveBytesSequence(b0)) {
                // there must be four continuation bytes of the form 10xxxxxx,
                // otherwise the following character is is not a valid UTF-8 construct
                if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4)))
                    validU8Char = false;
                else
                    i += 4;
            }
            // a six-bytes sequence was encountered
            else if (isSixBytesSequence(b0)) {
                // there must be five continuation bytes of the form 10xxxxxx,
                // otherwise the following character is is not a valid UTF-8 construct
                if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4) && isContinuationChar(b5)))
                    validU8Char = false;
                else
                    i += 5;
            }
            else
                validU8Char = false;
        }
        if (!validU8Char)
            break;
        i++;
    }

    // if no byte with an high order bit set, the encoding is US-ASCII
    // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
    if (!highOrderBit) {
        // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
        if (this.enforce8Bit)
            return this.defaultCharset;
        else
            return Charset.forName("US-ASCII");
    }

    // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
    // otherwise the file would not be human readable
    if (validU8Char)
        return Charset.forName("UTF-8");

    // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
    return this.defaultCharset;
}
相关文章推荐
- 识别常见编码格式文件并转换成UTF-8编码的java实现
- php自动识别文件编码并转换为UTF-8的方法
- VIM 文件编码识别与乱码处理
- VIM 文件编码识别与乱码处理
- php自动识别文件编码并转换为UTF-8的方法
- java编程识别文件的编码
- ASP.NET自动识别GB2312与UTF-8编码的文件
- C#如何自动识别文件的编码
- Java 自动识别文件编码
- C#自动识别文件编码
- 十分钟内学会:自动识别GB2312与UTF-8编码的文件
- VIM 文件编码识别与乱码处理
- C++ UTF-8编码识别(分析文件内容,非文件头)
- java识别文件编码格式代码(无引用jar包,测试可行)
- 识别常见编码格式文件并转换成UTF-8编码 的java实现 源码
- 自动识别文件编码
- Vim文件编码识别与乱码处理
- php识别文件编码,并读出内容,对大文件也很有效率
- linux 文件编码识别
- 用file来识别文件的编码方式