Dom4j解析XML中遇到的一些问题
2013-03-11 16:45
405 查看
最近在用Dom4j解析XML文件,遇到了一些问题,记录如下:
1. BOM头问题,得到的异常是:
Nested exception: org.xml.sax.SAXParseException: Content is not allowed in prolog.
(1)http://koti.mbnet.fi/akini/java/unicodereader/,里面提供了两个删掉BOM头的方法(我用了第一个):
XmlUtil
1. BOM头问题,得到的异常是:
Nested exception: org.xml.sax.SAXParseException: Content is not allowed in prolog.
(1)http://koti.mbnet.fi/akini/java/unicodereader/,里面提供了两个删掉BOM头的方法(我用了第一个):
XmlUtil
import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PushbackInputStream; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.List; import org.dom4j.DocumentHelper; import org.dom4j.Element; import org.dom4j.XPath; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class XmlUtil extends InputStream { private static final Logger log = (Logger) LoggerFactory .getLogger(XmlUtil.class); private static final int BOM_SIZE = 4; PushbackInputStream internalIn; boolean isInited = false; String defaultEnc; String encoding; public XmlUtil(InputStream in, String defaultEnc) { internalIn = new PushbackInputStream(in, BOM_SIZE); this.defaultEnc = defaultEnc; } public String getDefaultEncoding() { return defaultEnc; } /** * Read-ahead four bytes and check for BOM marks. Extra bytes are unread * back to the stream, only BOM bytes are skipped. */ protected void initXmlBOM() throws IOException { if (isInited) return; byte bom[] = new byte[BOM_SIZE]; int n, unread; n = internalIn.read(bom, 0, bom.length); if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) { encoding = "UTF-32BE"; unread = n - 4; } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) { encoding = "UTF-32LE"; unread = n - 4; } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) { encoding = "UTF-8"; unread = n - 3; } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) { encoding = "UTF-16BE"; unread = n - 2; } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) { encoding = "UTF-16LE"; unread = n - 2; } else { // Unicode BOM mark not found, unread all bytes encoding = defaultEnc; unread = n; } // log.info("read=" + n + ", unread=" + unread); if (unread > 0) internalIn.unread(bom, (n - unread), unread); isInited = true; } public String getEncoding() { if (!isInited) { try { initXmlBOM(); } catch (IOException ex) { IllegalStateException ise = new IllegalStateException( "Init method failed."); ise.initCause(ise); throw ise; } } return encoding; } public static void removeXmlBomAndComment(String filePath) { XmlUtil uins = null; BufferedReader bufr = null; OutputStreamWriter osw = null; String enc = "ISO-8859-1"; String fileContent = ""; String leftBracket = "<!--"; String rightBracket = "-->"; int leftBracketIndex = 0; int rightBracketIndex = 0; String line = ""; StringBuffer fileContentBuffer = new StringBuffer(); try { // 根据BOM Mark编码方式,对文件进行重新编码 uins = new XmlUtil(new FileInputStream(filePath), enc); enc = uins.getEncoding(); if (enc == null) { bufr = new BufferedReader(new InputStreamReader(uins)); } else { bufr = new BufferedReader(new InputStreamReader(uins, enc)); } while ((line = bufr.readLine()) != null) { fileContentBuffer.append(line); } uins.close(); bufr.close(); // 删除"<!-- -->"格式的注释 fileContent = fileContentBuffer.toString(); leftBracketIndex = fileContent.indexOf(leftBracket); rightBracketIndex = fileContent.indexOf(rightBracket); while (leftBracketIndex < rightBracketIndex && rightBracketIndex != 0) { fileContent = fileContent.substring(0, leftBracketIndex) + fileContent.substring(rightBracketIndex + 3, fileContent.length()); leftBracketIndex = fileContent.indexOf(leftBracket); rightBracketIndex = fileContent.indexOf(rightBracket); } // 将处理过的内容,写入文件 osw = new OutputStreamWriter(new FileOutputStream(filePath)); osw.write(fileContent); osw.flush(); osw.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (uins != null) { try { uins.close(); } catch (IOException e) { e.printStackTrace(); } } if (bufr != null) { try { bufr.close(); } catch (IOException e) { e.printStackTrace(); } } if (osw != null) { try { osw.close(); } catch (IOException e) { e.printStackTrace(); } } } } /** * 如果根元素有声明命名空间,通过xpath匹配子元素时,需要特殊处理。 * */ public static List<Element> getNameSpaceElement(Element root, String node) { // 获得节点的命名空间 HashMap<String, String> map = new HashMap<String, String>(); map.put("mvn", root.getNamespaceURI()); XPath xpath = DocumentHelper.createXPath("//mvn:" + node); xpath.setNamespaceURIs(map); @SuppressWarnings("unchecked") List<Element> selectedNodes = (List<Element>) xpath.selectNodes(root .getDocument()); return selectedNodes; } @Override public void close() throws IOException { // init(); isInited = true; internalIn.close(); } @Override public int read() throws IOException { // init(); isInited = true; return internalIn.read(); } }
相关文章推荐
- 解析xml时遇到的一些问题
- dom4j用XPath解析xml 遇到的问题
- 解析xml时遇到的一些问题
- dom4j解析xml时遇到的问题
- 解析xml时遇到的一些问题
- 使用dom4j解析xml文件时遇到一个怪问题
- java xml文件解析时遇到的编码问题
- 用Dom4j解析XML及中文问题
- C++ tinyxml解析小试及VC6.0调试中遇到的问题
- dom4j解析xml遇中文,加载报错问题
- dom4j 使用xpath 解析 persistence.xml 出现xmlns后不能解析问题解决
- XML类似的解析时,会遇到'XXX' 不是 'NCName' 的有效值的问题
- opencv3.3 级联分类器生成xml以及遇到的一些问题
- 用dom4j解析xml文件写入记事本时,解决换行问题
- 使用dom4j解析xml 遇到困难
- DOM解析XML遇到的子节点个数问题
- 解析xml中遇到问题
- VisualSVN Server端创建遇到的一些问题解析
- 用eclipse配置hibernate的hbm.xml文件时遇到的一些问题
- 用Dom4j解析XML及中文问题(一)