您的位置:首页 > 其它

Dom4j解析XML中遇到的一些问题

2013-03-11 16:45 405 查看
最近在用Dom4j解析XML文件,遇到了一些问题,记录如下:

1. BOM头问题:文件开头带有BOM(字节顺序标记)时,解析器会把这几个字节当作XML声明之前的非法内容,得到的异常是:

Nested exception: org.xml.sax.SAXParseException: Content is not allowed in prolog.

(1)参考 http://koti.mbnet.fi/akini/java/unicodereader/ ,该页面提供了两种去掉BOM头的方法(我用的是第一种):

XmlUtil

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PushbackInputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.List;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.XPath;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An {@link InputStream} wrapper that can detect a Unicode byte-order mark (BOM) at the start of
 * the wrapped stream, plus two static helpers for preparing and querying XML with dom4j.
 *
 * <p>The BOM is only detected and skipped when {@link #getEncoding()} (or
 * {@link #initXmlBOM()}) is called <em>before</em> the first read. Reading or closing the stream
 * first disables detection, and the bytes are then passed through untouched.
 *
 * <p>Instances are not thread-safe.
 */
public class XmlUtil extends InputStream {
    private static final Logger log = LoggerFactory.getLogger(XmlUtil.class);

    /** Longest BOM we recognise (UTF-32), and therefore the push-back capacity we need. */
    private static final int BOM_SIZE = 4;
    private static final String COMMENT_OPEN = "<!--";
    private static final String COMMENT_CLOSE = "-->";

    PushbackInputStream internalIn;
    boolean isInited = false;
    String defaultEnc;
    String encoding;

    /**
     * Wraps the given stream.
     *
     * @param in         stream to read from
     * @param defaultEnc encoding reported by {@link #getEncoding()} when no BOM is present
     */
    public XmlUtil(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the fallback encoding passed to the constructor
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Only the BOM bytes are consumed;
     * everything else is pushed back onto the stream. Does nothing if detection already ran or
     * was disabled by an earlier read/close.
     *
     * @throws IOException if reading from or pushing back to the underlying stream fails
     */
    protected void initXmlBOM() throws IOException {
        if (isInited) {
            return;
        }

        // Fill the look-ahead buffer; a single read() may legally return fewer bytes than asked.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count < 0) {
                break;
            }
            n += count;
        }

        // Every test is guarded by the number of bytes actually read, so the zero-filled tail
        // of a short file can never be mistaken for part of a BOM. The UTF-32LE test must come
        // before UTF-16LE because both start with FF FE.
        int bomLength;
        if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                && bom[2] == (byte) 0xBF) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // No BOM: fall back to the default and give every byte back.
            encoding = defaultEnc;
            bomLength = 0;
        }

        if (n > bomLength) {
            internalIn.unread(bom, bomLength, n - bomLength);
        }

        isInited = true;
    }

    /**
     * Returns the encoding implied by the BOM, running BOM detection first if it has not run
     * yet.
     *
     * @return the detected encoding, the default encoding when there is no BOM, or {@code null}
     *         if the stream was read or closed before detection could run
     * @throws IllegalStateException if BOM detection fails with an I/O error (the
     *                               {@link IOException} is attached as the cause)
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                initXmlBOM();
            } catch (IOException ex) {
                // Attach the real cause; the previous self-referencing initCause() call threw
                // IllegalArgumentException and hid the I/O failure.
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Rewrites the file in place with its BOM and all {@code <!-- ... -->} comments removed.
     *
     * <p>The file is decoded using the encoding implied by its BOM (ISO-8859-1 when there is
     * none, which round-trips arbitrary bytes) and written back in that same encoding, so the
     * remaining content is preserved, line breaks included. Comment markers are matched
     * textually, so comment-like text inside CDATA sections is removed as well. An unterminated
     * comment is left untouched.
     *
     * <p>I/O failures are logged rather than thrown. If reading fails the file is not modified.
     *
     * @param filePath path of the XML file to clean
     */
    public static void removeXmlBomAndComment(String filePath) {
        String enc;
        String content;

        XmlUtil uins = null;
        BufferedReader reader = null;
        try {
            uins = new XmlUtil(new FileInputStream(filePath), "ISO-8859-1");
            // Run detection directly so I/O errors follow the same IOException path below.
            uins.initXmlBOM();
            enc = uins.getEncoding();
            reader = new BufferedReader(new InputStreamReader(uins, enc));
            content = readAll(reader);
        } catch (IOException e) {
            log.error("Failed to read XML file " + filePath, e);
            return;
        } finally {
            closeQuietly(reader);
            closeQuietly(uins);
        }

        String cleaned = stripComments(content);

        FileOutputStream fileOut = null;
        OutputStreamWriter writer = null;
        try {
            fileOut = new FileOutputStream(filePath);
            // Write with the encoding used for reading; the platform default could differ and
            // corrupt non-ASCII content.
            writer = new OutputStreamWriter(fileOut, enc);
            writer.write(cleaned);
            writer.flush();
        } catch (IOException e) {
            log.error("Failed to write XML file " + filePath, e);
        } finally {
            closeQuietly(writer);
            closeQuietly(fileOut);
        }
    }

    /** Reads the remaining characters of the reader, keeping line terminators intact. */
    private static String readAll(BufferedReader reader) throws IOException {
        StringBuilder text = new StringBuilder();
        char[] chunk = new char[8192];
        int count;
        while ((count = reader.read(chunk)) != -1) {
            text.append(chunk, 0, count);
        }
        return text.toString();
    }

    /** Returns the text with every complete {@code <!-- ... -->} span removed, in one pass. */
    private static String stripComments(String text) {
        int open = text.indexOf(COMMENT_OPEN);
        if (open < 0) {
            return text;
        }

        StringBuilder out = new StringBuilder(text.length());
        int pos = 0;
        while (open >= 0) {
            // Search for the terminator only after the opener, so a stray "-->" earlier in the
            // text can neither stop the scan nor produce a negative substring index.
            int close = text.indexOf(COMMENT_CLOSE, open + COMMENT_OPEN.length());
            if (close < 0) {
                break; // unterminated comment: keep the remainder as-is
            }
            out.append(text, pos, open);
            pos = close + COMMENT_CLOSE.length();
            open = text.indexOf(COMMENT_OPEN, pos);
        }
        out.append(text, pos, text.length());
        return out.toString();
    }

    /** Closes the resource if non-null, logging (not propagating) any failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            log.warn("Failed to close resource", e);
        }
    }

    /**
     * Selects all descendants named {@code node} in the root element's namespace.
     *
     * <p>When the root element declares a default namespace, an unprefixed XPath does not match
     * its children. This binds the root's namespace URI to the prefix {@code mvn} and evaluates
     * {@code //mvn:<node>} against the whole document.
     *
     * @param root root element whose namespace URI is used for the lookup
     * @param node local name of the elements to select
     * @return the matching elements as returned by dom4j
     */
    public static List<Element> getNameSpaceElement(Element root, String node) {
        HashMap<String, String> namespaces = new HashMap<String, String>();
        namespaces.put("mvn", root.getNamespaceURI());

        XPath xpath = DocumentHelper.createXPath("//mvn:" + node);
        xpath.setNamespaceURIs(namespaces);

        @SuppressWarnings("unchecked")
        List<Element> selectedNodes = (List<Element>) xpath.selectNodes(root.getDocument());
        return selectedNodes;
    }

    /**
     * Closes the underlying stream. BOM detection is deliberately not triggered here.
     */
    @Override
    public void close() throws IOException {
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads one byte. BOM detection is deliberately not triggered here: call
     * {@link #getEncoding()} first if the BOM should be skipped.
     */
    @Override
    public int read() throws IOException {
        isInited = true;
        return internalIn.read();
    }

    /**
     * Bulk read delegating to the underlying stream, so wrapping readers do not fall back to the
     * byte-at-a-time default of {@link InputStream}. Same BOM contract as {@link #read()}.
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        isInited = true;
        return internalIn.read(b, off, len);
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: