您的位置:首页 > Web前端 > JavaScript

让你的Jsoup支持Xpath

2016-09-29 16:31 253 查看
Xpath是专业的xml结构化文档的查询语言,语法功能强大,本文不涉及xpath语法教程。

jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据,但是选取某个元素时还是没有xpath那么简单直接,而且xpath带了很多选择库。

然而遗憾的是,jsoup并不支持xpath,于是博主就写了一个让jsoup支持xpath的工具类,希望能帮助到有需要的朋友!

下图是测试效果

下面贴上源码:

package com.lhh.parse;

import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import com.sun.org.apache.xerces.internal.dom.ElementImpl;

/**
 * Jsoup的xpath解析工具类
 * 
 * @author liuhh
 *
 */
@SuppressWarnings("restriction")
public class JsoupParserUtils {

<span style="white-space:pre">	</span>protected final static DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

<span style="white-space:pre">	</span>private final static Logger log = LoggerFactory.getLogger(JsoupParserUtils.class);

<span style="white-space:pre">	</span>private final static XPath xPath = XPathFactory.newInstance().newXPath();

<span style="white-space:pre">	</span>protected static TransformerFactory tf = TransformerFactory.newInstance();

<span style="white-space:pre">	</span>private static final Lock LOCK = new ReentrantLock();

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 得到该节点的子节点个数
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param ele
<span style="white-space:pre">	</span> * @param xpath
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static int getEleChildNum(final org.jsoup.nodes.Element ele, final String xpath) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>Object res = parse(ele, xpath, XPathConstants.NODESET);
<span style="white-space:pre">			</span>if (null != res && res instanceof NodeList) {
<span style="white-space:pre">				</span>NodeList nodeList = (NodeList) res;
<span style="white-space:pre">				</span>return nodeList == null ? 0 : nodeList.getLength();
<span style="white-space:pre">			</span>}
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>log.error("根据xpath:{},获取子节点个数出现错误,错误原因:" + e.getMessage(), xpath);
<span style="white-space:pre">		</span>}
<span style="white-space:pre">		</span>return 0;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 判断文档中是否存在xpath节点
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param document
<span style="white-space:pre">	</span> * @param xpath
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static boolean exists(final org.jsoup.nodes.Element ele, final String xpath) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>Object res = parse(ele, xpath, XPathConstants.BOOLEAN);
<span style="white-space:pre">			</span>if (null != res && res instanceof Boolean) {
<span style="white-space:pre">				</span>return (boolean) res;
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>return false;
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>log.error("检查xpath:{},是否存在时出现错误,!" + e.getMessage(), xpath);
<span style="white-space:pre">		</span>}
<span style="white-space:pre">		</span>return false;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 根据xpath得到w3c的Element对象
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param document
<span style="white-space:pre">	</span> * @param xpath
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static ElementImpl getW3cElementImpl(final org.jsoup.nodes.Element ele, final String xpath) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>Object res = parse(ele, xpath, XPathConstants.NODE);
<span style="white-space:pre">			</span>if (null != res && res instanceof ElementImpl) {
<span style="white-space:pre">				</span>return (ElementImpl) res;
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>return null;
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>log.error("根据xpath:{},得到w3c的Element对象出现错误,原因:" + e.getMessage(), xpath);
<span style="white-space:pre">		</span>}
<span style="white-space:pre">		</span>return null;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 根据xpath得到jsoup的Element对象
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param document
<span style="white-space:pre">	</span> * @param xpath
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static org.jsoup.nodes.Element getJsoupElement(final org.jsoup.nodes.Element ele, final String xpath) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>Object res = parse(ele, xpath, XPathConstants.NODE);
<span style="white-space:pre">			</span>if (null != res && res instanceof ElementImpl) {
<span style="white-space:pre">				</span>ElementImpl elementImpl = (ElementImpl) res;
<span style="white-space:pre">				</span>return getJsoupEle(elementImpl);
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>return null;
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>log.error("根据xpath:{},得到jsoup的Element对象出现错误,原因:" + e.getMessage(), xpath);
<span style="white-space:pre">		</span>}
<span style="white-space:pre">		</span>return null;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 根据xpath得到jsoup的Elements对象
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param document
<span style="white-space:pre">	</span> * @param xpath
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static Elements getJsoupElements(final org.jsoup.nodes.Element ele, final String xpath) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>NodeList nodeList = getNodeList(ele, xpath);
<span style="white-space:pre">			</span>if (null != nodeList && nodeList.getLength() > 0) {
<span style="white-space:pre">				</span>int len = nodeList.getLength();
<span style="white-space:pre">				</span>Elements elements = new Elements();
<span style="white-space:pre">				</span>for (int i = 0; i < len; i++) {
<span style="white-space:pre">					</span>Node node = nodeList.item(i);
<span style="white-space:pre">					</span>if (null != node && node instanceof ElementImpl) {
<span style="white-space:pre">						</span>org.jsoup.node
eafd
s.Element element = getJsoupEle(((ElementImpl) node));
<span style="white-space:pre">						</span>elements.add(element);
<span style="white-space:pre">					</span>}
<span style="white-space:pre">				</span>}
<span style="white-space:pre">				</span>return elements;
<span style="white-space:pre">			</span>}
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>log.error("根据xpath:{},得到jsoup的Element对象出现错误,原因:" + e.getMessage(), xpath);
<span style="white-space:pre">		</span>}
<span style="white-space:pre">		</span>return null;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 从Jsoup的Element中解析出W3C的NodeList
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param ele
<span style="white-space:pre">	</span> * @param xpath
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static NodeList getNodeList(final org.jsoup.nodes.Element ele, final String xpath) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>Object res = parse(ele, xpath, XPathConstants.NODESET);
<span style="white-space:pre">			</span>if (null != res && res instanceof NodeList) {
<span style="white-space:pre">				</span>return (NodeList) res;
<span style="white-space:pre">			</span>}
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>log.error(e.getMessage(), e);
<span style="white-space:pre">		</span>}

<span style="white-space:pre">		</span>return null;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 得到节点的某一个属性值
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param document
<span style="white-space:pre">	</span> * @param xpath
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static String getXpathString(final org.jsoup.nodes.Element ele, final String xpath) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>int textNum = getEleChildNum(ele, xpath);
<span style="white-space:pre">			</span>if (1 == textNum) {
<span style="white-space:pre">				</span>Object res = parse(ele, xpath, XPathConstants.STRING);
<span style="white-space:pre">				</span>if (null != res) {
<span style="white-space:pre">					</span>return res.toString();
<span style="white-space:pre">				</span>}
<span style="white-space:pre">			</span>} else {
<span style="white-space:pre">				</span>List<String> res = getXpathListString(ele, xpath);
<span style="white-space:pre">				</span>if (res != null && res.size() > 0) {
<span style="white-space:pre">					</span>StringBuilder stringBuilder = new StringBuilder();
<span style="white-space:pre">					</span>for (Iterator<String> iterator = res.iterator(); iterator.hasNext();) {
<span style="white-space:pre">						</span>String text = iterator.next();
<span style="white-space:pre">						</span>if (null != text) {
<span style="white-space:pre">							</span>stringBuilder.append(text.replace("\r\n", "."));
<span style="white-space:pre">						</span>}
<span style="white-space:pre">					</span>}
<span style="white-space:pre">					</span>return stringBuilder.toString();
<span style="white-space:pre">				</span>}
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>return null;
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>e.printStackTrace();
<span style="white-space:pre">			</span>log.error("根据xpath:{}查询字符串时出现错误:" + e.getMessage(), xpath);
<span style="white-space:pre">		</span>}
<span style="white-space:pre">		</span>return null;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 查询字符串列表
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param document
<span style="white-space:pre">	</span> * @param xpath
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static List<String> getXpathListString(final org.jsoup.nodes.Element ele, final String xpath) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>Object res = parse(ele, xpath, XPathConstants.NODESET);
<span style="white-space:pre">			</span>if (null != res && res instanceof NodeList) {
<span style="white-space:pre">				</span>NodeList nodeList = (NodeList) res;
<span style="white-space:pre">				</span>int length = nodeList.getLength();
<span style="white-space:pre">				</span>if (length <= 0) {
<span style="white-space:pre">					</span>return null;
<span style="white-space:pre">				</span>}
<span style="white-space:pre">				</span>List<String> list = new ArrayList<>();
<span style="white-space:pre">				</span>for (int i = 0; i < length; i++) {
<span style="white-space:pre">					</span>Node node = nodeList.item(i);
<span style="white-space:pre">					</span>list.add(null == node ? null : node.getNodeValue());
<span style="white-space:pre">				</span>}
<span style="white-space:pre">				</span>return list;
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>return null;
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>log.error("根据xpath:{}查询字符串列表时出现错误:" + e.getMessage(), xpath);
<span style="white-space:pre">		</span>}
<span style="white-space:pre">		</span>return null;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 获取xpath解析结果
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param doc
<span style="white-space:pre">	</span> * @param xPathStr
<span style="white-space:pre">	</span> * @param qName
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static Object parse(final org.jsoup.nodes.Element doc, final String xPathStr, final QName qName) {
<span style="white-space:pre">		</span>Node node = fromJsoup(doc);
<span style="white-space:pre">		</span>return parse(node, xPathStr, qName);
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param doc
<span style="white-space:pre">	</span> * @param xPathStr
<span style="white-space:pre">	</span> * @param qName
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static Object parse(final Node doc, final String xPathStr, final QName qName) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>if (doc == null) {
<span style="white-space:pre">				</span>log.warn("解析文档为null!");
<span style="white-space:pre">				</span>return null;
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>if (StringUtils.isBlank(xPathStr)) {
<span style="white-space:pre">				</span>log.warn("解析的Xpath路径为空!");
<span style="white-space:pre">				</span>return null;
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>if (null == qName) {
<span style="white-space:pre">				</span>log.warn("解析类型为null!");
<span style="white-space:pre">				</span>return null;
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>try {
<span style="white-space:pre">				</span>LOCK.lock();
<span style="white-space:pre">				</span>Object res = xPath.evaluate(xPathStr, doc, qName);
<span style="white-space:pre">				</span>return res;
<span style="white-space:pre">			</span>} finally {
<span style="white-space:pre">				</span>// TODO: handle finally clause
<span style="white-space:pre">				</span>LOCK.unlock();
<span style="white-space:pre">			</span>}
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>log.warn("解析Xpath:{},出现错误,解析类型:{},错误原因:{}!", xPathStr, qName, e.getMessage());
<span style="white-space:pre">		</span>}
<span style="white-space:pre">		</span>return null;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 根据ElementImpl得到Jsoup的Element
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param elementImpl
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static org.jsoup.nodes.Element getJsoupEle(final ElementImpl elementImpl) {
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>String value = getW3cDocString(elementImpl);
<span style="white-space:pre">			</span>org.jsoup.nodes.Document document = Jsoup.parse(value);
<span style="white-space:pre">			</span>return document.body().child(0);
<span style="white-space:pre">		</span>} catch (Exception e) {
<span style="white-space:pre">			</span>// TODO: handle exception
<span style="white-space:pre">			</span>log.error("根据ElementImpl得到Jsoup的Element出现错误,错误原因:" + e.getMessage());
<span style="white-space:pre">			</span>return null;
<span style="white-space:pre">		</span>}

<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 将w3c的Document转为jsoup的Document
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param in
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static org.jsoup.nodes.Document fromW3C(final Document doc) throws Exception {
<span style="white-space:pre">		</span>String string = getW3cDocString(doc);
<span style="white-space:pre">		</span>org.jsoup.nodes.Document res = Jsoup.parse(string);
<span style="white-space:pre">		</span>return res;

<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 将jsoup的Document转为w3c的Document
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param in
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static Node fromJsoup(final org.jsoup.nodes.Element in) {
<span style="white-space:pre">		</span>DocumentBuilder builder;
<span style="white-space:pre">		</span>try {
<span style="white-space:pre">			</span>if (null == in) {
<span style="white-space:pre">				</span>return null;
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>builder = factory.newDocumentBuilder();
<span style="white-space:pre">			</span>Document out = builder.newDocument();
<span style="white-space:pre">			</span>if (in instanceof org.jsoup.nodes.Document) {
<span style="white-space:pre">				</span>List<org.jsoup.nodes.Node> childs = in.childNodes();
<span style="white-space:pre">				</span>if (childs != null && childs.size() > 0) {
<span style="white-space:pre">					</span>org.jsoup.nodes.Element rootEl = in.child(0);
<span style="white-space:pre">					</span>NodeTraversor traversor = new NodeTraversor(new W3CBuilder(out));
<span style="white-space:pre">					</span>traversor.traverse(rootEl);
<span style="white-space:pre">					</span>return out;
<span style="white-space:pre">				</span>} else {
<span style="white-space:pre">					</span>// out.setNodeValue(in.);
<span style="white-space:pre">					</span>return out;
<span style="white-space:pre">				</span>}
<span style="white-space:pre">			</span>}else if (in instanceof org.jsoup.nodes.Element) {
<span style="white-space:pre">				</span>NodeTraversor traversor = new NodeTraversor(new W3CBuilder(out));
<span style="white-space:pre">				</span>traversor.traverse(in);
<span style="white-space:pre">				</span>return out;
<span style="white-space:pre">			</span>}

<span style="white-space:pre">		</span>} catch (ParserConfigurationException e) {
<span style="white-space:pre">			</span>return null;
<span style="white-space:pre">		</span>}
<span style="white-space:pre">		</span>return null;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 将W3c的doc转为字符串
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param doc
<span style="white-space:pre">	</span> * @return
<span style="white-space:pre">	</span> * @throws Exception
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static String getW3cDocString(final Node doc) throws Exception {
<span style="white-space:pre">		</span>try (StringWriter writer = new StringWriter()) {
<span style="white-space:pre">			</span>DOMSource domSource = new DOMSource(doc);
<span style="white-space:pre">			</span>StreamResult result = new StreamResult(writer);
<span style="white-space:pre">			</span>LOCK.lock();
<span style="white-space:pre">			</span>try {
<span style="white-space:pre">				</span>Transformer transformer = tf.newTransformer();
<span style="white-space:pre">				</span>transformer.transform(domSource, result);
<span style="white-space:pre">				</span>return writer.toString();
<span style="white-space:pre">			</span>} finally {
<span style="white-space:pre">				</span>LOCK.unlock();
<span style="white-space:pre">			</span>}
<span style="white-space:pre">		</span>} catch (TransformerException e) {
<span style="white-space:pre">			</span>throw new IllegalStateException(e);
<span style="white-space:pre">		</span>}
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>/**
<span style="white-space:pre">	</span> * 将Jsoup的node属性拷贝到w3c的Element中
<span style="white-space:pre">	</span> * 
<span style="white-space:pre">	</span> * @param source
<span style="white-space:pre">	</span> * @param el
<span style="white-space:pre">	</span> */
<span style="white-space:pre">	</span>public static void copyAttributes(final org.jsoup.nodes.Node source, final Element el) {
<span style="white-space:pre">		</span>for (Attribute attribute : source.attributes()) {
<span style="white-space:pre">			</span>el.setAttribute(attribute.getKey(), attribute.getValue());
<span style="white-space:pre">		</span>}
<span style="white-space:pre">	</span>}

}

class W3CBuilder implements NodeVisitor {
<span style="white-space:pre">	</span>private final Document doc;
<span style="white-space:pre">	</span>private Element dest;

<span style="white-space:pre">	</span>public W3CBuilder(Document doc) {
<span style="white-space:pre">		</span>this.doc = doc;
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>public void head(final org.jsoup.nodes.Node source, int depth) {
<span style="white-space:pre">		</span>if (source instanceof org.jsoup.nodes.Element) {
<span style="white-space:pre">			</span>org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
<span style="white-space:pre">			</span>Element el = doc.createElement(sourceEl.tagName());
<span style="white-space:pre">			</span>JsoupParserUtils.copyAttributes(sourceEl, el);
<span style="white-space:pre">			</span>if (dest == null) {
<span style="white-space:pre">				</span>doc.appendChild(el);
<span style="white-space:pre">			</span>} else {
<span style="white-space:pre">				</span>dest.appendChild(el);
<span style="white-space:pre">			</span>}
<span style="white-space:pre">			</span>dest = el;
<span style="white-space:pre">		</span>} else if (source instanceof org.jsoup.nodes.TextNode) {
<span style="white-space:pre">			</span>org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
<span style="white-space:pre">			</span>Text text = doc.createTextNode(sourceText.getWholeText());
<span style="white-space:pre">			</span>dest.appendChild(text);
<span style="white-space:pre">		</span>} else if (source instanceof org.jsoup.nodes.Comment) {
<span style="white-space:pre">			</span>org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
<span style="white-space:pre">			</span>Comment comment = doc.createComment(sourceComment.getData());
<span style="white-space:pre">			</span>dest.appendChild(comment);
<span style="white-space:pre">		</span>} else if (source instanceof org.jsoup.nodes.DataNode) {
<span style="white-space:pre">			</span>org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
<span style="white-space:pre">			</span>Text node = doc.createTextNode(sourceData.getWholeData());
<span style="white-space:pre">			</span>dest.appendChild(node);
<span style="white-space:pre">		</span>} else {

<span style="white-space:pre">		</span>}
<span style="white-space:pre">	</span>}

<span style="white-space:pre">	</span>public void tail(final org.jsoup.nodes.Node source, int depth) {
<span style="white-space:pre">		</span>if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) {
<span style="white-space:pre">			</span>dest = (Element) dest.getParentNode();
<span style="white-space:pre">		</span>}
<span style="white-space:pre">	</span>}
}
测试类

package com.lhh.parse;

import java.io.IOException;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Manual demo for {@code JsoupParserUtils}: downloads a news page and prints
 * the results of a few XPath queries. Requires network access.
 */
public class JsoupParserUtilsTest {

	public static void main(String[] args) throws Exception {
		String url = "http://mil.news.sina.com.cn/china/2016-09-29/doc-ifxwmamy9955666.shtml";
		Document doc = Jsoup.parse(new URL(url), 10000);
		String titleXpath = "//*[@id='main_title']/text()";
		String timeXpath = "//*[@id='page-tools']/span/span[position() = 1]";
		// position() must be called as a function; a bare "position" would be
		// read as a child element named <position>.
		System.out.println(JsoupParserUtils.exists(doc, "/html/body/div[position()>1000000]"));
		System.out.println(JsoupParserUtils.getXpathString(doc, titleXpath));
		Element element = JsoupParserUtils.getJsoupElement(doc, timeXpath);
		if (element == null) {
			// getJsoupElement returns null when nothing matches, e.g. if the page layout changed.
			System.out.println("No element matched xpath: " + timeXpath);
			return;
		}
		System.out.println(element.text());
		System.out.println(element.attr("class"));
	}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: