您的位置:首页 > Web前端 > HTML

使用 HTMLParser 爬取 HTML 网页:获取标题、关键字、内容和 URL

2015-11-24 15:23 771 查看
package com.bonc.pure.util;

import java.awt.List;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.InputStreamReader;

import java.sql.Blob;

import java.sql.Clob;

import java.util.Date;

import org.hibernate.Hibernate;

import org.htmlparser.Node;

import org.htmlparser.NodeFilter;

import org.htmlparser.Parser;

import org.htmlparser.beans.StringBean;

import org.htmlparser.filters.NodeClassFilter;

import org.htmlparser.filters.TagNameFilter;

import org.htmlparser.tags.BaseHrefTag;

import org.htmlparser.tags.LinkTag;

import org.htmlparser.tags.MetaTag;

import org.htmlparser.tags.TitleTag;

import org.htmlparser.util.NodeList;

import org.htmlparser.util.ParserException;

import org.htmlparser.visitors.TextExtractingVisitor;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.springframework.web.util.HtmlUtils;

import com.bonc.pure.util1.HtmlText;

/**
 * Utility for extracting metadata from a saved HTML page using HTMLParser.
 *
 * <p>{@link #test()} reads the file at {@link #path}, extracts the title, the
 * "keywords" meta tag, the {@code <base href>} URL and the file name, and stores
 * them in the shared {@link #htmlText} instance.
 *
 * <p>Title: ZolParser
 *
 * @author 施海洲
 * @date 2015-10-29 11:27:21
 */
public class ZolParser {

    private static final Logger log = LoggerFactory.getLogger(ZolParser.class);

    /** Result holder filled by {@link #test()}; the same instance is reused on every call. */
    HtmlText htmlText = new HtmlText();

    /** Location of the HTML file to parse; defaults to the original sample file. */
    String path = "D://bonc_test_bin//黑客hackjd.com.htm";

    /** Creates a parser that reads the default sample file. */
    public ZolParser() {
    }

    /**
     * Creates a parser that reads the given HTML file.
     *
     * @param path location of the HTML file to parse; must not be {@code null}
     * @throws IllegalArgumentException if {@code path} is {@code null}
     */
    public ZolParser(String path) {
        if (path == null) {
            throw new IllegalArgumentException("path must not be null");
        }
        this.path = path;
    }

    /**
     * Parses the file at {@link #path} and fills {@link #htmlText} with the title,
     * keywords, URL, current timestamp and file name.
     *
     * @return the populated {@link #htmlText} instance
     * @throws Exception if the file cannot be read or parsed
     */
    public HtmlText test() throws Exception {
        long startMillis = System.currentTimeMillis();

        // Load the page source once; every extractor below parses this same string.
        String htmlCode = setHtmlCode();

        htmlText.setHtml_Title(getTitle(htmlCode));
        htmlText.setHtml_Keyword(getkeywords(htmlCode));
        htmlText.setHtml_Url(geturl(htmlCode));
        // Upload time is simply "now".
        htmlText.setHtml_date(new Date());

        // The plain text is still extracted, but storing it on htmlText is disabled
        // (the setHtml_Content call was already commented out). The former code cast
        // a Hibernate Blob to Clob here; the two interfaces are unrelated, so that
        // cast failed at runtime and has been removed.
        // TODO: wire the content into HtmlText once the target column type is settled.
        String plainText = getPlainText(htmlCode);
        log.debug("extracted {} characters of plain text", plainText.length());

        htmlText.setHtml_name(getName());

        log.info("全文消耗时间:{}", System.currentTimeMillis() - startMillis);
        return htmlText;
    }

    /**
     * Reads the file at {@link #path} (decoded as GBK) and returns the markup of its
     * {@code html} element(s) as re-serialized by HTMLParser.
     *
     * <p>Despite the "set" prefix this method only reads; the name is kept for
     * existing callers.
     *
     * @return the HTML markup of all nodes matching the {@code html} tag name
     * @throws Exception if the file cannot be read or the markup cannot be parsed
     */
    public String setHtmlCode() throws Exception {
        StringBuilder content = new StringBuilder();

        // Close the stream even when decoding or reading fails.
        FileInputStream in = new FileInputStream(path);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "GBK"));
            String line;
            while ((line = reader.readLine()) != null) {
                // Line terminators are normalized to CRLF.
                content.append(line).append("\r\n");
            }
        } finally {
            in.close();
        }

        // NOTE(review): Parser(String) appears to decide between "inline HTML" and
        // "URL/file name" from the first non-blank character; Parser.createParser
        // would be explicit. Kept as-is to preserve behavior - confirm.
        Parser parser = new Parser(content.toString());
        parser.setEncoding("utf-8");

        NodeList htmlNodes = parser.extractAllNodesThatMatch(new TagNameFilter("html"));
        return htmlNodes.toHtml();
    }

    /**
     * Returns the file name (last path segment) of {@link #path}.
     *
     * @return the file name
     * @throws ParserException never thrown by this implementation; declared for
     *         backward compatibility with existing callers
     */
    public String getName() throws ParserException {
        String fileName = new File(path.trim()).getName();
        log.info("fileName = {}", fileName);
        return fileName;
    }

    /**
     * Extracts the page title.
     *
     * @param htmlCode HTML markup to parse
     * @return the text of the last {@code <title>} tag found, or {@code null} if
     *         the markup has none
     * @throws ParserException if the markup cannot be parsed
     */
    public String getTitle(String htmlCode) throws ParserException {
        Parser parser = new Parser();
        parser.setInputHTML(htmlCode);

        NodeList titleNodes = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class));

        String title = null;
        for (int i = 0; i < titleNodes.size(); i++) {
            title = ((TitleTag) titleNodes.elementAt(i)).getTitle();
        }
        return title;
    }

    /**
     * Extracts the content of the {@code keywords} meta tag.
     *
     * @param htmlCode HTML markup to parse
     * @return the content of the last meta tag whose name equals "keywords"
     *         (case-insensitive), or {@code null} if there is none
     * @throws ParserException if the markup cannot be parsed
     */
    public String getkeywords(String htmlCode) throws ParserException {
        Parser parser = new Parser();
        parser.setInputHTML(htmlCode);

        NodeList metaNodes = parser.extractAllNodesThatMatch(new NodeClassFilter(MetaTag.class));

        String keywords = null;
        for (int i = 0; i < metaNodes.size(); i++) {
            MetaTag meta = (MetaTag) metaNodes.elementAt(i);
            // Constant-first comparison is null-safe for meta tags without a name.
            if ("keywords".equalsIgnoreCase(meta.getMetaTagName())) {
                keywords = meta.getMetaContent();
                log.info("keywords:{}", keywords);
            }
        }
        return keywords;
    }

    /**
     * Extracts the page's base URL from its {@code <base href="...">} tag.
     *
     * <p>The previous implementation filtered for {@code BaseHrefTag} nodes but then
     * only accepted {@code LinkTag} instances, so it always returned {@code null}.
     *
     * @param htmlCode HTML markup to parse
     * @return the HTML-unescaped value of the last non-empty base href, or
     *         {@code null} if the markup declares none
     * @throws ParserException if the markup cannot be parsed
     */
    public String geturl(String htmlCode) throws ParserException {
        Parser parser = new Parser();
        parser.setInputHTML(htmlCode);

        NodeList baseNodes = parser.extractAllNodesThatMatch(new NodeClassFilter(BaseHrefTag.class));

        String url = null;
        for (int i = 0; i < baseNodes.size(); i++) {
            String baseUrl = ((BaseHrefTag) baseNodes.elementAt(i)).getBaseUrl();
            if (baseUrl != null && baseUrl.length() > 0) {
                url = HtmlUtils.htmlUnescape(baseUrl);
            }
        }
        return url;
    }

    /**
     * Extracts the plain text of the page using a {@code TextExtractingVisitor}.
     *
     * <p>An unused {@code StringBean} (links off, non-breaking-space replacement,
     * whitespace collapsing) used to be configured here; it never took part in the
     * extraction and was removed, so the returned text is unchanged.
     *
     * @param htmlCode HTML markup to parse
     * @return the text collected by the visitor
     * @throws Exception if the markup cannot be parsed
     */
    public String getPlainText(String htmlCode) throws Exception {
        Parser parser = new Parser();
        parser.setInputHTML(htmlCode);

        TextExtractingVisitor visitor = new TextExtractingVisitor();
        parser.visitAllNodesWith(visitor);
        return visitor.getExtractedText();
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: