您的位置:首页 > 理论基础 > 计算机网络


2014-11-18 20:41 393 查看
       网络爬虫的基本流程:
       1. 获取对应url的html内容。
       2. 分析html内容,获取其中的链接。
       3. 不断迭代前两个步骤,直到满足停止条件。
       解析html使用开源库 htmlparser:
       wiki: http://htmlparser.sourceforge.net/
       下载链接: http://sourceforge.net/projects/htmlparser/files/Integration-Builds/2.0-20060923/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;

public class HtmlRetrieve {

* @param html_url the url of html.
* @return if url is not exist, return null else return the content of html.
* */
public String GetContentOfHtml(String html_url){
URL url;
try {
url = new URL(html_url);
HttpURLConnection urlConn = (HttpURLConnection)url.openConnection();
if(urlConn != null)
BufferedReader reader = new BufferedReader(new InputStreamReader(urlConn.getInputStream(),HtmlEncoding.gbk_encoding));
StringBuffer strBuffer = new StringBuffer();
String line;
while((line = reader.readLine())!=null)
return strBuffer.toString();
} catch (Exception e) {
// TODO Auto-generated catch block
return null;

* This function help save html content into a file.
* @param filePath the path of file.
* @param html_content the content of html need to be saved.
public void SaveToFile(String filePath,String html_content)
try {
BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(new File(filePath)));
} catch (IOException e) {
// TODO Auto-generated catch block

public static void main(String []args){
HtmlRetrieve htR = new HtmlRetrieve();
try {
String content = htR.GetContentOfHtml("http://www.szse.cn/szseWeb/FrontController.szse?ACTIONID=7&CATALOGID=1265_xyjy&txtKsrq=2000-11-08&txtZzrq=2014-11-20&TABKEY=tab1&REPORT_ACTION=navigate&tab1PAGENUM=5");
//NodeList nodelist = parse.parse(null);
} catch (Exception e) {
// TODO Auto-generated catch block

import java.net.URL;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.sax.Attributes;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

* @author kalvin
* this class helps you to get content in html.

public class HtmlParse {
public HtmlParse()

* function: retrieve the table content of html.you can add some filter condition.
* In htmlParse jar, it has many filters to help your filter the conent of html.
* @param html_content
* @param encoding
* @param className
* @param filter_str
* @return
public String ParseHtmlTableFromHtml(String html_content,String encoding,Class className,final String filter_str)
Parser parser = Parser.createParser(html_content, encoding);
if(parser != null)
NodeClassFilter nodeClassFilter = new NodeClassFilter(className){

private static final long serialVersionUID = 1L;

public boolean accept(Node node){
return true;
return false;

StringBuffer strBuffer = new StringBuffer();
if(strBuffer != null)
try {
NodeList nodeList = parser.extractAllNodesThatMatch(nodeClassFilter);
Node node = null;
if(nodeList != null)
int size = nodeList.size();
for(int i = 0; i < size; i++)
node = nodeList.elementAt(i);
if(node != null)

} catch (ParserException e) {
// TODO Auto-generated catch block
return strBuffer.toString();
return null;

public static void main(String args[])
HtmlParse htmlParse = new HtmlParse();
HtmlRetrieve htmlRetrieve = new HtmlRetrieve();
String html_content = htmlRetrieve.GetContentOfHtml("http://istock.jrj.com.cn/list,600071.html");
String filter_str = "table class=\"table\" id=\"topiclisttitle\"";

String table_content = htmlParse.ParseHtmlTableFromHtml(html_content,HtmlEncoding.gbk_encoding,TableTag.class, filter_str);
if(table_content != null)
/*Parser parse;
try {
parse = new Parser("http://istock.jrj.com.cn/list,600071.html");

NodeFilter nodeFilter = new NodeClassFilter(TableTag.class){
public boolean accept(Node node)
if(node.getText().startsWith("table class=\"table\" id=\"topiclisttitle\""))
return true;
return false;

TagNameFilter tagNameFilter = new TagNameFilter("tr");

TagNameFilter tdTagNameFilter = new TagNameFilter("td");

StringFilter trStringFilter = new StringFilter("cls-data-tr");

HasAttributeFilter attributeFilter = new HasAttributeFilter("class='cls-data-tr'");
AndFilter andFilter = new AndFilter(attributeFilter,trStringFilter);

NodeList nodeList = parse.extractAllNodesThatMatch(nodeFilter);

int size = nodeList.size();

Node node = nodeList.elementAt(0);
/*Node node = null;
for(int i= 0; i < size; i++)
node = nodeList.elementAt(i);
NodeList tdNodeList = node.getChildren();
tdNodeList = tdNodeList.extractAllNodesThatMatch(tdTagNameFilter);
int tdNodeSize = tdNodeList.size();
for(int j=0; j< tdNodeSize; j++)
node = tdNodeList.elementAt(j);
XmlDocument document = new XmlDocument();

/*} catch (ParserException e) {
// TODO Auto-generated catch block




/**
 * Names of the character encodings used by the html helpers.
 *
 * @author kalvin
 */
public class HtmlEncoding {
    /** Charset name for GBK. */
    public static final String gbk_encoding = "GBK";
    /** Charset name for UTF-8. */
    public static final String utf8_encoding = "utf-8";
    /** Charset name for UTF-16. */
    public static final String utf16_encoding = "utf-16";
}

import org.dom4j.Document;
import org.dom4j.io.SAXReader;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

* @author kalvin
* Html are constructed by mang tags.
* eg:
*   <html><head><title></title></head><body><table></table></body></html>
*   html,head,title,body,table.All of these are tags.

public class ParseTable {

* help you retrieve the value of properties of table.
* @param xml_table
* @param encoding
* @return
public String ParseTableOfHtml(String xml_table,String encoding)
//parser is the class of htmlparser jar.
//you can use parser parse html.
Parser parser = null;
parser = Parser.createParser(xml_table, HtmlEncoding.utf16_encoding);

if(parser == null)
return null;
//this is a filter condition.In htmlParser, there are many filter conditions.
//this is a tag filter condition. About detail information, you can refer to
//the htmlparse document.About the document, you can get from 开源中国.
TagNameFilter tagNameFilter = new TagNameFilter("tr");

NodeList nodeList = null;
StringBuffer strBuffer = new StringBuffer();

try {
nodeList = parser.extractAllNodesThatMatch(tagNameFilter);
Node node = null;

if(nodeList != null)
int trNodeListSize = nodeList.size();
for(int i=0; i < trNodeListSize; i++)
node = nodeList.elementAt(i);
if(node != null)
NodeList tdNodeList = node.getChildren();
if(tdNodeList != null)
int tdNodeListSize = tdNodeList.size();
for(int j = 0; j< tdNodeListSize; j++)
node = tdNodeList.elementAt(j);
if(node != null)
strBuffer.append(" ");

node =node.getFirstChild();
while(node != null)
if(node instanceof TagNode)
TagNode tagNode = (TagNode)node;
String value = tagNode.getAttribute("href");
if(value != null && !value.equals(""))
strBuffer.append(" ");
node = node.getNextSibling();

} catch (ParserException e) {
// TODO Auto-generated catch block
return strBuffer.toString();
public static void main(String []args)
HtmlParse htmlParse = new HtmlParse();
HtmlRetrieve htmlRetrieve = new HtmlRetrieve();
//String html_content = htmlRetrieve.GetContentOfHtml("http://istock.jrj.com.cn/list,600071,p1.html");
//String filter_str = "table class=\"table\" id=\"topiclisttitle\"";
String html_content1 = htmlRetrieve.GetContentOfHtml("http://www.szse.cn/szseWeb/FrontController.szse?ACTIONID=7&CATALOGID=1265_xyjy&txtKsrq=2000-11-08&txtZzrq=2014-11-20&TABKEY=tab1&REPORT_ACTION=navigate&tab1PAGENUM=1&txtkey2=000001");
String filter_str1 = "table   bgcolor=\'#E0E0E0\' id=\"REPORTID_tab1\" class=\'cls-data-table\'";
String table_content = htmlParse.ParseHtmlTableFromHtml(html_content1,HtmlEncoding.gbk_encoding,TableTag.class, filter_str1);

Parser parser = Parser.createParser(table_content, HtmlEncoding.gbk_encoding);

ParseTable parseTable = new ParseTable();
System.out.println(parseTable.ParseTableOfHtml(table_content, HtmlEncoding.gbk_encoding));
/*if(parser != null)
TagNameFilter tagNameFilter = new TagNameFilter("tr");
HasAttributeFilter attributeFilter = new HasAttributeFilter("class");

AndFilter andFilter = new AndFilter(tagNameFilter,attributeFilter);
try {
NodeList nodeList = parser.extractAllNodesThatMatch(tagNameFilter);
Node node = null;
if(nodeList != null)
int size = nodeList.size();
for(int i= 0; i < size; i++)
node = nodeList.elementAt(i);
if(node != null)
NodeList tdNodeList = node.getChildren();
if(tdNodeList != null)
int tdNodeSize = tdNodeList.size();
for(int j = 0; j < tdNodeSize; j++)
node = tdNodeList.elementAt(j);
if(node != null)
if(node instanceof TagNode)
TagNode tagNode = (TagNode)node;

NodeList aHrefList = node.getChildren();
int aHrefListSize = aHrefList.size();
for(int k=0; k < aHrefListSize; k++)
node = aHrefList.elementAt(k);
if(node instanceof TagNode)
TagNode tagNode = (TagNode)node;

} catch (ParserException e) {
// TODO Auto-generated catch block


内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  网络爬虫