您的位置：首页 > 理论基础 > 计算机网络

Java+MySQL实现网络爬虫程序（转）

2017-03-15 20:52 441 查看

网络爬虫，也叫网络蜘蛛，有的项目也把它称作“walker”。维基百科所给的定义是“一种系统地扫描互联网，以获取索引为目的的网络程序”。网络上有很多关于网络爬虫的开源项目，其中比较有名的是Heritrix和Apache Nutch。有时需要在网上搜集信息，如果需要搜集的是获取方法单一而人工搜集费时费力的信息，比如统计一个网站每个月发了多少篇文章、用了哪些标签，为自然语言处理项目搜集语料，或者为模式识别项目搜集图片等等，就需要爬虫程序来完成这样的任务。而且搜索引擎必不可少的组件之一也是网络爬虫。很多网络爬虫都是用Python，Java或C#实现的。我这里给出的是Java版本的爬虫程序。为了节省时间和空间，我把程序限制在只扫描本博客地址下的网页（也就是http://johnhany.net/，但不包括http://johnhany.net/wp-content/下的内容），并从网址中统计出所用的所有标签。只要稍作修改，去掉代码里的限制条件就能作为扫描整个网络的程序使用。或者对输出格式稍作修改，可以作为生成博客sitemap的工具。
代码也可以在这里下载：johnhany/WPCrawler。

环境需求

我的开发环境是Windows7 + Eclipse。需要XAMPP提供通过url访问MySQL数据库的端口。还要用到三个开源的Java类库：Apache HttpComponents 4.3 提供HTTP接口，用来向目标网址提交HTTP请求，以获取网页的内容；HTML Parser 2.0 用来解析网页，从DOM节点中提取网址链接；MySQL Connector/J 5.1.27 连接Java程序和MySQL，然后就可以用Java代码操作数据库。

代码
代码位于三个文件中，分别是：crawler.java，httpGet.java和parsePage.java。包名为net.johnhany.wpcrawler。

crawler.java

package net.johnhany.wpcrawler;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Command-line entry point of the crawler.
 *
 * <p>Connects to a local MySQL server, makes sure the {@code crawler} database and its
 * {@code record} / {@code tags} tables exist, and then repeatedly fetches the oldest
 * not-yet-crawled URL stored in {@code record}, starting from the blog front page.
 * Discovering and storing new links is delegated to {@code httpGet.getByString}.
 */
public class crawler {

    /** Crawling stops once the number of fetched pages exceeds this value. */
    private static final int MAX_PAGES = 1000;

    /**
     * Runs the crawl.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if a database operation fails or page processing raises a
     *         non-I/O error; the connection is closed before the exception propagates
     */
    public static void main(String args[]) throws Exception {
        String frontpage = "http://johnhany.net/";
        Connection conn = null;

        //connect the MySQL database
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String dburl = "jdbc:mysql://localhost:3306?useUnicode=true&characterEncoding=utf8";
            conn = DriverManager.getConnection(dburl, "root", "");
            System.out.println("connection built");
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }

        //nothing to do without a database connection
        if (conn == null) {
            return;
        }

        int count = 0;
        try {
            //create database and table that will be needed
            createSchema(conn);

            //crawl every link in the database, oldest first
            String url = frontpage;
            while (url != null && count <= MAX_PAGES) {
                try {
                    //get page content of link "url" and store the links found on it
                    httpGet.getByString(url, conn);
                } catch (IOException e) {
                    //a single unreachable page (e.g. a 404) must not abort the whole crawl
                    System.err.println("failed to fetch " + url + ": " + e.getMessage());
                }
                count++;

                //always flag the page as crawled, otherwise it would be selected again forever
                markCrawled(conn, url);
                url = nextUncrawled(conn);
            }
        } finally {
            //release the connection even when the crawl ends with an exception
            conn.close();
        }

        System.out.println("Done.");
        System.out.println(count);
    }

    /**
     * Creates the {@code crawler} database and the {@code record} and {@code tags} tables
     * if they do not exist yet. A failure is reported on stderr and otherwise ignored.
     */
    private static void createSchema(Connection conn) {
        Statement stmt = null;
        try {
            stmt = conn.createStatement();
            stmt.executeUpdate("CREATE DATABASE IF NOT EXISTS crawler");
            stmt.executeUpdate("USE crawler");
            stmt.executeUpdate("create table if not exists record (recordID int(5) not null auto_increment, URL text not null, crawled tinyint(1) not null, primary key (recordID)) engine=InnoDB DEFAULT CHARSET=utf8");
            stmt.executeUpdate("create table if not exists tags (tagnum int(4) not null auto_increment, tagname text not null, primary key (tagnum)) engine=InnoDB DEFAULT CHARSET=utf8");
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (SQLException ignored) {
                    //closing failed; there is nothing left to clean up
                }
            }
        }
    }

    /**
     * Sets {@code crawled = 1} for the given URL. If no row matches the URL (for example the
     * start page, which is only stored when some fetched page links to it), a row that is
     * already flagged as crawled is inserted instead so the URL is never queued later.
     */
    private static void markCrawled(Connection conn, String url) throws SQLException {
        //parameterized: crawled URLs are untrusted text and may contain quotes
        PreparedStatement update = conn.prepareStatement("UPDATE record SET crawled = 1 WHERE URL = ?");
        try {
            update.setString(1, url);
            if (update.executeUpdate() > 0) {
                return;
            }
        } finally {
            update.close();
        }

        PreparedStatement insert = conn.prepareStatement("INSERT INTO record (URL, crawled) VALUES (?, 1)");
        try {
            insert.setString(1, url);
            insert.executeUpdate();
        } finally {
            insert.close();
        }
    }

    /**
     * Returns the URL of the oldest record that has not been crawled yet, or {@code null}
     * when every stored record has been crawled.
     */
    private static String nextUncrawled(Connection conn) throws SQLException {
        Statement stmt = conn.createStatement();
        try {
            //ordering by the auto-increment key visits links in the order they were discovered
            ResultSet rs = stmt.executeQuery("SELECT URL FROM record WHERE crawled = 0 ORDER BY recordID LIMIT 1");
            try {
                return rs.next() ? rs.getString(1) : null;
            } finally {
                rs.close();
            }
        } finally {
            stmt.close();
        }
    }
}
httpGet.java
package net.johnhany.wpcrawler;

import java.io.IOException;
import java.sql.Connection;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class httpGet {

public final static void getByString(String url, Connection conn) throws Exception {
CloseableHttpClient httpclient = HttpClients.createDefault();

try {
HttpGet httpget = new HttpGet(url);
System.out.println("executing request " + httpget.getURI());

ResponseHandler<String> responseHandler = new ResponseHandler<String>() {

public String handleResponse(
final HttpResponse response) throws ClientProtocolException, IOException {
int status = response.getStatusLine().getStatusCode();
if (status >= 200 && status < 300) {
HttpEntity entity = response.getEntity();
return entity != null ? EntityUtils.toString(entity) : null;
} else {
throw new ClientProtocolException("Unexpected response status: " + status);
}
}
};
String responseBody = httpclient.execute(httpget, responseHandler);
/*
//print the content of the page
System.out.println("----------------------------------------");
System.out.println(responseBody);
System.out.println("----------------------------------------");
*/
parsePage.parseFromString(responseBody,conn);

} finally {
httpclient.close();
}
}
}
parsePage.java
package net.johnhany.wpcrawler;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import java.net.URLDecoder;

/**
 * Extracts the links of a downloaded page and stores the ones that belong to the blog.
 */
public class parsePage {

    /** Only links below this address are stored. */
    private static final String MAIN_URL = "http://johnhany.net/";

    /** Links below this address (uploaded files) are never stored. */
    private static final String WP_CONTENT_URL = MAIN_URL + "wp-content/";

    /** Path prefix, relative to {@link #MAIN_URL}, that identifies a tag page. */
    private static final String TAG_PREFIX = "tag/";

    /**
     * Parses {@code content} as HTML and, for every link below {@code http://johnhany.net/}
     * (except {@code wp-content/}) that is not yet in table {@code record}, inserts it with
     * {@code crawled = 0}. When such a new link points to a tag page, the tag name is also
     * inserted into table {@code tags}.
     *
     * <p>A {@link SQLException} raised for one link is printed to stdout and processing
     * continues with the next link; a {@link ParserException} is printed to stderr.
     *
     * @param content HTML source of the page
     * @param conn    open database connection with the {@code crawler} schema selected
     * @throws Exception on errors other than the two handled above
     */
    public static void parseFromString(String content, Connection conn) throws Exception {
        Parser parser = new Parser(content);
        HasAttributeFilter filter = new HasAttributeFilter("href");

        try {
            NodeList list = parser.parse(filter);
            int count = list.size();

            //process every link on this page
            for (int i = 0; i < count; i++) {
                Node node = list.elementAt(i);
                if (!(node instanceof LinkTag)) {
                    continue;
                }

                String nextlink = ((LinkTag) node).extractLink();

                //only save page from "http://johnhany.net", and nothing from "wp-content"
                if (nextlink == null
                        || !nextlink.startsWith(MAIN_URL)
                        || nextlink.startsWith(WP_CONTENT_URL)) {
                    continue;
                }

                try {
                    saveLink(conn, nextlink);
                } catch (SQLException e) {
                    //handle the exceptions
                    System.out.println("SQLException: " + e.getMessage());
                    System.out.println("SQLState: " + e.getSQLState());
                    System.out.println("VendorError: " + e.getErrorCode());
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts {@code link} into {@code record} (and its tag into {@code tags}) unless the
     * link is already stored.
     */
    private static void saveLink(Connection conn, String link)
            throws SQLException, java.io.UnsupportedEncodingException {
        //check if the link already exists in the database
        if (recordExists(conn, link)) {
            return;
        }

        //if the link does not exist in the database, insert it
        executeUpdate(conn, "INSERT INTO record (URL, crawled) VALUES (?, 0)", link);
        System.out.println(link);

        //if the links are different from each other, the tags must be different
        //so there is no need to check if the tag already exists
        //NOTE(review): a paginated tag URL such as tag/foo/page/2/ would be stored as its
        //own tag - confirm whether such links occur on the site
        String tag = extractTag(link.substring(MAIN_URL.length()));
        if (tag != null) {
            executeUpdate(conn, "INSERT INTO tags (tagname) VALUES (?)", tag);
        }
    }

    /** Returns whether table {@code record} already contains a row for {@code link}. */
    private static boolean recordExists(Connection conn, String link) throws SQLException {
        PreparedStatement query = conn.prepareStatement("SELECT 1 FROM record WHERE URL = ? LIMIT 1");
        try {
            query.setString(1, link);
            ResultSet rs = query.executeQuery();
            try {
                return rs.next();
            } finally {
                rs.close();
            }
        } finally {
            query.close();
        }
    }

    /**
     * Runs a one-parameter INSERT/UPDATE. The value is bound as a parameter because links
     * and tag names come from page content and may contain quotes.
     */
    private static void executeUpdate(Connection conn, String sql, String value) throws SQLException {
        PreparedStatement update = conn.prepareStatement(sql);
        try {
            update.setString(1, value);
            update.executeUpdate();
        } finally {
            update.close();
        }
    }

    /**
     * Returns the decoded tag name for a site-relative path such as {@code tag/java/}, or
     * {@code null} when the path is not a tag page or names no tag.
     */
    private static String extractTag(String path) throws java.io.UnsupportedEncodingException {
        if (!path.startsWith(TAG_PREFIX)) {
            return null;
        }

        String raw = path.substring(TAG_PREFIX.length());
        //drop the trailing slash only when there is one
        if (raw.endsWith("/")) {
            raw = raw.substring(0, raw.length() - 1);
        }
        if (raw.length() == 0) {
            return null;
        }

        try {
            //decode in UTF-8 for Chinese characters
            return URLDecoder.decode(raw, "UTF-8");
        } catch (IllegalArgumentException e) {
            //malformed percent-escape: keep the raw text rather than abort the crawl
            return raw;
        }
    }
}

程序原理

所谓“互联网”，是网状结构，任意两个节点间都有可能存在路径。爬虫程序对互联网的扫描，在图论角度来讲，就是对有向图的遍历（链接是从一个网页指向另一个网页，所以是有向的）。常见的遍历方法有深度优先和广度优先两种。相关理论知识可以参考树的遍历：这里和这里。我的程序采用的是广度优先方式。

程序从crawler.java的main()开始运行。

// Excerpt from crawler.main(): open the database connection.
// Load the MySQL JDBC driver class by name.
Class.forName("com.mysql.jdbc.Driver");
// Server on localhost, port 3306; no database is named in the URL. Text is exchanged as UTF-8.
String dburl = "jdbc:mysql://localhost:3306?useUnicode=true&characterEncoding=utf8";
// Connect as user "root" with an empty password.
conn = DriverManager.getConnection(dburl, "root", "");
System.out.println("connection built");

首先，调用DriverManager连接MySQL服务。这里使用的是XAMPP的默认MySQL端口3306，端口值可以在XAMPP主界面看到：