
A Simple Web Crawler Example

As a student, the term "web crawler" may sound impressive, but a simple implementation of one is not hard to understand even for a student.
A crawler application treats the entire Internet as a web, much like a spider's web; the application is the spider, crawling across it according to certain rules.
The most widely used protocol on the Internet today is HTTP(S), and the example in this article is based on it. It is only a demonstration and does not cover the more sophisticated algorithms (which, in practice, are the most important part).


Design approach:

The program starts from one or more entry URLs, fetches each URL's content over HTTP(S), processes that content to pull out the information to be crawled, collects the URL links found in the content, and then repeats these steps.
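To make that loop concrete before the full multi-threaded code, here is a minimal single-threaded sketch of the same idea. Everything in it (the class name CrawlLoopSketch, fetch, the link regex) is invented for illustration only and is not part of the actual example that follows:

import java.io.InputStream;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// A deliberately simple, single-threaded sketch of the crawl loop described above.
// All names here are illustrative; the real classes of this article follow below.
public class CrawlLoopSketch {

    private static final Pattern LINK = Pattern.compile("href=[\"'](https?://[^\"'\\s]+)[\"']");

    public static void crawl(String entryUrl, int maxPages) {
        Deque<String> pending = new ArrayDeque<String>(); // urls waiting to be fetched
        Set<String> seen = new HashSet<String>();         // urls already enqueued once
        pending.add(entryUrl);
        seen.add(entryUrl);

        int crawled = 0;
        while (!pending.isEmpty() && crawled < maxPages) {
            String url = pending.poll();
            String html;
            try {
                html = fetch(url);                        // 1. fetch the content over HTTP(S)
            } catch (Exception e) {
                continue;                                 // skip urls that fail to load
            }
            // 2. ...process html here and pull out whatever information you want to crawl...
            Matcher m = LINK.matcher(html);               // 3. collect the url links in the content
            while (m.find()) {
                String link = m.group(1);
                if (seen.add(link)) {                     // 4. enqueue unseen links and repeat
                    pending.add(link);
                }
            }
            crawled++;
        }
    }

    // read the whole response body into a string ("\\A" makes Scanner return all input at once)
    private static String fetch(String url) throws Exception {
        InputStream in = new URL(url).openStream();
        Scanner scanner = new Scanner(in, "UTF-8").useDelimiter("\\A");
        try {
            return scanner.hasNext() ? scanner.next() : "";
        } finally {
            scanner.close();
        }
    }
}

Calling CrawlLoopSketch.crawl("http://www.csdn.net/", 50), for example, would fetch at most 50 pages starting from that entry URL.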

Without further ado, the details are in the code, which is already commented:

import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
 * Overview: main program
 *
 * @author hwz
 */
public class MainApp {

    private Integer corePoolSize = 10;

    private Integer maxPoolSize = 20;

    private ThreadPoolExecutor executor;

    /** work queue */
    private SpiderQueue workQueue;

    public void start(String url) throws Exception {
        // initialize the thread pool
        LinkedBlockingDeque<Runnable> executorQueue = new LinkedBlockingDeque<Runnable>(maxPoolSize);
        executor = new ThreadPoolExecutor(corePoolSize, maxPoolSize, 60L, TimeUnit.SECONDS,
                executorQueue);

        workQueue = new SpiderQueue(1024);
        SpiderUrl spiderUrl = new SpiderUrl(url, 0);
        try {
            workQueue.add(spiderUrl);
        }
        catch (Exception e) {
            System.out.println("insert url into workQueue error,url=" + url);
            e.printStackTrace();
        }

        // submit the first task
        executor.submit(new SimpleSpider(workQueue, "thread-" + "main"));
        int i = 0;
        int idle = 0;
        while (true) {
            // decide whether to add more worker threads
            if (workQueue.size() > 20 && executor.getActiveCount() < maxPoolSize) {
                idle = 0;
                System.out.println("submit new thread,workQueue.size=" + workQueue.size() +
                        ",executorQueue.activeCount=" + executor.getActiveCount() + ",i=" + i);
                executor.submit(new SimpleSpider(workQueue, "thread-" + i++));
                Thread.sleep(500);
            }
            else if (workQueue.size() == 0) {
                idle++;
                System.out.println("main method, idle times=" + idle);

                // the main thread has been idle 20 times in a row: stop
                if (idle > 20) {
                    System.out.println("main method, idle times=" + idle + ",end!");
                    break;
                }
                Thread.sleep(1000);
            }
            else {
                Thread.sleep(2000);
            }
        }
        System.out.println("End!,workQueue.size=" + workQueue.size() +
                ",executorQueue.activeCount=" + executor.getActiveCount() + ",executorQueue.CompletedTaskCount=" +
                executor.getCompletedTaskCount() + ",i=" + i);
        workQueue.printAll();
        executor.shutdown();
        System.exit(0);
    }

    public static void main(String[] args) throws Exception {
        MainApp app = new MainApp();
        app.start("http://www.csdn.net/");
    }
}


import java.util.ArrayList;
import java.util.List;

/**
 * Overview: custom synchronized work queue for the crawler, backed by an ArrayList
 *
 * @author hwz
 */
public class SpiderQueue {

    /** backing store */
    private List<SpiderUrl> queue;

    public SpiderQueue(int size) {
        queue = new ArrayList<SpiderUrl>(size);
    }

    public synchronized void add(SpiderUrl spiderUrl) {
        queue.add(spiderUrl);
    }

    public synchronized SpiderUrl poll() {
        if (queue.isEmpty()) {
            return null;
        }
        SpiderUrl spiderUrl = queue.remove(0);
        // print to the console so progress is easy to follow
        System.out.println("SpiderQueue,poll,SpiderUrl=" + spiderUrl.toString() + ",remain size=" + queue.size());
        return spiderUrl;
    }

    public synchronized SpiderUrl peek() {
        if (queue.isEmpty()) {
            return null;
        }
        return queue.get(0);
    }

    public synchronized boolean isExsit(SpiderUrl spiderUrl) {
        return queue.contains(spiderUrl);
    }

    public synchronized int size() {
        return queue.size();
    }

    public void printAll() {
        System.out.println("Enter printAll.");
        for (SpiderUrl spiderUrl : queue) {
            System.out.println(spiderUrl);
        }
    }
}


/**
 * Overview: a URL for the crawler to work on
 *
 * @author hwz
 */
public class SpiderUrl {

    /** http(s) url */
    private String url;

    /** how many levels deep this url is from the entry url */
    private int deep;

    public SpiderUrl(String url, int deep) {
        this.url = url;
        this.deep = deep;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public int getDeep() {
        return deep;
    }

    public void setDeep(int deep) {
        this.deep = deep;
    }

    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof SpiderUrl)) {
            return false;
        }
        SpiderUrl oth = (SpiderUrl) obj;
        return this.url.equals(oth.getUrl());
    }

    @Override
    public int hashCode() {
        return url.hashCode();
    }

    @Override
    public String toString() {
        return getClass().toString() + "[url:" + url + ",deep:" + deep + "]";
    }
}


import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Overview: crawler worker class; this is where the main work happens
 *
 * @author hwz
 */
public class SimpleSpider implements Runnable {

    private String threadName;

    private SpiderUrl url;

    private SpiderQueue workQueue;

    public SimpleSpider(SpiderQueue workQueue, String threadName) {
        this.workQueue = workQueue;
        this.threadName = threadName;
    }

    @Override
    public void run() {
        System.out.println(threadName + " start run");
        // after 10 consecutive idle polls, finish the task
        int idle = 0;
        while (idle < 10) {
            url = workQueue.poll();
            if (url != null) {
                // parse the url
                parseUrl(url);
                idle = 0;
            }
            else {
                System.out.println(threadName + " idle...,times=" + idle++);
                try {
                    Thread.sleep(1000);
                }
                catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println(threadName + " end run...");
    }

    /**
     * Parse a url
     * @param url
     */
    private void parseUrl(SpiderUrl url) {
        if (url == null) {
            return;
        }
        try {
            int deep = url.getDeep() + 1;
            URL netUrl = new URL(url.getUrl());
            URLConnection connection = netUrl.openConnection();
            String contentType = connection.getContentType();
            // fetch the content
            String resource = getResource(connection);
            // extract the title
            String title = getTitle(resource);
            // extract the links
            List<String> urls = getUrls(resource);
            System.out.println(threadName + ",parseUrl url=" + url + ",contentType=" + contentType + ",title=" + title + ",urls=" + urls);
            // limit the crawl depth: if every discovered url were added to the work queue,
            // it would grow exponentially and the program would eventually die
            if (deep < 3) {
                SpiderUrl newUrl;
                for (String u : urls) {
                    newUrl = new SpiderUrl(u, deep);
                    if (!workQueue.isExsit(newUrl)) {
                        workQueue.add(newUrl);
                    }
                }
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Read the content of an http url
     * @param connection
     * @return String
     */
    private String getResource(URLConnection connection) {
        if (connection == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        InputStreamReader isr = null;
        try {
            InputStream inputStream = connection.getInputStream();
            isr = new InputStreamReader(inputStream, "UTF-8");
            int input;
            while ((input = isr.read()) != -1) {
                sb.append((char) input);
            }
        }
        catch (IOException e) {
            System.out.println(threadName + ",get resource error,connection=" + connection);
        }
        finally {
            if (isr != null) {
                try {
                    isr.close();
                }
                catch (IOException e) {
                    // ignore failures while closing
                }
            }
        }
        return sb.toString();
    }

    /**
     * Extract the title from the page content
     * @param content
     * @return String
     */
    private String getTitle(String content) {
        if (content == null) {
            return null;
        }
        Pattern pattern = Pattern.compile("(<title>.{1,}</title>)");
        Matcher matcher = pattern.matcher(content);
        String title = null;
        if (matcher.find()) {
            title = matcher.group(0).replaceAll("<title>", "").replaceAll("</title>", "");
        }
        return title;
    }

    /**
     * Extract the url links present in the page content
     * @param content
     * @return List<String>
     */
    private List<String> getUrls(String content) {
        if (content == null) {
            return null;
        }
        Pattern pattern = Pattern.compile("(<a.{1,}?href=['\"]?[a-zA-Z]+:\\/\\/[^\\s]*?[\\s>]{1})");
        Matcher matcher = pattern.matcher(content);
        String a;
        String lastChar;
        List<String> links = new ArrayList<String>();
        while (matcher.find()) {
            a = matcher.group(0).replaceAll("<a.{1,}?href=['\"]?", "");
            a = a.trim();
            lastChar = a.substring(a.length() - 1);
            if (lastChar.equals("'") || lastChar.equals("\"") || lastChar.equals(">")) {
                a = a.substring(0, a.length() - 1);
            }
            links.add(a);
        }
        return links;
    }
}


This code example is only meant to illustrate a simple crawler; the multi-threading and HTTP handling have not been given much thought. If you spot a mistake, please point it out.
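For example, one thing the code above does not do is set any timeouts on its URLConnection, so a single slow or unresponsive server can hold a worker thread indefinitely. A minimal improvement, sketched here as a hypothetical helper (the class name and timeout values are my own and not part of the original code), might look like this:

import java.net.URL;
import java.net.URLConnection;

// Hypothetical helper, not part of the example above: open a connection with
// explicit timeouts so one slow host cannot block a crawler thread forever.
public class TimeoutConnectionFactory {

    public static URLConnection open(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        connection.setConnectTimeout(5000);  // give up after 5s if the TCP connection cannot be established
        connection.setReadTimeout(10000);    // give up after 10s of waiting for data while reading the body
        return connection;
    }
}

parseUrl could then call TimeoutConnectionFactory.open(url.getUrl()) instead of netUrl.openConnection(), and a SocketTimeoutException would simply be caught by its existing IOException handler.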