您的位置:首页 > 理论基础 > 计算机网络

Java利用正则实现网络爬虫

2018-01-30 22:39 363 查看
工具类(待优化)

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Small regex-based scraping helper: downloads the text behind a URL and
 * extracts regular-expression matches from it.
 *
 * <p>The class is a singleton; obtain the shared instance through
 * {@link #getIntance()}. The instance holds no mutable state of its own.
 */
public class CreeperUtil {

    /** The single shared instance, created eagerly when the class is initialised. */
    private static final CreeperUtil INSTANCE = new CreeperUtil();

    /** Not instantiable from outside; use {@link #getIntance()}. */
    private CreeperUtil() {
    }

    /**
     * Returns the shared instance.
     *
     * @return the one and only {@code CreeperUtil}
     */
    public static CreeperUtil getIntance() {
        return INSTANCE;
    }

    /**
     * Downloads {@code url} and applies {@code regex} to the content on a worker
     * thread, then blocks until the result is available.
     *
     * @param url   location of the document to fetch
     * @param regex regular expression applied to the downloaded text
     * @param group one entry per capturing group to emit; see
     *              {@link #regular(String, String, String...)}
     * @return the formatted matches, or {@code null} if the task failed or the
     *         calling thread was interrupted while waiting
     */
    public String threadTool(String url, String regex, String... group) {
        // Only one task is ever submitted, so a single worker is enough.
        ExecutorService pool = Executors.newSingleThreadExecutor();
        try {
            Callable<String> task = new InitCallable(url, regex, group);
            Future<String> pending = pool.submit(task);
            return pending.get();
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can observe the interruption.
            Thread.currentThread().interrupt();
            System.err.println("CreeperUtil: interrupted while waiting for " + url);
        } catch (ExecutionException e) {
            System.err.println("CreeperUtil: task failed for " + url + ": " + e.getCause());
        } finally {
            // Without this the non-daemon worker thread keeps the JVM alive.
            pool.shutdown();
        }
        return null;
    }

    /**
     * Task that fetches a document and runs the regular expression over it.
     * The download happens inside {@link #call()}, i.e. on the thread that
     * executes the task, not on the thread that constructs it.
     */
    class InitCallable implements Callable<String> {

        private String url;

        /** Downloaded content; filled in on the first {@link #call()}. */
        private String data;

        private String regex;

        private String[] group;

        /** Creates an unconfigured task; {@link #call()} will fail for it. */
        InitCallable() {
        }

        /**
         * @param url   location of the document to fetch
         * @param regex regular expression applied to the downloaded text
         * @param group one entry per capturing group to emit
         */
        public InitCallable(String url, String regex, String... group) {
            this.url = url;
            this.regex = regex;
            this.group = group;
        }

        /**
         * Fetches the document (once) and returns the formatted matches.
         *
         * @return the result of {@link CreeperUtil#regular(String, String, String...)}
         * @throws Exception if the expression is missing or invalid, or a
         *                   requested group does not exist
         */
        @Override
        public String call() throws Exception {
            if (data == null && url != null) {
                data = mesh(url);
            }
            return regular(data, regex, group);
        }
    }

    /**
     * Fetches the resource behind {@code url} as text.
     *
     * <p>Lines are concatenated without their line terminators. Bytes are
     * decoded with the platform default charset. This is best-effort: on
     * failure the problem is reported on {@code System.err} and whatever was
     * read up to that point is returned.
     *
     * @param url location to read, e.g. an {@code http:} or {@code file:} URL
     * @return the content read so far; empty if nothing could be read
     */
    public String mesh(String url) {
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the whole reader chain and, unlike the
        // former manual finally block, cannot fail on a reader that was never opened.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(url).openConnection().getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        } catch (IOException | RuntimeException e) {
            System.err.println("CreeperUtil: failed to read " + url + ": " + e);
        }
        return content.toString();
    }

    /**
     * Applies {@code regex} to {@code data} and formats every match on its own
     * line (terminated by {@code '\n'}).
     *
     * <p>With no {@code group} entries the whole match is emitted. Otherwise
     * capturing groups {@code 1..group.length} are emitted, each followed by a
     * tab. Only the number of entries matters; their values are not consulted.
     *
     * @param data  text to search
     * @param regex regular expression to apply
     * @param group one entry per capturing group to emit; {@code null} is
     *              treated like an empty array
     * @return the formatted matches; empty if nothing matched
     * @throws IndexOutOfBoundsException if more groups are requested than the
     *                                   expression defines
     */
    public String regular(String data, String regex, String... group) {
        int wanted = (group == null) ? 0 : group.length;
        Matcher matcher = Pattern.compile(regex).matcher(data);
        StringBuilder out = new StringBuilder();
        while (matcher.find()) {
            if (wanted == 0) {
                out.append(matcher.group());
            } else {
                for (int i = 1; i <= wanted; i++) {
                    out.append(matcher.group(i)).append('\t');
                }
            }
            out.append('\n');
        }
        return out.toString();
    }
}
客户端调用

class Client{
public static void main(String[]
args) {
String
url =
"file:///C:/Users/Administrator/Desktop/HTML%E4%BB%A3%E7%A0%81/1.html";
String
regex =
"<li.*?J_Cat a-all\">.*?<a.*?data-dataid=\"(.*?)\".*?>(.*?)</a>.*?<a.*?data-dataid=\"(.*?)\".*?>(.*?)</a>.*?<a.*?data-dataid=\"(.*?)\".*?>(.*?)</a>.*?<\\/li>";
String
group[]
= {"1","2","3","4","5","6"};
long start =
System.currentTimeMillis();
String
result =
CreeperUtil.getIntance().threadTool(url,regex,group);
long end =
System.currentTimeMillis();
System.out.println(result+"\n花费时间:"+(end-start));
}
}
内容来自用户分享和网络整理,不保证内容的准确性。如有侵权内容,可联系管理员处理(点击这里给我发消息)。
标签: