您的位置:首页 > 其它

提取网页链接

2016-05-03 10:33 337 查看
package com.zyw.regex;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Two-level link scraper driven by regular expressions.
 *
 * <p>{@link #main(String[])} collects the links found on a start page, then
 * collects the links found on each of those pages, and prints the second-level
 * result as numbered {@code url -----text} lines.
 */
public class TestRegularExpression {

    /** Stop expanding first-level links once more than this many second-level links are held. */
    private static final int MAX_SECOND_LEVEL_LINKS = 1000;

    /**
     * Scrapes {@code http://www.qq.com/}, follows every link found there one
     * level deep, and prints the links gathered from those pages.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Map<UrlContent, Boolean> firstallUrl = new LinkedHashMap<>();
        Map<UrlContent, Boolean> secondallUrl = new LinkedHashMap<>();

        // Matches a whole <a ... href=...>...</a> element.
        Pattern p = Pattern.compile("<a.*?href=[\"']?((https?://)?/?[^\"']+)[\"']?.*?>(.+)</a>");
        // Matches an absolute URL; the lookahead requires it to be followed by a double quote.
        Pattern p1 = Pattern.compile("(https?|ftp|http)://[a-zA-Z0-9]*.[a-zA-Z0-9]+.\\w{2,3}/[\\w\\d-/.]*(?=\")");
        // Matches the text between '>' and the closing </a>.
        Pattern p2 = Pattern.compile("(?<=>)[\\w\\s\u4e00-\u9fa5]*(?=</a>)");

        addUrl(firstallUrl, "http://www.qq.com/", p, p1, p2);

        for (UrlContent key : firstallUrl.keySet()) {
            addUrl(secondallUrl, key.getUrl(), p, p1, p2);
            // The cap is checked after each page, so the final size may exceed it.
            if (secondallUrl.size() > MAX_SECOND_LEVEL_LINKS) {
                break;
            }
        }

        int i = 0;
        for (UrlContent key : secondallUrl.keySet()) {
            System.out.println(++i + " " + key.getUrl() + " -----" + key.getContent());
        }
    }

    /**
     * Downloads {@code link}, scans it line by line for anchor elements, and
     * records every anchor that yields a URL in {@code allUrl} with the value
     * {@code false}.
     *
     * <p>The page is decoded as UTF-8 and matched one line at a time, so an
     * anchor that spans several lines is not found. Whether repeated links
     * collapse into one entry depends on {@code UrlContent}'s
     * {@code equals}/{@code hashCode}.
     *
     * <p>Failures to parse the link or to read the page are reported on
     * standard error and otherwise ignored; entries added before the failure
     * are kept.
     *
     * @param allUrl map that receives the extracted links as keys
     * @param link   address of the page to scan
     * @param p      pattern matching a whole anchor element
     * @param p1     pattern applied to the matched anchor to extract its URL
     * @param p2     pattern applied to the matched anchor to extract its text
     */
    public static void addUrl(Map<UrlContent, Boolean> allUrl, String link, Pattern p, Pattern p1, Pattern p2) {
        // try-with-resources closes the connection even when reading fails.
        try (InputStream in = new URL(link).openStream();
             BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
            String line;
            // readLine() is called only here: a second call inside the body
            // would silently discard every other line of the page.
            while ((line = br.readLine()) != null) {
                Matcher anchors = p.matcher(line);
                while (anchors.find()) {
                    String anchor = anchors.group();
                    UrlContent content = new UrlContent();

                    // When several matches exist the last one wins.
                    Matcher urls = p1.matcher(anchor);
                    while (urls.find()) {
                        content.setUrl(urls.group());
                    }
                    Matcher texts = p2.matcher(anchor);
                    while (texts.find()) {
                        content.setContent(texts.group());
                    }

                    // Anchors without a recognisable absolute URL are dropped.
                    if (content.getUrl() != null) {
                        allUrl.put(content, false);
                    }
                }
            }
        } catch (MalformedURLException e) {
            System.err.println("Skipping malformed link: " + link);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed to read: " + link);
            e.printStackTrace();
        }
    }

}


package com.zyw.regex;

public class UrlContent {
private String url;
private String content;
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: