提取网页链接
2016-05-03 10:33
337 查看
package com.zyw.regex; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; public class TestRegularExpression { public static void main(String[] args) { Map<UrlContent,Boolean> firstallUrl=new LinkedHashMap(); Map<UrlContent,Boolean> secondallUrl=new LinkedHashMap(); Pattern p=Pattern.compile("<a.*?href=[\"']?((https?://)?/?[^\"']+)[\"']?.*?>(.+)</a>");//匹配整个<a></a> Pattern p1=Pattern.compile("(https?|ftp|http)://[a-zA-Z0-9]*.[a-zA-Z0-9]+.\\w{2,3}/[\\w\\d-/.]*(?=\")");//匹配url Pattern p2=Pattern.compile("(?<=>)[\\w\\s\u4e00-\u9fa5]*(?=</a>)");//匹配<a></a>中内容 addUrl(firstallUrl, "http://www.qq.com/", p, p1, p2); for (Iterator it = firstallUrl.keySet().iterator(); it.hasNext();) { UrlContent key = (UrlContent) it.next(); addUrl(secondallUrl, key.getUrl(), p, p1, p2); if (secondallUrl.size() > 1000) break; } int i = 0; for (UrlContent key : secondallUrl.keySet()) { System.out.println(++i + " " + key.getUrl() + " -----"+ key.getContent()); } } public static void addUrl(Map<UrlContent, Boolean> allUrl,String link, Pattern p,Pattern p1, Pattern p2) { try { URL url = new URL(link); InputStream in = url.openStream(); InputStreamReader isr = new InputStreamReader(in, "utf-8"); BufferedReader br = new BufferedReader(isr); String s = ""; while ((s = br.readLine()) != null) { Matcher m=p.matcher(s); while (m.find()){ UrlContent content=new UrlContent(); String text=m.group(); Matcher m1=p1.matcher(text); Matcher m2=p2.matcher(text); while (m1.find()){ content.setUrl(m1.group()); } while (m2.find()){ content.setContent(m2.group()); } if(content.getUrl()!=null) allUrl.put(content, false); } s = br.readLine(); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }
package com.zyw.regex; public class UrlContent { private String url; private String content; public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } }
相关文章推荐
- Android学习笔记(一)——Activity基础知识
- ring0下cr0的作用
- 缺少local文件导致,资源文件读取失败
- 35.Spark系统运行内幕机制循环流程
- Swift—属性观察者-备
- Oracle数据库中CLOB类型字段取出转换String类型方法
- 这个可以有
- PHP简单创建日历的方法
- memcached for windows 修改端口和最大内存,以及常用命令
- memcached for windows 修改端口和最大内存,以及常用命令
- IO管理一基础
- Java核心知识点-NIO
- mysql数据库text、varchar、blob数据类型比较
- SVN 错误:Error validating server certificate for 'https://xxxxxxx':443... Mac os svn客户端证书验证缓存 解决
- HTTP POST请求报文格式分析与Java实现文件上传
- [剑指offer]最小的K个数
- 这个可以有
- NET在SQL Server中的图片存取技术
- 【Linux笔记】Linux目录结构
- Exchange(2007/2010/2013)共存环境中IMAP和POP的工作方式