
Writing a Crawler to Fetch Nationwide Region Data

2017-03-03 15:14
The company needed region/address data for all of China. After searching online for ages, all I found was this one site. So what do we do? We're programmers — just write a crawler and get on with it!



Let's look at the test driver first:

public static void main(String[] args) throws IOException {

// Build a page processor
PageProcessor processor = new NetworkProcessor();
// Build two data extractors dedicated to pulling out the region data
AnalyHandler analy = new CityUrlExtractHandler();
analy.setAnalyHandler(new CityNameExtractHandler());
// Build a page data filter
PageFilter filter = new DefaultPageFilter();

// Build a page object
Page page = new Page();
page.setRegex("<td>.*?</td>");// extraction rule for the whole page
page.setUrl("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html");// the URL to crawl
page.setLevel(1); // page depth starts at 1

// Build the crawler launcher
InvokeCenter ic = new InvokeCenter(processor, analy);
ic.setPageFilter(filter);

// Fetch the crawl result
List<Map<String,String>> result = ic.start(page);
String fileDir = "C:\\Users\\Administrator\\Desktop\\cityData.txt";

fullDisk(result,fileDir);
}



This took a bit over two hours to write and feels pretty decent. Only InvokeCenter, the crawler launch center, still needs changes so that it can support crawling all kinds of pages. I'm not going to do that here, but if you grab my code you can try modifying it yourself (there is a rough sketch of one approach at the end of this post)!

OK, here is the full code!

package com.fyrj.compoment.crawler;

/***
* Represents an abstract web page
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public class Page {

/***
* Page URL
*/
private String url ;

/***
* Raw page content
*/
private String viewData;

/***
* Regular expression used to extract data from this page
*/
private String regex;

/***
* Page depth
*/
private int level;

public String getUrl() {
return url;
}

public void setUrl(String url) {
this.url = url;
}

public String getViewData() {
return viewData;
}

public void setViewData(String viewData) {
this.viewData = viewData;
}

public String getRegex() {
return regex;
}

public void setRegex(String regex) {
this.regex = regex;
}

public int getLevel() {
return level;
}

public void setLevel(int level) {
this.level = level;
}

}


package com.fyrj.compoment.crawler;

import java.util.UUID;

/***
* Static sequence-number and ID generator
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public class NumberAndIdGenerator {
private static int number = 100001;

private NumberAndIdGenerator() {}

/***
* Singleton implemented with a static inner class
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public static class Inner{
private static NumberAndIdGenerator instance = new NumberAndIdGenerator();
}

public static NumberAndIdGenerator getInstence(){
return Inner.instance;
}
/***
* Generates a sequential number
* @return
*/
public static String createNumber(){
return ++number+"";
}

/***
* Generates a unique ID
* @return
*/
public static String createId(){
return UUID.randomUUID().toString().replaceAll("-", "");
}

}


package com.fyrj.compoment.crawler;

/***
* Interface for obtaining page data
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public interface PageProcessor {

Page resolverPage(Page page);

}


package com.fyrj.compoment.crawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/***
* Fetches page data over the network
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public class NetworkProcessor implements PageProcessor {

@Override
public Page resolverPage(Page page) {
String urlStr = page.getUrl();
if( urlStr == null || urlStr.equals("")){
throw new RuntimeException("page url not be null!");
}
try{

URL url = new URL(urlStr);
HttpURLConnection urlConnection = (HttpURLConnection)url.openConnection();
int responsecode=urlConnection.getResponseCode();
BufferedReader reader;
StringBuffer bufferData = new StringBuffer();
if(responsecode==200){
reader=new BufferedReader(new InputStreamReader(urlConnection.getInputStream(),"GBK"));
String lineData = null;
while( null!=(lineData=reader.readLine())){
bufferData.append(lineData);
}
}else{
throw new IOException("获取不到网页的源码,服务器响应代码为:"+responsecode+"URL为:"+urlStr);
}
page.setViewData(bufferData.toString());
}catch(Exception e){
throw new RuntimeException("获取不到网页的源码,出现异常:",e);
}

return page;
}

}
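
One caveat about NetworkProcessor: it relies on HttpURLConnection's default settings, so a slow or stalled response from the server can hang the whole crawl. Below is a minimal sketch of a hardened variant — the class name TimeoutNetworkProcessor, the timeout values and the User-Agent string are my own choices, not part of the original code — that sets connect/read timeouts and a User-Agent header:

package com.fyrj.compoment.crawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/***
* Hypothetical hardened variant of NetworkProcessor (sketch only, not in the original code)
*/
public class TimeoutNetworkProcessor implements PageProcessor {

@Override
public Page resolverPage(Page page) {
try{
HttpURLConnection conn = (HttpURLConnection) new URL(page.getUrl()).openConnection();
conn.setConnectTimeout(5000);  // give up connecting after 5 seconds
conn.setReadTimeout(10000);    // give up waiting for data after 10 seconds
conn.setRequestProperty("User-Agent", "Mozilla/5.0");
StringBuilder bufferData = new StringBuilder();
if( conn.getResponseCode() == 200 ){
BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(),"GBK"));
String lineData;
while( null != (lineData = reader.readLine()) ){
bufferData.append(lineData);
}
reader.close();
}
page.setViewData(bufferData.toString());
}catch(Exception e){
throw new RuntimeException("Failed to fetch " + page.getUrl(), e);
}
return page;
}

}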


package com.fyrj.compoment.crawler;

import java.util.Map;

/***
* Definition of a dynamic matching rule
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public interface RuleDefinition {

/***
* The regular expression that defines this rule
* @return
*/
String getRegex();

/***
* Parses the matched line and fills in the result map
* @param lineData
* @param result
* @return
*/
boolean fullData( String lineData, Map<String,String> result );
}


package com.fyrj.compoment.crawler;

/***
* Page parsing filter
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public abstract class  PageFilter {
protected PageFilter pageFilter ;

public Page doFilter(Page page){
this.filter(page);
if(this.pageFilter!=null){
pageFilter.filter(page);
}
return page;
};

protected abstract void filter(Page page);
}


package com.fyrj.compoment.crawler;

/***
* A filter tailored to this particular region-data site
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public class DefaultPageFilter extends PageFilter {

@Override
public void filter(Page page) {
if( page.getLevel() >1 ){
page.setRegex("<td>.*?</td><td>.*?</td>");
page.setViewData(page.getViewData()
.replaceAll("<td>名称</td>", "")
.replaceAll("<td>\\D</td>", "")
);
}

}

}
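
The PageFilter/DefaultPageFilter pair is the extension point you would swap out when crawling a different site. As an illustration only — the class name and the cleanup rule below are hypothetical, not part of the original code — another implementation could, for example, strip HTML comments before the line-level regexes run:

package com.fyrj.compoment.crawler;

/***
* Hypothetical example of a site-specific filter (sketch only, not in the original code)
*/
public class CommentStripFilter extends PageFilter {

@Override
protected void filter(Page page) {
// Remove HTML comments so they cannot accidentally match the extraction regexes
if( page.getViewData() != null ){
page.setViewData(page.getViewData().replaceAll("<!--.*?-->", ""));
}
}

}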


package com.fyrj.compoment.crawler;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/***
* Data extractor
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public abstract class AnalyHandler {

protected String dataStr;
protected AnalyHandler analyHandler ;

// Handlers are invoked in a chain until processing is finished
public Map<String,String> analyStart(){
Map<String,String> result = new HashMap<>();
analyDefine(result);
if( this.analyHandler!=null ){
analyHandler.setDataStr(dataStr);
analyHandler.analyDefine(result);
}
return result;
}

protected Map<String,String> analyDefine( Map<String,String> result ){
// Walk the rule chain until one rule manages to extract data; if none of them match, just give up
List<RuleDefinition> rList = getRuleDefinitionChain();
for (RuleDefinition ruleDefinition : rList) {
String regex = ruleDefinition.getRegex();
List<String> list = new ArrayList<String>();
Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
Matcher ma = pa.matcher(dataStr);
while (ma.find()){
list.add(ma.group());
}
if(list.size()>0){
String lineData = list.get(0);
boolean fullResult = ruleDefinition.fullData(lineData, result);
// If this rule extracted something, stop here; otherwise move on to the next rule in the chain
if(fullResult){
return result;
}
}
}
return result;
}

public void setAnalyHandler( AnalyHandler analyHandler){
this.analyHandler = analyHandler;
}

public void setDataStr( String dataStr){
this.dataStr = dataStr;
};

abstract List<RuleDefinition> getRuleDefinitionChain();
}


package com.fyrj.compoment.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/***
* Extracts the city/region name
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public class CityNameExtractHandler extends AnalyHandler{

@Override
List<RuleDefinition> getRuleDefinitionChain() {
List<RuleDefinition> rList = new ArrayList<>();
// Rule 1: applies to the first-level pages
rList.add(new RuleDefinition() {
@Override
public String getRegex() {
return "html'>.*?<br/>";
}

@Override
public boolean fullData(String lineData, Map<String, String> result) {
String cityName = lineData.substring(lineData.lastIndexOf("html'>")+6,lineData.lastIndexOf(
bfa0
"<br/>"));;
if(cityName==null || "".equals(cityName)){
// Nothing was extracted, so this attempt failed
return false;
}
result.put("CITY_NAME", cityName);
return true;
}
});

// Rule 2: applies to the intermediate-level pages
rList.add(new RuleDefinition() {
@Override
public String getRegex() {
return "html'>\\D*?</a>";
}

@Override
public boolean fullData(String lineData, Map<String, String> result) {
String cityName = lineData.substring(lineData.lastIndexOf("html'>")+6,lineData.lastIndexOf("</a>"));;
if(cityName==null || "".equals(cityName)){
//如果没解析到数据,证明这次是失败的!
return false;
}
result.put("CITY_NAME", cityName);
return true;
}
});

// Rule 3: applies to the last-level pages
rList.add(new RuleDefinition() {
@Override
public String getRegex() {
return "<td>\\D*?</td>";
}

@Override
public boolean fullData(String lineData, Map<String, String> result) {
String cityName = lineData.substring(lineData.lastIndexOf("<td>")+4,lineData.lastIndexOf("</td>"));;
if(cityName==null || "".equals(cityName)){
//如果没解析到数据,证明这次是失败的!
return false;
}
result.put("CITY_NAME", cityName);
return true;
}
});
return rList;
}

}


package com.fyrj.compoment.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/***
* Extracts the URL of the next-level page
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public class CityUrlExtractHandler extends AnalyHandler{

@Override
List<RuleDefinition> getRuleDefinitionChain() {
List<RuleDefinition> rList = new ArrayList<>();
rList.add(new RuleDefinition() {
@Override
public String getRegex() {
return "<a href='.*?\\.html'";
}

@Override
public boolean fullData(String lineData, Map<String, String> result) {
String url = lineData.substring(lineData.lastIndexOf("href='")+6,lineData.lastIndexOf(".html")+5);
if(url==null || "".equals(url)){
// Nothing was extracted, so this attempt failed
return false;
}
result.put("URL",url);
return true;
}
});
return rList;
}
}


package com.fyrj.compoment.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/***
* Crawler launcher
* @author ying.cai
* @email 919126624@qq.com
* @version 1.0
*/
public class InvokeCenter {

// Page processor: knows how to open/fetch a page
private PageProcessor processor;

// Extraction handler chain, applied to each matched line of data
private AnalyHandler analyHandler;

// Page filter, used to swap the matching rule and clean up the page data
private PageFilter pageFilter;

public InvokeCenter( PageProcessor processor,AnalyHandler analy ) {
this.processor = processor;
this.analyHandler = analy;
}

public void setPageFilter(PageFilter pageFilter){
this.pageFilter = pageFilter;
}

public List<Map<String,String>> start( Page page){
List<Map<String,String>> result = new ArrayList<Map<String,String>>();
resolverPage(page);
result = capacityPageData(page,result,null);
return result;
}

public void resolverPage(Page page){
page = processor.resolverPage(page);
}

/***
* Recursively parses a page and its child pages
* @param page the abstract page; at different crawl depths the matching rule may need to change
* @param result the accumulated result rows
* @param parentId ID of the parent record
* @return
*/
public List<Map<String,String>> capacityPageData(Page page,List<Map<String,String>> result,String parentId){
// Adjust the data-matching rule; a filter design is used here
if( null!=pageFilter ){
pageFilter.doFilter(page);
}
// Parse the page data with the regex
List<String> list = new ArrayList<String>();
if( page.getRegex()==null || "".equals(page.getRegex()) ){
throw new RuntimeException("page regex not be null!");
}
Pattern pa = Pattern.compile(page.getRegex(), Pattern.CANON_EQ);
Matcher ma = pa.matcher(page.getViewData());
while (ma.find()){
list.add(ma.group());
}

// Now analyze each matched line and turn it into the map structure we need
for (String lineStr : list) {

analyHandler.setDataStr(lineStr);
// The data map produced after the line has passed through the chain of extractors
Map<String,String> map = analyHandler.analyStart();

afterProcess(result, page.getLevel(), parentId, map);

// If a URL was extracted, recurse into the child page
if( map.containsKey("URL") ){
// This line really ought to be extracted, because the URL-building rule changes from site to site!
String newUrl = page.getUrl().substring(0,page.getUrl().lastIndexOf("/")+1) + map.get("URL");

Page newPage = new Page();
newPage.setUrl(newUrl);
newPage.setRegex(page.getRegex());
// Go one level deeper: page depth + 1
newPage.setLevel(page.getLevel()+1);
resolverPage(newPage);
capacityPageData(newPage, result,map.get("ID"));
}
}
return result;
}

/***
* Post-processing of an extracted row
* @param result
* @param level
* @param parentId
* @param map
*/
private void afterProcess(List<Map<String, String>> result, int level, String parentId, Map<String, String> map) {
if(map.get("CITY_NAME")!=null){
// Enrich the extracted row with ID, number, level and parent ID
map.put("ID", NumberAndIdGenerator.getInstence().createId());
map.put("NUMBER", NumberAndIdGenerator.getInstence().createNumber());
map.put("LEVEL", level+"");
map.put("PARENT_ID", parentId);
result.add(map);
System.out.println(map);
}
}

}


package com.fyrj.compoment.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Map;

public class Test {

public static final String SQL_TEMPLATE = "insert into table_name (id, name, number, level, parent_id)"
+" values ('#ID#','#NAME#','#NUMBER#','#LEVEL#','#PARENT_ID#') ; ";

public static void main(String[] args) throws IOException {

// Build a page processor
PageProcessor processor = new NetworkProcessor();
// Build two data extractors dedicated to pulling out the region data
AnalyHandler analy = new CityUrlExtractHandler();
analy.setAnalyHandler(new CityNameExtractHandler());
// Build a page data filter
PageFilter filter = new DefaultPageFilter();

// Build a page object
Page page = new Page();
page.setRegex("<td>.*?</td>");// extraction rule for the whole page
page.setUrl("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html");// the URL to crawl
page.setLevel(1); // page depth starts at 1

// Build the crawler launcher
InvokeCenter ic = new InvokeCenter(processor, analy);
ic.setPageFilter(filter);

// Fetch the crawl result
List<Map<String,String>> result = ic.start(page);
String fileDir = "C:\\Users\\Administrator\\Desktop\\cityData.txt";

fullDisk(result,fileDir);
}

/***
* Writes the result to disk
* @param result
* @throws IOException
*/
public static void fullDisk(List<Map<String,String>> result,String fileDir) throws IOException{
BufferedWriter writer = new BufferedWriter(new FileWriter(new File(fileDir)));
try {
for (Map<String, String> map : result) {
writer.write( findFinalStr(map) );
writer.newLine();
writer.flush();
}
} catch (IOException e) {
// Propagate write failures instead of silently swallowing them
throw e;
}finally{
if( writer!=null ){
writer.close();
}
}
}

public static String findFinalStr( Map<String, String> map ){
String finalStr = Test.SQL_TEMPLATE.replaceAll("#ID#", map.get("ID"))
.replaceAll("#NAME#", map.get("CITY_NAME"))
.replaceAll("#NUMBER#", map.get("NUMBER"))
.replaceAll("#LEVEL#", map.get("LEVEL"))
.replaceAll("#PARENT_ID#", map.get("PARENT_ID")==null?"NULL":map.get("PARENT_ID"));
return finalStr;
}
}


Please forgive the rough spots — I wrote this without much deliberation, but it is enough to build the nationwide region data set. You can still modify InvokeCenter so that it dynamically supports all kinds of pages; a rough sketch of one way to do that follows.
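
For reference, here is one way the hard-coded URL construction inside InvokeCenter.capacityPageData could be pulled out. This is only a sketch under names I made up — UrlBuilder and StatsGovUrlBuilder are not part of the original code — but it shows the idea: InvokeCenter would hold a UrlBuilder and call buildChildUrl(...) instead of doing the substring concatenation itself, so supporting a new site only means supplying a new implementation.

package com.fyrj.compoment.crawler;

/***
* Hypothetical extension point (sketch only): turns a relative link extracted from a page
* into the absolute URL of the child page to crawl next
*/
public interface UrlBuilder {
String buildChildUrl(Page parentPage, String extractedUrl);
}

/***
* Hypothetical implementation matching the stats.gov.cn layout used in this post:
* child links are relative to the directory of the parent page
*/
class StatsGovUrlBuilder implements UrlBuilder {
@Override
public String buildChildUrl(Page parentPage, String extractedUrl) {
String parentUrl = parentPage.getUrl();
return parentUrl.substring(0, parentUrl.lastIndexOf("/")+1) + extractedUrl;
}
}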
Tags: java, crawler, design patterns