【单机版】一个小爬虫+PageRank代码实现
2015-11-01 22:08
561 查看
在这个小程序里边,首先使用一个爬虫获取网页的出链网址,然后再对获取到的所有网页执行PageRank算法。
import java.io.IOException; import java.util.ArrayList; import java.util.Scanner; public class GetWebStructure { public String StartAddress; public int times; public ArrayList<WebNodeWithLink> partOfWebStructure=new ArrayList<WebNodeWithLink>(); public ArrayList<WebNode> webStructure=new ArrayList<WebNode>(); public ArrayList<WebsiteToNumber> Mapping=new ArrayList<WebsiteToNumber>();//这个结果用于存储web站点和列号之间的关系 public GetWebStructure(String StartAddress,int times){ this.StartAddress=StartAddress; this.times=times; } public void run() { String sourcePage=StartAddress; WebNodeWithLink tmpOfWebNodeWithLink=new WebNodeWithLink(sourcePage,null); partOfWebStructure.add(tmpOfWebNodeWithLink); do{ //System.out.println("正在处理partOfWebStructuresize中的新元素"); int count=0; int thisWebNumber=-1; //查看当前节点是否已经被存储 for(int n=0;n<Mapping.size();n++) if(Mapping.get(n).website.equals(partOfWebStructure.get(0).sourcePage)) { count++; thisWebNumber=Mapping.get(n).number; } //如果已经被存储,则不重新采集它的子节点,否则进入采集器陷阱 if(count==0) { //当webStructure的大小超过预定times时,不再对partOfWebStructure进行追加 //因为这时候partOfWebStructure中包含的节点已经足以为webStructure中的节点添加子节点 //也不再对webStructure和Mapping进行追加,因为追加后的新节点ID作为times个节点的子节点没有意义 //所以只有partOfWebStructure中的节点在Mapping中已经存在,才会将其标记为times个节点中某个节点的子节点 if(webStructure.size()<times){ //为partOfWebStructure中当前元素获取出链节点 try{ partOfWebStructure.get(0).setTargetPage();//在使用这个记录时才设置它的链出网址和出度 }catch (Exception e){ partOfWebStructure.remove(0); continue;//出错则进行下一轮do while循环 } //将获得的出链节点添加到partOfWebStructure中 for(int t=0;t<partOfWebStructure.get(0).outDegree;t++){ sourcePage=partOfWebStructure.get(0).targetPage.get(t); tmpOfWebNodeWithLink=new WebNodeWithLink(sourcePage,partOfWebStructure.get(0).sourcePage); //必须被添加到partOfWebStructure,否则当前的子节点将没有机会被添加到它的父节点 //带来了采集器陷阱 partOfWebStructure.add(tmpOfWebNodeWithLink); } //将使用过的网址信息映射为数字放置在Mapping中 WebsiteToNumber tmpOfWebsiteToNumber=new WebsiteToNumber(); tmpOfWebsiteToNumber.website=partOfWebStructure.get(0).sourcePage; thisWebNumber=Mapping.size(); 
tmpOfWebsiteToNumber.number=thisWebNumber; Mapping.add(tmpOfWebsiteToNumber); //将使用过的元素储存在webStructure中,但是这里的使用过的元素并不完整,将在每次向Mapping中添加 使用过的元素的子元素时补充targetPage值 WebNode tmpOfWebNode=new WebNode(tmpOfWebsiteToNumber.number); webStructure.add(tmpOfWebNode); } } if(thisWebNumber!=-1){//当前元素既不存在于Mapping中,同时不在times的范围内,便会=-1,不为它的父节点追加出链 //寻找父节点的ID int parentPageNumber=0; for(int i=0;i<Mapping.size();i++) { if(Mapping.get(i).website.equals(partOfWebStructure.get(0).parentPage)){ parentPageNumber=Mapping.get(i).number; break; } } //为使用过的元素的父节点添加targetPage for(int i=0;i<webStructure.size();i++) { if(webStructure.get(i).sourcePage==parentPageNumber&&(!webStructure.get(i).targetPage.contains(new Integer(thisWebNumber)))){ webStructure.get(i).targetPage.add(new Integer(thisWebNumber)); break; } } } //将partOfWebStructuresize中使用过的节点删除 partOfWebStructure.remove(0); System.out.printf("%7d",webStructure.size()); //当partOfWebStructure存储为空时,即里边的元素完全被remove掉,循环结束 }while(partOfWebStructure.size()!=0); System.out.println("\n运算完成"); } public static void main(String[] args) throws IOException, InterruptedException{ System.setProperty("sun.net.client.defaultConnectTimeout", String.valueOf(100000)); System.setProperty("sun.net.client.defaultReadTimeout", String.valueOf(100000)); int webnumbers=0; Scanner keyboard=new Scanner(System.in); System.out.println("您打算使用获取多少个网页?"); webnumbers=keyboard.nextInt(); System.out.println("正在运算请稍后......."); GetWebStructure test=new GetWebStructure("http://www.baidu.com",webnumbers);//本来设置的2147483647,结果outofmemeory test.run(); //test.print(); System.out.println("您打算使向量v和v'之间的差距小于多少时停止迭代?"); double difference; difference=keyboard.nextDouble(); keyboard.close(); PageRank PageRankTest=new PageRank(test.webStructure,test.Mapping,difference); PageRankTest.computeRank(); PageRankTest.print("网页数目"+webnumbers+"迭代差距"+difference); } }
import java.io.IOException; import java.util.ArrayList; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class WebNodeWithLink { String sourcePage; String parentPage; int outDegree; ArrayList<String> targetPage=new ArrayList<String>(); public WebNodeWithLink(String sourcePage,String parentPage) { this.sourcePage=sourcePage; this.parentPage=parentPage; } public void setTargetPage(){ Document doc=null; try { doc = Jsoup.connect(sourcePage).get(); Elements links=doc.select("a[href~=^(?!(javascript:|_blank|#)).*$]"); for(Element link:links){ String linkHref=link.attr("href");//取得链接地址 if(linkHref.equals("/")) linkHref=sourcePage; if(linkHref.length()>2&&linkHref.substring(0, 2).equals("//")) linkHref="http:"+linkHref; if(linkHref.length()>1&&linkHref.substring(0, 1).equals("/")) linkHref=sourcePage+linkHref; if(!linkHref.equals("")) targetPage.add(linkHref); } outDegree=targetPage.size(); } catch (IOException e1) { //System.out.println("出现异常,异常网址为:"+sourcePage); //e1.printStackTrace(); } } }
import java.util.ArrayList;

/**
 * A page in the numbered link graph: its own id plus the ids of the pages it
 * links to.
 */
public class WebNode {
    /** Id of this page. */
    int sourcePage;
    /** Ids of the pages this page links to; starts out empty. */
    ArrayList<Integer> targetPage;

    /**
     * Creates a node with no outgoing links.
     *
     * @param sourcePage id of the page this node represents
     */
    public WebNode(int sourcePage) {
        this.targetPage = new ArrayList<Integer>();
        this.sourcePage = sourcePage;
    }
}
/**
 * Plain holder that ties a page URL to the integer id (column number)
 * assigned to it.
 */
public class WebsiteToNumber {

    /** URL of the page. */
    public String website;

    /** Id assigned to {@link #website}. */
    public int number;
}
import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.PrintWriter; import java.math.BigDecimal; import java.util.ArrayList; import java.util.concurrent.CountDownLatch; public class PageRank { private ArrayList<WebNode> webStructure=new ArrayList<WebNode>(); private ArrayList<WebsiteToNumber> Mapping=new ArrayList<WebsiteToNumber>(); double difference; double []v; triple []M;//M是转移矩阵,但是它是一个稀疏矩阵,可以采用三元组的方式进行表示以减少内存使用 public PageRank(ArrayList<WebNode> webStructure,ArrayList<WebsiteToNumber> Mapping,double difference){ this.webStructure=webStructure; this.Mapping=Mapping; this.difference=difference; //对转移矩阵M进行初始化 int webStructureSize=this.webStructure.size(); int length=0; for(int countLength=0;countLength<webStructureSize;countLength++) length+=this.webStructure.get(countLength).targetPage.size(); M=new triple[length]; int count=0; for(int targetPageNumber=0;targetPageNumber<webStructureSize;targetPageNumber++){ for(int webNumber=0;webNumber<webStructureSize;webNumber++) { for(int find=0;find<webStructure.get(webNumber).targetPage.size();find++) if(webStructure.get(webNumber).targetPage.get(find).intValue()==webStructure.get(targetPageNumber).sourcePage) { M[count]=new triple(); M[count].row=targetPageNumber; M[count].column=webNumber; count++; break; } } } } //下边进行PageRank的矩阵运算 public void computeRank() throws InterruptedException{ int webStructureSize=webStructure.size(); v=new double[webStructureSize]; double InitialValue=1.0/webStructureSize; double []v_New=new double[webStructureSize]; for(int i=0;i<webStructureSize;i++) v_New[i]=InitialValue; double nowDifference; do{ //为下一轮迭代做准备 for(int i=0;i<webStructureSize;i++){ v[i]=v_New[i]; v_New[i]=0; } for(int targetPageNumber=0;targetPageNumber<webStructureSize;targetPageNumber+=4){ CountDownLatch latch=new CountDownLatch(4); Thread thread1=new MatrixMultiplication(M,webStructure,v,v_New,(targetPageNumber+0), latch); thread1.start(); if((targetPageNumber+1)<webStructureSize){ Thread 
thread2=new MatrixMultiplication(M,webStructure,v,v_New,(targetPageNumber+1),latch); thread2.start(); } else{latch.countDown();} if((targetPageNumber+2)<webStructureSize){ Thread thread3=new MatrixMultiplication(M,webStructure,v,v_New,(targetPageNumber+2),latch); thread3.start(); } else{latch.countDown();} if((targetPageNumber+3)<webStructureSize){ Thread thread4=new MatrixMultiplication(M,webStructure,v,v_New,(targetPageNumber+3),latch); thread4.start(); } else{latch.countDown();} latch.await(); //System.out.println("一轮for完毕"); } //接下来计算nowDifference nowDifference=0; for(int i=0;i<webStructureSize;i++){ double tmp=sub(v_New[i],v[i]); if(tmp<0) tmp=-tmp; nowDifference=add(tmp,nowDifference); } System.out.print(">"+nowDifference+"\n"); }while(nowDifference>difference); System.out.print("迭代结束\n"); } public double add(double value1, double value2) { BigDecimal b1 = new BigDecimal(Double.toString(value1)); BigDecimal b2; try{ b2 = new BigDecimal(Double.toString(value2));} catch(Exception e){System.out.printf("error"+Double.toString(value2)); return b1.doubleValue() ;} return b1.add(b2).doubleValue(); } public double sub(double value1,double value2){ BigDecimal b1 = new BigDecimal(Double.toString(value1)); BigDecimal b2 = new BigDecimal(Double.toString(value2)); return b1.subtract(b2).doubleValue(); } public void print(String information){ PrintWriter outputStream=null; try{ outputStream=new PrintWriter(new FileOutputStream("D://"+information+"result.txt")); } catch (FileNotFoundException e) { e.printStackTrace(); } for(int i=0;i<webStructure.size();i++){ outputStream.println("Rank值为"+v[i]+"出度为"+webStructure.get(i).targetPage.size()+" 源网址为 "+Mapping.get(i).website); } outputStream.close(); System.out.println("打印完成"); } }
/**
 * Position of one non-zero entry of the sparse transition matrix.
 */
public class triple {

    /** Row index of the entry. */
    int row;

    /** Column index of the entry. */
    int column;
}
import java.math.BigDecimal; import java.util.ArrayList; import java.util.concurrent.CountDownLatch; public class MatrixMultiplication extends Thread { triple []M; int targetPageNumber; int webStructureSize; private ArrayList<WebNode> webStructure=new ArrayList<WebNode>(); double []v; double []v_New; CountDownLatch latch; public MatrixMultiplication(triple []M, ArrayList<WebNode> webStructure,double []v,double []v_New,int targetPageNumber,CountDownLatch latch){ this.M=M; this.webStructure=webStructure; webStructureSize=webStructure.size(); this.v=v; this.v_New=v_New; this.targetPageNumber=targetPageNumber; this.latch=latch; } public double add(double value1, double value2) { BigDecimal b1 = new BigDecimal(Double.toString(value1)); BigDecimal b2; try{ b2 = new BigDecimal(Double.toString(value2));} catch(Exception e){System.out.printf("error"+Double.toString(value2)); return b1.doubleValue() ;} return b1.add(b2).doubleValue(); } public double mul(double value1, double value2) { BigDecimal b1 = new BigDecimal(Double.toString(value1)); BigDecimal b2 = new BigDecimal(Double.toString(value2)); return b1.multiply(b2).doubleValue(); } public void run(){ double InitialValue=1.0/webStructureSize; double sum=0; int begin=0; int end=0; for(int count=0;count<M.length;count++) if(M[count].row==targetPageNumber) { begin=count; do { if(++count<M.length) ; else break; }while(M[count].row==targetPageNumber); end=--count; break; } for(int count=begin;count<=end;count++){ sum= add(mul((1.0/webStructure.get(M[count].column).targetPage.size()),v[M[count].column]),sum); } sum=add(mul(sum,0.85),mul(0.15,InitialValue)); v_New[targetPageNumber]=sum; //System.out.println("v_New[targetPageNumber]"+v_New[targetPageNumber]+"v[targetPageNumber]"+v[targetPageNumber]); latch.countDown(); } }
相关文章推荐
- java中的快速排序实现
- C++基础——C++风格的类型转换(static_cast、const_cast、dynamic_cast、reinterpret_cast)
- Java Web基础——Action+Service +Dao三层的功能划分
- Spring MVC之@RequestParam @RequestBody @RequestHeader 等详解
- 以前整理的C++资料(二)
- ORA-01810: 格式代码出现两次
- asp.net:repeater嵌套(常用于新闻等在首页归类显示)
- C#编程练习(分支语句+运算符的应用)
- 以前整理的C++资料(一)
- java的发展前景
- C语言库函数的实现
- java 单例模式 几个实现方法
- java 单例模式 几个实现方法
- 细数JDK里的设计模式
- python 类的继承,内置函数(1)
- C#简介
- python 冒泡和快排,不多说【无聊】
- JAVA设计模式(19) —<行为型>备忘录模式(Memento)
- python笔记——爬虫4
- 【JAVA】MyEclipse-注册-破解