您的位置:首页 > 编程语言

【单机版】一个小爬虫+PageRank代码实现

2015-11-01 22:08 561 查看
在这个小程序里,首先使用一个爬虫获取网页的出链网址,然后再对获取到的所有网页执行PageRank算法。

import java.io.IOException;
import java.util.ArrayList;
import java.util.Scanner;

/**
 * Breadth-first crawler that turns a set of web pages into a numbered link graph.
 *
 * <p>Starting from {@link #StartAddress}, pages are taken from the head of a work
 * queue, their outgoing links are fetched, and every fetched page receives a
 * consecutive integer id. The result is exposed through {@link #webStructure}
 * (adjacency lists by id) and {@link #Mapping} (URL for each id).
 */
public class GetWebStructure {

    /** URL the crawl starts from. */
    public String StartAddress;

    /** Upper bound on the number of pages that receive a node id. */
    public int times;

    /** Work queue of pages still to be examined; the head is at index 0. */
    public ArrayList<WebNodeWithLink> partOfWebStructure = new ArrayList<WebNodeWithLink>();

    /** One entry per numbered page, holding the ids of the pages it links to. */
    public ArrayList<WebNode> webStructure = new ArrayList<WebNode>();

    /** Relation between a page URL and the id (matrix column) assigned to it. */
    public ArrayList<WebsiteToNumber> Mapping = new ArrayList<WebsiteToNumber>();

    /**
     * @param StartAddress URL of the first page to crawl
     * @param times        maximum number of pages to number
     */
    public GetWebStructure(String StartAddress, int times) {
        this.StartAddress = StartAddress;
        this.times = times;
    }

    /**
     * Runs the crawl until the work queue is empty.
     *
     * <p>For every queue entry: if the page has no id yet and fewer than
     * {@link #times} pages are numbered, its links are fetched, appended to the
     * queue, and the page is numbered. If the page has an id (old or new), a link
     * from its parent to it is recorded. Pages that are unknown once the limit is
     * reached are dropped, because they cannot be a target inside the graph.
     */
    public void run() {
        // The start page has no parent.
        partOfWebStructure.add(new WebNodeWithLink(StartAddress, null));

        do {
            WebNodeWithLink current = partOfWebStructure.get(0);

            // -1 means "not numbered"; known pages are never fetched again,
            // otherwise link cycles would trap the crawler.
            int thisWebNumber = findNumber(current.sourcePage);

            if (thisWebNumber == -1 && webStructure.size() < times) {
                try {
                    // Links and out-degree are only resolved when the entry is used.
                    current.setTargetPage();
                } catch (Exception e) {
                    // Page could not be processed: drop it and go on with the next entry.
                    partOfWebStructure.remove(0);
                    continue;
                }

                // Every link must be queued, otherwise the child would never get
                // the chance to be attached to this page.
                for (int t = 0; t < current.outDegree; t++) {
                    partOfWebStructure.add(
                            new WebNodeWithLink(current.targetPage.get(t), current.sourcePage));
                }

                // Assign the next free id to this page.
                WebsiteToNumber entry = new WebsiteToNumber();
                entry.website = current.sourcePage;
                thisWebNumber = Mapping.size();
                entry.number = thisWebNumber;
                Mapping.add(entry);

                // The node starts without targets; they are filled in while its
                // children are taken from the queue.
                webStructure.add(new WebNode(thisWebNumber));
            }

            // Record the edge parent -> current. The start page has no parent, so
            // nothing is recorded for it (previously it was wrongly linked to itself
            // because the parent id defaulted to 0).
            if (thisWebNumber != -1 && current.parentPage != null) {
                int parentPageNumber = findNumber(current.parentPage);
                if (parentPageNumber != -1) {
                    addLink(parentPageNumber, thisWebNumber);
                }
            }

            // The head entry is done.
            partOfWebStructure.remove(0);
            System.out.printf("%7d", webStructure.size());

        } while (partOfWebStructure.size() != 0);

        System.out.println("\n运算完成");
    }

    /** Returns the id stored in {@link #Mapping} for {@code website}, or -1 if there is none. */
    private int findNumber(String website) {
        for (int i = 0; i < Mapping.size(); i++) {
            if (Mapping.get(i).website.equals(website)) {
                return Mapping.get(i).number;
            }
        }
        return -1;
    }

    /** Adds {@code target} to the target list of node {@code source} unless it is already there. */
    private void addLink(int source, int target) {
        Integer boxedTarget = Integer.valueOf(target);
        for (int i = 0; i < webStructure.size(); i++) {
            WebNode node = webStructure.get(i);
            if (node.sourcePage == source && !node.targetPage.contains(boxedTarget)) {
                node.targetPage.add(boxedTarget);
                break;
            }
        }
    }

    /**
     * Asks for the number of pages and the convergence threshold on standard
     * input, crawls from a fixed start page, runs PageRank and writes the result.
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        System.setProperty("sun.net.client.defaultConnectTimeout", String.valueOf(100000));
        System.setProperty("sun.net.client.defaultReadTimeout", String.valueOf(100000));

        int webnumbers = 0;
        Scanner keyboard = new Scanner(System.in);

        System.out.println("您打算使用获取多少个网页?");
        webnumbers = keyboard.nextInt();

        System.out.println("正在运算请稍后.......");

        // The limit was originally 2147483647, which ended in an OutOfMemoryError.
        GetWebStructure test = new GetWebStructure("http://www.baidu.com", webnumbers);
        test.run();

        System.out.println("您打算使向量v和v'之间的差距小于多少时停止迭代?");
        double difference;
        difference = keyboard.nextDouble();
        keyboard.close();

        PageRank PageRankTest = new PageRank(test.webStructure, test.Mapping, difference);
        PageRankTest.computeRank();
        PageRankTest.print("网页数目" + webnumbers + "迭代差距" + difference);
    }

}

import java.io.IOException;
import java.util.ArrayList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * A page waiting in the crawl queue: its own URL, the URL of the page on which
 * it was found, and (after {@link #setTargetPage()}) the URLs it links to.
 */
public class WebNodeWithLink {

    /** URL of this page. */
    String sourcePage;

    /** URL of the page that linked to this one; null for the start page. */
    String parentPage;

    /** Number of entries in {@link #targetPage}; 0 until the links were fetched. */
    int outDegree;

    /** Absolute URLs this page links to. */
    ArrayList<String> targetPage = new ArrayList<String>();

    /**
     * @param sourcePage URL of this page
     * @param parentPage URL of the linking page, or null
     */
    public WebNodeWithLink(String sourcePage, String parentPage) {
        this.sourcePage = sourcePage;
        this.parentPage = parentPage;
    }

    /**
     * Downloads {@link #sourcePage}, collects its anchor links into
     * {@link #targetPage} and sets {@link #outDegree}.
     *
     * <p>Links are resolved against the document's base URI by jsoup instead of
     * by concatenating strings, so root-relative ({@code /a}), protocol-relative
     * ({@code //host/a}) and plain relative ({@code a.html}) references all become
     * proper absolute URLs, also when this page's URL contains a path. Links that
     * cannot be made absolute are skipped.
     *
     * <p>An {@link IOException} while downloading is ignored on purpose: the page
     * then simply keeps an out-degree of 0.
     */
    public void setTargetPage() {
        try {
            Document doc = Jsoup.connect(sourcePage).get();

            // Anchors whose href does not start with "javascript:", "_blank" or "#".
            Elements links = doc.select("a[href~=^(?!(javascript:|_blank|#)).*$]");

            for (Element link : links) {
                // absUrl returns "" when no absolute URL can be built.
                String linkHref = link.absUrl("href");
                if (!linkHref.equals("")) {
                    targetPage.add(linkHref);
                }
            }

            outDegree = targetPage.size();
        } catch (IOException ignored) {
            // Best effort: an unreachable page is treated as a page without links.
        }
    }

}

import java.util.ArrayList;

/**
 * Node of the numbered link graph: the id of a page together with the ids of
 * the pages it links to.
 */
public class WebNode {

    /** Id of the page this node stands for. */
    int sourcePage;

    /** Ids of the pages this page links to; empty right after construction. */
    ArrayList<Integer> targetPage;

    /**
     * Creates a node without outgoing links.
     *
     * @param sourcePage id of the page
     */
    public WebNode(int sourcePage) {
        this.targetPage = new ArrayList<Integer>();
        this.sourcePage = sourcePage;
    }

}

/**
 * Pairs a page URL with the integer id assigned to that page.
 */
public class WebsiteToNumber {

    /** Id assigned to the page; 0 until set. */
    public int number;

    /** URL of the page; null until set. */
    public String website;

    /** Creates an entry with both fields at their default values. */
    public WebsiteToNumber() {
    }

}

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;

/**
 * Iterative PageRank over the link graph produced by the crawler.
 *
 * <p>The transition matrix is sparse and therefore stored as an array of
 * (row, column) triples, grouped by row. Each iteration computes the new rank
 * vector row by row in worker threads and stops once the summed absolute change
 * of the vector is no longer greater than {@link #difference}.
 */
public class PageRank {

    private ArrayList<WebNode> webStructure = new ArrayList<WebNode>();

    private ArrayList<WebsiteToNumber> Mapping = new ArrayList<WebsiteToNumber>();

    /** Iteration stops when the summed absolute change is not greater than this value. */
    double difference;

    /** Rank vector of the previous iteration; null until {@link #computeRank()} ran. */
    double[] v;

    /**
     * Non-zero positions of the transition matrix: {@code row} is the index of the
     * page that is linked to, {@code column} the index of the linking page.
     * Entries of the same row are contiguous.
     */
    triple[] M;

    /**
     * @param webStructure nodes with their outgoing link ids
     * @param Mapping      URL for each node, used when printing
     * @param difference   convergence threshold
     */
    public PageRank(ArrayList<WebNode> webStructure, ArrayList<WebsiteToNumber> Mapping, double difference) {
        this.webStructure = webStructure;
        this.Mapping = Mapping;
        this.difference = difference;
        this.M = buildTransitionTriples(this.webStructure);
    }

    /**
     * Builds the triples of the transition matrix in one pass over all links.
     *
     * <p>A triple (row, column) is produced when node {@code column} has a target
     * equal to the {@code sourcePage} of node {@code row}. The result is ordered by
     * row, and by ascending column inside a row. The array holds exactly the
     * triples that exist, so it never contains null slots.
     */
    private static triple[] buildTransitionTriples(ArrayList<WebNode> nodes) {
        int size = nodes.size();

        // Page id -> indices of the nodes carrying that id.
        java.util.HashMap<Integer, ArrayList<Integer>> rowsById =
                new java.util.HashMap<Integer, ArrayList<Integer>>();
        for (int row = 0; row < size; row++) {
            Integer id = Integer.valueOf(nodes.get(row).sourcePage);
            ArrayList<Integer> rows = rowsById.get(id);
            if (rows == null) {
                rows = new ArrayList<Integer>();
                rowsById.put(id, rows);
            }
            rows.add(Integer.valueOf(row));
        }

        // incoming.get(row) collects the columns that link to that row.
        ArrayList<ArrayList<Integer>> incoming = new ArrayList<ArrayList<Integer>>(size);
        for (int row = 0; row < size; row++) {
            incoming.add(new ArrayList<Integer>());
        }

        int total = 0;
        for (int column = 0; column < size; column++) {
            ArrayList<Integer> targets = nodes.get(column).targetPage;
            for (int t = 0; t < targets.size(); t++) {
                ArrayList<Integer> rows = rowsById.get(targets.get(t));
                if (rows == null) {
                    continue; // target is not part of the graph
                }
                for (int r = 0; r < rows.size(); r++) {
                    ArrayList<Integer> columns = incoming.get(rows.get(r).intValue());
                    // Columns arrive in ascending order, so a repeated link from the
                    // same column can only be the last recorded one.
                    boolean alreadyRecorded = !columns.isEmpty()
                            && columns.get(columns.size() - 1).intValue() == column;
                    if (!alreadyRecorded) {
                        columns.add(Integer.valueOf(column));
                        total++;
                    }
                }
            }
        }

        triple[] result = new triple[total];
        int next = 0;
        for (int row = 0; row < size; row++) {
            ArrayList<Integer> columns = incoming.get(row);
            for (int c = 0; c < columns.size(); c++) {
                result[next] = new triple();
                result[next].row = row;
                result[next].column = columns.get(c).intValue();
                next++;
            }
        }
        return result;
    }

    /**
     * Runs the PageRank iteration.
     *
     * <p>Every rank starts at {@code 1/size}. Rows are processed in batches of up
     * to four {@link MatrixMultiplication} threads; the batch is awaited before the
     * next one starts. After each full pass the summed absolute difference between
     * the new and the previous vector is printed. On return {@link #v} holds the
     * vector of the second-to-last pass, whose distance to the last pass is within
     * the threshold.
     *
     * @throws InterruptedException if the calling thread is interrupted while waiting for a batch
     */
    public void computeRank() throws InterruptedException {
        int webStructureSize = webStructure.size();

        v = new double[webStructureSize];
        double InitialValue = 1.0 / webStructureSize;

        double[] v_New = new double[webStructureSize];
        for (int i = 0; i < webStructureSize; i++) {
            v_New[i] = InitialValue;
        }

        double nowDifference;
        do {
            // Prepare the next pass: the new vector becomes the old one.
            for (int i = 0; i < webStructureSize; i++) {
                v[i] = v_New[i];
                v_New[i] = 0;
            }

            for (int batchStart = 0; batchStart < webStructureSize; batchStart += 4) {
                int batchEnd = Math.min(batchStart + 4, webStructureSize);
                // One count per worker that is actually started.
                CountDownLatch latch = new CountDownLatch(batchEnd - batchStart);
                for (int row = batchStart; row < batchEnd; row++) {
                    new MatrixMultiplication(M, webStructure, v, v_New, row, latch).start();
                }
                latch.await();
            }

            // Summed absolute difference between the two vectors.
            nowDifference = 0;
            for (int i = 0; i < webStructureSize; i++) {
                double tmp = sub(v_New[i], v[i]);
                if (tmp < 0) {
                    tmp = -tmp;
                }
                nowDifference = add(tmp, nowDifference);
            }

            System.out.print(">" + nowDifference + "\n");

        } while (nowDifference > difference);

        System.out.print("迭代结束\n");
    }

    /**
     * Adds two doubles via their decimal string representation.
     *
     * @return the sum; if {@code value2} has no decimal representation, {@code value1} is returned
     */
    public double add(double value1, double value2) {
        BigDecimal b1 = new BigDecimal(Double.toString(value1));
        BigDecimal b2;
        try {
            b2 = new BigDecimal(Double.toString(value2));
        } catch (Exception e) {
            System.out.printf("error" + Double.toString(value2));
            return b1.doubleValue();
        }
        return b1.add(b2).doubleValue();
    }

    /** Subtracts {@code value2} from {@code value1} via their decimal string representation. */
    public double sub(double value1, double value2) {
        BigDecimal b1 = new BigDecimal(Double.toString(value1));
        BigDecimal b2 = new BigDecimal(Double.toString(value2));
        return b1.subtract(b2).doubleValue();
    }

    /**
     * Writes rank, out-degree and URL of every page to
     * {@code D://<information>result.txt}.
     *
     * <p>If the file cannot be opened the stack trace is printed and nothing is
     * written (previously this ended in a NullPointerException). The writer is
     * closed even when writing fails.
     *
     * @param information prefix of the output file name
     * @throws IllegalStateException if {@link #computeRank()} has not been run yet
     */
    public void print(String information) {
        if (v == null) {
            throw new IllegalStateException("computeRank() must be called before print()");
        }

        PrintWriter outputStream;
        try {
            outputStream = new PrintWriter(new FileOutputStream("D://" + information + "result.txt"));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            return;
        }

        try {
            for (int i = 0; i < webStructure.size(); i++) {
                outputStream.println("Rank值为" + v[i] + "出度为" + webStructure.get(i).targetPage.size()
                        + "   源网址为 " + Mapping.get(i).website);
            }
        } finally {
            outputStream.close();
        }

        System.out.println("打印完成");
    }
}

/**
 * Position of one non-zero entry in a sparse matrix.
 */
public class triple {

    /** Row index of the entry. */
    int row;

    /** Column index of the entry. */
    int column;

}

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;

/**
 * Worker thread that computes one component of the new rank vector:
 * the product of one row of the sparse transition matrix with the previous
 * rank vector, combined with the damping term.
 */
public class MatrixMultiplication extends Thread {

    /** Non-zero matrix positions; entries of one row are expected to be contiguous. */
    triple[] M;

    /** Index of the row (and of the component of {@link #v_New}) this worker computes. */
    int targetPageNumber;

    /** Number of nodes in the graph. */
    int webStructureSize;

    private ArrayList<WebNode> webStructure = new ArrayList<WebNode>();

    /** Rank vector of the previous iteration (read only here). */
    double[] v;

    /** Rank vector being computed; this worker writes exactly one component. */
    double[] v_New;

    /** Counted down once when this worker is finished. */
    CountDownLatch latch;

    /**
     * @param M                matrix triples, grouped by row
     * @param webStructure     graph nodes, used for the out-degree of a column
     * @param v                previous rank vector
     * @param v_New            vector receiving the result
     * @param targetPageNumber row to compute
     * @param latch            latch to count down when done
     */
    public MatrixMultiplication(triple[] M, ArrayList<WebNode> webStructure, double[] v, double[] v_New,
            int targetPageNumber, CountDownLatch latch) {
        this.M = M;
        this.webStructure = webStructure;
        webStructureSize = webStructure.size();
        this.v = v;
        this.v_New = v_New;
        this.targetPageNumber = targetPageNumber;
        this.latch = latch;
    }

    /**
     * Adds two doubles via their decimal string representation.
     *
     * @return the sum; if {@code value2} has no decimal representation, {@code value1} is returned
     */
    public double add(double value1, double value2) {
        BigDecimal b1 = new BigDecimal(Double.toString(value1));
        BigDecimal b2;
        try {
            b2 = new BigDecimal(Double.toString(value2));
        } catch (Exception e) {
            System.out.printf("error" + Double.toString(value2));
            return b1.doubleValue();
        }
        return b1.add(b2).doubleValue();
    }

    /** Multiplies two doubles via their decimal string representation. */
    public double mul(double value1, double value2) {
        BigDecimal b1 = new BigDecimal(Double.toString(value1));
        BigDecimal b2 = new BigDecimal(Double.toString(value2));
        return b1.multiply(b2).doubleValue();
    }

    /**
     * Computes {@code v_New[targetPageNumber] = 0.85 * sum + 0.15 / size}, where
     * {@code sum} adds {@code v[column] / outDegree(column)} over the first
     * contiguous run of triples belonging to this row.
     *
     * <p>A row without any triple (a page nobody links to, or an empty matrix)
     * contributes a sum of 0; previously the first triple of the matrix was
     * summed in that case, and an empty matrix raised an exception. The latch is
     * counted down in every case so the waiting thread cannot block forever.
     */
    @Override
    public void run() {
        try {
            double InitialValue = 1.0 / webStructureSize;
            double sum = 0;

            // Skip to the first triple of this row, if there is one.
            int index = 0;
            while (index < M.length && M[index].row != targetPageNumber) {
                index++;
            }

            // Sum the contributions of all pages linking to this one.
            while (index < M.length && M[index].row == targetPageNumber) {
                int column = M[index].column;
                double share = 1.0 / webStructure.get(column).targetPage.size();
                sum = add(mul(share, v[column]), sum);
                index++;
            }

            // 0.85 is the damping factor; the rest is spread evenly over all pages.
            sum = add(mul(sum, 0.85), mul(0.15, InitialValue));
            v_New[targetPageNumber] = sum;
        } finally {
            latch.countDown();
        }
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: