您的位置:首页 > Web前端 > JavaScript

webcrawler-----自制爬虫搜索种子和电影(jsoup)

2015-05-27 13:01 405 查看
文章来源:开源中国社区http://www.oschina.net/code/snippet_778875_48198

自制的爬虫可以抓取网上的电影和种子,设置爬虫的深度足够可以爬取所有的资源。这里以163开始并无大碍,以任何一个好点的网站开始都是可以的。

需要的jar包:
IKAnalyzer2012FF_u1.jar
jsoup-1.8.2.jar
lucene-core-4.10.2.jar
lucene-highlighter-5.1.0.jar
目录中生成的文件,如下图所示:



把目录中生成的文件全部导入到sor4.7.2中利用solr的web管理界面可以进行检索,检索结果如下图所示:



java代码如下所示:

package tzj;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.Version;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class PTest {

File dirc=new File("dirc2");

//当前层的URL
Stack<String[]> stackcurt=new Stack<String[]>();

//下一层的URL和名称
Stack<String[]> stacknext=new Stack<String[]>();

//控制递归层数
final int deep=6;

Analyzer analyzer=new IKAnalyzer();

FSDirectory directory;

IndexWriterConfig config;

IndexWriter indexWriter;

final String userAgent="Mozilla/5.0 (Windows NT 6.1; WOW64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36";

public PTest(){
try {
directory=FSDirectory.open(dirc);
config=new IndexWriterConfig(Version.LATEST, analyzer);
indexWriter=new IndexWriter(directory, config);
config.setMaxBufferedDocs(10);
} catch (IOException e) {
e.printStackTrace();
}
}

public static void main(String[] args) throws IOException{

PTest pTest=new PTest();
String starturl="http://www.163.com";
pTest.startWork(starturl);

System.out.println(pTest.search("妻子"));
//      System.out.println(
//      pTest.isIndexed("http://www.9amhg.com/?intr=806"));

}

/** 使用方式并不是递归得到URL,而是逐层解析。
* 得到页面内容
* @param url
* @param deep
*/
public void getPageContent(String[] urlname){

Connection connection=HttpConnection.connect(urlname[0]);
connection.ignoreContentType(true);
connection.ignoreHttpErrors(true);
connection.userAgent(userAgent);
connection.referrer(urlname[2]);

org.jsoup.nodes.Document document=null;
Response response=null;
boolean store=false;

//如果本身就是资源就直接索引
if(isResource(urlname[0])){
store=true;
}else
{

try {
response=connection.execute();
//如果这个链接返回的是视频就直接索引
if(isMP4(response.contentType()))
store=true;

else
{
document = response.parse();
}
} catch (IOException e) {
e.printStackTrace();
}

}

try {
indexPageContent(urlname[0],urlname[1],store);
} catch (IOException e) {
e.printStackTrace();
}

if(document!=null){
Elements elements=document.getElementsByTag("a");

Iterator<Element> iterator=elements.iterator();

while(iterator.hasNext()){
Element element=iterator.next();
String attrurl=element.attr("href");
attrurl=processUrl(attrurl, urlname[0]);
try {
if(!isIndexed(attrurl))
stacknext.push(new String[]{attrurl,element.text(),urlname[0]});
} catch (IOException e) {
e.printStackTrace();
}

}
}

}
/**
* 还原url链接
* @param url
* @param parenturl
* @return
*/
public String processUrl(String url,String parenturl){

Pattern pattern_host=Pattern.compile("http://\\w+\\.\\w+\\.\\w+");

Matcher matcher_host=pattern_host.matcher(parenturl);
String host=null;

if(matcher_host.find())
host=matcher_host.group();

if(host==null)
return url;

if(url.startsWith("/"))
return host+url;
else if(url.startsWith("../")){

while(url.startsWith("../")){
url=url.substring(3);
parenturl=parenturl.substring(0, parenturl.lastIndexOf("/")-1);
parenturl=parenturl.substring(0, parenturl.lastIndexOf("/"));
}

return parenturl+"/"+url;
}

return url;
}

/**
* 判断返回类型是不是视频
* @param contenttype
* @return
*/
public boolean isMP4(String contenttype){

if(contenttype.startsWith("video")){
return true;
}

return false;
}

/**
* 判断这个链接是不是资源
* @param url
* @return
*/
public boolean isResource(String url){

//添加资源识别的种类
if(url.endsWith(".mp4")
||url.endsWith(".torrent")){
return true;
}

return false;

}

/**
* 开始工作
* @param urls
*/
public void startWork(String...urls){

int deep=1;

for (int i = 0; i < urls.length; i++) {
stackcurt.push(new String[]{urls[i],"",""});
}

while(this.deep>deep)
{
if(stackcurt.isEmpty())
{
stackcurt=stacknext;
stacknext=new Stack<String[]>();
deep++;
}

if(stackcurt.isEmpty())
break;

try {
getPageContent(stackcurt.pop());
} catch (RuntimeException e) {
//e.printStackTrace();
}
}

}

/**
* 索引链接
* @param url
* @throws IOException
*/
public void indexPageContent(String url,String title,boolean store) throws IOException{

System.out.println("索引"+url+"\t"+title);
Document document=new Document();
document.add(new TextField("url",url,Store.YES));
document.add(new TextField("md5code",md5(url),Store.YES));

document.add(new TextField("title", title,Store.YES));

indexWriter.addDocument(document);

indexWriter.commit();

}
/**
* 关键字分组
* @param content
* @return
*/
public boolean setGroup(String content){

return false;
}

/**
* 判断URL是不是已经被索引过了
* @param url
* @return
* @throws IOException
*/
public boolean isIndexed(String url) throws IOException{

if(!directory.fileExists("segments.gen"))
return false;

IndexReader indexReader= DirectoryReader.open(directory);

IndexSearcher indexSearcher=new IndexSearcher(indexReader);
TermQuery query=new TermQuery(new Term("md5code", md5(url)));

TopDocs docs= indexSearcher.search(query, 1);

return docs.scoreDocs.length>0?true:false;
}

/**
* 查询
* @param keyword
* @return
* @throws IOException
*/
public List<String> search(String keyword) throws IOException{

IndexSearcher indexSearcher=new IndexSearcher(DirectoryReader.open(directory));

Query booleanQuery=
new QueryBuilder(analyzer).createBooleanQuery("title", keyword);

TopDocs docs=indexSearcher.search(booleanQuery, 10);

List<String> urls=new ArrayList<String>();

Highlighter highlighter=new Highlighter(new SimpleHTMLFormatter(),new QueryScorer(booleanQuery));

ScoreDoc[] docss=docs.scoreDocs;
if(docs!=null&&docss.length>0){
for (int i = 0; i < docss.length; i++) {
Document document=indexSearcher.doc(docss[i].doc);
urls.add(document.get("url"));

TokenStream tokenStream = analyzer.tokenStream("title", new StringReader(document.get("title")));
try {
String str = highlighter.getBestFragment(tokenStream,  document.get("title"));
System.out.println(str);
} catch (InvalidTokenOffsetsException e) {
e.printStackTrace();
}
System.out.println("\t"+document.get("url")+"\t"+document.get("title")+"\t");

System.out.println("\n");
}
}

return urls;
}

/**
* MD5 加密
* @param plainText
* @return
*/
public static String md5(String plainText) {
try {
MessageDigest md = MessageDigest.getInstance("MD5");
md.update(plainText.getBytes());
byte b[] = md.digest();

int i;

StringBuffer buf = new StringBuffer();
for (int offset = 0; offset < b.length; offset++) {
i = b[offset];
if (i < 0)
i += 256;
if (i < 16)
buf.append("0");
buf.append(Integer.toHexString(i));
}

return buf.toString().substring(8, 24);
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
}
return  null;
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: