您的位置:首页 > 产品设计 > UI/UE

搜索引擎Lucene之目录索引

2014-06-09 15:01 204 查看
/*
* Created on 2004-11-20
*
* Indexes the text files found under a directory.
*/
package demo;

/**
* Builds a Lucene index from the text files in a directory.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

/**
 * Command-line tool that walks a source directory recursively and adds every
 * {@code .txt} file it finds to a Lucene index.
 *
 * <p>Recognised arguments: {@code -s <source dir>} (required),
 * {@code -d <index dir>} (required), {@code -v} (verbose output) and
 * {@code -a} (append to an existing index instead of recreating it).
 */
public class DirSpider {
    /** Directory the Lucene index is written to ({@code -d}). */
    private String indexDir;
    /** Root directory whose text files are indexed ({@code -s}). */
    private String sSourceDir;
    /** When true, progress messages are printed ({@code -v}). */
    private boolean verbose;
    /** When true, documents are appended to an existing index ({@code -a}). */
    private boolean incremental;

    /** Writer for the index; opened and closed inside {@link #go()}. */
    private IndexWriter index;

    /**
     * Entry point: builds a spider from the command-line arguments and runs it.
     * Any failure is printed and the process exits with status -1.
     *
     * @param args command-line arguments, see the class description
     */
    public static void main(String[] args) {
        try {
            DirSpider s = new DirSpider(args);
            s.go();
        } catch (Exception e) {
            e.printStackTrace();
            // Signal the failure to the calling shell/script.
            System.exit(-1);
        }
    }

    /**
     * Parses the command-line arguments into the spider's configuration.
     *
     * @param argv command-line arguments, see the class description
     * @throws IllegalArgumentException if {@code -s} or {@code -d} (or the
     *         value following either of them) is missing
     */
    public DirSpider(String argv[]) throws Exception {
        verbose = false;
        incremental = false;

        for (int i = 0; i < argv.length; i++) {
            String arg = argv[i];
            boolean hasValue = i + 1 < argv.length;
            // A single else-if chain: the value consumed by -s / -d must not be
            // re-interpreted as another flag. A trailing -s / -d without a value
            // is ignored here and reported by the checks below.
            if (arg.equals("-s") && hasValue) {
                sSourceDir = argv[++i];
            } else if (arg.equals("-d") && hasValue) {
                indexDir = argv[++i];
            } else if (arg.equals("-v")) {
                verbose = true;
            } else if (arg.equals("-a")) {
                incremental = true;
            }
        }

        if (sSourceDir == null) {
            throw new IllegalArgumentException("Missing required argument: -s [SourceDir dir]");
        }
        if (indexDir == null) {
            throw new IllegalArgumentException("Missing required argument: -d [index dir]");
        }
    }

    /**
     * Opens the index, indexes the whole source tree, optimizes the index and
     * closes it. The writer is closed even when indexing fails so that the
     * index is not left open by an aborted run.
     *
     * @throws Exception if the index cannot be opened, written or closed
     */
    public void go() throws Exception {
        long start = System.currentTimeMillis();

        if (verbose) {
            System.out.println("Creating index in: " + indexDir);
            if (incremental) {
                System.out.println("    - using incremental mode");
            }
        }
        // Create the index directory -- or append to the existing one when
        // running in incremental mode.
        index = new IndexWriter(new File(indexDir), new StandardAnalyzer(), !incremental);

        boolean indexed = false;
        try {
            indexDir(new File(sSourceDir));
            index.optimize();
            indexed = true;
        } finally {
            try {
                index.close();
            } catch (IOException e) {
                if (indexed) {
                    throw e;
                }
                // An earlier exception is already propagating; do not mask it.
                e.printStackTrace();
            }
        }

        if (verbose) {
            System.out.println("index complete in :" + (System.currentTimeMillis() - start) / 1000);
        }
    }

    /**
     * Recursively indexes every {@code .txt} file below {@code dir}.
     *
     * @param dir directory to scan; skipped with a warning if it cannot be listed
     * @throws IOException if a document cannot be added to the index
     */
    private void indexDir(File dir) throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null for non-directories and on I/O errors.
            System.err.println("Cannot list directory, skipping: " + dir);
            return;
        }

        for (int i = 0; i < files.length; i++) {
            File f = files[i];
            if (f.isDirectory()) {
                indexDir(f);
            } else if (f.getName().endsWith(".txt")) {
                // Only plain-text files are indexed for now.
                indexFile(f);
            }
        }
    }

    /**
     * Loads one file and adds it to the index as a document with the fields
     * {@code url}, {@code title} and {@code content}.
     *
     * @param item file to index
     * @throws IOException if the document cannot be added to the index
     */
    private void indexFile(File item) throws IOException {
        if (verbose) {
            System.out.println("Adding FILE: " + item);
        }

        News news = loadFile(item);
        if (news == null || news.body == null) {
            System.out.println("索引数据为空!");
            return;
        }

        Document doc = new Document();
        doc.add(new Field("url", news.URL,
                Field.Store.YES, Field.Index.UN_TOKENIZED,
                Field.TermVector.NO));
        doc.add(new Field("title", news.title,
                Field.Store.YES, Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(new Field("content", news.body.toString(),
                Field.Store.YES, Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        System.out.println(news);

        // A failure here propagates to go(), which closes the writer before
        // main() reports the error and exits with a non-zero status.
        index.addDocument(doc);
    }

    /**
     * Reads a text file into a {@link News}: the first line becomes the title,
     * every following line is appended to the body, each terminated by a
     * newline. The URL is the fixed web prefix followed by the file path.
     *
     * <p>NOTE: {@link FileReader} decodes with the platform default charset.
     *
     * @param sSourceFile file to read
     * @return the loaded news item, or {@code null} if the file could not be read
     */
    private static News loadFile(File sSourceFile) {
        News news = new News();
        news.URL = "http://localhost:8080/Chapter2WebPart/" + sSourceFile;

        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(sSourceFile));
            String s = br.readLine();
            if (s != null) {
                news.title = s;
                System.out.println(s);
                while ((s = br.readLine()) != null) {
                    news.body.append(s);
                    news.body.append('\n');
                }
            }
            return news;
        } catch (IOException e) {
            e.printStackTrace();
            // Do not hand a half-read item to the indexer.
            return null;
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
/**
 * Mutable holder for one article: its URL, its title and its body text.
 * All three members start out empty (never null).
 */
class News {
    /** Address of the article; empty until set. */
    public String URL = "";
    /** Title of the article; empty until set. */
    public String title = "";
    /** Accumulated body text; starts as an empty buffer. */
    public StringBuffer body = new StringBuffer();

    /** Creates an item whose URL, title and body are all empty. */
    public News() {
    }

    /**
     * Renders the item as {@code URL :<url> title :<title> body :<body>}.
     *
     * @return the three members concatenated with their labels
     */
    public String toString() {
        StringBuffer text = new StringBuffer("URL :");
        text.append(URL);
        text.append(" title :");
        text.append(title);
        text.append(" body :");
        text.append(body.toString());
        return text.toString();
    }
}
package com.lietu.web.test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import com.lietu.web.bean.example.Article;

/**
 * Searches the Lucene index stored at {@code C:/index} and turns the hits
 * into {@link Article} objects.
 */
public class MySearche {

    public MySearche() {
        super();
    }

    /**
     * Runs a sample query against the index.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        new MySearche().indexSeaches("中");
    }

    /**
     * Parses {@code str} as a query against the {@code title} field (using a
     * {@link StandardAnalyzer}) and returns one {@link Article} per hit, filled
     * with the stored {@code title} and {@code content} values. Each hit is
     * also printed to standard output.
     *
     * <p>Parse and I/O failures are printed and do not propagate; the articles
     * collected up to that point (possibly none) are returned.
     *
     * @param str query text entered by the user
     * @return the matching articles, never {@code null}
     */
    public Collection indexSeaches(String str) {
        System.out.println("str:" + str);
        Collection collections = new ArrayList();
        IndexSearcher indexSearcher = null;
        try {
            indexSearcher = new IndexSearcher("C:/index");
            QueryParser queryparser = new QueryParser("title",
                    new StandardAnalyzer());
            Query query = queryparser.parse(str);
            Hits hit = indexSearcher.search(query);
            System.out.println("hit.length():" + hit.length());
            for (int i = 0; i < hit.length(); i++) {
                // Fetch each stored value once and reuse it for the bean and the log.
                String title = hit.doc(i).get("title");
                String content = hit.doc(i).get("content");

                Article article = new Article();
                article.setTitle(title);
                article.setContent(content);
                collections.add(article);

                System.out.println("标题:" + title);
                System.out.println("内容:" + content);
            }
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (CorruptIndexException e) {
            // Previously swallowed silently; a corrupt index must be visible.
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the index files held by the searcher.
            if (indexSearcher != null) {
                try {
                    indexSearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return collections;
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: