您的位置：首页 > 其它

基于Lucene 的一个简单文件索引

2018-02-28 15:02 435 查看

1.Lucene的适用范围：文本检索、网站信息检索、数据库搜索

2.Lucene的组件：【1】Document对象表示被索引的文档，IndexWriter将文档添加（add）到index中；【2】用户查询（query）时，通过IndexSearcher搜索Lucene的index，同时计算term weight和score，之后返回结果
3.API调用方法：创建索引
创建IndexWriter写入index，包括参数：1）INDEX_DIR，索引文件位置；2）Analyzer，文档词法分析器
创建进行索引文档：new Document，把field放进去，indexWriter addDocument即可
搜索
创建QueryParser对象，用它解析（parse）查询语句，并将得到的查询语法树放到Query中
用indexSearcher调用search方法搜索语法树，得到结果

package com.triple.one.indexer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Builds a Lucene index from the files located directly inside one directory.
 *
 * <p>Every accepted file is turned into one document with three fields: the file
 * contents (read through a {@link FileReader}), the file name and the canonical path.
 * Sub-directories are skipped, not descended into.
 *
 * <p>Created by tripleone on 18/2/28.
 */
public class Indexer {

    private final IndexWriter writer;

    /**
     * Opens an index writer on the given directory.
     *
     * @param indexDir file-system location of the index
     * @throws IOException if the directory or the writer cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

        // The third argument is IndexWriter's "create" flag.
        writer = new IndexWriter(dir, analyzer, true,
                IndexWriter.MaxFieldLength.UNLIMITED);
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws IOException if closing the writer fails
     */
    public void close() throws IOException {
        writer.close();
    }

    /**
     * Indexes the files found directly inside {@code dataDir}.
     *
     * <p>A file is indexed only when it is not a directory, is not hidden, exists,
     * is readable and is accepted by {@code filter}. A failure on a single file is
     * printed and does not stop the remaining files from being processed.
     *
     * @param dataDir directory whose files are to be indexed
     * @param filter  decides which files are indexed; {@code null} accepts every file
     * @return the document count reported by the writer after indexing
     * @throws IOException if {@code dataDir} cannot be listed (missing, not a directory,
     *                     or an I/O error) or the writer fails to report its document count
     */
    public int index(String dataDir, FileFilter filter)
            throws IOException {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() signals "not a directory" / I/O error with null rather than
            // an exception; fail with a descriptive message instead of an NPE.
            throw new IOException("Cannot list files in data directory: " + dataDir);
        }

        for (File f : files) {
            if (!f.isDirectory()
                    && !f.isHidden()
                    && f.exists()
                    && f.canRead()
                    && (filter == null || filter.accept(f))) {
                try {
                    indexFile(f);
                } catch (Exception e) {
                    // Best effort: report the failing file and carry on with the rest.
                    e.printStackTrace();
                }
            }
        }
        return writer.numDocs();
    }

    /** Accepts only files whose name ends with {@code .txt}, ignoring case. */
    private static class TextFilesFilter implements FileFilter {

        @Override
        public boolean accept(File path) {
            // Only .txt files are indexed.
            return path.getName().toLowerCase().endsWith(".txt");
        }
    }

    /**
     * Creates the document that represents {@code f} in the index.
     *
     * @param f file to describe
     * @return a document holding the contents, name and canonical path of {@code f}
     * @throws Exception if the canonical path cannot be resolved or the file cannot be opened
     */
    protected Document getDocument(File f) throws Exception {
        // Resolve everything that can fail before opening the reader, so that a
        // failure here cannot leave an open file handle behind.
        String fileName = f.getName();
        String canonicalPath = f.getCanonicalPath();

        Document doc = new Document();
        // NOTE(review): FileReader decodes with the platform default charset.
        doc.add(new Field("文档", new FileReader(f)));
        doc.add(new Field("文件名", fileName,
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("路径", canonicalPath,
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Logs the file being processed and adds its document to the index.
     *
     * @param f file to index
     * @throws Exception if the document cannot be built or added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);
    }

    /**
     * Indexes the {@code .txt} files of {@code <user.dir>/test2} into an index stored
     * at {@code <user.dir>/test1} and prints the document count and elapsed time.
     *
     * @param args unused
     * @throws IOException if the index cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        System.out.println(System.getProperty("user.dir"));

        String indexDir = System.getProperty("user.dir") + "/test1"; // where the index is stored
        String dataDir = System.getProperty("user.dir") + "/test2"; // directory whose files are indexed

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed = 0;

        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            indexer.close();
        }

        long end = System.currentTimeMillis();
        // NOTE(review): the message wording talks about folders found, while the value
        // printed is the writer's document count; text kept unchanged.
        System.out.println("搜索到 " + numIndexed + " 文件夹共花费 "
                + (end - start) + " 毫秒");
    }
}

依赖包： 
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.3.0</version>
</dependency>

<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
<version>3.3.0</version>
</dependency>


<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>3.3.0</version>
</dependency>

<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>3.3.0</version>
</dependency>


<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>3.3.0</version>
</dependency>


<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>3.3.0</version>
</dependency>

test2里共有6个以txt结尾的文件

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航