您的位置:首页 > 其它

基于Lucene 的一个简单文件索引

2018-02-28 15:02 435 查看
1. Lucene 的适用范围:文本检索、网站信息检索、数据库搜索




2. Lucene 的组件:【1】Document 对象表示被索引的文档,IndexWriter 将文档添加(add)到索引(index)中;【2】用户查询(query)时,通过 IndexSearcher 搜索 Lucene 的索引,同时计算 term weight 和 score,之后返回结果。
3. API 调用方法
创建索引:
创建 IndexWriter 写入索引,包括参数:1)INDEX_DIR,索引文件位置;2)Analyzer,文档词法分析器
创建待索引的文档:new Document,把 Field 放进去,再调用 IndexWriter 的 addDocument 即可
搜索:
创建 QueryParser,用它解析(parse)查询语句,得到的查询语法树放到 Query 中
用 IndexSearcher 调用 search 方法搜索该语法树,得到结果

package com.triple.one.indexer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Builds a Lucene index over the regular files located directly inside a data directory.
 *
 * <p>Typical usage: construct with the index location, call
 * {@link #index(String, FileFilter)} once, then {@link #close()}.
 *
 * <p>Created by tripleone on 18/2/28.
 */
public class Indexer {

    /** Writer that every indexed document is added to; released by {@link #close()}. */
    private final IndexWriter writer;

    /**
     * Opens an index writer on the given directory.
     *
     * <p>The writer is constructed with {@code create == true} (third constructor argument),
     * uses a {@link StandardAnalyzer} and places no limit on field length.
     *
     * @param indexDir path of the directory that holds the index files
     * @throws IOException if the directory or the writer cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        writer = new IndexWriter(dir, analyzer, true,
                IndexWriter.MaxFieldLength.UNLIMITED);
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws IOException if the writer fails to close
     */
    public void close() throws IOException {
        writer.close();
    }

    /**
     * Indexes the files found directly inside {@code dataDir} (sub-directories are skipped,
     * not descended into).
     *
     * <p>A file is indexed when it is not a directory, is not hidden, exists, is readable and
     * is accepted by {@code filter}. A failure on one file is reported on standard error and
     * does not stop the remaining files from being indexed.
     *
     * @param dataDir directory whose files should be indexed
     * @param filter  selects the files to index; {@code null} accepts every file
     * @return the value of {@code writer.numDocs()} after the run
     * @throws IOException if {@code dataDir} cannot be listed (it does not exist, is not a
     *                     directory, or an I/O error occurred)
     */
    public int index(String dataDir, FileFilter filter)
            throws IOException {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // File.listFiles() returns null instead of throwing; fail with a clear message
            // rather than a NullPointerException in the loop below.
            throw new IOException("Cannot list files in data directory: " + dataDir);
        }

        for (File f : files) {
            if (!isIndexable(f, filter)) {
                continue;
            }
            try {
                indexFile(f);
            } catch (Exception e) {
                // Best effort: report the failing file and carry on with the others.
                System.err.println("Failed to index " + f);
                e.printStackTrace();
            }
        }
        return writer.numDocs();
    }

    /** Returns whether {@code f} is a visible, readable regular file accepted by {@code filter}. */
    private static boolean isIndexable(File f, FileFilter filter) {
        return !f.isDirectory()
                && !f.isHidden()
                && f.exists()
                && f.canRead()
                && (filter == null || filter.accept(f));
    }

    /** Accepts only files whose name ends with {@code .txt} (case-insensitive). */
    private static class TextFilesFilter implements FileFilter {

        @Override
        public boolean accept(File path) {
            // Only .txt files are indexed.
            return path.getName().toLowerCase().endsWith(".txt");
        }
    }

    /**
     * Builds the Lucene document for one file: its content (supplied through a reader), its
     * name and its canonical path. Name and path are stored and indexed without analysis.
     *
     * @param f file to convert
     * @return the document describing {@code f}
     * @throws Exception if the file cannot be opened or its canonical path cannot be resolved
     */
    protected Document getDocument(File f) throws Exception {
        Document doc = new Document();
        // NOTE: FileReader decodes with the JVM default charset, and this class never closes
        // the reader itself after handing it to the field.
        doc.add(new Field("文档", new FileReader(f)));
        doc.add(new Field("文件名", f.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("路径", f.getCanonicalPath(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Logs the file being processed and adds its document to the index.
     *
     * @param f file to index
     * @throws Exception if the document cannot be built or added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);
    }

    /**
     * Demo entry point: indexes the {@code .txt} files of {@code <user.dir>/test2} into an
     * index stored under {@code <user.dir>/test1} and prints the document count and the
     * elapsed time.
     *
     * @param args unused
     * @throws IOException if the index cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        System.out.println(System.getProperty("user.dir"));

        String indexDir = System.getProperty("user.dir") + "/test1"; // where the index is written
        String dataDir = System.getProperty("user.dir") + "/test2";  // directory whose files are indexed

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed = 0;

        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            indexer.close();
        }

        long end = System.currentTimeMillis();
        // The count is the number of indexed documents (files), not folders found by a search.
        System.out.println("共索引 " + numIndexed + " 个文件,花费 "
                + (end - start) + " 毫秒");
    }
}
依赖包(Maven):
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
<version>3.3.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queryparser -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>3.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queries -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>3.3.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queries -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>3.3.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-highlighter -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>3.3.0</version>
</dependency>



test2里共有6个以txt结尾的文件

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: