您的位置:首页 > 其它

Lucene之删除索引

2014-12-17 02:01 344 查看

分类: 【java】2013-08-30 22:22 467人阅读 评论(0) 收藏 举报

1.前言

之前的博客《Lucene全文检索之HelloWorld》已经简单介绍了Lucene的索引生成和检索。本文着重介绍Lucene的索引删除。

2.应用场景

索引建立完成后,被索引的文件可能因为某些原因已经被删除。此时,这些文件对应的索引仍然存在;为了不产生“虚假检索结果”,需要将失效的索引删除。

3.HelloLucene类(重点关注deleteIndexByQuery方法)

[java] view plaincopy

package com.njupt.zhb;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.LongField;

import org.apache.lucene.document.StringField;

import org.apache.lucene.document.TextField;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.index.Term;

import org.apache.lucene.queryparser.classic.ParseException;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

/*

*@author: ZhengHaibo

*web: http://blog.csdn.net/nuptboyzhb
*mail: zhb931706659@126.com

*2013-08-27 Nanjing,njupt,China

*/

public class HelloLucene {

/**

* Index all text files under a directory.

* String indexPath = "index";//索引保存的路径

* String docsPath = "";//文档保存的路径(待索引)

*/

public void index(String indexPath,String docsPath) {

try {

// 1.创建Directory

Directory dir = FSDirectory.open(new File(indexPath));//保存在硬盘上

// 2.创建IndexWriter

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);

IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44,

analyzer);

iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);// 设置创建或追加模式

IndexWriter writer = new IndexWriter(dir, iwc);

final File docDir = new File(docsPath);

indexDocs(writer, docDir);

writer.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

}

public void indexDocs(IndexWriter writer, File file) throws IOException {

if (file.canRead()) {

if (file.isDirectory()) {//如果是文件夹,则遍历文件夹内的所有文件

String[] files = file.list();

// an IO error could occur

if (files != null) {

for (int i = 0; i < files.length; i++) {

indexDocs(writer, new File(file, files[i]));

}

}

} else {//如果是文件

FileInputStream fis;

try {

fis = new FileInputStream(file);

} catch (FileNotFoundException fnfe) {

return;

}

try {

// 3.创建Document对象

Document doc = new Document();

// 4.为Document添加Field

// Add the path of the file as a field named "path". Use a

// field that is indexed (i.e. searchable), but don't

// tokenize

// the field into separate words and don't index term

// frequency

// or positional information:

//以文件的文件路径建立Field

Field pathField = new StringField("path", file.getPath(),Field.Store.YES);

doc.add(pathField);//添加到文档中

//以文件的名称建立索引域

doc.add( new StringField("filename", file.getName(),Field.Store.YES));//添加到文档中

// Add the last modified date of the file a field named

// "modified".

// Use a LongField that is indexed (i.e. efficiently

// filterable with

// NumericRangeFilter). This indexes to milli-second

// resolution, which

// is often too fine. You could instead create a number

// based on

// year/month/day/hour/minutes/seconds, down the resolution

// you require.

// For example the long value 2011021714 would mean

// February 17, 2011, 2-3 PM.

doc.add(new LongField("modified", file.lastModified(),Field.Store.YES));

// Add the contents of the file to a field named "contents".

// Specify a Reader,

// so that the text of the file is tokenized and indexed,

// but not stored.

// Note that FileReader expects the file to be in UTF-8

// encoding.

// If that's not the case searching for special characters

// will fail.

//以文件的内容建立索引域(Field)

doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {

// New index, so we just add the document (no old

// document can be there):

System.out.println("adding " + file);

writer.addDocument(doc);//将文档写入到索引中(以创建的方式)

} else {

// Existing index (an old copy of this document may have

// been indexed) so

// we use updateDocument instead to replace the old one

// matching the exact

// path, if present:

System.out.println("updating " + file);

writer.updateDocument(new Term("path", file.getPath()),doc);//以追加方式写入到索引中

}

} finally {

fis.close();

}

}

}

}

/**

* 搜索

* http://blog.csdn.net/nuptboyzhb
*/

public void searcher(String indexPath,String searchKeyword){

try {

IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));

IndexSearcher searcher = new IndexSearcher(reader);

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);

String field = "contents";//搜索域是:文档的内容

QueryParser parser = new QueryParser(Version.LUCENE_44, field, analyzer);

Query query= parser.parse(searchKeyword);//搜索内容中含有searchKeyword字符串的文档

TopDocs tds=searcher.search(query, 10);//搜索前十个

ScoreDoc[] sds= tds.scoreDocs;

for (ScoreDoc sd:sds) {//将内容中含有“南京”关键字的文档遍历一遍

Document document=searcher.doc(sd.doc);

System.out.println("score:"+sd.score+"--filename:"+document.get("filename")+

"--path:"+document.get("path")+"--time"+document.get("modified"));//打印检索结果中文档的路径

}

reader.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}catch (ParseException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

}

/**

* 删除索引

* @param indexPath 索引所在的路径

* @param deleteKeyword 删除含有该内容的索引

*/

public void deleteIndexByQuery(String indexPath,String deleteKeyword){

try {

//1.新建一个IndexWrite

IndexWriter writer = new IndexWriter(FSDirectory.open(new File(indexPath)),new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)));

//2.生成一个Query

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);

String field = "contents";//搜索域是:文档的内容

QueryParser parser = new QueryParser(Version.LUCENE_44, field, analyzer);

Query query= parser.parse(deleteKeyword);//生成搜索内容中含有deleteKeyword的文档

//3.按Query参数的方式删除索引,即删除了含有deleteKeyword的索引

writer.deleteDocuments(query);

writer.commit();//提交,正是删除

writer.close();//关闭

//

//writer.deleteDocuments(new Term(field, ""));

}catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}catch (ParseException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

}

}

4.编写Junit测试

[java] view plaincopy

package com.njupt.zhb;

import org.junit.Test;

/*

*@author: ZhengHaibo

*web: http://blog.csdn.net/nuptboyzhb
*mail: zhb931706659@126.com

*2013-08-25 Nanjing,njupt,China

*/

public class TestJunit {

    /** Directory (relative to the working directory) that holds the Lucene index. */
    private static final String INDEX_DIR = "index";

    /** Object under test; it is created together with the test-class instance. */
    private final HelloLucene lucene = new HelloLucene();

    /** Builds or updates the index from the sample documents folder. */
    @Test
    public void TestIndex() {
        lucene.index(INDEX_DIR, "D:\\lucene");
    }

    /** Runs a single keyword search against the existing index and prints the hits. */
    @Test
    public void TestSearcher() {
        lucene.searcher(INDEX_DIR, "南京");
    }

    /**
     * Searches for a keyword, deletes the documents matching it, then searches again
     * so the console output shows the hits before and after the deletion.
     */
    @Test
    public void TestDeleteIndexByQuery() {
        final String keyword = "北京";

        System.out.println("未删除前,查询关键字:北京 --结果:");
        lucene.searcher(INDEX_DIR, keyword);

        lucene.deleteIndexByQuery(INDEX_DIR, keyword);

        System.out.println("删除后,查询关键字:北京 --结果:");
        lucene.searcher(INDEX_DIR, keyword);
    }

}

5.实验结果

5.1运行TestIndex方法

>控制台打印的信息

[html] view plaincopy

updating D:\lucene\lucene1.txt

updating D:\lucene\lucene2.txt

updating D:\lucene\lucene3.txt

updating D:\lucene\北京.txt

updating D:\lucene\南京.txt

此时的index目录下的截图:



5.2运行TestSearcher方法

>搜索含有关键字“南京”的文档

[html] view plaincopy

score:0.53033006--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375

score:0.48666292--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791

score:0.2155931--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795

score:0.1530931--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486

5.3运行TestDeleteIndexByQuery方法

>

[html] view plaincopy

未删除前,查询关键字:北京 --结果:

score:0.4847152--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791

score:0.39226472--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795

score:0.10348864--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375

score:0.029874597--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486

删除后,查询关键字:北京 --结果:

删除后,再次查询关键字时,无查询结果。

此时,index目录下的文件结构为:



多出了一个_0_1.del文件。该文件记录了被标记为删除的文档:Lucene删除文档时只是先做标记,这些文档要等到之后的段合并时才会从索引文件中真正清除。

项目源代码:http://download.csdn.net/detail/nuptboyzhb/6041239

未经允许,不得用于商业目的
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: