您的位置:首页 > 其它

Lucene 学习笔记

2012-04-10 10:41 337 查看
package com.test.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

/**
 * Study notes for Lucene 3.5: builds an index from the files in a data
 * directory, demonstrates round-tripping an index through a
 * {@link RAMDirectory}, and runs a multi-field search against the on-disk
 * index.
 *
 * <p>The index and data locations are hard-coded to {@code E:/lucene/index}
 * and {@code E:/lucene/data}.
 */
public class Main {

    private static final Version VERSION = Version.LUCENE_35;
    private static final Analyzer ANALYZER = new StandardAnalyzer(VERSION);
    private static final File INDEX_DIR = new File("E:/lucene/index");
    private static final File DATA_DIR = new File("E:/lucene/data");

    /**
     * Maximum number of hits requested from the searcher. This is a cap on the
     * number of returned records, not a page size.
     */
    private static final int MAX_HITS = 100;

    /**
     * Builds the on-disk index from every regular file directly inside the
     * data directory (sub-directories are skipped). Any existing index is
     * replaced, so running this twice does not produce duplicate documents.
     *
     * @throws Exception if the data directory cannot be listed, a file cannot
     *         be read, or the index cannot be written
     */
    public void index() throws Exception {
        File[] files = DATA_DIR.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is missing or is not a directory.
            throw new FileNotFoundException(
                    "Data directory does not exist or cannot be listed: " + DATA_DIR);
        }

        IndexWriter writer = getIndexWriter();
        // Must be fetched before the writer is closed; afterwards getDirectory() fails.
        Directory dir = writer.getDirectory();
        boolean succeeded = false;
        try {
            for (File file : files) {
                if (file.isDirectory()) { // skip folders
                    continue;
                }
                writer.addDocument(getDoc(file));
            }
            succeeded = true;
        } finally {
            try {
                closeWriter(writer, succeeded);
            } finally {
                dir.close();
            }
        }
    }

    /**
     * Opens a writer on the file-system index in {@code CREATE} mode.
     *
     * <p>Storage trade-offs: an {@link FSDirectory} persists the index but
     * involves disk I/O; a {@link RAMDirectory} is fast but its contents are
     * gone when the program exits. The two can be combined by loading the
     * on-disk index into RAM at start-up and writing it back on exit; see
     * {@link #test()}.
     *
     * @return a writer whose directory the caller must close after closing
     *         the writer
     * @throws IOException if the directory or the writer cannot be opened
     */
    private IndexWriter getIndexWriter() throws IOException {
        Directory dir = FSDirectory.open(INDEX_DIR);
        boolean opened = false;
        try {
            IndexWriterConfig config = new IndexWriterConfig(VERSION, ANALYZER);
            config.setOpenMode(OpenMode.CREATE);
            IndexWriter writer = new IndexWriter(dir, config);
            opened = true;
            return writer;
        } finally {
            if (!opened) {
                dir.close(); // do not leak the directory handle when the writer fails to open
            }
        }
    }

    /**
     * Commits and closes the writer after a successful run, or rolls it back
     * (which also closes it) after a failure so that a partially written
     * index never replaces the previous commit.
     */
    private static void closeWriter(IndexWriter writer, boolean succeeded) throws IOException {
        if (succeeded) {
            writer.close();
        } else {
            writer.rollback();
        }
    }

    /**
     * Exercises the combined storage strategy:
     *
     * <pre>
     * 1. at start-up, load the index from the file system into RAM
     * 2. add/delete/update/search against the index in RAM
     * 3. on exit, save the index from RAM back to the file system
     * </pre>
     *
     * @throws IllegalStateException if any index I/O fails, so that the test
     *         is reported as failed instead of silently passing
     */
    @Test
    public void test() {
        try {
            Directory fsDir = FSDirectory.open(INDEX_DIR);
            try {
                Directory ramDir = new RAMDirectory(fsDir); // load from the file system
                try {
                    addToRamIndex(ramDir, new File("E:/lucene/data/test")); // added directly, for convenience
                    saveToFileSystem(ramDir, fsDir);
                } finally {
                    ramDir.close();
                }
            } finally {
                fsDir.close();
            }
        } catch (IOException e) {
            throw new IllegalStateException("Failed to round-trip the index through RAM", e);
        }
    }

    /**
     * Adds one file to the in-memory index. The writer appends to the index
     * that was loaded from disk ({@code CREATE_OR_APPEND}); using
     * {@code CREATE} here would discard the loaded documents.
     */
    private void addToRamIndex(Directory ramDir, File file) throws IOException {
        // An IndexWriterConfig instance cannot be shared between two writers.
        IndexWriterConfig ramConfig = new IndexWriterConfig(VERSION, ANALYZER);
        ramConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        IndexWriter ramWriter = new IndexWriter(ramDir, ramConfig);
        boolean succeeded = false;
        try {
            ramWriter.addDocument(getDoc(file));
            ramWriter.commit();
            succeeded = true;
        } finally {
            // The RAM writer must be closed before its index is copied back to disk.
            closeWriter(ramWriter, succeeded);
        }
    }

    /**
     * Replaces the on-disk index with the contents of the in-memory index.
     * RAM holds the latest state, so the file-system writer uses
     * {@code CREATE}.
     */
    private void saveToFileSystem(Directory ramDir, Directory fsDir) throws IOException {
        IndexWriterConfig fsConfig = new IndexWriterConfig(VERSION, ANALYZER);
        fsConfig.setOpenMode(OpenMode.CREATE);
        IndexWriter fsWriter = new IndexWriter(fsDir, fsConfig);
        boolean succeeded = false;
        try {
            fsWriter.addIndexes(ramDir);
            succeeded = true;
        } finally {
            closeWriter(fsWriter, succeeded);
        }
    }

    /**
     * Converts a file into a Lucene document with {@code name}, {@code size}
     * and {@code content} fields, all of them stored.
     *
     * <p>Background: in a web search there are typically a URL, a title and
     * the content. The URL usually does not need to be searched by its parts,
     * but it still has to be kept, which is the case for
     * {@code Field.Store.YES} with {@code Field.Index.NOT_ANALYZED}.
     * Indexing options:
     *
     * <pre>
     * +--+ not indexed
     * +--+ indexed
     *      +---+ analyzed (tokenized)
     *      +---+ not analyzed
     * </pre>
     *
     * @param file the file to index
     * @return the document describing {@code file}
     * @throws IOException if the file content cannot be read
     */
    private Document getDoc(File file) throws IOException {
        Document doc = new Document();
        doc.add(new Field("name", file.getName(), Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("size", String.valueOf(file.length()),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("content", readFile(file), Field.Store.YES,
                Field.Index.ANALYZED));
        return doc;
    }

    /**
     * Reads the whole file as text, terminating every line with {@code "\n"}.
     *
     * <p>NOTE(review): the file is decoded with the platform default charset,
     * as before; confirm the encoding of the data files before pinning one.
     *
     * @param file the file to read
     * @return the file content
     * @throws IOException if the file is missing or cannot be read
     */
    private String readFile(File file) throws IOException {
        StringBuilder content = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file)));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\n");
            }
        } finally {
            reader.close(); // also closes the underlying file stream
        }
        return content.toString();
    }

    /**
     * Parses a query that searches a single field.
     *
     * @param fieldName the default field to search
     * @param key the query text
     * @return the parsed query
     * @throws Exception if the query text cannot be parsed
     */
    private Query getQuery(String fieldName, String key) throws Exception {
        QueryParser parser = new QueryParser(VERSION, fieldName, ANALYZER);
        return parser.parse(key);
    }

    /**
     * Parses a query that searches several fields.
     *
     * @param fields the fields to search
     * @param key the query text
     * @return the parsed query
     * @throws Exception if the query text cannot be parsed
     */
    private Query getQuery(String[] fields, String key) throws Exception {
        QueryParser parser = new MultiFieldQueryParser(VERSION, fields,
                ANALYZER);
        return parser.parse(key);
    }

    /**
     * Searches the {@code name} and {@code content} fields of the on-disk
     * index for "test" and prints every returned document.
     *
     * @throws Exception if the index cannot be opened or searched
     */
    @Test
    public void search() throws Exception {
        Directory dir = FSDirectory.open(INDEX_DIR); // index stored on the file system
        try {
            IndexReader reader = IndexReader.open(dir);
            try {
                IndexSearcher searcher = new IndexSearcher(reader);
                try {
                    Query query = getQuery(new String[] { "name", "content" }, "test");
                    TopDocs hits = searcher.search(query, MAX_HITS);
                    int total = hits.totalHits;
                    if (total > 0) {
                        System.out.println("共找到" + total + "条记录");
                    } else {
                        System.out.println("没有找到记录");
                    }

                    ScoreDoc[] scoreDocs = hits.scoreDocs;
                    // An index-based loop (rather than for-each) leaves room for paging.
                    int start = 0;
                    // totalHits counts every match, but scoreDocs holds at most MAX_HITS
                    // entries, so the loop bound must not exceed the array length.
                    int end = Math.min(total, scoreDocs.length);
                    for (int i = start; i < end; i++) {
                        int docId = scoreDocs[i].doc; // plays the role of a primary key
                        Document document = searcher.doc(docId); // fetch the document by that key
                        print(document);
                    }
                } finally {
                    searcher.close();
                }
            } finally {
                // Closing a searcher built from a reader does not close that reader.
                reader.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Prints the stored {@code name}, {@code size} and {@code content} fields
     * of a document to standard output.
     *
     * @param doc the document to print
     */
    private void print(Document doc) {
        System.out
                .println("--------------------------------------------------");
        System.out.println("name   :" + doc.get("name"));
        System.out.println("size   :" + doc.get("size"));
        System.out.println("content:\n" + doc.get("content"));
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: