
Lucene Code Reading Guide and Test Examples

2013-03-18 22:50


Reading Guide

Lucene: Principles and Code Analysis (complete edition) -- highly recommended

Lucene introduction and source-code analysis (Annotated Lucene): http://javenstudio.org/blog/annotated-lucene -- centered on the core IndexWriter

Download: Annotated+Lucene+.pdf: http://ishare.iask.sina.com.cn/f/24103589.html

Reading steps:

1. Understand the basic principles and concepts of retrieval.

2. Understand Lucene's basic concepts.

3. Become familiar with Lucene's index file format -- this is the key step.

4. Become familiar with Lucene's indexing flow. The class hierarchy here is deep, and some unnecessary design patterns make the code relatively hard to read. The basic idea: a controller plus a model wrap the indexing chain, and multiple threads index documents concurrently without sharing data.

5. Become familiar with Lucene's search flow.

6. Understand Lucene's query syntax parser and get familiar with analysis (tokenization); a small token-inspection sketch follows the next paragraph.

The recommended material dissects the Lucene source code in depth and is very valuable. Reading the documents alone is not concrete enough: skim them first for the big picture, then go back through them alongside the source. Their detailed explanations of source-level internals can easily leave a reader "seeing the branches but not the forest" and hard to follow; using the author's overall roadmap while stepping through the actual code makes the material much easier to digest.
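For step 6 in particular, the quickest way to get a feel for analysis is to print the tokens an Analyzer emits for a given string. The following is a minimal sketch against the Lucene 2.9.x API used throughout this post; the class name TokenDumper, the field name "name", and the sample text are only placeholders, and you can swap in ChineseAnalyzer or your own analyzer to compare segmentation results.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TokenDumper {
    public static void main(String[] args) throws Exception {
        // Analyzer under inspection; replace with ChineseAnalyzer etc. to compare output
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("name",
                new StringReader("This is the text to be indexed."));
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        // Print each token the analyzer produces
        while (ts.incrementToken()) {
            System.out.println(term.term());
        }
        ts.close();
    }
}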

Testing

Tests are extremely helpful for understanding how Lucene works and how its code executes; they are an important aid when reading the source.

IndexerExample.java



/*
 * Compile: javac -classpath .:../lucene-core-2.9.1.jar:ChineseSegmenter/chineseSegmenter.jar  IndexerExample.java
 * Exec   : java  -classpath .:../lucene-core-2.9.1.jar:ChineseSegmenter/chineseSegmenter.jar  IndexerExample
 *
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IndexerExample {

    private static void EnExample() throws Exception {

        // Store the index on disk
        Directory directory = FSDirectory.getDirectory("/tmp/testindex");
        // Use the standard analyzer
        Analyzer analyzer = new StandardAnalyzer();
        // Create the IndexWriter (true = create a new index, overwriting any existing one)
        IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
        iwriter.setMaxFieldLength(25000);
        // Make a new, empty document
        Document doc = new Document();
        File f = new File("/tmp/test.txt");

        // Add the path of the file as a field named "path".  Use a field that is
        // indexed (i.e. searchable), but don't tokenize the field into words.
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));

        String text = "This is the text to be indexed.";
        doc.add(new Field("fieldname", text, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("name", text, Field.Store.YES, Field.Index.TOKENIZED));

        // Add the last modified date of the file as a field named "modified".  Use
        // a field that is indexed (i.e. searchable), but don't tokenize the field
        // into words.
        doc.add(new Field("modified",
                DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in the system's default encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new Field("contents", new FileReader(f)));

        iwriter.addDocument(doc);
        iwriter.optimize();
        iwriter.close();
    }

    private static void CnExample() throws Exception {

        // Store the index on disk
        Directory directory = FSDirectory.getDirectory("/tmp/testindex");
        // Use the Chinese analyzer for the "name" field and whitespace analysis elsewhere
        Analyzer analyzer = new ChineseAnalyzer();
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
        wrapper.addAnalyzer("name", analyzer);

        // Create the IndexWriter
        IndexWriter iwriter = new IndexWriter(directory, wrapper, true);
        iwriter.setMaxFieldLength(25000);
        // Make a new, empty document
        Document doc = new Document();
        File f = new File("/tmp/test.txt");

        // Add the path of the file as a field named "path".  Use a field that is
        // indexed (i.e. searchable), but don't tokenize the field into words.
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));

        String text = "This is the text to be indexed.";
        doc.add(new Field("fieldname", text, Field.Store.YES, Field.Index.TOKENIZED));

        String name = "2013春装新款女气质修身风衣大翻领双层大摆长款外套 系腰带";
        doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));

        // Add the last modified date of the file as a field named "modified".  Use
        // a field that is indexed (i.e. searchable), but don't tokenize the field
        // into words.
        doc.add(new Field("modified",
                DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in the system's default encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new Field("contents", new FileReader(f)));

        iwriter.addDocument(doc);
        iwriter.optimize();
        iwriter.close();
    }

    public static void main(String[] args) throws Exception {
        System.out.println("Start test: ");

        // Any command-line argument selects the Chinese example; otherwise run the English one
        if (args.length > 0) {
            CnExample();
        } else {
            EnExample();
        }

        System.out.println("Index dir: /tmp/testindex");
    }
}
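After running IndexerExample, a quick way to connect step 3 (the index file format) to real data is to open the index and walk its term dictionary. Below is a minimal sketch against the Lucene 2.9.x API; the class name TermDumper is a placeholder, and the index path matches the one used above.

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TermDumper {
    public static void main(String[] args) throws Exception {
        // Open the index created by IndexerExample
        Directory directory = FSDirectory.getDirectory("/tmp/testindex");
        IndexReader reader = IndexReader.open(directory);
        System.out.println("Docs in index: " + reader.numDocs());
        // Walk the term dictionary: every (field, term) pair with its document frequency
        TermEnum terms = reader.terms();
        while (terms.next()) {
            Term t = terms.term();
            System.out.println(t.field() + ":" + t.text() + " (df=" + terms.docFreq() + ")");
        }
        terms.close();
        reader.close();
    }
}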




SearcherExample.java



/*
 * Compile: javac -classpath .:../lucene-core-2.9.1.jar:ChineseSegmenter/chineseSegmenter.jar  SearcherExample.java
 * Exec   : java  -classpath .:../lucene-core-2.9.1.jar:ChineseSegmenter/chineseSegmenter.jar  SearcherExample
 *
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.queryParser.QueryParser;

public class SearcherExample {

    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new Exception("Usage: java " + SearcherExample.class.getName()
                    + " <index dir> <query> [cn]");
        }
        File indexDir = new File(args[0]);
        String q = args[1];
        // Any third argument switches query analysis to the Chinese analyzer
        boolean bCn = args.length > 2;

        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir +
                    " does not exist or is not a directory.");
        }
        search(indexDir, q, bCn);
    }

    public static void search(File indexDir, String q, boolean bCn)
            throws Exception {
        Directory fsDir = FSDirectory.getDirectory(indexDir, false);
        IndexSearcher is = new IndexSearcher(fsDir);

        // Parse the query with the same analyzer that was used for the "name" field at index time
        Analyzer analyzer = new StandardAnalyzer();
        if (bCn) {
            analyzer = new ChineseAnalyzer();
        }

        QueryParser parser = new QueryParser("name", analyzer);
        Query query = parser.parse(q);

        System.out.println("Query: " + query.toString());
        long start = new Date().getTime();
        Hits hits = is.search(query);
        long end = new Date().getTime();

        System.err.println("Found " + hits.length() +
                " document(s) (in " + (end - start) +
                " milliseconds) that matched query '" +
                q + "'");

        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            System.out.println("HIT " + i + " :" + doc.get("name"));
        }
    }
}
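Note that Hits is already deprecated in Lucene 2.9 and removed in 3.0; the same search can be expressed with TopDocs, which is also closer to how the search flow (step 5) actually executes internally. A minimal sketch of a replacement for the loop above, assuming the same IndexSearcher and Query (the method name searchTopDocs is a placeholder, and it additionally needs imports for org.apache.lucene.search.TopDocs and org.apache.lucene.search.ScoreDoc):

    // Alternative to the Hits-based loop, using the non-deprecated TopDocs API
    public static void searchTopDocs(IndexSearcher is, Query query) throws Exception {
        TopDocs topDocs = is.search(query, 10);   // collect the 10 best-scoring matches
        System.out.println("Total hits: " + topDocs.totalHits);
        for (ScoreDoc sd : topDocs.scoreDocs) {
            Document doc = is.doc(sd.doc);        // load the stored fields of this match
            System.out.println(sd.score + " : " + doc.get("name"));
        }
    }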




For Chinese word segmentation you can use the analyzer bundled with Lucene (its results are not great) or wrap your own segmenter; the core task is wrapping the segmentation engine in a Tokenizer, as in the class below.



package org.apache.lucene.analysis.cn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

public class SnippetTermTokenizer extends Tokenizer {
    private StringBuffer buffer = new StringBuffer();
    private BufferedReader inputBuffer;
    private JNISelecter selecter;     // core class wrapping the Chinese segmentation engine
    private List<Token> tokenList = null;
    private List<String> phraseTokenList = null;
    private Iterator<Token> tokenIter = null;

    public SnippetTermTokenizer(Reader reader, JNISelecter s) {
        inputBuffer = new BufferedReader(reader, 2048);
        selecter = s;
    }

    public Token next() throws IOException {
        if (tokenIter != null) {
            if (tokenIter.hasNext()) {
                return tokenIter.next();
            } else {
                // all tokens from this input have been returned
                return null;
            }
        }
        // first call: read the whole input, then segment it
        readContent();
        if (segment()) {
            // segmentation succeeded and created the token iterator
            return tokenIter.next();
        }
        return null;
    }

    public void close() throws IOException {
        inputBuffer.close();
    }

    // readContent(), segment() and the other segmentation details are omitted
}
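To plug such a Tokenizer into the indexing and search code above, it still needs an Analyzer that returns it from tokenStream(). Below is a minimal sketch assuming the SnippetTermTokenizer and JNISelecter classes shown here; the class name SnippetTermAnalyzer is a placeholder, and the token-stream reuse logic of a production analyzer is omitted.

package org.apache.lucene.analysis.cn;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

public class SnippetTermAnalyzer extends Analyzer {
    private final JNISelecter selecter;

    public SnippetTermAnalyzer(JNISelecter selecter) {
        this.selecter = selecter;
    }

    // Each field value is segmented by wrapping its reader in the custom tokenizer
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new SnippetTermTokenizer(reader, selecter);
    }
}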

