
A Lucene-Based File Search Demo

2013-12-23 22:38
This is a simple file search demo implemented with Lucene. It supports retrieval based on file content, Chinese word segmentation, and highlighting of matched terms.
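As a quick illustration of the Chinese word segmentation part, the minimal sketch below runs the same IKAnalyzer used throughout the demo over a short string and prints the resulting tokens. The AnalyzerDemo class and the field name "content" are placeholders for this example only; it assumes the IK Analyzer 2012FF_hf1 and Lucene 4.6 jars used by the rest of the code.

package uap.pub.bap.fs.search.indexer;

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class AnalyzerDemo {

    public static void main(String[] args) throws Exception {
        // Same analyzer that FileIndexBuilder and FileSearchServiceImpl use
        Analyzer analyzer = new IKAnalyzer();
        TokenStream ts = analyzer.tokenStream("content",
                new StringReader("基于Lucene的文件检索Demo"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Each token is one segmented Chinese word (or an English/number fragment)
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    }
}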

Below is a brief introduction to the core classes.

1) Index-related classes

1. FileIndexBuilder -- builds the index

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * File index builder.
 *
 * @author chenfeic
 */
public class FileIndexBuilder {

    /**
     * Files to be indexed.
     */
    private List<File> fileList = new ArrayList<File>();

    private IndexWriter writer;

    /**
     * @param fileDir
     *            directory containing the files to index
     * @param indexDir
     *            directory where the index is stored
     */
    public void generateIndexer(String fileDir, String indexDir) {
        if (StringUtils.isEmpty(indexDir) || StringUtils.isEmpty(fileDir)) {
            System.out.println("Neither the file directory nor the index directory may be empty");
            throw new RuntimeException("Neither the file directory nor the index directory may be empty");
        }
        Directory d = null;
        try {
            // Initialize the IndexWriter
            d = FSDirectory.open(new File(indexDir));
            initWriter(indexDir, d);
            // Create the index documents
            initIndex(fileDir);
            System.out.println("Index created successfully!");
        } catch (Exception e) {
            System.out.println("Failed to create index");
            System.out.println(e);
        } finally {
            FileSearchUtils.closeIndexWriter(writer);
            FileSearchUtils.closeDirectory(d);
        }
    }

    /**
     * Initialize the Lucene IndexWriter.
     * Step 1: create the Directory where the index is stored.
     * Step 2: create the Analyzer.
     * Step 3: create the IndexWriterConfig using the Analyzer.
     * Step 4: create the IndexWriter.
     *
     * @param indexDir
     * @param directory
     * @throws IOException
     */
    private void initWriter(String indexDir, Directory directory)
            throws IOException {
        Analyzer analyzer = new IKAnalyzer();
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46,
                analyzer);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
        writer = new IndexWriter(directory, conf);
    }

    /**
     * Build the index documents.
     *
     * @param fileDir
     *            document directory
     * @return the number of indexed documents
     */
    private int initIndex(String fileDir) {
        getAllSubFile(new File(fileDir));
        TextFileFilter filter = new TextFileFilter();
        for (File file : fileList) {
            if (filter.accept(file)) {
                try {
                    DocumentBuilder db = new DocumentBuilder(file);
                    Document doc = db.createDocument();
                    writer.addDocument(doc);
                } catch (FileNotFoundException e) {
                    System.out.println("Failed to create index, file not found: " + e.getMessage());
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return writer.numDocs();
    }

    private void getAllSubFile(File file) {
        File[] listFiles = file.listFiles();
        if (ArrayUtils.isEmpty(listFiles)) {
            return;
        }
        for (File subfile : listFiles) {
            if (subfile.isDirectory()) {
                getAllSubFile(subfile);
            } else {
                fileList.add(subfile);
            }
        }
    }

    public static void main(String[] args) {
        String fileDir = "E:\\lucene\\data";
        String indexDir = "E:\\lucene\\index";
        FileIndexBuilder indexer = new FileIndexBuilder();
        indexer.generateIndexer(fileDir, indexDir);
    }

}


2. DocumentBuilder -- builds the Document objects to be indexed

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

import uap.pub.bap.fs.search.IFileSearchConst;
import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Document builder.
 *
 * @author chenfeic
 */
public class DocumentBuilder {

    private File file = null;

    private IContextConverter contextConverter;

    public DocumentBuilder(File file) {
        this.file = file;
        initConverter();
    }

    /**
     * Initialize the content converter according to the file type.
     */
    private void initConverter() {
        String fileType = FileSearchUtils.getFileType(file.getName());
        // 1. Word
        if ("docx".equalsIgnoreCase(fileType)
                || "doc".equalsIgnoreCase(fileType)) {
            contextConverter = new WordContextConverter();
        }
        // 2. Excel
        else if ("xlsx".equalsIgnoreCase(fileType)
                || "xls".equalsIgnoreCase(fileType)) {
            contextConverter = new ExcelContextConverter();
        }
        // 3. PDF
        else if ("pdf".equalsIgnoreCase(fileType)) {
            contextConverter = new PdfContextConverter();
        }
        // 4. txt (logs, config files, ...)
        else {
            contextConverter = new TextContextConverter();
        }
    }

    public Document createDocument() {
        if (file == null || !file.exists()) {
            return null;
        }
        Document doc = new Document();
        try {
            doc.add(new TextField(IFileSearchConst.CONTENT_TYPE,
                    contextConverter.context2String(file), Field.Store.YES));
            doc.add(new StringField(IFileSearchConst.FILENAM_TYPE, file
                    .getName(), Field.Store.YES));
            doc.add(new StringField(IFileSearchConst.PATH_TYPE, file
                    .getCanonicalPath(), Field.Store.YES));
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }
}


3. IContextConverter -- content converter interface that turns file content into a string

package uap.pub.bap.fs.search.indexer;

import java.io.File;

/**
 * Content converter: converts file content into a string.
 *
 * @author chenfeic
 */
public interface IContextConverter {

    /**
     * Convert the file content into a string.
     *
     * @param file the file to convert
     * @return the file content as a string
     */
    public String context2String(File file);

}


4. AbstractContextConverter -- uses the third-party open-source library cpdetector to detect a file's character encoding

package uap.pub.bap.fs.search.indexer;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.File;

public abstract class AbstractContextConverter implements IContextConverter {

    /**
     * Detect the character encoding of a file using the third-party
     * open-source library cpdetector.
     *
     * @param path
     *            path of the file whose encoding should be detected
     * @author huanglei
     * @version 2012-7-12 14:05
     */
    protected String getFileEncode(String path) {
        /*
         * The detector delegates the actual work to concrete detector
         * implementations. cpDetector ships with several commonly used
         * detectors that can be registered via add(), such as
         * ParsingDetector, JChardetFacade, ASCIIDetector and UnicodeDetector.
         * The detector returns the result of the first implementation that
         * reports a non-null encoding. Three third-party JARs are required:
         * antlr.jar, chardet.jar and cpdetector.jar.
         * cpDetector works statistically, so the result is not guaranteed
         * to be correct.
         */
        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        /*
         * ParsingDetector detects the encoding of HTML, XML and similar
         * files or character streams. The constructor argument controls
         * whether details of the detection process are printed; false
         * means no output.
         */
        detector.add(new ParsingDetector(false));
        /*
         * JChardetFacade wraps JChardet, provided by Mozilla, which can
         * detect the encoding of most files. This detector alone satisfies
         * most projects; if that is not enough, further detectors such as
         * ASCIIDetector and UnicodeDetector below can be added.
         */
        detector.add(JChardetFacade.getInstance()); // requires antlr.jar and chardet.jar
        // ASCIIDetector detects ASCII encodings
        detector.add(ASCIIDetector.getInstance());
        // UnicodeDetector detects the Unicode family of encodings
        detector.add(UnicodeDetector.getInstance());
        java.nio.charset.Charset charset = null;
        File f = new File(path);
        try {
            charset = detector.detectCodepage(f.toURI().toURL());
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        if (charset != null) {
            return charset.name();
        } else {
            return null;
        }
    }

}


5. TextContextConverter

package uap.pub.bap.fs.search.indexer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Content converter for plain-text files such as txt, config files and logs.
 *
 * @author chenfeic
 */
public class TextContextConverter extends AbstractContextConverter {

    @Override
    public String context2String(File file) {
        StringBuilder sb = new StringBuilder();
        BufferedReader reader = null;
        InputStream in = null;
        try {
            String encoding = getFileEncode(file.getCanonicalPath());
            in = new FileInputStream(file);
            if (encoding != null && !"".equals(encoding.trim())) {
                reader = new BufferedReader(new InputStreamReader(in, encoding));
            } else {
                reader = new BufferedReader(new InputStreamReader(in));
            }
            // Read the file line by line into the string builder
            String line = "";
            while ((line = reader.readLine()) != null) {
                sb.append(line + "\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(in);
            FileSearchUtils.closeReader(reader);
        }
        return sb.toString();
    }

}


The next two classes read the content of Office documents such as Word and Excel; the third-party library used here is POI.

6. WordContextConverter

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Content converter for Word documents.
 *
 * @author chenfeic
 */
public class WordContextConverter extends AbstractContextConverter {

    @Override
    public String context2String(File file) {
        if (isWord2003(file)) {
            return readWord2003(file);
        } else {
            return readWord2007(file);
        }
    }

    /**
     * Check whether the file is a Word 97-2003 (.doc) document.
     *
     * @param file
     * @return
     */
    private boolean isWord2003(File file) {
        String fileType = FileSearchUtils.getFileType(file.getName());
        return "doc".equalsIgnoreCase(fileType);
    }

    /**
     * Read the content of a Word 97-2003 (.doc) document.
     *
     * @param file
     * @return
     */
    private String readWord2003(File file) {
        InputStream inputStream = null;
        String context = null;
        try {
            inputStream = new FileInputStream(file);
            WordExtractor extractor = new WordExtractor(inputStream);
            context = extractor.getText();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(inputStream);
        }
        return context;
    }

    private String readWord2007(File file) {
        String text = null;
        OPCPackage openPackage = null;
        try {
            // Open the .docx package and create an extractor
            openPackage = POIXMLDocument.openPackage(file.getCanonicalPath());
            XWPFWordExtractor docx = new XWPFWordExtractor(openPackage);
            // Extract the body text of the .docx document
            text = docx.getText();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (XmlException e) {
            e.printStackTrace();
        } catch (OpenXML4JException e) {
            e.printStackTrace();
        } finally {
            if (openPackage != null) {
                try {
                    openPackage.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return text;
    }

}


7. ExcelContextConverter

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Content converter for Excel workbooks.
 *
 * @author chenfeic
 */
public class ExcelContextConverter extends AbstractContextConverter {

    @Override
    public String context2String(File file) {
        if (isExcel2003(file)) {
            return readExcel2003(file);
        } else {
            return readExcel2007(file);
        }
    }

    /**
     * Check whether the file is an Excel 97-2003 (.xls) workbook.
     *
     * @param file
     * @return
     */
    private boolean isExcel2003(File file) {
        String fileType = FileSearchUtils.getFileType(file.getName());
        return "xls".equalsIgnoreCase(fileType);
    }

    public String readExcel2003(File file) {
        InputStream inputStream = null;
        String content = null;
        try {
            inputStream = new FileInputStream(file.getCanonicalPath());
            HSSFWorkbook wb = new HSSFWorkbook(inputStream);
            ExcelExtractor extractor = new ExcelExtractor(wb);
            extractor.setFormulasNotResults(true);
            extractor.setIncludeSheetNames(false);
            content = extractor.getText();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(inputStream);
        }
        return content;
    }

    public String readExcel2007(File file) {
        StringBuffer content = new StringBuffer();
        InputStream inputStream = null;
        try {
            inputStream = new FileInputStream(file.getCanonicalPath());
            XSSFWorkbook xwb = new XSSFWorkbook(inputStream);
            // Iterate over the sheets
            for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
                XSSFSheet xSheet = xwb.getSheetAt(numSheet);
                if (xSheet == null) {
                    continue;
                }
                // Iterate over the rows
                for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
                    XSSFRow xRow = xSheet.getRow(rowNum);
                    if (xRow == null) {
                        continue;
                    }
                    // Iterate over the cells; getLastCellNum() returns the last cell index plus one
                    for (int cellNum = 0; cellNum < xRow.getLastCellNum(); cellNum++) {
                        XSSFCell xCell = xRow.getCell(cellNum);
                        if (xCell == null) {
                            continue;
                        }
                        if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
                            content.append(xCell.getBooleanCellValue());
                        } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
                            content.append(xCell.getNumericCellValue());
                        } else {
                            content.append(xCell.getStringCellValue());
                        }
                    }
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(inputStream);
        }

        return content.toString();
    }
}


2) Search-related classes

FileSearchServiceImpl -- searches for a keyword

package uap.pub.bap.fs.search.service;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexNotFoundException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import uap.pub.bap.fs.search.IFileSearchConst;
import uap.pub.bap.fs.search.SearchResult;
import uap.pub.bap.fs.search.util.FileSearchUtils;

public class FileSearchServiceImpl implements IFileSerachService {

    private int count = 0;

    @Override
    public List<SearchResult> search(String type, String key) {
        // type: the field to search -- file name, file content, etc.
        // key: the search keyword
        List<SearchResult> results = new ArrayList<SearchResult>();
        if (StringUtils.isEmpty(key)) {
            return results;
        }
        // TODO chenfeic
        String indexDir = "E:\\lucene\\index";
        IndexReader reader = null;
        Directory directory = null;
        try {
            directory = FSDirectory.open(new File(indexDir));
            reader = DirectoryReader.open(directory);
            IndexSearcher search = new IndexSearcher(reader);
            // Build the Query object with a QueryParser
            Analyzer analyzer = new IKAnalyzer();
            QueryParser qp = new QueryParser(Version.LUCENE_46, type, analyzer);
            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = qp.parse(key);
            // A simple container of pointers to the top N ranked search results
            TopDocs hits = search.search(query, null, 100);
            count = hits.totalHits;
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = search.doc(scoreDoc.doc);
                String summary = toHighlighter(query, doc,
                        IFileSearchConst.CONTENT_TYPE, analyzer);
                String title = doc.get(IFileSearchConst.FILENAM_TYPE);
                String path = doc.get(IFileSearchConst.PATH_TYPE);
                SearchResult result = new SearchResult();
                result.setPath(path);
                result.setTitle(title);
                if (!StringUtils.isEmpty(summary)) {
                    result.setSummary(summary);
                }
                results.add(result);
            }

        } catch (IndexNotFoundException e1) {
            System.out.println("No results: no index exists for this term");
        } catch (IOException e) {
            System.out.println("No results!");
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeIndexReader(reader);
            FileSearchUtils.closeDirectory(directory);
        }
        return results;
    }

    /**
     * Highlight the matched terms in the given field.
     *
     * @param query
     * @param doc
     * @param field
     * @return
     */
    private String toHighlighter(Query query, Document doc, String field,
            Analyzer analyzer) {
        try {
            SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter(
                    "<font color=\"red\">", "</font>");
            Highlighter highlighter = new Highlighter(simpleHtmlFormatter,
                    new QueryScorer(query));
            // highlighter.setTextFragmenter(new SimpleFragmenter(20)); // show 20 characters, default is 100
            TokenStream tokenStream = analyzer.tokenStream(field,
                    new StringReader(doc.get(field)));
            String highlighterStr = highlighter.getBestFragment(tokenStream,
                    doc.get(field));
            return highlighterStr == null ? doc.get(field) : highlighterStr;
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InvalidTokenOffsetsException e) {
            e.printStackTrace();
        }
        return null;
    }

    @Override
    public int getCount() {
        return this.count;
    }

}


The classes above are essentially the core of this demo; the remaining utility classes and the JSP/servlet handling classes are not listed here.
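For reference, here is a rough sketch of how the two entry points above might be wired together. The FileSearchDemo class is hypothetical, the SearchResult getters (getTitle, getPath, getSummary) are assumed to mirror the setters used in search(), and IFileSearchConst.CONTENT_TYPE is assumed to name the content field, as in the indexing code.

package uap.pub.bap.fs.search;

import java.util.List;

import uap.pub.bap.fs.search.indexer.FileIndexBuilder;
import uap.pub.bap.fs.search.service.FileSearchServiceImpl;
import uap.pub.bap.fs.search.service.IFileSerachService;

public class FileSearchDemo {

    public static void main(String[] args) {
        // 1. Build the index from a directory of files
        FileIndexBuilder indexer = new FileIndexBuilder();
        indexer.generateIndexer("E:\\lucene\\data", "E:\\lucene\\index");

        // 2. Search the content field for a keyword
        IFileSerachService service = new FileSearchServiceImpl();
        List<SearchResult> results = service.search(IFileSearchConst.CONTENT_TYPE, "Lucene");

        System.out.println("Total hits: " + service.getCount());
        for (SearchResult result : results) {
            // Getters are assumed to match the setters used in search()
            System.out.println(result.getTitle() + " -> " + result.getPath());
            System.out.println(result.getSummary());
        }
    }
}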

The demo relies almost entirely on third-party open-source tools. Chinese word segmentation is handled by IK Analyzer (version IK Analyzer 2012FF_hf1). (Note: I originally wanted to use paoding as the Chinese analyzer, but after trying it I found that paoding does not support Lucene 4.x, and probably stopped working as early as 3.x, because Lucene keeps changing its implementation and structure; some methods were made final, and since paoding overrides those methods it no longer compiles.) To support content-based retrieval, the file content has to be extracted and indexed, so POI is used to process MS Office documents, and cpdetector is used to detect the character encoding when reading txt and similar files. All of the code is shown above. Some of it I wrote myself and some is adapted from other people's posts online; my thanks to all of them.

The related JAR packages are the ones mentioned above: the Lucene 4.6 jars, IK Analyzer 2012FF_hf1, the POI jars, commons-lang, and cpdetector together with antlr.jar and chardet.jar.
