您的位置:首页 > 其它

Lucene索引的增删改查和二次检索

2015-07-16 13:20 344 查看
本博客是本人使用lucene时写的工具类。包含对索引的增删改查,以及对检索结果的多次子检索,检索结果高亮显示等。

// Bean that carries the data used by this example (here: an archive file record); populate it yourself from the database.
public class DocumentBean {
private String id;                   // record id
private String archiveTypeId;        // archive type id
private String archiveId;            // archive id
private String itemNo;               // file number
private String titleProper;          // title
private String fileSize;             // file size
private String fileFormat;           // file format
private String path;                 // file path
private String officeArchiveCode;    // official document number
private String docType;              // file type
private String browseCount;          // number of views
private String downloadCount;        // number of downloads

private String fileLastDate;         // date of the last modification

private String archiveTypeName;      // archive type name
private String uploadUserName;       // presumably the name of the uploading user - confirm against the database mapping
private String uploadDeptName;       // presumably the department of the uploading user - confirm against the database mapping
private String content;              // presumably the text content of the file - confirm against the callers
private String realName;  // name under which the file is stored in the database
private String belongArch;// archive this file belongs to
/** getters and setters omitted **/

}

/**
 * Static settings and index field names shared by the Lucene utilities.
 */
public class GlobalBean {

    // ---- Lucene configuration -------------------------------------------

    /** Location where the index is stored. */
    public static final String LUCENE_INDEX_PATH = "/index";

    /** Location of the files that are added to the index. */
    public static final String LUCENE_FILE_PATH = "/uploadFile";

    // ---- Index field names ----------------------------------------------

    /** Index field: record id. */
    public static final String LUCENE_FIELD_ID = "id";

    /** Index field: title. */
    public static final String LUCENE_FIELD_TITLE_PROPER = "titleProper";

    /** Index field: uploading user. */
    public static final String LUCENE_FIELD_UPLOAD_USER = "uploadUser";

    /** Index field: file format. */
    public static final String LUCENE_FIELD_FILE_FORMAT = "fileFormat";

    /** Index field: archive type. */
    public static final String LUCENE_FIELD_ARCHIVE_TYPE = "archiveType";

    /** Index field: owning file/archive. */
    public static final String LUCENE_FIELD_BELONG_FILE = "belongFile";

    /** Index field: MIME type of the file. */
    public static final String LUCENE_FIELD_DOC_TYPE = "docType";

    /** Index field: file path. */
    public static final String LUCENE_FIELD_PATH = "path";

    /** Index field: archive content. */
    public static final String LUCENE_FIELD_CONTENT = "content";

    /** Index field: archive type id. */
    public static final String LUCENE_FIELD_TYPE_ID = "typeId";

    /**
     * Names of the index fields that take part in a search, separated by a
     * single space.
     */
    public static final String LUCENE_INDEX_FIELDS =
            LUCENE_FIELD_TITLE_PROPER + " "
            + LUCENE_FIELD_UPLOAD_USER + " "
            + LUCENE_FIELD_ARCHIVE_TYPE + " "
            + LUCENE_FIELD_CONTENT;

    // ---- Values filled in at runtime ------------------------------------

    /**
     * Real server path of the directory holding the uploaded files. Empty by
     * default: fill in your own path (the author initializes it at server
     * start-up).
     */
    public static String LUCENE_FACT_FILE_PATH = "";

    /**
     * Real server path of the directory in which the index is created. Empty
     * by default: fill in your own path (the author initializes it at server
     * start-up).
     */
    public static String LUCENE_FACT_INDEX_PATH = "";

    /** Size of the context shown around a highlighted match. */
    public static int LUCENE_HIGHLIGHT_CONTEXT_COUNT = 80;
}
//lucene操作工具类
public class LuceneUtil {

// Shared index writer; created on first use by getWriter() and reused afterwards.
private static IndexWriter writer = null;
// Most recently created searcher; reassigned on every getSearcher() call.
private static IndexSearcher searcher = null;
// Reader behind the most recently created searcher; reassigned on every getSearcher() call.
private static IndexReader reader = null;
// Analyzer used both for the writer configuration and for parsing/highlighting queries.
private static final IKAnalyzer analyzer = new IKAnalyzer();

/**
 * Indexes every entry of the configured upload directory that has a matching
 * database record, then commits the writer.
 *
 * <p>Entries without a database record are skipped, and database records
 * without a file on disk are collected; both groups are reported through the
 * exception thrown after the commit. A missing or unreadable upload directory
 * is treated as an empty directory, so every database record is then reported
 * as lost.
 *
 * @param fileMap
 *            file name (key) to database record (value) of the uploaded files
 * @throws SystemGlobalException
 *             if an entry is not readable, if the commit fails with an I/O
 *             error, or - after a successful commit - if some entries were
 *             skipped or some recorded files are missing on disk
 */
public static void createIndex(Map<String, DocumentBean> fileMap)
        throws SystemGlobalException {
    StringBuilder unDataBaseFiles = new StringBuilder(); // entries unknown to the database
    StringBuilder unExistsFiles = new StringBuilder(); // records whose file is gone
    // Names found on disk that also have a record; a set keeps the lookup below O(1).
    java.util.Set<String> entityNames = new java.util.HashSet<String>();
    try {
        File[] files = getFiles();
        if (files == null) {
            // File.listFiles() yields null when the path is not a directory.
            files = new File[0];
        }
        for (File file : files) {
            if (!file.canRead()) {
                throw new SystemGlobalException("文件" + file.getName()
                        + "不可读!");
            }
            // No index entry is created for a file the database does not know.
            DocumentBean dataBaseDoc = fileMap.get(file.getName());
            if (CommonUtil.isEmpty(dataBaseDoc)) {
                unDataBaseFiles.append("[").append(file.getName()).append("],");
                continue;
            }
            entityNames.add(file.getName());
            if (file.isFile()) {
                saveOrUpdateDocument(dataBaseDoc, file);
            }
        }
        for (Map.Entry<String, DocumentBean> entry : fileMap.entrySet()) {
            // Present in the database but missing on disk.
            if (!entityNames.contains(entry.getKey())) {
                unExistsFiles.append("[")
                        .append(entry.getValue().getTitleProper())
                        .append("],");
            }
        }

        String unDataBaseFileMsg = "";
        if (unDataBaseFiles.length() > 0) {
            unDataBaseFileMsg = "创建索引完成!其中"
                    + StringUtil.subStr(unDataBaseFiles.toString(), 1)
                    + "创建失败!原因:文件不在数据库中存在!";
        }
        String unExistsFileMsg = "";
        if (unExistsFiles.length() > 0) {
            unExistsFileMsg = "创建索引完成!其中"
                    + StringUtil.subStr(unExistsFiles.toString(), 1)
                    + "创建失败!原因:本地文件丢失!";
        }
        // Commit first so the documents indexed so far are kept even when a warning follows.
        getWriter().commit();
        if (unDataBaseFileMsg.length() > 0 || unExistsFileMsg.length() > 0) {
            throw new SystemGlobalException(unDataBaseFileMsg + "\n"
                    + unExistsFileMsg);
        }
    } catch (IOException e) {
        throw new SystemGlobalException("文件读写异常!");
    }
}

/**
 * Removes every document from the index through the shared writer.
 *
 * <p>This method does not commit; the deletion takes effect with the next
 * commit issued on the writer.
 *
 * @throws SystemGlobalException
 *             if the writer cannot be obtained or the deletion fails with an
 *             I/O error
 */
public static void deleteAllIndex() throws SystemGlobalException {
    IndexWriter indexWriter = getWriter();
    try {
        indexWriter.deleteAll();
    } catch (IOException ioe) {
        throw new SystemGlobalException("删除索引异常");
    }
}

/**
 * Rebuilds the index: deletes all documents, then indexes everything again
 * via {@link #createIndex(Map)}.
 *
 * @param fileMap
 *            file name (key) to database record (value) of the uploaded files
 * @throws SystemGlobalException
 *             if deleting or re-creating the index fails; the message of the
 *             underlying failure (for example the list of skipped files
 *             reported by createIndex) is appended so it is not lost
 */
public static void rebuildIndex(Map<String, DocumentBean> fileMap)
        throws SystemGlobalException {
    try {
        deleteAllIndex();
        createIndex(fileMap);
        System.out.println("-----重建索引库成功!");
    } catch (SystemGlobalException e) {
        // Keep the original reason instead of replacing it with a generic text.
        String detail = e.getMessage();
        throw new SystemGlobalException(detail == null ? "重建索引库异常!"
                : "重建索引库异常!" + detail);
    }
}

/**
 * Collects the files of the configured upload directory whose last-modified
 * timestamp differs from the one recorded in the database.
 *
 * <p>The index itself is not touched here; the caller receives the changed
 * files and decides how to update the index and the database.
 *
 * @param beanMap
 *            file name (key) to database record (value)
 * @return record id (key) to changed file (value); empty when nothing changed
 *         or when the upload directory cannot be listed
 * @throws SystemGlobalException
 *             declared for callers; not thrown by the current implementation
 */
public static Map<String, File> updateBatchIndex(
        Map<String, DocumentBean> beanMap) throws SystemGlobalException {
    Map<String, File> modifiedMap = new HashMap<String, File>();
    File[] files = getFiles();
    if (files == null) {
        // File.listFiles() yields null when the path is not a directory.
        return modifiedMap;
    }
    for (File file : files) {
        DocumentBean bean = beanMap.get(file.getName());
        if (bean == null) {
            continue; // file unknown to the database
        }
        String lastDate = DateUtil.convertDateToStr(
                new Date(file.lastModified()), DateUtil.YYYYMMDDHHMMSS);
        // A differing timestamp means the file was modified after it was recorded.
        if (!CommonUtil.isEmpty(lastDate)
                && !lastDate.equals(bean.getFileLastDate())) {
            modifiedMap.put(bean.getId(), file);
        }
    }
    return modifiedMap;
}

/**
 * Builds the Lucene document for one file and adds it to the index, replacing
 * any document previously stored under the same id. The change is not
 * committed here.
 *
 * <p>The id is indexed as an untokenized {@code StringField}. Lucene does not
 * analyze the {@code Term} handed to {@code updateDocument} /
 * {@code deleteDocuments}, so an analyzed id field only matches when the
 * analyzer happens to leave the id unchanged; otherwise every update adds a
 * duplicate document.
 *
 * <p>Failures are reported on standard output and otherwise ignored
 * (best effort), as before.
 *
 * @param dataBaseDoc
 *            database record of the file
 * @param file
 *            the file whose text content is extracted with Tika; files whose
 *            doc type contains "image/" are indexed with empty content
 */
public static void saveOrUpdateDocument(DocumentBean dataBaseDoc, File file) {
    System.out
            .println("正在创建文件[" + dataBaseDoc.getTitleProper() + "]的索引...");
    try {
        Document doc = new Document();
        // Unique key of the record; must stay untokenized to be matchable by Term.
        doc.add(new org.apache.lucene.document.StringField(
                GlobalBean.LUCENE_FIELD_ID,
                CommonUtil.objToStr(dataBaseDoc.getId()), Store.YES));
        // Title
        doc.add(new TextField(GlobalBean.LUCENE_FIELD_TITLE_PROPER,
                CommonUtil.objToStr(dataBaseDoc.getTitleProper()),
                Store.YES));
        // Uploading user
        doc.add(new TextField(GlobalBean.LUCENE_FIELD_UPLOAD_USER,
                CommonUtil.objToStr(dataBaseDoc.getUploadUserName()),
                Store.YES));
        // Archive type name
        doc.add(new TextField(GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE,
                CommonUtil.objToStr(dataBaseDoc.getArchiveTypeName()),
                Store.YES));

        String formatter = CommonUtil.objToStr(dataBaseDoc.getFileFormat());
        // Converted once so a missing doc type no longer fails with a NullPointerException.
        String docType = CommonUtil.objToStr(dataBaseDoc.getDocType());
        boolean isImage = docType != null && docType.contains("image/");
        // Images carry no extractable text (OCR is not wired in); everything else goes through Tika.
        String content = isImage ? "" : new Tika().parseToString(file).trim();

        doc.add(new TextField(GlobalBean.LUCENE_FIELD_CONTENT, content,
                Store.YES));
        doc.add(new TextField(GlobalBean.LUCENE_FIELD_FILE_FORMAT,
                formatter, Store.YES));
        doc.add(new TextField(GlobalBean.LUCENE_FIELD_BELONG_FILE,
                CommonUtil.objToStr(dataBaseDoc.getBelongArch()), Store.YES));
        doc.add(new TextField(GlobalBean.LUCENE_FIELD_DOC_TYPE, docType,
                Store.YES));
        doc.add(new TextField(GlobalBean.LUCENE_FIELD_PATH, CommonUtil
                .objToStr(dataBaseDoc.getPath()), Store.YES));
        doc.add(new TextField(GlobalBean.LUCENE_FIELD_TYPE_ID, CommonUtil
                .objToStr(dataBaseDoc.getArchiveTypeId()), Store.YES));
        // Adds the document, or replaces the one already stored under this id.
        getWriter().updateDocument(
                new Term(GlobalBean.LUCENE_FIELD_ID, dataBaseDoc.getId()),
                doc);
        System.out.println("文件[" + dataBaseDoc.getTitleProper()
                + "]的索引创建完成!");
    } catch (Exception e) {
        System.out.println(e.getMessage() + "\n不能添加空索引值!");
    }
}

/**
 * Indexes (or re-indexes) a single file and commits the writer right away.
 * Any failure is printed to standard output and not propagated.
 *
 * @param bean
 *            database record of the file
 * @param file
 *            the file to index
 */
public static void saveOrUpdateIndex(DocumentBean bean, File file) {
    try {
        saveOrUpdateDocument(bean, file);
        IndexWriter indexWriter = getWriter();
        indexWriter.commit();
    } catch (Exception failure) {
        String reason = failure.getMessage();
        System.out.println("文件[" + bean.getTitleProper() + "]的索引创建出错!错误原因:"
                + reason);
    }
}

/**
 * Lists the entries of the configured upload directory
 * ({@code GlobalBean.LUCENE_FACT_FILE_PATH}).
 *
 * @return the directory entries; an empty array (never {@code null}) when the
 *         path is not a directory or cannot be listed
 */
public static File[] getFiles() {
    return getFiles(GlobalBean.LUCENE_FACT_FILE_PATH);
}

/**
 * Lists the entries of the given directory.
 *
 * @param path
 *            path of the directory to list
 * @return the directory entries; an empty array (never {@code null}) when the
 *         path is not a directory or cannot be listed
 */
public static File[] getFiles(String path) {
    File[] entries = new File(path).listFiles();
    // listFiles() signals "not a directory / I/O error" with null; callers iterate the result directly.
    return entries != null ? entries : new File[0];
}

/**
 * Runs a keyword search over all searchable index fields.
 *
 * <p>Every field listed in {@code GlobalBean.LUCENE_INDEX_FIELDS} is queried
 * as an optional (SHOULD) clause. Errors are printed and yield whatever was
 * collected so far; an empty query string yields an empty list.
 *
 * @param queryStr
 *            the keyword(s) to search for
 * @return the matching records in the order in which they were found
 */
public static List<DocumentBean> search(String queryStr) {
    System.out.println("正在检索关键字为[" + queryStr + "]的索引文件!");
    Map<String, DocumentBean> beansById = new HashMap<String, DocumentBean>();
    List<String> orderedIds = new ArrayList<String>();
    List<DocumentBean> results = new ArrayList<DocumentBean>();
    try {
        long startedAt = System.currentTimeMillis();
        boolean hasKeyword = !CommonUtil.isEmpty(queryStr);
        if (hasKeyword) {
            String[] searchFields = GlobalBean.LUCENE_INDEX_FIELDS.split(" ");
            BooleanClause.Occur[] occurs = new BooleanClause.Occur[searchFields.length];
            java.util.Arrays.fill(occurs, BooleanClause.Occur.SHOULD);
            // One query spanning all searchable fields.
            Query multiFieldQuery = MultiFieldQueryParser.parse(queryStr,
                    searchFields, occurs, analyzer);
            search(multiFieldQuery, new QueryWrapperFilter(multiFieldQuery),
                    beansById, orderedIds);
        }
        long elapsed = System.currentTimeMillis() - startedAt;
        System.out.println("检索结束,耗时:" + elapsed + "ms");
        // Re-assemble the beans in hit order.
        for (int i = 0; i < orderedIds.size(); i++) {
            results.add(beansById.get(orderedIds.get(i)));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return results;
}

/**
 * Advanced "search within results" over a variable number of keywords.
 *
 * <p>Each adjacent pair of keywords is run through
 * {@link #searchInResult(String, String, List)}, and a hit only survives when
 * it also survived every previous pair. The set of surviving ids is narrowed
 * after each pair; previously it was fixed after the first pair, so with five
 * or more keywords the middle keywords had no effect on the result.
 *
 * @param params
 *            the keywords, in the order in which they were entered
 * @return the records matching all keywords, in the hit order of the last
 *         pair; {@code null} when fewer than two keywords are given
 *         (unchanged from the previous behavior)
 */
public static List<DocumentBean> seniorSearchInResult(List<String> params) {
    List<DocumentBean> result = null;
    // Ids that survived all pairs so far; null until the first pair has run.
    java.util.Set<String> survivingIds = null;
    List<String> pairIds = new ArrayList<String>();
    for (int index = 0; index < params.size() - 1; index++) {
        // pairIds is refilled by searchInResult and is index-aligned with pairBeans.
        List<DocumentBean> pairBeans = searchInResult(params.get(index),
                params.get(index + 1), pairIds);
        List<DocumentBean> kept = new ArrayList<DocumentBean>();
        java.util.Set<String> keptIds = new java.util.HashSet<String>();
        int hitCount = Math.min(pairBeans.size(), pairIds.size());
        for (int hit = 0; hit < hitCount; hit++) {
            String id = pairIds.get(hit);
            if (survivingIds == null || survivingIds.contains(id)) {
                kept.add(pairBeans.get(hit));
                keptIds.add(id);
            }
        }
        survivingIds = keptIds;
        result = kept;
    }
    return result;
}

/**
 * Secondary search: looks for {@code newStr} only among the documents that
 * match {@code oldStr}.
 *
 * <p>Both keywords are parsed against every field listed in
 * {@code GlobalBean.LUCENE_INDEX_FIELDS}; the old query is applied as a filter
 * on the new one. Failures (for example an unparsable keyword) are printed and
 * produce an empty result instead of being silently discarded.
 *
 * @param oldStr
 *            keyword of the previous search
 * @param newStr
 *            keyword to look for inside the previous result
 * @param sortList
 *            output parameter; cleared first, then filled with the ids of the
 *            hits in hit order
 * @return the matching records, index-aligned with {@code sortList}
 */
public static List<DocumentBean> searchInResult(String oldStr,
        String newStr, List<String> sortList) {
    sortList.clear();
    System.out
            .println("正在检索关键字[" + oldStr + "]结果中关键字为[" + newStr + "]的文件!");
    Map<String, DocumentBean> map = new HashMap<String, DocumentBean>();
    List<DocumentBean> sortedList = new ArrayList<DocumentBean>();
    try {
        long start = System.currentTimeMillis();

        String[] fields = GlobalBean.LUCENE_INDEX_FIELDS.split(" ");
        BooleanClause.Occur[] flags = new BooleanClause.Occur[fields.length];
        for (int index = 0; index < fields.length; index++) {
            flags[index] = BooleanClause.Occur.SHOULD;
        }
        // Multi-field queries for the new keyword and for the previous one.
        Query query = MultiFieldQueryParser.parse(newStr.trim(), fields,
                flags, analyzer);
        Query oldQuery = MultiFieldQueryParser.parse(oldStr.trim(), fields,
                flags, analyzer);
        // The previous search restricts the candidate documents of the new one.
        QueryWrapperFilter oldFilter = new QueryWrapperFilter(oldQuery);
        CachingWrapperFilter filter = new CachingWrapperFilter(oldFilter);
        search(query, filter, map, sortList);

        long end = System.currentTimeMillis();
        System.out.println("检索结束,耗时:" + (end - start) + "ms");
        for (String id : sortList) {
            sortedList.add(map.get(id));
        }
    } catch (Exception e) {
        // Report the failure like search(String) does; the (possibly empty) result is still returned.
        e.printStackTrace();
    }
    return sortedList;
}

/**
 * Executes the query and converts every hit into a {@link DocumentBean},
 * highlighting the values of the searchable fields.
 *
 * <p>A single searcher is obtained per call and its reader is closed when the
 * call finishes. Previously a new reader was opened for every hit and never
 * closed, which leaked file handles and resolved document numbers against a
 * different reader than the one that produced them.
 *
 * @param query
 *            the query to execute (also used for highlighting)
 * @param filter
 *            restricts the candidate documents
 * @param map
 *            output: record id to bean; ids already present are skipped
 * @param sortList
 *            output: ids of the newly added beans in hit order
 */
private static void search(Query query, Filter filter,
        Map<String, DocumentBean> map, List<String> sortList) {
    final int maxHits = 1000000;
    IndexReader openedReader = null;
    try {
        IndexSearcher indexSearcher = getSearcher();
        openedReader = indexSearcher.getIndexReader();
        ScoreDoc[] scoreDocs = indexSearcher.search(query, filter, maxHits).scoreDocs;
        if (scoreDocs == null) {
            return;
        }
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document document = indexSearcher.doc(scoreDoc.doc);
            String id = document.get(GlobalBean.LUCENE_FIELD_ID);
            if (map.containsKey(id)) {
                continue; // the same record was already collected
            }
            DocumentBean bean = new DocumentBean();
            bean.setId(highlightField(query, GlobalBean.LUCENE_FIELD_ID, id));
            bean.setTitleProper(highlightField(query,
                    GlobalBean.LUCENE_FIELD_TITLE_PROPER,
                    document.get(GlobalBean.LUCENE_FIELD_TITLE_PROPER)));
            bean.setUploadUserName(highlightField(query,
                    GlobalBean.LUCENE_FIELD_UPLOAD_USER,
                    document.get(GlobalBean.LUCENE_FIELD_UPLOAD_USER)));
            bean.setArchiveTypeName(highlightField(query,
                    GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE,
                    document.get(GlobalBean.LUCENE_FIELD_ARCHIVE_TYPE)));
            bean.setContent(highlightField(query,
                    GlobalBean.LUCENE_FIELD_CONTENT,
                    document.get(GlobalBean.LUCENE_FIELD_CONTENT)));
            bean.setFileFormat(highlightField(query,
                    GlobalBean.LUCENE_FIELD_FILE_FORMAT,
                    document.get(GlobalBean.LUCENE_FIELD_FILE_FORMAT)));
            bean.setBelongArch(highlightField(query,
                    GlobalBean.LUCENE_FIELD_BELONG_FILE,
                    document.get(GlobalBean.LUCENE_FIELD_BELONG_FILE)));
            // These values are passed through without highlighting.
            bean.setDocType(document.get(GlobalBean.LUCENE_FIELD_DOC_TYPE));
            bean.setPath(document.get(GlobalBean.LUCENE_FIELD_PATH));
            bean.setArchiveTypeId(document.get(GlobalBean.LUCENE_FIELD_TYPE_ID));
            map.put(id, bean);
            sortList.add(id); // preserves the hit order
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (openedReader != null) {
            try {
                openedReader.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}

/**
 * Returns the highlighted form of a stored value when the field takes part in
 * searches and the highlighter produced a fragment; otherwise the value as
 * stored (possibly {@code null}).
 */
private static String highlightField(Query query, String fieldName,
        String value) throws Exception {
    if (value == null || value.length() == 0 || !isSearchField(fieldName)) {
        return value;
    }
    String highlighted = toHighlight(query, analyzer, fieldName, value,
            GlobalBean.LUCENE_HIGHLIGHT_CONTEXT_COUNT);
    return CommonUtil.isEmpty(highlighted) ? value : highlighted;
}

/** Tells whether the field is one of the space-separated names in LUCENE_INDEX_FIELDS. */
private static boolean isSearchField(String fieldName) {
    for (String searchField : GlobalBean.LUCENE_INDEX_FIELDS.split(" ")) {
        if (searchField.equals(fieldName)) {
            return true;
        }
    }
    return false;
}

/**
 * Returns the shared index writer, creating it on first use.
 *
 * <p>The writer is opened on {@code GlobalBean.LUCENE_FACT_INDEX_PATH} in
 * CREATE_OR_APPEND mode with the shared analyzer. If the writer cannot be
 * created, the directory opened for it is closed again instead of being
 * leaked.
 *
 * <p>NOTE(review): the lazy initialization is not synchronized; confirm that
 * the first call cannot happen concurrently from several threads.
 *
 * @return the shared writer
 * @throws SystemGlobalException
 *             if the index directory or the writer cannot be opened
 */
public static IndexWriter getWriter() throws SystemGlobalException {
    if (writer == null) {
        IndexWriterConfig config = new IndexWriterConfig(
                Version.LUCENE_4_10_3, analyzer);
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        // Location of the index on disk.
        File fileindex = new File(GlobalBean.LUCENE_FACT_INDEX_PATH);
        FSDirectory directory = null;
        try {
            directory = FSDirectory.open(fileindex);
            writer = new IndexWriter(directory, config);
        } catch (IOException e) {
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException ignored) {
                    // Nothing more can be done; the creation failure below is what matters.
                }
            }
            throw new SystemGlobalException("索引操作对象创建失败!");
        }
    }
    return writer;
}

/**
 * Opens a new reader on the index directory
 * ({@code GlobalBean.LUCENE_FACT_INDEX_PATH}) and returns a searcher on it.
 *
 * <p>Every call opens a fresh reader and stores it, together with the
 * searcher, in the static fields; a previously opened reader is not closed
 * here.
 *
 * @return a searcher backed by the newly opened reader
 * @throws SystemGlobalException
 *             if the index cannot be opened
 */
public static IndexSearcher getSearcher() throws SystemGlobalException {
    try {
        FSDirectory indexDirectory = FSDirectory.open(new File(
                GlobalBean.LUCENE_FACT_INDEX_PATH));
        reader = DirectoryReader.open(indexDirectory);
        searcher = new IndexSearcher(reader);
        return searcher;
    } catch (IOException ioe) {
        throw new SystemGlobalException("索引搜索对象创建失败!");
    }
}

/**
 * Deletes the index entry of one file and commits the change.
 *
 * <p>The entry is located by an exact term on the id field. Success is
 * reported only after the commit went through; a failure is printed instead
 * of being silently discarded, and is not propagated.
 *
 * @param dataBaseDoc
 *            database record whose index entry is removed
 */
public static void deleteFileIndex(DocumentBean dataBaseDoc) {
    System.out
            .println("正在删除文件[" + dataBaseDoc.getTitleProper() + "]的索引...");
    try {
        IndexWriter indexWriter = getWriter();
        indexWriter.deleteDocuments(
                new Term(GlobalBean.LUCENE_FIELD_ID, dataBaseDoc.getId()));
        indexWriter.commit();
        System.out.println("文件[" + dataBaseDoc.getTitleProper()
                + "]的索引删除完成!");
    } catch (Exception e) {
        System.out.println("文件[" + dataBaseDoc.getTitleProper()
                + "]的索引删除出错!错误原因:" + e.getMessage());
    }
}

/**
 * Produces the best-matching fragment of a text with the query terms wrapped
 * in a red {@code <font>} tag.
 *
 * @param query
 *            query whose terms are highlighted
 * @param analyzer
 *            analyzer used to tokenize the text
 * @param fieldName
 *            name of the field the text belongs to
 * @param text
 *            the text to highlight
 * @param length
 *            fragment size handed to the fragmenter
 * @return whatever the highlighter's {@code getBestFragment} returns for the
 *         text
 * @throws Exception
 *             if highlighting fails
 */
private static String toHighlight(Query query, Analyzer analyzer,
        String fieldName, String text, int length) throws Exception {
    SimpleHTMLFormatter redFont = new SimpleHTMLFormatter(
            "<font color='red'>", "</font>");
    QueryScorer scorer = new QueryScorer(query);
    Highlighter highlighter = new Highlighter(redFont, scorer);
    highlighter.setTextFragmenter(new SimpleFragmenter(length));
    return highlighter.getBestFragment(analyzer, fieldName, text);
}

/**
 * Re-decodes a string: encodes it as GBK bytes and reads those bytes back as
 * UTF-8.
 *
 * @param str
 *            the string to convert
 * @return the re-decoded string; the input itself when it is empty or when
 *         one of the charsets is not supported
 */
public static String secondParseCode(String str) {
    if (CommonUtil.isEmpty(str)) {
        return str;
    }
    try {
        byte[] gbkBytes = str.getBytes("gbk");
        return new String(gbkBytes, "utf-8");
    } catch (UnsupportedEncodingException unsupported) {
        // Fall back to the untouched input.
        return str;
    }
}

}


如有疑问,欢迎留言
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: