您的位置:首页 > 其它

Lucene 实现txt文件的构建索引与查询

2017-06-14 18:56 267 查看
package net.jqsoft.hecv.util;

import net.sf.json.JSONArray;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * Small Lucene 4.3 demo that indexes the {@code .txt} files of one directory and
 * runs highlighted full-text queries against the resulting index, using the IK
 * analyzer for tokenization.
 *
 * <p>Instances keep the currently open {@link Directory} and {@link IndexWriter}
 * in fields, so a single instance must not be used from several threads at once.
 *
 * Created by tianhj on 2017/6/14.
 */
public class LuceneTest {

    private String indexPath="F:\\lucene\\luceneIndex";// directory where the index is stored
    private String dataPath="F:\\lucene\\luceneData";// directory holding the txt files to index

    private Analyzer analyzer = new IKAnalyzer();// IK analyzer (Chinese word segmentation)
    private IndexWriter indexWriter;// writer of the index currently being built, or null
    private Directory directory;// index directory currently open, or null

    private static final String STARTTAG = "<";// text inserted before a highlighted term
    private static final String ENDTAG = ">";// text inserted after a highlighted term

    private static final String TXT_SUFFIX = ".txt";// extension of the files that get indexed

    public static void main(String[] args) {
        LuceneTest luceneTest=new LuceneTest();
        // Uncomment to (re)build the index before searching.
        //luceneTest.buildTxtIndex();
        luceneTest.searchTxt("勒索病毒手机");
    }

    /**
     * Searches the "content" field of the index and prints every hit (at most
     * 1000) to standard output together with a highlighted fragment.
     *
     * <p>Failures are reported on standard error instead of being thrown, so the
     * method keeps its exception-free signature.
     *
     * @param text query string in Lucene query-parser syntax
     */
    public void searchTxt(String text){
        if (text == null || text.trim().length() == 0) {
            System.err.println("searchTxt: empty query, nothing to search");
            return;
        }
        DirectoryReader ireader = null;
        try {
            directory = FSDirectory.open(new File(indexPath));
            ireader = DirectoryReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(ireader);
            // The parser needs the Lucene version, the default field and the analyzer.
            QueryParser parser = new QueryParser(Version.LUCENE_43, "content", analyzer);
            Query query;
            try {
                query = parser.parse(text);
            } catch (ParseException e) {
                // Without a query there is nothing to score or highlight; stop here
                // rather than carrying a null query into the searcher.
                System.err.println("searchTxt: cannot parse query \"" + text + "\": " + e.getMessage());
                return;
            }
            Highlighter highlighter = new Highlighter(
                    new SimpleHTMLFormatter(STARTTAG, ENDTAG), new QueryScorer(query));
            ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                Document doc = searcher.doc(hits[i].doc);
                String stored = doc.get("content");
                String content = stored == null ? "" : stored.trim();
                TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content));
                String fragment = highlighter.getBestFragment(tokenStream, content);
                if (fragment == null) {
                    // No fragment could be highlighted; show the plain text instead of "null".
                    fragment = content;
                }
                System.out.print((i+1)+": 文件名称"+doc.get("filename"));
                System.out.println("---------文件内容:"+fragment);
            }
        } catch (Exception e) {
            System.err.println("searchTxt: search failed: " + e);
            e.printStackTrace();
        } finally {
            // Release the reader first, then the directory it was opened on.
            closeQuietly(ireader);
            closeQuietly(directory);
            directory = null;
        }
    }

    /**
     * Rebuilds the index from scratch out of every txt file found in the data
     * directory.
     *
     * <p>The old index content is only replaced when the whole run succeeds: on
     * any failure the writer is rolled back, so the pending {@code deleteAll()}
     * and partial additions are discarded. Failures are reported on the console
     * instead of being thrown.
     */
    public void buildTxtIndex(){
        // Check the input first so that a wrong data path never wipes the index.
        if (!new File(dataPath).isDirectory()) {
            System.err.println("buildTxtIndex: data directory not found: " + dataPath);
            return;
        }

        try {
            directory = FSDirectory.open(new File(indexPath));
            indexWriter = getIndexWriter(directory);
            indexWriter.deleteAll();// drop everything currently in the index
        } catch (Exception e) {
            System.out.println("索引打开异常!");
            e.printStackTrace();
            releaseIndex(false);
            return;
        }

        boolean committed = false;
        try {
            for (File file : getFileList(dataPath)) {
                Document document = fileToDocument(file);
                indexWriter.addDocument(document);
                System.out.println("filename:"+document.get("filename")+";content:"+document.get("content"));
            }
            indexWriter.commit();
            committed = true;
        } catch (Exception e) {
            System.err.println("buildTxtIndex: indexing failed: " + e);
            e.printStackTrace();
        } finally {
            releaseIndex(committed);
        }
    }

    /**
     * Closes the writer (after a successful commit) or rolls it back (after a
     * failure), then closes the index directory. Never throws.
     *
     * @param committed whether the changes made through the writer were committed
     */
    private void releaseIndex(boolean committed) {
        try {
            if (committed) {
                closeWriter();
            } else if (indexWriter != null) {
                // rollback() discards uncommitted changes and closes the writer.
                indexWriter.rollback();
            }
        } catch (Exception e) {
            System.err.println("buildTxtIndex: failed to release the index writer: " + e);
        } finally {
            indexWriter = null;
            closeQuietly(directory);
            directory = null;
        }
    }

    /**
     * Closes a resource, reporting (not propagating) a failure to do so.
     *
     * @param resource resource to close; may be null
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            System.err.println("Failed to close " + resource + ": " + e);
        }
    }

    /**
     * Creates an index writer on the given directory, configured with this
     * instance's analyzer.
     *
     * @param dir directory the index is written to
     * @return a new writer; the caller is responsible for closing it
     * @throws Exception if the writer cannot be created
     */
    public IndexWriter getIndexWriter(Directory dir) throws Exception {
        IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_43, analyzer);
        return new IndexWriter(dir, iwc);
    }

    /**
     * Closes the current index writer, if there is one.
     *
     * @throws Exception if closing the writer fails
     */
    public void closeWriter() throws Exception {
        if(indexWriter != null) {
            indexWriter.close();
        }
    }

    /**
     * Converts a file into a Lucene document with the stored fields "filename",
     * "content" and "size".
     *
     * @param file file to convert
     * @return document describing the file
     * @throws Exception if the file cannot be read
     */
    public Document fileToDocument(File file) throws Exception {
        Document document=new Document();
        document.add(new Field("filename", file.getName(), TextField.TYPE_STORED));
        document.add(new Field("content", getFileContent(file), TextField.TYPE_STORED));
        // File.length() is the size of this file in bytes; getTotalSpace() would be
        // the size of the whole partition the file lives on.
        document.add(new Field("size", file.length()+"", TextField.TYPE_STORED));
        return document;
    }

    /**
     * Reads a whole GBK-encoded text file into a string.
     *
     * <p>Lines are joined with a single {@code '\n'}, so words on adjacent lines
     * stay separate tokens; there is no trailing line break.
     *
     * @param file file to read
     * @return the file content
     * @throws Exception if the file cannot be opened or read
     */
    public String getFileContent(File file) throws Exception{
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file),"GBK"));
        try {
            StringBuilder result = new StringBuilder();
            String lineTxt = br.readLine();
            while (lineTxt != null) {
                result.append(lineTxt);
                lineTxt = br.readLine();
                if (lineTxt != null) {
                    result.append('\n');
                }
            }
            return result.toString();
        } finally {
            // Closing the buffered reader also closes the wrapped stream reader.
            br.close();
        }
    }

    /**
     * Lists the txt files located directly inside a directory.
     *
     * @param dirPath path of the directory to scan
     * @return the txt files found; empty when the path is not a readable directory
     */
    public List<File> getFileList(String dirPath) {
        List<File> fileList=new ArrayList<File>();
        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] files=new File(dirPath).listFiles();
        if (files == null) {
            return fileList;
        }
        for(File file: files) {
            // Skip sub-directories that merely happen to be named "*.txt".
            if(file.isFile() && isTxtFile(file.getName())) {
                fileList.add(file);
            }
        }
        return fileList;
    }

    /**
     * Tells whether a file name denotes a txt file, i.e. has a non-empty base
     * name followed by the ".txt" extension (compared case-insensitively).
     *
     * @param fileName name to test; may be null
     * @return true for names such as "a.txt" or "A.TXT"; false for null, ".txt",
     *         "a.txt.bak" or "a.txtx"
     */
    public boolean isTxtFile(String fileName) {
        if (fileName == null || fileName.length() <= TXT_SUFFIX.length()) {
            return false;
        }
        int suffixStart = fileName.length() - TXT_SUFFIX.length();
        return fileName.regionMatches(true, suffixStart, TXT_SUFFIX, 0, TXT_SUFFIX.length());
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  lucene