Lucene 实现txt文件的构建索引与查询
2017-06-14 18:56
267 查看
package net.jqsoft.hecv.util; import net.sf.json.JSONArray; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; import java.io.*; import java.util.ArrayList; import java.util.List; /** * Created by tianhj on 2017/6/14. 
*/ public class LuceneTest { private String indexPath="F:\\lucene\\luceneIndex";//索引存放路径 private String dataPath="F:\\lucene\\luceneData";//txt文件所在路径 private Analyzer analyzer = new IKAnalyzer();//IK中文分词器 private IndexWriter indexWriter;//索引器 private Directory directory;//索引库 private static final String STARTTAG = "<";//高亮开始 private static final String ENDTAG = ">";//高亮结束 public static void main(String[] args) { LuceneTest luceneTest=new LuceneTest(); //luceneTest.buildTxtIndex(); luceneTest.searchTxt("勒索病毒手机"); } /** * Lucene检索 * @param text 关键词 */ public void searchTxt(String text){ JSONArray array = new JSONArray(); try{ directory= FSDirectory.open(new File(indexPath)); DirectoryReader ireader = DirectoryReader.open(directory);//打开存储位置 IndexSearcher searcher = new IndexSearcher(ireader);//创建搜索器 SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter( STARTTAG, ENDTAG);//查询结果高亮转换器 QueryParser parser = new QueryParser(Version.LUCENE_43, "content", analyzer);//查询解析器,设置Lucene版本、要查询的Field、分词器 Query query = null; try { query = parser.parse(text); } catch (ParseException e) { e.printStackTrace(); } Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); TokenStream tokenStream = null; ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; Document doc; for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; doc = searcher.doc(docId); String content=doc.get("content").trim(); tokenStream = analyzer.tokenStream("content", new StringReader(content)); content = highlighter.getBestFragment(tokenStream, content); System.out.print((i+1)+": 文件名称"+doc.get("filename")); System.out.println("---------文件内容:"+content); } ireader.close(); }catch (Exception e){ } } /** * 创建索引 */ public void buildTxtIndex(){ try { directory= FSDirectory.open(new File(indexPath)); indexWriter=getIndexWriter(directory); indexWriter.deleteAll();//清空所有索引库 } catch(Exception e) { System.out.println("索引打开异常!"); } List<File> fileList = 
getFileList(dataPath); Document document = null; try{ for(File file:fileList){ document = fileToDocument(file); indexWriter.addDocument(document); System.out.println("filename:"+document.get("filename")+";content:"+document.get("content")); } indexWriter.commit(); closeWriter(); }catch (Exception e){ } } /** * 获得indexwriter对象 * @param dir * @return * @throws Exception */ public IndexWriter getIndexWriter(Directory dir) throws Exception { IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_43, analyzer); return new IndexWriter(dir, iwc); } /** * 关闭indexwriter对象 * @throws Exception */ public void closeWriter() throws Exception { if(indexWriter != null) { indexWriter.close(); } } /** * 将文件转换成Document对象 * @param file * @return * @throws Exception */ public Document fileToDocument(File file) throws Exception { Document document=new Document(); document.add(new Field("filename", file.getName(), TextField.TYPE_STORED)); document.add(new Field("content", getFileContent(file), TextField.TYPE_STORED)); document.add(new Field("size", file.getTotalSpace()+"", TextField.TYPE_STORED)); return document; } /** * 读取文件内容 * @param file * @return * @throws Exception */ public String getFileContent(File file) throws Exception{ InputStreamReader reader = new InputStreamReader(new FileInputStream(file),"GBK"); BufferedReader br = new BufferedReader(reader); StringBuilder result = new StringBuilder(); String lineTxt = null; while((lineTxt = br.readLine()) != null){ result.append(lineTxt); } br.close(); reader.close(); return result.toString(); } /** * 获得所有txt文件 * @param dirPath * @return */ public List<File> getFileList(String dirPath) { File[] files=new File(dirPath).listFiles(); List<File> fileList=new ArrayList<File>(); for(File file: files) { if(isTxtFile(file.getName())) { fileList.add(file); } } return fileList; } /** * 判断是否是txt文件 * @param fileName * @return */ public boolean isTxtFile(String fileName) { if(fileName.lastIndexOf(".txt") > 0) { return true; } return false; } }
相关文章推荐
- lucene.net 2.9.2 实现索引生成,修改,查询,删除功能
- Lucene 4.9索引txt文件
- lucene.net学习六——多Field多索引文件的查询
- lucene.net 2.9.2 实现索引生成,修改,查询,删除功能
- 空间复杂度,实现从excel导出到txt文件中的java代码自动构建,逻辑条件不同实现则不同
- Java实现读取Doxygen查询功能的索引文件。
- Lucene使用之构建索引、查询
- 【lucene系列学习一】实现Lucene索引,查询以及中文分词功能
- Lucene实现索引和查询的实例讲解
- lucene.net 2.9.2 实现索引生成,修改,查询,删除功能
- Lucene4.10.4实践 索引联合查询数据库实现查询更快
- Lucene_demo09_txt文件索引
- lucene.net 2.9.2 实现索引生成,修改,查询,删除功能
- 第二章:lucene构建索引(新增、查询、更新、删除)
- lucene多索引文件并行查询
- 利用JAVA/JSF/JSP实现读取并截取.txt文件http://zhidao.baidu.com/question/42189115.html
- lucene与sql server数据库实现索引的简单实例(vs.net2008)
- Lucene.NET搜索多个索引文件
- lucene.net索引文件存储简析
- 搜索引擎第七讲 Compass-Lucene版的构建索引时的参数详解