Lucene检索WORD等文件
2016-06-30 16:58
183 查看
Lucene是什么?
Lucene是Apache开源的全文检索框架。它只是一个工具库,需要自己编程调用,不像百度那样的搜索引擎拿来就能用!
Lucene实现检索的过程?
Lucene实际上是先将文本写入索引(Index),然后再从索引中搜索出来。
写入:
涉及的类: Document 、Field 、IndexWriter
Document相当于数据库表的一行,Field相当于数据库表的一个字段,Document可以包含多个Field,用IndexWriter对象将Document对象写在磁盘上或内存里,这就实现了字符串的写入!
搜索:
对写入的文本进行搜索!
如何检索WORD等microsoft文件?
要实现Lucene检索WORD等文件,首先需要读取出WORD文件中的内容,再使用Lucene将内容写入。
如何读取microsoft文件?
可以使用apache的POI开源项目进行读取。
package org.fazlan.lucene.demo;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Writes {@link IndexItem}s into a Lucene index stored on disk.
 */
public class Indexer {

    /** Location where the index will be stored. */
    public static final String INDEX_DIR = "src/main/resources/index";

    private final IndexWriter writer;

    /**
     * Opens (or creates) the index at {@link #INDEX_DIR}.
     *
     * @throws IllegalStateException if the index cannot be opened. (The
     *         original code caught and printed the exception, leaving
     *         {@code writer} null and causing a NullPointerException on the
     *         first {@code writeIndex} call instead of failing fast here.)
     */
    public Indexer() {
        try {
            writer = new IndexWriter(FSDirectory.open(new File(INDEX_DIR)),
                    new IndexWriterConfig(Version.LUCENE_36,
                            new StandardAnalyzer(Version.LUCENE_36)));
        } catch (Exception e) {
            throw new IllegalStateException(
                    "Unable to open Lucene index at " + INDEX_DIR, e);
        }
    }

    /**
     * Adds the given item to the index, replacing any previously indexed
     * document with the same id.
     *
     * @param indexItem the item to index; its id is used as the unique key
     * @throws IOException if the underlying index cannot be written
     */
    public void writeIndex(IndexItem indexItem) throws IOException {
        // Delete any existing document with this id so re-indexing a file
        // updates it rather than duplicating it.
        writer.deleteDocuments(new Term(IndexItem.ID, indexItem.getId().toString()));

        Document doc = new Document();
        // Exact-match fields are stored but not analyzed; searchable text
        // (title, content) is analyzed.
        doc.add(new Field(IndexItem.ID, indexItem.getId().toString(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(IndexItem.TITLE, indexItem.getTitle(),
                Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(IndexItem.FILENAME, indexItem.getFilename(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(IndexItem.CONTENT, indexItem.getContent(),
                Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(IndexItem.DATE, indexItem.getDate(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(IndexItem.USER_NAME, indexItem.getUserName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(IndexItem.URL, indexItem.getUrl(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));

        writer.addDocument(doc);
    }

    /**
     * Commits pending changes and closes the underlying writer. Must be
     * called before another process (or a Searcher) opens the directory.
     *
     * @throws IOException if closing the index fails
     */
    public void close() throws IOException {
        writer.close();
    }
}
package org.fazlan.lucene.demo; /** * 索引对象,根据业务要求改动 * * @author JiaJiCheng * */ public class IndexItem { private Long id; private String title; private String filename; private String content; private String date; private String userName; private String url; public static final String ID = "id"; public static final String TITLE = "title"; public static final String CONTENT = "content"; public static final String DATE = "date"; public static final String USER_NAME = "userName"; public static final String FILENAME = "filename"; public static final String URL = "url"; public IndexItem(Long id, String title, String filename, String content, String date, String userName, String url) { this.id = id; this.title = title; this.content = content; this.date = date; this.userName = userName; this.filename = filename; this.url = url; } public String getFilename() { return filename; } public String getUrl() { return url; } public String getDate() { return date; } public String getUserName() { return userName; } public Long getId() { return id; } public String getTitle() { return title; } public String getContent() { return content; } @Override public String toString() { return "IndexItem{" + "id=" + id + ", title='" + title + ", content='" + content + '\'' + "date=" + date + "userName=" + userName + '}'; } }
package org.fazlan.lucene.demo;

import org.apache.poi.extractor.ExtractorFactory;

import java.io.File;
import java.io.IOException;

/**
 * Extracts the text of a Microsoft Office file via Apache POI and wraps it
 * in an {@link IndexItem} ready for indexing.
 *
 * @author JiaJiCheng
 */
public class MSDocumentParser {

    /**
     * Strips the extension from a file name; returns the name unchanged when
     * it has no extension. (The original called {@code substring(0, -1)} for
     * extensionless names, throwing StringIndexOutOfBoundsException.)
     */
    private static String getFilename(String filename) {
        int dot = filename.lastIndexOf('.');
        return dot < 0 ? filename : filename.substring(0, dot);
    }

    /**
     * Parses the given file into an IndexItem. The file's hash code is used
     * as the document id; the name without extension becomes the title.
     *
     * @param file     the Office document to extract text from
     * @param date     date string to store with the item
     * @param userName owning user to store with the item
     * @param url      source URL to store with the item
     * @return an IndexItem; its content is the empty string if extraction
     *         failed (previously it was left {@code null}, which made
     *         Lucene's Field constructor throw NPE downstream)
     * @throws IOException declared for API compatibility
     */
    public static IndexItem parser(File file, String date, String userName,
            String url) throws IOException {
        String content = "";
        try {
            content = ExtractorFactory.createExtractor(file).getText();
        } catch (Exception e) {
            // Best-effort: log and index the file with empty content rather
            // than aborting the whole indexing run.
            e.printStackTrace();
        }
        return new IndexItem((long) file.hashCode(), getFilename(file.getName()),
                file.getName(), content, date, userName, url);
    }
}
package org.fazlan.lucene.demo;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the index produced by {@link Indexer}, by title or by content.
 */
public class Searcher {

    private final IndexSearcher searcher;
    private final QueryParser titleQueryParser;
    private final QueryParser contentQueryParser;

    // Must match the analyzer used at index time (see Indexer).
    private static final StandardAnalyzer analyzer =
            new StandardAnalyzer(Version.LUCENE_36);

    /** Maximum number of hits returned per query. */
    private static final int DEFAULT_RESULT_SIZE = 100;

    /**
     * Opens the index directory written by {@link Indexer} and prepares one
     * query parser per searchable field.
     *
     * @throws IOException if the index directory cannot be opened
     */
    public Searcher() throws IOException {
        searcher = new IndexSearcher(
                IndexReader.open(FSDirectory.open(new File(Indexer.INDEX_DIR))));
        titleQueryParser = new QueryParser(Version.LUCENE_36, IndexItem.TITLE, analyzer);
        contentQueryParser = new QueryParser(Version.LUCENE_36, IndexItem.CONTENT, analyzer);
    }

    /**
     * Finds indexed items whose title matches the query string.
     *
     * @param queryString the query string to search for
     * @return up to {@code DEFAULT_RESULT_SIZE} matching items
     */
    public List<IndexItem> findByTitle(String queryString)
            throws ParseException, IOException {
        return find(titleQueryParser, queryString);
    }

    /**
     * Finds indexed items whose content matches the query string.
     *
     * @param queryString the query string to search for
     * @return up to {@code DEFAULT_RESULT_SIZE} matching items
     */
    public List<IndexItem> findByContent(String queryString)
            throws ParseException, IOException {
        return find(contentQueryParser, queryString);
    }

    /**
     * Shared query execution: parses the query with the given parser, runs
     * it, and materializes every hit back into an IndexItem. (Previously
     * duplicated verbatim in findByTitle and findByContent.)
     */
    private List<IndexItem> find(QueryParser parser, String queryString)
            throws ParseException, IOException {
        Query query = parser.parse(queryString);
        ScoreDoc[] queryResults = searcher.search(query, DEFAULT_RESULT_SIZE).scoreDocs;

        List<IndexItem> results = new ArrayList<IndexItem>(queryResults.length);
        for (ScoreDoc scoreDoc : queryResults) {
            Document doc = searcher.doc(scoreDoc.doc);
            results.add(new IndexItem(Long.parseLong(doc.get(IndexItem.ID)),
                    doc.get(IndexItem.TITLE), doc.get(IndexItem.FILENAME),
                    doc.get(IndexItem.CONTENT), doc.get(IndexItem.DATE),
                    doc.get(IndexItem.USER_NAME), doc.get(IndexItem.URL)));
        }
        return results;
    }

    /**
     * Releases the underlying index reader.
     *
     * @throws IOException if closing the searcher fails
     */
    public void close() throws IOException {
        searcher.close();
    }
}
package org.fazlan.lucene.demo;

import org.apache.lucene.queryParser.ParseException;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Demo driver: indexes three sample Office documents, then runs a
 * title search against the freshly built index.
 *
 * @author JiaJiCheng
 */
public class FileIndexApplication {

    public static void main(String[] args) throws IOException, ParseException {
        String[] samplePaths = {
                "src/main/resources/files/MSWord.doc",
                "src/main/resources/files/MSWord.docx",
                "src/main/resources/files/招商局系统.xls"
        };

        // Index every sample file with identical demo metadata.
        Indexer indexer = new Indexer();
        for (String path : samplePaths) {
            indexer.writeIndex(MSDocumentParser.parser(
                    new File(path), "1990-0-0", "zhangsan", "www.baidu.com"));
        }
        // The index must be closed before a Searcher opens the same directory.
        indexer.close();

        // Query the index we just built and dump the hits.
        Searcher searcher = new Searcher();
        List<IndexItem> result = searcher.findByTitle("招");
        print(result);
        searcher.close();
    }

    /** Prints the hit count followed by each matching item. */
    private static void print(List<IndexItem> result) {
        System.out.println("Result Size: " + result.size());
        for (IndexItem item : result) {
            System.out.println(item);
        }
    }
}
<dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> <version>3.6.0</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.8</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.8</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.8</version> </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>3.8.1</version> <scope>test</scope> </dependency>
完整的项目代码见附件
相关文章推荐
- Android自定义折线图 不用三方库
- 基于线程池的线程调度管控系统
- 自定义控件原理
- Java模拟表单文件上传(微信/中文名乱码问题)
- editText带删除按钮
- Spring源码深度解析(七)获取Document
- python把元组组合成字典
- reshape函数使用
- MySQL5.6和5.5的一些区别地方
- spring请求流程详解
- docker深入2-使用jenkins镜像
- 如何添加一种新的语言
- 推荐一个图片管理的html模版和简单的js交互
- RBF神经网络与BP神经网络的比较
- mysql慢查询日志使用总结
- 读《商业的本质》
- DL插件化框架
- Linux必会原理之输入网址到看到页面内容原理
- jenkins-基础操作
- Ajax加载HTML、XML、JSON对比分析