您的位置:首页 > 其它

Lucene检索WORD等文件

2016-06-30 16:58 183 查看

Lucene是什么?

Lucene 是 Apache 开源的全文检索框架(工具库)。它不是像百度那样拿来就能用的搜索引擎,需要自己编写代码来建立索引并执行搜索。

Lucene实现检索的过程?

Lucene 的检索分为两步:先将文本写入索引(建立索引),然后再在索引中进行搜索。

写入:

涉及的类: Document 、Field 、IndexWriter

Document 相当于数据库表中的一行记录,Field 相当于其中的一个字段,一个 Document 可以包含多个 Field。用 IndexWriter 对象把 Document 对象写入索引(索引可以存放在磁盘上,也可以存放在内存里),就实现了文本的写入。

搜索:

对写入的文本进行搜索!

如何检索 Word 等 Microsoft Office 文件?

要实现Lucene检索WORD等文件,首先需要读取出WORD文件中的内容,再使用Lucene将内容写入。

如何读取microsoft文件?

可以使用apache的POI开源项目进行读取。

Java 代码:

package org.fazlan.lucene.demo;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Writes {@link IndexItem}s into the Lucene index stored under {@link #INDEX_DIR}.
 *
 * <p>Call {@link #close()} once all items have been written.
 */
public class Indexer {

    /** Location where the index will be stored. */
    public static final String INDEX_DIR = "src/main/resources/index";

    private final IndexWriter writer;

    /**
     * Opens an {@link IndexWriter} on {@link #INDEX_DIR} using a {@link StandardAnalyzer}.
     *
     * @throws IllegalStateException if the index cannot be opened; the original
     *         {@link IOException} is preserved as the cause
     */
    public Indexer() {
        try {
            writer = new IndexWriter(FSDirectory.open(new File(INDEX_DIR)),
                    new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
        } catch (IOException e) {
            // Fail fast: an Indexer without a writer would only produce NullPointerExceptions later.
            throw new IllegalStateException("Unable to open the index at " + INDEX_DIR, e);
        }
    }

    /**
     * Adds the given item to the index, replacing any previously indexed item with the same id.
     *
     * @param indexItem the item to index; it and its id must not be null
     * @throws IllegalArgumentException if {@code indexItem} or its id is null
     * @throws IOException if the index cannot be written
     */
    public void writeIndex(IndexItem indexItem) throws IOException {
        if (indexItem == null || indexItem.getId() == null) {
            throw new IllegalArgumentException("indexItem and its id must not be null");
        }
        String id = indexItem.getId().toString();

        Document doc = new Document();
        addField(doc, IndexItem.ID, id, Field.Index.NOT_ANALYZED);
        addField(doc, IndexItem.TITLE, indexItem.getTitle(), Field.Index.ANALYZED);
        addField(doc, IndexItem.FILENAME, indexItem.getFilename(), Field.Index.NOT_ANALYZED);
        addField(doc, IndexItem.CONTENT, indexItem.getContent(), Field.Index.ANALYZED);
        addField(doc, IndexItem.DATE, indexItem.getDate(), Field.Index.NOT_ANALYZED);
        addField(doc, IndexItem.USER_NAME, indexItem.getUserName(), Field.Index.NOT_ANALYZED);
        addField(doc, IndexItem.URL, indexItem.getUrl(), Field.Index.NOT_ANALYZED);

        // updateDocument deletes any document carrying the same id term and adds the new one
        // in a single call, so a failure cannot leave the old document deleted without a replacement.
        writer.updateDocument(new Term(IndexItem.ID, id), doc);
    }

    /**
     * Adds a stored field to the document, skipping null values because the
     * {@link Field} constructor rejects them.
     */
    private static void addField(Document doc, String name, String value, Field.Index index) {
        if (value != null) {
            doc.add(new Field(name, value, Field.Store.YES, index));
        }
    }

    /**
     * Closes the underlying writer.
     *
     * @throws IOException if closing the writer fails
     */
    public void close() throws IOException {
        writer.close();
    }
}


package org.fazlan.lucene.demo;

/**
 * Index entry describing one indexed file; adjust the fields to the business requirements.
 *
 * <p>Instances are immutable. The public constants are the field names under which
 * the corresponding values are stored in the index.
 *
 * @author JiaJiCheng
 */
public class IndexItem {

    public static final String ID = "id";
    public static final String TITLE = "title";
    public static final String CONTENT = "content";
    public static final String DATE = "date";
    public static final String USER_NAME = "userName";
    public static final String FILENAME = "filename";
    public static final String URL = "url";

    private final Long id;

    private final String title;

    private final String filename;

    private final String content;

    private final String date;

    private final String userName;

    private final String url;

    /**
     * Creates an index entry.
     *
     * @param id       identifier of the entry
     * @param title    title of the entry
     * @param filename name of the file the entry was built from
     * @param content  text content of the entry
     * @param date     date associated with the entry, as text
     * @param userName name of the user associated with the entry
     * @param url      URL associated with the entry
     */
    public IndexItem(Long id, String title, String filename, String content, String date, String userName, String url) {
        this.id = id;
        this.title = title;
        this.filename = filename;
        this.content = content;
        this.date = date;
        this.userName = userName;
        this.url = url;
    }

    public String getFilename() {
        return filename;
    }

    public String getUrl() {
        return url;
    }

    public String getDate() {
        return date;
    }

    public String getUserName() {
        return userName;
    }

    public Long getId() {
        return id;
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }

    /**
     * Returns a readable representation listing every field, each text value in
     * single quotes and the fields separated by {@code ", "}.
     */
    @Override
    public String toString() {
        return "IndexItem{" + "id=" + id
                + ", title='" + title + '\''
                + ", filename='" + filename + '\''
                + ", content='" + content + '\''
                + ", date='" + date + '\''
                + ", userName='" + userName + '\''
                + ", url='" + url + '\''
                + '}';
    }
}


package org.fazlan.lucene.demo;

import org.apache.poi.extractor.ExtractorFactory;

import java.io.File;
import java.io.IOException;

/**
 * File converter: turns a file into an {@link IndexItem} by extracting its text
 * with Apache POI's {@link ExtractorFactory}.
 *
 * @author JiaJiCheng
 */
public class MSDocumentParser {

    /**
     * Returns the file name without its extension. Names that contain no dot, or
     * whose only dot is the first character, are returned unchanged.
     */
    private static String getFilename(String filename) {
        int dot = filename.lastIndexOf('.');
        // dot == -1: no extension; dot == 0: name such as ".hidden" would otherwise become empty.
        return dot > 0 ? filename.substring(0, dot) : filename;
    }

    /**
     * Extracts the text of the given file and wraps it in an {@link IndexItem}.
     *
     * <p>The item's title is the file name without extension. Its id is the
     * {@code hashCode()} of the {@link File}.
     * NOTE(review): a hash code is not guaranteed unique, so two different files
     * could receive the same id and overwrite each other in the index.
     *
     * @param file     the file to read
     * @param date     date to record on the item
     * @param userName user name to record on the item
     * @param url      URL to record on the item
     * @return the item built from the file
     * @throws IOException if the file cannot be read or its text cannot be extracted;
     *         non-I/O extraction failures are wrapped with the original exception as cause
     */
    public static IndexItem parser(File file, String date, String userName, String url) throws IOException {
        String content;
        try {
            content = ExtractorFactory.createExtractor(file).getText();
        } catch (IOException e) {
            throw e;
        } catch (Exception e) {
            // Report the failure instead of indexing an item with null content.
            throw new IOException("Unable to extract text from " + file, e);
        }

        return new IndexItem((long) file.hashCode(), getFilename(file.getName()), file.getName(), content, date,
                userName, url);
    }
}


package org.fazlan.lucene.demo;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the index written by {@link Indexer}, by title or by content.
 *
 * <p>Call {@link #close()} when finished to release the index reader.
 * NOTE(review): Lucene documents QueryParser as not thread-safe, so an instance
 * should not be shared between threads without external synchronization.
 */
public class Searcher {
    private static final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    // maximum number of hits returned by a search.
    private static final int DEFAULT_RESULT_SIZE = 100;

    // Kept in a field so close() can release it explicitly.
    private final IndexReader reader;
    private final IndexSearcher searcher;
    private final QueryParser titleQueryParser;
    private final QueryParser contentQueryParser;

    /**
     * Opens the index stored under {@link Indexer#INDEX_DIR} for searching.
     *
     * @throws IOException if the index cannot be opened
     */
    public Searcher() throws IOException {
        reader = IndexReader.open(FSDirectory.open(new File(Indexer.INDEX_DIR)));
        searcher = new IndexSearcher(reader);

        // one parser per default field: title and content.
        titleQueryParser = new QueryParser(Version.LUCENE_36, IndexItem.TITLE, analyzer);
        contentQueryParser = new QueryParser(Version.LUCENE_36, IndexItem.CONTENT, analyzer);
    }

    /**
     * Finds the indexed items whose title matches the query.
     *
     * @param queryString
     *            - the query string to search for
     * @return the matching items, at most {@code DEFAULT_RESULT_SIZE} of them
     * @throws ParseException if the query string cannot be parsed
     * @throws IOException if the index cannot be read
     */
    public List<IndexItem> findByTitle(String queryString) throws ParseException, IOException {
        return search(titleQueryParser, queryString);
    }

    /**
     * Finds the indexed items whose content matches the query.
     *
     * @param queryString
     *            - the query string to search for
     * @return the matching items, at most {@code DEFAULT_RESULT_SIZE} of them
     * @throws ParseException if the query string cannot be parsed
     * @throws IOException if the index cannot be read
     */
    public List<IndexItem> findByContent(String queryString) throws ParseException, IOException {
        return search(contentQueryParser, queryString);
    }

    /** Parses the query with the given parser, runs it and converts every hit into an IndexItem. */
    private List<IndexItem> search(QueryParser parser, String queryString) throws ParseException, IOException {
        Query query = parser.parse(queryString);
        ScoreDoc[] hits = searcher.search(query, DEFAULT_RESULT_SIZE).scoreDocs;

        List<IndexItem> results = new ArrayList<IndexItem>(hits.length);
        for (ScoreDoc hit : hits) {
            results.add(toIndexItem(searcher.doc(hit.doc)));
        }
        return results;
    }

    /** Rebuilds an IndexItem from the stored fields of a Lucene document. */
    private static IndexItem toIndexItem(Document doc) {
        return new IndexItem(Long.parseLong(doc.get(IndexItem.ID)), doc.get(IndexItem.TITLE),
                doc.get(IndexItem.FILENAME), doc.get(IndexItem.CONTENT), doc.get(IndexItem.DATE),
                doc.get(IndexItem.USER_NAME), doc.get(IndexItem.URL));
    }

    /**
     * Closes the searcher and the index reader it was built on.
     *
     * @throws IOException if closing fails
     */
    public void close() throws IOException {
        try {
            searcher.close();
        } finally {
            // IndexSearcher.close() does not close a reader that was passed to its constructor.
            reader.close();
        }
    }
}


package org.fazlan.lucene.demo;

import org.apache.lucene.queryParser.ParseException;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Example: indexes a few Microsoft Office files and then searches the index by title.
 *
 * @author JiaJiCheng
 */
public class FileIndexApplication {
    /**
     * Indexes three sample files and prints the items whose title matches a query.
     *
     * @param args unused
     * @throws IOException if a file cannot be parsed or the index cannot be written or read
     * @throws ParseException if the search query cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {

        File docFile = new File("src/main/resources/files/MSWord.doc");
        File docxFile = new File("src/main/resources/files/MSWord.docx");
        File xlsFile = new File("src/main/resources/files/招商局系统.xls");

        // creating the indexer and indexing the items
        Indexer indexer = new Indexer();
        try {
            indexer.writeIndex(MSDocumentParser.parser(docFile, "1990-0-0", "zhangsan", "www.baidu.com"));
            indexer.writeIndex(MSDocumentParser.parser(docxFile, "1990-0-0", "zhangsan", "www.baidu.com"));
            indexer.writeIndex(MSDocumentParser.parser(xlsFile, "1990-0-0", "zhangsan", "www.baidu.com"));
        } finally {
            // the indexer must be closed before searching so the written items become
            // available; closing in finally also releases it when indexing fails.
            indexer.close();
        }

        // creating the Searcher to the same index location as the Indexer
        Searcher searcher = new Searcher();
        try {
            // to search the extracted text instead of the title, use searcher.findByContent(...)
            List<IndexItem> result = searcher.findByTitle("招");
            print(result);
        } finally {
            searcher.close();
        }
    }

    /**
     * print the results: the number of hits followed by one line per item.
     */
    private static void print(List<IndexItem> result) {
        System.out.println("Result Size: " + result.size());

        for (IndexItem item : result) {
            System.out.println(item);
        }
    }
}


<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.8</version>
</dependency>

<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.8</version>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>


完整的项目代码见附件

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: