您的位置:首页 > 其它

中文分词-lucene 第二个版本

2015-06-02 13:53 337 查看
jar包

IKAnalyzer2012FF_u1.jar

lucene-core-4.6.1.jar

lucene-queryparser-4.6.1.jar

api: http://lucene.apache.org/core/4_6_1/core/index.html
代码:

package com.ishehui.utils;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.ishehui.entity.Article;
import com.ishehui.entity.Conchs;

/*****
* IKAnalyzer 中文分词 && 相似度匹配
* @author wenmeishuai
*
*/
public class IKWords {

    /*
     * Analyzer used for BOTH indexing and querying. The same analyzer must be
     * used on both sides, otherwise queries will not match the indexed tokens.
     * (true = IK "smart" / max-word-length segmentation mode.)
     */
    private static Analyzer analyzer = new IKAnalyzer(true);

    // Directory where the on-disk index is persisted.
    // Linux deployment path: /data/work/videos/conchsindex/
    private static File indexFile = new File("d:\\indexDir\\");

    // Document fields that multi-field queries search across.
    private static String[] fieldName = {"id", "content"};

    /**
     * Shows how IKAnalyzer segments a phrase into individual tokens.
     * Prints each token to stdout and returns them concatenated as
     * 【token】【token】... for display/debugging purposes.
     *
     * @param text the phrase to segment
     * @return the segmented tokens wrapped in 【】 brackets, or "" on I/O error
     */
    public static String splitWord(String text) {
        StringReader reader = new StringReader(text);
        TokenStream ts = null;
        try {
            ts = analyzer.tokenStream("", reader);
            // TokenStream contract: reset() before the first incrementToken().
            ts.reset();
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            System.out.print("IKAnalyzer把关键字拆分的结果是:");
            StringBuilder b = new StringBuilder();
            while (ts.incrementToken()) {
                System.out.print("【" + term.toString() + "】");
                b.append("【").append(term.toString()).append("】");
            }
            // TokenStream contract: end() after the last incrementToken().
            ts.end();
            return b.toString();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (ts != null) {
                try {
                    ts.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            reader.close();
        }
        return "";
    }

    /**
     * Builds (or appends to) the on-disk index from a list of Conchs entities.
     * Each entity becomes one Document with a stored, non-analyzed "id" field
     * and a stored, analyzed "content" field.
     *
     * @param cs          entities to index
     * @param isCreateAll true: wipe and rebuild the index from scratch;
     *                    false: append to the existing index
     */
    public static void createConchsIndexFile(List<Conchs> cs, boolean isCreateAll) {
        long startTime = System.currentTimeMillis();
        System.out.println("*****************创建索引开始**********************");
        Directory directory = null;
        IndexWriter indexWriter = null;
        try {
            // Use LUCENE_46 to match the 4.6.1 jars and the query-side parsers
            // below; mixing LUCENE_36 here with LUCENE_46 at query time risks
            // analyzer behavior mismatches between index and search.
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_46, analyzer);
            // Open the on-disk index directory.
            directory = new SimpleFSDirectory(indexFile);
            indexWriter = new IndexWriter(directory, indexWriterConfig);
            if (isCreateAll) {
                // Full rebuild: drop all previously indexed documents so the
                // same data is not inserted twice.
                indexWriter.deleteAll();
            }

            // Index each entity as one document.
            for (int i = 0; i < cs.size(); i++) {
                Conchs article = cs.get(i);
                Document doc = new Document();
                // "id" is stored verbatim (not tokenized) so it round-trips exactly.
                doc.add(new Field("id", article.getId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                // "content" is tokenized by IKAnalyzer for full-text search.
                doc.add(new Field("content", article.getContent().toString(), Field.Store.YES, Field.Index.ANALYZED));
                indexWriter.addDocument(doc);
                System.out.println("索引添加成功:第" + (i + 1) + "次!!");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println("创建索引文件成功,总共花费" + (System.currentTimeMillis() - startTime) + "毫秒。");
        System.out.println("*****************创建索引结束**********************");
    }

    /**
     * Searches the on-disk index (multi-field, single keyword).
     * NOTE: method name keeps the historical "getReult" spelling because
     * external callers may depend on it.
     *
     * @param keyword the search keyword
     * @return matched document ids (up to 50, best-scoring first), or null on error
     */
    public static List<String> getReultFromIndex(String keyword) {
        IndexSearcher isearcher = null;
        IndexReader indexReader = null;
        try {
            indexReader = IndexReader.open(FSDirectory.open(indexFile));

            // Searcher over the whole index.
            isearcher = new IndexSearcher(indexReader);

            // Parse the keyword against all searchable fields with the same
            // analyzer that built the index.
            MultiFieldQueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_46, fieldName, analyzer);
            Query query = queryParser.parse(keyword);

            splitWord(keyword); // show the token split for debugging

            // Retrieve up to 50 top-scoring matches.
            TopDocs topDocs = isearcher.search(query, 50);
            System.out.println("命中:" + topDocs.totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            List<String> returnList = new ArrayList<String>();
            for (int i = 0; i < scoreDocs.length; i++) {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.println("命中内容: id:" + targetDoc.get("id") + " content:" + targetDoc.get("content"));
                returnList.add(targetDoc.get("id"));
            }
            return returnList;
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // In Lucene 4.x the searcher has no close(); closing the reader
            // releases the underlying directory resources. The original code
            // leaked the reader here.
            if (indexReader != null) {
                try {
                    indexReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Builds a throwaway in-memory (RAM) index from the given entities and
     * searches it (multi-field, single keyword).
     *
     * @param keyword the search keyword
     * @param cs      entities to index in memory before searching
     * @return matched document ids (up to 20, best-scoring first), or null on error
     */
    public static List<String> getReultFromMemory(String keyword, List<Conchs> cs) {
        Directory directory = null;
        IndexWriter iwriter = null;
        IndexSearcher isearcher = null;
        IndexReader indexReader = null;
        try {
            // Build the index entirely in RAM.
            directory = new RAMDirectory();
            iwriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46, analyzer));
            for (Conchs text : cs) {
                Document doc = new Document();
                doc.add(new Field("content", text.getContent(), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("id", text.getId().toString(), Field.Store.YES, Field.Index.ANALYZED));
                System.out.println("id:" + text.getId() + " text:" + text.getContent());
                iwriter.addDocument(doc);
            }
            // Commit and release the writer before opening a reader.
            iwriter.close();
            iwriter = null;

            // Search the freshly built RAM index.
            indexReader = IndexReader.open(directory);
            isearcher = new IndexSearcher(indexReader);

            // Parse the keyword against all searchable fields.
            MultiFieldQueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_46, fieldName, analyzer);
            Query query = queryParser.parse(keyword);

            splitWord(keyword); // show the token split for debugging

            // Retrieve up to 20 top-scoring matches.
            TopDocs topDocs = isearcher.search(query, 20);
            System.out.println("命中:" + topDocs.totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            List<String> returnList = new ArrayList<String>();
            for (int i = 0; i < scoreDocs.length; i++) {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.println("内容:" + targetDoc.toString() + " id:" + targetDoc.get("id"));
                returnList.add(targetDoc.get("id"));
            }
            return returnList;
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release everything even on the exception path: reader, then the
            // writer (if close() above never ran), then the directory.
            if (indexReader != null) {
                try {
                    indexReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (iwriter != null) {
                try {
                    iwriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Dumps the raw contents of the on-disk index: total document count,
     * first/last document, then every stored document as an Article.
     *
     * NOTE(review): this reads a "title" field, but createConchsIndexFile only
     * writes "id" and "content" — for indexes built by that method, title will
     * print as null. Presumably this was written for an index produced by an
     * earlier createIndexFile() (see the commented call in main) — confirm
     * which index this is expected to read.
     */
    public static void openIndexFile() {
        long startTime = System.currentTimeMillis();
        System.out.println("*****************读取索引开始**********************");
        List<Article> articles = new ArrayList<Article>();
        Directory directory = null;
        IndexReader indexReader = null;
        try {
            directory = new SimpleFSDirectory(indexFile);
            // Open a reader over the index directory.
            indexReader = IndexReader.open(directory);
            System.out.println("在索引文件中总共插入了" + indexReader.maxDoc() + "条记录。");
            // First inserted document.
            Document minDoc = indexReader.document(0);
            // Last inserted document.
            Document maxDoc = indexReader.document(indexReader.maxDoc() - 1);
            // Document.get(fieldName) returns the stored value (or null).
            System.out.println("第一个插入的document对象的标题是:" + minDoc.get("title"));
            System.out.println("最后一个插入的document对象的标题是:" + maxDoc.get("title"));

            int docLength = indexReader.maxDoc();
            for (int i = 0; i < docLength; i++) {
                Document doc = indexReader.document(i);
                Article article = new Article();
                if (doc.get("id") == null) {
                    System.out.println("id为空");
                } else {
                    article.setId(Integer.parseInt(doc.get("id")));
                    article.setTitle(doc.get("title"));
                    article.setContent(doc.get("content"));
                    articles.add(article);
                }
            }
            System.out.println("显示所有插入的索引记录:");
            for (Article article : articles) {
                System.out.println(article);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (indexReader != null) {
                try {
                    indexReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println("直接读取索引文件成功,总共花费" + (System.currentTimeMillis() - startTime) + "毫秒。");
        System.out.println("*****************读取索引结束**********************");
    }

    /** Manual smoke test: dump the current on-disk index. */
    public static void main(String[] args) throws IOException {
        // splitWord("周末我们去嗨皮吧");
        // getReultFromIndex("我喜欢小猫");
        openIndexFile();
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: