中文分词-lucene 第二个版本
2015-06-02 13:53
337 查看
jar包
IKAnalyzer2012FF_u1.jar
lucene-core-4.6.1.jar
lucene-queryparser-4.6.1.jar
api: http://lucene.apache.org/core/4_6_1/core/index.html
代码:
package com.ishehui.utils; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; import com.ishehui.entity.Article; import com.ishehui.entity.Conchs; /***** * IKAnalyzer 中文分词 && 相似度匹配 * @author wenmeishuai * */ public class IKWords { /* 创建简单中文分析器 创建索引使用的分词器必须和查询时候使用的分词器一样,否则查询不到想要的结果 */ private static Analyzer analyzer = new IKAnalyzer(true); // 索引保存目录 private static File indexFile = new File("d:\\indexDir\\");///data/work/videos/conchsindex/ d:\\indexDir\\ //需要搜索出的列 private static String[] fieldName = {"id","content"}; /** * 查看IKAnalyzer 分词器是如何将一个完整的词组进行分词的 * * @param text * @param isMaxWordLength */ public static String splitWord(String text) { try { // 创建分词对象 // Analyzer analyzer = new IKAnalyzer(isMaxWordLength); StringReader reader = new StringReader(text); // 分词 TokenStream ts = 
analyzer.tokenStream("", reader); //重置到流的开始位置 ts.reset(); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); // 遍历分词数据 System.out.print("IKAnalyzer把关键字拆分的结果是:"); StringBuffer b = new StringBuffer(); while (ts.incrementToken()) { System.out.print("【" + term.toString() + "】"); b.append("【" + term.toString() + "】"); } reader.close(); return b.toString(); } catch (IOException e) { e.printStackTrace(); } return ""; } /** * 创建索引文件到磁盘中永久保存 * isCreateAll true :重新构建索引 false:在原有基础上添加 */ public static void createConchsIndexFile(List<Conchs> cs,boolean isCreateAll) { long startTime = System.currentTimeMillis(); System.out.println("*****************创建索引开始**********************"); Directory directory = null; IndexWriter indexWriter = null; try { // 创建哪个版本的IndexWriterConfig,根据参数可知lucene是向下兼容的,选择对应的版本就好 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer); // 创建磁盘目录对象 directory = new SimpleFSDirectory(indexFile); indexWriter = new IndexWriter(directory, indexWriterConfig); // indexWriter = new IndexWriter(directory, analyzer, true,IndexWriter.MaxFieldLength.UNLIMITED); // 这上面是使用内存保存索引的创建索引写入对象的例子,和这里的实现方式不一样,但是效果是一样的 if(isCreateAll){ // 为了避免重复插入数据,每次测试前 先删除之前的索引 indexWriter.deleteAll(); } // 获取实体对象 for (int i = 0; i < cs.size(); i++) { Conchs article = cs.get(i); // indexWriter添加索引 Document doc = new Document(); doc.add(new Field("id", article.getId().toString(),Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("content", article.getContent().toString(),Field.Store.YES, Field.Index.ANALYZED)); // 添加到索引中去 indexWriter.addDocument(doc); System.out.println("索引添加成功:第" + (i + 1) + "次!!"); } } catch (IOException e) { e.printStackTrace(); } finally { if (indexWriter != null) { try { indexWriter.close(); } catch (IOException e) { e.printStackTrace(); } } if (directory != null) { try { directory.close(); } catch (IOException e) { e.printStackTrace(); } } } System.out.println("创建索引文件成功,总共花费" + (System.currentTimeMillis() - 
startTime) + "毫秒。"); System.out.println("*****************创建索引结束**********************"); } /***** * 从索引中搜索 (多字段 单条件) * @param keyword * @return */ public static List<String> getReultFromIndex(String keyword) { IndexSearcher isearcher = null; IndexReader indexReader = null; try { indexReader = IndexReader.open(FSDirectory.open(indexFile)); //实例化搜索器 isearcher = new IndexSearcher(indexReader); //在索引器中使用IKSimilarity相似度评估器 // isearcher.setSimilarity() // //使用IKQueryParser查询分析器构造Query对象 MultiFieldQueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_46, fieldName,analyzer); // //利用queryParser解析传递过来的检索关键字,完成Query对象的封装 Query query = queryParser.parse(keyword); // Query query = IKQueryParser.parseMultiField(fieldName, keyword); splitWord(keyword);// 显示拆分结果 //搜索相似度最高的20条记录 TopDocs topDocs = isearcher.search(query , 50); System.out.println("命中:" + topDocs.totalHits); //输出结果 ScoreDoc[] scoreDocs = topDocs.scoreDocs; List<String> returnList = new ArrayList<String>(); for (int i = 0; i < scoreDocs.length; i++){ Document targetDoc = isearcher.doc(scoreDocs[i].doc); System.out.println("命中内容: id:"+targetDoc.get("id")+" content:"+targetDoc.get("content")); returnList.add(targetDoc.get("id")); } return returnList; } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } finally{ // if(isearcher != null){ // try { // isearcher.close(); // } catch (IOException e) { // e.printStackTrace(); // } // } } return null; } /**** * 从内存中检索 (多字段 单条件) * @param keyword * @param cs * @return */ public static List<String> getReultFromMemory(String keyword,List<Conchs> cs){ Directory directory = null; IndexWriter iwriter = null; IndexSearcher isearcher = null; IndexReaderContext irederContext = null; try { //建立内存索引对象将数据加载到内存中 directory = new RAMDirectory(); // iwriter = new IndexWriter(directory, analyzer, true , IndexWriter.MaxFieldLength.LIMITED); iwriter = new 
IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46, analyzer)); for(Conchs text:cs){ Document doc = new Document(); doc.add(new Field("content", text.getContent(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("id", text.getId().toString(), Field.Store.YES, Field.Index.ANALYZED)); System.out.println("id:"+text.getId()+" text:"+text.getContent()); iwriter.addDocument(doc); } iwriter.close(); //实例化搜索器 从内存中搜索出结果 isearcher = new IndexSearcher(IndexReader.open(directory)) ; //在索引器中使用IKSimilarity相似度评估器 // isearcher.setSimilarity(new IKSimilarity()); //构造Query对象 // QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_36, fieldName,analyzer); //利用queryParser解析传递过来的检索关键字,完成Query对象的封装 MultiFieldQueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_46, fieldName,analyzer); // //利用queryParser解析传递过来的检索关键字,完成Query对象的封装 Query query = queryParser.parse(keyword); // Query query = IKQueryParser.parseMultiField(fieldName, keyword); // 显示拆分结果 给开发者 splitWord(keyword); //搜索相似度最高的20条记录 TopDocs topDocs = isearcher.search(query , 20); System.out.println("命中:" + topDocs.totalHits); //输出结果 ScoreDoc[] scoreDocs = topDocs.scoreDocs; List<String> returnList = new ArrayList<String>(); for (int i = 0; i < scoreDocs.length; i++){ Document targetDoc = isearcher.doc(scoreDocs[i].doc); System.out.println("内容:" + targetDoc.toString()+" id:"+targetDoc.get("id")); returnList.add(targetDoc.get("id")); } return returnList; } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } finally{ // if(isearcher != null){ // try { // isearcher.close(); // } catch (IOException e) { // e.printStackTrace(); // } // } if(directory != null){ try { directory.close(); } catch (IOException e) { e.printStackTrace(); } } } return null; } /** * 直接读取索引文件,查询索引记录 * * @throws IOException */ public static void openIndexFile() { long startTime = System.currentTimeMillis(); 
System.out.println("*****************读取索引开始**********************"); List<Article> articles = new ArrayList<Article>(); // 得到索引的目录 Directory directory = null; IndexReader indexReader = null; try { directory = new SimpleFSDirectory(indexFile); // 根据目录打开一个indexReader // indexReader = IndexReader.open(directory); indexReader = IndexReader.open(directory); System.out.println("在索引文件中总共插入了" + indexReader.maxDoc() + "条记录。"); // 获取第一个插入的document对象 Document minDoc = indexReader.document(0); // 获取最后一个插入的document对象 Document maxDoc = indexReader.document(indexReader.maxDoc() - 1); // document对象的get(字段名称)方法获取字段的值 System.out.println("第一个插入的document对象的标题是:" + minDoc.get("title")); System.out.println("最后一个插入的document对象的标题是:" + maxDoc.get("title")); int docLength = indexReader.maxDoc(); for (int i = 0; i < docLength; i++) { Document doc = indexReader.document(i); Article article = new Article(); if (doc.get("id") == null) { System.out.println("id为空"); } else { article.setId(Integer.parseInt(doc.get("id"))); // if(article.getId().intValue() == 1){ // //删除索引 // indexReader.deleteDocument(1); // System.out.println(indexReader.hasDeletions()); // } article.setTitle(doc.get("title")); article.setContent(doc.get("content")); articles.add(article); } } System.out.println("显示所有插入的索引记录:"); for (Article article : articles) { System.out.println(article); } } catch (IOException e) { e.printStackTrace(); } finally { if (indexReader != null) { try { indexReader.close(); } catch (IOException e) { e.printStackTrace(); } } if (directory != null) { try { directory.close(); } catch (IOException e) { e.printStackTrace(); } } } System.out.println("直接读取索引文件成功,总共花费" + (System.currentTimeMillis() - startTime) + "毫秒。"); System.out.println("*****************读取索引结束**********************"); } public static void main(String[] args) throws IOException { // String text="寻找真心朋友,只限女生,偶尔一起嗨皮下"; // StringReader sr=new StringReader(text); // IKSegmenter ik=new IKSegmenter(sr, true); // Lexeme lex=null; // 
while((lex=ik.next())!=null){ // System.out.print(lex.getLexemeText()+"|"); // } // createIndexFile(); // getReultFromIndex11(); // splitWord("周末我们去嗨皮吧"); // getReultFromIndex("我喜欢小猫"); openIndexFile(); } }
相关文章推荐
- Android wakelock和keyguardManager
- Android Xfermode 实战 实现圆形、圆角图片
- alsa utils工具使用
- 从第三方Launcher授权启动指定APP的设计与实现
- sgu248:Integer Linear Programming(扩展欧几里得)
- JDK内置的进制转换
- Maven学习笔记(一)--简介
- android http协议post请求方式
- ASP.NET MVC 实现跨域请求的两种形式
- 浅谈 Scala 中下划线的用途
- pig-csm 7.9修改记录
- 获取本地图片并显示
- Android应用Context详解及源码解析
- MVC模式
- JavaScript中字符串截取函数slice()、substring()、substr()
- 第四节---nginx+php
- maven如何修改本地仓库与中央仓库
- Java中的final详解以及用途实战
- 朴素贝叶斯分类
- 线程安全