Lucene从入门到熟悉(三)分词
2014-08-21 21:45
405 查看
分词
分词是按语言特征、用算法对文本进行过滤与分组处理的一种技术。分词的对象是文本,而不是图像、动画、脚本等。
分词的方式就是过滤与分组
过滤主要是把文本中那些没有实际意义的字或词过滤掉
分组就是按照"分词数据库"内已添加好的词进行匹配。
Lucene提供的分词器
// Analyzer analyzer=new StandardAnalyzer();
// Analyzer analyzer=new SimpleAnalyzer();
// Analyzer analyzer=new WhitespaceAnalyzer();
// Analyzer analyzer=new ChineseAnalyzer();
// Analyzer analyzer=new CJKAnalyzer(); // 两个汉字一组
package com.lucene.test.T03; import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.analysis.cn.ChineseAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.wltea.analyzer.lucene.IKAnalyzer; public class TestAnalyzer { /** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { // Analyzer analyzer=new StandardAnalyzer(); // Analyzer analyzer=new SimpleAnalyzer(); // Analyzer analyzer=new WhitespaceAnalyzer(); // Analyzer analyzer=new ChineseAnalyzer(); // Analyzer analyzer=new CJKAnalyzer(); Analyzer analyzer=new IKAnalyzer(); TokenStream tokenStream=analyzer.tokenStream("", new StringReader("welcome to use lucene! ?")); // TokenStream tokenStream=analyzer.tokenStream("", new StringReader("明天会更美好!")); Token token =new Token(); while(tokenStream.next(token)!=null) { System.out.println(token.term()); } } }
Paoding(庖丁解牛)开源中文分词器
package com.lucene.test.T03;

import java.io.IOException;
import java.io.StringReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Chinese tokenization demo using the Paoding analyzer. */
public class AnalyzerPaoding {

    private static Logger logger = LoggerFactory
            .getLogger(AnalyzerPaoding.class);

    /**
     * Tokenizes a Chinese sample sentence and prints each term.
     *
     * @param args unused
     * @throws IOException if the token stream cannot be read
     */
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new PaodingAnalyzer();
        TokenStream ts = analyzer.tokenStream("",
                new StringReader("法律实践奥利弗论文集饿哦土建类士大夫接待来访将阿隆索"));
        // Pre-2.9 TokenStream API: next() allocates/returns a token, or null at
        // end of stream.
        for (Token token = ts.next(); token != null; token = ts.next()) {
            logger.debug("read result from token");
            System.out.println(token.term());
        }
    }
}
Paoding中使用QueryParser
package com.lucene.test.T03; import java.io.IOException; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; public class TestIndexPaoding { /** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { String[] ids = { "1", "2", "3", "4" }; String[] names = { "张三", "李四", "王五", "赵六" }; // String[] names = { "zhangsan", "zhangsun", "zhangson", "zhaoliu" }; String[] address = { "居住北京", "南京", "北京海淀", "dalian" }; String[] birthday = { "19880101", "19860105", "19760205", "19550719" }; Analyzer analyzer = new PaodingAnalyzer(); String indexDir = "c:/temp/luceneindex"; Directory dir = FSDirectory.getDirectory(indexDir); // true 表示创建或覆盖当前索引;false表示对当前索引进行追加 // Default value is 128 IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); for (int i = 0; i < ids.length; i++) { Document document = new Document(); document.add(new Field("id", ids[i], Field.Store.YES, Field.Index.ANALYZED)); document.add(new Field("name", names[i], Field.Store.YES, Field.Index.ANALYZED)); // Field.Index.NO表示不建立索引 document.add(new Field("address", address[i], Field.Store.YES, Field.Index.ANALYZED)); document.add(new Field("birthday", birthday[i], Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(document); } writer.optimize(); writer.close(); System.out.println("index created ...."); } }
package com.lucene.test.T03; import java.io.IOException; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocCollector; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; public class TestSearcherPaoding { public static void main(String[] args) throws IOException, ParseException { String indexDir = "c:/temp/luceneindex"; Directory dir = FSDirectory.getDirectory(indexDir); IndexSearcher searcher = new IndexSearcher(dir); Analyzer analyzer = new PaodingAnalyzer(); ScoreDoc[] hits = null; QueryParser parser = new QueryParser("name", analyzer); Query qury = parser.parse("address:北京 AND NOT name:张三");// 高级查询(adress包含北京,但name不为张三的) TopDocs topDocs = searcher.search(qury, 10); hits = topDocs.scoreDocs; for (int i = 0; i < hits.length; i++) { Document doc = searcher.doc(hits[i].doc); System.out.print(hits[i].score + " "); System.out.print(doc.get("id") + " "); System.out.print(doc.get("name") + " "); System.out.print(doc.get("address") + " "); System.out.println(doc.get("birthday") + " "); } searcher.close(); dir.close(); } }
读取文件建立索引并查询:
package com.lucene.test.T04; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class TestFileIndex { private static Logger logger = LoggerFactory.getLogger(TestFileIndex.class); public static void main(String[] args) throws FileNotFoundException, IOException { String indexDir = "c:/temp/lucene/index"; Analyzer analyzer = new PaodingAnalyzer(); Directory dir = FSDirectory.getDirectory(indexDir); IndexWriter writer = new IndexWriter(indexDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); // read data from dataDir and create index String dataDir = "c:/temp/lucene/data"; File[] files = new File(data c2f4 Dir).listFiles(); System.out.println("file numbers:" + files.length); for (int i = 0; i < files.length; i++) { // read file content StringBuffer strBuff = new StringBuffer(); String line = ""; FileInputStream is = new FileInputStream(files[i].getPath()); BufferedReader br = new BufferedReader(new InputStreamReader(is)); line = br.readLine(); while (line != null) { strBuff.append(line); strBuff.append("\n"); line = br.readLine(); } // create index Document document = new Document(); document.add(new Field("title", files[i].getName(), Field.Store.YES, Field.Index.ANALYZED)); document.add(new Field("content", strBuff.toString(), Field.Store.YES, Field.Index.ANALYZED)); // write index writer.addDocument(document); is.close(); br.close(); } writer.close(); dir.close(); } }
package com.lucene.test.T04; import java.io.IOException; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; public class TestFileSearch { public static void main(String[] args) throws IOException, ParseException { String indexDir = "c:/temp/lucene/index"; Directory dir = FSDirectory.getDirectory(indexDir); IndexSearcher searcher = new IndexSearcher(dir); Analyzer analyzer = new PaodingAnalyzer(); ScoreDoc[] hits = null; QueryParser parser = new QueryParser("content", analyzer); Query qury = parser.parse("软件"); TopDocs topDocs = searcher.search(qury, 10); hits = topDocs.scoreDocs; for (int i = 0; i < hits.length; i++) { Document doc = searcher.doc(hits[i].doc); // System.out.print(hits[i].score + " "); System.out.print(doc.get("title") + " "); System.out.print(doc.get("content") + " "); } searcher.close(); dir.close(); } }
高亮
package com.lucene.test.T05; import java.io.StringReader; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TopDocCollector; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; public class TestHighlight { public static void main(String[] args) throws Exception { Searcher searcher = new IndexSearcher("c:/temp/lucene/index"); Analyzer analyzer = new PaodingAnalyzer(); String filed = "content"; String queryStr = "分词"; QueryParser parser = new QueryParser(filed, analyzer); Query query = parser.parse(queryStr); TopDocCollector collector = new TopDocCollector(10); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // highlight setup Highlighter highlight = null; SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter( "<font color='red'>", "</font>"); highlight = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); highlight.setTextFragmenter(new SimpleFragmenter(200)); Document doc; for (int i = 0; i < hits.length; i++) { System.out.println(hits[i].doc); System.out.println("---------------------------------------1"); System.out.println(hits[i].score); System.out.println("---------------------------------------2"); doc = searcher.doc(hits[i].doc); // System.out.println(doc.toString()); System.out.println("---------------------------------------3"); // hightlight view TokenStream tokenStream = new PaodingAnalyzer().tokenStream( "content", new 
StringReader(doc.get("content"))); System.out.println(highlight.getBestFragment(tokenStream, doc.get("content"))); } } }
相关文章推荐
- Lucene2.9.2 + 盘古分词2.3.1(一) 入门: 建立简单索引,搜索(原创)
- Lucene 3.6.2入门:自定义停用词分词器和同义词分词器
- 【Lucene3.6.2入门系列】第14节_SolrJ操作索引和搜索文档以及整合中文分词
- lucene入门-使用JE中文分词
- Lucene整合"庖丁解牛"中文分词 ----入门 2
- Lucene.net入门学习(结合盘古分词)
- 【Lucene3.6.2入门系列】第05节_自定义停用词分词器和同义词分词器
- 第61天(就业班) Lucene索引库优化、内置分词器、IK分词器、关键字高亮、单字段和多字段搜索、easyui入门
- Lucene.net入门学习(结合盘古分词)
- Lucene从入门到熟悉(二)检索
- 【Lucene3.6.2入门系列】第14节_SolrJ操作索引和搜索文档以及整合中文分词
- JAVAWEB开发之Lucene详解——Lucene入门及使用场景、全文检索、索引CRUD、优化索引库、分词器、高亮、相关度排序、各种查询
- Lucene 4.2.1入门教程(分词)
- 【Lucene3.6.2入门系列】第14节_SolrJ操作索引和搜索文档以及整合中文分词
- 【Lucene3.6.2入门系列】第05节_自定义停用词分词器和同义词分词器
- 【Lucene3.6.2入门系列】第05节_自定义停用词分词器和同义词分词器
- Lucene中使用Paoding中文分词
- Lucene 快速入门
- lucene 排序、过滤、分词器
- Lucene 6.1.0 入门Demo