mmseg 同义词分析器 SolrSynonymParser
2016-03-10 16:05
537 查看
package synonym;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
import com.chenlb.mmseg4j.analysis.SimpleAnalyzer;

/**
 * Analyzer that tokenizes with mmseg4j and then applies, in order: lower-casing,
 * synonym expansion, stop-word removal and Porter stemming.
 *
 * <p>The synonym map is loaded once, when the class is initialized, from
 * {@link #SYNONYM_FILE}. If that file cannot be read or parsed, class
 * initialization fails.
 */
public class MMSegAnalyzer extends Analyzer {

    /** Location of the Solr-format synonym file loaded at class initialization. */
    private static final String SYNONYM_FILE = "e:\\synonym.txt";

    /**
     * Charset used to decode the synonym file. Fixed to UTF-8 so that non-ASCII
     * (e.g. Chinese) entries are read the same way on every platform instead of
     * depending on the JVM default charset.
     */
    private static final Charset SYNONYM_CHARSET = Charset.forName("UTF-8");

    // Custom stop words.
    private static final String[] stopWords = { "and", "of", "the", "to", "is", "their", "can", "all" };

    /** Stop-word set built once from {@link #stopWords} and shared by all token streams. */
    private static final CharArraySet STOP_WORD_SET =
            CharArraySet.unmodifiableSet(buildCharArraySetFromArry(stopWords));

    /** Synonym map shared by every instance; populated by the static initializer below. */
    static SynonymMap synonymMap = null;

    static {
        synonymMap = getSynonymMap(SYNONYM_FILE);
    }

    protected Dictionary dic;

    /**
     * Creates an analyzer backed by the default dictionary.
     *
     * @see Dictionary#getInstance()
     */
    public MMSegAnalyzer() {
        dic = Dictionary.getInstance();
    }

    /**
     * Creates an analyzer backed by the dictionary at the given location.
     *
     * @param path dictionary path
     * @see Dictionary#getInstance(String)
     */
    public MMSegAnalyzer(String path) {
        dic = Dictionary.getInstance(path);
    }

    /**
     * Creates an analyzer backed by the dictionary in the given directory.
     *
     * @param path dictionary directory
     * @see Dictionary#getInstance(File)
     */
    public MMSegAnalyzer(File path) {
        dic = Dictionary.getInstance(path);
    }

    /**
     * Creates an analyzer backed by an already-loaded dictionary.
     *
     * @param dic dictionary to segment with
     */
    public MMSegAnalyzer(Dictionary dic) {
        super();
        this.dic = dic;
    }

    /**
     * Creates the segmentation strategy handed to each new tokenizer.
     *
     * @return a {@link MaxWordSeg} over this analyzer's dictionary
     */
    protected Seg newSeg() {
        return new MaxWordSeg(dic);
    }

    /**
     * @return the dictionary this analyzer segments with
     */
    public Dictionary getDict() {
        return dic;
    }

    /**
     * Builds the analysis chain: mmseg4j tokenizer, lower-case filter, synonym
     * filter, stop filter, Porter stem filter.
     *
     * @param fieldName field being analyzed (not used by this chain)
     * @param reader    source text
     * @return the tokenizer together with the last filter of the chain
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MMSegTokenizer(newSeg(), reader);

        TokenStream chain = new LowerCaseFilter(Version.LUCENE_47, tokenizer);
        // SynonymFilter's constructor rejects a map with a null FST (which is what
        // an empty synonym file produces), so skip the filter in that case.
        if (synonymMap != null && synonymMap.fst != null) {
            chain = new SynonymFilter(chain, synonymMap, true);
        }
        chain = new StopFilter(Version.LUCENE_47, chain, STOP_WORD_SET);
        chain = new PorterStemFilter(chain);

        return new TokenStreamComponents(tokenizer, chain);
    }

    /**
     * Converts a string array into a case-insensitive Lucene {@link CharArraySet}
     * (Lucene's counterpart of {@code java.util.Set} for terms).
     *
     * @param array words to put into the set
     * @return a new set containing every element of {@code array}
     */
    private static CharArraySet buildCharArraySetFromArry(String[] array) {
        CharArraySet set = new CharArraySet(Version.LUCENE_47, array.length, true);
        for (String value : array) {
            set.add(value);
        }
        return set;
    }

    /**
     * Builds a synonym map from a Solr-format synonym file, decoded as UTF-8.
     *
     * @param path location of the synonym file
     * @return the parsed synonym map
     * @throws IllegalArgumentException if the file cannot be read or parsed; the
     *         underlying exception is attached as the cause
     */
    private static SynonymMap getSynonymMap(String path) {
        Analyzer entryAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_47);
        Reader in = null;
        try {
            in = new InputStreamReader(new FileInputStream(new File(path)), SYNONYM_CHARSET);
            SolrSynonymParser parser = new SolrSynonymParser(true, true, entryAnalyzer);
            parser.parse(in);
            return parser.build();
        } catch (Exception e) {
            // Reading and parsing failures are both reported as an invalid synonym source.
            throw new IllegalArgumentException("failed to build synonyms", e);
        } finally {
            // Release the file handle and the helper analyzer on every path.
            IOUtils.closeQuietly(in);
            entryAnalyzer.close();
        }
    }

    /**
     * Builds a small hard-coded synonym map; an in-memory alternative to
     * {@link #getSynonymMap(String)} for experiments without a synonym file.
     *
     * @return the built synonym map
     * @throws IllegalStateException if the map cannot be built
     */
    private SynonymMap getSynonymMap() {
        SynonymMap.Builder builder = new SynonymMap.Builder(true);
        builder.add(new CharsRef("fast"), new CharsRef("rapid"), true);
        builder.add(new CharsRef("slow"), new CharsRef("sluggish"), true);
        // This pair is registered in both directions.
        builder.add(new CharsRef("中国"), new CharsRef("天朝"), true);
        builder.add(new CharsRef("天朝"), new CharsRef("中国"), true);
        try {
            return builder.build();
        } catch (IOException e) {
            throw new IllegalStateException("failed to build in-memory synonyms", e);
        }
    }

    /**
     * Demo: analyzes a sample sentence and prints each resulting term on its own line.
     *
     * @throws IOException if the token stream fails while being consumed
     */
    public static void testPorterStemmingAnalyzer() throws IOException {
        String text = "Collective slow intelligence and Web2.0, fast and rapid2 天朝 柿子";
        Analyzer analyzer = new MMSegAnalyzer();
        try {
            TokenStream ts = analyzer.tokenStream(null, new StringReader(text));
            try {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                // The consumer workflow is reset() -> incrementToken()* -> end() -> close().
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(term.toString());
                }
                ts.end();
            } finally {
                ts.close();
            }
        } finally {
            analyzer.close();
        }
    }

    public static void main(String[] args) throws IOException {
        testPorterStemmingAnalyzer();
    }
}
相关文章推荐
- 分布式搜索elasticsearch 中文分词集成
- Spring Bean 初始化过程
- Maven导出工程依赖的jar包
- jactor
- java 线程池详解
- 协同过滤推荐算法在MapReduce与Spark上实现对比
- lucene4.7 分词器(三)
- elasticsearch的实现全文检索
- 详解Java GC的工作原理
- storm - 常用命令
- linux统计文件中关键字出现的行号
- 加锁方法对于Map
- Maven下使用Jetty进行Debug
- java file 海量文件处理
- 滑动窗口 TOPN 技术实现演变
- 部署与管理ZooKeeper
- JSONP跨域的原理解析
- JPPF helloword
- com.bea.xml.XmlException: failed to load java type corresponding to e=web-app@ht
- HttpClient4.3教程 第二章 连接管理