您的位置:首页 > 其它

mmseg 同义词分析器 SolrSynonymParser

2016-03-10 16:05 537 查看
package synonym;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
import com.chenlb.mmseg4j.analysis.SimpleAnalyzer;

/**
 * Lucene analyzer that tokenizes text with mmseg4j and then applies, in order,
 * lower-casing, synonym expansion, stop-word removal and Porter stemming.
 *
 * <p>The synonym map is loaded once, when the class is initialized, from a
 * Solr-format synonym file. The file location defaults to
 * {@code e:\synonym.txt} and can be overridden with the {@code synonym.path}
 * system property. Class initialization fails if the file cannot be read or
 * parsed.
 */
public class MMSegAnalyzer extends Analyzer {

    /** Dictionary used by the mmseg4j segmenter. */
    protected Dictionary dic;

    /**
     * Creates an analyzer backed by the default dictionary.
     *
     * @see Dictionary#getInstance()
     */
    public MMSegAnalyzer() {
        dic = Dictionary.getInstance();
    }

    /**
     * Creates an analyzer backed by the dictionary found at the given path.
     *
     * @param path
     *            dictionary path
     * @see Dictionary#getInstance(String)
     */
    public MMSegAnalyzer(String path) {
        dic = Dictionary.getInstance(path);
    }

    /**
     * Creates an analyzer backed by the dictionary found in the given directory.
     *
     * @param path
     *            dictionary directory
     * @see Dictionary#getInstance(File)
     */
    public MMSegAnalyzer(File path) {
        dic = Dictionary.getInstance(path);
    }

    /**
     * Creates an analyzer backed by an already loaded dictionary.
     *
     * @param dic
     *            dictionary to segment with
     */
    public MMSegAnalyzer(Dictionary dic) {
        super();
        this.dic = dic;
    }

    /**
     * Creates the segmenter handed to each new tokenizer. Subclasses may
     * override this to use a different mmseg4j segmentation strategy.
     *
     * @return a new {@link MaxWordSeg} over this analyzer's dictionary
     */
    protected Seg newSeg() {
        return new MaxWordSeg(dic);
    }

    /**
     * @return the dictionary this analyzer segments with
     */
    public Dictionary getDict() {
        return dic;
    }

    // Custom stop words.
    private static final String[] stopWords = { "and", "of", "the", "to", "is",
            "their", "can", "all" };

    // Built once and shared by every filter chain; wrapped as unmodifiable so
    // that sharing it between token streams is safe.
    private static final CharArraySet STOP_WORD_SET = CharArraySet
            .unmodifiableSet(buildCharArraySetFromArry(stopWords));

    // System property that overrides the synonym file location.
    private static final String SYNONYM_PATH_PROPERTY = "synonym.path";

    // Location used when the system property is not set.
    private static final String DEFAULT_SYNONYM_PATH = "e:\\synonym.txt";

    /** Synonym map shared by all instances; loaded during class initialization. */
    static SynonymMap synonymMap = null;

    static {
        synonymMap = getSynonymMap(System.getProperty(SYNONYM_PATH_PROPERTY,
                DEFAULT_SYNONYM_PATH));
    }

    /**
     * Builds the analysis chain: mmseg4j tokenizer, lower-case filter,
     * synonym filter, stop-word filter, Porter stem filter.
     *
     * @param fieldName
     *            name of the field being analyzed (not used by this chain)
     * @param reader
     *            source of the text to tokenize
     * @return the tokenizer together with the last filter of the chain
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        // Create the tokenizer.
        Tokenizer tokenizer = new MMSegTokenizer(newSeg(), reader);

        // Create the chain of token filters.
        TokenFilter lowerCaseFilter = new LowerCaseFilter(Version.LUCENE_47,
                tokenizer);
        TokenFilter synonymFilter = new SynonymFilter(lowerCaseFilter,
                synonymMap, true);
        TokenFilter stopFilter = new StopFilter(Version.LUCENE_47,
                synonymFilter, STOP_WORD_SET);
        TokenFilter stemFilter = new PorterStemFilter(stopFilter);

        // TokenStreamComponents pairs the tokenizer with the end of the
        // filter chain.
        return new TokenStreamComponents(tokenizer, stemFilter);
    }

    /**
     * Converts an array of words into a case-insensitive Lucene
     * {@link CharArraySet} (similar in spirit to {@code java.util.Set}).
     *
     * @param array
     *            words to put in the set
     * @return a new set containing every element of {@code array}
     */
    private static CharArraySet buildCharArraySetFromArry(String[] array) {
        CharArraySet set = new CharArraySet(Version.LUCENE_47, array.length,
                true);
        for (String value : array) {
            set.add(value);
        }
        return set;
    }

    /**
     * Builds a synonym map from a Solr-format synonym file.
     *
     * <p>The file is decoded as UTF-8 rather than with the platform default
     * charset, so that non-ASCII entries load the same way on every machine.
     *
     * @param path
     *            location of the synonym file
     * @return the parsed synonym map
     * @throws IllegalArgumentException
     *             if the file cannot be read or parsed
     */
    private static SynonymMap getSynonymMap(String path) {
        File file = new File(path);
        Analyzer lineAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_47);
        Reader in = null;
        try {
            in = new InputStreamReader(new FileInputStream(file), "UTF-8");
            SolrSynonymParser parser = new SolrSynonymParser(true, true,
                    lineAnalyzer);
            parser.parse(in);
            return parser.build();
        } catch (Exception e) {
            throw new IllegalArgumentException("failed to build synonyms: "
                    + path, e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // The file was only read; a failure while closing it
                    // cannot affect the map that was built.
                }
            }
            lineAnalyzer.close();
        }
    }

    /**
     * Builds a small hard-coded synonym map (fast/rapid, slow/sluggish and a
     * two-way mapping between two Chinese terms). Kept as an alternative to
     * the file-based map.
     *
     * @return the built synonym map, never {@code null}
     * @throws IllegalStateException
     *             if the map cannot be built
     */
    private SynonymMap getSynonymMap() {
        String base1 = "fast";
        String syn1 = "rapid";

        String base2 = "slow";
        String syn2 = "sluggish";

        String base3 = "中国";
        String syn3 = "天朝";

        SynonymMap.Builder sb = new SynonymMap.Builder(true);
        sb.add(new CharsRef(base1), new CharsRef(syn1), true);
        sb.add(new CharsRef(base2), new CharsRef(syn2), true);
        sb.add(new CharsRef(base3), new CharsRef(syn3), true);

        sb.add(new CharsRef(syn3), new CharsRef(base3), true);

        try {
            return sb.build();
        } catch (IOException e) {
            // Fail loudly instead of handing a null map to SynonymFilter.
            throw new IllegalStateException(
                    "failed to build built-in synonyms", e);
        }
    }

    /**
     * Test method: analyzes a sample sentence and prints every resulting term
     * on its own line.
     *
     * @throws IOException
     *             if the token stream fails while being consumed
     */
    public static void testPorterStemmingAnalyzer() throws IOException {
        Analyzer analyzer = new MMSegAnalyzer();
        try {
            String text = "Collective slow intelligence and Web2.0, fast and rapid2  天朝  柿子";
            Reader reader = new StringReader(text);
            TokenStream ts = analyzer.tokenStream(null, reader);
            try {
                CharTermAttribute term = ts
                        .addAttribute(CharTermAttribute.class);
                // The TokenStream contract requires reset() before the first
                // incrementToken(), then end() and close() when done.
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(term.toString());
                }
                ts.end();
            } finally {
                ts.close();
            }
        } finally {
            analyzer.close();
        }
    }

    /**
     * Runs {@link #testPorterStemmingAnalyzer()}.
     *
     * @param args
     *            ignored
     * @throws IOException
     *             if the analysis fails
     */
    public static void main(String[] args) throws IOException {
        testPorterStemmingAnalyzer();
    }

}

 
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: