您的位置:首页 > 其它

lucene5.5根据现有分词器改造做同义词分词器

2016-04-01 09:37 501 查看
Lucene 5 之后的版本在分词相关的 API 上有较大改动,这里记录一下在 Lucene 5.5 下基于现有分词器改造出同义词分词器的代码和做法。

功能加测试的类一共6个,一一介绍一下

1 同义词分词器类SameWordAnalyzer

2 同义词过滤器类SameWordFilter

3 根据词语获取同义词引擎接口SameWordEngine

4 同义词引擎接口实现类SameWordEngineImpl

5 分词器分词分析工具类 AnalyzerUtils

6 结果测试类 TestUnit

使用的jar包如下(原文此处的截图已缺失;从下文代码的 import 推断,至少需要 lucene-core、lucene-analyzers-common(5.5.x)以及 junit 4):



具体代码如下

1 SameWordAnalyzer类内容:

package com.liu.lucene.pro;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.Tokenizer;

import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Analyzer that splits text with a {@link StandardTokenizer} and wraps the
 * resulting stream in a {@link SameWordFilter}, which looks up same-words
 * (synonyms) for each token through the supplied {@link SameWordEngine}.
 */
public class SameWordAnalyzer extends Analyzer {

    /** Synonym lookup passed to every filter this analyzer builds. */
    private SameWordEngine engine;

    /**
     * Creates an analyzer backed by the given synonym engine.
     *
     * @param engine lookup used by the filter to find same-words for a term
     */
    public SameWordAnalyzer(SameWordEngine engine) {
        this.engine = engine;
    }

    /**
     * Builds the tokenizer + synonym filter chain.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @return components whose source is the tokenizer and whose sink is the filter
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new StandardTokenizer();
        return new TokenStreamComponents(tokenizer, new SameWordFilter(tokenizer, engine));
    }
}

2 SameWordFilter类

package com.liu.lucene.pro;

import java.io.IOException;

import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import org.apache.lucene.util.AttributeSource;

public class SameWordFilter extends TokenFilter {
private SameWordEngine engine;
private Stack<String> samewordStack;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    private AttributeSource.State current;

    

    protected SameWordFilter(TokenStream input,SameWordEngine engine) {
super(input);
this.engine = engine;
samewordStack = new Stack<String>();
}

    
@Override
public boolean incrementToken() throws IOException {
if(samewordStack.size()>0){
String sameWord = samewordStack.pop();
this.restoreState(current);
//termAtt.copyBuffer(sameWord.toCharArray(), 0, sameWord.length());
termAtt.setEmpty();
termAtt.append(sameWord);
posIncrAtt.setPositionIncrement(0);
return true;
}

if(!input.incrementToken()){
return false;
}

if(isAddSameWord()){
current = this.captureState();
}

return true;
}

private boolean isAddSameWord() {
String[] sameWords = engine.getSameWords(termAtt.toString());
if(sameWords == null){
return false;
}

for(String sameWord:sameWords){
samewordStack.push(sameWord);
}
return true;
}

}

3 SameWordEngine引擎接口

package com.liu.lucene.pro;

/**
 * Lookup of the same-words (synonyms) registered for a term.
 */
public interface SameWordEngine {

    /**
     * Looks up the same-words of a term.
     *
     * @param str the term to look up
     * @return the same-words of {@code str}; the filter in this package treats
     *         a {@code null} result as "no same-words"
     */
    String[] getSameWords(String str);
}

4 SameWordEngineImpl引擎接口实现类

package com.liu.lucene.pro;

import java.util.HashMap;

import java.util.Map;

/**
 * {@link SameWordEngine} backed by a small hard-coded table of same-words.
 */
public class SameWordEngineImpl implements SameWordEngine {

    /**
     * Term to same-words table. Built once when the class is loaded instead of
     * on every lookup, since the lookup is called once per token.
     */
    private static final Map<String, String[]> SAME_WORDS = new HashMap<String, String[]>();

    static {
        SAME_WORDS.put("2015", new String[]{"二零一五", "20一5"});
        SAME_WORDS.put("redis", new String[]{"内存数据库", "re内存"});
    }

    /**
     * Looks up the same-words of a term.
     *
     * @param str the term to look up
     * @return a fresh array with the same-words of {@code str}, or {@code null}
     *         if the term has no entry in the table
     */
    @Override
    public String[] getSameWords(String str) {
        String[] sameWords = SAME_WORDS.get(str);
        // Hand out a copy so callers cannot modify the shared table.
        return sameWords == null ? null : sameWords.clone();
    }
}

5 分词器分析工具类AnalyzerUtils

package com.liu.lucene.pro;

import java.io.IOException;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Helpers for inspecting what an {@link Analyzer} produces.
 */
public class AnalyzerUtils {

    /**
     * Analyzes the text from {@code reader} and prints every resulting term to
     * standard output as {@code [term]}, with no separator between terms.
     *
     * <p>The token stream is consumed in the order Lucene requires: attributes
     * are obtained first, then {@code reset()}, {@code incrementToken()} until
     * exhausted, {@code end()}, and finally {@code close()} (via
     * try-with-resources) so the analyzer can be used again afterwards.
     *
     * @param analyzer analyzer to run
     * @param reader   text to analyze
     */
    public static void displayTokens(Analyzer analyzer, Reader reader) {
        try (TokenStream tokenStream = analyzer.tokenStream("path", reader)) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);

            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.print("[" + term.toString() + "]");
            }
            tokenStream.end();
        } catch (IOException e) {
            // Best-effort debugging aid: report the failure and return.
            e.printStackTrace();
        }
    }

}

6 测试类 TestUnit

package com.liu.lucene.test;

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;

import org.junit.Before;
import org.junit.Test;

import com.liu.lucene.pro.AnalyzerUtils;
import com.liu.lucene.pro.LuceneIndex;
import com.liu.lucene.pro.SameWordAnalyzer;
import com.liu.lucene.pro.SameWordEngineImpl;

public class TestUnit {
LuceneIndex index = null;

@Before
public void setUp(){
index = new LuceneIndex();
}

@Test
public void testIndex(){
index.index(true);
}

@Test
public void testIndexAnalyzer(){

index.index(true,new SameWordAnalyzer(new SameWordEngineImpl()));
}

@Test
public void testSearch(){
index.search("20一5",new SameWordAnalyzer(new SameWordEngineImpl()));
}

@Test
public void testDisplayTokens(){
try {
Reader reader = new FileReader("D:\\lhl\\developSoft\\apache-tomc
ab2d
at-7.0.62-windows-x64\\apache-tomcat-7.0.62\\logs\\loginfo.log.2015-11-27.log");

AnalyzerUtils.displayTokens(new SameWordAnalyzer(new SameWordEngineImpl()), reader);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: