lucene5.5根据现有分词器改造做同义词分词器
2016-04-01 09:37
501 查看
Lucene 5 之后的版本改动较大,这里把在 Lucene 5.5 下基于现有分词器改造同义词分词器的代码和做法记录一下。
功能加测试的类一共6个,一一介绍一下
1 同义词分词器类SameWordAnalyzer
2 同义词过滤器类SameWordFilter
3 根据词语获取同义词引擎接口SameWordEngine
4 同义词引擎接口实现类SameWordEngineImpl
5 分词器分词分析工具类 AnalyzerUtils
6 结果测试类 TestUnit
使用的jar包如下(原文此处的截图已缺失;从下面代码的 import 推断,至少需要 lucene-core、lucene-analyzers-common 5.5.x 以及 junit 4)
具体代码如下
1 SameWordAnalyzer类内容:
package com.liu.lucene.pro;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Analyzer whose chain is a {@link StandardTokenizer} followed by a
 * {@link SameWordFilter} driven by the engine given at construction time.
 */
public class SameWordAnalyzer extends Analyzer {

    /** Synonym lookup handed to every filter this analyzer builds. */
    private final SameWordEngine engine;

    /**
     * @param engine synonym source passed on to each {@link SameWordFilter}
     */
    public SameWordAnalyzer(SameWordEngine engine) {
        this.engine = engine;
    }

    /**
     * Builds the analysis chain: standard tokenizer, then the synonym filter.
     *
     * @param fieldName name of the field being analyzed; not consulted here
     * @return components wrapping the tokenizer and the filtered stream
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer tokenizer = new StandardTokenizer();
        return new TokenStreamComponents(tokenizer, new SameWordFilter(tokenizer, engine));
    }
}
2 SameWordFilter类
package com.liu.lucene.pro;
import java.io.IOException;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Token filter that injects synonyms looked up through a {@link SameWordEngine}.
 *
 * <p>Each token from the wrapped stream is emitted unchanged. If the engine
 * returns synonyms for its term, they are queued and emitted on the following
 * calls with a position increment of 0, i.e. stacked on the original token's
 * position. Because the queue is a stack, synonyms come out in the reverse of
 * the order the engine returned them.
 */
public class SameWordFilter extends TokenFilter {

    /** Synonym lookup for each incoming term. */
    private final SameWordEngine engine;
    /** Synonyms still to be emitted for the most recently read token. */
    private final Stack<String> samewordStack;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    /** Attribute snapshot of the token whose synonyms are being emitted. */
    private AttributeSource.State current;

    /**
     * @param input  upstream token stream
     * @param engine synonym source; a {@code null} result means "no synonyms"
     */
    protected SameWordFilter(TokenStream input, SameWordEngine engine) {
        super(input);
        this.engine = engine;
        samewordStack = new Stack<String>();
    }

    /**
     * Emits a pending synonym if one is queued, otherwise advances the input
     * and queues the synonyms of the new token.
     *
     * @return {@code false} once the input is exhausted and no synonym is pending
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!samewordStack.isEmpty()) {
            String sameWord = samewordStack.pop();
            // Start from the original token's attributes, then overwrite only
            // the term text and the position increment.
            restoreState(current);
            termAtt.setEmpty();
            termAtt.append(sameWord);
            posIncrAtt.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        if (isAddSameWord()) {
            current = captureState();
        }
        return true;
    }

    /**
     * Clears the per-stream state so that synonyms left over from a stream that
     * was not consumed to the end cannot leak into the next use of this filter
     * (analyzers reuse their token stream components).
     *
     * @throws IOException if resetting the wrapped stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        samewordStack.clear();
        current = null;
    }

    /**
     * Queues the synonyms of the current term, if any.
     *
     * @return {@code true} if at least one synonym was queued
     */
    private boolean isAddSameWord() {
        String[] sameWords = engine.getSameWords(termAtt.toString());
        if (sameWords == null || sameWords.length == 0) {
            return false;
        }
        for (String sameWord : sameWords) {
            samewordStack.push(sameWord);
        }
        return true;
    }
}
3 SameWordEngine引擎接口
package com.liu.lucene.pro;
/**
 * Source of synonyms ("same words") for a single term.
 */
public interface SameWordEngine {
/**
 * Looks up the synonyms of the given term.
 *
 * @param str the term to look up
 * @return the synonyms of {@code str}; SameWordFilter treats a {@code null}
 *         result as "no synonyms"
 */
String[] getSameWords(String str);
}
4 SameWordEngineImpl引擎接口实现类
package com.liu.lucene.pro;
import java.util.HashMap;
import java.util.Map;
/**
 * {@link SameWordEngine} backed by a small hard-coded synonym table.
 */
public class SameWordEngineImpl implements SameWordEngine {

    /**
     * Synonym table, built once at class initialization instead of on every
     * lookup (the filter calls {@link #getSameWords(String)} once per token).
     * Never modified after the static initializer runs.
     */
    private static final Map<String, String[]> SAME_WORDS = new HashMap<String, String[]>();

    static {
        SAME_WORDS.put("2015", new String[]{"二零一五","20一5"});
        SAME_WORDS.put("redis", new String[]{"内存数据库","re内存"});
    }

    /**
     * Returns the synonyms registered for {@code str}.
     *
     * @param str term to look up; matching is exact and case-sensitive
     * @return a fresh array of synonyms, or {@code null} if none are registered
     */
    @Override
    public String[] getSameWords(String str) {
        String[] sameWords = SAME_WORDS.get(str);
        // Hand out a copy so callers cannot modify the shared table.
        return sameWords == null ? null : sameWords.clone();
    }
}
5 分词器分析工具类AnalyzerUtils
package com.liu.lucene.pro;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
 * Debugging helpers for inspecting what an analyzer produces.
 */
public class AnalyzerUtils {

    /**
     * Analyzes the text from {@code reader} and prints every term to standard
     * output as {@code [term]}, with no separators or trailing newline.
     *
     * <p>Follows the TokenStream consumer workflow: reset, incrementToken
     * until exhausted, end, close. Closing the stream is what lets the same
     * analyzer be used again afterwards. NOTE(review): closing the token
     * stream is expected to close {@code reader} as well; confirm against the
     * Lucene version in use.
     *
     * <p>An {@link IOException} is reported on standard error and otherwise
     * ignored, as before.
     *
     * @param analyzer analyzer to run
     * @param reader   text to analyze
     */
    public static void displayTokens(Analyzer analyzer, Reader reader) {
        try (TokenStream tokenStream = analyzer.tokenStream("path", reader)) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.print("[" + term.toString() + "]");
            }
            tokenStream.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
6 测试类 TestUnit
package com.liu.lucene.test;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.Reader;
import org.junit.Before;
import org.junit.Test;
import com.liu.lucene.pro.AnalyzerUtils;
import com.liu.lucene.pro.LuceneIndex;
import com.liu.lucene.pro.SameWordAnalyzer;
import com.liu.lucene.pro.SameWordEngineImpl;
public class TestUnit {
LuceneIndex index = null;
@Before
public void setUp(){
index = new LuceneIndex();
}
@Test
public void testIndex(){
index.index(true);
}
@Test
public void testIndexAnalyzer(){
index.index(true,new SameWordAnalyzer(new SameWordEngineImpl()));
}
@Test
public void testSearch(){
index.search("20一5",new SameWordAnalyzer(new SameWordEngineImpl()));
}
@Test
public void testDisplayTokens(){
try {
Reader reader = new FileReader("D:\\lhl\\developSoft\\apache-tomc
ab2d
at-7.0.62-windows-x64\\apache-tomcat-7.0.62\\logs\\loginfo.log.2015-11-27.log");
AnalyzerUtils.displayTokens(new SameWordAnalyzer(new SameWordEngineImpl()), reader);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
}
功能加测试的类一共6个,一一介绍一下
1 同义词分词器类SameWordAnalyzer
2 同义词过滤器类SameWordFilter
3 根据词语获取同义词引擎接口SameWordEngine
4 同义词引擎接口实现类SameWordEngineImpl
5 分词器分词分析工具类 AnalyzerUtils
6 结果测试类 TestUnit
使用的jar包如下(原文此处的截图已缺失;从下面代码的 import 推断,至少需要 lucene-core、lucene-analyzers-common 5.5.x 以及 junit 4)
具体代码如下
1 SameWordAnalyzer类内容:
package com.liu.lucene.pro;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Analyzer whose chain is a {@link StandardTokenizer} followed by a
 * {@link SameWordFilter} driven by the engine given at construction time.
 */
public class SameWordAnalyzer extends Analyzer {

    /** Synonym lookup handed to every filter this analyzer builds. */
    private final SameWordEngine engine;

    /**
     * @param engine synonym source passed on to each {@link SameWordFilter}
     */
    public SameWordAnalyzer(SameWordEngine engine) {
        this.engine = engine;
    }

    /**
     * Builds the analysis chain: standard tokenizer, then the synonym filter.
     *
     * @param fieldName name of the field being analyzed; not consulted here
     * @return components wrapping the tokenizer and the filtered stream
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer tokenizer = new StandardTokenizer();
        return new TokenStreamComponents(tokenizer, new SameWordFilter(tokenizer, engine));
    }
}
2 SameWordFilter类
package com.liu.lucene.pro;
import java.io.IOException;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Token filter that injects synonyms looked up through a {@link SameWordEngine}.
 *
 * <p>Each token from the wrapped stream is emitted unchanged. If the engine
 * returns synonyms for its term, they are queued and emitted on the following
 * calls with a position increment of 0, i.e. stacked on the original token's
 * position. Because the queue is a stack, synonyms come out in the reverse of
 * the order the engine returned them.
 */
public class SameWordFilter extends TokenFilter {

    /** Synonym lookup for each incoming term. */
    private final SameWordEngine engine;
    /** Synonyms still to be emitted for the most recently read token. */
    private final Stack<String> samewordStack;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    /** Attribute snapshot of the token whose synonyms are being emitted. */
    private AttributeSource.State current;

    /**
     * @param input  upstream token stream
     * @param engine synonym source; a {@code null} result means "no synonyms"
     */
    protected SameWordFilter(TokenStream input, SameWordEngine engine) {
        super(input);
        this.engine = engine;
        samewordStack = new Stack<String>();
    }

    /**
     * Emits a pending synonym if one is queued, otherwise advances the input
     * and queues the synonyms of the new token.
     *
     * @return {@code false} once the input is exhausted and no synonym is pending
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!samewordStack.isEmpty()) {
            String sameWord = samewordStack.pop();
            // Start from the original token's attributes, then overwrite only
            // the term text and the position increment.
            restoreState(current);
            termAtt.setEmpty();
            termAtt.append(sameWord);
            posIncrAtt.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        if (isAddSameWord()) {
            current = captureState();
        }
        return true;
    }

    /**
     * Clears the per-stream state so that synonyms left over from a stream that
     * was not consumed to the end cannot leak into the next use of this filter
     * (analyzers reuse their token stream components).
     *
     * @throws IOException if resetting the wrapped stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        samewordStack.clear();
        current = null;
    }

    /**
     * Queues the synonyms of the current term, if any.
     *
     * @return {@code true} if at least one synonym was queued
     */
    private boolean isAddSameWord() {
        String[] sameWords = engine.getSameWords(termAtt.toString());
        if (sameWords == null || sameWords.length == 0) {
            return false;
        }
        for (String sameWord : sameWords) {
            samewordStack.push(sameWord);
        }
        return true;
    }
}
3 SameWordEngine引擎接口
package com.liu.lucene.pro;
/**
 * Source of synonyms ("same words") for a single term.
 */
public interface SameWordEngine {
/**
 * Looks up the synonyms of the given term.
 *
 * @param str the term to look up
 * @return the synonyms of {@code str}; SameWordFilter treats a {@code null}
 *         result as "no synonyms"
 */
String[] getSameWords(String str);
}
4 SameWordEngineImpl引擎接口实现类
package com.liu.lucene.pro;
import java.util.HashMap;
import java.util.Map;
/**
 * {@link SameWordEngine} backed by a small hard-coded synonym table.
 */
public class SameWordEngineImpl implements SameWordEngine {

    /**
     * Synonym table, built once at class initialization instead of on every
     * lookup (the filter calls {@link #getSameWords(String)} once per token).
     * Never modified after the static initializer runs.
     */
    private static final Map<String, String[]> SAME_WORDS = new HashMap<String, String[]>();

    static {
        SAME_WORDS.put("2015", new String[]{"二零一五","20一5"});
        SAME_WORDS.put("redis", new String[]{"内存数据库","re内存"});
    }

    /**
     * Returns the synonyms registered for {@code str}.
     *
     * @param str term to look up; matching is exact and case-sensitive
     * @return a fresh array of synonyms, or {@code null} if none are registered
     */
    @Override
    public String[] getSameWords(String str) {
        String[] sameWords = SAME_WORDS.get(str);
        // Hand out a copy so callers cannot modify the shared table.
        return sameWords == null ? null : sameWords.clone();
    }
}
5 分词器分析工具类AnalyzerUtils
package com.liu.lucene.pro;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
 * Debugging helpers for inspecting what an analyzer produces.
 */
public class AnalyzerUtils {

    /**
     * Analyzes the text from {@code reader} and prints every term to standard
     * output as {@code [term]}, with no separators or trailing newline.
     *
     * <p>Follows the TokenStream consumer workflow: reset, incrementToken
     * until exhausted, end, close. Closing the stream is what lets the same
     * analyzer be used again afterwards. NOTE(review): closing the token
     * stream is expected to close {@code reader} as well; confirm against the
     * Lucene version in use.
     *
     * <p>An {@link IOException} is reported on standard error and otherwise
     * ignored, as before.
     *
     * @param analyzer analyzer to run
     * @param reader   text to analyze
     */
    public static void displayTokens(Analyzer analyzer, Reader reader) {
        try (TokenStream tokenStream = analyzer.tokenStream("path", reader)) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.print("[" + term.toString() + "]");
            }
            tokenStream.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
6 测试类 TestUnit
package com.liu.lucene.test;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.Reader;
import org.junit.Before;
import org.junit.Test;
import com.liu.lucene.pro.AnalyzerUtils;
import com.liu.lucene.pro.LuceneIndex;
import com.liu.lucene.pro.SameWordAnalyzer;
import com.liu.lucene.pro.SameWordEngineImpl;
public class TestUnit {
LuceneIndex index = null;
@Before
public void setUp(){
index = new LuceneIndex();
}
@Test
public void testIndex(){
index.index(true);
}
@Test
public void testIndexAnalyzer(){
index.index(true,new SameWordAnalyzer(new SameWordEngineImpl()));
}
@Test
public void testSearch(){
index.search("20一5",new SameWordAnalyzer(new SameWordEngineImpl()));
}
@Test
public void testDisplayTokens(){
try {
Reader reader = new FileReader("D:\\lhl\\developSoft\\apache-tomc
ab2d
at-7.0.62-windows-x64\\apache-tomcat-7.0.62\\logs\\loginfo.log.2015-11-27.log");
AnalyzerUtils.displayTokens(new SameWordAnalyzer(new SameWordEngineImpl()), reader);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
}
相关文章推荐
- HttpServlet
- java5 ReadWriteLock用法--读写锁实现
- SearchRequestBuilder常用方法说明
- SearchRequestBuilder常用方法说明
- 文件上传 - iframe上传
- XMLHTTP.readyState的五种状态
- iOS App设计模式开发中对建造者模式的运用实例
- CodeForces - 367C Sereja and the Arrangement of Numbers (图论&不懂)
- 2015年蓝桥杯C/C++组:串逐位和
- ios 内存管理与property copy strong weak assign
- bat重命名文件名
- this指针详解
- struts2中的constant配置详解
- HostMonitor监控主机状态
- 很有哲理的话
- 日常工作使用连接
- 动态HTML和W3C文档对象模型
- linux中execve函数的用法
- ubuntu sudo update与upgrade的作用及区别
- 关于直播的一些资料