您的位置:首页 > 其它

一个自定义的用于过滤非字母字符的Lucene分析器

2016-02-27 11:26 253 查看
<strong><span style="font-size:18px;">/***
 * @author YangXin
 * @info A custom Lucene analyzer that keeps only purely alphabetic tokens
 */
package unitNine;

import org.apache.lucene.analysis.Analyzer;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
/**
 * Analyzer that keeps only lower-cased, purely alphabetic tokens.
 *
 * <p>The input is run through {@link StandardTokenizer}, {@link StandardFilter},
 * {@link LowerCaseFilter} and a {@link StopFilter} using
 * {@link StandardAnalyzer#STOP_WORDS_SET}. Surviving tokens are then dropped if
 * they are shorter than {@value #MIN_TOKEN_LENGTH} characters or do not consist
 * entirely of the letters {@code a-z}.
 *
 * <p>The whole input is consumed eagerly inside {@link #tokenStream}; the
 * returned stream re-tokenizes a space-joined copy of the accepted tokens, so
 * it is no longer backed by the reader that was passed in.
 */
public class MyAnalyzer extends Analyzer {

  /** Matches tokens made up solely of lower-case ASCII letters. Compiled once; Pattern is immutable. */
  private static final Pattern ALPHABETS = Pattern.compile("[a-z]+");

  /** Tokens with fewer characters than this are discarded. */
  private static final int MIN_TOKEN_LENGTH = 3;

  /**
   * Builds a token stream containing only the alphabetic tokens of {@code reader}.
   *
   * @param fieldName name of the field being analyzed (used only in error messages)
   * @param reader    source text; it is read to the end and closed by this method
   * @return a whitespace tokenizer over the accepted tokens, in their original order
   * @throws RuntimeException if reading from {@code reader} fails; the original
   *         {@link IOException} is attached as the cause
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, StandardAnalyzer.STOP_WORDS_SET);

    TermAttribute termAtt = (TermAttribute) result.addAttribute(TermAttribute.class);
    StringBuilder buf = new StringBuilder();
    try {
      try {
        while (result.incrementToken()) {
          if (termAtt.termLength() < MIN_TOKEN_LENGTH) {
            continue;
          }
          String word = new String(termAtt.termBuffer(), 0, termAtt.termLength());
          if (ALPHABETS.matcher(word).matches()) {
            buf.append(word).append(" ");
          }
        }
      } finally {
        // The returned tokenizer reads from a fresh StringReader, so nothing
        // downstream will ever close the original chain; release it here.
        result.close();
      }
    } catch (IOException e) {
      // tokenStream() cannot declare IOException. Fail loudly rather than
      // hand back a silently truncated token list.
      throw new RuntimeException("Failed to analyze field '" + fieldName + "'", e);
    }

    return new WhitespaceTokenizer(new StringReader(buf.toString()));
  }
}
</span></strong>
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: