您的位置:首页 > 其它

比较lucene各种英文分析器Analyzer

2016-04-06 13:55 441 查看
比较常用的几种英文分析器,它们之间的区别见程序中的注释。

SimpleAnalyzer

StandardAnalyzer

WhitespaceAnalyzer

StopAnalyzer

package analyzer;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Small console demo that runs the same sample text through several Lucene
 * analyzers and prints the resulting terms, so their tokenization can be
 * compared side by side.
 *
 * <p>For every analyzer a two-line banner is written to {@code System.err}
 * and each produced term is written on its own line to {@code System.out}.
 * Because two different streams are used, the relative order of banners and
 * terms on a shared console is not guaranteed.
 */
public class TestAnalyzer {
    /** Plain English sentence containing common words such as "the". */
    private static String testString1 = "The quick brown fox jumped over the lazy dogs";

    /** Sample with punctuation, an ampersand and an e-mail address. */
    private static String testString2 = "xy&z mail is - xyz@sohu.com";

    /**
     * Prints the terms that {@link WhitespaceAnalyzer} produces for the given text.
     *
     * @param testString text to analyze
     * @throws Exception if the token stream fails while reading or closing
     */
    public static void testWhitespace(String testString) throws Exception {
        printTokens(new WhitespaceAnalyzer(), testString,
                "=====Whitespace analyzer====",
                "分析方法:空格分割");
    }

    /**
     * Prints the terms that {@link SimpleAnalyzer} produces for the given text.
     *
     * @param testString text to analyze
     * @throws Exception if the token stream fails while reading or closing
     */
    public static void testSimple(String testString) throws Exception {
        printTokens(new SimpleAnalyzer(), testString,
                "=====Simple analyzer====",
                "分析方法:空格及各种符号分割");
    }

    /**
     * Prints the terms that {@link StopAnalyzer} produces for the given text.
     *
     * @param testString text to analyze
     * @throws Exception if the token stream fails while reading or closing
     */
    public static void testStop(String testString) throws Exception {
        printTokens(new StopAnalyzer(), testString,
                "=====stop analyzer====",
                "分析方法:空格及各种符号分割,去掉停止词,停止词包括 is,are,in,on,the等无实际意义的词");
    }

    /**
     * Prints the terms that {@link StandardAnalyzer} produces for the given text.
     *
     * @param testString text to analyze
     * @throws Exception if the token stream fails while reading or closing
     */
    public static void testStandard(String testString) throws Exception {
        printTokens(new StandardAnalyzer(), testString,
                "=====standard analyzer====",
                "分析方法:混合分割,包括了去掉停止词,支持汉语");
    }

    /**
     * Runs {@code testString} through {@code analyzer}, writes the two banner
     * lines to {@code System.err} and every term to {@code System.out}.
     *
     * <p>The stream is handled through the {@link TokenStream} base type rather
     * than being downcast to a concrete tokenizer/filter class: only
     * {@code next()} is needed, and a downcast would fail with a
     * {@code ClassCastException} whenever an analyzer's internal filter chain
     * is not of the expected class.
     *
     * @param analyzer    analyzer under demonstration
     * @param testString  text to analyze
     * @param title       first banner line
     * @param description second banner line describing the analysis method
     * @throws Exception if the token stream fails while reading or closing
     */
    private static void printTokens(Analyzer analyzer, String testString,
            String title, String description) throws Exception {
        Reader reader = new StringReader(testString);
        // The field name is irrelevant for this demo, hence the empty string.
        TokenStream stream = analyzer.tokenStream("", reader);
        try {
            System.err.println(title);
            System.err.println(description);
            Token token;
            // Legacy Lucene iteration protocol: next() returns null at end of stream.
            while ((token = stream.next()) != null) {
                System.out.println(token.termText());
            }
        } finally {
            // Release the stream even if iteration throws.
            stream.close();
        }
    }

    /**
     * Prints the sample text and then the output of each analyzer in turn.
     *
     * @param args ignored
     * @throws Exception if any analyzer run fails
     */
    public static void main(String[] args) throws Exception {
        // Assign testString1 here instead to run the demo on the plain English sentence.
        String testString = testString2;
        System.out.println(testString);
        testWhitespace(testString);
        testSimple(testString);
        testStop(testString);
        testStandard(testString);
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  lucene 英文分词器