
Lucene from Beginner to Proficient (3): Tokenization

2014-08-21 21:45

Tokenization

Tokenization (word segmentation) is a technique that filters and groups text algorithmically according to the features of a language.
Its input is text, not images, animations, scripts, and the like.
It works in two steps: filtering and grouping.
Filtering removes characters or words that carry no real meaning, such as stop words.
Grouping matches the remaining text against the words already registered in the segmentation dictionary; for Chinese, for example, "明天会更美好" has to be matched into dictionary words such as 明天 and 美好 before it can be indexed.

Analyzers that ship with (or alongside) Lucene:

StandardAnalyzer:   grammar-based tokenization with an English stop-word list
SimpleAnalyzer:     splits on non-letter characters and lowercases everything
WhitespaceAnalyzer: splits on whitespace only, keeping punctuation
ChineseAnalyzer:    splits Chinese text into single characters
CJKAnalyzer:        splits Chinese text into overlapping two-character groups
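To make the differences concrete, this is roughly what each produces for the English input "welcome to use lucene!" and the Chinese input "明天会更美好" (illustrative output; the exact tokens depend on the Lucene version and its stop-word list):

StandardAnalyzer:   welcome / use / lucene        ("to" is dropped as a stop word)
SimpleAnalyzer:     welcome / to / use / lucene
WhitespaceAnalyzer: welcome / to / use / lucene!
ChineseAnalyzer:    明 / 天 / 会 / 更 / 美 / 好
CJKAnalyzer:        明天 / 天会 / 会更 / 更美 / 美好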

package com.lucene.test.T03;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TestAnalyzer {

    public static void main(String[] args) throws IOException {
        // Swap in any of the analyzers below to compare their output:
        // Analyzer analyzer = new StandardAnalyzer();
        // Analyzer analyzer = new SimpleAnalyzer();
        // Analyzer analyzer = new WhitespaceAnalyzer();
        // Analyzer analyzer = new ChineseAnalyzer();
        // Analyzer analyzer = new CJKAnalyzer();
        Analyzer analyzer = new IKAnalyzer();

        TokenStream tokenStream = analyzer.tokenStream("", new StringReader("welcome to use lucene! ?"));
        // TokenStream tokenStream = analyzer.tokenStream("", new StringReader("明天会更美好!"));

        // next(Token) may return a different instance than the one passed in,
        // so always read from the returned token.
        Token reusableToken = new Token();
        for (Token token = tokenStream.next(reusableToken); token != null;
                token = tokenStream.next(reusableToken)) {
            System.out.println(token.term());
        }
    }
}
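Note that the Token-based next() API used above belongs to the Lucene 2.x line and was removed in 3.x. For newer versions, a minimal sketch of the attribute-based replacement (assuming Lucene 3.1+, where CharTermAttribute exists):

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Lucene 3.x+ style: read tokens through attributes instead of Token objects
TokenStream ts = analyzer.tokenStream("", new StringReader("welcome to use lucene!"));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term.toString());
}
ts.end();
ts.close();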


Paoding, an open-source Chinese analyzer
package com.lucene.test.T03;

import java.io.IOException;
import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AnalyzerPaoding {
    private static Logger logger = LoggerFactory.getLogger(AnalyzerPaoding.class);

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new PaodingAnalyzer();

        TokenStream ts = analyzer.tokenStream("", new StringReader("法律实践奥利弗论文集饿哦土建类士大夫接待来访将阿隆索"));
        Token token;
        while ((token = ts.next()) != null) {
            logger.debug("read result from token");
            System.out.println(token.term());
        }
    }
}
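Paoding will not start without its dictionary files. One common setup (the path below is illustrative, not mandated) is to point the paoding.dic.home system property, or a paoding-dic-home.properties file on the classpath, at the unpacked dic directory before constructing the analyzer:

// assumed location of Paoding's unpacked "dic" dictionary folder
System.setProperty("paoding.dic.home", "c:/temp/paoding/dic");
Analyzer analyzer = new PaodingAnalyzer();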

Using QueryParser with a Paoding-built index
package com.lucene.test.T03;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestIndexPaoding {

    public static void main(String[] args) throws IOException {
        String[] ids = { "1", "2", "3", "4" };
        String[] names = { "张三", "李四", "王五", "赵六" };
        // String[] names = { "zhangsan", "zhangsun", "zhangson", "zhaoliu" };
        String[] address = { "居住北京", "南京", "北京海淀", "dalian" };
        String[] birthday = { "19880101", "19860105", "19760205", "19550719" };
        Analyzer analyzer = new PaodingAnalyzer();
        String indexDir = "c:/temp/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        // true: create (or overwrite) the index; false: append to the existing index.
        // MaxFieldLength.LIMITED caps each field at 10,000 indexed terms.
        IndexWriter writer = new IndexWriter(dir, analyzer, true,
                IndexWriter.MaxFieldLength.LIMITED);
        for (int i = 0; i < ids.length; i++) {
            Document document = new Document();
            document.add(new Field("id", ids[i], Field.Store.YES,
                    Field.Index.ANALYZED));
            document.add(new Field("name", names[i], Field.Store.YES,
                    Field.Index.ANALYZED)); // Field.Index.NO would skip indexing a field
            document.add(new Field("address", address[i], Field.Store.YES,
                    Field.Index.ANALYZED));
            document.add(new Field("birthday", birthday[i], Field.Store.YES,
                    Field.Index.ANALYZED));
            writer.addDocument(document);
        }
        writer.optimize();
        writer.close();
        System.out.println("index created ....");
    }
}
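A quick sanity check after the write is to reopen the directory and count the documents (a sketch using IndexReader from org.apache.lucene.index):

// reopen the freshly built index and confirm it holds the 4 documents
IndexReader reader = IndexReader.open(FSDirectory.getDirectory("c:/temp/luceneindex"));
System.out.println("docs in index: " + reader.numDocs());
reader.close();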


package com.lucene.test.T03;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestSearcherPaoding {
    public static void main(String[] args) throws IOException, ParseException {
        String indexDir = "c:/temp/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);
        Analyzer analyzer = new PaodingAnalyzer();

        QueryParser parser = new QueryParser("name", analyzer);
        // compound query: address contains 北京, but name is not 张三
        Query query = parser.parse("address:北京 AND NOT name:张三");

        TopDocs topDocs = searcher.search(query, 10);
        ScoreDoc[] hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.print(hits[i].score + " ");
            System.out.print(doc.get("id") + " ");
            System.out.print(doc.get("name") + " ");
            System.out.print(doc.get("address") + " ");
            System.out.println(doc.get("birthday"));
        }

        searcher.close();
        dir.close();
    }
}
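The same condition can also be built programmatically, which avoids query-syntax escaping; a minimal sketch with TermQuery and BooleanQuery (from org.apache.lucene.index and org.apache.lucene.search). The term text must match a token the analyzer actually indexed, e.g. Paoding splitting "北京海淀" into 北京 and 海淀:

// equivalent to the parsed "address:北京 AND NOT name:张三"
BooleanQuery bq = new BooleanQuery();
bq.add(new TermQuery(new Term("address", "北京")), BooleanClause.Occur.MUST);
bq.add(new TermQuery(new Term("name", "张三")), BooleanClause.Occur.MUST_NOT);
TopDocs result = searcher.search(bq, 10);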


Building an index from files on disk, then searching it:
package com.lucene.test.T04;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestFileIndex {

    public static void main(String[] args) throws FileNotFoundException,
            IOException {
        String indexDir = "c:/temp/lucene/index";
        Analyzer analyzer = new PaodingAnalyzer();
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexWriter writer = new IndexWriter(dir, analyzer, true,
                IndexWriter.MaxFieldLength.LIMITED);

        // read data from dataDir and create the index
        String dataDir = "c:/temp/lucene/data";
        File[] files = new File(dataDir).listFiles();
        System.out.println("file numbers: " + files.length);
        for (int i = 0; i < files.length; i++) {
            // read the file content line by line
            StringBuffer strBuff = new StringBuffer();
            FileInputStream is = new FileInputStream(files[i].getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(is));
            String line = br.readLine();
            while (line != null) {
                strBuff.append(line);
                strBuff.append("\n");
                line = br.readLine();
            }

            // one document per file: the file name as title, its text as content
            Document document = new Document();
            document.add(new Field("title", files[i].getName(),
                    Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("content", strBuff.toString(),
                    Field.Store.YES, Field.Index.ANALYZED));

            writer.addDocument(document);

            // closing the reader also closes the underlying stream
            br.close();
        }

        writer.close();
        dir.close();
    }
}
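One caveat in the reader loop above: InputStreamReader without an explicit charset uses the platform default, which silently mangles Chinese text when the files and the OS disagree. If the data files are UTF-8 (an assumption about your setup), pass the charset explicitly:

// read the file as UTF-8 instead of the platform default
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));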


package com.lucene.test.T04;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestFileSearch {
    public static void main(String[] args) throws IOException, ParseException {
        String indexDir = "c:/temp/lucene/index";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);
        // use the same analyzer at search time as at index time
        Analyzer analyzer = new PaodingAnalyzer();

        QueryParser parser = new QueryParser("content", analyzer);
        Query query = parser.parse("软件");

        TopDocs topDocs = searcher.search(query, 10);
        ScoreDoc[] hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            // System.out.print(hits[i].score + " ");
            System.out.print(doc.get("title") + " ");
            System.out.print(doc.get("content") + " ");
        }

        searcher.close();
        dir.close();
    }
}
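To match the file name as well as the body, MultiFieldQueryParser (org.apache.lucene.queryParser.MultiFieldQueryParser in this Lucene generation) queries several fields at once; a minimal sketch:

// search title and content with a single query string
QueryParser mfParser = new MultiFieldQueryParser(
        new String[] { "title", "content" }, analyzer);
Query q = mfParser.parse("软件");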

Highlighting

package com.lucene.test.T05;

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class TestHighlight {

    public static void main(String[] args) throws Exception {

        Searcher searcher = new IndexSearcher("c:/temp/lucene/index");
        Analyzer analyzer = new PaodingAnalyzer();

        String field = "content";
        String queryStr = "分词";

        QueryParser parser = new QueryParser(field, analyzer);
        Query query = parser.parse(queryStr);

        TopDocCollector collector = new TopDocCollector(10);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        // highlighter setup: wrap matched terms in red font tags,
        // and cut fragments of at most 200 characters
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(
                "<font color='red'>", "</font>");
        Highlighter highlight = new Highlighter(simpleHTMLFormatter,
                new QueryScorer(query));
        highlight.setTextFragmenter(new SimpleFragmenter(200));

        for (int i = 0; i < hits.length; i++) {
            System.out.println(hits[i].doc);
            System.out.println("---------------------------------------1");
            System.out.println(hits[i].score);
            System.out.println("---------------------------------------2");
            Document doc = searcher.doc(hits[i].doc);
            // System.out.println(doc.toString());
            System.out.println("---------------------------------------3");

            // re-analyze the stored content so the highlighter can locate hits
            TokenStream tokenStream = analyzer.tokenStream(
                    "content", new StringReader(doc.get("content")));
            System.out.println(highlight.getBestFragment(tokenStream,
                    doc.get("content")));
        }
    }
}
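With the formatter above, getBestFragment wraps every matched term in the font tags, so a document containing, say, "中文分词技术" would come back roughly as:

中文<font color='red'>分词</font>技术

Note that getBestFragment returns null when the query terms do not occur in the field, so real code should null-check before printing.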
Tags: lucene