您的位置:首页 > 其它

lucene创建索引,分组查询

2013-05-29 14:26 302 查看
package test.lucene;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopDocsCollector;

/**
 * A collector decorator that forwards every hit to a wrapped {@link Collector}
 * while tallying, per collected document, the value of a pre-cached field into
 * a {@link GroupField} — so a single search pass yields both the top hits and
 * per-value group (facet) counts.
 *
 * <p>NOTE(review): {@code super(null)} hands {@link TopDocsCollector} a null
 * priority queue; this class only extends it so it can be passed where a
 * {@code TopDocsCollector} is expected. The inherited result methods (e.g.
 * {@code topDocs()}, {@code getTotalHits()}) must NEVER be called on this
 * instance — call them on the wrapped collector instead, as the accompanying
 * search code does.
 */
@SuppressWarnings("unchecked")
public class GroupCollector extends TopDocsCollector {

    /** Delegate that receives every collected doc (e.g. a TopScoreDocCollector). */
    private final Collector collector;
    /** Doc-id offset of the index segment currently being collected. */
    private int docBase;

    /** Field cache: fc[globalDocId] -> field value to group by. */
    private String[] fc;
    /** Accumulates the count of documents per distinct field value. */
    private final GroupField gf = new GroupField();

    /**
     * @param topDocsCollector collector that gathers the actual hits
     * @param fieldCache       per-document field values, indexed by global doc id
     */
    GroupCollector(Collector topDocsCollector, String[] fieldCache)
            throws IOException {
        super(null); // no priority queue: inherited topDocs()/getTotalHits() unusable
        this.collector = topDocsCollector;
        this.fc = fieldCache;
    }

    @Override
    public void collect(int doc) throws IOException {
        collector.collect(doc);
        // 'doc' is segment-relative; add docBase to get the global doc id
        // that indexes into the field cache.
        int docId = doc + docBase;
        // GroupField counts how many documents carry each distinct value.
        gf.addValue(fc[docId]);
    }

    @Override
    public void setNextReader(IndexReader reader, int docBase)
            throws IOException {
        collector.setNextReader(reader, docBase);
        this.docBase = docBase;
    }

    @Override
    public void setScorer(Scorer scorer) throws IOException {
        collector.setScorer(scorer);
    }

    @Override
    public boolean acceptsDocsOutOfOrder() {
        return collector.acceptsDocsOutOfOrder();
    }

    /** Replaces the field cache used for grouping. */
    public void setFc(String[] fc) {
        this.fc = fc;
    }

    /** Returns the accumulated per-value group counts. */
    public GroupField getGroupField() {
        return gf;
    }
}
package test.lucene;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Holds the grouping (facet) result for a single field: every distinct value
 * seen, plus how many documents carried each value.
 */
public class GroupField {

    /** Field name. */
    private String name;

    /** Distinct values; sorted by descending document count in getValues(). */
    private List<String> values = new ArrayList<String>();

    /** Maps each field value to the number of documents containing it. */
    private Map<String, Integer> countMap = new HashMap<String, Integer>();

    public Map<String, Integer> getCountMap() {
        return countMap;
    }

    public void setCountMap(Map<String, Integer> countMap) {
        this.countMap = countMap;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /** Returns the distinct values, most frequent first. */
    public List<String> getValues() {
        Collections.sort(values, new ValueComparator());
        return values;
    }

    public void setValues(List<String> values) {
        this.values = values;
    }

    /**
     * Registers one occurrence of {@code value}. Multi-valued fields are
     * supported by splitting on single spaces.
     *
     * <p>BUG FIX: consecutive (or leading) spaces previously produced empty
     * tokens from {@code split(" ")} that were counted as a bogus {@code ""}
     * group; blank tokens are now skipped.
     */
    public void addValue(String value) {
        if (value == null || "".equals(value)) {
            return;
        }
        for (String token : value.split(" ")) {
            if (token.length() == 0) {
                continue; // guard against consecutive/leading spaces
            }
            Integer current = countMap.get(token);
            if (current == null) {
                countMap.put(token, 1);
                values.add(token);
            } else {
                countMap.put(token, current + 1);
            }
        }
    }

    /** Orders values by descending document count. */
    class ValueComparator implements Comparator<String> {
        public int compare(String value0, String value1) {
            // Descending: compare value1's count against value0's.
            return countMap.get(value1).compareTo(countMap.get(value0));
        }
    }
}
package test.lucene;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

/**
 * Demo driver: builds a test index and runs a match-all query that returns a
 * page of hits plus per-value group counts for the "fenlei" field.
 */
public class Lucene {

    /** Index directory on disk. */
    String path = "D:\\index";
    // NOTE(review): this field is never used; WriteIndex() hard-codes
    // LUCENE_30 for the analyzer — confirm which version is intended.
    Version version = Version.LUCENE_29;
    public static final String COLUMN = "cid";

    /**
     * Runs a match-all search, prints the group counts for "fenlei", the
     * total hit count, and the requested page of results.
     *
     * @param pageNO   1-based page number
     * @param pageSize number of hits per page
     */
    @SuppressWarnings({ "deprecation", "unchecked" })
    public void search(int pageNO, int pageSize) throws ParseException {
        IndexReader reader = null;
        Searcher searcher = null;
        try {
            int start = (pageNO - 1) * pageSize;
            // Must collect at least this many hits to slice out the page.
            int topCount = pageSize * pageNO;
            reader = IndexReader.open(FSDirectory.open(new File(path)), true);
            searcher = new IndexSearcher(reader);
            // BUG FIX: was create(0, false), which collects zero documents,
            // so the paging loop below never printed anything; the computed
            // topCount sat unused. Use it as the collector capacity.
            TopDocsCollector collector = TopScoreDocCollector.create(topCount, false);
            // Preload every document's "fenlei" value into the field cache.
            final String[] fc = FieldCache.DEFAULT.getStrings(reader, "fenlei");
            // Custom collector: gathers top hits AND per-value group counts.
            GroupCollector groupCollector = new GroupCollector(collector, fc);
            searcher.search(new MatchAllDocsQuery(), groupCollector);
            // Grouping (facet) statistics accumulated during collection.
            GroupField gf = groupCollector.getGroupField();
            System.out.println("分组信息");
            List<String> values = gf.getValues();
            for (String value : values) {
                System.out.println(value + "=" + gf.getCountMap().get(value));
            }
            // Totals/page come from the wrapped collector, not groupCollector
            // (whose inherited TopDocsCollector methods are unusable).
            int totalHits = collector.getTotalHits();
            System.out.println("总数:" + totalHits);
            System.out.println("分页结果");
            ScoreDoc[] scoreDocs = collector.topDocs(start, pageSize).scoreDocs;
            for (int i = 0; i < scoreDocs.length; i++) {
                int docId = scoreDocs[i].doc;
                Document doc = reader.document(docId);
                System.out.println("id:" + doc.get("id") + " fenlei:"
                        + doc.get("fenlei") + " title:" + doc.get("title"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // FIX: the searcher/reader were previously never closed (leak).
            try {
                if (searcher != null) {
                    searcher.close();
                }
            } catch (IOException ignored) {
                // best-effort close
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ignored) {
                // best-effort close
            }
        }
    }

    /**
     * Builds the test index: one million docs split into groups of 75000 via
     * the "fenlei" field. (Name kept as-is for caller compatibility.)
     */
    public void WriteIndex() throws CorruptIndexException,
            LockObtainFailedException, IOException {
        long start = System.currentTimeMillis(); // was a boxed Long
        // Analyzer for the "title" field (id/fenlei are NOT_ANALYZED).
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)),
                analyzer, MaxFieldLength.LIMITED);
        try {
            writer.setMaxBufferedDocs(2048);
            writer.setRAMBufferSizeMB(256);
            int count = 0;
            String title = "中国人民  测试数据";
            String fenlei = "分类";
            int max = 1000000;
            int groupMax = 75000;
            for (int i = 0; i < max; i++) {
                // Start a new group every groupMax documents.
                if (i % groupMax == 0) {
                    count++;
                    System.out.println(i);
                }
                Document document = new Document();
                Field idField = new Field("id", Integer.toString(i + 1), Store.YES,
                        Index.NOT_ANALYZED);
                Field titleField = new Field("title", title + (i + 1), Store.YES,
                        Index.ANALYZED);
                Field fenleiField = new Field("fenlei", fenlei + count, Store.YES,
                        Index.NOT_ANALYZED);
                document.add(idField);
                document.add(titleField);
                document.add(fenleiField);
                writer.addDocument(document);
            }
            writer.commit();
            writer.optimize();
        } finally {
            // FIX: ensure the writer (and its index lock) is released even
            // if indexing throws.
            writer.close();
        }
        long time = System.currentTimeMillis() - start;
        System.out.println("创建索引所用时间为:" + time + "毫秒");
    }

    public static void main(String[] args) throws CorruptIndexException,
            IOException, ParseException {
        Lucene test = new Lucene();
        // Build the index first (one-off step):
        // test.WriteIndex();
        // Then search it.
        int pageNO = 14000, pageSize = 20;
        test.search(pageNO, pageSize);
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: