lucene创建索引,分组查询
2013-05-29 14:26
302 查看
package test.lucene;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopDocsCollector;

/**
 * A decorating collector: every hit is forwarded to a wrapped {@link Collector}
 * (which does the normal top-docs scoring work), while this class additionally
 * tallies the hit's grouping-field value into a {@link GroupField} so that
 * per-value document counts are available after the search.
 */
@SuppressWarnings("unchecked")
public class GroupCollector extends TopDocsCollector {

    /** The wrapped collector every callback is delegated to. */
    Collector collector;

    /** Doc-id offset of the segment currently being collected. */
    int docBase;

    /** Per-document grouping-field values, preloaded from the FieldCache. */
    private String[] fc;

    /** Accumulates the document count for each distinct field value. */
    private GroupField groupField = new GroupField();

    /**
     * @param topDocsCollector the real collector to delegate to
     * @param fieldCache       per-document values of the grouping field,
     *                         indexed by global document id
     */
    GroupCollector(Collector topDocsCollector, String[] fieldCache) throws IOException {
        // NOTE(review): a null PriorityQueue is passed to the superclass, so the
        // inherited topDocs()/getTotalHits() presumably must never be called on
        // this instance -- callers appear to use the wrapped collector for that.
        super(null);
        collector = topDocsCollector;
        this.fc = fieldCache;
    }

    @Override
    public void collect(int doc) throws IOException {
        collector.collect(doc);
        // doc is segment-relative; adding docBase yields the global doc id,
        // which is what the FieldCache array is indexed by.
        groupField.addValue(fc[doc + docBase]);
    }

    @Override
    public void setNextReader(IndexReader reader, int docBase) throws IOException {
        collector.setNextReader(reader, docBase);
        this.docBase = docBase;
    }

    @Override
    public void setScorer(Scorer scorer) throws IOException {
        collector.setScorer(scorer);
    }

    @Override
    public boolean acceptsDocsOutOfOrder() {
        return collector.acceptsDocsOutOfOrder();
    }

    public void setFc(String[] fc) {
        this.fc = fc;
    }

    /** @return the accumulated per-value group counts */
    public GroupField getGroupField() {
        return groupField;
    }
}
package test.lucene;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Holds the grouping (facet) result for one field: each distinct value that
 * occurred, and the number of documents carrying that value.
 */
public class GroupField {

    /** Name of the grouped field. */
    private String name;

    /** Distinct values seen; {@link #getValues()} sorts them by count, descending. */
    private List<String> values = new ArrayList<String>();

    /** Maps each distinct value to the number of documents containing it. */
    private Map<String, Integer> countMap = new HashMap<String, Integer>();

    public Map<String, Integer> getCountMap() {
        return countMap;
    }

    public void setCountMap(Map<String, Integer> countMap) {
        this.countMap = countMap;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the distinct values, sorted by descending document count
     */
    public List<String> getValues() {
        Collections.sort(values, new ValueComparator());
        return values;
    }

    public void setValues(List<String> values) {
        this.values = values;
    }

    /**
     * Counts one document's value. Multi-valued fields may pass several values
     * joined by single spaces; each token is counted separately.
     *
     * @param value the field value of one document; null/empty is ignored
     */
    public void addValue(String value) {
        if (value == null || "".equals(value)) {
            return;
        }
        // Multi-valued support: split on spaces.
        for (String token : value.split(" ")) {
            // Consecutive spaces make split() emit "" tokens; skip them so an
            // empty string is never counted as a group of its own.
            if ("".equals(token)) {
                continue;
            }
            Integer count = countMap.get(token);
            if (count == null) {
                countMap.put(token, 1);
                values.add(token);
            } else {
                countMap.put(token, count + 1);
            }
        }
    }

    /** Orders values by document count, highest first. */
    class ValueComparator implements Comparator<String> {
        public int compare(String value0, String value1) {
            // compareTo on the boxed counts is overflow-safe (no subtraction)
            // and works on Java 5; reversed operands give descending order.
            return countMap.get(value1).compareTo(countMap.get(value0));
        }
    }
}
package test.lucene;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

/**
 * Demo driver: builds a test index and runs a match-all query with grouped
 * (faceted) counts on the "fenlei" field plus paged result output.
 */
public class Lucene {

    String path = "D:\\index";
    // NOTE(review): this field says LUCENE_29 but WriteIndex() builds the index
    // with a LUCENE_30 analyzer -- confirm which version is intended.
    Version version = Version.LUCENE_29;
    public static final String COLUMN = "cid";

    /**
     * Runs a MatchAllDocsQuery, prints per-"fenlei" group counts, the total hit
     * count, and the requested page of results.
     *
     * @param pageNO   1-based page number
     * @param pageSize number of documents per page
     */
    @SuppressWarnings({ "deprecation", "unchecked" })
    public void search(int pageNO, int pageSize) throws ParseException {
        IndexReader reader = null;
        Searcher searcher = null;
        try {
            int start = (pageNO - 1) * pageSize;
            // The collector must be able to retain every hit up to the end of
            // the requested page; creating it with capacity 0 would make
            // topDocs(start, pageSize) return an empty page.
            int topCount = pageSize * pageNO;
            reader = IndexReader.open(FSDirectory.open(new File(path)), true);
            searcher = new IndexSearcher(reader);
            TopDocsCollector collector = TopScoreDocCollector.create(topCount, false);
            // Preload every document's "fenlei" value from the FieldCache.
            final String[] fc = FieldCache.DEFAULT.getStrings(reader, "fenlei");
            // GroupCollector delegates scoring to `collector` while tallying
            // the per-value counts used for grouping.
            GroupCollector groupCollector = new GroupCollector(collector, fc);
            searcher.search(new MatchAllDocsQuery(), groupCollector);
            // GroupField holds the accumulated group statistics.
            GroupField gf = groupCollector.getGroupField();
            System.out.println("分组信息");
            List<String> values = gf.getValues();
            for (String value : values) {
                System.out.println(value + "=" + gf.getCountMap().get(value));
            }
            // Total hit count of the search.
            int totalHits = collector.getTotalHits();
            System.out.println("总数:" + totalHits);
            System.out.println("分页结果");
            // Fetch the paged slice of results.
            ScoreDoc[] scoreDocs = collector.topDocs(start, pageSize).scoreDocs;
            for (int i = 0; i < scoreDocs.length; i++) {
                int docId = scoreDocs[i].doc;
                Document doc = reader.document(docId);
                System.out.println("id:" + doc.get("id") + " fenlei:"
                        + doc.get("fenlei") + " title:" + doc.get("title"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close searcher and reader even when the search fails.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Builds a test index of {@code max} documents; every {@code groupMax}
     * documents share one "fenlei" (category) value, so grouped search has
     * something to count.
     */
    public void WriteIndex() throws CorruptIndexException,
            LockObtainFailedException, IOException {
        Long start = System.currentTimeMillis();
        // Analyzer used for the "title" field.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)),
                analyzer, MaxFieldLength.LIMITED);
        writer.setMaxBufferedDocs(2048);
        writer.setRAMBufferSizeMB(256);
        int count = 0;
        String title = "中国人民 测试数据";
        String fenlei = "分类";
        // Generate the test data and index it.
        int max = 1000000;
        int groupMax = 75000;
        for (int i = 0; i < max; i++) {
            if (i % groupMax == 0) {
                // Start a new category every groupMax documents.
                count++;
                System.out.println(i);
            }
            Document document = new Document();
            Field idField = new Field("id", Integer.toString(i + 1), Store.YES,
                    Index.NOT_ANALYZED);
            Field titleField = new Field("title", title + (i + 1), Store.YES,
                    Index.ANALYZED);
            Field fenleiField = new Field("fenlei", fenlei + count, Store.YES,
                    Index.NOT_ANALYZED);
            document.add(idField);
            document.add(titleField);
            document.add(fenleiField);
            writer.addDocument(document);
        }
        writer.commit();
        writer.optimize();
        writer.close();
        Long time = System.currentTimeMillis() - start;
        System.out.println("创建索引所用时间为:" + time + "毫秒");
    }

    public static void main(String[] args) throws CorruptIndexException,
            IOException, ParseException {
        Lucene test = new Lucene();
        // Build the index (run once, then comment out).
        //test.WriteIndex();
        // Search the index.
        int pageNO = 14000, pageSize = 20;
        test.search(pageNO, pageSize);
    }
}
相关文章推荐
- solr入门之lucene创建索引和查询索引及查询的源码读取类确定
- Lucene学习笔记(1)-索引创建和简单的查询
- lucene5学习 - 索引基本操作(创建,查询,更新,删除,分页)
- Lucene索引创建、查询与高亮
- Lucene的入门例子 - 创建索引,利用索引查询
- lucene索引创建与查询入门例子
- Lucene多线程创建索引及多目录下查询索引
- lucene4.8.0 + IKAnalyzer5.0.1 创建索引与查询demo
- lucene-5.1.0 索引的创建与查询 demo
- java分组批量执行,发短信、创建Lucene索引(应用场景)
- lucene 索引创建查询
- Lucene学习一:入门级Demo,创建索引和查询高亮显示
- Lucene学习之一:使用lucene为数据库表创建索引,并按关键字查询
- lucene4.3简单创建和查询索引实例
- 创建通用的分组索引查询
- Lucene学习-创建索引、关键词查询
- 使用org.apache.lucene创建和查询索引核心代码详解
- 使用org.apache.lucene创建和查询索引核心代码详解
- lucene 创建索引 查询实例
- Lucene创建、查询、删除、更新 索引