Lucene入门例子
2014-07-08 12:30
288 查看
1.建立索引
2.查询
3.使用mmseg4j分词的例子
package org.senssic.lucene;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Builds a Lucene index from the text files found under a directory tree.
 *
 * <p>Requires JDK 7 or later (try-with-resources is used).
 *
 * @author senssic
 */
public class IndexFiles {

    /** Lower-case file name suffixes of the files that get indexed. */
    private static final String[] INDEXED_SUFFIXES = { ".txt", ".html" };

    /**
     * Accepts sub-directories (so the recursion can descend into them) and
     * regular files whose name ends with one of {@link #INDEXED_SUFFIXES}.
     */
    private static final FilenameFilter INDEXABLE_ENTRIES = new FilenameFilter() {
        @Override
        public boolean accept(File parentDir, String entryName) {
            if (new File(parentDir, entryName).isDirectory()) {
                return true;
            }
            String lowerName = entryName.toLowerCase();
            for (String suffix : INDEXED_SUFFIXES) {
                if (lowerName.endsWith(suffix)) {
                    return true;
                }
            }
            return false;
        }
    };

    private IndexFiles() {
    }

    /**
     * Indexes everything under {@code D:\LuceneIndex} into the index stored
     * at {@code D:\Index} and prints the elapsed time.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String indexPath = "D:\\Index"; // directory the index is written to
        String docsPath = "D:\\LuceneIndex"; // directory whose files are indexed
        boolean create = true; // true: discard any existing index and rebuild it
        final File docDir = new File(docsPath);
        long startMillis = System.currentTimeMillis();
        try {
            System.out.println("索引目录中 '" + indexPath + "'...");
            // The mmseg4j analyzer is used for tokenizing the file contents.
            Analyzer analyzer = new MMSegAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48,
                    analyzer);
            // CREATE replaces an existing index; CREATE_OR_APPEND adds to it.
            iwc.setOpenMode(create ? OpenMode.CREATE
                    : OpenMode.CREATE_OR_APPEND);
            // RAM buffer (in MB) used before buffered documents are flushed.
            iwc.setRAMBufferSizeMB(100);
            // Both resources are closed even when indexing fails part-way;
            // the writer is closed before the directory.
            try (Directory dir = FSDirectory.open(new File(indexPath));
                    IndexWriter writer = new IndexWriter(dir, iwc)) {
                indexDocs(writer, docDir);
            }
            long elapsedMillis = System.currentTimeMillis() - startMillis;
            System.out.println("总耗时\t" + elapsedMillis + "\t毫秒");
        } catch (IOException e) {
            System.out.println(" 异常: " + e.getClass() + "\n异常信息: "
                    + e.getMessage());
        }
    }

    /**
     * Recursively indexes {@code file}.
     *
     * <p>A directory is walked depth-first; only its sub-directories and its
     * files ending in {@code .txt} or {@code .html} are visited. A regular
     * file passed in directly is indexed whatever its name is. Unreadable
     * entries are skipped silently.
     *
     * @param writer the writer the documents are added to
     * @param file the file or directory to index
     * @throws IOException if a file cannot be opened or the index cannot be
     *         written
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String[] children = file.list(INDEXABLE_ENTRIES);
            // list() returns null when the directory cannot be listed.
            if (children != null) {
                for (String child : children) {
                    indexDocs(writer, new File(file, child));
                }
            }
            return;
        }

        Document doc = buildDocument(file, readContents(file));
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // Fresh index: no older copy of this document can exist.
            System.out.println("添加中 " + file);
            writer.addDocument(doc);
        } else {
            // Existing index: replace any document stored under the same path.
            System.out.println("更新中 " + file);
            writer.updateDocument(new Term("path", file.getPath()), doc);
        }
    }

    /**
     * Reads the whole file as text using the platform default charset. Every
     * line, including the last one, ends with {@code '\n'} in the result.
     */
    private static String readContents(File file) throws IOException {
        StringBuilder text = new StringBuilder();
        try (Scanner scanner = new Scanner(file)) {
            scanner.useDelimiter("\n");
            while (scanner.hasNext()) {
                text.append(scanner.next()).append('\n');
            }
        }
        return text.toString();
    }

    /**
     * Creates the Lucene document for one file. Every field is stored, so it
     * can be read back from search results.
     */
    private static Document buildDocument(File file, String contents) {
        Document doc = new Document();
        doc.add(new StringField("path", file.getPath(), Field.Store.YES));
        doc.add(new TextField("contents", contents, Field.Store.YES));
        doc.add(new StringField("lastmodified", new SimpleDateFormat(
                "yyyy-MM-dd HH:mm:ss").format(new Date(file.lastModified())),
                Field.Store.YES));
        doc.add(new StringField("filename", file.getName(), Field.Store.YES));
        float sizeInKb = (float) file.length() / (float) 1024;
        doc.add(new StringField("filelength", String.format("%.3f", sizeInKb)
                + "kB", Field.Store.YES));
        doc.add(new StringField("absolutepath", file.getAbsolutePath(),
                Field.Store.YES));
        return doc;
    }
}
2.查询
package org.senssic.lucene;

import java.io.File;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Runs a fixed query against the index stored at {@code D:\Index} and prints
 * the stored fields of up to 100 matching documents.
 */
public class SearchFiles {

    private SearchFiles() {
    }

    /**
     * Searches the {@code contents} field for a hard-coded query string.
     *
     * @param args ignored
     * @throws Exception if the index cannot be opened or read, or the query
     *         cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        String index = "D:\\Index";
        String queryString = "我爱你";

        // try-with-resources closes the reader even when parsing or
        // searching throws.
        try (IndexReader reader = DirectoryReader.open(FSDirectory
                .open(new File(index)))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // The query is tokenized with the mmseg4j analyzer.
            Analyzer analyzer = new MMSegAnalyzer();
            // "contents" is the default field for terms without a field prefix.
            QueryParser parser = new QueryParser(Version.LUCENE_48,
                    "contents", analyzer);
            Query query = parser.parse(queryString);
            System.out.println("查询内容: " + query.toString("contents"));

            Date start = new Date();
            TopDocs results = searcher.search(query, null, 100);
            ScoreDoc[] hits = results.scoreDocs;
            for (int rank = 1; rank <= hits.length; rank++) {
                Document document = searcher.doc(hits[rank - 1].doc);
                System.out.println("\n\n\n第" + rank + "个文件:");
                // The "内容:" label is printed without a value: the stored
                // "contents" field is not appended to the output.
                System.out.println("文件名称:" + document.get("filename")
                        + "\n路径:" + document.get("path")
                        + "\n绝对路径:" + document.get("absolutepath")
                        + "\n内容:"
                        + "\n最后修改时间:" + document.get("lastmodified")
                        + "\n文件大小:" + document.get("filelength"));
            }
            // The measured time covers the search and the printing above.
            Date end = new Date();
            System.out.println("\n\n\n耗时: "
                    + (end.getTime() - start.getTime()) + "ms");
            System.out.println(results.totalHits);
        }
    }
}
3.使用mmseg4j分词的例子
package org.senssic.lucene.util;

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Helper for inspecting the tokens an {@link Analyzer} produces.
 */
public class AnalyzerUtils {

    /**
     * Tokenizes {@code str} with {@code a} and prints one line per token in
     * the form {@code [term]posIncr:term[start-end]-->type}.
     *
     * <p>Any exception raised while tokenizing is printed to standard error
     * and not rethrown.
     *
     * @param str the text to analyze
     * @param a the analyzer to run
     */
    public static void displayAllTokenInfo(String str, Analyzer a) {
        // TokenStream workflow: reset() -> incrementToken()* -> end() ->
        // close(). try-with-resources performs the close().
        try (TokenStream stream = a.tokenStream("content", new StringReader(
                str))) {
            // Distance, in positions, between this token and the previous one.
            PositionIncrementAttribute pia = stream
                    .addAttribute(PositionIncrementAttribute.class);
            // Start and end character offsets of the token in the input.
            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
            // The text of the token.
            CharTermAttribute cta = stream
                    .addAttribute(CharTermAttribute.class);
            // The type the tokenizer assigned to the token.
            TypeAttribute ta = stream.addAttribute(TypeAttribute.class);

            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
                System.out.print(pia.getPositionIncrement() + ":");
                System.out.print(cta + "[" + oa.startOffset() + "-"
                        + oa.endOffset() + "]-->" + ta.type() + "\n");
            }
            stream.end();
        } catch (Exception e) {
            // Kept broad so that, as before, no exception reaches the caller.
            e.printStackTrace();
        }
    }

    /**
     * Prints the tokens the mmseg4j analyzer produces for a sample sentence.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        AnalyzerUtils.displayAllTokenInfo("我爱你中国", new MMSegAnalyzer());
    }
}
// Required JAR files (heading only; no list follows it in this text).
相关文章推荐
- Lucene-搜索的入门例子
- Lucene入门例子
- java lucene入门例子
- Lucene入门以及例子
- Lucene第一个入门学习例子
- Lucene入门例子
- Lucene的入门例子
- Lucene入门例子
- lucene索引创建与查询入门例子
- lucene快速入门---一个例子读懂
- Lucene入门,小例子,笔记
- lucene入门简单的例子
- Lucene的入门例子 - 创建索引,利用索引查询
- Lucene入门,小例子,笔记
- Lucene第一个入门学习例子
- Spring 入门(一个简单的例子)
- 通过例子学习Lua(5) ---- Lua与C交互入门 (转)
- ajax入门小例子以及一些xmlhttp的参考资料
- ajax入门——一些理解及例子
- 一个经典的ADO.NET入门例子