您的位置:首页 > 其它

Lucene 建立索引的不同方式

2015-04-16 20:44 232 查看
1.创建一个简单的索引:

package lia.meetlucene;

/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.FileReader;

// From chapter 1

/**
 * Builds a Lucene index from the files found directly inside one data directory.
 *
 * <p>Every accepted file becomes one Lucene document with three fields:
 * {@code contents} (the file text, supplied through a {@link FileReader}),
 * {@code filename} and {@code fullpath} (both stored and indexed without analysis).
 *
 * <p>This code was originally written for Erik's Lucene intro java.net article.
 */
public class Indexer {

    /**
     * Command-line entry point: indexes the data directory into the index directory
     * and prints how many files were indexed and how long it took.
     *
     * @param args {@code args[0]} is the index directory, {@code args[1]} is the data directory
     * @throws IllegalArgumentException if the number of arguments is not exactly two
     * @throws Exception if opening the index or indexing a file fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Indexer.class.getName() + " <index dir> <data dir>");
        }
        // Use the arguments the usage check above insists on, instead of
        // machine-specific hard-coded paths.
        String indexDir = args[0]; // 1
        String dataDir = args[1]; // 2

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            // Always close the writer, even when indexing fails part-way.
            indexer.close();
        }
        long end = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    private final IndexWriter writer;

    /**
     * Opens an {@link IndexWriter} on the given directory.
     *
     * <p>The writer is constructed with {@code create = true} and
     * {@code MaxFieldLength.UNLIMITED}, and analyzes text with
     * {@link SmartChineseAnalyzer}.
     *
     * @param indexDir path of the directory that holds the index
     * @throws IOException if the directory or the writer cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));

        writer = new IndexWriter(dir, // 3 create the Lucene IndexWriter
                new SmartChineseAnalyzer(Version.LUCENE_20), // 3
                true, // 3
                IndexWriter.MaxFieldLength.UNLIMITED); // 3
    }

    /**
     * Closes the underlying {@link IndexWriter}.
     *
     * @throws IOException if closing the writer fails
     */
    public void close() throws IOException {
        writer.close(); // 4 close the Lucene IndexWriter
    }

    /**
     * Indexes every regular, visible, readable file directly inside {@code dataDir}
     * that the filter accepts. Sub-directories are not descended into.
     *
     * @param dataDir path of the directory whose files are indexed
     * @param filter  decides which files are indexed; {@code null} accepts every file
     * @return the value of {@code writer.numDocs()} after indexing
     * @throws IOException if {@code dataDir} cannot be listed (for example because it
     *                     does not exist or is not a directory)
     * @throws Exception   if reading or indexing a file fails
     */
    public int index(String dataDir, FileFilter filter) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null rather than throwing when the path is not a
            // listable directory; fail with a message instead of a bare NPE.
            throw new IOException("Cannot list files in data directory: " + dataDir);
        }

        for (File f : files) {
            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()
                    && (filter == null || filter.accept(f))) {
                indexFile(f);
            }
        }

        return writer.numDocs(); // 5 return the number of documents indexed
    }

    /**
     * Accepts files whose name ends with {@code .xml}, ignoring case.
     *
     * <p>NOTE(review): the class name suggests text files, but the check is for
     * {@code .xml} - confirm which extension is intended.
     */
    private static class TextFilesFilter implements FileFilter {
        @Override
        public boolean accept(File path) {
            return path.getName().toLowerCase() // 6 index .xml files only, using FileFilter
                    .endsWith(".xml"); // 6
        }
    }

    /**
     * Builds the Lucene document for one file.
     *
     * <p>Field.Store controls whether the original value is kept in the index (YES / NO).
     * Field.Index controls indexing: ANALYZED tokenizes the value before indexing,
     * NOT_ANALYZED indexes the value as-is without tokenizing, NO does not index it.
     *
     * @param f the file to turn into a document
     * @return a document with {@code contents}, {@code filename} and {@code fullpath} fields
     * @throws Exception if the file cannot be opened or its canonical path cannot be resolved
     */
    protected Document getDocument(File f) throws Exception {
        Document doc = new Document();
        // NOTE(review): FileReader decodes with the platform default charset (on Java
        // versions before 18); confirm this matches the encoding of the data files.
        doc.add(new Field("contents", new FileReader(f))); // 7 index file content
        doc.add(new Field("filename", f.getName(), // 8 index file name
                Field.Store.YES, Field.Index.NOT_ANALYZED)); // 8
        doc.add(new Field("fullpath", f.getCanonicalPath(), // 9 index file full path
                Field.Store.YES, Field.Index.NOT_ANALYZED)); // 9
        return doc;
    }

    /**
     * Prints the file's canonical path and adds its document to the index.
     *
     * @param f the file to index
     * @throws Exception if building or adding the document fails
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc); // 10 add the document to the Lucene index
    }
}

/*
* #1 Create index in this directory #2 Index *.xml files from this directory #3
* Create Lucene IndexWriter #4 Close IndexWriter #5 Return number of documents
* indexed #6 Index .xml files only, using FileFilter #7 Index file content #8
* Index file name #9 Index file full path #10 Add document to Lucene index
*/


View Code
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: