您的位置:首页 > 数据库

利用lucene对整个数据库建立索引

2013-03-12 21:25 369 查看

利用lucene对整个数据库建立索引

导言:
如果要对整个数据库做精确查询或模糊查询,我们怎么才可以做到?还是通过SQL查询吗?答案是否定的。因为,通过SQL对整个数据库做精确查询或模糊查询,速度将非常的慢;
lucene解决了这个问题。通过对表或者文本文件预先建立索引,可以很快的实现全文检索。
思路:
1、通过SQL得到所有表名的集合---->2、遍历所有的表,分别为每个表的每个记录建立索引;同时添加表的中文名以及表的说明的索引---->按Writer\analyzer\document\field的循序建索引。
IndexerDB类:
package com.model;

import java.io.File;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.util.Date;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class IndexerDB {
// 保存索引文件的地方
private static String INDEX_DIR = "F:\\MyLuceneDB2\\LuceneFileIndexDir";
private String index_dir;
private File file = new File(INDEX_DIR);
// 将要搜索TXT文件的地方
private String data_dir;
private String DATA_DIR = "F:\\Lucene";
private String entityName;
// private ApiInfo apiInfo;
// private ApiInfoQuery apiInfoQuery = new ApiInfoQuery();
// private ApiInfoService apiInfoService;
// private List<ApiIndicator> apiIndicators = new ArrayList<ApiIndicator>();
// private ApiIndicator apiIndicator;
private StringBuilder newsb = new StringBuilder();
private ResultSet rs = null;
private ResultSet tempRs = null;
private ResultSetMetaData rsmd = null;
// private SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(
// Version.LUCENE_30, true);
private Document doc;
private String tableName;

public static void main(String[] args) throws Exception {
IndexerDB indexDB = new IndexerDB();
File file = new File(INDEX_DIR);
// 1
indexDB.execute(file);
}

// 执行对数据库的索引化
// @param file
// @return
// @throws Exception
//
public String execute(File file) throws Exception {
long start = new Date().getTime();
// 2
int numIndexed = getAllTableNameFromDBAndIndexing(file);
long end = new Date().getTime();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");
return "success";
}

// 查询所有的表,并遍历所有的表;
// 调用getDataFromTable(tableName)获取表的记录;
// 调用indexData(writer, rs, tableName)对单张表做索引
public int getAllTableNameFromDBAndIndexing(File file) throws Exception {
ResultSet rs = null;
String sql1 = null;
String sql2 = null;
sql1 = "select [name] from [sysobjects] where [type] = 'u' order by [name]";
sql2 = "show tables";
SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(
Version.LUCENE_36, true);
IndexWriter writer = new IndexWriter(FSDirectory.open(file), analyzer,
true, IndexWriter.MaxFieldLength.LIMITED);
try {
rs = JDBCUtil.execute(sql1);
} catch (Exception e) {
rs = JDBCUtil.execute(sql2);
}
while (rs.next()) {

tableName = rs.getString(1);
System.out.println("tableName: " + rs.getString(1));
// ResultSetMetaData rsmd = rs.getMetaData();
ResultSet tempRs = getDataFromTable(tableName);
System.out.println("already get data of table");
indexData(writer, tempRs, tableName);
}
int numIndexed = writer.numDocs();
writer.optimize();
writer.close();
return numIndexed;
}

// 取得表的数据
public ResultSet getDataFromTable(String tableName) throws Exception {
String sql = "select * from " + tableName;
System.out.println("" + sql);
// String sql = "select * from " + "HK_STKCODE";
return JDBCUtil.execute(sql);
}

// 对特定表的记录,采用特定writer,索引
// 该方法是lucene的indexer的关键方法
// writer---->document---->field
private void indexData(IndexWriter writer, ResultSet rs, String tableName)
throws Exception {
if (rs == null) {
return;
}
// 取得实体名
// System.out.println("tableName: " + tableName);
entityName = getEntityName(tableName);
System.out.println(" .entityName: " + entityName);
while (rs.next()) {
doc = new Document();
rsmd = rs.getMetaData();
int colsNum = rsmd.getColumnCount();
// System.out.println("colsNum: "+colsNum);
for (int i = 1; i < colsNum + 1; i++) {
String columnName = rsmd.getColumnName(i);
// System.out.println("columnName: "+columnName);
// System.out.println(" rs.getString(i): "+ rs.getString(i));
// 1、对该条记录的第一个字段进行索引
doc.add(new Field(columnName, (rs.getString(i) == null ? ""
: rs.getString(i)), Field.Store.YES,
Field.Index.ANALYZED));
}
// 2、在此处添加,表的说明(description);来至于apiinfo;根据表对应的实体名,找到表对应的apiinfo实体
// apiInfoQuery.setEntityname(entityName);
// apiInfo = apiInfoService.gainApiInfoByEntityName(entityName);
// if (apiInfo != null) {
// doc.add(new Field("apiDesc", apiInfo.toString(), Field.Store.YES,
// Field.Index.ANALYZED));
// }
// 3、在此处添加,表的中英文字段:
// 来自于apiindicator;根据表对应的实体名,去APIINFO中取找apiid,再去apiindicator中找apiindicator的实体
// List<ApiIndicator> apiIndicators = apiInfoService
// .gainApiIndicatorsByApiId(apiInfo.getId());
// // 输出指标和描述结合为字符串
// for (int i = 0; i < apiIndicators.size(); i++) {
// apiIndicator = apiIndicators.get(i);
// newsb.append(apiIndicator.getCnname() + " ")
// .append(apiIndicator.getEnname() + " ")
// .append(apiIndicator.getDescription() + "");
// newsb.append("\n");
// }
// System.out.println("newsb.toString(): " + newsb.toString());
// doc.add(new Field("outputFields", newsb.toString(),
// Field.Store.YES, Field.Index.ANALYZED));
// 4、文件名
doc.add(new Field("apiName", entityName, Field.Store.YES,
Field.Index.ANALYZED));
writer.addDocument(doc);
}
}

// 将表名通过字符串处理成为实体名
public String getEntityName(String tableName) {
// tableName = tableName.substring(0, tableName.indexOf(".txt"));
StringBuilder sb = new StringBuilder();
// 将"_"替换掉,如果有的话,以便处理成与API对应的实体一样的字符串。如:PUB_SEITGC.txt,RSH_RSHRPT_INDFO.txt
if (tableName.indexOf("_") != -1) {
String[] subStrings = tableName.split("_");
for (int i = 0; i < subStrings.length; i++) {
sb.append(subStrings[i]);
}
return sb.toString();
} else {
return tableName;
}
}

public String getIndex_dir() {
return index_dir;
}

public void setIndex_dir(String index_dir) {
this.index_dir = index_dir;
}

public String getData_dir() {
return data_dir;
}

public void setData_dir(String data_dir) {
this.data_dir = data_dir;
}
// public ApiInfoService getApiInfoService() {
// return apiInfoService;
// }
//
// public void setApiInfoService(ApiInfoService apiInfoService) {
// this.apiInfoService = apiInfoService;
// }
}


JDBCUtil类:

// 数据库查询类
//JDBCUtil用于取得连接;JDBCUtil.execute(sql)用于执行SQL,并返回resultset.
package com.model;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class JDBCUtil {
public static Connection conn = null;

// 建立连接
static void getConntion() {
try {
String driver_class = "com.mysql.jdbc.Driver";
// ?useUnicode=true&characterEncoding=utf-8
String connection_url = "jdbc:mysql://localhost:3306/sy001?useUnicode=true&characterEncoding=UTF-8";
String user_name = "root";
String db_password = "0882115303";
Class.forName(driver_class);
conn = DriverManager.getConnection(connection_url, user_name,
db_password);
conn.setAutoCommit(false);
} catch (Exception e) {
e.printStackTrace();
}
}

// 查询后,得到数据
public static ResultSet execute(String sql) throws Exception {
// 取得连接
getConntion();
// 写SQL
// 得到一个statment对象
Statement stmt = conn.createStatement();
// 得到一个结果集
return stmt.executeQuery(sql);
}

static void close() {
if (conn != null) {
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}

public static Connection getConn() {
return conn;
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: