利用Lucene.net对附件做搜索(转载)
2011-03-08 03:11
330 查看
最近研究了一个全文搜索组件 Lucene.net,是很有名的开源组件(另有 Java 版本)。其实谈不上研究,只是以前客户有个需求:要能搜索上传的文件(如 Word、Excel、TXT 等)。项目中这些附件都存在一个 image 字段中,一直没有办法搜索。本文就讲一下如何利用 Lucene.net 对附件做搜索功能,并且利用 COM 组件来读取 Office 文件的内容。
介绍一下 Lucene.net 的使用。这里使用了 Lucene.Net.dll 2.1、Highlighter.Net.dll 2.0(高亮)和 Lucene.Net.Analysis.Cn.dll 1.3(分词引擎):
1 添加索引
#region 利用com组件读取office
/// <summary>
/// Verifies that the given path refers to an existing file.
/// </summary>
/// <param name="pFileName">Path of the file to check.</param>
/// <exception cref="ApplicationException">
/// Thrown when <c>File.Exists</c> returns false for <paramref name="pFileName"/>.
/// </exception>
private void IsExists(string pFileName) {
    bool found = File.Exists(pFileName);
    if (found) {
        // Nothing to report; the caller may proceed.
        return;
    }
    throw new ApplicationException("指定目录下的无该文件");
}
/// <summary>
/// Returns the text content of a Word document, read through Word COM automation.
/// </summary>
/// <param name="docFileName">Path of the Word file to read.</param>
/// <returns>The value of <c>doc.Content.Text</c> for the opened document.</returns>
/// <exception cref="ApplicationException">Thrown by <c>IsExists</c> when the file is missing.</exception>
public string Doc2Text(string docFileName) {
    IsExists(docFileName);
    // Start the Word COM application.
    Word.ApplicationClass wordApp = new Word.ApplicationClass();
    object fileobj = docFileName;
    object nullobj = System.Reflection.Missing.Value;
    Word.Document doc = null;
    string outText;
    try
    {
        // Open the file. The number of parameters differs between COM versions;
        // everything except the file name is passed as Missing.Value.
        doc = wordApp.Documents.Open(ref fileobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj, ref nullobj);
        outText = doc.Content.Text;
    }
    finally
    {
        // Cleanup runs in finally so the Word instance started above is shut
        // down even when opening or reading the document throws.
        try
        {
            if (doc != null)
            {
                doc.Close(ref nullobj, ref nullobj, ref nullobj);
            }
        }
        finally
        {
            // Quit must still run if Close itself fails.
            wordApp.Quit(ref nullobj, ref nullobj, ref nullobj);
            // NOTE(review): GC.Collect() is kept from the original code; presumably
            // it is meant to prompt release of the COM wrappers - confirm.
            GC.Collect();
        }
    }
    return outText;
}
/// <summary>
/// Returns the text content of an Excel workbook, read through Excel COM automation.
/// Every non-null cell value of every worksheet's used range is appended,
/// each followed by a "|" separator.
/// </summary>
/// <param name="xlsFileName">Path of the Excel file to read.</param>
/// <returns>The concatenated cell values; empty when no cell holds a value.</returns>
/// <exception cref="ApplicationException">Thrown by <c>IsExists</c> when the file is missing.</exception>
public string Xls2Text(string xlsFileName) {
    IsExists(xlsFileName);
    Excel.Application xlsApp = new Excel.ApplicationClass();
    object nullobj = System.Reflection.Missing.Value;
    Excel.Workbook excel = null;
    StringBuilder builder = new StringBuilder();
    try
    {
        // Open the workbook inside the try block so a failed open still reaches
        // the cleanup below.
        excel = xlsApp.Workbooks.Open(xlsFileName, nullobj,
            nullobj, nullobj, nullobj,
            nullobj, nullobj, nullobj,
            nullobj, nullobj, nullobj,
            nullobj, nullobj, nullobj,
            nullobj);
        int sheetCount = excel.Worksheets.Count;
        // Worksheets are indexed from 1.
        for (int k = 1; k <= sheetCount; k++)
        {
            Excel.Worksheet ews = (Excel.Worksheet)excel.Worksheets[k];
            // Read the used range once per sheet instead of once per cell;
            // every Value2 access is a separate call into Excel.
            object raw = ews.UsedRange.Value2;
            if (raw == null)
            {
                continue;
            }
            object[,] cells = raw as object[,];
            if (cells == null)
            {
                // Value2 is a bare scalar (not an array) when the used range is
                // a single cell; the original cast to object[,] threw here.
                builder.Append(raw).Append("|");
                continue;
            }
            // Use the array's own bounds rather than assuming a fixed base index.
            for (int i = cells.GetLowerBound(0); i <= cells.GetUpperBound(0); i++)
            {
                for (int j = cells.GetLowerBound(1); j <= cells.GetUpperBound(1); j++)
                {
                    object cell = cells[i, j];
                    if (cell != null)
                    {
                        builder.Append(cell).Append("|");
                    }
                }
            }
        }
    }
    finally
    {
        // No catch block: exceptions propagate with their original stack trace
        // (the former "throw ex;" reset it).
        try
        {
            if (excel != null)
            {
                excel.Close(nullobj, nullobj, nullobj);
            }
        }
        finally
        {
            // Quit must still run if Close itself fails.
            xlsApp.Quit();
            // NOTE(review): GC.Collect() is kept from the original code; presumably
            // it is meant to prompt release of the COM wrappers - confirm.
            GC.Collect();
        }
    }
    return builder.ToString();
}
/// <summary>
/// Returns the text content of a PowerPoint presentation, read through
/// PowerPoint COM automation. The text of every shape that has text is
/// appended in slide/shape order with no separator.
/// </summary>
/// <param name="pptFileName">Path of the PowerPoint file to read.</param>
/// <returns>The concatenated shape text; empty when no shape holds text.</returns>
/// <exception cref="ApplicationException">Thrown by <c>IsExists</c> when the file is missing.</exception>
public string Ppt2Text(string pptFileName) {
    IsExists(pptFileName);
    PowerPoint.Application pptApp = new PowerPoint.ApplicationClass();
    PowerPoint.Presentation ppt = null;
    StringBuilder builder = new StringBuilder();
    try
    {
        // Open the presentation inside the try block so a failed open still
        // reaches the cleanup below.
        ppt = pptApp.Presentations.Open(pptFileName,
            Microsoft.Office.Core.MsoTriState.msoTrue,
            Microsoft.Office.Core.MsoTriState.msoFalse,
            Microsoft.Office.Core.MsoTriState.msoFalse);
        foreach (PowerPoint.Slide slide in ppt.Slides)
        {
            foreach (PowerPoint.Shape shape in slide.Shapes)
            {
                // Not every shape has a text frame; check HasTextFrame before
                // touching TextFrame so such shapes are skipped instead of failing.
                if (shape.HasTextFrame != Microsoft.Office.Core.MsoTriState.msoTrue)
                {
                    continue;
                }
                if (shape.TextFrame.HasText == Microsoft.Office.Core.MsoTriState.msoTrue)
                {
                    builder.Append(shape.TextFrame.TextRange.Text);
                }
            }
        }
    }
    finally
    {
        // No catch block: exceptions propagate with their original stack trace
        // (the former "throw ex;" reset it).
        try
        {
            if (ppt != null)
            {
                ppt.Close();
            }
        }
        finally
        {
            // Quit must still run if Close itself fails.
            pptApp.Quit();
            // NOTE(review): GC.Collect() is kept from the original code; presumably
            // it is meant to prompt release of the COM wrappers - confirm.
            GC.Collect();
        }
    }
    return builder.ToString();
}
#endregion
最后看一下 Demo 的界面:
![](http://images.cnblogs.com/cnblogs_com/qiba78/lucene.jpg)
Demo源码下载
介绍一下 Lucene.net 的使用。这里使用了 Lucene.Net.dll 2.1、Highlighter.Net.dll 2.0(高亮)和 Lucene.Net.Analysis.Cn.dll 1.3(分词引擎):
1 添加索引
#region 利用com组件读取office
/// <summary>
/// Verifies that the given path refers to an existing file.
/// </summary>
/// <param name="pFileName">Path of the file to check.</param>
/// <exception cref="ApplicationException">
/// Thrown when <c>File.Exists</c> returns false for <paramref name="pFileName"/>.
/// </exception>
private void IsExists(string pFileName) {
    bool found = File.Exists(pFileName);
    if (found) {
        // Nothing to report; the caller may proceed.
        return;
    }
    throw new ApplicationException("指定目录下的无该文件");
}
/// <summary>
/// Returns the text content of a Word document, read through Word COM automation.
/// </summary>
/// <param name="docFileName">Path of the Word file to read.</param>
/// <returns>The value of <c>doc.Content.Text</c> for the opened document.</returns>
/// <exception cref="ApplicationException">Thrown by <c>IsExists</c> when the file is missing.</exception>
public string Doc2Text(string docFileName) {
    IsExists(docFileName);
    // Start the Word COM application.
    Word.ApplicationClass wordApp = new Word.ApplicationClass();
    object fileobj = docFileName;
    object nullobj = System.Reflection.Missing.Value;
    Word.Document doc = null;
    string outText;
    try
    {
        // Open the file. The number of parameters differs between COM versions;
        // everything except the file name is passed as Missing.Value.
        doc = wordApp.Documents.Open(ref fileobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj, ref nullobj);
        outText = doc.Content.Text;
    }
    finally
    {
        // Cleanup runs in finally so the Word instance started above is shut
        // down even when opening or reading the document throws.
        try
        {
            if (doc != null)
            {
                doc.Close(ref nullobj, ref nullobj, ref nullobj);
            }
        }
        finally
        {
            // Quit must still run if Close itself fails.
            wordApp.Quit(ref nullobj, ref nullobj, ref nullobj);
            // NOTE(review): GC.Collect() is kept from the original code; presumably
            // it is meant to prompt release of the COM wrappers - confirm.
            GC.Collect();
        }
    }
    return outText;
}
/// <summary>
/// Returns the text content of an Excel workbook, read through Excel COM automation.
/// Every non-null cell value of every worksheet's used range is appended,
/// each followed by a "|" separator.
/// </summary>
/// <param name="xlsFileName">Path of the Excel file to read.</param>
/// <returns>The concatenated cell values; empty when no cell holds a value.</returns>
/// <exception cref="ApplicationException">Thrown by <c>IsExists</c> when the file is missing.</exception>
public string Xls2Text(string xlsFileName) {
    IsExists(xlsFileName);
    Excel.Application xlsApp = new Excel.ApplicationClass();
    object nullobj = System.Reflection.Missing.Value;
    Excel.Workbook excel = null;
    StringBuilder builder = new StringBuilder();
    try
    {
        // Open the workbook inside the try block so a failed open still reaches
        // the cleanup below.
        excel = xlsApp.Workbooks.Open(xlsFileName, nullobj,
            nullobj, nullobj, nullobj,
            nullobj, nullobj, nullobj,
            nullobj, nullobj, nullobj,
            nullobj, nullobj, nullobj,
            nullobj);
        int sheetCount = excel.Worksheets.Count;
        // Worksheets are indexed from 1.
        for (int k = 1; k <= sheetCount; k++)
        {
            Excel.Worksheet ews = (Excel.Worksheet)excel.Worksheets[k];
            // Read the used range once per sheet instead of once per cell;
            // every Value2 access is a separate call into Excel.
            object raw = ews.UsedRange.Value2;
            if (raw == null)
            {
                continue;
            }
            object[,] cells = raw as object[,];
            if (cells == null)
            {
                // Value2 is a bare scalar (not an array) when the used range is
                // a single cell; the original cast to object[,] threw here.
                builder.Append(raw).Append("|");
                continue;
            }
            // Use the array's own bounds rather than assuming a fixed base index.
            for (int i = cells.GetLowerBound(0); i <= cells.GetUpperBound(0); i++)
            {
                for (int j = cells.GetLowerBound(1); j <= cells.GetUpperBound(1); j++)
                {
                    object cell = cells[i, j];
                    if (cell != null)
                    {
                        builder.Append(cell).Append("|");
                    }
                }
            }
        }
    }
    finally
    {
        // No catch block: exceptions propagate with their original stack trace
        // (the former "throw ex;" reset it).
        try
        {
            if (excel != null)
            {
                excel.Close(nullobj, nullobj, nullobj);
            }
        }
        finally
        {
            // Quit must still run if Close itself fails.
            xlsApp.Quit();
            // NOTE(review): GC.Collect() is kept from the original code; presumably
            // it is meant to prompt release of the COM wrappers - confirm.
            GC.Collect();
        }
    }
    return builder.ToString();
}
/// <summary>
/// Returns the text content of a PowerPoint presentation, read through
/// PowerPoint COM automation. The text of every shape that has text is
/// appended in slide/shape order with no separator.
/// </summary>
/// <param name="pptFileName">Path of the PowerPoint file to read.</param>
/// <returns>The concatenated shape text; empty when no shape holds text.</returns>
/// <exception cref="ApplicationException">Thrown by <c>IsExists</c> when the file is missing.</exception>
public string Ppt2Text(string pptFileName) {
    IsExists(pptFileName);
    PowerPoint.Application pptApp = new PowerPoint.ApplicationClass();
    PowerPoint.Presentation ppt = null;
    StringBuilder builder = new StringBuilder();
    try
    {
        // Open the presentation inside the try block so a failed open still
        // reaches the cleanup below.
        ppt = pptApp.Presentations.Open(pptFileName,
            Microsoft.Office.Core.MsoTriState.msoTrue,
            Microsoft.Office.Core.MsoTriState.msoFalse,
            Microsoft.Office.Core.MsoTriState.msoFalse);
        foreach (PowerPoint.Slide slide in ppt.Slides)
        {
            foreach (PowerPoint.Shape shape in slide.Shapes)
            {
                // Not every shape has a text frame; check HasTextFrame before
                // touching TextFrame so such shapes are skipped instead of failing.
                if (shape.HasTextFrame != Microsoft.Office.Core.MsoTriState.msoTrue)
                {
                    continue;
                }
                if (shape.TextFrame.HasText == Microsoft.Office.Core.MsoTriState.msoTrue)
                {
                    builder.Append(shape.TextFrame.TextRange.Text);
                }
            }
        }
    }
    finally
    {
        // No catch block: exceptions propagate with their original stack trace
        // (the former "throw ex;" reset it).
        try
        {
            if (ppt != null)
            {
                ppt.Close();
            }
        }
        finally
        {
            // Quit must still run if Close itself fails.
            pptApp.Quit();
            // NOTE(review): GC.Collect() is kept from the original code; presumably
            // it is meant to prompt release of the COM wrappers - confirm.
            GC.Collect();
        }
    }
    return builder.ToString();
}
#endregion
最后看一下 Demo 的界面:
![](http://images.cnblogs.com/cnblogs_com/qiba78/lucene.jpg)
Demo源码下载
相关文章推荐
- 利用Lucene.net对附件做搜索
- (转)利用Lucene.net对附件做搜索
- Lucene.Net:使用eaglet的盘古分词进行分词和搜索(转载)
- 利用Lucene.net搜索引擎进行多条件搜索的做法
- 利用Lucene.net搭建站内搜索(4)---数据检索
- 艾伟_转载:Lucene.net多字段多索引目录搜索
- 利用Lucene.net搭建站内搜索(2)---分词技术
- 利用Lucene.net搜索引擎进行多条件搜索的做法
- 利用Lucene.net搜索引擎进行多条件搜索的做法
- 利用Lucene.net搭建站内搜索 ---Lucene.net
- 利用Lucene.net搭建站内搜索(2)---分词技术
- 利用Lucene.net搭建站内搜索(3)---创建索引
- 利用Lucene.net搭建站内搜索(1)---了解Lucene.net
- 【转载】Lucene.Net无障碍学习和使用:搜索篇
- 利用Lucene.net搜索引擎进行多条件搜索的做法
- 利用Lucene.net搜索引擎进行多条件搜索
- 利用Lucene.net搭建站内搜索(3)---创建索引
- 利用Lucene.net搜索引擎进行多条件搜索的做法
- 利用Lucene.net搜索引擎进行多条件搜索的做法
- 利用Lucene.net搜索引擎进行多条件搜索的做法