您的位置：首页 > 其它

lucene全文检索应用

2012-09-08 17:42 253 查看

使用Lucene实现全文检索，主要有下面三个步骤：

　　1、建立索引库：根据网站新闻信息库中的已有的数据资料建立Lucene索引文件。

　　2、通过索引库搜索：有了索引后，即可使用标准的词法分析器或直接的词法分析器进行全文检索。

　　3、维护索引库：网站新闻信息库中的信息会不断地变动，包括新增、修改及删除等，这些信息的变动都需要进一步反映到Lucene索引文件中。

下面是myrss.easyjf.com相关代码!

一、索引管理(建立及维护)

　　索引管理类MyRssIndexManage主要实现根据网站信息库中的数据建立索引，维护索引等。由于索引的过程需要消耗一定的时间，因此，索引管理类实现Runnable接口，使得我们可以在程序中开新线程来运行。

1

package com.easyjf.lucene;

2

import java.util.Date;

3

import java.util.List;

4

import org.apache.lucene.analysis.standard.StandardAnalyzer;

5

import org.apache.lucene.document.Document;

6

import org.apache.lucene.document.Field;

7

import org.apache.lucene.index.IndexReader;

8

import org.apache.lucene.index.IndexWriter;

9

import org.apache.lucene.queryParser.MultiFieldQueryParser;

10

import org.apache.lucene.queryParser.QueryParser;

11

import org.apache.lucene.search.Hits;

12

import org.apache.lucene.search.IndexSearcher;

13

import org.apache.lucene.search.Query;

14

import org.apache.lucene.search.Searcher;

15

import com.easyjf.dbo.EasyJDB;

16

import com.easyjf.news.business.NewsDir;

17

import com.easyjf.news.business.NewsDoc;

18

import com.easyjf.news.business.NewsUtil;

19

import com.easyjf.web.tools.IPageList;

20

public class MyRssIndexManage implements Runnable

{

21

private String indexDir;

22

private String indexType="add";

23

public void run()

{

24

// TODO Auto-generated method stub

25

if("add".equals(indexType))

26

normalIndex();

27

else if ("init".equals(indexType)) reIndexAll();

28

}

29

public void normalIndex()

30

{

31

try

{

32

Date start = new Date();

33

int num=0;

34

IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),false);

35

//NewsDir dir=NewsDir.readBySn();

36

String scope="(needIndex<2) or(needIndex is null)";

37

IPageList pList=NewsUtil.pageList(scope,1,50);

38

for(int p=0;p

{

39

pList=NewsUtil.pageList(scope,p,100);

40

List list=pList.getResult();

41

for(int i=0;i

{

42

NewsDoc doc=(NewsDoc)list.get(i);

43

writer.addDocument(newsdoc2lucenedoc(doc));

44

num++;

45

}

46

}

47

writer.optimize();

48

writer.close();

49

EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where "+scope);

50

Date end = new Date();

51

System.out.print("新增索引"+num+"条信息，一共花："+(end.getTime() - start.getTime())/60000+"分钟!");

52

}

53

catch(Exception e)

54

{

55

e.printStackTrace();

56

}

57

}

58

public void reIndexAll()

59

{

60

try

{

61

Date start = new Date();

62

int num=0;

63

IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),true);

64

NewsDir dir=NewsDir.readBySn("easyjf");

65

IPageList pList=NewsUtil.pageList(dir,1,50);

66

for(int p=0;p

{

67

pList=NewsUtil.pageList(dir,p,100);

68

List list=pList.getResult();

69

for(int i=0;i

{

70

NewsDoc doc=(NewsDoc)list.get(i);

71

writer.addDocument(newsdoc2lucenedoc(doc));

72

num++;

73

}

74

}

75

writer.optimize();

76

writer.close();

77

EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where dirPath like 'easyjf%'");

78

Date end = new Date();

79

System.out.print("全部重新做了一次索引，一共处理了"+num+"条信息，花："+(end.getTime() - start.getTime())/60000+"分钟!");

80

}

81

catch(Exception e)

82

{

83

e.printStackTrace();

84

}

85

}

86

private Document newsdoc2lucenedoc(NewsDoc doc)

87

{

88

Document lDoc=new Document();

89

lDoc.add(new Field("title",doc.getTitle(),Field.Store.YES,Field.Index.TOKENIZED));

90

lDoc.add(new Field("content",doc.getContent(),Field.Store.YES,Field.Index.TOKENIZED));

91

lDoc.add(new Field("url",doc.getRemark(),Field.Store.YES,Field.Index.NO));

92

lDoc.add(new Field("cid",doc.getCid(),Field.Store.YES,Field.Index.NO));

93

lDoc.add(new Field("source",doc.getSource(),Field.Store.YES,Field.Index.NO));

94

lDoc.add(new Field("inputTime",doc.getInputTime().toString(),Field.Store.YES,Field.Index.NO));

95

return lDoc;

96

}

97

public String getIndexDir()

{

98

return indexDir;

99

}

100

public void setIndexDir(String indexDir)

{

101

this.indexDir = indexDir;

102

}

103

104

public String getIndexType()

{

105

return indexType;

106

}

107

public void setIndexType(String indexType)

{

108

this.indexType = indexType;

109

}

110

}

111

二、使用Lucene实现全文搜索

下面是MyRssSearch类的源码，该类主要实现使用Lucene中Searcher及QueryParser实现从索引库中搜索关键词。

1

package com.easyjf.lucene;

2

import java.util.List;

4

import org.apache.lucene.analysis.standard.StandardAnalyzer;

5

import org.apache.lucene.document.Document;

6

import org.apache.lucene.index.IndexReader;

7

import org.apache.lucene.queryParser.MultiFieldQueryParser;

8

import org.apache.lucene.queryParser.QueryParser;

9

import org.apache.lucene.search.Hits;

10

import org.apache.lucene.search.IndexSearcher;

11

import org.apache.lucene.search.Query;

12

import org.apache.lucene.search.Searcher;

13

import com.easyjf.search.MyRssUtil;

15

import com.easyjf.search.SearchContent;

16

import com.easyjf.web.tools.IPageList;

17

import com.easyjf.web.tools.PageList;

18

public class MyRssSearch

{

20

private String indexDir;

21

IndexReader ir;

22

Searcher search;

23

public IPageList search(String key,int pageSize,int currentPage)

24

{

25

IPageList pList=new PageList(new HitsQuery(doSearch(key)));

26

pList.doList(pageSize,currentPage,"","",null);

27

if(pList!=null)

28

{

29

List list=pList.getResult();

30

if(list!=null)

{

31

for(int i=0;i

{

32

list.set(i,lucene2searchObj((Document)list.get(i),key));

33

}

34

}

35

}

36

try

{

37

if(search!=null)search.close();

38

if(ir!=null)ir.close();

39

}

40

catch(Exception e)

41

{

42

e.printStackTrace();

43

}

44

return pList;

45

}

46

private SearchContent lucene2searchObj(Document doc,String key)

47

{

48

SearchContent searchObj=new SearchContent();

49

String title=doc.getField("title").stringValue();

50

searchObj.setTitle(title.replaceAll(key,""+key+""));

51

searchObj.setTvalue(doc.getField("cid").stringValue());

52

searchObj.setUrl(doc.getField("url").stringValue());

53

searchObj.setSource(doc.getField("source").stringValue());

54

searchObj.setLastUpdated(doc.getField("inputTime").stringValue());

55

searchObj.setIntro(MyRssUtil.content2intro(doc.getField("content").stringValue(),key));

56

return searchObj;

57

}

58

public Hits doSearch(String key)

59

{

60

Hits hits=null;

61

try

{

62

ir=IndexReader.open(indexDir);

63

search=new IndexSearcher(ir);

64

String fields[]=

{"title","content"};

65

QueryParser parser=new MultiFieldQueryParser(fields,new StandardAnalyzer());

66

Query query=parser.parse(key);

67

hits=search.search(query);

68

}

69

catch(Exception e)

70

{

71

e.printStackTrace();

72

}

73

//System.out.println("搜索结果:"+hits.length());

74

return hits;

75

}

76

public String getIndexDir()

{

78

return indexDir;

79

}

80

public void setIndexDir(String indexDir)

{

81

this.indexDir = indexDir;

82

}

83

}

84

　　在上面的代码中，search方法返回一个封装了分页查询结果的IPageList，IPageList是EasyJWeb Tools业务引擎中的分页引擎，对于IPageList的使用，请看本人写的这篇文章《EasyJWeb Tools中业务引擎分页的设计实现》：

　　我们针对Lucene的查询结果Hits结构，写了一个查询器HitsQuery。代码如下所示：

1

package com.easyjf.lucene;

2

import java.util.ArrayList;

3

import java.util.Collection;

4

import java.util.List;

5

import org.apache.lucene.search.Hits;

6

import com.easyjf.web.tools.IQuery;

7

public class HitsQuery implements IQuery

{

8

private int begin=0;

9

private int max=0;

10

private Hits hits;

11

public HitsQuery()

12

{

13

}

15

public HitsQuery(Hits hits)

16

{

17

if(hits!=null)

18

{

19

this.hits=hits;

20

this.max=hits.length();

21

}

22

}

23

public int getRows(String arg0)

{

24

// TODO Auto-generated method stub

25

return (hits==null?0:hits.length());

26

}

27

public List getResult(String arg0)

{

28

// TODO Auto-generated method stub

29

List list=new ArrayList();

30

for(int i=begin;i<(begin+max)&&(i

{

31

try

{

32

list.add(hits.doc(i));

33

}

34

catch(Exception e)

35

{

36

e.printStackTrace();

37

}

38

}

39

return list;

40

}

41

public void setFirstResult(int begin)

{

42

// TODO Auto-generated method stub

43

this.begin=begin;

44

}

45

public void setMaxResults(int max)

{

46

// TODO Auto-generated method stub

47

this.max=max;

48

}

49

public void setParaValues(Collection arg0)

{

50

// TODO Auto-generated method stub

51

}

53

public List getResult(String condition, int begin, int max)

{

54

// TODO Auto-generated method stub

55

if((begin>=0)&&(begin if(!(max>hits.length()))this.max=max;

56

return getResult(condition);

57

}

58

}

59

三、Web调用

　　下面我们来看看在Web中如何调用商业逻辑层的全文检索功能。下面是处理用户请求的Action中关于搜索部分的源码：

1

package com.easyjf.news.action;

2

public class SearchAction implements IWebAction

{

3

public Page doSearch(WebForm form,Module module)throws Exception

4

{

5

String key=CommUtil.null2String(form.get("v"));

6

key=URLDecoder.decode(URLEncoder.encode(key,"ISO8859_1"),"utf-8");

7

form.set("v",key);

8

form.addResult("v2",URLEncoder.encode(key,"utf-8"));

9

if(key.getBytes().length>2)

{

10

String orderBy=CommUtil.null2String(form.get("order"));

11

int currentPage=CommUtil.null2Int(form.get("page"));

12

int pageSize=CommUtil.null2Int(form.get("pageSize"));

13

if(currentPage<1)currentPage=1;

14

if(pageSize<1)pageSize=15;

15

SearchEngine search=new SearchEngine(key,orderBy,pageSize,currentPage);

16

search.getLuceneSearch().setIndexDir(Globals.APP_BASE_DIR+"/WEB-INF/index");

17

search.doSearchByLucene();

18

IPageList pList=search.getResult();

19

if(pList!=null && pList.getRowCount()>0)

{

20

form.addResult("list",pList.getResult());

21

form.addResult("pages",new Integer(pList.getPages()));

22

form.addResult("rows",new Integer(pList.getRowCount()));

23

form.addResult("page",new Integer(pList.getCurrentPage()));

24

form.addResult("gotoPageHTML",CommUtil.showPageHtml(pList.getCurrentPage(),pList.getPages()));

25

}

26

else

27

{

28

form.addResult("notFound","true");//找不到数据

29

}

30

}

31

else

32

form.addResult("errMsg","您输入的关键字太短!");

33

form.addResult("hotSearch",SearchEngine.getHotSearch(20));

34

return null;

35

}

36

}

37

其中调用的SearchEngine类中有关Lucene部分的源码：

38

public class SearchEngine

{

39

private MyRssSearch luceneSearch=new MyRssSearch();

40

public void doSearchByLucene()

41

{

42

SearchKey keyObj=readCache();

43

if(keyObj!=null)

{

44

result=luceneSearch.search(key,pageSize,currentPage);

45

if(updateStatus)

{

46

keyObj.setReadTimes(new Integer(keyObj.getReadTimes().intValue()+1));

47

keyObj.update();

48

}

49

}

50

else//缓存中没有该关键字信息,生成关键字搜索结果

51

{

52

keyObj=new SearchKey();

53

keyObj.setTitle(key);

54

keyObj.setLastUpdated(new Date());

55

keyObj.setReadTimes(new Integer(1));

56

keyObj.setStatus(new Integer(0));

57

keyObj.setSequence(new Integer(1));

58

keyObj.setVdate(new Date());

59

keyObj.save();

60

result=luceneSearch.search(key,pageSize,currentPage);;

61

}

63

}

64

}

65

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航