您的位置:首页 > 其它

lucene全文检索应用

2012-09-08 17:42 253 查看
使用Lucene实现全文检索,主要有下面三个步骤:

  1、建立索引库:根据网站新闻信息库中的已有的数据资料建立Lucene索引文件。

  2、通过索引库搜索:有了索引后,即可使用标准的词法分析器或直接的词法分析器进行全文检索。

  3、维护索引库:网站新闻信息库中的信息会不断地变动,包括新增、修改及删除等,这些信息的变动都需要进一步反映到Lucene索引文件中。

下面是myrss.easyjf.com相关代码!

一、索引管理(建立及维护)

  索引管理类MyRssIndexManage主要实现根据网站信息库中的数据建立索引,维护索引等。由于索引的过程需要消耗一定的时间,因此,索引管理类实现Runnable接口,使得我们可以在程序中开新线程来运行。

1

package com.easyjf.lucene;

2

import java.util.Date;

3

import java.util.List;

4

import org.apache.lucene.analysis.standard.StandardAnalyzer;

5

import org.apache.lucene.document.Document;

6

import org.apache.lucene.document.Field;

7

import org.apache.lucene.index.IndexReader;

8

import org.apache.lucene.index.IndexWriter;

9

import org.apache.lucene.queryParser.MultiFieldQueryParser;

10

import org.apache.lucene.queryParser.QueryParser;

11

import org.apache.lucene.search.Hits;

12

import org.apache.lucene.search.IndexSearcher;

13

import org.apache.lucene.search.Query;

14

import org.apache.lucene.search.Searcher;

15

import com.easyjf.dbo.EasyJDB;

16

import com.easyjf.news.business.NewsDir;

17

import com.easyjf.news.business.NewsDoc;

18

import com.easyjf.news.business.NewsUtil;

19

import com.easyjf.web.tools.IPageList;

20



public class MyRssIndexManage implements Runnable

{

21

private String indexDir;

22

private String indexType="add";

23



public void run()

{

24

// TODO Auto-generated method stub

25

if("add".equals(indexType))

26

normalIndex();

27

else if ("init".equals(indexType)) reIndexAll();

28

}

29

public void normalIndex()

30





{

31



try

{

32

Date start = new Date();

33

int num=0;

34

IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),false);

35

//NewsDir dir=NewsDir.readBySn();

36

String scope="(needIndex<2) or(needIndex is null)";

37

IPageList pList=NewsUtil.pageList(scope,1,50);

38



for(int p=0;p

{

39

pList=NewsUtil.pageList(scope,p,100);

40

List list=pList.getResult();

41



for(int i=0;i

{

42

NewsDoc doc=(NewsDoc)list.get(i);

43

writer.addDocument(newsdoc2lucenedoc(doc));

44

num++;

45

}

46

}

47

writer.optimize();

48

writer.close();

49

EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where "+scope);

50

Date end = new Date();

51

System.out.print("新增索引"+num+"条信息,一共花:"+(end.getTime() - start.getTime())/60000+"分钟!");

52

}

53

catch(Exception e)

54





{

55

e.printStackTrace();

56

}

57

}

58

public void reIndexAll()

59





{

60



try

{

61

Date start = new Date();

62

int num=0;

63

IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),true);

64

NewsDir dir=NewsDir.readBySn("easyjf");

65

IPageList pList=NewsUtil.pageList(dir,1,50);

66



for(int p=0;p

{

67

pList=NewsUtil.pageList(dir,p,100);

68

List list=pList.getResult();

69



for(int i=0;i

{

70

NewsDoc doc=(NewsDoc)list.get(i);

71

writer.addDocument(newsdoc2lucenedoc(doc));

72

num++;

73

}

74

}

75

writer.optimize();

76

writer.close();

77

EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where dirPath like 'easyjf%'");

78

Date end = new Date();

79

System.out.print("全部重新做了一次索引,一共处理了"+num+"条信息,花:"+(end.getTime() - start.getTime())/60000+"分钟!");

80

}

81

catch(Exception e)

82





{

83

e.printStackTrace();

84

}

85

}

86

private Document newsdoc2lucenedoc(NewsDoc doc)

87





{

88

Document lDoc=new Document();

89

lDoc.add(new Field("title",doc.getTitle(),Field.Store.YES,Field.Index.TOKENIZED));

90

lDoc.add(new Field("content",doc.getContent(),Field.Store.YES,Field.Index.TOKENIZED));

91

lDoc.add(new Field("url",doc.getRemark(),Field.Store.YES,Field.Index.NO));

92

lDoc.add(new Field("cid",doc.getCid(),Field.Store.YES,Field.Index.NO));

93

lDoc.add(new Field("source",doc.getSource(),Field.Store.YES,Field.Index.NO));

94

lDoc.add(new Field("inputTime",doc.getInputTime().toString(),Field.Store.YES,Field.Index.NO));

95

return lDoc;

96

}

97



public String getIndexDir()

{

98

return indexDir;

99

}

100



public void setIndexDir(String indexDir)

{

101

this.indexDir = indexDir;

102

}

103


104



public String getIndexType()

{

105

return indexType;

106

}

107



public void setIndexType(String indexType)

{

108

this.indexType = indexType;

109

}

110

}

111


二、使用Lucene实现全文搜索

下面是MyRssSearch类的源码,该类主要实现使用Lucene中Searcher及QueryParser实现从索引库中搜索关键词。

1

package com.easyjf.lucene;

2


3

import java.util.List;

4

import org.apache.lucene.analysis.standard.StandardAnalyzer;

5

import org.apache.lucene.document.Document;

6

import org.apache.lucene.index.IndexReader;

7

import org.apache.lucene.queryParser.MultiFieldQueryParser;

8

import org.apache.lucene.queryParser.QueryParser;

9

import org.apache.lucene.search.Hits;

10

import org.apache.lucene.search.IndexSearcher;

11

import org.apache.lucene.search.Query;

12

import org.apache.lucene.search.Searcher;

13


14

import com.easyjf.search.MyRssUtil;

15

import com.easyjf.search.SearchContent;

16

import com.easyjf.web.tools.IPageList;

17

import com.easyjf.web.tools.PageList;

18


19



public class MyRssSearch

{

20

private String indexDir;

21

IndexReader ir;

22

Searcher search;

23

public IPageList search(String key,int pageSize,int currentPage)

24





{

25

IPageList pList=new PageList(new HitsQuery(doSearch(key)));

26

pList.doList(pageSize,currentPage,"","",null);

27

if(pList!=null)

28





{

29

List list=pList.getResult();

30



if(list!=null)

{

31



for(int i=0;i

{

32

list.set(i,lucene2searchObj((Document)list.get(i),key));

33

}

34

}

35

}

36



try

{

37

if(search!=null)search.close();

38

if(ir!=null)ir.close();

39

}

40

catch(Exception e)

41





{

42

e.printStackTrace();

43

}

44

return pList;

45

}

46

private SearchContent lucene2searchObj(Document doc,String key)

47





{

48

SearchContent searchObj=new SearchContent();

49

String title=doc.getField("title").stringValue();

50

searchObj.setTitle(title.replaceAll(key,""+key+""));

51

searchObj.setTvalue(doc.getField("cid").stringValue());

52

searchObj.setUrl(doc.getField("url").stringValue());

53

searchObj.setSource(doc.getField("source").stringValue());

54

searchObj.setLastUpdated(doc.getField("inputTime").stringValue());

55

searchObj.setIntro(MyRssUtil.content2intro(doc.getField("content").stringValue(),key));

56

return searchObj;

57

}

58

public Hits doSearch(String key)

59





{

60

Hits hits=null;

61



try

{

62

ir=IndexReader.open(indexDir);

63

search=new IndexSearcher(ir);

64



String fields[]=

{"title","content"};

65

QueryParser parser=new MultiFieldQueryParser(fields,new StandardAnalyzer());

66

Query query=parser.parse(key);

67

hits=search.search(query);

68

}

69

catch(Exception e)

70





{

71

e.printStackTrace();

72

}

73

//System.out.println("搜索结果:"+hits.length());

74

return hits;

75

}

76


77



public String getIndexDir()

{

78

return indexDir;

79

}

80



public void setIndexDir(String indexDir)

{

81

this.indexDir = indexDir;

82

}

83

}

84


  在上面的代码中,search方法返回一个封装了分页查询结果的IPageList,IPageList是EasyJWeb Tools业务引擎中的分页引擎,对于IPageList的使用,请看本人写的这篇文章《EasyJWeb Tools中业务引擎分页的设计实现》

  我们针对Lucene的查询结果Hits结构,写了一个查询器HitsQuery。代码如下所示:

1

package com.easyjf.lucene;

2

import java.util.ArrayList;

3

import java.util.Collection;

4

import java.util.List;

5

import org.apache.lucene.search.Hits;

6

import com.easyjf.web.tools.IQuery;

7



public class HitsQuery implements IQuery

{

8

private int begin=0;

9

private int max=0;

10

private Hits hits;

11

public HitsQuery()

12





{

13


14

}

15

public HitsQuery(Hits hits)

16





{

17

if(hits!=null)

18





{

19

this.hits=hits;

20

this.max=hits.length();

21

}

22

}

23



public int getRows(String arg0)

{

24

// TODO Auto-generated method stub

25

return (hits==null?0:hits.length());

26

}

27



public List getResult(String arg0)

{

28

// TODO Auto-generated method stub

29

List list=new ArrayList();

30



for(int i=begin;i<(begin+max)&&(i

{

31



try

{

32

list.add(hits.doc(i));

33

}

34

catch(Exception e)

35





{

36

e.printStackTrace();

37

}

38

}

39

return list;

40

}

41



public void setFirstResult(int begin)

{

42

// TODO Auto-generated method stub

43

this.begin=begin;

44

}

45



public void setMaxResults(int max)

{

46

// TODO Auto-generated method stub

47

this.max=max;

48

}

49



public void setParaValues(Collection arg0)

{

50

// TODO Auto-generated method stub

51


52

}

53



public List getResult(String condition, int begin, int max)

{

54

// TODO Auto-generated method stub

55

if((begin>=0)&&(begin if(!(max>hits.length()))this.max=max;

56

return getResult(condition);

57

}

58

}

59


三、Web调用

  下面我们来看看在Web中如何调用商业逻辑层的全文检索功能。下面是处理用户请求的Action中关于搜索部分的源码:

1

package com.easyjf.news.action;

2



public class SearchAction implements IWebAction

{

3

public Page doSearch(WebForm form,Module module)throws Exception

4





{

5

String key=CommUtil.null2String(form.get("v"));

6

key=URLDecoder.decode(URLEncoder.encode(key,"ISO8859_1"),"utf-8");

7

form.set("v",key);

8

form.addResult("v2",URLEncoder.encode(key,"utf-8"));

9



if(key.getBytes().length>2)

{

10

String orderBy=CommUtil.null2String(form.get("order"));

11

int currentPage=CommUtil.null2Int(form.get("page"));

12

int pageSize=CommUtil.null2Int(form.get("pageSize"));

13

if(currentPage<1)currentPage=1;

14

if(pageSize<1)pageSize=15;

15

SearchEngine search=new SearchEngine(key,orderBy,pageSize,currentPage);

16

search.getLuceneSearch().setIndexDir(Globals.APP_BASE_DIR+"/WEB-INF/index");

17

search.doSearchByLucene();

18

IPageList pList=search.getResult();

19



if(pList!=null && pList.getRowCount()>0)

{

20

form.addResult("list",pList.getResult());

21

form.addResult("pages",new Integer(pList.getPages()));

22

form.addResult("rows",new Integer(pList.getRowCount()));

23

form.addResult("page",new Integer(pList.getCurrentPage()));

24

form.addResult("gotoPageHTML",CommUtil.showPageHtml(pList.getCurrentPage(),pList.getPages()));

25

}

26

else

27





{

28

form.addResult("notFound","true");//找不到数据

29

}

30

}

31

else

32

form.addResult("errMsg","您输入的关键字太短!");

33

form.addResult("hotSearch",SearchEngine.getHotSearch(20));

34

return null;

35

}

36

}

37

其中调用的SearchEngine类中有关Lucene部分的源码:

38



public class SearchEngine

{

39

private MyRssSearch luceneSearch=new MyRssSearch();

40

public void doSearchByLucene()

41





{

42

SearchKey keyObj=readCache();

43



if(keyObj!=null)

{

44

result=luceneSearch.search(key,pageSize,currentPage);

45



if(updateStatus)

{

46

keyObj.setReadTimes(new Integer(keyObj.getReadTimes().intValue()+1));

47

keyObj.update();

48

}

49

}

50

else//缓存中没有该关键字信息,生成关键字搜索结果

51





{

52

keyObj=new SearchKey();

53

keyObj.setTitle(key);

54

keyObj.setLastUpdated(new Date());

55

keyObj.setReadTimes(new Integer(1));

56

keyObj.setStatus(new Integer(0));

57

keyObj.setSequence(new Integer(1));

58

keyObj.setVdate(new Date());

59

keyObj.save();

60

result=luceneSearch.search(key,pageSize,currentPage);;

61


62

}

63

}

64

}

65
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: