NutchBean详解之初始化
2012-05-30 08:58
232 查看
NutchBean是实现nutch查询的一个入口,就像Nutch中的crawl一样。在NutchBean的main函数中,以下一行代码实现了初始化:
final NutchBean bean = new NutchBean(conf);
这行代码涉及到的东西很多,包括初始化了LuceneSearchBean、IndexSearcher、FetchedSegments以及lucene中的查询类IndexSearcher(注意:前一个IndexSearcher是nutch的,而后一个是Lucene的)。
代码展示如下:
final NutchBeanbean = new NutchBean(conf);
/**
 * Convenience constructor: delegates to the two-argument constructor with no directory.
 *
 * @param conf configuration handed on unchanged
 * @throws IOException if the delegated constructor fails
 */
public NutchBean(Configuration conf) throws IOException {
  this(conf, (Path) null);
}
/**
 * Creates the bean and builds its two collaborators: the search bean (a
 * LuceneSearchBean) and the segment bean (a FetchedSegments).
 *
 * <p>Parts of the body are elided in this excerpt, so the origin of indexDir,
 * indexesDir and segments is not visible here.
 *
 * @param conf configuration passed to both collaborators
 * @param dir  NOTE(review): its use lies in the elided part — presumably the base
 *             directory the index and segment paths are derived from; confirm
 * @throws IOException declared by the constructor; the visible calls are the
 *                     LuceneSearchBean and FetchedSegments constructors
 */
public NutchBean(Configuration conf, Path dir) throws IOException {
。。。。。。// omitted
searchBean = new LuceneSearchBean(conf, indexDir, indexesDir);
。。。。。。// omitted
segmentBean = new FetchedSegments(conf, segments);
}
初始化LuceneSearchBean的代码:
/**
 * Creates the search bean: stores the configuration, obtains the file system for it
 * and then opens the index searcher through {@code init}.
 *
 * @param conf       configuration used to obtain the file system and passed on to the searcher
 * @param indexDir   first candidate index location handed to {@code init}
 * @param indexesDir fallback location handed to {@code init}
 * @throws IOException if the file system cannot be obtained or {@code init} fails
 */
public LuceneSearchBean(Configuration conf, Path indexDir, Path indexesDir)
    throws IOException {
  this.conf = conf;
  this.fs = FileSystem.get(this.conf);
  init(indexDir, indexesDir);
}
privatevoid init(Path indexDir, Path indexesDir)
throws IOException {
Path absIndexDir =indexDir.makeQualified(indexDir.getFileSystem(conf));
Path absIndexesDir =indexesDir.makeQualified(indexesDir.getFileSystem(conf));
//TODO
Path indexDirAddtion=newPath("E:/out/index");
Path[] indexes=new Path[2];
if (this.fs.exists(indexDir)) {
LOG.info("opening mergedindex in " + absIndexDir.toUri());
//TODO
indexes[0]=indexDir;
indexes[1]=indexDirAddtion;
//this.searcher = newIndexSearcher(indexDir, this.conf);
//TODO
/*此处做了修改,将IndexSearcher中的参数Path,改为Path[],用于
* 对多个索引路径进行查询*/
this.searcher=newIndexSearcher(indexes,this.conf);
} else {
if (!this.fs.exists(indexesDir)){
// should throw exception ?
LOG.warn("Neither "+ absIndexDir.toUri() + " nor " +
absIndexesDir.toUri()+ " found!");
} else {
LOG.info("openingindexes in " + absIndexesDir.toUri());
}
List<Path> vDirs = newArrayList<Path>();
FileStatus[] fstats =fs.listStatus(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
Path[] directories =HadoopFSUtil.getPaths(fstats);
for(int i = 0; i <directories.length; i++) {
Path indexdone = newPath(directories[i], Indexer.DONE_NAME);
if(fs.isFile(indexdone)) {
vDirs.add(directories[i]);
}
}
directories = new Path[vDirs.size() ];
for(int i = 0;vDirs.size()>0; i++) {
directories[i] =vDirs.remove(0);
}
this.searcher = newIndexSearcher(directories, this.conf);
}
}
IndexSearcher的代码如下:
public IndexSearcher(Path[] indexDirs, Configuration conf) throwsIOException {
IndexReader[] readers = newIndexReader[indexDirs.length];
this.conf = conf;
this.fs = FileSystem.get(conf);
for (int i = 0; i <indexDirs.length; i++) {
readers[i] =IndexReader.open(getDirectory(indexDirs[i]));
}
init(new MultiReader(readers),conf);
}
private void init(IndexReader reader, Configuration conf) throwsIOException {
this.reader = reader;
this.luceneSearcher = new
org.apache.lucene.search.IndexSearcher(reader);
this.luceneSearcher.setSimilarity(new NutchSimilarity());
this.optimizer = newLuceneQueryOptimizer(conf);
this.queryFilters = newQueryFilters(conf);
}
初始化luceneIndexSearcher
/**
 * Builds a searcher over {@code r}, collects its sub-readers and records, for each of
 * them, the document number at which its documents start.
 *
 * @param r           reader to search
 * @param closeReader stored in the field of the same name
 *                    (NOTE(review): presumably decides whether closing the searcher also
 *                    closes the reader — confirm)
 */
private IndexSearcher(IndexReader r, boolean closeReader) {
  reader = r;
  this.closeReader = closeReader;

  List<IndexReader> subReadersList = new ArrayList<IndexReader>();
  gatherSubReaders(subReadersList, reader);
  subReaders = subReadersList.toArray(new IndexReader[subReadersList.size()]);

  // docStarts[i] is the sum of maxDoc() over all sub-readers before index i.
  docStarts = new int[subReaders.length];
  int docBase = 0;
  for (int i = 0; i < subReaders.length; i++) {
    docStarts[i] = docBase;
    docBase += subReaders[i].maxDoc();
  }
}
初始化segmentBean
/**
 * Creates the segment bean: lists the directories under the given segment locations,
 * registers a Segment for each of them under its directory name, and starts the
 * segment updater.
 *
 * @param conf        configuration used for the file system, the summarizer and each Segment
 * @param segmentsDir locations whose sub-directories are registered as segments
 * @throws IOException if the file system cannot be obtained or listed, or a Segment
 *                     cannot be created
 */
public FetchedSegments(Configuration conf, Path[] segmentsDir)
    throws IOException {
  this.conf = conf;
  this.fs = FileSystem.get(this.conf);
  final FileStatus[] fstats =
      fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
  final Path[] segmentDirs = HadoopFSUtil.getPaths(fstats);
  this.summarizer = new SummarizerFactory(this.conf).getSummarizer();
  this.segmentsDir = segmentsDir;
  this.segUpdater = new SegmentUpdater();

  // getPaths may hand back null; in that case no segments are registered.
  if (segmentDirs != null) {
    for (final Path segmentDir : segmentDirs) {
      segments.put(segmentDir.getName(),
          new Segment(this.fs, segmentDir, this.conf));
    }
  }
  this.segUpdater.start();
}
注意:在实现跨索引查询的时候,除了修改LuceneSearchBean中的初始化代码,还得修改segmentBean中的代码。
final NutchBean bean = new NutchBean(conf);
这行代码涉及到的东西很多,包括初始化了LuceneSearchBean、IndexSearcher、FetchedSegments以及lucene中的查询类IndexSearcher(注意:前一个IndexSearcher是nutch的,而后一个是Lucene的)。
代码展示如下:
final NutchBeanbean = new NutchBean(conf);
/**
 * Convenience constructor: delegates to the two-argument constructor with no directory.
 *
 * @param conf configuration handed on unchanged
 * @throws IOException if the delegated constructor fails
 */
public NutchBean(Configuration conf) throws IOException {
  this(conf, (Path) null);
}
/**
 * Creates the bean and builds its two collaborators: the search bean (a
 * LuceneSearchBean) and the segment bean (a FetchedSegments).
 *
 * <p>Parts of the body are elided in this excerpt, so the origin of indexDir,
 * indexesDir and segments is not visible here.
 *
 * @param conf configuration passed to both collaborators
 * @param dir  NOTE(review): its use lies in the elided part — presumably the base
 *             directory the index and segment paths are derived from; confirm
 * @throws IOException declared by the constructor; the visible calls are the
 *                     LuceneSearchBean and FetchedSegments constructors
 */
public NutchBean(Configuration conf, Path dir) throws IOException {
。。。。。。// omitted
searchBean = new LuceneSearchBean(conf, indexDir, indexesDir);
。。。。。。// omitted
segmentBean = new FetchedSegments(conf, segments);
}
初始化LuceneSearchBean的代码:
/**
 * Creates the search bean: stores the configuration, obtains the file system for it
 * and then opens the index searcher through {@code init}.
 *
 * @param conf       configuration used to obtain the file system and passed on to the searcher
 * @param indexDir   first candidate index location handed to {@code init}
 * @param indexesDir fallback location handed to {@code init}
 * @throws IOException if the file system cannot be obtained or {@code init} fails
 */
public LuceneSearchBean(Configuration conf, Path indexDir, Path indexesDir)
    throws IOException {
  this.conf = conf;
  this.fs = FileSystem.get(this.conf);
  init(indexDir, indexesDir);
}
privatevoid init(Path indexDir, Path indexesDir)
throws IOException {
Path absIndexDir =indexDir.makeQualified(indexDir.getFileSystem(conf));
Path absIndexesDir =indexesDir.makeQualified(indexesDir.getFileSystem(conf));
//TODO
Path indexDirAddtion=newPath("E:/out/index");
Path[] indexes=new Path[2];
if (this.fs.exists(indexDir)) {
LOG.info("opening mergedindex in " + absIndexDir.toUri());
//TODO
indexes[0]=indexDir;
indexes[1]=indexDirAddtion;
//this.searcher = newIndexSearcher(indexDir, this.conf);
//TODO
/*此处做了修改,将IndexSearcher中的参数Path,改为Path[],用于
* 对多个索引路径进行查询*/
this.searcher=newIndexSearcher(indexes,this.conf);
} else {
if (!this.fs.exists(indexesDir)){
// should throw exception ?
LOG.warn("Neither "+ absIndexDir.toUri() + " nor " +
absIndexesDir.toUri()+ " found!");
} else {
LOG.info("openingindexes in " + absIndexesDir.toUri());
}
List<Path> vDirs = newArrayList<Path>();
FileStatus[] fstats =fs.listStatus(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
Path[] directories =HadoopFSUtil.getPaths(fstats);
for(int i = 0; i <directories.length; i++) {
Path indexdone = newPath(directories[i], Indexer.DONE_NAME);
if(fs.isFile(indexdone)) {
vDirs.add(directories[i]);
}
}
directories = new Path[vDirs.size() ];
for(int i = 0;vDirs.size()>0; i++) {
directories[i] =vDirs.remove(0);
}
this.searcher = newIndexSearcher(directories, this.conf);
}
}
IndexSearcher的代码如下:
public IndexSearcher(Path[] indexDirs, Configuration conf) throwsIOException {
IndexReader[] readers = newIndexReader[indexDirs.length];
this.conf = conf;
this.fs = FileSystem.get(conf);
for (int i = 0; i <indexDirs.length; i++) {
readers[i] =IndexReader.open(getDirectory(indexDirs[i]));
}
init(new MultiReader(readers),conf);
}
private void init(IndexReader reader, Configuration conf) throwsIOException {
this.reader = reader;
this.luceneSearcher = new
org.apache.lucene.search.IndexSearcher(reader);
this.luceneSearcher.setSimilarity(new NutchSimilarity());
this.optimizer = newLuceneQueryOptimizer(conf);
this.queryFilters = newQueryFilters(conf);
}
初始化luceneIndexSearcher
/**
 * Builds a searcher over {@code r}, collects its sub-readers and records, for each of
 * them, the document number at which its documents start.
 *
 * @param r           reader to search
 * @param closeReader stored in the field of the same name
 *                    (NOTE(review): presumably decides whether closing the searcher also
 *                    closes the reader — confirm)
 */
private IndexSearcher(IndexReader r, boolean closeReader) {
  reader = r;
  this.closeReader = closeReader;

  List<IndexReader> subReadersList = new ArrayList<IndexReader>();
  gatherSubReaders(subReadersList, reader);
  subReaders = subReadersList.toArray(new IndexReader[subReadersList.size()]);

  // docStarts[i] is the sum of maxDoc() over all sub-readers before index i.
  docStarts = new int[subReaders.length];
  int docBase = 0;
  for (int i = 0; i < subReaders.length; i++) {
    docStarts[i] = docBase;
    docBase += subReaders[i].maxDoc();
  }
}
初始化segmentBean
/**
 * Creates the segment bean: lists the directories under the given segment locations,
 * registers a Segment for each of them under its directory name, and starts the
 * segment updater.
 *
 * @param conf        configuration used for the file system, the summarizer and each Segment
 * @param segmentsDir locations whose sub-directories are registered as segments
 * @throws IOException if the file system cannot be obtained or listed, or a Segment
 *                     cannot be created
 */
public FetchedSegments(Configuration conf, Path[] segmentsDir)
    throws IOException {
  this.conf = conf;
  this.fs = FileSystem.get(this.conf);
  final FileStatus[] fstats =
      fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
  final Path[] segmentDirs = HadoopFSUtil.getPaths(fstats);
  this.summarizer = new SummarizerFactory(this.conf).getSummarizer();
  this.segmentsDir = segmentsDir;
  this.segUpdater = new SegmentUpdater();

  // getPaths may hand back null; in that case no segments are registered.
  if (segmentDirs != null) {
    for (final Path segmentDir : segmentDirs) {
      segments.put(segmentDir.getName(),
          new Segment(this.fs, segmentDir, this.conf));
    }
  }
  this.segUpdater.start();
}
注意:在实现跨索引查询的时候,除了修改LuceneSearchBean中的初始化代码,还得修改segmentBean中的代码。
相关文章推荐
- spring的初始化bean,销毁bean之前的操作详解
- 详解Spring 中如何控制2个bean中的初始化顺序
- Spring中初始化bean和销毁bean的时候执行某个方法的详解
- Spring IOC/BeanFactory/ApplicationContext的工作流程/实现原理/初始化/依赖注入源码详解
- Spring中初始化bean和销毁bean的时候执行某个方法的详解
- spring详解:通过FactoryBean自定义工厂初始化Bean
- Spring中初始化bean和销毁bean的时候执行某个方法的详解
- Spring Bean的初始化和销毁方式详解
- Spring中初始化bean和销毁bean的时候执行某个方法的详解
- Spring bean初始化原理详解
- Spring Bean的初始化和销毁方式详解(转载)
- Web.xml配置详解之context-param (加载spring的xml,然后初始化bean看的)
- Spring中初始化bean和销毁bean的时候执行某个方法的详解
- Spring中初始化bean和销毁bean的时候执行某个方法的详解
- Tomcat Jdbc数据源初始化过程以及错误Cannot resolve reference to bean 'dataSource' while setting bean property 'da
- Spring-注入参数详解-[字面值及引用其他Bean]
- Java中的初始化顺序详解
- 最短路练习13/poj/1847 / Tram/floyd解法;memset用0x3f初始化详解
- 详解Spring中bean的作用域
- Spring中bean的scope详解(转载)