nutch 1.2 增量爬取url 完成 recrawl.sh 编写
2011-02-21 23:41
295 查看
# 使用说明:在bin目录下建立 runbot.sh ,如果在window下执行的话,则使用 cygwin 来模拟使用
# bin/runbot.sh
# runbot script to run the Nutch bot for crawling and re-crawling.
# Usage: bin/runbot [safe]
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Author: Susam Pal
depth=4
threads=8
adddays=5
topN=1500000 #Comment this statement if you don't want to set topN value
# Arguments for rm and mv
RMARGS="-rf"
MVARGS="--verbose"
# Parse arguments
if [ "$1" == "safe" ]
then
safe=yes
fi
if [-z "$NUTCH_HOME" ]
then
NUTCH_HOME=.
echo runbot: $0 could not find environment variable NUTCH_HOME
echo runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script
else
echo runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME
fi
NUTCH_HOME=/cygdrive/f/nutch
echo print nutch: $NUTCH_HOME
if [ -z "$CATALINA_HOME" ]
then
CATALINA_HOME=/cygdrive/e/tomcat
echo runbot: $0 could not find environment variable NUTCH_HOME
echo runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script
else
echo runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME
fi
if [ -n "$topN" ]
then
topN="-topN $topN"
else
topN=""
fi
steps=8
echo "----- Inject (Step 1 of $steps) -----"
$NUTCH_HOME/bin/nutch inject crawl/crawldb urls
echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
for((i=0; i < $depth; i++))
do
echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
$NUTCH_HOME/bin/nutch generate crawl/crawldb crawl/segments $topN /
-adddays $adddays
if [ $? -ne 0 ]
then
echo "runbot: Stopping at depth $depth. No more URLs to fetch."
break
fi
segment=`ls -d crawl/segments/* | tail -1`
$NUTCH_HOME/bin/nutch fetch $segment -threads $threads
if [ $? -ne 0 ]
then
echo "runbot: fetch $segment at depth `expr $i + 1` failed."
echo "runbot: Deleting segment $segment."
rm $RMARGS $segment
continue
fi
$NUTCH_HOME/bin/nutch updatedb crawl/crawldb $segment
done
echo "----- Merge Segments (Step 3 of $steps) -----"
$NUTCH_HOME/bin/nutch mergesegs crawl/MERGEDsegments crawl/segments/*
if [ "$safe" != "yes" ]
then
rm $RMARGS crawl/segments
else
rm $RMARGS crawl/BACKUPsegments
mv $MVARGS crawl/segments crawl/BACKUPsegments
fi
mv $MVARGS crawl/MERGEDsegments crawl/segments
echo "----- Invert Links (Step 4 of $steps) -----"
$NUTCH_HOME/bin/nutch invertlinks crawl/linkdb crawl/segments/*
echo "----- Index (Step 5 of $steps) -----"
$NUTCH_HOME/bin/nutch index crawl/NEWindexes crawl/crawldb crawl/linkdb /
crawl/segments/*
echo "----- Dedup (Step 6 of $steps) -----"
$NUTCH_HOME/bin/nutch dedup crawl/NEWindexes
echo "----- Merge Indexes (Step 7 of $steps) -----"
$NUTCH_HOME/bin/nutch merge crawl/NEWindex crawl/NEWindexes
echo "----- Loading New Index (Step 8 of $steps) -----"
${CATALINA_HOME}/bin/shutdown.sh
if [ "$safe" != "yes" ]
then
rm $RMARGS crawl/NEWindexes
rm $RMARGS crawl/index
else
rm $RMARGS crawl/BACKUPindexes
rm $RMARGS crawl/BACKUPindex
mv $MVARGS crawl/NEWindexes crawl/BACKUPindexes
mv $MVARGS crawl/index crawl/BACKUPindex
fi
mv $MVARGS crawl/NEWindex crawl/index
${CATALINA_HOME}/bin/startup.sh
echo "runbot: FINISHED: Crawl completed!"
echo ""
//断电停止工作后,再执行这个。
// 刚开始我也不怎么喜欢shell这种方式的代码,不过幸好邻居是搞shell编程的,经它一分析,原来shell也是这么简单。
//来源:http://wiki.apache.org/nutch/Crawl
# bin/runbot.sh
# runbot script to run the Nutch bot for crawling and re-crawling.
# Usage: bin/runbot [safe]
# If executed in 'safe' mode, it doesn't delete the temporary
# directories generated during crawl. This might be helpful for
# analysis and recovery in case a crawl fails.
#
# Author: Susam Pal
depth=4
threads=8
adddays=5
topN=1500000 #Comment this statement if you don't want to set topN value
# Arguments for rm and mv
RMARGS="-rf"
MVARGS="--verbose"
# Parse arguments
if [ "$1" == "safe" ]
then
safe=yes
fi
if [-z "$NUTCH_HOME" ]
then
NUTCH_HOME=.
echo runbot: $0 could not find environment variable NUTCH_HOME
echo runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script
else
echo runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME
fi
NUTCH_HOME=/cygdrive/f/nutch
echo print nutch: $NUTCH_HOME
if [ -z "$CATALINA_HOME" ]
then
CATALINA_HOME=/cygdrive/e/tomcat
echo runbot: $0 could not find environment variable NUTCH_HOME
echo runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script
else
echo runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME
fi
if [ -n "$topN" ]
then
topN="-topN $topN"
else
topN=""
fi
steps=8
echo "----- Inject (Step 1 of $steps) -----"
$NUTCH_HOME/bin/nutch inject crawl/crawldb urls
echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"
for((i=0; i < $depth; i++))
do
echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"
$NUTCH_HOME/bin/nutch generate crawl/crawldb crawl/segments $topN /
-adddays $adddays
if [ $? -ne 0 ]
then
echo "runbot: Stopping at depth $depth. No more URLs to fetch."
break
fi
segment=`ls -d crawl/segments/* | tail -1`
$NUTCH_HOME/bin/nutch fetch $segment -threads $threads
if [ $? -ne 0 ]
then
echo "runbot: fetch $segment at depth `expr $i + 1` failed."
echo "runbot: Deleting segment $segment."
rm $RMARGS $segment
continue
fi
$NUTCH_HOME/bin/nutch updatedb crawl/crawldb $segment
done
echo "----- Merge Segments (Step 3 of $steps) -----"
$NUTCH_HOME/bin/nutch mergesegs crawl/MERGEDsegments crawl/segments/*
if [ "$safe" != "yes" ]
then
rm $RMARGS crawl/segments
else
rm $RMARGS crawl/BACKUPsegments
mv $MVARGS crawl/segments crawl/BACKUPsegments
fi
mv $MVARGS crawl/MERGEDsegments crawl/segments
echo "----- Invert Links (Step 4 of $steps) -----"
$NUTCH_HOME/bin/nutch invertlinks crawl/linkdb crawl/segments/*
echo "----- Index (Step 5 of $steps) -----"
$NUTCH_HOME/bin/nutch index crawl/NEWindexes crawl/crawldb crawl/linkdb /
crawl/segments/*
echo "----- Dedup (Step 6 of $steps) -----"
$NUTCH_HOME/bin/nutch dedup crawl/NEWindexes
echo "----- Merge Indexes (Step 7 of $steps) -----"
$NUTCH_HOME/bin/nutch merge crawl/NEWindex crawl/NEWindexes
echo "----- Loading New Index (Step 8 of $steps) -----"
${CATALINA_HOME}/bin/shutdown.sh
if [ "$safe" != "yes" ]
then
rm $RMARGS crawl/NEWindexes
rm $RMARGS crawl/index
else
rm $RMARGS crawl/BACKUPindexes
rm $RMARGS crawl/BACKUPindex
mv $MVARGS crawl/NEWindexes crawl/BACKUPindexes
mv $MVARGS crawl/index crawl/BACKUPindex
fi
mv $MVARGS crawl/NEWindex crawl/index
${CATALINA_HOME}/bin/startup.sh
echo "runbot: FINISHED: Crawl completed!"
echo ""
//断电停止工作后,再执行这个。
// 刚开始我也不怎么喜欢shell这种方式的代码,不过幸好邻居是搞shell编程的,经它一分析,原来shell也是这么简单。
//来源:http://wiki.apache.org/nutch/Crawl
相关文章推荐
- nutch 1.4 的增量爬取(recrawl)脚本
- postfix sh 配合前面编写的savePostfixLog.sh定时检测信件发送完成立刻调用php分析错误日志
- nutch 1.2 war的二次开发 第一步,重新编写首页
- 用Visual Studio 2010编写Data Url生成工具C#版
- 从输入 URL 到页面加载完成的过程中都发生了什么事情?
- Sudoku 1.2 版本完成
- Android使用Gradle命令动态修改BASE_URL(测试/正式环境地址)完成打包,不需要修改代码
- 编写完成:组件对象字符串生成程序 V2.0 修正版
- 在eclipse配置nutch-1.2
- 编写方法,完成指定文件的复制和剪切;使用BufferedReader和BufferedWriter完成。
- 注解之编写自己的注解来完成校验编程任务(2)
- Nutch中的Injector为什么按照url链接的个数执行多次Map()函数?
- 从输入 URL 到页面加载完成的过程中都发生了什么事情?
- Mysql完成备份+增量备份shell
- loadrunner手动编写脚本完成一个登录购买流程
- Vue、angular的URL地址编写方式
- Linux的sh脚本编写基础知识
- 服务器环境之6:使用maven自动部署到tomcat及编写sh一键部署脚本
- 完成库存管理系统文档编写
- nutch1.2测试