
Nutch 1.2 incremental URL crawling: writing recrawl.sh

2011-02-21 23:41
# Usage note: create runbot.sh under the bin directory; to run it on Windows, use Cygwin.

# bin/runbot.sh

# runbot script to run the Nutch bot for crawling and re-crawling.

# Usage: bin/runbot [safe]

# If executed in 'safe' mode, it doesn't delete the temporary

# directories generated during crawl. This might be helpful for

# analysis and recovery in case a crawl fails.

#

# Author: Susam Pal
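#
# Example invocation (a sketch; the Cygwin install path matches the one assumed later in this script):
#   cd /cygdrive/f/nutch
#   bin/runbot.sh          # normal mode: temporary crawl directories are deleted
#   bin/runbot.sh safe     # safe mode: keeps BACKUPsegments / BACKUPindex for recovery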

depth=4

threads=8

adddays=5
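# -adddays is passed to 'nutch generate' below: it adds this many days to the current time
# when selecting URLs, so pages not yet due under db.fetch.interval.default are still
# picked up for re-crawling.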

topN=1500000 #Comment this statement if you don't want to set topN value
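# -topN caps each generate round at the highest-scoring URLs, up to this count.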

# Arguments for rm and mv

RMARGS="-rf"

MVARGS="--verbose"

# Parse arguments

if [ "$1" == "safe" ]

then

safe=yes

fi

if [ -z "$NUTCH_HOME" ]

then

NUTCH_HOME=.

echo runbot: $0 could not find environment variable NUTCH_HOME

echo runbot: NUTCH_HOME=$NUTCH_HOME has been set by the script

else

echo runbot: $0 found environment variable NUTCH_HOME=$NUTCH_HOME

fi
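# Hard-coded Cygwin path; this overrides whatever the check above detected.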

NUTCH_HOME=/cygdrive/f/nutch

echo runbot: NUTCH_HOME overridden to $NUTCH_HOME

if [ -z "$CATALINA_HOME" ]

then

CATALINA_HOME=/cygdrive/e/tomcat

echo runbot: $0 could not find environment variable CATALINA_HOME

echo runbot: CATALINA_HOME=$CATALINA_HOME has been set by the script

else

echo runbot: $0 found environment variable CATALINA_HOME=$CATALINA_HOME

fi

if [ -n "$topN" ]

then

topN="-topN $topN"

else

topN=""

fi

steps=8

echo "----- Inject (Step 1 of $steps) -----"

$NUTCH_HOME/bin/nutch inject crawl/crawldb urls

echo "----- Generate, Fetch, Parse, Update (Step 2 of $steps) -----"

for((i=0; i < $depth; i++))

do

echo "--- Beginning crawl at depth `expr $i + 1` of $depth ---"

$NUTCH_HOME/bin/nutch generate crawl/crawldb crawl/segments $topN \
  -adddays $adddays

if [ $? -ne 0 ]

then

echo "runbot: Stopping at depth $depth. No more URLs to fetch."

break

fi

segment=`ls -d crawl/segments/* | tail -1`

$NUTCH_HOME/bin/nutch fetch $segment -threads $threads

if [ $? -ne 0 ]

then

echo "runbot: fetch $segment at depth `expr $i + 1` failed."

echo "runbot: Deleting segment $segment."

rm $RMARGS $segment

continue

fi

$NUTCH_HOME/bin/nutch updatedb crawl/crawldb $segment

done

echo "----- Merge Segments (Step 3 of $steps) -----"

$NUTCH_HOME/bin/nutch mergesegs crawl/MERGEDsegments crawl/segments/*

if [ "$safe" != "yes" ]

then

rm $RMARGS crawl/segments

else

rm $RMARGS crawl/BACKUPsegments

mv $MVARGS crawl/segments crawl/BACKUPsegments

fi

mv $MVARGS crawl/MERGEDsegments crawl/segments

echo "----- Invert Links (Step 4 of $steps) -----"

$NUTCH_HOME/bin/nutch invertlinks crawl/linkdb crawl/segments/*

echo "----- Index (Step 5 of $steps) -----"

$NUTCH_HOME/bin/nutch index crawl/NEWindexes crawl/crawldb crawl/linkdb \
  crawl/segments/*

echo "----- Dedup (Step 6 of $steps) -----"

$NUTCH_HOME/bin/nutch dedup crawl/NEWindexes

echo "----- Merge Indexes (Step 7 of $steps) -----"

$NUTCH_HOME/bin/nutch merge crawl/NEWindex crawl/NEWindexes

echo "----- Loading New Index (Step 8 of $steps) -----"

${CATALINA_HOME}/bin/shutdown.sh

if [ "$safe" != "yes" ]

then

rm $RMARGS crawl/NEWindexes

rm $RMARGS crawl/index

else

rm $RMARGS crawl/BACKUPindexes

rm $RMARGS crawl/BACKUPindex

mv $MVARGS crawl/NEWindexes crawl/BACKUPindexes

mv $MVARGS crawl/index crawl/BACKUPindex

fi

mv $MVARGS crawl/NEWindex crawl/index

${CATALINA_HOME}/bin/startup.sh

echo "runbot: FINISHED: Crawl completed!"

echo ""

After a power outage interrupts a crawl, you can simply run this script again.

At first I did not care much for shell code either, but luckily a neighbor of mine does shell programming, and after he walked me through it, shell turned out to be quite simple.

Source: http://wiki.apache.org/nutch/Crawl