
Clustering with Mahout's bundled cluster-reuters.sh

2016-05-11 09:47

The full script, examples/bin/cluster-reuters.sh:
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Downloads the Reuters dataset and prepares it for clustering
#
# To run:  change into the mahout directory and type:
#  examples/bin/cluster-reuters.sh

if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
echo "This script clusters the Reuters data set using a variety of algorithms.  The data set is downloaded automatically."
exit
fi

SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
  cd $SCRIPT_PATH
fi
START_PATH=`pwd`

# Set commands for dfs
source ${START_PATH}/set-dfs-commands.sh

MAHOUT="../../bin/mahout"

if [ ! -e $MAHOUT ]; then
  echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
  exit 1
fi

if [[ -z "$MAHOUT_WORK_DIR" ]]; then
  WORK_DIR=/tmp/mahout-work-${USER}
else
  WORK_DIR=$MAHOUT_WORK_DIR
fi

algorithm=( kmeans fuzzykmeans lda streamingkmeans clean )
if [ -n "$1" ]; then
  choice=$1
else
  echo "Please select a number to choose the corresponding clustering algorithm"
  echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)"
  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
  echo "3. ${algorithm[2]} clustering"
  echo "4. ${algorithm[3]} clustering"
  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
  read -p "Enter your choice : " choice
fi

echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
clustertype=${algorithm[$choice-1]}

if [ "x$clustertype" == "xclean" ]; then
rm -rf $WORK_DIR
$DFSRM $WORK_DIR
exit 1
else
$DFS -mkdir -p $WORK_DIR
mkdir -p $WORK_DIR
echo "Creating work directory at ${WORK_DIR}"
fi
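# Stage the data: each nested check below skips any stage whose output already exists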
if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
  if [ ! -e ${WORK_DIR}/reuters-out ]; then
    if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
        if [ -n "$2" ]; then
          echo "Copying Reuters from local download"
          cp $2 ${WORK_DIR}/reuters21578.tar.gz
        else
          echo "Downloading Reuters-21578"
          curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
        fi
      fi
      # make sure it was actually downloaded
      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
        echo "Failed to download reuters"
        exit 1
      fi
      mkdir -p ${WORK_DIR}/reuters-sgm
      echo "Extracting..."
      tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
    fi
    echo "Extracting Reuters"
    $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
    if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
      echo "Copying Reuters data to Hadoop"
      set +e
      $DFSRM ${WORK_DIR}/reuters-sgm
      $DFSRM ${WORK_DIR}/reuters-out
      $DFS -mkdir -p ${WORK_DIR}/
      $DFS -mkdir ${WORK_DIR}/reuters-sgm
      $DFS -mkdir ${WORK_DIR}/reuters-out
      $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
      $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
      set -e
    fi
  fi
  echo "Converting to Sequence Files from Directory"
  $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
fi

if [ "x$clustertype" == "xkmeans" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
-o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
&& \
$MAHOUT kmeans \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
-c ${WORK_DIR}/reuters-kmeans-clusters \
-o ${WORK_DIR}/reuters-kmeans \
-dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
-x 10 -k 20 -ow --clustering \
&& \
$MAHOUT clusterdump \
-i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
-o ${WORK_DIR}/reuters-kmeans/clusterdump \
-d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
-dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
--pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
&& \
cat ${WORK_DIR}/reuters-kmeans/clusterdump
elif [ "x$clustertype" == "xfuzzykmeans" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
-o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
&& \
$MAHOUT fkmeans \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
-c ${WORK_DIR}/reuters-fkmeans-clusters \
-o ${WORK_DIR}/reuters-fkmeans \
-dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
-x 10 -k 20 -ow -m 1.1 \
&& \
$MAHOUT clusterdump \
-i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
-o ${WORK_DIR}/reuters-fkmeans/clusterdump \
-d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
-dt sequencefile -b 100 -n 20 -sp 0 \
&& \
cat ${WORK_DIR}/reuters-fkmeans/clusterdump
elif [ "x$clustertype" == "xlda" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
-o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
&& \
$MAHOUT rowid \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
-o ${WORK_DIR}/reuters-out-matrix \
&& \
rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
&& \
$MAHOUT cvb \
-i ${WORK_DIR}/reuters-out-matrix/matrix \
-o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
-dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
-dt ${WORK_DIR}/reuters-lda-topics \
-mt ${WORK_DIR}/reuters-lda-model \
&& \
$MAHOUT vectordump \
-i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
-o ${WORK_DIR}/reuters-lda/vectordump \
-vs 10 -p true \
-d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
-dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
&& \
cat ${WORK_DIR}/reuters-lda/vectordump
elif [ "x$clustertype" == "xstreamingkmeans" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
-o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
&& \
rm -rf ${WORK_DIR}/reuters-streamingkmeans \
&& \
$MAHOUT streamingkmeans \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
--tempDir ${WORK_DIR}/tmp \
-o ${WORK_DIR}/reuters-streamingkmeans \
-sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
-dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
-k 10 -km 100 -ow \
&& \
$MAHOUT qualcluster \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
-c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000   \
-o ${WORK_DIR}/reuters-cluster-distance.csv \
&& \
cat ${WORK_DIR}/reuters-cluster-distance.csv
fi


As Mahout has matured from release to release, its algorithm library and bundled examples have become steadily more complete. Here we use one of the bundled scripts to learn how to run clustering.

cluster-reuters.sh downloads the Reuters data set automatically, so we can simply run it. The script also copies the data into Hadoop for us, so start Hadoop before running it.
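For example, on a single-node setup (a minimal sketch; it assumes $HADOOP_HOME points at a standard Hadoop 2.x install and that you run from the Mahout source root):

# start HDFS and YARN first (the script only copies data to HDFS when HADOOP_HOME is set)
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh

# pass the algorithm number as the first argument to skip the interactive menu (3 = lda)
examples/bin/cluster-reuters.sh 3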

1. Extract the Reuters articles (ExtractReuters)

$MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters \
  ${WORK_DIR}/reuters-sgm \
  ${WORK_DIR}/reuters-out
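A quick sanity check for this step: ExtractReuters writes one plain-text file per article, so for the full Reuters-21578 collection you should see on the order of 21,578 files (a rough check, not an exact count):

ls ${WORK_DIR}/reuters-out | wc -l   # roughly one .txt file per article
ls ${WORK_DIR}/reuters-out | head    # eyeball a few extracted file names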


2. Convert to SequenceFiles (convenient to read and write in Hadoop)

$MAHOUT seqdirectory \
  -i ${WORK_DIR}/reuters-out \
  -o ${WORK_DIR}/reuters-out-seqdir \
  -c UTF-8 -chunk 64 -xm sequential
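You can peek at the result with Mahout's seqdumper utility; in a seqdirectory output the key is the source file name and the value is the document text. A sketch, assuming the output lands in a file named chunk-0 (the usual seqdirectory naming):

# dump the first few key/value pairs of the SequenceFile
$MAHOUT seqdumper -i ${WORK_DIR}/reuters-out-seqdir/chunk-0 | head -20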

3. Convert to vectors

For example, converting to sparse TF-IDF vectors:

$MAHOUT seq2sparse \
  -i ${WORK_DIR}/reuters-out-seqdir/ \
  -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector
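seq2sparse writes several outputs under the target directory; the later jobs only need the dictionary and the tfidf-vectors. A sketch of what to expect (directory names as produced by Mahout's seq2sparse):

$DFS -ls ${WORK_DIR}/reuters-out-seqdir-sparse-lda
# typical contents:
#   dictionary.file-0    term -> integer-id mapping
#   tf-vectors/          raw term-frequency vectors
#   tfidf-vectors/       TF-IDF weighted vectors (the clustering input)
#   tokenized-documents/, wordcount/, df-count/   intermediate data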
4. Finally, pick the corresponding clustering algorithm and run it

I picked LDA for my test. LDA (Latent Dirichlet Allocation) is a topic model built on Dirichlet priors: it starts from an initial model and repeatedly refits it against the data until the topics stabilize. The relevant commands from the script are reproduced below.
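In this script the LDA branch boils down to two jobs after vectorization, shown here with the parameters annotated (the comments are my reading of the rowid and cvb options, not text from the script itself):

# rowid re-keys the Text-keyed tfidf vectors as an IntWritable-keyed matrix,
# which is the input format cvb expects
$MAHOUT rowid \
  -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
  -o ${WORK_DIR}/reuters-out-matrix

# cvb (Collapsed Variational Bayes): -k 20 topics, -x 20 iterations;
# -o receives the topic-term distributions, -dt the per-document topic
# distributions, and -mt is a working directory for intermediate model state
$MAHOUT cvb \
  -i ${WORK_DIR}/reuters-out-matrix/matrix \
  -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
  -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
  -dt ${WORK_DIR}/reuters-lda-topics \
  -mt ${WORK_DIR}/reuters-lda-model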

On my pseudo-distributed cluster, the whole run took roughly ten minutes.