#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# Downloads the Reuters dataset and prepares it for clustering
#
# To run:  change into the mahout directory and type:
#  examples/bin/cluster-reuters.sh

if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
  echo "This script clusters the Reuters data set using a variety of algorithms.  The data set is downloaded automatically."
  exit
fi

SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
  cd $SCRIPT_PATH
fi
START_PATH=`pwd`

# Set commands for dfs
source ${START_PATH}/set-dfs-commands.sh

MAHOUT="../../bin/mahout"

if [ ! -e $MAHOUT ]; then
  echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
  exit 1
fi

if [[ -z "$MAHOUT_WORK_DIR" ]]; then
  WORK_DIR=/tmp/mahout-work-${USER}
else
  WORK_DIR=$MAHOUT_WORK_DIR
fi

algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
if [ -n "$1" ]; then
  choice=$1
else
  echo "Please select a number to choose the corresponding clustering algorithm"
  echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)"
  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
  echo "3. ${algorithm[2]} clustering"
  echo "4. ${algorithm[3]} clustering"
  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
  read -p "Enter your choice : " choice
fi

echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
clustertype=${algorithm[$choice-1]}

if [ "x$clustertype" == "xclean" ]; then
  rm -rf $WORK_DIR
  $DFSRM $WORK_DIR
  exit 1
else
  $DFS -mkdir -p $WORK_DIR
  mkdir -p $WORK_DIR
  echo "Creating work directory at ${WORK_DIR}"
fi

if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
  if [ ! -e ${WORK_DIR}/reuters-out ]; then
    if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
        if [ -n "$2" ]; then
          echo "Copying Reuters from local download"
          cp $2 ${WORK_DIR}/reuters21578.tar.gz
        else
          echo "Downloading Reuters-21578"
          curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
        fi
      fi
      # make sure it was actually downloaded
      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
        echo "Failed to download reuters"
        exit 1
      fi
      mkdir -p ${WORK_DIR}/reuters-sgm
      echo "Extracting..."
      tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
    fi
    echo "Extracting Reuters"
    $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out

    if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
      echo "Copying Reuters data to Hadoop"
      set +e
      $DFSRM ${WORK_DIR}/reuters-sgm
      $DFSRM ${WORK_DIR}/reuters-out
      $DFS -mkdir -p ${WORK_DIR}/
      $DFS -mkdir ${WORK_DIR}/reuters-sgm
      $DFS -mkdir ${WORK_DIR}/reuters-out
      $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
      $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
      set -e
    fi
  fi
  echo "Converting to Sequence Files from Directory"
  $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
fi

if [ "x$clustertype" == "xkmeans" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT kmeans \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
    -c ${WORK_DIR}/reuters-kmeans-clusters \
    -o ${WORK_DIR}/reuters-kmeans \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
    -x 10 -k 20 -ow --clustering \
  && \
  $MAHOUT clusterdump \
    -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
    -o ${WORK_DIR}/reuters-kmeans/clusterdump \
    -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
    -dt sequencefile -b 100 -n 20 --evaluate \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
    --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
  && \
  cat ${WORK_DIR}/reuters-kmeans/clusterdump
elif [ "x$clustertype" == "xfuzzykmeans" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT fkmeans \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
    -c ${WORK_DIR}/reuters-fkmeans-clusters \
    -o ${WORK_DIR}/reuters-fkmeans \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
    -x 10 -k 20 -ow -m 1.1 \
  && \
  $MAHOUT clusterdump \
    -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
    -o ${WORK_DIR}/reuters-fkmeans/clusterdump \
    -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
    -dt sequencefile -b 100 -n 20 -sp 0 \
  && \
  cat ${WORK_DIR}/reuters-fkmeans/clusterdump
elif [ "x$clustertype" == "xlda" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT rowid \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
    -o ${WORK_DIR}/reuters-out-matrix \
  && \
  rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
  && \
  $MAHOUT cvb \
    -i ${WORK_DIR}/reuters-out-matrix/matrix \
    -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
    -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
    -dt ${WORK_DIR}/reuters-lda-topics \
    -mt ${WORK_DIR}/reuters-lda-model \
  && \
  $MAHOUT vectordump \
    -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
    -o ${WORK_DIR}/reuters-lda/vectordump \
    -vs 10 -p true \
    -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
    -dt sequencefile \
    -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
  && \
  cat ${WORK_DIR}/reuters-lda/vectordump
elif [ "x$clustertype" == "xstreamingkmeans" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
  && \
  rm -rf ${WORK_DIR}/reuters-streamingkmeans \
  && \
  $MAHOUT streamingkmeans \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
    --tempDir ${WORK_DIR}/tmp \
    -o ${WORK_DIR}/reuters-streamingkmeans \
    -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
    -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
    -k 10 -km 100 -ow \
  && \
  $MAHOUT qualcluster \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
    -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \
    -o ${WORK_DIR}/reuters-cluster-distance.csv \
  && \
  cat ${WORK_DIR}/reuters-cluster-distance.csv
fi
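As the argument handling above shows, the interactive menu can be skipped: the first argument selects the algorithm by its menu number, and an optional second argument points the script at a locally downloaded archive instead of fetching it again. For example (the archive path is just an illustration):

# pick lda (menu entry 3) without the interactive prompt
examples/bin/cluster-reuters.sh 3

# reuse a previously downloaded copy of the corpus
examples/bin/cluster-reuters.sh 1 ~/Downloads/reuters21578.tar.gz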
As Mahout has evolved from version to version, its algorithm library and bundled examples have become increasingly complete, so we can use one of the bundled scripts to learn how to run clustering.
cluster-reuters.sh downloads the Reuters data set automatically, so we can simply run it. Because the script copies the data into Hadoop, start Hadoop before running it.
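A minimal sketch of that sequence, assuming a stock Hadoop install with HADOOP_HOME set (the start scripts may be laid out differently in your Hadoop version):

# bring up HDFS and YARN first
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh

# then launch the example from the mahout source root
examples/bin/cluster-reuters.sh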
1. Extract the Reuters articles (the "Extracting Reuters" step)
$MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
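ExtractReuters (a Lucene benchmark utility) splits each *.sgm file into one plain-text file per article. A quick way to sanity-check the output; the exact file-name pattern shown below is an assumption based on the utility's usual behavior:

# each article becomes its own .txt file under reuters-out
ls ${WORK_DIR}/reuters-out | head -n 3
# typically prints names like: reut2-000.sgm-0.txt  reut2-000.sgm-1.txt ...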
2. Convert to SequenceFiles (the format Hadoop reads and writes efficiently)
$MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
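The resulting SequenceFile stores the relative file path as the key and the document body as the value. Mahout's seqdumper tool can be used to spot-check the conversion:

# dump the first few key/value pairs of the converted corpus
$MAHOUT seqdumper -i ${WORK_DIR}/reuters-out-seqdir | head -n 20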
3. Convert to vectors
For example, to produce sparse vectors:
$MAHOUT seq2sparse \
  -i ${WORK_DIR}/reuters-out-seqdir/ \
  -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector
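seq2sparse produces several artifacts, but only the TF-IDF vectors and the dictionary are consumed by the later steps. A directory listing makes this visible (subdirectory names here are as produced by typical Mahout 0.x releases):

# tfidf-vectors/ holds the document vectors; dictionary.file-0 maps
# integer term ids back to words for clusterdump/vectordump
$DFS -ls ${WORK_DIR}/reuters-out-seqdir-sparse-lda
# df-count/  dictionary.file-0  frequency.file-0  tf-vectors/
# tfidf-vectors/  tokenized-documents/  wordcount/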
4. Finally, choose one of the clustering algorithms and run it
I chose LDA clustering for my test. LDA (Latent Dirichlet Allocation) is a topic model built on the Dirichlet distribution: it starts from an initial model and iteratively refits it against the corpus until the topics stabilize.
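For choice 3, the script chains three jobs (commands copied from the script above): rowid re-keys the TF-IDF vectors with integer row ids, cvb runs collapsed variational Bayes LDA with 20 topics for up to 20 iterations, and vectordump prints the top 10 terms of each topic:

$MAHOUT rowid \
  -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
  -o ${WORK_DIR}/reuters-out-matrix

$MAHOUT cvb \
  -i ${WORK_DIR}/reuters-out-matrix/matrix \
  -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
  -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
  -dt ${WORK_DIR}/reuters-lda-topics \
  -mt ${WORK_DIR}/reuters-lda-model

$MAHOUT vectordump \
  -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
  -o ${WORK_DIR}/reuters-lda/vectordump \
  -vs 10 -p true \
  -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
  -dt sequencefile \
  -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000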
Running on a pseudo-distributed Hadoop setup, the whole job took roughly ten minutes.
Original article: http://blog.csdn.net/u013571243/article/details/51371727