
Clustering with Mahout's Bundled cluster-reuters.sh


#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Downloads the Reuters dataset and prepares it for clustering
#
# To run:  change into the mahout directory and type:
#  examples/bin/cluster-reuters.sh

if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
  echo "This script clusters the Reuters data set using a variety of algorithms.  The data set is downloaded automatically."
  exit
fi

SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then 
  cd $SCRIPT_PATH
fi
START_PATH=`pwd`

# Set commands for dfs
source ${START_PATH}/set-dfs-commands.sh

MAHOUT="../../bin/mahout"

if [ ! -e $MAHOUT ]; then
  echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
  exit 1
fi

if [[ -z "$MAHOUT_WORK_DIR" ]]; then
  WORK_DIR=/tmp/mahout-work-${USER}
else
  WORK_DIR=$MAHOUT_WORK_DIR
fi

algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
if [ -n "$1" ]; then
  choice=$1
else
  echo "Please select a number to choose the corresponding clustering algorithm"
  echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" 
  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
  echo "3. ${algorithm[2]} clustering"
  echo "4. ${algorithm[3]} clustering"
  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
  read -p "Enter your choice : " choice
fi

echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
clustertype=${algorithm[$choice-1]}

if [ "x$clustertype" == "xclean" ]; then
  rm -rf $WORK_DIR
  $DFSRM $WORK_DIR
  exit 1
else
  $DFS -mkdir -p $WORK_DIR
  mkdir -p $WORK_DIR
  echo "Creating work directory at ${WORK_DIR}"
fi
if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
  if [ ! -e ${WORK_DIR}/reuters-out ]; then
    if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
        if [ -n "$2" ]; then
          echo "Copying Reuters from local download"
          cp $2 ${WORK_DIR}/reuters21578.tar.gz
        else
          echo "Downloading Reuters-21578"
          curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
        fi
      fi
      # make sure it was actually downloaded
      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
        echo "Failed to download reuters"
        exit 1
      fi
      mkdir -p ${WORK_DIR}/reuters-sgm
      echo "Extracting..."
      tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
    fi
    echo "Extracting Reuters"
    $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
    if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
        echo "Copying Reuters data to Hadoop"
        set +e
        $DFSRM ${WORK_DIR}/reuters-sgm
        $DFSRM ${WORK_DIR}/reuters-out
        $DFS -mkdir -p ${WORK_DIR}/
        $DFS -mkdir ${WORK_DIR}/reuters-sgm
        $DFS -mkdir ${WORK_DIR}/reuters-out
        $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
        $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
        set -e
    fi
  fi
  echo "Converting to Sequence Files from Directory"
  $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
fi

if [ "x$clustertype" == "xkmeans" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT kmeans \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
    -c ${WORK_DIR}/reuters-kmeans-clusters \
    -o ${WORK_DIR}/reuters-kmeans \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
    -x 10 -k 20 -ow --clustering \
  && \
  $MAHOUT clusterdump \
    -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
    -o ${WORK_DIR}/reuters-kmeans/clusterdump \
    -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
    -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
    --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
  && \
  cat ${WORK_DIR}/reuters-kmeans/clusterdump
elif [ "x$clustertype" == "xfuzzykmeans" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT fkmeans \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
    -c ${WORK_DIR}/reuters-fkmeans-clusters \
    -o ${WORK_DIR}/reuters-fkmeans \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
    -x 10 -k 20 -ow -m 1.1 \
  && \
  $MAHOUT clusterdump \
    -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
    -o ${WORK_DIR}/reuters-fkmeans/clusterdump \
    -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
    -dt sequencefile -b 100 -n 20 -sp 0 \
  && \
  cat ${WORK_DIR}/reuters-fkmeans/clusterdump
elif [ "x$clustertype" == "xlda" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT rowid \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
    -o ${WORK_DIR}/reuters-out-matrix \
  && \
  rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
  && \
  $MAHOUT cvb \
    -i ${WORK_DIR}/reuters-out-matrix/matrix \
    -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
    -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
    -dt ${WORK_DIR}/reuters-lda-topics \
    -mt ${WORK_DIR}/reuters-lda-model \
  && \
  $MAHOUT vectordump \
    -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
    -o ${WORK_DIR}/reuters-lda/vectordump \
    -vs 10 -p true \
    -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
    -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
  && \
  cat ${WORK_DIR}/reuters-lda/vectordump
elif [ "x$clustertype" == "xstreamingkmeans" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
  && \
  rm -rf ${WORK_DIR}/reuters-streamingkmeans \
  && \
  $MAHOUT streamingkmeans \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
    --tempDir ${WORK_DIR}/tmp \
    -o ${WORK_DIR}/reuters-streamingkmeans \
    -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
    -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
    -k 10 -km 100 -ow \
  && \
  $MAHOUT qualcluster \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
    -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \
    -o ${WORK_DIR}/reuters-cluster-distance.csv \
  && \
  cat ${WORK_DIR}/reuters-cluster-distance.csv
fi


As Mahout has matured from release to release, its algorithm library and bundled examples have become increasingly complete, so we can use one of the bundled scripts to learn how clustering works.

cluster-reuters.sh downloads the Reuters dataset automatically, so we can simply run it. The script also copies the data to Hadoop for us, so start Hadoop before running it.
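
As the argument handling at the top of the script shows, it accepts two optional arguments: the menu choice (1-5) and a path to a locally downloaded copy of the archive. A typical non-interactive run, assuming you start from the Mahout source root (the paths below are just examples), looks like this:

# Optional: override the work directory (defaults to /tmp/mahout-work-$USER)
export MAHOUT_WORK_DIR=/tmp/mahout-work-$USER

# Choice 3 selects LDA from the menu; the dataset is downloaded automatically
examples/bin/cluster-reuters.sh 3

# If you already have the archive, pass it as the second argument to skip the download
examples/bin/cluster-reuters.sh 3 ~/reuters21578.tar.gz

# Choice 5 cleans up the work area locally and on HDFS
examples/bin/cluster-reuters.sh 5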

1. Extract the Reuters files

$MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
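
ExtractReuters splits each SGML file into one plain-text file per article. To spot-check the result before moving on (the file name below is illustrative; your exact names may differ), something like this works:

ls ${WORK_DIR}/reuters-out | head -5
cat ${WORK_DIR}/reuters-out/reut2-000.sgm-0.txt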

2. Convert to SequenceFiles (for convenient reading and writing on Hadoop)

$MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
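
SequenceFiles are binary, so they cannot be read directly; Mahout's seqdumper utility prints the key/value pairs (here, file names and document text). A quick sanity check, assuming the local non-HDFS layout:

# Dump the first few records to the console to verify the conversion
$MAHOUT seqdumper -i ${WORK_DIR}/reuters-out-seqdir | head -20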

3. Convert to vectors

For example, convert the SequenceFiles to sparse TF-IDF vectors:

$MAHOUT seq2sparse \
  -i ${WORK_DIR}/reuters-out-seqdir/ \
  -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector
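
seq2sparse writes several sub-directories; the ones the later steps consume are tfidf-vectors (the document vectors) and dictionary.file-0 (the term-to-id mapping). To see what was produced, using the $DFS alias the script sources from set-dfs-commands.sh:

$DFS -ls ${WORK_DIR}/reuters-out-seqdir-sparse-lda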

4. Run the chosen clustering algorithm

I chose LDA (Latent Dirichlet Allocation) clustering for my test. Like other Dirichlet-based methods, it starts from an initial model and iteratively refits that model to the data until the topics converge.
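
For reference, here is the cvb invocation from the script again, with the key parameters annotated (my reading of the flags, not authoritative documentation):

# cvb is Mahout's Collapsed Variational Bayes implementation of LDA
#   -i     document/term matrix produced by the rowid step
#   -k 20  number of topics to learn
#   -x 20  maximum number of iterations
#   -dict  dictionary mapping term ids back to words
#   -dt    output: per-document topic distributions
#   -mt    working directory holding the intermediate model state
$MAHOUT cvb \
  -i ${WORK_DIR}/reuters-out-matrix/matrix \
  -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
  -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
  -dt ${WORK_DIR}/reuters-lda-topics \
  -mt ${WORK_DIR}/reuters-lda-model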

On my pseudo-distributed setup, the whole run took roughly ten minutes.


Original article: http://blog.csdn.net/u013571243/article/details/51371727
