# Hierarchical Clustering (层级聚类)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from numpy import *

# Code for hierarchical clustering, modified from
# Programming Collective Intelligence by Toby Segaran
# (O'Reilly Media 2007, page 33).

class cluster_node:
    """One node of a binary cluster tree.

    The constructor simply stores its arguments as attributes:

    vec      -- the vector this node stands for
    left     -- left child node, or None
    right    -- right child node, or None
    distance -- distance value recorded on this node (0.0 by default)
    id       -- identifier of the node (None by default)
    count    -- stored for weighted averaging; 1 by default
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):
        # Data carried by the node itself.
        self.vec = vec
        self.id = id
        self.distance = distance
        self.count = count  # only used for weighted average
        # Links to the two sub-trees (both None on a leaf).
        self.left, self.right = left, right

def L2dist(v1, v2):
    """Return the Euclidean (L2) distance between ``v1`` and ``v2``.

    The element-wise difference is squared, summed and square-rooted, so
    the arguments must support subtraction with each other (e.g. two
    arrays of the same shape).
    """
    delta = v1 - v2
    squared_total = sum(delta ** 2)
    return sqrt(squared_total)

def L1dist(v1, v2):
    """Return the L1 (sum of absolute differences) distance of ``v1`` and ``v2``.

    The arguments must support subtraction with each other (e.g. two
    arrays of the same shape).
    """
    offsets = v1 - v2
    return sum(abs(offsets))

# def Chi2dist(v1,v2):
#     return sqrt(sum((v1-v2)**2))

def hcluster(features, distance=L2dist):
    """Cluster the rows of ``features`` into a binary tree, bottom-up.

    Every row starts as its own leaf ``cluster_node`` whose ``id`` is the
    row index.  The two closest clusters (according to ``distance``) are
    repeatedly replaced by a new node whose vector is the plain average of
    the two merged vectors, until a single root remains.

    Parameters:
        features -- sequence of row vectors (anything ``array()`` accepts)
        distance -- callable ``distance(vec_a, vec_b)`` returning a value
                    comparable with ``<``; defaults to ``L2dist``

    Returns:
        The root ``cluster_node``.  Leaves carry ids ``0 .. len(features)-1``;
        merged nodes carry ids ``-1, -2, ...`` in creation order, and their
        ``distance`` attribute is the distance between the two merged nodes.

    Raises:
        IndexError -- if ``features`` is empty (there is no root to return).
    """
    # Cache of pairwise distances keyed by (id, id).  Ids are never reused,
    # so an entry stays valid for as long as both clusters exist.
    distances = {}
    currentclustid = -1

    # Clusters are initially just the individual rows.
    clust = [cluster_node(array(row), id=i) for i, row in enumerate(features)]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = None

        # Loop through every pair looking for the smallest distance.  On
        # ties the first pair encountered wins because the test is strict.
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)

                d = distances[key]
                if closest is None or d < closest:
                    closest = d
                    lowestpair = (i, j)

        left = clust[lowestpair[0]]
        right = clust[lowestpair[1]]

        # The merged cluster sits at the midpoint of the two vectors.
        mergevec = (left.vec + right.vec) / 2.0

        # Create the new cluster, keeping BOTH children so the tree can be
        # walked afterwards.
        newcluster = cluster_node(mergevec, left=left, right=right,
                                  distance=closest, id=currentclustid)

        # Cluster ids that weren't in the original set are negative.
        currentclustid -= 1

        # Remove the merged pair (higher index first so the lower index
        # stays valid) and put the merged node back into the working list.
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]

def extract_clusters(clust, dist):
    """Cut the tree rooted at ``clust`` at the distance threshold ``dist``.

    Walks down from ``clust`` and returns, in left-to-right order, the
    first nodes on each path whose ``distance`` attribute is strictly less
    than ``dist``.  Nodes that do not satisfy the threshold and have no
    children contribute nothing, so the result may be an empty list.

    Parameters:
        clust -- node with ``distance``, ``left`` and ``right`` attributes
        dist  -- threshold compared against ``clust.distance``

    Returns:
        A list of sub-tree root nodes.
    """
    if clust.distance < dist:
        # We have found a cluster sub-tree; do not descend any further.
        return [clust]

    # Otherwise collect the clusters found in the left and right branches.
    found = []
    for child in (clust.left, clust.right):
        if child is not None:
            found.extend(extract_clusters(child, dist=dist))
    return found

def get_cluster_elements(clust):
    """Return the ids of all leaves below ``clust``, left to right.

    A node whose ``id`` is ``>= 0`` is treated as a leaf and yields just
    its own id; any other node is treated as a branch and yields the
    concatenated ids of its non-None ``left`` and ``right`` sub-trees.

    Parameters:
        clust -- node with ``id``, ``left`` and ``right`` attributes

    Returns:
        A list of leaf ids.
    """
    if clust.id >= 0:
        # Non-negative id means that this is a leaf.
        return [clust.id]

    # Branch: gather the ids found in the left and right sub-trees.
    elements = []
    for child in (clust.left, clust.right):
        if child is not None:
            elements.extend(get_cluster_elements(child))
    return elements

def printclust(clust, labels=None, n=0):
    """Print the tree rooted at ``clust`` as an indented outline.

    Each node is printed on its own line, indented two spaces per nesting
    level.  A node with a negative ``id`` is printed as ``-``; any other
    node is printed as ``labels[clust.id]`` when ``labels`` is given, or as
    the id itself otherwise.  Children are printed after their parent,
    left before right.

    Parameters:
        clust  -- node with ``id``, ``left`` and ``right`` attributes
        labels -- optional mapping/sequence indexed by leaf id
        n      -- current nesting level (callers normally leave it at 0)
    """
    if clust.id < 0:
        # Negative id means that this is a branch.
        text = '-'
    elif labels is None:
        # Non-negative id means that this is an endpoint.
        text = clust.id
    else:
        text = labels[clust.id]

    # A single-argument print(...) call is valid in both Python 2 and 3.
    print("%s%s" % ("  " * n, text))

    # Now print the left and right branches one level deeper.
    if clust.left is not None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right is not None:
        printclust(clust.right, labels=labels, n=n + 1)

def getheight(clust):
    """Return the number of endpoints (leaves) below ``clust``.

    A node with neither a left nor a right child counts as 1; any other
    node counts as the sum of the heights of its two branches.
    """
    is_endpoint = clust.left is None and clust.right is None
    if is_endpoint:
        return 1

    left_height = getheight(clust.left)
    right_height = getheight(clust.right)
    return left_height + right_height

def getdepth(clust):
    """Return the accumulated distance from ``clust`` down to its deepest leaf.

    A node with neither a left nor a right child has depth 0; any other
    node has the larger of its two branch depths plus its own
    ``distance``.
    """
    if clust.left is None and clust.right is None:
        return 0

    deepest_branch = max(getdepth(clust.left), getdepth(clust.right))
    return deepest_branch + clust.distance


