层次聚类算法的Python实现。
Python实现
1 | import math |
2 | import pylab as pl |
3 | |
4 | |
5 | # 数据处理:得到训练数据集dataset |
def get_dataset():
    """Return the watermelon samples as a list of (density, sugar content) tuples.

    The raw text stores each sample as three comma-separated fields
    (index, density, sugar content); the index field is discarded.
    """
    raw = """
    1,0.697,0.46,2,0.774,0.376,3,0.634,0.264,4,0.608,0.318,5,0.556,0.215,
    6,0.403,0.237,7,0.481,0.149,8,0.437,0.211,9,0.666,0.091,10,0.243,0.267,
    11,0.245,0.057,12,0.343,0.099,13,0.639,0.161,14,0.657,0.198,15,0.36,0.37,
    16,0.593,0.042,17,0.719,0.103,18,0.359,0.188,19,0.339,0.241,20,0.282,0.257,
    21,0.748,0.232,22,0.714,0.346,23,0.483,0.312,24,0.478,0.437,25,0.525,0.369,
    26,0.751,0.489,27,0.532,0.472,28,0.473,0.376,29,0.725,0.445,30,0.446,0.459
    """
    fields = raw.split(',')
    samples = []
    # Each record starts with its index, so the density fields sit at
    # positions 1, 4, 7, ... and the sugar content immediately follows.
    for pos in range(1, len(fields) - 1, 3):
        density = float(fields[pos])
        sugar = float(fields[pos + 1])
        samples.append((density, sugar))
    return samples
18 | |
19 | |
20 | # 计算两个元组的欧几里得距离 |
def dist(a, b):
    """Return the Euclidean distance between the points *a* and *b*.

    Only the first two coordinates of each point are used.
    """
    dx_squared = math.pow(a[0] - b[0], 2)
    dy_squared = math.pow(a[1] - b[1], 2)
    return math.sqrt(dx_squared + dy_squared)
23 | |
24 | |
25 | # 计算两个聚类簇的平均距离 |
def dist_avg(Ci, Cj):
    """Return the average-linkage distance between the clusters *Ci* and *Cj*.

    This is the mean of ``dist(p, q)`` over every pair with ``p`` taken from
    *Ci* and ``q`` taken from *Cj*. An empty cluster leads to a
    ZeroDivisionError.
    """
    total = sum(dist(p, q) for p in Ci for q in Cj)
    pair_count = len(Ci) * len(Cj)
    return total / pair_count
28 | |
29 | |
30 | # 找到距离最小的下标及最小距离 |
def find_Min(M):
    """Locate the smallest off-diagonal entry of the distance matrix *M*.

    Args:
        M: A list of rows, where ``M[i][j]`` is the distance between items
            ``i`` and ``j``. Diagonal entries (``i == j``) are ignored.

    Returns:
        A tuple ``(x, y, smallest)`` holding the row index, column index and
        value of the smallest off-diagonal entry. On ties, the first entry
        met in row-major order wins. When *M* has no off-diagonal entry the
        result is ``(0, 0, float('inf'))``.
    """
    # Start from +inf rather than a fixed cap so arbitrarily large distances
    # are still recognised as candidates.
    smallest = float('inf')
    x = y = 0
    for i, row in enumerate(M):
        for j, value in enumerate(row):
            if i != j and value < smallest:
                smallest, x, y = value, i, j
    return x, y, smallest
42 | |
43 | |
44 | # 算法模型 |
45 | # 参数:样本数据集, 距离计算函数,聚类簇数 |
def _cluster_distances(C, dist):
    """Return the matrix of ``dist(Ci, Cj)`` for every ordered pair of clusters in *C*."""
    return [[dist(ci, cj) for cj in C] for ci in C]


def AGNES(D, dist, k):
    """Cluster the samples in *D* bottom-up (AGNES) until *k* clusters remain.

    Every sample starts as its own cluster; the two closest clusters are
    merged repeatedly until only *k* clusters are left.

    Args:
        D: Iterable of samples.
        dist: Callable taking two clusters (lists of samples) and returning
            the distance between them.
        k: Target number of clusters; must be at least 1.

    Returns:
        A list of clusters, each a list of samples. If *D* holds no more than
        *k* samples, every sample is returned in its own cluster.

    Raises:
        ValueError: If *k* is smaller than 1, or if no pair of distinct
            clusters can be selected for merging.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Start with one singleton cluster per sample.
    C = [[sample] for sample in D]

    while len(C) > k:
        # Distances are recomputed from scratch because a merge changes the
        # distance from the merged cluster to every other cluster.
        M = _cluster_distances(C, dist)
        x, y, _ = find_Min(M)
        if x == y:
            # Merging a cluster with itself would duplicate its samples and
            # then drop them, so refuse instead of corrupting the result.
            raise ValueError("no pair of distinct clusters could be selected for merging")
        C[x].extend(C[y])
        # Delete by position: list.remove() would drop the first cluster that
        # merely compares equal, which is wrong when samples are duplicated.
        del C[y]
    return C
77 | |
78 | |
79 | # 训练结果可视化 |
def draw(C):
    """Show a scatter plot of the clusters in *C*, one colour per cluster.

    The first two items of every point are used as its x and y coordinates.
    Colours are reused cyclically when there are more clusters than palette
    entries, and the legend labels are the 1-based cluster numbers.
    """
    palette = ['r', 'y', 'g', 'b', 'c', 'k', 'm']
    for idx, cluster in enumerate(C):
        xs = [point[0] for point in cluster]
        ys = [point[1] for point in cluster]
        pl.scatter(xs, ys, marker='x', color=palette[idx % len(palette)], label=idx + 1)
    pl.legend(loc='upper left')
    pl.title('AGNES')
    pl.show()
92 | |
93 | |
if __name__ == '__main__':
    # Number of clusters at which merging stops.
    k = 4
    # Load the samples as (density, sugar content) pairs.
    dataset = get_dataset()
    # Merge clusters using the average distance until k remain, then plot them.
    C = AGNES(dataset, dist_avg, k)
    draw(C)