密度聚类,即基于密度的聚类(density-based clustering),此类算法假设聚类结构能通过样本分布的紧密程度确定,可以发现任意形状的聚类,且对带有噪音点的数据起着重要的作用。
DBSCAN算法
DBSCAN(Density-Based Spatial Clustering of Application with Noise)是一种典型的密度聚类算法,能够将足够高密度的区域划分成簇,并能在具有噪声的空间数据库中发现任意形状的簇。该方法基于“邻域”参数(ϵ, MinPts)来刻画样本分布的紧密程度。
DBSCAN的相关概念和原理参考:DBSCAN
Python实现
1 | import math |
2 | import numpy as np |
3 | import pylab as pl |
4 | |
5 | |
6 | # 数据处理:得到训练数据集dataset |
7 | def get_dataset(): |
8 | # 西瓜数据集:每三个一组(编号,密度,含糖量) |
9 | data = """ |
10 | 1,0.697,0.46,2,0.774,0.376,3,0.634,0.264,4,0.608,0.318,5,0.556,0.215, |
11 | 6,0.403,0.237,7,0.481,0.149,8,0.437,0.211,9,0.666,0.091,10,0.243,0.267, |
12 | 11,0.245,0.057,12,0.343,0.099,13,0.639,0.161,14,0.657,0.198,15,0.36,0.37, |
13 | 16,0.593,0.042,17,0.719,0.103,18,0.359,0.188,19,0.339,0.241,20,0.282,0.257, |
14 | 21,0.748,0.232,22,0.714,0.346,23,0.483,0.312,24,0.478,0.437,25,0.525,0.369, |
15 | 26,0.751,0.489,27,0.532,0.472,28,0.473,0.376,29,0.725,0.445,30,0.446,0.459 |
16 | """ |
17 | a = data.split(',') |
18 | return [(float(a[i]), float(a[i+1])) for i in range(1, len(a)-1, 3)] |
19 | |
20 | |
21 | # 计算欧几里得距离:a,b分别为两个元组 |
22 | def dist(a, b): |
23 | return math.sqrt(math.pow(a[0]-b[0], 2) + math.pow(a[1]-b[1], 2)) |
24 | |
25 | |
26 | # 算法模型 |
27 | # 参数:邻域参数e, Minpts,dist距离函数 |
28 | def DBSCAN(D, e, Minpts, dist): |
29 | T = set() # 核心对象集合T |
30 | k = 0 # 聚类个数k |
31 | C = [] # 聚类集合C |
32 | P = set(D) # 未访问集合P |
33 | # 计算核心对象 |
34 | for d in D: |
35 | if len([i for i in D if dist(d, i) <= e]) >= Minpts: |
36 | T.add(d) |
37 | # 开始聚类 |
38 | while len(T): |
39 | P_old = P |
40 | o = list(T)[np.random.randint(0, len(T))] |
41 | P = P - set(o) |
42 | Q = [] |
43 | Q.append(o) |
44 | while len(Q): |
45 | q = Q[0] |
46 | Nq = [i for i in D if dist(q, i) <= e] |
47 | if len(Nq) >= Minpts: |
48 | S = P & set(Nq) |
49 | Q += (list(S)) |
50 | P = P - S |
51 | Q.remove(q) |
52 | k += 1 |
53 | Ck = list(P_old - P) |
54 | T = T - set(Ck) |
55 | C.append(Ck) |
56 | return C, k |
57 | |
58 | |
59 | # 训练结果可视化 |
60 | def draw(C): |
61 | color = ['r', 'y', 'g', 'b', 'c', 'k', 'm'] |
62 | for i in range(len(C)): |
63 | x = [] # x坐标列表 |
64 | y = [] # y坐标列表 |
65 | for j in range(len(C[i])): |
66 | x.append(C[i][j][0]) |
67 | y.append(C[i][j][1]) |
68 | pl.scatter(x, y, marker='x', color=color[i%len(color)], label=i+1) |
69 | pl.legend(loc='upper left') |
70 | pl.title('DBSCAN') |
71 | pl.show() |
72 | |
73 | |
74 | if __name__ == '__main__': |
75 | # 数据处理得到训练数据集 |
76 | dataset = get_dataset() |
77 | # 设置邻域参数 |
78 | e = 0.11 |
79 | Minpts = 5 |
80 | # 密度聚类得到k个聚类簇:以dist函数作为距离度量 |
81 | C, k = DBSCAN(dataset, e, Minpts, dist) |
82 | # 聚类结果展示 |
83 | draw(C) |