密度聚类算法

密度聚类,即基于密度的聚类(density-based clustering),此类算法假设聚类结构能通过样本分布的紧密程度确定,可以发现任意形状的聚类,且对带有噪音点的数据起着重要的作用。

DBSCAN算法

DBSCAN(Density-Based Spatial Clustering of Application with Noise)是一种典型的密度聚类算法,能够将足够高密度的区域划分成簇,并能在具有噪声的空间数据库中发现任意形状的簇。该方法基于“邻域”参数(ϵ, MinPts)来刻画样本分布的紧密程度。

DBSCAN的相关概念和原理参考:DBSCAN

Python实现

1
import math
2
import numpy as np
3
import pylab as pl
4
5
6
# 数据处理:得到训练数据集dataset
7
def get_dataset():
8
    # 西瓜数据集:每三个一组(编号,密度,含糖量)
9
    data = """
10
    1,0.697,0.46,2,0.774,0.376,3,0.634,0.264,4,0.608,0.318,5,0.556,0.215,
11
    6,0.403,0.237,7,0.481,0.149,8,0.437,0.211,9,0.666,0.091,10,0.243,0.267,
12
    11,0.245,0.057,12,0.343,0.099,13,0.639,0.161,14,0.657,0.198,15,0.36,0.37,
13
    16,0.593,0.042,17,0.719,0.103,18,0.359,0.188,19,0.339,0.241,20,0.282,0.257,
14
    21,0.748,0.232,22,0.714,0.346,23,0.483,0.312,24,0.478,0.437,25,0.525,0.369,
15
    26,0.751,0.489,27,0.532,0.472,28,0.473,0.376,29,0.725,0.445,30,0.446,0.459
16
    """
17
    a = data.split(',')
18
    return [(float(a[i]), float(a[i+1])) for i in range(1, len(a)-1, 3)]
19
20
21
# 计算欧几里得距离:a,b分别为两个元组
22
def dist(a, b):
23
    return math.sqrt(math.pow(a[0]-b[0], 2) + math.pow(a[1]-b[1], 2))
24
25
26
# 算法模型
27
# 参数:邻域参数e, Minpts,dist距离函数
28
def DBSCAN(D, e, Minpts, dist):
29
    T = set()   # 核心对象集合T
30
    k = 0       # 聚类个数k
31
    C = []      # 聚类集合C
32
    P = set(D)  # 未访问集合P
33
    # 计算核心对象
34
    for d in D:
35
        if len([i for i in D if dist(d, i) <= e]) >= Minpts:
36
            T.add(d)
37
    # 开始聚类
38
    while len(T):
39
        P_old = P
40
        o = list(T)[np.random.randint(0, len(T))]
41
        P = P - set(o)
42
        Q = []
43
        Q.append(o)
44
        while len(Q):
45
            q = Q[0]
46
            Nq = [i for i in D if dist(q, i) <= e]
47
            if len(Nq) >= Minpts:
48
                S = P & set(Nq)
49
                Q += (list(S))
50
                P = P - S
51
            Q.remove(q)
52
        k += 1
53
        Ck = list(P_old - P)
54
        T = T - set(Ck)
55
        C.append(Ck)
56
    return C, k
57
58
59
# 训练结果可视化
60
def draw(C):
61
    color = ['r', 'y', 'g', 'b', 'c', 'k', 'm']
62
    for i in range(len(C)):
63
        x = []    # x坐标列表
64
        y = []    # y坐标列表
65
        for j in range(len(C[i])):
66
            x.append(C[i][j][0])
67
            y.append(C[i][j][1])
68
        pl.scatter(x, y, marker='x', color=color[i%len(color)], label=i+1)
69
    pl.legend(loc='upper left')
70
    pl.title('DBSCAN')
71
    pl.show()
72
73
74
if __name__ == '__main__':
75
    # 数据处理得到训练数据集
76
    dataset = get_dataset()
77
    # 设置邻域参数
78
    e = 0.11
79
    Minpts = 5
80
    # 密度聚类得到k个聚类簇:以dist函数作为距离度量
81
    C, k = DBSCAN(dataset, e, Minpts, dist)
82
    # 聚类结果展示
83
    draw(C)

聚类结果