朴素贝叶斯法是基于贝叶斯定理与特征条件独立假设的分类方法。对于给定的训练数据集,首先基于特征条件独立假设学习输入/输出的联合概率分布;然后基于此模型,对给定的输入x,利用贝叶斯定理求出后验概率最大的输出y。
进一步学习:
朴素贝叶斯分类器
- 四种实现方式:自己实现、高斯贝叶斯模型、伯努利模型、多项式模型
1 | import numpy as np |
2 | import pandas as pd |
3 | from sklearn.datasets import load_iris |
4 | from sklearn.model_selection import train_test_split |
5 | import math |
6 | |
7 | |
# Load the data
def load_data():
    """Return features and labels for the first 100 rows of the iris data.

    The iris measurements and their targets are combined into one table,
    the first 100 rows are converted to a NumPy array, and that array is
    split column-wise.

    Returns:
        A ``(features, labels)`` pair: every column except the last, and
        the last (label) column, of the 100-row array.
    """
    feature_columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
    iris = load_iris()
    frame = pd.DataFrame(iris.data, columns=feature_columns)
    frame['label'] = iris.target
    # Only the leading 100 rows are kept; the label is the final column.
    subset = np.array(frame.iloc[:100])
    features, labels = subset[:, :-1], subset[:, -1]
    return features, labels
16 | |
17 | |
# Naive Bayes classifier
class NaiveBayes:
    """Gaussian naive Bayes classifier for numeric features.

    For every class, each feature is summarised by its mean and (population)
    standard deviation and treated as an independent normal distribution.
    ``fit`` stores those summaries in ``self.model`` and the empirical class
    frequencies in ``self.priors``; ``predict`` returns the label with the
    largest ``prior * product of per-feature densities``.

    NOTE: scores are plain products of densities, so with a very large
    number of features they can underflow to 0.0.
    """

    # Floor applied to a standard deviation before it is used as a divisor,
    # so a feature that is constant within a class (stdev == 0) no longer
    # raises ZeroDivisionError.
    MIN_STDEV = 1e-9

    def __init__(self):
        # {label: [(mean, stdev), ...]}, one tuple per feature; None until fit().
        self.model = None
        # {label: fraction of training rows carrying that label}; None until fit().
        self.priors = None

    # Arithmetic mean
    def mean(self, X):
        """Return the arithmetic mean of the values in ``X``."""
        return sum(X) / float(len(X))

    # Standard deviation
    def stdev(self, X):
        """Return the population standard deviation (divides by ``len(X)``)."""
        avg = self.mean(X)
        return math.sqrt(sum((x - avg) ** 2 for x in X) / float(len(X)))

    # Gaussian probability density function
    def gaussian_probability(self, x, mean, stdev):
        """Return the normal density at ``x`` for the given ``mean``/``stdev``.

        ``stdev`` is floored at ``MIN_STDEV`` so a zero spread yields a very
        narrow distribution instead of a division by zero.
        """
        stdev = max(stdev, self.MIN_STDEV)
        exponent = math.exp(-((x - mean) ** 2) / (2 * stdev ** 2))
        return exponent / (math.sqrt(2 * math.pi) * stdev)

    # Summarise the training rows of one class
    def summarize(self, train_data):
        """Return ``[(mean, stdev), ...]`` with one entry per feature column."""
        # zip(*rows) transposes the rows into per-feature columns.
        return [(self.mean(column), self.stdev(column)) for column in zip(*train_data)]

    # Compute mean, standard deviation and prior for each class
    def fit(self, X, y):
        """Learn per-class feature summaries and class priors.

        Args:
            X: iterable of feature rows.
            y: iterable of labels, paired with ``X`` row by row.

        Returns:
            The status string ``'GaussianNB train done!'``.
        """
        rows_by_label = {}
        for features, label in zip(X, y):
            rows_by_label.setdefault(label, []).append(features)
        total = sum(len(rows) for rows in rows_by_label.values())
        self.model = {label: self.summarize(rows) for label, rows in rows_by_label.items()}
        # Relative class frequencies supply the P(y) factor of Bayes' rule.
        self.priors = {label: len(rows) / float(total) for label, rows in rows_by_label.items()}
        return 'GaussianNB train done!'

    # Compute the per-class scores
    def calculate_probabilities(self, input_data):
        """Return ``{label: prior * product of feature densities}`` for one sample.

        The values are unnormalised joint scores, not probabilities summing
        to one.  If no priors are available (``self.model`` was assigned by
        hand), every class gets a prior of 1.0.
        """
        # model:      {0.0: [(5.0, 0.37), (3.42, 0.40)], 1.0: [(5.8, 0.449), (2.7, 0.27)]}
        # input_data: [1.1, 2.2]
        priors = getattr(self, 'priors', None) or {}
        probabilities = {}
        for label, summaries in self.model.items():
            score = priors.get(label, 1.0)
            # Index input_data explicitly so a sample with too few features
            # raises IndexError instead of being silently truncated.
            for i, (mean, stdev) in enumerate(summaries):
                score *= self.gaussian_probability(input_data[i], mean, stdev)
            probabilities[label] = score
        return probabilities

    # Predict the class
    def predict(self, X_test):
        """Return the most probable label for the single sample ``X_test``.

        Ties resolve to the label that appeared first in the training data.
        """
        # e.g. {0.0: 2.9680340789325763e-27, 1.0: 3.5749783019849535e-26}
        probabilities = self.calculate_probabilities(X_test)
        return max(probabilities, key=probabilities.get)

    # Evaluate the model
    def score(self, X_test, y_test):
        """Return the fraction of rows in ``X_test`` predicted as in ``y_test``."""
        right = sum(1 for X, y in zip(X_test, y_test) if self.predict(X) == y)
        return right / float(len(X_test))
77 | |
78 | |
if __name__ == '__main__':
    # Load the data and hold out 30% of it for testing
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    one_example = [4.4, 3.2, 1.3, 0.2]

    # Hand-written NaiveBayes classifier
    print('My NaiveBayes')
    model = NaiveBayes()
    model.fit(X_train, y_train)
    print('The label of [4.4, 3.2, 1.3, 0.2] is: ' + str(model.predict(one_example)))
    print('Accuracy in test data set: ' + str(model.score(X_test, y_test)))
    print()

    # NaiveBayes classifiers from the sklearn library
    from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

    sklearn_runs = (
        ("Sklearn's GaussianNB", GaussianNB),        # Gaussian model
        ("Sklearn's BernoulliNB", BernoulliNB),      # Bernoulli model
        ("Sklearn's MultinomialNB", MultinomialNB),  # multinomial model
    )
    for position, (title, estimator) in enumerate(sklearn_runs):
        # A blank line separates consecutive sections; none follows the last.
        if position:
            print()
        print(title)
        clf = estimator()
        clf.fit(X_train, y_train)
        # sklearn expects a 2-D batch, so the single sample is wrapped in a list.
        print('The label of [4.4, 3.2, 1.3, 0.2] is: ' + str(clf.predict([one_example])))
        print('Accuracy in test data set: ' + str(clf.score(X_test, y_test)))
- 实验结果
1 | My NaiveBayes |
2 | The label of [4.4, 3.2, 1.3, 0.2] is: 0.0 |
3 | Accuracy in test data set: 1.0 |
4 | |
5 | Sklearn's GaussianNB |
6 | The label of [4.4, 3.2, 1.3, 0.2] is: [0.] |
7 | Accuracy in test data set: 1.0 |
8 | |
9 | Sklearn's BernoulliNB |
10 | The label of [4.4, 3.2, 1.3, 0.2] is: [0.] |
11 | Accuracy in test data set: 0.4666666666666667 |
12 | |
13 | Sklearn's MultinomialNB |
14 | The label of [4.4, 3.2, 1.3, 0.2] is: [0.] |
15 | Accuracy in test data set: 1.0 |