diff --git a/assignment-3/handout/source.py b/assignment-3/handout/source.py
index 43aa5bb327e9a34a31b8c553af7cd9104acdd220..5cdb02368d31afb3cdf967cc26eec7b132e4b9f7 100644
--- a/assignment-3/handout/source.py
+++ b/assignment-3/handout/source.py
@@ -1,34 +1,37 @@
 import numpy as np
 
+
 class KMeans:
     def __init__(self, n_clusters):
         pass
-    
+
     def fit(self, train_data):
         pass
-    
+
     def predict(self, test_data):
         pass
 
+
 class GaussianMixture:
     def __init__(self, n_clusters):
         pass
-    
+
     def fit(self, train_data):
         pass
-    
+
     def predict(self, test_data):
         pass
 
+
 class ClusteringAlgorithm:
     def __init__(self):
         pass
-    
+
     def fit(self, train_data):
         pass
-    
+
     def predict(self, test_data):
-        pass
+        pass
\ No newline at end of file
diff --git a/assignment-3/handout/tester_demo.py b/assignment-3/handout/tester_demo.py
index 19ec0e8091691d4aaaa6b53dbb695fde9e826d89..a569dddc62de7f598dcc986179caedd842e44b77 100644
--- a/assignment-3/handout/tester_demo.py
+++ b/assignment-3/handout/tester_demo.py
@@ -114,4 +114,4 @@ if __name__ == "__main__":
     if len(sys.argv) > 1 and sys.argv[1] == "--report":
         test_all(True)
     else:
-        test_all()
+        test_all()
\ No newline at end of file
diff --git a/assignment-3/submission/19307110020/README.md b/assignment-3/submission/19307110020/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f708b655d91352c64baa33d2ba2ddb4fb84d70dd
--- /dev/null
+++ b/assignment-3/submission/19307110020/README.md
@@ -0,0 +1,210 @@
+# HW3 - Clustering Algorithms
+
+## 19307110020 - 贺正夫
+
+[TOC]
+
+### 1. Overview
+
+- This assignment implements K-means++ and a Gaussian mixture model fitted with the EM algorithm for unsupervised learning.
+- Experiments and visualizations are carried out across different dimensionalities and sample sizes of different orders of magnitude, verifying the correctness of the algorithms and their robustness to the input data.
+- An algorithm that automatically searches for the optimal number of clusters is also implemented.
+
+### 2. Model description
+
+Two clustering algorithms are implemented. The K-means algorithm uses K-means++ initialization; the remaining steps are standard and are not elaborated here.
+
+The GMM is also standard, except that the covariance matrices have to be initialized and there is no obvious way to do so accurately. Some classmates argue that an accurate covariance initialization is unnecessary; I have not studied probability theory systematically, but a reasonable initialization can hardly hurt. Each dimension is therefore min-max normalized independently and the identity matrix is used as the initial covariance, which stays as close as possible to the (scaled) data distribution without affecting the clustering result; the scaling is undone for visualization. The EM derivation is as follows:
+
+#### Initialization
+
+The means of the K Gaussian components are initialized with K-means++, and every component receives the same prior.
+
+#### E-step
+
+When computing the probabilities, the posterior is first evaluated per component and then normalized per data point.
+
+#### M-step
+
+First compute the total responsibility assigned to each component:
+
+$$
+n_k = \sum_{x \in C} \gamma_k(x)
+$$
+
+The mean of component $k$ is updated as
+
+$$
+\mu_k = \sum_{x\in C} x\,\gamma_k(x) / n_k
+$$
+
+and its covariance is updated as
+
+$$
+\Sigma_k = \sum_{x\in C} (x - \mu_k)(x - \mu_k)^T\,\gamma_k(x) / n_k
+$$
+
+where $\gamma_k(x)$ denotes the normalized posterior (responsibility) of component $k$ for point $x$, and the subtraction and multiplication are the usual numpy broadcasted operations.
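+
+The following is a minimal numpy sketch of one EM iteration implementing exactly the updates above. It is an illustration only: `gaussian_pdf` and `em_step` are names chosen for this sketch, and the submitted `source.py` uses a more robust eigendecomposition-based density instead.
+
+```python
+import numpy as np
+
+def gaussian_pdf(X, mean, cov):
+    """Density of N(mean, cov) evaluated at every row of X."""
+    d = X.shape[1]
+    diff = X - mean
+    maha = np.einsum('ni,ij,nj->n', diff, np.linalg.inv(cov), diff)
+    norm = np.sqrt((2 * np.pi) ** d * np.linalg.det(cov))
+    return np.exp(-0.5 * maha) / norm
+
+def em_step(X, means, covs, alpha):
+    """One EM iteration; means is (K, D), covs is (K, D, D), alpha is (K,)."""
+    N, K = X.shape[0], means.shape[0]
+    # E-step: posterior per component, then normalize per point
+    gamma = np.stack([alpha[k] * gaussian_pdf(X, means[k], covs[k]) for k in range(K)], axis=1)
+    gamma /= gamma.sum(axis=1, keepdims=True)
+    # M-step: n_k, then mu_k, Sigma_k and the mixing weights
+    nk = gamma.sum(axis=0)
+    means = (gamma.T @ X) / nk[:, None]
+    covs = np.empty_like(covs)
+    for k in range(K):
+        bias = X - means[k]
+        covs[k] = (bias * gamma[:, k:k + 1]).T @ bias / nk[k]
+    return means, covs, nk / N
+```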
+
+### 3. Basic experiments
+
+The following experiments were run for the two algorithms above.
+
+#### Experiment 1: basics
+
+Basic setup:
+
+**Number of clusters: 3**
+
+| Source data | Mean     | Covariance             |
+| ----------- | -------- | ---------------------- |
+| Cluster 1   | (1, 2)   | [[73, 0], [0, 22]]     |
+| Cluster 2   | (16, -5) | [[21.2, 0], [0, 32.1]] |
+| Cluster 3   | (10, 22) | [[10, 5], [5, 10]]     |
+
+| K-means result | K-means center | Ground truth |
+| -------------- | -------------- | ------------ |
+| Cluster 1      | [-2.7, 2.2]    | (1, 2)       |
+| Cluster 2      | [13.4, -2.0]   | (16, -5)     |
+| Cluster 3      | [10.0, 21.9]   | (10, 22)     |
+
+This already meets the requirement reasonably well.
+
+![Figure_1](img/Figure_1.png)
+
+| GMM result | GMM center   | Ground truth |
+| ---------- | ------------ | ------------ |
+| Cluster 1  | [1.0, 1.9]   | (1, 2)       |
+| Cluster 2  | [15.7, -6.0] | (16, -5)     |
+| Cluster 3  | [9.9, 22.0]  | (10, 22)     |
+
+| GMM result | GMM covariance               | Ground truth           |
+| ---------- | ---------------------------- | ---------------------- |
+| Cluster 1  | [[66.0, 3.9], [3.9, 21.9]]   | [[73, 0], [0, 22]]     |
+| Cluster 2  | [[28.9, -4.6], [-4.6, 34.4]] | [[21.2, 0], [0, 32.1]] |
+| Cluster 3  | [[10.1, 4.3], [4.3, 9.3]]    | [[10, 5], [5, 10]]     |
+
+Clearly, GMM recovers more accurate cluster centers. K-means offers no covariance estimate to compare against, but the GMM covariances also match the true covariances closely.
+
+![Figure_2](img/Figure_2.png)
+
+#### Experiment 2: more clusters, more irregular Gaussians
+
+Basic setup:
+
+**Number of clusters: 5**
+
+| Source data | Mean       | Covariance             |
+| ----------- | ---------- | ---------------------- |
+| Cluster 1   | (1, 2)     | [[73, 0], [0, 22]]     |
+| Cluster 2   | (16, -5)   | [[21.2, 0], [0, 32.1]] |
+| Cluster 3   | (10, 22)   | [[10, 5], [5, 10]]     |
+| Cluster 4   | (-2, 18)   | [[10, 5], [5, 10]]     |
+| Cluster 5   | (-15, -10) | [[22, 0], [0, 88]]     |
+
+| K-means result | K-means center | Ground truth | Remarks                                   |
+| -------------- | -------------- | ------------ | ----------------------------------------- |
+| Cluster 1      | [-9.75, 0.0]   | (1, 2)       | Hard to match to any ground-truth component |
+| Cluster 2      | [10.3, -1.3]   | (16, -5)     | Hard to match to any ground-truth component |
+| Cluster 3      | [9.9, 22.0]    | (10, 22)     |                                           |
+| Cluster 4      | [-1.7, 15.7]   | (-2, 18)     |                                           |
+| Cluster 5      | [-15.4, -15.7] | (-15, -10)   |                                           |
+
+Here K-means performs poorly, and the reason is clear: the data distribution is very irregular and the Gaussian components have very different shapes, so a purely nearest-distance assignment struggles to find suitable centers. The data are generated probabilistically, but nearest-distance assignment is not: it implicitly assumes every component's covariance is a multiple of the identity. In regions where a component has low sampling density, that component loses its claim on the points there; a closer cluster center takes them instead, producing many wrong assignments.
+
+As the figure below shows, each cluster's Gaussian contour is close to circular, which is a major weakness of K-means on Gaussians of differing shapes.
+
+![Figure_4](img/Figure_4.png)
+
+| GMM result | GMM center     | Ground truth |
+| ---------- | :------------- | ------------ |
+| Cluster 1  | [-2.1, 18.0]   | (-2, 18)     |
+| Cluster 2  | [-1.0, 1.8]    | (1, 2)       |
+| Cluster 3  | [15.4, -3.4]   | (16, -5)     |
+| Cluster 4  | [10.0, 22.1]   | (10, 22)     |
+| Cluster 5  | [-15.0, -11.5] | (-15, -10)   |
+
+These estimates are quite accurate; matching them to the ground-truth centers poses no difficulty.
+
+| GMM result | GMM covariance               | Ground truth           |
+| ---------- | :--------------------------- | ---------------------- |
+| Cluster 1  | [[8.5, 4.6], [4.6, 10.8]]    | [[10, 5], [5, 10]]     |
+| Cluster 2  | [[68.7, 4.9], [4.9, 27.5]]   | [[73, 0], [0, 22]]     |
+| Cluster 3  | [[21.8, -2.9], [-2.9, 33.8]] | [[21.2, 0], [0, 32.1]] |
+| Cluster 4  | [[11.3, 5.4], [5.4, 9.8]]    | [[10, 5], [5, 10]]     |
+| Cluster 5  | [[21.7, 0.2], [0.2, 75.6]]   | [[22, 0], [0, 88]]     |
+
+These are also very accurate. On two-dimensional data the algorithm clusters with high precision, even when the Gaussians of different latent components overlap considerably.
+
+![Figure_3](img/Figure_3.png)
+
+#### Experiment 3: robustness with very little data
+
+For ease of visualization this experiment uses 2-D data; the same behaviour holds in higher dimensions, e.g. test case 2 in the tester.
+
+**Number of clusters: 2**
+
+Only three points are given, two close together and one far away. No statistics are needed here; the figures below show the result directly:
+
+![Figure_5](img/Figure_5.png)
+
+![Figure_6](img/Figure_6.png)
+
+The Gaussian of the red cluster does not show up in the plot, because its scale has collapsed to the order of 1e-150 (the usual degenerate behaviour when a component is responsible for essentially a single point: its covariance shrinks towards zero). The clustering itself is nevertheless correct.
+
+#### Experiment 4: ignoring outliers
+
+Some data clearly split into K clusters, but real data are not always clean and may contain outliers; this part explores whether the algorithms can ignore them.
+
+The main data are those of Experiment 1, with the outliers (100, 100), (100, -100), (-100, 100), (-100, -100) added by hand. The results are as follows:
+
+![Figure_7](img/Figure_7.png)
+
+![Figure_8](img/Figure_8.png)
+
+Clearly, four points are enough to make both clustering algorithms fail badly. The GMM plot looks entirely blue because the Gaussian of the red cluster has become so large that it covers coordinates of 3000-4000. This is obviously not what we want.
+
+The K-means++ initialization is partly to blame, which motivates the Kmeans++++ variant proposed in this assignment.
+
+### 4. Kmeans++++
+
+While K-means++ picks the remaining K-1 centers one by one, any point whose distance is clearly more than `threshold` times the mean distance is excluded from being chosen as a center: its distance (and hence its sampling probability) is set to 0. The code, as in `source.py`, is as follows:
+
+```python
+tmpmean = distances.mean()
+if self.quadra_plus_threshold is not None:
+    for i in range(distances.shape[0]):
+        if distances[i] > self.quadra_plus_threshold * tmpmean:
+            distances[i] = 0.
+```
+
+Compared with plain K-means++, Kmeans++++ behaves as follows:
+
+![Figure_9](img/Figure_9.png)
+
+It is no longer affected by the outliers and returns to the normal clustering behaviour.
+
+For GMM, however, this change does not help much, because the maximization step is still affected by the outliers. After much thought I did not find a fix; the outliers would apparently have to be removed in a preprocessing step, which I left out due to time constraints.
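+
+As a usage illustration (not one of the experiments above): passing `quadra_plus_threshold=None` switches the filter off, which makes it easy to compare plain K-means++ with Kmeans++++ on the Experiment 4 data. The per-cluster sample size below is an assumption made only for this sketch.
+
+```python
+import numpy as np
+from source import KMeans  # this submission's implementation
+
+# Data in the spirit of Experiment 1, plus the four hand-added outliers.
+rng = np.random.default_rng(0)
+clean = np.vstack([
+    rng.multivariate_normal((1, 2), [[73, 0], [0, 22]], 300),
+    rng.multivariate_normal((16, -5), [[21.2, 0], [0, 32.1]], 300),
+    rng.multivariate_normal((10, 22), [[10, 5], [5, 10]], 300),
+])
+outliers = np.array([[100, 100], [100, -100], [-100, 100], [-100, -100]], dtype=float)
+data = np.vstack([clean, outliers])
+
+plain = KMeans(n_cluster=3, quadra_plus_threshold=None).fit(data)  # plain K-means++ seeding
+robust = KMeans(n_cluster=3).fit(data)                             # Kmeans++++, default threshold 10
+print("K-means++  centers:", plain.centers)
+print("Kmeans++++ centers:", robust.centers)
+```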
+
+### 5. Automatically choosing K
+
+The elbow method is a heuristic, and reading the elbow off a plot is subjective, especially in ambiguous cases. This report therefore uses the Gap Statistics method proposed at Stanford University in 2000. The idea is as follows:
+
+$$
+Gap(K) = E^*\left[\log D_K\right] - \log D_K \\
+\text{where}\quad D_K = \sum_{i=1}^{K}\sum_{x\in C_i}\lVert x - M_i\rVert^2
+$$
+
+Here $M_i$ is the center of the $i$-th cluster, $D_K$ is the same within-cluster dispersion used by the elbow method, and $E^*$ denotes the expectation of the same quantity over reference data sets drawn uniformly from the bounding box of the data.
+
+Intuitively, the elbow method looks for the K at which $D_K$ drops fastest, but "fastest" has to be judged by eye. Gap Statistics instead introduces a reference quantity that decreases smoothly as K grows, against which the decrease of $D_K$ can be compared; the gap therefore measures how strong an elbow each K is, and taking the argmax gives a suitable K.
+
+See the code and its comments for the implementation details.
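+
+A compact sketch of this procedure is shown below; it mirrors `ClusteringAlgorithm.fit` in `source.py`, while `log_dispersion`, `gap_for_k` and `n_ref` are names used only in this sketch.
+
+```python
+import numpy as np
+from source import KMeans  # this submission's K-means++
+
+def log_dispersion(data, k):
+    """log D_K: log within-cluster sum of squared distances of a k-cluster K-means fit."""
+    model = KMeans(k).fit(data)
+    labels = model.predict(data)
+    d_k = sum(np.sum((data[labels == i] - model.centers[i]) ** 2) for i in range(k))
+    return np.log(d_k)
+
+def gap_for_k(data, k, n_ref=10):
+    """Gap(K): mean log-dispersion of uniform reference data minus the actual log-dispersion."""
+    lo, hi = data.min(axis=0), data.max(axis=0)
+    refs = [np.random.uniform(lo, hi, size=data.shape) for _ in range(n_ref)]
+    return np.mean([log_dispersion(r, k) for r in refs]) - log_dispersion(data, k)
+
+# best_k = max(range(2, 10), key=lambda k: gap_for_k(data, k))
+```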
+
+Below is the result on a simple example: ![Figure_10](img/Figure_10.png)
+
+Below is the result on the harder test case 1: ![Figure_11](img/Figure_11.png)
+
+### 6. Reflections
+
+This is the last assignment. I feel I have learned a lot about finding and solving problems, but my coding skills still leave much to be desired: the last experiment took so long to run that one could almost have trained a small model in that time... and my grasp of probability theory is also lacking. I hope to keep practicing.
diff --git a/assignment-3/submission/19307110020/img/Figure_1.png b/assignment-3/submission/19307110020/img/Figure_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..43f21a7ef70ede47b53b4494e1a238902c7bb899
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_1.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_10.png b/assignment-3/submission/19307110020/img/Figure_10.png
new file mode 100644
index 0000000000000000000000000000000000000000..1779c81a8ab7b6cf3f5d9a25fb369dece13052cd
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_10.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_11.png b/assignment-3/submission/19307110020/img/Figure_11.png
new file mode 100644
index 0000000000000000000000000000000000000000..6d8d181b72d6a136562b629e75de7532f816b0bd
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_11.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_2.png b/assignment-3/submission/19307110020/img/Figure_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..2646c4a49a909fabb1abae549a2e770cf983b2a6
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_2.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_3.png b/assignment-3/submission/19307110020/img/Figure_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..38ea0a5f1655358633cb801b09e276651e536bb9
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_3.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_4.png b/assignment-3/submission/19307110020/img/Figure_4.png
new file mode 100644
index 0000000000000000000000000000000000000000..87ec588d33c78305784e7996f13c352ec73de5ec
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_4.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_5.png b/assignment-3/submission/19307110020/img/Figure_5.png
new file mode 100644
index 0000000000000000000000000000000000000000..0a7643a449a8f19f28d367a6ea50b28d9c646eb5
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_5.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_6.png b/assignment-3/submission/19307110020/img/Figure_6.png
new file mode 100644
index 0000000000000000000000000000000000000000..7376955f3755f3f8c414702644ce7eab9f8db269
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_6.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_7.png b/assignment-3/submission/19307110020/img/Figure_7.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a10faaddcb477cbc78ff4bff89da21bbd326116
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_7.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_8.png b/assignment-3/submission/19307110020/img/Figure_8.png
new file mode 100644
index 0000000000000000000000000000000000000000..375c2122e3ac1bca915337b4ec8a51a5866c1bfb
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_8.png differ
diff --git a/assignment-3/submission/19307110020/img/Figure_9.png b/assignment-3/submission/19307110020/img/Figure_9.png
new file mode 100644
index 0000000000000000000000000000000000000000..4bbe40d2ec4322239b17fa51c924797a4a70bd3e
Binary files /dev/null and b/assignment-3/submission/19307110020/img/Figure_9.png differ
diff --git a/assignment-3/submission/19307110020/source.py b/assignment-3/submission/19307110020/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f46e7e5e3717ce07837a661b57b3592b4d10fe
--- /dev/null
+++ b/assignment-3/submission/19307110020/source.py
@@ -0,0 +1,201 @@
+import numpy as np
+
+
+def calculate_distance(x1, x2):
+    # mean absolute coordinate difference, used as the point-to-point distance throughout
+    return np.mean(np.sqrt((x1 - x2) ** 2))
+
+
+class KMeans:
+    def __init__(self, n_cluster, epochs=10, quadra_plus_threshold=10):  # class initialization
+        self.n_cluster = n_cluster
+        self.epochs = epochs
+        self.centers = []
+        self.quadra_plus_threshold = quadra_plus_threshold
+
+    def fit(self, train_data):
+        def find_nearest_centroid(point):
+            """
+            Find the nearest existing center and return the distance to it.
+            :param point: query point
+            :return: distance to the nearest current center
+            """
+            res = calculate_distance(point, centroid_list[0])
+            for _ in range(1, len(centroid_list)):
+                res = min(res, calculate_distance(point, centroid_list[_]))
+            return res
+
+        # Randomly initialize the first centroid.
+        points_list = list(train_data)
+        centroid_list = []
+        index = np.random.randint(0, len(points_list))
+        centroid_list.append(points_list[index])
+        del points_list[index]
+
+        # Choose the remaining cluster centers (K-means++ seeding).
+        # The sampling weight of each point is its squared distance to the nearest current center.
+        for _ in range(self.n_cluster - 1):
+            distances = [find_nearest_centroid(__) for __ in points_list]
+            distances = np.array([x ** 2 for x in distances])
+            tmpmean = distances.mean()
+            if self.quadra_plus_threshold is not None:  # filter outliers ("Kmeans++++")
+                for i in range(distances.shape[0]):
+                    if distances[i] > self.quadra_plus_threshold * tmpmean:
+                        distances[i] = 0.
+            weighted_list = distances / np.sum(distances)
+            chosen_index = np.random.choice(np.arange(len(distances)), p=weighted_list)
+            centroid_list.append(points_list[chosen_index])
+            del points_list[chosen_index]
+        self.centers = centroid_list
+
+        # Ordinary K-means iterations:
+        # assign each point to a cluster, then move each center to the mean of its assigned points.
+        for epoch in range(self.epochs):
+            self.predict_cluster = self.predict(train_data)
+            for ct in range(len(self.centers)):
+                idx, = np.where(self.predict_cluster == ct)
+                samples = train_data[idx, :]
+                self.centers[ct] = np.mean(samples, axis=0)
+
+        return self
+
+    def predict(self, test_data):
+        """
+        Predict the cluster of each point according to its nearest centroid.
+        :return: predicted cluster labels
+        """
+        predict_cluster = np.zeros(shape=(len(test_data),))
+        for n, arr in enumerate(test_data):
+            predict_cluster[n] = np.argmin([calculate_distance(arr, _) for _ in self.centers])  # nearest center
+        return predict_cluster
+
+
+def scale_data(data):
+    """
+    Min-max standardization of the input data (performed in place, column by column).
+    :param data: np.array
+    :return: min-max standardized data
+    """
+    for i in range(data.shape[1]):
+        max_ = data[:, i].max()
+        min_ = data[:, i].min()
+        data[:, i] = (data[:, i] - min_) / (max_ - min_)
+    return data
+
+
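+# Note: the density below is computed from the eigendecomposition of the covariance,
+# using a pseudo-inverse (small eigenvalues are dropped and large inverse eigenvalues are
+# capped at 1e8), so near-singular covariances -- e.g. a component that collapses onto
+# very few points -- still yield a finite density instead of a numerical error.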
+def normal(x, mean, cov):
+    """
+    :param x: query data
+    :param mean: mean of the given normal distribution
+    :param cov: covariance matrix of the given normal distribution
+    :return: pdf value at each row of x
+    """
+    s, u = np.linalg.eigh(cov)
+    t = s.dtype.char.lower()
+    eps = 1E6 * np.finfo(t).eps * np.max(abs(s))
+    s_pinv = np.array([0 if abs(x) <= eps else min(1/x, 1e8) for x in s], dtype=float)
+    d = s[s > eps]
+    maha = np.sum(np.square(np.dot((x - mean), np.multiply(u, np.sqrt(s_pinv)))), axis=-1)
+    return np.exp(-0.5 * (len(d) * np.log(2 * np.pi) + np.sum(np.log(d)) + maha))
+
+
+class GaussianMixture:
+    def __init__(self, n_cluster, epochs=200):
+        self.n_cluster = n_cluster
+        self.p = np.random.rand(self.n_cluster)
+        self.p = self.p / self.p.sum()  # all p's sum up to 1
+        self.epochs = epochs
+
+    def fit(self, train_data):
+        self.train_data = scale_data(train_data)
+        self.N, self.D = train_data.shape
+        self.means = np.array(KMeans(self.n_cluster).fit(train_data).centers)  # K-means++ initialization
+        self.covs = np.array([np.eye(self.D)] * self.n_cluster)
+        self.alpha = np.ones(self.n_cluster) / self.n_cluster  # identical priors
+        for epoch in range(self.epochs):  # iterative optimization
+            self.e_step()
+            self.m_step()
+        return self
+
+    def e_step(self):
+        """
+        ordinary expectation step: responsibilities per component, normalized per point
+        """
+        self.gamma = np.zeros((self.N, self.n_cluster))
+        for k in range(self.n_cluster):
+            self.gamma[:, k] = self.alpha[k] * normal(self.train_data, self.means[k], self.covs[k])
+        for i in range(self.N):
+            self.gamma[i, :] /= np.sum(self.gamma[i, :])
+
+    def m_step(self):
+        """
+        ordinary maximization step: update means, covariances and mixing weights
+        """
+        for k in range(self.n_cluster):
+            nk = np.sum(self.gamma[:, k])
+            self.means[k, :] = np.sum(self.train_data * self.gamma[:, k].reshape(-1, 1), axis=0) / nk
+            bias = self.train_data - self.means[k]
+            self.covs[k] = np.matmul(bias.T, (bias * self.gamma[:, k].reshape(-1, 1))) / nk
+            self.alpha[k] = nk / self.N
+
+    def predict(self, test_data):
+        """
+        :return: predicted labels from the learned mixing weights and component densities
+        """
+        test_data = scale_data(test_data)
+        density = np.empty((test_data.shape[0], self.n_cluster))
+        for i in range(self.n_cluster):
+            density[:, i] = normal(test_data, self.means[i], self.covs[i])
+        posterior = density * self.alpha  # use the learned mixing weights as the prior
+        predicted = np.argmax(posterior, axis=1)
+        return predicted
+
+
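+# ClusteringAlgorithm chooses the number of clusters automatically with the gap statistic
+# described in the README: for each candidate K it compares the log-dispersion of the data
+# with the mean log-dispersion of uniformly sampled reference data sets, and the K with
+# the largest gap is used to fit the final KMeans model.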
+class ClusteringAlgorithm:
+    def __init__(self, candidate_max=10, it=10):
+        self.it = it
+        self.candidate_list = range(2, candidate_max)  # candidate_max must be larger than 2
+
+    def fit(self, train_data):
+        def generate_uniform_points(data):
+            """
+            :return: points sampled uniformly from the bounding box (rectangle or hyper-cube) of the data
+            """
+            mins = np.argmin(data, axis=0)
+            maxs = np.argmax(data, axis=0)
+            reference_data_set = np.zeros((data.shape[0], data.shape[1]))
+            for i in range(data.shape[0]):
+                for j in range(data.shape[1]):
+                    reference_data_set[i][j] = np.random.uniform(data[mins[j]][j], data[maxs[j]][j])
+            return reference_data_set
+
+        def dispersion(data, k):
+            """
+            :return: log D_k, the log within-cluster sum of squared distances of a k-cluster K-means fit
+            """
+            model = KMeans(k).fit(data)
+            distances_from_mean = np.zeros(k)  # accumulate within-cluster squared distances
+            for i in range(k):
+                for idx, label in enumerate(model.predict_cluster):
+                    if i == label:
+                        distances_from_mean[i] += np.sum((data[idx] - model.centers[i]) ** 2)
+            return np.log(np.sum(distances_from_mean))
+
+        def gap_statistic(data, nthcluster):
+            actual_dispersion = dispersion(data, nthcluster)
+            # ref_dispersion: mean dispersion over self.it uniformly sampled reference data sets
+            ref_dispersion = np.mean([dispersion(generate_uniform_points(data), nthcluster) for _ in range(self.it)])
+            return actual_dispersion, ref_dispersion
+
+        # the following code block computes the optimal K
+        dispersion_values = np.zeros((len(self.candidate_list), 2))
+        for i, cluster in enumerate(self.candidate_list):
+            dispersion_values[i] = gap_statistic(train_data, cluster)
+        gaps = dispersion_values[:, 1] - dispersion_values[:, 0]  # gap = reference - actual
+        n_clusters = np.argmax(gaps) + 2  # plus 2 because K starts from 2
+
+        # ordinary fit and predict given n_clusters
+        self.model = KMeans(n_clusters)
+        return self.model.fit(train_data)
+
+    def predict(self, test_data):
+        return self.model.predict(test_data)
\ No newline at end of file