diff --git a/assignment-3/submission/18307130074/img/1.png b/assignment-3/submission/18307130074/img/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..28940aa85155f930f4ae0605860d55e5fbc8fc5e
Binary files /dev/null and b/assignment-3/submission/18307130074/img/1.png differ
diff --git a/assignment-3/submission/18307130074/img/10.png b/assignment-3/submission/18307130074/img/10.png
new file mode 100644
index 0000000000000000000000000000000000000000..c3f414598d56329fe0e7d2b534261ab6257ffd57
Binary files /dev/null and b/assignment-3/submission/18307130074/img/10.png differ
diff --git a/assignment-3/submission/18307130074/img/11.png b/assignment-3/submission/18307130074/img/11.png
new file mode 100644
index 0000000000000000000000000000000000000000..c18b6d5424973985f364a38aff71ac75a173b7aa
Binary files /dev/null and b/assignment-3/submission/18307130074/img/11.png differ
diff --git a/assignment-3/submission/18307130074/img/12.png b/assignment-3/submission/18307130074/img/12.png
new file mode 100644
index 0000000000000000000000000000000000000000..4d2b7815271f67c011d4e096b43a4f74d5ebf6f8
Binary files /dev/null and b/assignment-3/submission/18307130074/img/12.png differ
diff --git a/assignment-3/submission/18307130074/img/2.png b/assignment-3/submission/18307130074/img/2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b5c933facbbefa9dfa3a590d383dad5ac0ef8fa5
Binary files /dev/null and b/assignment-3/submission/18307130074/img/2.png differ
diff --git a/assignment-3/submission/18307130074/img/3.png b/assignment-3/submission/18307130074/img/3.png
new file mode 100644
index 0000000000000000000000000000000000000000..3a63d6e8a75ee1b8cb04af13b6526fe58c785701
Binary files /dev/null and b/assignment-3/submission/18307130074/img/3.png differ
diff --git a/assignment-3/submission/18307130074/img/4.png b/assignment-3/submission/18307130074/img/4.png
new file mode 100644
index 0000000000000000000000000000000000000000..42cc6f1fdf472568529144518bd324ac29255f45
Binary files /dev/null and b/assignment-3/submission/18307130074/img/4.png differ
diff --git a/assignment-3/submission/18307130074/img/5.png b/assignment-3/submission/18307130074/img/5.png
new file mode 100644
index 0000000000000000000000000000000000000000..51bacd94e84a6daae0267ce0f70345a6f25b1687
Binary files /dev/null and b/assignment-3/submission/18307130074/img/5.png differ
diff --git a/assignment-3/submission/18307130074/img/6.png b/assignment-3/submission/18307130074/img/6.png
new file mode 100644
index 0000000000000000000000000000000000000000..98fb6af1c032bc531d1e05619c90d0739df9daca
Binary files /dev/null and b/assignment-3/submission/18307130074/img/6.png differ
diff --git a/assignment-3/submission/18307130074/img/7.png b/assignment-3/submission/18307130074/img/7.png
new file mode 100644
index 0000000000000000000000000000000000000000..12a6d231038826c35f25424bb7d989946f94a904
Binary files /dev/null and b/assignment-3/submission/18307130074/img/7.png differ
diff --git a/assignment-3/submission/18307130074/img/8.png b/assignment-3/submission/18307130074/img/8.png
new file mode 100644
index 0000000000000000000000000000000000000000..b2b8f76b2528006b6008901d26c02758d2739193
Binary files /dev/null and b/assignment-3/submission/18307130074/img/8.png differ
diff --git a/assignment-3/submission/18307130074/img/9.png b/assignment-3/submission/18307130074/img/9.png
new file mode 100644
index 0000000000000000000000000000000000000000..9347afb312151f4c85e25f8c61c5c69f0719b9da
Binary files /dev/null and b/assignment-3/submission/18307130074/img/9.png differ
diff --git a/assignment-3/submission/18307130074/readme.md b/assignment-3/submission/18307130074/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..d6f4a1ba0cf7c91c0ed254b9047b692b2bd6a2a0
--- /dev/null
+++ b/assignment-3/submission/18307130074/readme.md
@@ -0,0 +1,162 @@
+# Assignment 3: K-Means & GMM
+
+> Name: Jiang Botian
+>
+> Student ID: 18307130074
+
+## 1. K-Means
+
+### 1.1 Implementation
+
+Following the hint in readme.md, I consulted the scikit-learn documentation, which describes the algorithm as follows. K-means is often called Lloyd's algorithm. In basic terms it has three steps. The first step chooses the initial centroids; the most basic method is to pick **K** samples from the dataset **X**. After initialization, K-means loops between two other steps. The first assigns each sample to its nearest centroid; the second creates new centroids by taking the mean of all samples assigned to each previous centroid. The difference between the old and new centroids is computed, and the algorithm repeats these two steps until that value falls below a threshold; in other words, until the centroids stop moving significantly. K-means is equivalent to the expectation-maximization algorithm with a small, all-equal diagonal covariance matrix.
+
+**Choosing the initial centroids**
+
+Given enough time, K-means always converges, though possibly to a local minimum, and this depends heavily on how the centroids are initialized; the computation is therefore usually run several times with different initializations. One way to mitigate the problem is the k-means++ initialization scheme, which places the initial centroids as far from one another as possible and gives better results than random initialization. This assignment implements both random initialization and KMeans++-style initialization.
+
+**Distance metric**
+
+Euclidean distance is used to measure the distance between two data points.
+
+### 1.2 Experiments
+
+Data is generated following the generators in tester_demo.py.
+
+**A relatively spread-out dataset like data_1 in tester_demo.py**
+
+The result below took 21 iterations in total.
+
+![](./img/1.png)
+
+**A denser two-dimensional dataset**
+
+The result below took 45 iterations in total.
+
+![](./img/2.png)
+
+**Repeating both experiments with KMeans++ centroid initialization to observe the change in iteration count**
+
+On the spread-out data, 16 iterations:
+
+![](./img/10.png)
+
+On the dense data, 47 iterations:
+
+![](./img/11.png)
+
+## 2. GMM
+
+### 2.1 Implementation
+
+A GMM assumes the data is generated from K Gaussian distributions, each assigned a weight; whenever a data point is generated, one distribution is chosen at random in proportion to the weights and the point is drawn from it. The probability density is
+$$
+P(x|\theta) = \sum_{k=1}^{K} p_k \, N(x|\theta_k)
+$$
+GMM clustering again relies on the EM method, which alternates two steps until the likelihood converges:
+
+ 1. E: compute the expected responsibilities
+ 2. M: re-estimate the parameters by maximum likelihood
+
+The per-round parameter updates follow from the standard derivation:
+$$
+p_k^{(t+1)} = \frac{1}{N} \sum_{i=1}^N P(z_i=k|x_i,\theta^{(t)}) = \frac{1}{N}\sum_{i=1}^N r_{ik}\\
+\mu_k = \frac{1}{s_k}\sum_{i=1}^{N}r_{ik}x_i\\
+\Sigma_k = \frac{1}{s_k}\sum_{i=1}^N r_{ik}(x_i-\mu_k)^T(x_i-\mu_k)
+$$
+where $r_{ik} = P(z_i=k|x_i,\theta^{(t)})$ is the posterior responsibility and $s_k=\sum_{i=1}^N r_{ik}$. The iteration cap is set manually to 30, and each round aims for
+$$
+\theta^{(t+1)} = \arg\max_{\theta} E_{z|x,\theta^{(t)}}[\log P(x,z|\theta)]
+$$
+
+### 2.2 Experiments
+
+Data is generated following the generators in tester_demo.py.
+
+**A relatively spread-out dataset like data_1 in tester_demo.py**
+
+![](./img/3.png)
+
+**The same data with the iteration cap raised to 50**
+
+![](./img/4.png)
+
+**A denser two-dimensional dataset**
+
+![](./img/5.png)
+
+**The same data with the iteration cap raised to 50**
+
+![](./img/6.png)
+
+The difference between 30 and 50 iterations is not noticeable, presumably because beyond roughly 30 iterations each round changes the parameters only minimally; so the cap was lowered to 12.
+
+**Dense data with a cap of 12**
+
+![](./img/7.png)
+
+**Sparse data with a cap of 12**
+
+![](./img/12.png)
+
+## 3. Automatically choosing the number of clusters
+
+### 3.1 Implementation
+
+The scikit-learn documentation describes how to choose the number of clusters: an efficient approach uses the BIC (Bayesian information criterion) to select the number of components of a Gaussian mixture. In theory it recovers the correct number only in the asymptotic regime (i.e. when plenty of data is available and the data really is i.i.d. samples from a mixture of Gaussians). (Note: a variational Bayesian Gaussian mixture avoids having to choose the component count at all.) BIC is therefore the basis of the automatic selection here. While reading up on this I also found the AIC criterion, and in this assignment I score models in a way analogous to the F1 score, using the harmonic mean 2 * AIC * BIC / (AIC + BIC) as the model score, then sweep K to find the best value:
+$$
+AIC = 2k - 2\ln(L)\\
+BIC = k\ln(n) - 2\ln(L)\\
+Score = \frac{2 \cdot AIC \cdot BIC}{AIC + BIC}
+$$
+
+### 3.2 Experiments
+
+| Score | K    |
+| ----- | ---- |
+| 39093 | 2    |
+| 37947 | 3    |
+| 37824 | 4    |
+| 38504 | 5    |
+| 38378 | 6    |
+| 38955 | 7    |
+| 39319 | 8    |
+| 39010 | 9    |
+| 38447 | 10   |
+| 39277 | 11   |
+
+![](./img/8.png)
+
+**Initializing the centroids with KMeans++ to observe any effect**
+
+| Score | K    |
+| ----- | ---- |
+| 39177 | 2    |
+| 37903 | 3    |
+| 37752 | 4    |
+| 37471 | 5    |
+| 37990 | 6    |
+| 38210 | 7    |
+| 38428 | 8    |
+| 38345 | 9    |
+| 38561 | 10   |
+| 38796 | 11   |
+
+![](./img/9.png)
+
+By eye, using KMeans++ seems to make the Score slightly lower than not using it, i.e. performance is slightly better; but the margin is so small that it may simply be noise in the data.
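+
+For reference, the scoring rule above reduces to a few lines of code. The following is a minimal sketch; `log_likelihood` is a hypothetical stand-in for the clustering log-likelihood $\ln L$ that `source.py` computes inside its `CA`/`CB` helpers:
+
+```python
+import numpy as np
+
+def blended_score(k, n, ln_L):
+    """Harmonic-mean blend of AIC and BIC; lower is better."""
+    aic = 2 * k - 2 * ln_L              # AIC = 2k - 2 ln(L)
+    bic = k * np.log(n) - 2 * ln_L      # BIC = k ln(n) - 2 ln(L)
+    return 2 * aic * bic / (aic + bic)  # Score = 2 * AIC * BIC / (AIC + BIC)
+
+# Hypothetical sweep over candidate cluster counts:
+# best_k = min(range(2, 12), key=lambda k: blended_score(k, n, log_likelihood(k)))
+```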
+## 4. Conclusions
+
+K-Means can in some sense be seen as a special case of GMM, but it is clearly much more efficient and simpler in principle. GMM is better suited to tasks that need probabilities rather than hard assignments; when extreme clustering precision is not required, K-Means is a very good choice.
\ No newline at end of file
diff --git a/assignment-3/submission/18307130074/source.py b/assignment-3/submission/18307130074/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..efd87a866982d2192b219a0e7a58b5b7a746493f
--- /dev/null
+++ b/assignment-3/submission/18307130074/source.py
@@ -0,0 +1,423 @@
+import numpy as np
+import matplotlib.pyplot as plt
+plt.style.use('seaborn')
+
+
+# Euclidean distance between two points
+def Euclidistance(pointa, pointb):
+    return np.sqrt(np.sum(np.power(pointa - pointb, 2)))
+
+
+# Pick K random samples from the dataset as the initial centroids
+def CentroidGenerator(K, data):
+    t_data = list(data)
+    np.random.shuffle(t_data)
+    centroids = t_data[:K]
+    return np.mat(centroids)
+
+
+# Initialize centroids with a KMeans++-style (farthest-point) scheme
+def Centroid_Generator(K, data):
+
+    centroids = []
+    centroids.append(data[np.random.randint(data.shape[0]), :])  # pick one random point as the first centroid
+
+    # iterate K-1 more times
+    for cnt in range(K-1):
+
+        dis = []  # distance of each sample to its nearest centroid
+
+        for i in range(data.shape[0]):
+
+            point = data[i, :]
+            minn = float('inf')
+
+            for j in range(len(centroids)):  # scan all centroids for the nearest one
+
+                mid = Euclidistance(point, centroids[j])
+                minn = min(minn, mid)
+
+            dis.append(minn)
+
+        dis = np.array(dis)
+        next_centroid = data[np.argmax(dis), :]  # the point farthest from all current centroids
+        centroids.append(next_centroid)
+
+    centroids = np.mat(centroids)
+    return centroids
+
+
+class KMeans:
+    """
+    1. Pick K random samples from the dataset as centroids
+    2. Assign each sample to its nearest centroid
+    3. Create new centroids
+    4. Repeat steps 2 and 3 until assignments stop changing
+    """
+
+    def __init__(self, n_clusters):
+        """
+        K: number of clusters
+        num: dataset size
+        c: iteration counter
+        centroids: the current K centroids
+        """
+
+        self.K = n_clusters
+        self.num = 0
+        self.c = 0
+        self.centroids = None
+
+    def fit(self, train_data):
+        self.num = train_data.shape[0]
+
+        # Info matrix storing, per data point, its cluster and its distance to the centroid
+        self.InfoMatrix = np.mat(np.zeros((self.num, 2)))
+
+        # self.centroids = CentroidGenerator(self.K, train_data)
+        self.centroids = Centroid_Generator(self.K, train_data)
+
+        # flag indicating whether clustering should continue
+        continueC = True
+
+        while continueC:
+
+            continueC = False
+            self.c += 1
+
+            # Assign each sample to its nearest centroid
+            for i in range(self.num):
+
+                minn = float('inf')  # minimum distance from this sample to any centroid
+                clusterNo = -1       # cluster index this sample belongs to
+
+                # find the nearest centroid
+                for j in range(self.K):
+                    dist = Euclidistance(train_data[i], self.centroids[j])
+                    if dist < minn:
+                        minn = dist
+                        clusterNo = j
+                # if any point changed cluster, clustering must continue
+                if self.InfoMatrix[i, 0] != clusterNo:
+                    continueC = True
+                # update the info matrix
+                self.InfoMatrix[i, :] = clusterNo, minn
+
+            # Create the new centroids
+            for i in range(self.K):
+                MidMatrix = train_data[np.nonzero(self.InfoMatrix[:, 0].A == i)[0]]
+                self.centroids[i, :] = np.mean(MidMatrix, axis=0)
+
+        self.InfoMatrix = np.array(self.InfoMatrix)
+
+    def predict(self, test_data):
+
+        size = test_data.shape[0]  # size of the test set
+        ans = np.zeros((size, 1))  # answer matrix storing the cluster of each test point
+        for i in range(size):
+
+            minn = float('inf')  # minimum distance from this sample to any centroid
+            clusterNo = -1       # cluster index this sample belongs to
+
+            # find the nearest centroid
+            for j in range(self.K):
+                dist = Euclidistance(test_data[i], self.centroids[j])
+                if dist < minn:
+                    minn = dist
+                    clusterNo = j
+
+            ans[i] = clusterNo
+
+        ans = ans.squeeze()
+        return ans
+
+
+class GaussianMixture:
+
+    def __init__(self, n_clusters):
+        """
+        K: number of clusters
+        p: mixing weight of each component
+        means: component means
+        cov: component covariances
+        centroids: defined only so the plotting code is uniform; unused by the mixture model itself
+        times: iteration counter
+        maxtimes: iteration cap
+        """
+        self.K = n_clusters
+        self.centroids = None
+        self.p = None
+        self.means = None
+        self.cov = []
+        self.times = 0
+        self.maxtimes = 30
+
+    def fit(self, train_data):
+
+        num, dimension = train_data.shape
+        r = np.zeros((num, self.K))  # responsibility matrix
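+        # r[i][k] will hold P(z_i = k | x_i, theta): the posterior probability
+        # that sample i was generated by component k. The E step fills r; the
+        # M step re-estimates p, means and cov from it.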
+        # Initialize the mixing weights randomly and normalize them
+        self.p = np.random.rand(self.K)
+        self.p /= np.sum(self.p)
+
+        # Initialize the means and covariances from a K-Means run
+        tpmodel = KMeans(self.K)
+        tpmodel.fit(train_data)
+        res = tpmodel.predict(train_data)
+
+        self.means = tpmodel.centroids  # the centroids serve as initial means
+
+        for i in range(self.K):
+
+            mid = train_data[np.where(res == i)] - self.means[i]
+            tri = np.eye(self.means[0].shape[1])
+            covt = np.array(np.power(np.sum(np.multiply(mid, mid), axis=0), 0.5))
+            for j in range(len(covt[0])):
+                tri[j][j] = covt[0][j]
+            self.cov.append(tri)  # diagonal initial covariance from the per-dimension spread
+
+        def Expectation(x, means, cov):
+            # Gaussian density N(x | means, cov), with a small ridge added to keep cov invertible
+            size = np.shape(cov)[0]
+            cov += np.eye(size) * 0.001
+            det = np.linalg.det(cov)
+            inv = np.linalg.inv(cov)
+            X = (x - means).reshape((1, size))
+            return 1.0 / np.power(np.power(2 * np.pi, size) * np.abs(det), 0.5) * np.exp(-0.5 * X.dot(inv).dot(X.T))[0][0]
+
+        while self.times < self.maxtimes:
+
+            self.times += 1
+
+            # E step: compute the responsibilities
+            for i in range(num):
+
+                mid = [self.p[j] * Expectation(train_data[i], self.means[j], self.cov[j]) for j in range(self.K)]
+                total = np.sum(np.array(mid))
+                if total == 0:  # all densities underflowed; leave this row at zero
+                    total = self.K
+                r[i] = (mid / total).reshape(r[i].shape)
+
+            # M step: update the parameters
+            for i in range(self.K):
+                s = np.sum([r[j][i] for j in range(num)])
+                self.p[i] = 1.0 * s / num
+                self.means[i] = 1.0 / s * (np.sum([r[j][i] * train_data[j] for j in range(num)], axis=0))
+                X = train_data - self.means[i]
+                self.cov[i] = 1.0 / s * (np.sum([r[j][i] * X[j].reshape((dimension, 1)).dot(X[j].reshape((1, dimension))) for j in range(num)], axis=0))
+
+    def predict(self, test_data):
+
+        size = test_data.shape[0]
+        r = np.zeros((size, self.K))  # responsibility matrix
+
+        def Expectation(x, means, cov):
+
+            size = np.shape(cov)[0]
+            cov += np.eye(size) * 0.0001
+            det = np.linalg.det(cov)
+            inv = np.linalg.inv(cov)
+            X = (x - means).reshape((1, size))
+            return 1.0 / np.power(np.power(2 * np.pi, size) * np.abs(det), 0.5) * np.exp(-0.5 * X.dot(inv).dot(X.T))[0][0]
+
+        for i in range(size):
+            mid = [self.p[j] * Expectation(test_data[i], self.means[j], self.cov[j]) for j in range(self.K)]
+            total = np.sum(np.array(mid))
+            if total == 0:  # same underflow guard as in fit
+                total = self.K
+            r[i] = (mid / total).reshape(r[i].shape)
+        return np.argmax(r, axis=1)
+
+
+def CA(model, data):
+    """
+    AIC = 2 * k - 2 * ln(L)
+    """
+    k = model.K
+    labels = np.array(model.InfoMatrix[:, 0].reshape(1, -1))[0].astype("int")
+    centroid = model.centroids
+    n, dimension = data.shape
+    S = sum([Euclidistance(data[np.where(labels == i)], centroid[i]) ** 2 for i in range(k)])  # sum of squared distances to the centroids
+    var = S * 1.0 / (n - k) / dimension
+    count = np.bincount(labels)  # number of points in each cluster
+    LN = np.sum([count[i] * (np.log(count[i]) - np.log(n)) - (count[i] * dimension / 2) * (np.log(2 * np.pi * var) + 1) + (dimension / 2) for i in range(k)])
+    AIC = 2 * k - 2 * LN
+    return AIC
+
+
+def CB(model, data):
+    """
+    BIC = k * ln(n) - 2 * ln(L)
+    """
+    k = model.K
+    labels = np.array(model.InfoMatrix[:, 0].reshape(1, -1))[0].astype("int")
+    centroid = model.centroids
+    n, dimension = data.shape
+    S = sum([Euclidistance(data[np.where(labels == i)], centroid[i]) ** 2 for i in range(k)])  # sum of squared distances to the centroids
+    var = S * 1.0 / (n - k) / dimension
+    count = np.bincount(labels)  # number of points in each cluster
+    LN = np.sum([count[i] * (np.log(count[i]) - np.log(n)) - (count[i] * dimension / 2) * (np.log(2 * np.pi * var) + 1) + (dimension / 2) for i in range(k)])
+    BIC = k * np.log(n) - 2 * LN
+    return BIC
+
+
+class ClusteringAlgorithm:
+
+    def __init__(self):
+        self.Kmax = 12
+        self.score = float('inf')
+        self.centroids = None
+        self.K = 0
+
+    def fit(self, train_data):
+        for i in range(2, self.Kmax):
+            model = KMeans(i)
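+            # Fit K-Means for each candidate K; the K with the lowest blended
+            # AIC/BIC score (computed below) is kept.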
+            model.fit(train_data)
+            AIC = CA(model, train_data)
+            BIC = CB(model, train_data)
+            Score = 2 * AIC * BIC / (AIC + BIC)
+            print(Score)
+            if Score < self.score:
+                self.K = i
+                self.score = Score
+
+    def predict(self, test_data):
+        # refit on the test data with the selected K
+        model = KMeans(self.K)
+        model.fit(test_data)
+        self.centroids = model.centroids
+        return model.predict(test_data)
+
+
+# Shuffle several datasets together; returns the data and its source labels
+def shuffle(*datas):
+    data = np.concatenate(datas)
+    label = np.concatenate([
+        np.ones((d.shape[0],), dtype=int)*i
+        for (i, d) in enumerate(datas)
+    ])
+    N = data.shape[0]
+    idx = np.arange(N)
+    np.random.shuffle(idx)
+    data = data[idx]
+    label = label[idx]
+    return data, label
+
+
+# Plot the cluster centroids
+def centroid_visualize(centroids):
+    midlist = np.transpose(centroids).tolist()
+    plt.scatter(midlist[0], midlist[1], marker="*", color='yellow', s=100)
+
+
+# Plot the clusters and save the figure
+def cluster_visualize(clusters, centroids, cnt):
+    K = len(clusters)
+    color = plt.cm.get_cmap("plasma", K+1)
+    for i in range(K):
+        midlist = np.transpose(clusters[i]).tolist()
+        plt.scatter(midlist[0], midlist[1], color=color(i), s=10)
+    # centroid_visualize(centroids)
+    plt.savefig('./img/%d.png' % cnt, dpi=400)
+    plt.show()
+
+
+# Data generators
+def data_1():
+    mean = (1, 2)
+    cov = np.array([[73, 0], [0, 22]])
+    x = np.random.multivariate_normal(mean, cov, (400,))
+
+    mean = (16, -5)
+    cov = np.array([[21.2, 0], [0, 32.1]])
+    y = np.random.multivariate_normal(mean, cov, (400,))
+
+    mean = (10, 22)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    mean = (10, -10)
+    cov = np.array([[10, 0], [0, 10]])
+    w = np.random.multivariate_normal(mean, cov, (1000,))
+
+    data, _ = shuffle(x, y, z, w)
+    return data, 4
+
+
+def data_2():
+    mean = (1, 2)
+    cov = np.array([[73, 0], [0, 22]])
+    x = np.random.multivariate_normal(mean, cov, (400,))
+
+    mean = (2, -1)
+    cov = np.array([[21.2, 0], [0, 32.1]])
+    y = np.random.multivariate_normal(mean, cov, (400,))
+
+    mean = (-1, 2)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    mean = (0, 0)
+    cov = np.array([[10, 0], [0, 10]])
+    w = np.random.multivariate_normal(mean, cov, (1000,))
+
+    data, _ = shuffle(x, y, z, w)
+    return data, 4
+
+
+def data_3():
+    mean = (0, 0, 0)
+    cov = np.array([[10, 0, 0], [0, 10, 0], [0, 0, 10]])
+    x = np.random.multivariate_normal(mean, cov, (500,))
+
+    mean = (0, 8, 12)
+    cov = np.array([[5, 0, 0], [0, 5, 0], [0, 0, 5]])
+    y = np.random.multivariate_normal(mean, cov, (500,))
+
+    mean = (0, 20, 1)
+    cov = np.array([[7, 0, 0], [0, 7, 0], [0, 0, 7]])
+    z = np.random.multivariate_normal(mean, cov, (500,))
+
+    data, _ = shuffle(x, y, z)
+    return data, 3
+
+
+def data_4():
+    mean = (0, 0, 0)
+    cov = np.array([[10, 0, 0], [0, 10, 0], [0, 0, 10]])
+    x = np.random.multivariate_normal(mean, cov, (500,))
+
+    mean = (1, 1, 1)
+    cov = np.array([[5, 0, 0], [0, 5, 0], [0, 0, 5]])
+    y = np.random.multivariate_normal(mean, cov, (500,))
+
+    mean = (-2, 5, 1)
+    cov = np.array([[7, 0, 0], [0, 7, 0], [0, 0, 7]])
+    z = np.random.multivariate_normal(mean, cov, (500,))
+
+    data, _ = shuffle(x, y, z)
+    return data, 3
+
+
+# Fit a model with a known cluster count on generated data and visualize it
+def Analysis(data_function, algorithm_class):
+    data, n_clusters = data_function()
+    model = algorithm_class(n_clusters)
+    model.fit(data)
+    res = model.predict(data)
+    clusters = [data[np.where(res == i)] for i in range(model.K)]
+    cluster_visualize(clusters, model.centroids, 12)
+
+
+# Fit the automatic-K algorithm on generated data and visualize it
+def Analysis_algorithm(data_function, algorithm_class):
+    data, n_clusters = data_function()
+    model = algorithm_class()
+    model.fit(data)
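+    # model.fit has swept the candidate cluster counts and stored the
+    # best-scoring value in model.K; predict refits K-Means with that K.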
+    res = model.predict(data)
+    print(model.K)
+    clusters = [data[np.where(res == i)] for i in range(model.K)]
+    cluster_visualize(clusters, model.centroids, 9)
+
+
+if __name__ == "__main__":
+    Analysis(data_1, GaussianMixture)
diff --git a/assignment-3/submission/18307130074/tester_demo.py b/assignment-3/submission/18307130074/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..19ec0e8091691d4aaaa6b53dbb695fde9e826d89
--- /dev/null
+++ b/assignment-3/submission/18307130074/tester_demo.py
@@ -0,0 +1,117 @@
+import numpy as np
+import sys
+
+from source import KMeans, GaussianMixture
+
+
+def shuffle(*datas):
+    data = np.concatenate(datas)
+    label = np.concatenate([
+        np.ones((d.shape[0],), dtype=int)*i
+        for (i, d) in enumerate(datas)
+    ])
+    N = data.shape[0]
+    idx = np.arange(N)
+    np.random.shuffle(idx)
+    data = data[idx]
+    label = label[idx]
+    return data, label
+
+
+def data_1():
+    mean = (1, 2)
+    cov = np.array([[73, 0], [0, 22]])
+    x = np.random.multivariate_normal(mean, cov, (800,))
+
+    mean = (16, -5)
+    cov = np.array([[21.2, 0], [0, 32.1]])
+    y = np.random.multivariate_normal(mean, cov, (200,))
+
+    mean = (10, 22)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    data, _ = shuffle(x, y, z)
+    return (data, data), 3
+
+
+def data_2():
+    train_data = np.array([
+        [23, 12, 173, 2134],
+        [99, -12, -126, -31],
+        [55, -145, -123, -342],
+    ])
+    return (train_data, train_data), 2
+
+
+def data_3():
+    train_data = np.array([
+        [23],
+        [-2999],
+        [-2955],
+    ])
+    return (train_data, train_data), 2
+
+
+def test_with_n_clusters(data_function, algorithm_class):
+    (train_data, test_data), n_clusters = data_function()
+    model = algorithm_class(n_clusters)
+    model.fit(train_data)
+    res = model.predict(test_data)
+    assert len(
+        res.shape) == 1 and res.shape[0] == test_data.shape[0], "shape of result is wrong"
+    return res
+
+
+def testcase_1_1():
+    test_with_n_clusters(data_1, KMeans)
+    return True
+
+
+def testcase_1_2():
+    res = test_with_n_clusters(data_2, KMeans)
+    return res[0] != res[1] and res[1] == res[2]
+
+
+def testcase_2_1():
+    test_with_n_clusters(data_1, GaussianMixture)
+    return True
+
+
+def testcase_2_2():
+    res = test_with_n_clusters(data_3, GaussianMixture)
+    return res[0] != res[1] and res[1] == res[2]
+
+
+def test_all(err_report=False):
+    testcases = [
+        ["KMeans-1", testcase_1_1, 4],
+        ["KMeans-2", testcase_1_2, 4],
+        # ["KMeans-3", testcase_1_3, 4],
+        # ["KMeans-4", testcase_1_4, 4],
+        # ["KMeans-5", testcase_1_5, 4],
+        ["GMM-1", testcase_2_1, 4],
+        ["GMM-2", testcase_2_2, 4],
+        # ["GMM-3", testcase_2_3, 4],
+        # ["GMM-4", testcase_2_4, 4],
+        # ["GMM-5", testcase_2_5, 4],
+    ]
+    sum_score = sum([case[2] for case in testcases])
+    score = 0
+    for case in testcases:
+        try:
+            res = case[2] if case[1]() else 0
+        except Exception as e:
+            if err_report:
+                print("Error [{}] occurs in {}".format(str(e), case[0]))
+            res = 0
+        score += res
+        print("+ {:14} {}/{}".format(case[0], res, case[2]))
+    print("{:16} {}/{}".format("FINAL SCORE", score, sum_score))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "--report":
+        test_all(True)
+    else:
+        test_all()