diff --git a/assignment-3/submission/17307130331/README.md b/assignment-3/submission/17307130331/README.md new file mode 100644 index 0000000000000000000000000000000000000000..346d3af489c96fe167b2c767a91caff1197e158a --- /dev/null +++ b/assignment-3/submission/17307130331/README.md @@ -0,0 +1,62 @@ +# 实验报告 + +**17307130331 陈疏桐** + +本次实验实现了Kmeans聚类和GaussianMixture聚类模型,并生成数据完成了基本实验。模型代码见`source.py`。 + +## 基本实验 + +### 数据集1 + +随机产生二元高斯分布数据,数据集的详细情况如下: + +``` +CLUSTER-1: + mean = (0,0) + variance = ([[124,0],[0,225]]) +CLUSTER-2: + mean = (-35, 40) + variance = ([[32,0],[0,33]]) +CLUSTER-3: + mean = ((30, 50)) + variance = ([[44, 0], [0, 20]]) +``` + + +KMeans和GaussianMixture聚类结果分别如下: +![km1](img/p1.png) +![gs1](img/p2.png) + +图中不同颜色代表数据点的不同标签;红色点是簇的中心。由图中可见,在三个簇交汇的边缘,KMeans以距离中心的远近判断数据点所属的簇,因此在簇的边缘将数据点分到三个不同的簇,GaussianMixture则有更好的聚类效果。 + +### 数据集2 + +同样是2元高斯分布,数据集情况如下: +``` +CLUSTER-1: + mean = (0,0) + variance = ([[124,0],[0,225]]) +CLUSTER-2: + mean = (-20, -25) + variance = [[132,0],[0,33]] +CLUSTER-3: + mean = (30, 15) + variance = ([[73, 0], [0, 20]]) +CLUSTER-4: + mean = (11, -2) + variance = ([[34,0], [0, 138]]) +``` + +KMeans和GaussianMixture聚类结果分别如下: +![km1](img/p3.png) +![gs1](img/p4.png) +可见,GaussianMixture的聚类效果还是比KMeans要更好,聚类中心与数据集真实分布的均值更接近;另外,KMeans聚类的边界更加规则、清晰,GaussianMixture更适合聚类之间分布重叠程度更大的、呈椭圆状分布的数据。 + +### 数据集3 + +利用`scikit-learn`产生环状簇聚数据集。 +用KMeans 和 GaussianMixture进行聚类,结果分别如图: +![km2](img/p7.png) +![gs3](img/p8.png) + +对于这种非高斯分布、非球形或椭圆形的数据,KMeans和GaussianMixture聚类的效果差不多,且都不理想,可以用其他方式,如基于密度的聚类方法来聚类。 \ No newline at end of file diff --git a/assignment-3/submission/17307130331/img/p1.png b/assignment-3/submission/17307130331/img/p1.png new file mode 100644 index 0000000000000000000000000000000000000000..e3f8ddb046e5868f6b3b82a7f421a6fa9b1fa12d Binary files /dev/null and b/assignment-3/submission/17307130331/img/p1.png differ diff --git a/assignment-3/submission/17307130331/img/p2.png b/assignment-3/submission/17307130331/img/p2.png new file mode 100644 
index 0000000000000000000000000000000000000000..95e2926b9ef5be60d2de0e23721349c5dfd0a57d Binary files /dev/null and b/assignment-3/submission/17307130331/img/p2.png differ diff --git a/assignment-3/submission/17307130331/img/p3.png b/assignment-3/submission/17307130331/img/p3.png new file mode 100644 index 0000000000000000000000000000000000000000..8178173f9b681041311f5c853f43e44cbdc4fd6d Binary files /dev/null and b/assignment-3/submission/17307130331/img/p3.png differ diff --git a/assignment-3/submission/17307130331/img/p4.png b/assignment-3/submission/17307130331/img/p4.png new file mode 100644 index 0000000000000000000000000000000000000000..aac3b637e7c0d75304ee0f360134c91fc7b696e0 Binary files /dev/null and b/assignment-3/submission/17307130331/img/p4.png differ diff --git a/assignment-3/submission/17307130331/img/p5.png b/assignment-3/submission/17307130331/img/p5.png new file mode 100644 index 0000000000000000000000000000000000000000..cf11b621975a9f5165372ccb8bd33d4bad6c1b62 Binary files /dev/null and b/assignment-3/submission/17307130331/img/p5.png differ diff --git a/assignment-3/submission/17307130331/img/p6.png b/assignment-3/submission/17307130331/img/p6.png new file mode 100644 index 0000000000000000000000000000000000000000..d6a042d33ad9b3c2c6c542146eee165414e180a6 Binary files /dev/null and b/assignment-3/submission/17307130331/img/p6.png differ diff --git a/assignment-3/submission/17307130331/img/p7.png b/assignment-3/submission/17307130331/img/p7.png new file mode 100644 index 0000000000000000000000000000000000000000..28553042f8529afa6f8f0f3bfe5300c95e27aee5 Binary files /dev/null and b/assignment-3/submission/17307130331/img/p7.png differ diff --git a/assignment-3/submission/17307130331/img/p8.png b/assignment-3/submission/17307130331/img/p8.png new file mode 100644 index 0000000000000000000000000000000000000000..7c4f7aa08fb25490ef28f00aaa60a4d12d4fd9c3 Binary files /dev/null and b/assignment-3/submission/17307130331/img/p8.png differ diff --git 
import numpy as np


def _sq_dist_matrix(x, y, square=True):
    """Pairwise (squared) Euclidean distance matrix between rows of x and y.

    Uses the identity (x-y)^2 = x^2 + y^2 - 2xy; np.abs clamps tiny negative
    values produced by floating-point rounding.  When square is False, 1e-9 is
    added before the sqrt so the gradient-free sqrt never sees an exact zero.
    """
    xx = np.einsum('ij,ij->i', x, x)[:, np.newaxis]
    yy = np.einsum('ij,ij->i', y, y)[np.newaxis, :]
    res = np.abs(xx + yy - 2 * np.dot(x, y.T))
    return res if square else np.sqrt(res + 1e-9)


class KMeans:
    """Plain k-means clustering with random (no-replacement) initialization."""

    def __init__(self, n_clusters, max_iters=None):
        self.n_clusters = n_clusters   # number of clusters k
        self.centers = None            # (k, dim) cluster centers, set by fit()
        self.n = None                  # number of training samples, set by fit()
        self.max_iters = max_iters     # optional iteration cap (None = until stable)

    def initial_centers(self, train_data):
        """Pick n_clusters distinct rows of train_data as starting centers."""
        rng = np.random.default_rng()
        # sampling without replacement guarantees distinct initial centers
        return rng.choice(train_data, size=self.n_clusters, axis=0, replace=False)

    def dis_matrix(self, x, y, square=True):
        """Pairwise (squared) distance matrix; kept as a method for API compatibility."""
        return _sq_dist_matrix(x, y, square)

    def avg_dis(self, dots):
        """Mean pairwise distance inside one cluster (0.0 for an empty cluster)."""
        if dots.shape[0] == 0:
            return 0.0  # guard: empty cluster would otherwise divide by zero
        dis_sum = np.einsum("ij->", self.dis_matrix(dots, dots, False))
        # the matrix counts each pair twice and n self-distances (which are ~0)
        return dis_sum / (2 * dots.shape[0])

    def _iter_(self, data):
        """One assignment step: label every point with its nearest center.

        Returns (labels, clusters) where clusters[k] is the (m_k, dim) array of
        points assigned to center k (possibly empty).
        """
        cluster_labels = np.argmin(self.dis_matrix(data, self.centers), axis=1)
        clusters = [data[cluster_labels == k] for k in range(self.n_clusters)]
        return cluster_labels, clusters

    def fit(self, train_data):
        """Run Lloyd iterations until the assignment stabilizes (or max_iters).

        Returns the sum over clusters of the average intra-cluster distance,
        usable as a crude quality score.
        """
        self.n = train_data.shape[0]
        self.centers = self.initial_centers(train_data)
        cluster_labels, clusters = self._iter_(train_data)

        num_iters = 0
        while True:
            num_iters += 1
            # BUG FIX: an emptied cluster used to hit np.mean([]) -> nan and
            # poison every later distance; keep its previous center instead.
            self.centers = np.array([
                c.mean(axis=0) if c.shape[0] else self.centers[k]
                for k, c in enumerate(clusters)
            ])
            new_cluster_labels, clusters = self._iter_(train_data)
            if (new_cluster_labels == cluster_labels).all():
                break  # assignment unchanged -> converged
            if self.max_iters and num_iters == self.max_iters:
                break
            cluster_labels = new_cluster_labels
        return sum(self.avg_dis(c) for c in clusters)

    def predict(self, test_data):
        """Label each test point with the index of its nearest fitted center."""
        assert self.centers is not None
        return np.argmin(self.dis_matrix(test_data, self.centers), axis=1)


class GaussianMixture:
    """Gaussian mixture model fitted by EM with full covariance matrices."""

    def __init__(self, n_clusters, max_iters=50):
        self.n_clusters = n_clusters    # number of mixture components
        self.n = None                   # training sample count
        self.dim = None                 # feature dimension
        self.cov = None                 # (k, dim, dim) component covariances
        self.mean = None                # (k, dim) component means
        self.max_iters = max_iters      # EM iteration cap
        self.pi = [1 / self.n_clusters for _ in range(self.n_clusters)]  # mixing weights
        self.gamma = None               # (n, k) responsibilities from the last E step

    def prob_normal(self, data, mean, cov):
        """Density of a single point under N(mean, cov)."""
        tmp = (data - mean) @ np.linalg.inv(cov) @ (data - mean).T
        tmp2 = (2 * np.pi) ** (-len(data) / 2) * np.linalg.det(cov) ** (-1 / 2)
        return np.exp(tmp / (-2)) * tmp2

    def dis_matrix(self, x, y, square=True):
        """Pairwise (squared) distance matrix; kept as a method for API compatibility."""
        return _sq_dist_matrix(x, y, square)

    def avg_dis(self, dots):
        """Mean pairwise distance among the given points (used to score inits)."""
        dis_sum = np.einsum("ij->", self.dis_matrix(dots, dots, False))
        return dis_sum / (2 * dots.shape[0])

    def initial_centers(self, train_data):
        """Sample 11 candidate center sets; keep the most spread-out one.

        Maximizing the average pairwise distance of the sampled centers makes
        it likely that each component starts in a different cluster.
        """
        rng = np.random.default_rng()
        centers = rng.choice(train_data, size=self.n_clusters, axis=0, replace=False)
        dis_centers = self.avg_dis(centers)
        for _ in range(10):
            new_centers = rng.choice(train_data, size=self.n_clusters, axis=0, replace=False)
            new_dis = self.avg_dis(new_centers)
            if new_dis > dis_centers:
                dis_centers = new_dis
                centers = new_centers
        return centers

    def fit(self, train_data):
        """Fit the mixture with EM; stops on exact parameter repeat or max_iters."""
        self.n = train_data.shape[0]
        self.dim = train_data.shape[1]

        # Tiny datasets make covariance estimates singular; pad them with four
        # uniformly jittered copies so n >= 5 * dim.
        if self.n < 5 * self.dim:
            jittered = [train_data + np.random.rand(self.n, self.dim) for _ in range(4)]
            train_data = np.concatenate([train_data] + jittered, axis=0)
            self.n = train_data.shape[0]
            self.dim = train_data.shape[1]

        # Initialize means from the data and covariances as identity matrices.
        self.mean = self.initial_centers(train_data)
        self.cov = np.stack([np.eye(self.dim) for _ in range(self.n_clusters)])

        num_iters = 0
        while True:
            num_iters += 1

            # E step: responsibility gamma[i, k] of component k for point i.
            # The weighted densities are computed once per point and normalized
            # (previously the denominator was recomputed for every component,
            # costing k extra density evaluations each).
            self.gamma = np.zeros(shape=(self.n, self.n_clusters))
            for i in range(self.n):
                weighted = np.array([
                    self.pi[k] * self.prob_normal(train_data[i], self.mean[k], self.cov[k])
                    for k in range(self.n_clusters)
                ])
                self.gamma[i] = weighted / (weighted.sum() + 1e-10)  # 1e-10 avoids 0/0
            N = np.sum(self.gamma, axis=0)  # effective sample count per component

            # M step: means first, then covariances around the *new* means.
            prev_mean = self.mean
            # BUG FIX: prev_cov must be a copy — self.cov is updated in place
            # below, so the old `prev_cov = self.cov` alias always compared
            # equal and the covariance convergence test was a no-op.
            prev_cov = self.cov.copy()
            self.mean = np.zeros(shape=(self.n_clusters, self.dim))
            for k in range(self.n_clusters):
                self.mean[k] = np.average(train_data, axis=0, weights=self.gamma[:, k])

            for k in range(self.n_clusters):
                diff = train_data - self.mean[k]
                # outer[i] = diff[i] diff[i]^T, vectorized over all points
                outer = np.einsum('ni,nj->nij', diff, diff)
                self.cov[k] = np.average(outer, axis=0, weights=self.gamma[:, k])

            # Update mixing weights.
            self.pi = [N[k] / self.n for k in range(self.n_clusters)]

            if self.max_iters and num_iters == self.max_iters:
                break
            # Exact float repeat of both means and covariances -> converged.
            if (prev_mean == self.mean).all() and (prev_cov == self.cov).all():
                break

    def predict(self, test_data):
        """Assign each point to the component with the highest (unweighted) density.

        NOTE(review): this ignores the mixing weights pi, matching the original
        behavior; a MAP assignment would multiply by pi[k].
        """
        assert self.mean is not None
        assert self.cov is not None
        probs = [
            [self.prob_normal(x, self.mean[k], self.cov[k]) for k in range(self.n_clusters)]
            for x in test_data
        ]
        return np.argmax(probs, axis=1)


class ClusteringAlgorithm:
    """Placeholder for an auto-k clustering algorithm (not implemented)."""

    def __init__(self):
        pass

    def fit(self, train_data):
        pass

    def predict(self, test_data):
        pass


def shuffle(*datas):
    """Concatenate the given arrays and return (data, label) in random order.

    label[i] is the index of the source array that row i came from.
    """
    data = np.concatenate(datas)
    label = np.concatenate([
        np.ones((d.shape[0],), dtype=int) * i
        for (i, d) in enumerate(datas)
    ])
    N = data.shape[0]
    idx = np.arange(N)
    np.random.shuffle(idx)
    data = data[idx]
    label = label[idx]
    return data, label


def generate_dataset1():
    """Three well-separated 2-D Gaussian blobs; returns ((train, test), k)."""
    mean1 = (0, 0)
    cov1 = np.array([[124, 0], [0, 225]])
    d1 = np.random.multivariate_normal(mean1, cov1, (500,))

    mean2 = (-35, 40)
    cov2 = np.array([[32, 0], [0, 33]])
    d2 = np.random.multivariate_normal(mean2, cov2, (560,))

    mean3 = (30, 50)
    cov3 = np.array([[44, 0], [0, 20]])
    d3 = np.random.multivariate_normal(mean3, cov3, (300,))

    data, _ = shuffle(d1, d2, d3)
    return (data, data), 3


def generate_dataset3():
    """Concentric-circle dataset from scikit-learn; returns ((train, test), k).

    sklearn is imported lazily so the rest of this module works without it.
    """
    from sklearn import datasets
    n_samples = 1000
    # The original also built a make_moons dataset here but never used it;
    # that dead computation has been removed.
    noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)

    data, _ = shuffle(noisy_circles[0])
    return (data, data), 2


def generate_dataset2():
    """Four overlapping 2-D Gaussian blobs; returns ((train, test), k)."""
    mean1 = (0, 0)
    cov1 = np.array([[124, 0], [0, 225]])
    d1 = np.random.multivariate_normal(mean1, cov1, (500,))

    mean2 = (-20, -25)
    cov2 = np.array([[132, 0], [0, 33]])
    d2 = np.random.multivariate_normal(mean2, cov2, (560,))

    mean3 = (30, 15)
    cov3 = np.array([[73, 0], [0, 20]])
    d3 = np.random.multivariate_normal(mean3, cov3, (300,))

    mean4 = (11, -2)
    cov4 = np.array([[34, 0], [0, 138]])
    d4 = np.random.multivariate_normal(mean4, cov4, (410,))

    data, _ = shuffle(d1, d2, d3, d4)
    return (data, data), 4
import numpy as np
import sys

from source import KMeans, GaussianMixture


def shuffle(*datas):
    """Concatenate the arrays, tag each row with its source index, and shuffle."""
    data = np.concatenate(datas)
    label = np.concatenate([
        np.full((d.shape[0],), i, dtype=int)
        for (i, d) in enumerate(datas)
    ])
    order = np.arange(data.shape[0])
    np.random.shuffle(order)
    return data[order], label[order]


def data_1():
    """Three 2-D Gaussian blobs; returns ((train, test), n_clusters)."""
    mean = (1, 2)
    cov = np.array([[73, 0], [0, 22]])
    x = np.random.multivariate_normal(mean, cov, (800,))

    mean = (16, -5)
    cov = np.array([[21.2, 0], [0, 32.1]])
    y = np.random.multivariate_normal(mean, cov, (200,))

    mean = (10, 22)
    cov = np.array([[10, 5], [5, 10]])
    z = np.random.multivariate_normal(mean, cov, (1000,))

    data, _ = shuffle(x, y, z)
    return (data, data), 3


def data_2():
    """Tiny fixed 4-D dataset with one obvious outlier row."""
    train_data = np.array([
        [23, 12, 173, 2134],
        [99, -12, -126, -31],
        [55, -145, -123, -342],
    ])
    return (train_data, train_data), 2


def data_3():
    """Tiny fixed 1-D dataset with one obvious outlier row."""
    train_data = np.array([
        [23],
        [-2999],
        [-2955],
    ])
    return (train_data, train_data), 2


def test_with_n_clusters(data_fuction, algorithm_class):
    """Fit the given algorithm on a dataset and sanity-check predict()'s shape."""
    (train_data, test_data), n_clusters = data_fuction()
    algo = algorithm_class(n_clusters)
    algo.fit(train_data)
    pred = algo.predict(test_data)
    shape_ok = len(pred.shape) == 1 and pred.shape[0] == test_data.shape[0]
    assert shape_ok, "shape of result is wrong"
    return pred


def testcase_1_1():
    test_with_n_clusters(data_1, KMeans)
    return True


def testcase_1_2():
    pred = test_with_n_clusters(data_2, KMeans)
    return pred[0] != pred[1] and pred[1] == pred[2]


def testcase_2_1():
    test_with_n_clusters(data_1, GaussianMixture)
    return True


def testcase_2_2():
    pred = test_with_n_clusters(data_3, GaussianMixture)
    return pred[0] != pred[1] and pred[1] == pred[2]


def test_all(err_report=False):
    """Run every enabled test case, print per-case and total scores."""
    testcases = [
        ["KMeans-1", testcase_1_1, 4],
        ["KMeans-2", testcase_1_2, 4],
        # ["KMeans-3", testcase_1_3, 4],
        # ["KMeans-4", testcase_1_4, 4],
        # ["KMeans-5", testcase_1_5, 4],
        ["GMM-1", testcase_2_1, 4],
        ["GMM-2", testcase_2_2, 4],
        # ["GMM-3", testcase_2_3, 4],
        # ["GMM-4", testcase_2_4, 4],
        # ["GMM-5", testcase_2_5, 4],
    ]
    sum_score = sum(points for _, _, points in testcases)
    score = 0
    for name, fn, points in testcases:
        try:
            earned = points if fn() else 0
        except Exception as e:
            # A crashing test case scores zero; optionally report the error.
            if err_report:
                print("Error [{}] occurs in {}".format(str(e), name))
            earned = 0
        score += earned
        print("+ {:14} {}/{}".format(name, earned, points))
    print("{:16} {}/{}".format("FINAL SCORE", score, sum_score))


if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == "--report":
        test_all(True)
    else:
        test_all()