diff --git a/assignment-1/submission/18307130003/source.py b/assignment-1/submission/18307130003/source.py
index e6bc70f8a0c3faff84ec5b52c083e95676140fab..3676e4eed7bb10282f277697d818b8ff2103879e 100644
--- a/assignment-1/submission/18307130003/source.py
+++ b/assignment-1/submission/18307130003/source.py
@@ -149,8 +149,6 @@ class KNN:
 
         print(f'best k = {self.k}\n')
 
-        print(f'best k = {self.k}\n')
-
     def predict(self, test_data: np.ndarray) -> np.ndarray:
         '''
         Predict the label of a point using our model.
diff --git a/assignment-3/submission/18307130003/README.md b/assignment-3/submission/18307130003/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1e2e90a478b7d70794b748bd2676b38172a3004d
--- /dev/null
+++ b/assignment-3/submission/18307130003/README.md
@@ -0,0 +1,902 @@
+# Lab Report
+
+This assignment implements a K-Means model and a GMM model in pure NumPy, and uses the Gap Statistic method to estimate the number of clusters in a dataset automatically.
+
+## Table of Contents
+
+- [Lab Report](#lab-report)
+  - [Table of Contents](#table-of-contents)
+  - [The K-Means Model](#the-k-means-model)
+    - [1.1 Algorithm](#11-algorithm)
+    - [1.2 Optimizations](#12-optimizations)
+    - [1.3 Experiments](#13-experiments)
+      - [1.3.0 Generating the Datasets](#130-generating-the-datasets)
+      - [1.3.1 Experiment 1](#131-experiment-1)
+        - [1.3.1.1 Dataset Parameters](#1311-dataset-parameters)
+        - [1.3.1.2 Results](#1312-results)
+      - [1.3.2 Experiment 2](#132-experiment-2)
+        - [1.3.2.1 Dataset Parameters](#1321-dataset-parameters)
+        - [1.3.2.2 Results](#1322-results)
+      - [1.3.3 Experiment 3](#133-experiment-3)
+        - [1.3.3.1 Dataset Parameters](#1331-dataset-parameters)
+        - [1.3.3.2 Results](#1332-results)
+      - [1.3.4 Experiment 4](#134-experiment-4)
+        - [1.3.4.1 Dataset Parameters](#1341-dataset-parameters)
+        - [1.3.4.2 Results](#1342-results)
+      - [1.3.5 Experiment 5](#135-experiment-5)
+        - [1.3.5.1 Dataset Parameters](#1351-dataset-parameters)
+        - [1.3.5.2 Results](#1352-results)
+  - [The GMM Model](#the-gmm-model)
+    - [2.1 Algorithm](#21-algorithm)
+    - [2.2 Optimizations](#22-optimizations)
+    - [2.3 Experiments](#23-experiments)
+      - [2.3.0 Generating the Datasets](#230-generating-the-datasets)
+      - [2.3.1 Experiment 1](#231-experiment-1)
+        - [2.3.1.1 Dataset Parameters](#2311-dataset-parameters)
+        - [2.3.1.2 Results](#2312-results)
+      - [2.3.2 Experiment 2](#232-experiment-2)
+        - [2.3.2.1 Dataset Parameters](#2321-dataset-parameters)
+        - [2.3.2.2 Results](#2322-results)
+      - [2.3.3 Experiment 3](#233-experiment-3)
+        - [2.3.3.1 Dataset Parameters](#2331-dataset-parameters)
+        - [2.3.3.2 Results](#2332-results)
+      - [2.3.4 Experiment 4](#234-experiment-4)
+        - [2.3.4.1 Dataset Parameters](#2341-dataset-parameters)
+        - [2.3.4.2 Results](#2342-results)
+      - [2.3.5 Experiment 5](#235-experiment-5)
+        - [2.3.5.1 Dataset Parameters](#2351-dataset-parameters)
+        - [2.3.5.2 Results](#2352-results)
+  - [Automatic Selection of the Number of Clusters](#automatic-selection-of-the-number-of-clusters)
+    - [3.1 Algorithm](#31-algorithm)
+    - [3.2 Optimizations](#32-optimizations)
+    - [3.3 Experiments](#33-experiments)
+      - [3.3.0 Generating the Datasets](#330-generating-the-datasets)
+      - [3.3.1 Experiment 1](#331-experiment-1)
+        - [3.3.1.1 Dataset Parameters](#3311-dataset-parameters)
+        - [3.3.1.2 Results](#3312-results)
+      - [3.3.2 Experiment 2](#332-experiment-2)
+        - [3.3.2.1 Dataset Parameters](#3321-dataset-parameters)
+        - [3.3.2.2 Results](#3322-results)
+      - [3.3.3 Experiment 3](#333-experiment-3)
+        - [3.3.3.1 Dataset Parameters](#3331-dataset-parameters)
+        - [3.3.3.2 Results](#3332-results)
+      - [3.3.4 Experiment 4](#334-experiment-4)
+        - [3.3.4.1 Dataset Parameters](#3341-dataset-parameters)
+        - [3.3.4.2 Results](#3342-results)
+      - [3.3.5 Experiment 5](#335-experiment-5)
+        - [3.3.5.1 Dataset Parameters](#3351-dataset-parameters)
+        - [3.3.5.2 Results](#3352-results)
+  - [Running the Code](#running-the-code)
+
+## The K-Means Model
+
+### 1.1 Algorithm
+
+The idea behind the K-Means model is simple. Training proceeds as follows (a minimal sketch of the loop is given after the list):
+
+1. Given a value of $K$ (the number of clusters to partition the dataset into), randomly pick $K$ initial cluster centroids
+2. Assign each data point to its nearest centroid
+3. Update each centroid to the mean of the data points assigned to it in this round
+4. Repeat the steps above until a termination condition is met; the condition we use is that no data point's label (i.e., its assigned centroid) changes anymore (any other reasonable termination condition would also work)
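+
+As an illustration of the loop above, here is a minimal, self-contained NumPy sketch of a single K-Means training run. It is a sketch only: the helper name `kmeans_once` and the iteration cap are our own choices, and the actual implementation lives in `KMeans.fit` in `source.py`.
+
+```python
+import numpy as np
+
+def kmeans_once(data: np.ndarray, k: int, max_iter: int = 100):
+    # Sketch only: assumes `data` has shape (N, d) and that no
+    # cluster ever becomes empty during training.
+    rng = np.random.default_rng()
+    centroids = data[rng.choice(data.shape[0], size=k, replace=False)]
+    labels = np.zeros(data.shape[0], dtype=int)
+    for _ in range(max_iter):
+        # Step 2: assign each point to its nearest centroid.
+        dists = np.linalg.norm(data[:, None, :] - centroids[None, :, :], axis=2)
+        new_labels = np.argmin(dists, axis=1)
+        # Step 4: stop once no label changes.
+        if np.array_equal(new_labels, labels):
+            break
+        labels = new_labels
+        # Step 3: move each centroid to the mean of its assigned points.
+        centroids = np.array([data[labels == i].mean(axis=0) for i in range(k)])
+    return centroids, labels
+```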
+
+At prediction time, a data point is assigned to the cluster whose centroid is nearest to it.
+
+For example, partitioning a randomly generated dataset into 3 clusters with the K-Means model gives:
+
+![K-Means](./img/k-means_1_train.png)
+
+### 1.2 Optimizations
+
+One problem with K-Means is that the initial centroids are chosen at random. A poor choice can make the algorithm converge to a local optimum rather than the global one. For example, the same dataset may end up partitioned like this:
+
+![K-Means local optimum](./img/k-means_1_bad_train.png)
+
+As an optimization, we follow the approach of the scikit-learn library: on the same training data we train `n_epochs` times (configurable, default `10`) with different random seeds, and keep the best of the trained models for prediction. Our criterion for "best" is the smallest within-cluster sum of squared distances (from each data point to its assigned centroid), i.e., we pick the model that minimizes
+
+$$
+W
+= \sum\_{\mathbf{x}\_i\in \mathbf{D}}
+  \lVert \mathbf{x}\_i - \mathbf{c}(\mathbf{x}\_i) \rVert^2
+$$
+
+where $\mathbf{x}\_i$ is a data point, $\mathbf{D}$ is the dataset, and $\mathbf{c}(\mathbf{x}\_i)$ is the centroid that $\mathbf{x}\_i$ is assigned to, taken from the set of centroids $\mathbf{C}$.
+
+Experiments show that, for typical datasets, this optimization reliably drives the K-Means model to the global optimum; a sketch of the restart loop follows.
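+
+The following is a minimal sketch of this restart-and-select loop, assuming the hypothetical `kmeans_once` helper from the sketch in section 1.1; the real logic is in `KMeans.fit` in `source.py`.
+
+```python
+import numpy as np
+
+def kmeans_best_of(data: np.ndarray, k: int, n_epochs: int = 10) -> np.ndarray:
+    best_w, best_centroids = float('inf'), None
+    for _ in range(n_epochs):
+        centroids, labels = kmeans_once(data, k)
+        # Within-cluster sum of squared distances W for this run.
+        w = np.sum((data - centroids[labels]) ** 2)
+        if w < best_w:
+            best_w, best_centroids = w, centroids
+    return best_centroids
+```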
+
+### 1.3 Experiments
+
+#### 1.3.0 Generating the Datasets
+
+We generate datasets with the following functions:
+
+```python {.line-numbers}
+# tester.py
+
+class TestSuite:
+    '''
+    Multiple testing data for models.
+    '''
+
+    def __init__(self) -> None:
+        self.rng: np.random.Generator = np.random.default_rng()
+
+    def generate_normal(self, param: NormalParameters) -> np.ndarray:
+        '''
+        Generate a dataset from a Gaussian distribution with given parameters.
+
+        Args:
+            `param`: parameters used to generate a dataset
+
+        Return:
+            shape(N, d)
+        '''
+
+        size, mean, cov, scale = param
+        if len(mean) > 1:
+            return self.rng.multivariate_normal(mean, cov, size)
+        else:
+            return self.rng.normal(mean[0], scale, size)
+
+    def generate_data(self, *params: NormalParameters) -> Tuple[np.ndarray, int]:
+        '''
+        Generate a dataset for tests.
+
+        Args:
+            `params`: a tuple of parameters to generate datasets
+
+        Return:
+            `dataset`: shape(N, d)
+            `n_clusters`: the number of clusters to partition into
+        '''
+
+        dataset, _labels = self.combine(*tuple(
+            self.generate_normal(p) for p in params
+        ))
+        n_clusters: int = len(params)
+        return dataset, n_clusters
+```
+
+In the one-dimensional case, we draw a dataset from a univariate Gaussian distribution using `numpy.random.default_rng().normal` with the parameters `mean`, `scale`, `size`; in the multi-dimensional case, we draw from a multivariate Gaussian distribution using `numpy.random.default_rng().multivariate_normal` with the parameters `mean`, `cov`, `size`. Here:
+
+- `mean` is the mean of the distribution
+- `cov` is its covariance matrix (multi-dimensional case)
+- `scale` is its standard deviation (one-dimensional case)
+- `size` is the number of points generated
+
+After generating several datasets this way, we merge them into one large dataset and shuffle it; 80% is used as the training set and 20% as the test set.
+
+#### 1.3.1 Experiment 1
+
+In the first experiment, we cluster data drawn from 3 two-dimensional Gaussian distributions.
+
+##### 1.3.1.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (1, 2)
+cov = [[73, 0], [0, 22]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (16, -5)
+cov = [[21.2, 0], [0, 32.1]]
+size = 200
+```
+
+```python {.line-numbers}
+mean = (10, 22)
+cov = [[10, 5], [5, 10]]
+size = 1000
+```
+
+##### 1.3.1.2 Results
+
+Training set, partitioned into 3 clusters:
+
+![K-Means training set](./img/k-means_1_train.png)
+
+Test set, partitioned into 3 clusters:
+
+![K-Means test set](./img/k-means_1_test.png)
+
+#### 1.3.2 Experiment 2
+
+In the second experiment, we increase the number of source distributions and cluster data drawn from 5 two-dimensional Gaussian distributions.
+
+##### 1.3.2.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (1, 0)
+cov = [[73, 0], [0, 22]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (20, 15)
+cov = [[21.2, 0], [0, 32.1]]
+size = 400
+```
+
+```python {.line-numbers}
+mean = (10, -22)
+cov = [[10, 5], [5, 10]]
+size = 1000
+```
+
+```python {.line-numbers}
+mean = (-12, -6)
+cov = [[7, 3], [3, 16]]
+size = 500
+```
+
+```python {.line-numbers}
+mean = (-15, 17)
+cov = [[15, 0], [0, 12]]
+size = 600
+```
+
+##### 1.3.2.2 Results
+
+Training set, partitioned into 5 clusters:
+
+![K-Means training set](./img/k-means_2_train.png)
+
+Test set, partitioned into 5 clusters:
+
+![K-Means test set](./img/k-means_2_test.png)
+
+#### 1.3.3 Experiment 3
+
+In the third experiment, we raise the dimensionality and cluster data drawn from 3 three-dimensional Gaussian distributions.
+
+##### 1.3.3.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (-6, 3, 5)
+cov = [[73, 0, 0], [0, 50, 0], [0, 0, 22]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (12, 0, -10)
+cov = [[20, 5, 0], [5, 20, 0], [0, 0, 20]]
+size = 500
+```
+
+```python {.line-numbers}
+mean = (10, -20, 0)
+cov = [[10, 1, 3], [1, 10, 0], [3, 0, 10]]
+size = 800
+```
+
+##### 1.3.3.2 Results
+
+Training set, partitioned into 3 clusters:
+
+![K-Means training set](./img/k-means_3_train.png)
+
+Test set, partitioned into 3 clusters:
+
+![K-Means test set](./img/k-means_3_test.png)
+
+#### 1.3.4 Experiment 4
+
+In the fourth experiment, we lower the dimensionality and cluster data drawn from 3 one-dimensional Gaussian distributions.
+
+##### 1.3.4.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (-20,)
+scale = 2
+size = 100
+```
+
+```python {.line-numbers}
+mean = (0,)
+scale = 1
+size = 150
+```
+
+```python {.line-numbers}
+mean = (15,)
+scale = 2
+size = 100
+```
+
+##### 1.3.4.2 Results
+
+Training set, partitioned into 3 clusters:
+
+![K-Means training set](./img/k-means_4_train.png)
+
+Test set, partitioned into 3 clusters:
+
+![K-Means test set](./img/k-means_4_test.png)
+
+#### 1.3.5 Experiment 5
+
+In the fifth experiment, to set up a later comparison with the GMM model, we cluster data drawn from 3 flat, elongated two-dimensional Gaussian distributions.
+
+##### 1.3.5.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (0, -5)
+cov = [[73, 0], [0, 2]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (-3, 0)
+cov = [[100, 0], [0, 2]]
+size = 500
+```
+
+```python {.line-numbers}
+mean = (2, 5)
+cov = [[70, 1], [1, 3]]
+size = 500
+```
+
+##### 1.3.5.2 Results
+
+Training set, partitioned into 3 clusters:
+
+![K-Means training set](./img/k-means_5_train.png)
+
+Test set, partitioned into 3 clusters:
+
+![K-Means test set](./img/k-means_5_test.png)
+
+As the figures show, because K-Means relies on distance alone, its results on datasets like these are not ideal. This is exactly why we introduce the GMM model.
+
+## The GMM Model
+
+### 2.1 Algorithm
+
+The GMM training loop resembles that of K-Means; the difference is that a GMM no longer uses the distance to a centroid as its criterion, but instead fits the dataset with a linear combination of $K$ single Gaussian distributions. We iterate with the EM algorithm; training proceeds as follows (a sketch of the two EM steps is given after the list):
+
+1. Given a value of $K$ (the number of clusters to partition the dataset into), initialize the parameters
+
+    - `means`: pick $K$ points from the dataset at random as the initial center of each Gaussian distribution
+    - `covs`: in the multi-dimensional case, initialize each Gaussian's covariance matrix to the identity matrix
+    - `scales`: in the one-dimensional case, initialize each Gaussian's standard deviation to $1$
+    - `weights`: initialize the prior probability that any data point belongs to each cluster to $\frac{1}{K}$
+
+2. The E(xpectation) step of EM: compute, under the current parameters, the probability that each data point belongs to each cluster
+
+    - First compute each Gaussian's probability density function (PDF) under the given parameters
+
+      - In the multi-dimensional case,
+
+        $$
+        p\_i(\mathbf{x})
+        = \frac{1}{\sqrt{(2\pi)^d |\mathbf{\Sigma}\_i|}}
+          \exp(
+            -\frac{1}{2}
+            (\mathbf{x}-\mathbf{\mu}\_i)^\mathrm{T}
+            \mathbf{\Sigma}\_i^{-1}
+            (\mathbf{x}-\mathbf{\mu}\_i)
+          )
+        $$
+
+        where $i$ indexes the cluster, $d$ is the dimension of a data point, $\mathbf{\mu}$ denotes `means`, and $\mathbf{\Sigma}$ denotes `covs`
+
+      - In the one-dimensional case,
+
+        $$
+        p\_i(x)
+        = \frac{1}{\sqrt{2\pi\sigma\_i^2}}
+          \exp(-\frac{(x-\mu\_i)^2}{2\sigma\_i^2})
+        $$
+
+        where $i$ indexes the cluster, $\mu$ denotes `means`, and $\sigma$ denotes `scales`
+
+    - Then, using Bayes' Theorem, compute the probability that each data point belongs to each cluster
+
+      $$
+      f\_i(\mathbf{x})
+      = \frac{p\_i(\mathbf{x})\,\phi\_i}
+             {\sum\limits\_{j=1}^k p\_j(\mathbf{x})\,\phi\_j}
+      $$
+
+      where $i$ indexes the cluster, $\phi$ denotes `weights`, and $k$ is the value of $K$
+
+3. The M(aximization) step of EM: using the current probability matrix for the clusters, re-estimate the model parameters. Writing $\mathbf{x}\_1, \dots, \mathbf{x}\_N$ for the $N$ data points:
+
+    - `means`:
+
+      $$
+      \mathbf{\mu}\_i
+      = \frac
+        {\sum\limits\_{n=1}^N f\_i(\mathbf{x}\_n)\, \mathbf{x}\_n}
+        {\sum\limits\_{n=1}^N f\_i(\mathbf{x}\_n)}
+      $$
+
+    - `covs`: in the multi-dimensional case,
+
+      $$
+      \mathbf{\Sigma}\_i
+      = \frac
+        {\sum\limits\_{n=1}^N f\_i(\mathbf{x}\_n)\,
+         (\mathbf{x}\_n-\mathbf{\mu}\_i)
+         (\mathbf{x}\_n-\mathbf{\mu}\_i)^\mathrm{T}}
+        {\sum\limits\_{n=1}^N f\_i(\mathbf{x}\_n)}
+      $$
+
+    - `scales`: in the one-dimensional case,
+
+      $$
+      \sigma\_i
+      = \sqrt{
+          \frac
+          {\sum\limits\_{n=1}^N f\_i(x\_n)\, (x\_n-\mu\_i)^2}
+          {\sum\limits\_{n=1}^N f\_i(x\_n)}
+        }
+      $$
+
+    - `weights`:
+
+      $$
+      \phi\_i = \frac{1}{N} \sum\limits\_{n=1}^N f\_i(\mathbf{x}\_n)
+      $$
+
+4. Repeat the EM steps until a termination condition is met; we use the same termination condition as for K-Means (no data point's most likely cluster changes anymore)
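+
+To make the two EM steps concrete, here is a minimal NumPy sketch of one EM iteration for the multi-dimensional case, written directly from the formulas above. The helper names (`gaussian_pdf`, `e_step`, `m_step`) are our own, no numerical safeguards are included, and the actual implementation (with its `EPS` guards) is in `GaussianMixture` in `source.py`.
+
+```python
+import numpy as np
+
+def gaussian_pdf(x: np.ndarray, mean: np.ndarray, cov: np.ndarray) -> float:
+    # Multivariate Gaussian density p_i(x) from the formula above.
+    d = mean.shape[0]
+    diff = x - mean
+    norm = np.sqrt(((2 * np.pi) ** d) * np.linalg.det(cov))
+    return float(np.exp(-0.5 * diff @ np.linalg.solve(cov, diff)) / norm)
+
+def e_step(data, means, covs, weights):
+    # Posterior f_i(x_n): probability that point n belongs to cluster i.
+    k = means.shape[0]
+    lik = np.array([
+        [weights[i] * gaussian_pdf(x, means[i], covs[i]) for i in range(k)]
+        for x in data
+    ])                                        # shape (N, k)
+    return lik / lik.sum(axis=1, keepdims=True)
+
+def m_step(data, post):
+    # Re-estimate weights, means and covariances from the posteriors.
+    n, k = post.shape
+    totals = post.sum(axis=0)                 # denominators, shape (k,)
+    weights = totals / n
+    means = (post.T @ data) / totals[:, None]
+    covs = np.array([
+        sum(post[m, i] * np.outer(data[m] - means[i], data[m] - means[i])
+            for m in range(n)) / totals[i]
+        for i in range(k)
+    ])
+    return weights, means, covs
+```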
+
+At prediction time, each data point is assigned to the cluster with the largest probability for that point in the probability matrix.
+
+For example, partitioning a randomly generated dataset into 3 clusters with the GMM gives:
+
+![GMM](./img/gmm_1_train.png)
+
+### 2.2 Optimizations
+
+As with K-Means, the initial centers of a GMM are chosen at random, so we adopt the same optimization: train `n_epochs` times with different random seeds and keep the best trained model for prediction. The difference is that in a GMM we cannot simply use the distance to a centroid as the criterion. What we actually want is that, when each data point is assigned to its cluster, its corresponding probability in the probability matrix is as large as possible. Our criterion for "best" is therefore to maximize
+
+$$
+\sum\limits\_{i=1}^N \max\_{1\le j\le k}{\{f\_j(\mathbf{x}\_i)\}}
+$$
+
+Experiments show that this optimization, too, reliably drives the GMM to the global optimum.
+
+### 2.3 Experiments
+
+#### 2.3.0 Generating the Datasets
+
+We generate datasets in the same way as for K-Means.
+
+#### 2.3.1 Experiment 1
+
+In the first experiment, we cluster data drawn from 3 two-dimensional Gaussian distributions.
+
+##### 2.3.1.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (1, 2)
+cov = [[73, 0], [0, 22]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (16, -5)
+cov = [[21.2, 0], [0, 32.1]]
+size = 200
+```
+
+```python {.line-numbers}
+mean = (10, 22)
+cov = [[10, 5], [5, 10]]
+size = 1000
+```
+
+##### 2.3.1.2 Results
+
+Training set, partitioned into 3 clusters:
+
+![GMM training set](./img/gmm_1_train.png)
+
+Test set, partitioned into 3 clusters:
+
+![GMM test set](./img/gmm_1_test.png)
+
+#### 2.3.2 Experiment 2
+
+In the second experiment, we cluster data drawn from 5 two-dimensional Gaussian distributions.
+
+##### 2.3.2.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (1, 0)
+cov = [[73, 0], [0, 22]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (20, 15)
+cov = [[21.2, 0], [0, 32.1]]
+size = 400
+```
+
+```python {.line-numbers}
+mean = (10, -22)
+cov = [[10, 5], [5, 10]]
+size = 1000
+```
+
+```python {.line-numbers}
+mean = (-12, -6)
+cov = [[7, 3], [3, 16]]
+size = 500
+```
+
+```python {.line-numbers}
+mean = (-15, 17)
+cov = [[15, 0], [0, 12]]
+size = 600
+```
+
+##### 2.3.2.2 Results
+
+Training set, partitioned into 5 clusters:
+
+![GMM training set](./img/gmm_2_train.png)
+
+Test set, partitioned into 5 clusters:
+
+![GMM test set](./img/gmm_2_test.png)
+
+#### 2.3.3 Experiment 3
+
+In the third experiment, we cluster data drawn from 3 three-dimensional Gaussian distributions.
+
+##### 2.3.3.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (-6, 3, 5)
+cov = [[73, 0, 0], [0, 50, 0], [0, 0, 22]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (12, 0, -10)
+cov = [[20, 5, 0], [5, 20, 0], [0, 0, 20]]
+size = 500
+```
+
+```python {.line-numbers}
+mean = (10, -20, 0)
+cov = [[10, 1, 3], [1, 10, 0], [3, 0, 10]]
+size = 800
+```
+
+##### 2.3.3.2 Results
+
+Training set, partitioned into 3 clusters:
+
+![GMM training set](./img/gmm_3_train.png)
+
+Test set, partitioned into 3 clusters:
+
+![GMM test set](./img/gmm_3_test.png)
+
+#### 2.3.4 Experiment 4
+
+In the fourth experiment, we cluster data drawn from 3 one-dimensional Gaussian distributions.
+
+##### 2.3.4.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (-20,)
+scale = 2
+size = 100
+```
+
+```python {.line-numbers}
+mean = (0,)
+scale = 1
+size = 150
+```
+
+```python {.line-numbers}
+mean = (15,)
+scale = 2
+size = 100
+```
+
+##### 2.3.4.2 Results
+
+Training set, partitioned into 3 clusters:
+
+![GMM training set](./img/gmm_4_train.png)
+
+Test set, partitioned into 3 clusters:
+
+![GMM test set](./img/gmm_4_test.png)
+
+#### 2.3.5 Experiment 5
+
+In the fifth experiment, we cluster data drawn from 3 flat, elongated two-dimensional Gaussian distributions.
+
+##### 2.3.5.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (0, -5)
+cov = [[73, 0], [0, 2]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (-3, 0)
+cov = [[100, 0], [0, 2]]
+size = 500
+```
+
+```python {.line-numbers}
+mean = (2, 5)
+cov = [[70, 1], [1, 3]]
+size = 500
+```
+
+##### 2.3.5.2 Results
+
+Training set, partitioned into 3 clusters:
+
+![GMM training set](./img/gmm_5_train.png)
+
+Test set, partitioned into 3 clusters:
+
+![GMM test set](./img/gmm_5_test.png)
+
+As the figures show, on datasets drawn from a mixture of Gaussians, the GMM clusters noticeably more accurately than K-Means.
+
+## Automatic Selection of the Number of Clusters
+
+### 3.1 Algorithm
+
+We use the Gap Statistic method to estimate the number of clusters in a dataset automatically. The idea of the method: compare the within-cluster dispersion of the clustering of the real dataset against the expected within-cluster dispersion of a uniform distribution of the same scale; the larger the gap between the two, the better the clustering result is considered to be, i.e., the better the choice of $K$. Concretely, we want to maximize
+
+$$
+\mathrm{Gap}\_k = \mathrm{E}(\log {W'}\_k) - \log W\_k
+$$
+
+where $k$ is the chosen value of $K$, $W\_k$ is the within-cluster sum of squared distances of the clustering of dataset $\mathbf{D}$, and ${W'}\_k$ is the same quantity for a uniform dataset $\mathbf{U}$ of the same scale. The precise definition of $W$ is given in section 1.2.
+
+We estimate $\mathrm{E}(\log {W'}\_k)$ with the Monte Carlo method: we draw $B$ uniform random samples within the bounding rectangle covered by dataset $\mathbf{D}$, obtaining $B$ different values ${W'}\_k^{(b)}$, so that
+
+$$
+\mathrm{E}(\log {W'}\_k)
+= \frac{1}{B} \sum\limits\_{b=1}^B \log {W'}\_k^{(b)}
+$$
+
+The algorithm is built on top of the K-Means model. When running it, a maximum value $K\_{\max}$ must be specified; the model scans the range $[1, K\_{\max}]$ and takes the $K$ that maximizes $\mathrm{Gap}\_k$ as its estimate of the number of clusters in the dataset. A sketch of this scan is given below.
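+
+A minimal sketch of the scan, again assuming the hypothetical `kmeans_once` helper from the sketch in section 1.1 (the early-stopping rule of section 3.2 is omitted here; the real implementation is `ClusteringAlgorithm.fit` in `source.py`):
+
+```python
+import math
+import numpy as np
+
+def gap_statistic_scan(data: np.ndarray, k_max: int, b: int = 5) -> int:
+    rng = np.random.default_rng()
+    low, high = data.min(axis=0), data.max(axis=0)
+    gaps = []
+    for k in range(1, k_max + 1):
+        centroids, labels = kmeans_once(data, k)
+        log_w = math.log(np.sum((data - centroids[labels]) ** 2))
+        # Monte Carlo estimate of E[log W'_k] over B uniform reference
+        # datasets drawn from the bounding box of the real dataset.
+        log_w_ref = []
+        for _ in range(b):
+            ref = rng.uniform(low, high, size=data.shape)
+            ref_centroids, ref_labels = kmeans_once(ref, k)
+            log_w_ref.append(math.log(np.sum((ref - ref_centroids[ref_labels]) ** 2)))
+        gaps.append(np.mean(log_w_ref) - log_w)
+    return int(np.argmax(gaps)) + 1  # scanned K values start at 1
+```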
+
+### 3.2 Optimizations
+
+To cut down the scan time somewhat, we set a termination threshold `BREAK_THRESHOLD` (default `3`): once $\mathrm{Gap}\_k$ has failed to increase for `BREAK_THRESHOLD` consecutive values of $k$, we assume the best $K$ has already been found, and the model stops the scan early.
+
+### 3.3 Experiments
+
+#### 3.3.0 Generating the Datasets
+
+We generate datasets in the same way as for K-Means.
+
+#### 3.3.1 Experiment 1
+
+In the first experiment, we cluster data drawn from 3 two-dimensional Gaussian distributions.
+
+##### 3.3.1.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (1, 2)
+cov = [[73, 0], [0, 22]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (16, -5)
+cov = [[21.2, 0], [0, 32.1]]
+size = 200
+```
+
+```python {.line-numbers}
+mean = (10, 22)
+cov = [[10, 5], [5, 10]]
+size = 1000
+```
+
+##### 3.3.1.2 Results
+
+Training set, with no $K$ specified:
+
+![Auto K-Means training set Gap-K plot](./img/auto-k-means_1_gaps.png)
+
+The model selects $K = 3$; the clustering result:
+
+![Auto K-Means training set](./img/auto-k-means_1_train.png)
+
+Test set, with no $K$ specified:
+
+![Auto K-Means test set](./img/auto-k-means_1_test.png)
+
+#### 3.3.2 Experiment 2
+
+In the second experiment, we cluster data drawn from 5 two-dimensional Gaussian distributions.
+
+##### 3.3.2.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (1, 0)
+cov = [[73, 0], [0, 22]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (20, 15)
+cov = [[21.2, 0], [0, 32.1]]
+size = 400
+```
+
+```python {.line-numbers}
+mean = (10, -22)
+cov = [[10, 5], [5, 10]]
+size = 1000
+```
+
+```python {.line-numbers}
+mean = (-12, -6)
+cov = [[7, 3], [3, 16]]
+size = 500
+```
+
+```python {.line-numbers}
+mean = (-15, 17)
+cov = [[15, 0], [0, 12]]
+size = 600
+```
+
+##### 3.3.2.2 Results
+
+Training set, with no $K$ specified:
+
+![Auto K-Means training set Gap-K plot](./img/auto-k-means_2_gaps.png)
+
+The model selects $K = 6$; the clustering result:
+
+![Auto K-Means training set](./img/auto-k-means_2_train.png)
+
+Test set, with no $K$ specified:
+
+![Auto K-Means test set](./img/auto-k-means_2_test.png)
+
+Although the data actually comes from $5$ Gaussian distributions, the model attains high $\mathrm{Gap}$ values at both $K=5$ and $K=6$, so this result is understandable.
+
+#### 3.3.3 Experiment 3
+
+In the third experiment, we cluster data drawn from 3 three-dimensional Gaussian distributions.
+
+##### 3.3.3.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (-6, 3, 5)
+cov = [[73, 0, 0], [0, 50, 0], [0, 0, 22]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (12, 0, -10)
+cov = [[20, 5, 0], [5, 20, 0], [0, 0, 20]]
+size = 500
+```
+
+```python {.line-numbers}
+mean = (10, -20, 0)
+cov = [[10, 1, 3], [1, 10, 0], [3, 0, 10]]
+size = 800
+```
+
+##### 3.3.3.2 Results
+
+Training set, with no $K$ specified:
+
+![Auto K-Means training set Gap-K plot](./img/auto-k-means_3_gaps.png)
+
+The model selects $K = 4$; the clustering result:
+
+![Auto K-Means training set](./img/auto-k-means_3_train.png)
+
+Test set, with no $K$ specified:
+
+![Auto K-Means test set](./img/auto-k-means_3_test.png)
+
+Similarly, the model attains high $\mathrm{Gap}$ values at both $K=3$ and $K=4$, which suggests that either value of $K$ is acceptable.
+
+#### 3.3.4 Experiment 4
+
+In the fourth experiment, we cluster data drawn from 3 one-dimensional Gaussian distributions.
+
+##### 3.3.4.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (-20,)
+scale = 2
+size = 100
+```
+
+```python {.line-numbers}
+mean = (0,)
+scale = 1
+size = 150
+```
+
+```python {.line-numbers}
+mean = (15,)
+scale = 2
+size = 100
+```
+
+##### 3.3.4.2 Results
+
+Training set, with no $K$ specified:
+
+![Auto K-Means training set Gap-K plot](./img/auto-k-means_4_gaps.png)
+
+The model selects $K = 3$; the clustering result:
+
+![Auto K-Means training set](./img/auto-k-means_4_train.png)
+
+Test set, with no $K$ specified:
+
+![Auto K-Means test set](./img/auto-k-means_4_test.png)
+
+#### 3.3.5 Experiment 5
+
+In the fifth experiment, we cluster data drawn from 3 flat, elongated two-dimensional Gaussian distributions.
+
+##### 3.3.5.1 Dataset Parameters
+
+```python {.line-numbers}
+mean = (0, -5)
+cov = [[73, 0], [0, 2]]
+size = 800
+```
+
+```python {.line-numbers}
+mean = (-3, 0)
+cov = [[100, 0], [0, 2]]
+size = 500
+```
+
+```python {.line-numbers}
+mean = (2, 5)
+cov = [[70, 1], [1, 3]]
+size = 500
+```
+
+##### 3.3.5.2 Results
+
+Training set, with no $K$ specified:
+
+![Auto K-Means training set Gap-K plot](./img/auto-k-means_5_gaps.png)
+
+The model selects $K = 1$; the clustering result:
+
+![Auto K-Means training set](./img/auto-k-means_5_train.png)
+
+Test set, with no $K$ specified:
+
+![Auto K-Means test set](./img/auto-k-means_5_test.png)
+
+In fact, from a human point of view, this clustering is also quite understandable.
+
+## Running the Code
+
+Run the following command to train and evaluate the models:
+
+```bash
+python ./tester.py
+```
+
+The parameters used to generate the datasets can be adjusted accordingly in the `TestSuite` class. A minimal example of driving the models directly is given below.
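+
+Alternatively, the models can be driven directly from Python. A minimal example using the classes exported by `source.py` (the dataset here is only a placeholder; substitute your own data of shape (N, d)):
+
+```python
+import numpy as np
+from source import ClusteringAlgorithm, GaussianMixture, KMeans
+
+# A toy 2-D dataset.
+data = np.random.default_rng().multivariate_normal(
+    (0, 0), [[10, 0], [0, 10]], 500)
+
+model = KMeans(n_clusters=3)          # or GaussianMixture(n_clusters=3)
+model.fit(data)
+labels = model.predict(data)
+
+auto = ClusteringAlgorithm(max_n_clusters=10)   # estimates K by itself
+auto.fit(data)
+print(auto.best_model.k, auto.predict(data))
+```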
diff --git a/assignment-3/submission/18307130003/img/auto-k-means_1_gaps.png b/assignment-3/submission/18307130003/img/auto-k-means_1_gaps.png
new file mode 100644
index 0000000000000000000000000000000000000000..9f7543762793a7d4963a948a300472abdf4a45a3
Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_1_gaps.png differ
diff --git a/assignment-3/submission/18307130003/img/auto-k-means_1_test.png b/assignment-3/submission/18307130003/img/auto-k-means_1_test.png
new file mode 100644
index 0000000000000000000000000000000000000000..70d00a4f1875b0431d141d87d956229bc9ea0196
Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_1_test.png differ
diff --git a/assignment-3/submission/18307130003/img/auto-k-means_1_train.png b/assignment-3/submission/18307130003/img/auto-k-means_1_train.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7796f447147c5a1f6def5de169e85b64ca5a1d5
Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_1_train.png differ
diff --git a/assignment-3/submission/18307130003/img/auto-k-means_2_gaps.png b/assignment-3/submission/18307130003/img/auto-k-means_2_gaps.png
new file mode 100644
index 0000000000000000000000000000000000000000..82f4d68b0aa9df8faa50fcdef65e5d466a4de58f
Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_2_gaps.png differ
diff --git a/assignment-3/submission/18307130003/img/auto-k-means_2_test.png b/assignment-3/submission/18307130003/img/auto-k-means_2_test.png
new file mode 100644
index 0000000000000000000000000000000000000000..5bc4521d128b470dcebd53bffe32064ab09efbfa
Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_2_test.png differ
diff --git a/assignment-3/submission/18307130003/img/auto-k-means_2_train.png b/assignment-3/submission/18307130003/img/auto-k-means_2_train.png
new file mode 100644
index 0000000000000000000000000000000000000000..3d9714fc8c732724fd7e5d089e4dc262d6a4ef89
Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_2_train.png differ
diff --git a/assignment-3/submission/18307130003/img/auto-k-means_3_gaps.png b/assignment-3/submission/18307130003/img/auto-k-means_3_gaps.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e8a0833de91409c860fb3d7df6a0c38bf03363d
Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_3_gaps.png differ
diff --git a/assignment-3/submission/18307130003/img/auto-k-means_3_test.png b/assignment-3/submission/18307130003/img/auto-k-means_3_test.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd390f8f6122921d7af2d83849f36749e0dd72b8
Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_3_test.png differ
diff --git a/assignment-3/submission/18307130003/img/auto-k-means_3_train.png b/assignment-3/submission/18307130003/img/auto-k-means_3_train.png
new file mode 100644
index 
0000000000000000000000000000000000000000..49a024e0edfebb02dc8b7bf68f901ff8a183d282 Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_3_train.png differ diff --git a/assignment-3/submission/18307130003/img/auto-k-means_4_gaps.png b/assignment-3/submission/18307130003/img/auto-k-means_4_gaps.png new file mode 100644 index 0000000000000000000000000000000000000000..571a76cb555eee0e61b6e32980c09bb98c0331f5 Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_4_gaps.png differ diff --git a/assignment-3/submission/18307130003/img/auto-k-means_4_test.png b/assignment-3/submission/18307130003/img/auto-k-means_4_test.png new file mode 100644 index 0000000000000000000000000000000000000000..2ea96f28c950ad426a2cdd2e3b0f7289161e6f1d Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_4_test.png differ diff --git a/assignment-3/submission/18307130003/img/auto-k-means_4_train.png b/assignment-3/submission/18307130003/img/auto-k-means_4_train.png new file mode 100644 index 0000000000000000000000000000000000000000..6c3c77cbfb279a7795b49d12431cfc1edefcb3ae Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_4_train.png differ diff --git a/assignment-3/submission/18307130003/img/auto-k-means_5_gaps.png b/assignment-3/submission/18307130003/img/auto-k-means_5_gaps.png new file mode 100644 index 0000000000000000000000000000000000000000..141a8974ecee5fdc53b7773a735b6007d9873b4b Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_5_gaps.png differ diff --git a/assignment-3/submission/18307130003/img/auto-k-means_5_test.png b/assignment-3/submission/18307130003/img/auto-k-means_5_test.png new file mode 100644 index 0000000000000000000000000000000000000000..85793ef444a91696f053249b7082bcdba5dafdd8 Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_5_test.png differ diff --git a/assignment-3/submission/18307130003/img/auto-k-means_5_train.png b/assignment-3/submission/18307130003/img/auto-k-means_5_train.png new file mode 100644 index 0000000000000000000000000000000000000000..caeb2d566d323643a99106023bc9fcfe790ca2e8 Binary files /dev/null and b/assignment-3/submission/18307130003/img/auto-k-means_5_train.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_1_test.png b/assignment-3/submission/18307130003/img/gmm_1_test.png new file mode 100644 index 0000000000000000000000000000000000000000..680e3025eb3100030852db2304e98568a141de37 Binary files /dev/null and b/assignment-3/submission/18307130003/img/gmm_1_test.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_1_train.png b/assignment-3/submission/18307130003/img/gmm_1_train.png new file mode 100644 index 0000000000000000000000000000000000000000..d8cbf04f179125b53c12590d3e42550909e85203 Binary files /dev/null and b/assignment-3/submission/18307130003/img/gmm_1_train.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_2_test.png b/assignment-3/submission/18307130003/img/gmm_2_test.png new file mode 100644 index 0000000000000000000000000000000000000000..05d52221d19e19e7c8650f96f0cde94496170d20 Binary files /dev/null and b/assignment-3/submission/18307130003/img/gmm_2_test.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_2_train.png b/assignment-3/submission/18307130003/img/gmm_2_train.png new file mode 100644 index 0000000000000000000000000000000000000000..14e985ea1ecbcf41f23d96843671402d2f5a373b Binary files /dev/null and 
b/assignment-3/submission/18307130003/img/gmm_2_train.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_3_test.png b/assignment-3/submission/18307130003/img/gmm_3_test.png new file mode 100644 index 0000000000000000000000000000000000000000..e11e4fd98703445fee62033b6e1a5c06ee3b6927 Binary files /dev/null and b/assignment-3/submission/18307130003/img/gmm_3_test.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_3_train.png b/assignment-3/submission/18307130003/img/gmm_3_train.png new file mode 100644 index 0000000000000000000000000000000000000000..97f9d6c84ee827ec2040fd78bb2c1448f76408a8 Binary files /dev/null and b/assignment-3/submission/18307130003/img/gmm_3_train.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_4_test.png b/assignment-3/submission/18307130003/img/gmm_4_test.png new file mode 100644 index 0000000000000000000000000000000000000000..9832c07e16f4debefb176e596942fdc5ad2147b2 Binary files /dev/null and b/assignment-3/submission/18307130003/img/gmm_4_test.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_4_train.png b/assignment-3/submission/18307130003/img/gmm_4_train.png new file mode 100644 index 0000000000000000000000000000000000000000..8de9eba054f9b41940aef96f0ec65b1ac94371ac Binary files /dev/null and b/assignment-3/submission/18307130003/img/gmm_4_train.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_5_test.png b/assignment-3/submission/18307130003/img/gmm_5_test.png new file mode 100644 index 0000000000000000000000000000000000000000..4bedfe87d5529464087a68692ea93cc18f44f214 Binary files /dev/null and b/assignment-3/submission/18307130003/img/gmm_5_test.png differ diff --git a/assignment-3/submission/18307130003/img/gmm_5_train.png b/assignment-3/submission/18307130003/img/gmm_5_train.png new file mode 100644 index 0000000000000000000000000000000000000000..bc6a8b9f1a644db865114ca27fadc75b3adb097b Binary files /dev/null and b/assignment-3/submission/18307130003/img/gmm_5_train.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_1_bad_test.png b/assignment-3/submission/18307130003/img/k-means_1_bad_test.png new file mode 100644 index 0000000000000000000000000000000000000000..cbe08c0d516d2582178f7aed19a16547a3ff6b51 Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_1_bad_test.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_1_bad_train.png b/assignment-3/submission/18307130003/img/k-means_1_bad_train.png new file mode 100644 index 0000000000000000000000000000000000000000..bb4a7ee3cbb3fc11de821e7149547f1dfe3181e1 Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_1_bad_train.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_1_test.png b/assignment-3/submission/18307130003/img/k-means_1_test.png new file mode 100644 index 0000000000000000000000000000000000000000..99fb1616b190d4ce37261c2941a06894a727e679 Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_1_test.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_1_train.png b/assignment-3/submission/18307130003/img/k-means_1_train.png new file mode 100644 index 0000000000000000000000000000000000000000..7cf2b1ce1bc4eae522e612098f8322bb2bebef97 Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_1_train.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_2_test.png b/assignment-3/submission/18307130003/img/k-means_2_test.png new file 
mode 100644 index 0000000000000000000000000000000000000000..62e4d9f28c349612699bd871943aec7dd6140211 Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_2_test.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_2_train.png b/assignment-3/submission/18307130003/img/k-means_2_train.png new file mode 100644 index 0000000000000000000000000000000000000000..8a388fb3e6d4125d576ec65701bcf6796ccd006f Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_2_train.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_3_test.png b/assignment-3/submission/18307130003/img/k-means_3_test.png new file mode 100644 index 0000000000000000000000000000000000000000..1fc327004d3f5ff891bac8acad40e88165e16803 Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_3_test.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_3_train.png b/assignment-3/submission/18307130003/img/k-means_3_train.png new file mode 100644 index 0000000000000000000000000000000000000000..837a12a43d7d2d4ad5ec78cf50b66366f99e39da Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_3_train.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_4_test.png b/assignment-3/submission/18307130003/img/k-means_4_test.png new file mode 100644 index 0000000000000000000000000000000000000000..861702903e1a9ffdb87d3c86e5923b3fdb12c958 Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_4_test.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_4_train.png b/assignment-3/submission/18307130003/img/k-means_4_train.png new file mode 100644 index 0000000000000000000000000000000000000000..27b023bd80919684f7cb7eb37368b31c1dd9f603 Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_4_train.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_5_test.png b/assignment-3/submission/18307130003/img/k-means_5_test.png new file mode 100644 index 0000000000000000000000000000000000000000..da6a0fafb2b624dcff28a3d4f38b77f8c66290ce Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_5_test.png differ diff --git a/assignment-3/submission/18307130003/img/k-means_5_train.png b/assignment-3/submission/18307130003/img/k-means_5_train.png new file mode 100644 index 0000000000000000000000000000000000000000..f5b1da45b95556b29e3830925bb730f91c56cec1 Binary files /dev/null and b/assignment-3/submission/18307130003/img/k-means_5_train.png differ diff --git a/assignment-3/submission/18307130003/source.py b/assignment-3/submission/18307130003/source.py new file mode 100644 index 0000000000000000000000000000000000000000..1f752604c63cda3c2bf4f6483638de95b65dbe5a --- /dev/null +++ b/assignment-3/submission/18307130003/source.py @@ -0,0 +1,473 @@ +from abc import ABC, abstractmethod +import math +import numpy as np +from utils import ( + UniformParameters, + assert_, + distance, + multinormal_pdf, + normal_pdf, +) + + +class Model(ABC): + ''' + The abstract class (ABC) of a model. 
+ ''' + + @abstractmethod + def __init__(self, n_clusters: int, n_epochs: int) -> None: + ''' + Args: + `n_clusters`: the number of clusters to partition into + `n_epochs`: the number of epochs to run in total + ''' + + # Dataset properties + self.train_data: np.ndarray = None + self.data_size: int = 0 + self.dimension: int = 0 + + # Model parameters + self.k: int = n_clusters + self.n_epochs: int = n_epochs + + # Others + self.rng: np.random.Generator = np.random.default_rng() + self.FLOAT_MAX: float = np.finfo(float).max + self.EPS: float = np.finfo(float).eps + + @abstractmethod + def fit(self, train_data: np.ndarray) -> float: + ''' + Train the model using training data. + + Args: + `train_data`: shape(N, d) + + Return: + The training loss. + ''' + + pass + + @abstractmethod + def predict(self, test_data: np.ndarray) -> np.ndarray: + ''' + Predict the labels of testing data using the pre-trained model. + + Args: + `test_data`: shape(N, d) + + Return: + The predicted labels of testing data, shape(N). + ''' + + pass + + +class KMeans(Model): + ''' + k-means clustering algorithm, which aims to partition n observations into k + clusters in which each observation belongs to the cluster with the nearest + mean, serving as a prototype of the cluster. + ''' + + def __init__(self, n_clusters: int, n_epochs: int = 10) -> None: + + super().__init__(n_clusters, n_epochs) + + # Model parameters + self.centroids: np.ndarray = None + + def label(self, point: np.ndarray, centroids: np.ndarray) -> int: + ''' + Args: + `point`: shape(d) + `centroids`: shape(k, d) + + Return: + The index of the closest centroid to the point. + ''' + + return np.argmin([distance(point, c) for c in centroids]) + + def fit(self, train_data: np.ndarray) -> float: + + self.data_size: int = train_data.shape[0] + self.dimension: int = ( + train_data.shape[1] if len(train_data.shape) > 1 + else 1 + ) + self.train_data = ( + train_data if self.dimension > 1 + else train_data.reshape(self.data_size,) + ) + min_loss = self.FLOAT_MAX + + # Run n_epochs times to find the best model. + for i in range(self.n_epochs): + # Select k initial centroids randomly from training data. + centroids: np.ndarray = train_data[ + np.random.choice(self.data_size, size=self.k, replace=False) + ] + labels: np.ndarray = np.zeros(self.data_size, dtype=int) + + while True: + # Assign labels to points based on their closest centroids. + new_labels = np.array([ + self.label(point, centroids) for point in train_data + ]) + + # Check for convergence. + if np.array_equal(new_labels, labels): + break + labels = new_labels + + # Find new centroids from the means of points in each cluster. + centroids = np.array([ + np.mean(train_data[labels == i], axis=0) for i in range(self.k) + ]) + + loss: float = np.sum((train_data - centroids[labels]) ** 2) + if loss < min_loss: + min_loss = loss + self.centroids = centroids + + print('Training progress: {}/{}, \tloss: {}'.format( + i + 1, self.n_epochs, loss, + ), end='\r') + + print() + return min_loss + + def predict(self, test_data: np.ndarray) -> np.ndarray: + + return np.array([ + self.label(point, self.centroids) for point in test_data + ]) + + +class GaussianMixture(Model): + ''' + Gaussian Mixture Model (GMM) is a probabilistic model that assumes there + are a certain number of Gaussian distributions, and each of these + distributions represent a cluster. Hence, a GMM tends to group the data + points belonging to a single distribution together. 
+ ''' + + def __init__(self, n_clusters: int, n_epochs: int = 5) -> None: + + super().__init__(n_clusters, n_epochs) + + # Dataset properties + self.train_data: np.ndarray = None + self.data_size: int = 0 + self.dimension: int = 0 + + # Model parameters + + # means: shape(k, d) + self.means: np.ndarray = None + # covs: shape(k, d, d) + self.covs: np.ndarray = None + # scales: shape(k) + self.scales: np.ndarray = None + # weights: shape(k) + self.weights: np.ndarray = None + # likelihoods: shape(N, k) + self.likelihoods: np.ndarray = None + + self.best_means: np.ndarray = None + self.best_covs: np.ndarray = None + self.best_scales: np.ndarray = None + self.best_weights: np.ndarray = None + self.best_likelihoods: np.ndarray = None + + def _init_params(self) -> None: + ''' + Randomly initialize the starting parameters. + ''' + + # Select k initial centroids randomly from training data. + self.means = self.train_data[ + np.random.choice(self.data_size, size=self.k, replace=False) + ] + + if self.dimension > 1: + # Initialize the coefficient of variation of each Gaussian + # distribution to be an identity matrix. + self.covs = np.array([ + np.identity(self.dimension) for _i in range(self.k) + ]) + else: + # Initialize the standard deviation of each Gaussian distribution + # to be 1. + self.scales = np.ones(self.k) + + # Initialize the likelihood that an observation belongs to a cluster + # to be 1/k for all clusters. + self.weights = np.ones(self.k) / self.k + + def _e_step(self, dataset: np.ndarray = None, test_mode: bool = False) -> None: + ''' + In the E(xpectation) step, we calculate the likelihood of each + observation x_i belonging to each cluster using the current estimated + parameters. + + Args: + `dataset`: shape(N, d) + `test_mode`: whether test data is being used + ''' + + if dataset is None: + dataset = self.train_data + data_size: int = dataset.shape[0] + + if test_mode: + self.means = self.best_means + self.covs = self.best_covs + self.scales = self.best_scales + self.weights = self.best_weights + self.likelihoods = self.best_likelihoods + + # Calculate the posterior probability that an observation belongs to + # each Gaussian distribution, using Bayes' Theorem. + self.likelihoods = np.zeros((data_size, self.k)) + for i, x_i in enumerate(dataset): + # Calculate the likelihood of each observation x_i. + if self.dimension > 1: + f_i: np.ndarray = np.array([ + self.weights[k] * multinormal_pdf( + x_i, self.means[k], self.covs[k] + self.EPS + ) for k in range(self.k) + ]) + else: + f_i: np.ndarray = np.array([ + self.weights[k] * normal_pdf( + x_i, self.means[k], self.scales[k] + self.EPS + ) for k in range(self.k) + ]) + f_i_sum: float = np.sum(f_i) + self.EPS + self.likelihoods[i] = f_i / f_i_sum + + def _m_step(self) -> None: + ''' + In the M(aximization) step, we re-estimate our learning parameters for + each cluster. 
+ ''' + + # shape(n, k).sum(axis=0) = shape(k) + cluster_likelihoods: np.ndarray = np.sum( + self.likelihoods, axis=0 + ) + self.EPS + assert_( + 'cluster_likelihoods.shape', + cluster_likelihoods.shape, + (self.k,), + ) + + # shape(k) / scalar = shape(k) + self.weights = cluster_likelihoods / self.data_size + assert_('weights.shape', self.weights.shape, (self.k,)) + + if self.dimension > 1: + # shape(k, n) * shape(n, d) = shape(k, d) + self.means = np.matmul( + self.likelihoods.T, self.train_data + ) / cluster_likelihoods[:, None] + assert_('means.shape', self.means.shape, (self.k, self.dimension)) + + # shape(k, d, d) + self.covs = np.array([ + # shape(d, d) / scalar = shape(d, d) + np.sum([ + # scalar * shape(d, 1) * shape(1, d) = shape(d, d) + self.likelihoods[i, k] * np.matmul( + x_m.reshape((self.dimension, 1)), + x_m.reshape((1, self.dimension)), + ) + # shape(n, d) - shape(d) = shape(n, d) + for i, x_m in enumerate(self.train_data - self.means[k]) + ], axis=0) / cluster_likelihoods[k] + for k in range(self.k) + ]) + assert_( + 'covs.shape', + self.covs.shape, + (self.k, self.dimension, self.dimension), + ) + else: + # shape(k, n) * shape(n) = shape(k) + # shape(k) / shape(k) = shape(k) + self.means = np.dot( + self.likelihoods.T, self.train_data + ) / cluster_likelihoods + assert_('means.shape', self.means.shape, (self.k,)) + + # shape(k) + self.scales = np.array([ + np.sqrt(np.sum([ + self.likelihoods[i, k] * (x_m ** 2) + for i, x_m in enumerate(self.train_data - self.means[k]) + ], axis=0) / cluster_likelihoods[k]) + for k in range(self.k) + ]) + assert_('scales.shape', self.scales.shape, (self.k,)) + + def fit(self, train_data: np.ndarray) -> float: + + self.data_size: int = train_data.shape[0] + self.dimension: int = ( + train_data.shape[1] if len(train_data.shape) > 1 + else 1 + ) + self.train_data = ( + train_data if self.dimension > 1 + else train_data.reshape(self.data_size,) + ) + min_loss = 0.0 + + # Run n_epochs times to find the best model. + for i in range(self.n_epochs): + self._init_params() + labels: np.ndarray = np.zeros(self.data_size, dtype=int) + + while True: + self._e_step() + self._m_step() + + # Assign labels to points based on the most possible cluster + # according to the likelihood matrix. + new_labels = np.argmax(self.likelihoods, axis=1) + + # Check for convergence. + if np.array_equal(new_labels, labels): + break + labels = new_labels + + loss: float = -np.mean(np.max(self.likelihoods, axis=1)) + if loss < min_loss: + min_loss = loss + self.best_means = self.means + self.best_covs = self.covs + self.best_scales = self.scales + self.best_weights = self.weights + self.best_likelihoods = self.likelihoods + + print('Training progress: {}/{}, \tloss: {}'.format( + i + 1, self.n_epochs, loss, + ), end='\r') + + print() + return min_loss + + def predict(self, test_data: np.ndarray) -> np.ndarray: + + self._e_step(dataset=test_data, test_mode=True) + return np.argmax(self.likelihoods, axis=1) + + +class ClusteringAlgorithm(Model): + ''' + Based on k-means clustering algorithm, using Gap Statistic to estimate the + number of clusters in the dataset automatically. + ''' + + def __init__( + self, max_n_clusters: int, n_epochs: int = 10, n_ref: int = 5 + ) -> None: + ''' + Args: + `max_n_clusters`: the maximum number of clusters to test + `n_epochs`: the number of epochs to run in total + `n_ref`: the number of reference datasets in Gap Statistic + ''' + + # Not setting the number of clusters at the beginning. 
+ super().__init__(0, n_epochs) + + # Model parameters + self.max_k = max_n_clusters + self.n_ref = n_ref + self.gaps: np.ndarray = None + self.BREAK_THRESHOLD = 3 + + self.best_model = None + + def generate_uniform(self, param: UniformParameters) -> np.ndarray: + ''' + Generate a dataset from a uniform distribution with given parameters. + + Args: + `param`: parameters used to generate a dataset + + Return: + shape(N, d) + ''' + + size, (low, high) = param + dimension: int + if np.isscalar(low): + assert_('np.isscalar(high)', np.isscalar(high), True) + dimension = 1 + return self.rng.uniform(low, high, size) + else: + assert_('low.shape[0]', low.shape[0], high.shape[0]) + dimension = low.shape[0] + return self.rng.uniform(low, high, (size, dimension)) + + def fit(self, train_data: np.ndarray) -> float: + + gaps, max_gap, prev_gap = [0.0], 0.0, 0.0 + min_loss = self.FLOAT_MAX + # Break the loop if the gap statistics is not likely to ascend anymore. + break_counter = 0 + + for k in range(1, self.max_k + 1): + model = KMeans(k, self.n_epochs) + loss = model.fit(train_data) + + low: np.ndarray = np.amin(train_data, axis=0) + high: np.ndarray = np.amax(train_data, axis=0) + + # Generate n_ref reference datasets, cluster each one using the + # current model, and calculate the gap statistic. + log_random_loss: float = np.mean([ + # Since the dataset is generated from a uniform distribution, + # there's no need to run the k-means clustering algorithm for + # multiple times. + math.log(KMeans(k, 1).fit(self.generate_uniform( + UniformParameters( + size=model.data_size, + intervals=(low, high), + ) + ))) for _i in range(self.n_ref) + ]) + gap = log_random_loss - math.log(loss) + gaps.append(gap) + print('Tried k: {}/{}, \tgap statistics: {}'.format( + k, self.max_k, gap + )) + + if gap > max_gap: + max_gap = gap + min_loss = loss + self.best_model = model + + if gap > prev_gap: + break_counter = 0 + else: + break_counter += 1 + if break_counter >= self.BREAK_THRESHOLD: + print('Gap statistics is not ascending, loop terminated.') + break + prev_gap = gap + + self.gaps = np.array(gaps) + print('Estimated k: {}'.format(self.best_model.k)) + return min_loss + + def predict(self, test_data: np.ndarray) -> np.ndarray: + + return self.best_model.predict(test_data) diff --git a/assignment-3/submission/18307130003/tester.py b/assignment-3/submission/18307130003/tester.py new file mode 100644 index 0000000000000000000000000000000000000000..c2c14ae7279129374d28952c711324a4d0dffa4e --- /dev/null +++ b/assignment-3/submission/18307130003/tester.py @@ -0,0 +1,347 @@ +from typing import Callable, List, Tuple, Type +import math +import numpy as np +import matplotlib.pyplot as plt +from source import ( + ClusteringAlgorithm as AutoKMeans, + Model, + KMeans, + GaussianMixture as GMM, +) +from utils import assert_, NormalParameters + + +class TestSuite: + ''' + Multiple testing data for models. + ''' + + def __init__(self) -> None: + self.rng: np.random.Generator = np.random.default_rng() + + def generate_normal(self, param: NormalParameters) -> np.ndarray: + ''' + Generate a dataset from a Gaussian distribution with given parameters. 
+ + Args: + `param`: parameters used to generate a dataset + + Return: + shape(N, d) + ''' + + size, mean, cov, scale = param + if len(mean) > 1: + return self.rng.multivariate_normal(mean, cov, size) + else: + return self.rng.normal(mean[0], scale, size) + + def combine(self, *datasets: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + ''' + Combine several datasets into a single dataset. + + Args: + `*datasets`: a tuple of datasets needed to combine + + Return: + `dataset`: shape(N, d), where N is the total size of all datasets + `labels`: shape(N), the labels for all points in the dataset + ''' + + dataset: np.ndarray = np.concatenate(datasets) + labels: np.ndarray = np.concatenate([ + np.ones(d.shape[0], dtype=int) * i + for (i, d) in enumerate(datasets) + ]) + indices = np.arange(dataset.shape[0]) + np.random.shuffle(indices) + dataset = dataset[indices] + labels = labels[indices] + return dataset, labels + + def generate_data(self, *params: NormalParameters) -> Tuple[np.ndarray, int]: + ''' + Generate a dataset for tests. + + Args: + `params`: a tuple of parameters to generate datasets + + Return: + `dataset`: shape(N, d) + `n_clusters`: the number of clusters to partition into + ''' + + dataset, _labels = self.combine(*tuple( + self.generate_normal(p) for p in params + )) + n_clusters: int = len(params) + return dataset, n_clusters + + def train( + self, train_data: np.ndarray, model: Model + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + ''' + Train a model with training data. + + Args: + `train_data`: shape(N, d) + `model`: the model that we need to train + `n_clusters`: the number of clusters to partition into + + Return: + `train_labels`: the predicted labels of training data, shape(N) + `centroids`: the centroids calculated from training data, shape(k, d) + `gaps`: the gap statistics of each k, shape(k + 1) + ''' + + model.fit(train_data) + train_labels = model.predict(train_data) + centroids: np.ndarray = None + gaps: np.ndarray = None + + if isinstance(model, KMeans): + centroids = model.centroids + elif isinstance(model, GMM): + centroids = model.means + elif isinstance(model, AutoKMeans): + centroids = model.best_model.centroids + gaps = model.gaps + return train_labels, centroids, gaps + + def evaluate(self, test_data: np.ndarray, model: Model) -> Tuple[np.ndarray]: + ''' + Evaluate a model with testing data. 
+ + Args: + `test_data`: shape(N, d) + `model`: the model that we need to evaluate + + Return: + `test_labels`: The predicted labels of testing data, shape(N) + ''' + + test_labels = model.predict(test_data) + return test_labels + + def test_data_1(self) -> Tuple[np.ndarray, int]: + + return self.generate_data( + NormalParameters( + size=800, + mean=(1, 2), + cov=[[73, 0], [0, 22]], + ), + NormalParameters( + size=200, + mean=(16, -5), + cov=[[21.2, 0], [0, 32.1]], + ), + NormalParameters( + size=1000, + mean=(10, 22), + cov=[[10, 5], [5, 10]], + ), + ) + + def test_data_2(self) -> Tuple[np.ndarray, int]: + + return self.generate_data( + NormalParameters( + size=800, + mean=(1, 0), + cov=[[73, 0], [0, 22]], + ), + NormalParameters( + size=400, + mean=(20, 15), + cov=[[21.2, 0], [0, 32.1]], + ), + NormalParameters( + size=1000, + mean=(10, -22), + cov=[[10, 5], [5, 10]], + ), + NormalParameters( + size=500, + mean=(-12, -6), + cov=[[7, 3], [3, 16]], + ), + NormalParameters( + size=600, + mean=(-15, 17), + cov=[[15, 0], [0, 12]], + ), + ) + + def test_data_3(self) -> Tuple[np.ndarray, int]: + + return self.generate_data( + NormalParameters( + size=800, + mean=(-6, 3, 5), + cov=[[73, 0, 0], [0, 50, 0], [0, 0, 22]], + ), + NormalParameters( + size=500, + mean=(12, 0, -10), + cov=[[20, 5, 0], [5, 20, 0], [0, 0, 20]], + ), + NormalParameters( + size=800, + mean=(10, -20, 0), + cov=[[10, 1, 3], [1, 10, 0], [3, 0, 10]], + ), + ) + + def test_data_4(self) -> Tuple[np.ndarray, int]: + + return self.generate_data( + NormalParameters( + size=100, + mean=(-20,), + scale=2, + ), + NormalParameters( + size=150, + mean=(0,), + scale=1, + ), + NormalParameters( + size=100, + mean=(15,), + scale=2, + ), + ) + + def test_data_5(self) -> Tuple[np.ndarray, int]: + + return self.generate_data( + NormalParameters( + size=800, + mean=(0, -5), + cov=[[73, 0], [0, 2]], + ), + NormalParameters( + size=500, + mean=(-3, 0), + cov=[[100, 0], [0, 2]], + ), + NormalParameters( + size=500, + mean=(2, 5), + cov=[[70, 1], [1, 3]], + ), + ) + + def run(self) -> None: + ''' + Run all the tests. 
+ ''' + + testcases: List[Tuple[ + str, Callable[[], Tuple[np.ndarray, int]], Type[Model], int + ]] = [ + ('k-means_1', self.test_data_1, KMeans, 0), + ('k-means_2', self.test_data_2, KMeans, 0), + ('k-means_3', self.test_data_3, KMeans, 0), + ('k-means_4', self.test_data_4, KMeans, 0), + ('k-means_5', self.test_data_5, KMeans, 0), + ('gmm_1', self.test_data_1, GMM, 0), + ('gmm_2', self.test_data_2, GMM, 0), + ('gmm_3', self.test_data_3, GMM, 0), + ('gmm_4', self.test_data_4, GMM, 0), + ('gmm_5', self.test_data_5, GMM, 0), + ('auto-k-means_1', self.test_data_1, AutoKMeans, 10), + ('auto-k-means_2', self.test_data_2, AutoKMeans, 10), + ('auto-k-means_3', self.test_data_3, AutoKMeans, 10), + ('auto-k-means_4', self.test_data_4, AutoKMeans, 10), + ('auto-k-means_5', self.test_data_5, AutoKMeans, 10), + ] + + for testcase in testcases: + name, get_dataset, model_class, n_clusters = testcase + + # Obtain training data and testing data + dataset, real_n_clusters = get_dataset() + train_size: int = math.floor(dataset.shape[0] * 0.8) + train_data: np.ndarray = dataset[:train_size] + test_data: np.ndarray = dataset[train_size:] + + # Train the model with training data + model = model_class(n_clusters or real_n_clusters) + train_labels, centroids, gaps = self.train(train_data, model) + + # Evaluate the model with testing data + test_labels = self.evaluate(test_data, model) + + # Visualize the datasets with labels + visualize(name + '_train', train_data, train_labels, centroids) + visualize(name + '_test', test_data, test_labels, centroids) + + # Visualize the gap statistics for + if gaps is not None: + visualize_gaps(name + '_gaps', gaps) + + print(f'{name}: Done.') + + +def visualize( + name: str, + dataset: np.ndarray, + labels: np.ndarray, + centroids: np.ndarray = None, +) -> None: + ''' + Visualize a dataset with labels. + + Args: + `name`: the output filename when saving the figure + `dataset`: shape(N, d) + `labels`: shape(N) + `centroids`: shape(k, d) + ''' + + assert_('dataset.shape[0]', dataset.shape[0], labels.shape[0]) + + # Plot the data points and the centroids. + if len(dataset.shape) > 1: + plt.scatter(dataset[:, 0], dataset[:, 1], c=labels, s=30) + if centroids is not None: + plt.scatter( + centroids[:, 0], centroids[:, 1], c='black', s=100, alpha=0.5, + ) + else: + plt.scatter(dataset, np.zeros(dataset.shape[0]), c=labels, s=30) + if centroids is not None: + plt.yticks([]) + plt.scatter( + centroids, np.zeros(centroids.shape[0]), + c='black', s=100, alpha=0.5, + ) + + # Save the figure to a local file. + plt.savefig(f'img/{name}') + plt.clf() + + +def visualize_gaps(name: str, gaps: np.ndarray) -> None: + ''' + Visualize the gap statistics. + + Args: + `name`: the output filename when saving the figure + `gaps`: the gap statistics of each k, shape(k + 1) + ''' + + # Plot the gap statistics. + indices = np.arange(1, gaps.shape[0], dtype=int) + plt.xticks(indices) + plt.plot(indices, gaps[indices], '-bo') + + # Save the figure to a local file. 
+ plt.savefig(f'img/{name}') + plt.clf() + + +if __name__ == '__main__': + TestSuite().run() diff --git a/assignment-3/submission/18307130003/tester_demo.py b/assignment-3/submission/18307130003/tester_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..f6dbd555444eaabc2b38d914f295e72b2565a2d8 --- /dev/null +++ b/assignment-3/submission/18307130003/tester_demo.py @@ -0,0 +1,118 @@ +import numpy as np +import sys + +from source import KMeans, GaussianMixture + + +def shuffle(*datas): + data = np.concatenate(datas) + label = np.concatenate([ + np.ones((d.shape[0],), dtype=int) * i + for (i, d) in enumerate(datas) + ]) + N = data.shape[0] + idx = np.arange(N) + np.random.shuffle(idx) + data = data[idx] + label = label[idx] + return data, label + + +def data_1(): + mean = (1, 2) + cov = np.array([[73, 0], [0, 22]]) + x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (16, -5) + cov = np.array([[21.2, 0], [0, 32.1]]) + y = np.random.multivariate_normal(mean, cov, (200,)) + + mean = (10, 22) + cov = np.array([[10, 5], [5, 10]]) + z = np.random.multivariate_normal(mean, cov, (1000,)) + + data, _ = shuffle(x, y, z) + return (data, data), 3 + + +def data_2(): + train_data = np.array([ + [23, 12, 173, 2134], + [99, -12, -126, -31], + [55, -145, -123, -342], + ]) + return (train_data, train_data), 2 + + +def data_3(): + train_data = np.array([ + [23], + [-2999], + [-2955], + ]) + return (train_data, train_data), 2 + + +def test_with_n_clusters(data_function, algorithm_class): + (train_data, test_data), n_clusters = data_function() + model = algorithm_class(n_clusters) + model.fit(train_data) + res = model.predict(test_data) + assert len(res.shape) == 1 and res.shape[0] == test_data.shape[0], \ + "shape of result is wrong" + + return res + + +def testcase_1_1(): + test_with_n_clusters(data_1, KMeans) + return True + + +def testcase_1_2(): + res = test_with_n_clusters(data_2, KMeans) + return res[0] != res[1] and res[1] == res[2] + + +def testcase_2_1(): + test_with_n_clusters(data_1, GaussianMixture) + return True + + +def testcase_2_2(): + res = test_with_n_clusters(data_3, GaussianMixture) + return res[0] != res[1] and res[1] == res[2] + + +def test_all(err_report=False): + testcases = [ + ["KMeans-1", testcase_1_1, 4], + ["KMeans-2", testcase_1_2, 4], + # ["KMeans-3", testcase_1_3, 4], + # ["KMeans-4", testcase_1_4, 4], + # ["KMeans-5", testcase_1_5, 4], + ["GMM-1", testcase_2_1, 4], + ["GMM-2", testcase_2_2, 4], + # ["GMM-3", testcase_2_3, 4], + # ["GMM-4", testcase_2_4, 4], + # ["GMM-5", testcase_2_5, 4], + ] + sum_score = sum([case[2] for case in testcases]) + score = 0 + for case in testcases: + try: + res = case[2] if case[1]() else 0 + except Exception as e: + if err_report: + print("Error [{}] occurs in {}".format(str(e), case[0])) + res = 0 + score += res + print("+ {:14} {}/{}".format(case[0], res, case[2])) + print("{:16} {}/{}".format("FINAL SCORE", score, sum_score)) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "--report": + test_all(True) + else: + test_all() diff --git a/assignment-3/submission/18307130003/utils.py b/assignment-3/submission/18307130003/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..37d82a1dace1a13fe0e409842b99ea1756ef4547 --- /dev/null +++ b/assignment-3/submission/18307130003/utils.py @@ -0,0 +1,95 @@ +from typing import Any, List, NamedTuple, Tuple +import numpy as np +import math + + +class NormalParameters(NamedTuple): + ''' + Attributes: + `size`: the number of data 
points in the dataset
+        `mean`: the mean of the distribution
+        `cov`: the covariance matrix of the distribution (dimension > 1)
+        `scale`: the standard deviation of the distribution (dimension = 1)
+    '''
+
+    size: int
+    mean: Tuple[float, ...]
+    cov: List[List[float]] = None
+    scale: float = None
+
+
+class UniformParameters(NamedTuple):
+    '''
+    Attributes:
+        `size`: the number of data points in the dataset
+        `intervals`: the range of each dimension, Tuple[shape(N), shape(N)]
+    '''
+
+    size: int
+    intervals: Tuple[np.ndarray, np.ndarray]
+
+
+def distance(point_1: np.ndarray, point_2: np.ndarray) -> float:
+    '''
+    Args:
+        `point_1`: shape(d)
+        `point_2`: shape(d)
+
+    Return:
+        The Euclidean distance between two points.
+    '''
+
+    return np.linalg.norm(point_1 - point_2)
+
+
+def multinormal_pdf(x: np.ndarray, mean: np.ndarray, cov: np.ndarray) -> float:
+    '''
+    The probability density function of a multivariate Gaussian distribution
+    with given parameters.
+
+    Args:
+        `x`: an observation, shape(d)
+        `mean`: the mean of the distribution, shape(d)
+        `cov`: the covariance matrix of the distribution, shape(d, d)
+
+    Return:
+        f(x | mean, cov)
+    '''
+
+    cov_det: float = np.linalg.det(cov)
+    dim: int = mean.shape[0]
+    const: float = (((2 * math.pi) ** dim) * cov_det) ** (-1/2)
+    x_m: np.ndarray = x - mean
+    exp: float = -np.dot(x_m, np.linalg.solve(cov, x_m)) / 2
+    return const * math.exp(exp)
+
+
+def normal_pdf(x: float, mean: float, scale: float) -> float:
+    '''
+    The probability density function of a Gaussian distribution with given
+    parameters.
+
+    Args:
+        `x`: an observation
+        `mean`: the mean of the distribution
+        `scale`: the standard deviation of the distribution
+
+    Return:
+        f(x | mean, scale)
+    '''
+
+    const = (2 * math.pi) ** (-1/2) / scale
+    exp = -((x - mean) / scale) ** 2 / 2
+    return const * math.exp(exp)
+
+
+def assert_(var_name: str, got: Any, expected: Any) -> None:
+    '''
+    Args:
+        `var_name`: variable name for logging
+        `got`: actual value
+        `expected`: expected value
+    '''
+
+    message = f'Assertion failed for {var_name}: expected {expected}, got {got}'
+    assert got == expected, message