diff --git a/assignment-1/submission/15307130115/README.md b/assignment-1/submission/15307130115/README.md deleted file mode 100644 index ce194396f5d9e37e6bcb08d789b47ea79a5af6b0..0000000000000000000000000000000000000000 --- a/assignment-1/submission/15307130115/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# 课程报告 - -这是一个课程报告的样例,我的代码在 [source.py](./source.py) 中。 - -我使用了 `sklearn` 中的 `KNeighborsClassifier`,所以我的代码无法通过限定依赖包的自动测试,但我仍可以获得 80% 分数中的大部分。 - -我以如下参数生成了数据集(由于 Gitee 网站的 bug,你需要用`\\\\`来替换 latex 公式中的 `\\`) - -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -73 & 0 \\\\ -0 & 22 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -1 & 2 -\end{array}\right] -\end{array} -$$ - -这是我生成的训练集: - -![训练集](./img/train.png) - -这是我生成的测试集(查看源码了解如何控制图片的大小) - -测试集 - -我可以通过表格或者图片报告我的实验结果 - -Algo | Acc | ------| ---- | -KNN | 0.94 | - -## 代码使用方法 - -```bash -python source.py g # 生成数据集 -python source.py d # 展示数据集 -python source.py # 训练和测试 -``` diff --git a/assignment-1/submission/15307130115/img/test.png b/assignment-1/submission/15307130115/img/test.png deleted file mode 100644 index 91eb7f4d19ce5d49a18e16441f824b8190ea4b1a..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/15307130115/img/test.png and /dev/null differ diff --git a/assignment-1/submission/15307130115/img/train.png b/assignment-1/submission/15307130115/img/train.png deleted file mode 100644 index b33f36f126e616f1ade524bbd1c814355f2d518f..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/15307130115/img/train.png and /dev/null differ diff --git a/assignment-1/submission/15307130115/source.py b/assignment-1/submission/15307130115/source.py deleted file mode 100644 index 4717ab4ea92c73cd07d43101822c360cac8c2014..0000000000000000000000000000000000000000 --- a/assignment-1/submission/15307130115/source.py +++ /dev/null @@ -1,96 +0,0 @@ -import sys -import numpy as np -import matplotlib.pyplot as plt -from sklearn.neighbors import KNeighborsClassifier - -class 
KNN: - - def __init__(self): - pass - - def fit(self, train_data, train_label): - N = train_data.shape[0] - cut = N//5*4 - - train_data, dev_data = train_data[:cut,], train_data[cut:,] - train_label, dev_label = train_label[:cut,], train_label[cut:,] - - max_score = 0 - max_score_K = 0 - for k in range(2,6): - clf = KNeighborsClassifier(n_neighbors=k) - clf.fit(train_data, train_label) - score = clf.score(dev_data, dev_label) - if score > max_score: - max_score, max_score_K = score, k - - self.clf = KNeighborsClassifier(n_neighbors=max_score_K) - self.clf.fit( - np.concatenate([train_data,dev_data]), - np.concatenate([train_label, dev_label]) - ) - - def predict(self, test_data): - return self.clf.predict(test_data) - - -def generate(): - mean = (1, 2) - cov = np.array([[73, 0], [0, 22]]) - x = np.random.multivariate_normal(mean, cov, (800,)) - - mean = (16, -5) - cov = np.array([[21.2, 0], [0, 32.1]]) - y = np.random.multivariate_normal(mean, cov, (200,)) - - mean = (10, 22) - cov = np.array([[10,5],[5,10]]) - z = np.random.multivariate_normal(mean, cov, (1000,)) - - idx = np.arange(2000) - np.random.shuffle(idx) - data = np.concatenate([x,y,z]) - label = np.concatenate([ - np.zeros((800,),dtype=int), - np.ones((200,),dtype=int), - np.ones((1000,),dtype=int)*2 - ]) - data = data[idx] - label = label[idx] - - train_data, test_data = data[:1600,], data[1600:,] - train_label, test_label = label[:1600,], label[1600:,] - np.save("data.npy",( - (train_data, train_label), (test_data, test_label) - )) - - -def read(): - (train_data, train_label), (test_data, test_label) = np.load("data.npy",allow_pickle=True) - return (train_data, train_label), (test_data, test_label) - -def display(data, label, name): - datas =[[],[],[]] - for i in range(len(data)): - datas[label[i]].append(data[i]) - - for each in datas: - each = np.array(each) - plt.scatter(each[:, 0], each[:, 1]) - plt.savefig(f'img/{name}') - plt.show() - -if __name__ == "__main__": - if len(sys.argv) > 1 and 
sys.argv[1] == "g": - generate() - if len(sys.argv) > 1 and sys.argv[1] == "d": - (train_data, train_label), (test_data, test_label) = read() - display(train_data, train_label, 'train') - display(test_data, test_label, 'test') - else: - (train_data, train_label), (test_data, test_label) = read() - - model = KNN() - model.fit(train_data, train_label) - res = model.predict(test_data) - print("acc =",np.mean(np.equal(res, test_label))) \ No newline at end of file diff --git a/assignment-1/submission/16300110008/README.md b/assignment-1/submission/16300110008/README.md deleted file mode 100644 index f1e2cc5375b8670facf59ce6320bddcceb526798..0000000000000000000000000000000000000000 --- a/assignment-1/submission/16300110008/README.md +++ /dev/null @@ -1,343 +0,0 @@ -# 课程报告 - -这是一个有关KNN模型的实验报告,我的代码保存在source.py中。本次实验使用了**numpy**库作为数据分析的工具,基于**matplotlib**库进行数据展示。 - -## 一、数据集的生成与划分 - -笔者采用以下参数产生四个不同的二维高斯分布数据集,每类使用不同的标签标记,这四组数据集分别是: - -+ 第一类数据,数量为400个,标签为0: - -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -35 & 0 \\\\ -0 & 23 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -13 & 4 -\end{array}\right] -\end{array} -$$ - -+ 第二类数据,数量为800个,标签为1: - -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -12 & 1 \\\\ -1 & 35 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -1 & 14 -\end{array}\right] -\end{array} -$$ - -+ 第三类数据,数量为1200个,标签为2: - -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -33 & 5 \\\\ -5 & 9 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} --8 & -3 -\end{array}\right] -\end{array} -$$ - -+ 第四类数据,数量为1600个,标签为3: - -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -25 & 6 \\\\ -6 & 18 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} --15 & 10 -\end{array}\right] -\end{array} -$$ - -将这四类数据集混合后打乱次序,将其中的80%(3200个)划分为训练集,20%(800个)划分为测试集,可以形成以下图像 - -这是我生成的训练集: - -训练集 - -这是我生成的测试集: - -测试集 - -## 二、KNN模型 - -笔者的KNN模型包含5个属性和2个方法。 - 
-三个属性分别为*k、data、label、mean、std*,其中*k*用来记录模型选择的近邻个数,*data*用来保存训练集的输入,*label*保存训练集的标签,*mean*记录训练集的均值,*std*记录训练集的标准差。 - -两个方法分别是`model.fit()`与`model.predict()`。`model.fit()`方法主要包含以下步骤: - -1. 进行标准化处理: - $$ - \bar{X}=\frac{X-\mu}{\sigma} - $$ - 其中$ \mu $是训练集的均值,$\sigma$是训练集的标准差。 - -2. 采用k折交叉验证方式,将训练集分割为k个子集(这里k取5),将其中一个作为验证集,其余的作为训练集。 - -3. 对KNN的参数k,从1开始逐步增大,寻找到在验证集上准确率最高的k,作为测试时使用的参数。其中k增大的步长为2,上限为`min(30, int(n * .8))`即取30与训练集数量乘以0.8以后较小的那一个值。 - -`model.predict()`方法用来预测新数据的类别,主要是先对数据进行标准化处理,再逐一进行预测,距离上采用欧氏距离作为度量,计算样本$X_{new}$与训练集中每一个样本之间的距离,从中选出最小的k个样本,采用投票的方式决定$X_{new}$的类别。 - -## 三、实验结果分析 - -笔者进行了十次实验,采用了不同的随机数种子产生不同的二维正态分布集合,这十次的实验结果如下: - -| 序号 | K | Acc | -| :------- | :------- | ---------- | -| 1 | 5 | 93.63% | -| 2 | 29 | 95.00% | -| 3 | 25 | 94.13% | -| 4 | 19 | 96.38% | -| 5 | 23 | 94.63% | -| 6 | 9 | 94.75% | -| 7 | 9 | 93.50% | -| 8 | 21 | 93.13% | -| 9 | 9 | 94.75% | -| 10 | 19 | 92.63% | -| **均值** | **16.8** | **94.23%** | - -可以看出,k取值存在选取大值的倾向,10次实验中仅有一次k值小于等于5,共计4次小于10,其余实验k取值都在19以上;准确率能够达到92%以上,比较高,说明KNN分类器模型的确能够对数据集进行正确分类,而分类错误的样本主要集中于不同*label*数据集间的交叉地带。 - -## 四、修改数据集进行实验探究 - -下面,笔者将通过修改生成数据集之间的距离、方差、数量以及计算样本距离的方式和归一化与否来对KNN模型进行探究。 - -### 1、数据集间的距离 - -在保持前文生成数据集方差不变的情况下,笔者修改了$\mu$的取值,从而可以控制分布之间的远近关系。 - -1. #### 缩小分布之间的相对距离 - - 笔者将原定的$\mu$参数缩小为原来的$1/10$,分布之间的距离更近,如下图所示: - - 训练集: - - 训练集_更近 - - 训练集: - - 训练集_更近 - - 距离拉近后,不同分布已经重合到了一起,同样进行十次训练,其结果如下 - - | 序号 | K | Acc | - | :------- | :------- | ---------- | - | 1 | 27 | 43.50% | - | 2 | 27 | 42.25% | - | 3 | 21 | 41.86% | - | 4 | 29 | 43.25% | - | 5 | 17 | 43.63% | - | 6 | 29 | 42.75% | - | 7 | 27 | 41.38% | - | 8 | 29 | 40.00% | - | 9 | 23 | 44.13% | - | 10 | 27 | 41.88% | - | **均值** | **25.6** | **42.46%** | - - 可见,这十次训练所取的K值更大,仅有一次小于20;准确率最高不超过45%,说明分布之间的距离较近会降低分类器的准确率。 - - - -2. 
#### 扩大分布之间的相对距离 - - 笔者将原定的$\mu$参数扩大为原来的2倍,分布之间的距离更远,如下图所示: - - 训练集: - - 训练集_更远 - - 测试集: - - 测试集_更远 - - 距离放大后,不同分布之间没有重合,实验结果显示分类正确率为100%,k值均取1 - - -由此可知,在方差不变的情况下,分布之间距离越近KNN分类器的准确率越低,距离越远准确率越高。这与KNN分类器的分类原理是分不开的,当分布之间距离较大时,待检测样本的周围只有同类样本,所以分类正确率较高,反之则会受到其他分布样本的干扰,从而降低准确率。 - -### 2、分布的方差 -在保持原分布$\mu$不变的情况下,笔者修改了分布的方差进行实验。 -1. #### 缩小方差 - 将方差缩小为原来的1/2,同一标签下内的样本分布得更加紧密,如下图所示: - 训练集: - - 训练集_更聚集 - - 测试集: - - 测试集_更聚集 - - 同样进行十次实验,结果如下: - | 序号 | K | Acc | - | :------- | :------- | ---------- | - | 1 | 13 | 99.63% | - | 2 | 9 | 99.50% | - | 3 | 3 | 99.50% | - | 4 | 9 | 100.00% | - | 5 | 5 | 99.38% | - | 6 | 19 | 99.00% | - | 7 | 7 | 99.38% | - | 8 | 3 | 98.75% | - | 9 | 17 | 99.75% | - | 10 | 9 | 98.63% | - | **均值** | **9.4** | **99.35%** | - - 这十次实验的k取值更小,仅有三次超过10,准确率更高,表明方差减小有可能提升KNN分类器的准确率。 -2. #### 增大方差 - - 笔者将原定的$\mu$参数扩大为原来的2倍,分布之间的距离更远,如下图所示: - 训练集: - - 训练集_更分散 - - 测试集: - - 测试集_更分散 - - 同样进行十次实验,结果如下: - | 序号 | K | Acc | - | :------- | :------- | ---------- | - | 1 | 17 | 84.50% | - | 2 | 19 | 82.50% | - | 3 | 21 | 83.50% | - | 4 | 19 | 84.63% | - | 5 | 15 | 85.88% | - | 6 | 7 | 81.13% | - | 7 | 15 | 83.38% | - | 8 | 13 | 82.63% | - | 9 | 23 | 84.75% | - | 10 | 29 | 83.50% | - | **均值** | **17.8** | **83.64%** | - - 这十次实验的k取值变大,准确率降低,表明方差增大有可能降低KNN分类器的准确率。 - - -总而言之,数据集分布的离散程度会影响分类器的准确率,同一分布的数据越集中,分类准确率越高,反之越低。数据越分散,相当于不同的数据之间重合的可能性越高,因此更难进行准确地分类。 -### 3、数据的数量 -将每个标签下数据的数量降低为原来的1/10、扩大为原来的2倍,分别进行10次实验,实验结果如下: - -| 序号 | k(数量减少) | Acc(数量减少) | k(数量增大) | Acc(数量增大) | -| -------- | ------------- | --------------- | ------------- | --------------- | -| 1 | 15 | 95.00% | 5 | 94.69% | -| 2 | 1 | 95.00% | 25 | 94.50% | -| 3 | 5 | 92.50% | 13 | 93.88% | -| 4 | 3 | 93.75% | 21 | 93.56% | -| 5 | 27 | 98.75% | 15 | 94.88% | -| 6 | 19 | 93.75% | 23 | 94.06% | -| 7 | 3 | 96.25% | 29 | 94.50% | -| 8 | 5 | 93.75% | 13 | 94.63% | -| 9 | 3 | 90.00% | 17 | 93.63% | -| 10 | 11 | 98.75% | 15 | 95.19% | -| **均值** | **9.2** | **94.75%** | **17.6** | **94.35%** | - 
-可见,在一定范围内,数量的改变并不能影响模型的准确率,但是通过实验发现当样本数量增大时,KNN模型的速度会变慢。 - -### 4、样本距离计算方式 - -下面,笔者分别采用以下距离进行实验: - -曼哈顿距离(L1范数): - -$$ -L_ 1(x_ i,x_ j) = \sum^{n}_ {l=1} |x^{(l)}_ {i} - x^{(l)}_ {j}| -\tag{1} -$$ - -欧氏距离(L2范数): - -$$ -L_2(x_i,x_j) = -(\sum^{n}_ {l=1} |x^{(l)}_ i-x^{(l)}_ {j}|^2 )^{\frac{1}{2}} -\tag{2}$$ - -切比雪夫距离(L∞范数): -$$ -L_{\infty}(x_i,x_j) = max_l|x^{(l)}_i-x^{(l)}_j| -\tag{3} -$$ - -此外,笔者还引入了高斯核函数$K(x_i,x_j)$,将原始样本$x_i$映射入新的特征空间变为$\phi(x_i)$,在新的空间内进行距离计算。实际上,通过核函数,可以不用直接转换样本的坐标而隐式地计算样本在新特征空间内的距离,因为在新的特征空间中,两个样本点之间的距离$D(x_i,x_j)$可以通过核函数隐式地获得: -$$ -\begin{align} -D^2(x_i,x_j)&=||\phi(x_i)-\phi(x_j)||^2\\\\ -&=K(x_i, x_i) - 2K(x_i,x_j) + K(x_j,x_j) -\end{align} -$$ -这三种距离的控制主要通过`dis_function()`函数实现: - -```python -def dis_function(x, y, mode='L2'): - d = y.shape[1] - x = x.reshape(1, -1) - y = y.reshape(-1, d) - if mode == 'L2': - return np.linalg.norm(x - y, axis=1) - elif mode == 'L1': - return np.linalg.norm(x - y, axis=1, ord=1) - elif mode == 'L_inf': - return np.linalg.norm(x - y, axis=1, ord=np.inf) - elif mode == 'Gaussian': - gamma = 1e-3 - var = lambda x_i, x_j: np.exp(- gamma * np.square(np.linalg.norm(x_i - x_j, axis=1))) - return np.sqrt(var(x, x) - 2 * var(x, y) + var(y, y)) -``` - -为了使采用不同距离的分类器具有区别性,笔者将原数据集的参数$\Sigma$扩大为原来的两倍,实验结果如下: - -| 序号 | k_L1 | Acc_L1 | k_L2 | Acc_L2 | k_L∞ | Acc_L∞ | k_Gaussian | Acc_Gaussian | -| -------- | -------- | ---------- | -------- | ---------- | -------- | ---------- | ---------- | ------------ | -| 1 | 23 | 84.25% | 17 | 84.50% | 15 | 83.88% | 17 | 84.50% | -| 2 | 19 | 82.63% | 19 | 82.50% | 15 | 82.38% | 19 | 82.50% | -| 3 | 21 | 83.23% | 21 | 83.50% | 15 | 83.63% | 21 | 83.50% | -| 4 | 19 | 85.25% | 19 | 84.63% | 21 | 85.00% | 19 | 84.63% | -| 5 | 13 | 85.50% | 15 | 85.88% | 23 | 85.88% | 15 | 85.88% | -| 6 | 21 | 82.75% | 7 | 81.13% | 13 | 83.13% | 7 | 81.13% | -| 7 | 13 | 83.38% | 15 | 83.38% | 19 | 84.13% | 15 | 83.38% | -| 8 | 11 | 82.87% | 13 | 82.63% | 17 | 82.26% | 13 | 82.63% | -| 9 | 21 | 84.63% | 
23 | 84.75% | 9 | 83.63% | 23 | 84.75% | -| 10 | 27 | 84.00% | 29 | 83.50% | 23 | 83.38% | 29 | 83.50% | -| **均值** | **18.8** | **83.89%** | **17.8** | **83.64%** | **17.0** | **83.72%** | **17.8** | **83.64%** | - -可以看出,在此数据分布下,不同的距离计算方式并没有产生较大的影响。此外,采用高斯核函数映射原数据所得到的结果并无不同,可见此情景下,高斯核函数也没有产生促进分类的作用。 - -### 5、归一化 - -下面是分别是在原数据集上进行的实验结果,区别是是否执行了标准化: - -| 序号 | k_without_normalization | Acc_without_normalization | k_with_normalization | Acc_with_normalization | -| -------- | ----------------------- | ------------------------- | -------------------- | ---------------------- | -| 1 | 5 | 93.50% | 5 | 93.63% | -| 2 | 25 | 94.63% | 29 | 95.00% | -| 3 | 27 | 94.50% | 25 | 94.13% | -| 4 | 9 | 95.50% | 19 | 96.38% | -| 5 | 25 | 94.50% | 23 | 94.63% | -| 6 | 9 | 94.88% | 9 | 94.75% | -| 7 | 9 | 93.13% | 9 | 93.50% | -| 8 | 19 | 93.25% | 21 | 93.13% | -| 9 | 5 | 95.50% | 9 | 94.75% | -| 10 | 17 | 93.25% | 19 | 92.63% | -| **均值** | **18.8** | **94.26%** | **16.8** | **94.23%** | - -从现有的实验结果来看,标准化并不能明显地提升KNN分类器的准确率,其是否有效还需要进一步的探究。 - -## 五、代码使用方式 - -```bash -python source.py g # 生成数据集 -python source.py d # 展示数据集 -python source.py # 训练和测试 -``` \ No newline at end of file diff --git a/assignment-1/submission/16300110008/img/test.png b/assignment-1/submission/16300110008/img/test.png deleted file mode 100644 index d134664a436ac258eae767ab0245d0a6af0753fe..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/test.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/img/test_close.png b/assignment-1/submission/16300110008/img/test_close.png deleted file mode 100644 index 5dc3292a77d8176c8446cfab2fc11726dd1a8ee7..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/test_close.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/img/test_divide.png b/assignment-1/submission/16300110008/img/test_divide.png deleted file mode 100644 index 
d019e98a5c41530e60920e67a903277f088d6ee6..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/test_divide.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/img/test_far.png b/assignment-1/submission/16300110008/img/test_far.png deleted file mode 100644 index 1fed8f774e5b48753f86fa7336be29ad407fa70c..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/test_far.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/img/test_gather.png b/assignment-1/submission/16300110008/img/test_gather.png deleted file mode 100644 index 34733b975ead375a61abfbb15b59e16f1ea182b4..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/test_gather.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/img/train.png b/assignment-1/submission/16300110008/img/train.png deleted file mode 100644 index f083fb235fba9f5e34175a9708fe5ac6979b423d..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/train.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/img/train_close.png b/assignment-1/submission/16300110008/img/train_close.png deleted file mode 100644 index efce494fc03a80b7d587ea1dae9f7c8dfb20f56a..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/train_close.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/img/train_divide.png b/assignment-1/submission/16300110008/img/train_divide.png deleted file mode 100644 index bea982d45d81519fbaa9b1dbb10c89fb3949d224..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/train_divide.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/img/train_far.png b/assignment-1/submission/16300110008/img/train_far.png deleted file mode 100644 index 
80e7c6dd3a3d62ae39533f32ea0b29b52833e96a..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/train_far.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/img/train_gather.png b/assignment-1/submission/16300110008/img/train_gather.png deleted file mode 100644 index 6860b6113ea0c0ee832491ea37e9d5b136e7869a..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16300110008/img/train_gather.png and /dev/null differ diff --git a/assignment-1/submission/16300110008/source.py b/assignment-1/submission/16300110008/source.py deleted file mode 100644 index 712d25173c94b0c8107c7808267ed8482c464af1..0000000000000000000000000000000000000000 --- a/assignment-1/submission/16300110008/source.py +++ /dev/null @@ -1,244 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -import random -import sys - -NUM_OF_LABELS = 4 -DIMENSION = 2 -# 用来控制数据集的样本个数 -NUM_BASIS = 200 - - - -def set_random_seed(seed): - np.random.seed(seed) - random.seed(seed) - - -def dis_function(x, y, mode='L2'): - d = y.shape[1] - x = x.reshape(1, -1) - y = y.reshape(-1, d) - if mode == 'L2': - return np.linalg.norm(x - y, axis=1) - elif mode == 'L1': - return np.linalg.norm(x - y, axis=1, ord=1) - elif mode == 'L_inf': - return np.linalg.norm(x - y, axis=1, ord=np.inf) - elif mode == 'Gaussian': - gamma = 1e-3 - var = lambda x_i, x_j: np.exp(- gamma * np.square(np.linalg.norm(x_i - x_j, axis=1))) - return np.sqrt(var(x, x) - 2 * var(x, y) + var(y, y)) - - - -class KNN: - - def __init__(self): - self.k = 1 - self.data = None - self.label = None - self.mean = 0 - self.std = 0 - - def fit(self, train_data, train_label): - n, d = train_data.shape - acc_best = 0 - k_best = self.k - k_max = min(30, int(n * 0.8)) - - # 归一化 - self.mean = np.mean(train_data, axis=0) - self.std = np.std(train_data, axis=0) - train_data = (train_data - self.mean) / self.std - - idx = np.arange(n) - np.random.shuffle(idx) - - train_data = 
train_data[idx] - train_label = train_label[idx] - - acc_best = 0 - k_best = 0 - - for j in range(5): - val_start_point = int(n * j * .2) - val_end_point = int(n * (j + 1) * .2) - - temp_val_data = train_data[ - val_start_point: val_end_point, : - ] - temp_val_label = train_label[ - val_start_point: val_end_point - ] - - temp_train_data = np.vstack( - [ - train_data[:val_start_point, :], - train_data[val_end_point:, :] - ] - ) - - temp_train_label = np.concatenate( - [ - train_label[:val_start_point], - train_label[val_end_point:] - ] - ) - - # temp_train_data, temp_val_data = train_data[:int( - # n * 0.8), :], train_data[int(n * 0.8):, :] - # temp_train_label, temp_val_label = train_label[:int( - # n * 0.8)], train_label[int(n * 0.8):] - self.data = temp_train_data - self.label = temp_train_label - - for i in range(1, k_max, 2): - self.k = i - res = self.predict(temp_val_data, normalization=False) - acc = np.mean(np.equal(res, temp_val_label)) - # print(i, acc) - if acc > acc_best: - acc_best = acc - k_best = i - # print(acc_best, k_best) - self.k = k_best - # print(f'j={j}') - self.data = train_data - self.label = train_label - - def predict(self, test_data, normalization=True, mode='Gaussian'): - n, d = test_data.shape - if normalization: - test_data = (test_data - self.mean) / self.std - res = [] - for i in range(n): - temp = test_data[i, :] - # dis = np.linalg.norm(temp - self.data, axis=1) # 使用欧式距离 - # 使用核函数 - dis = dis_function(temp, self.data, mode=mode) - idx = np.argpartition(dis, self.k)[:self.k] - # print(self.data[idx], self.label[idx], np.argmax(np.bincount(self.label[idx]))) - res.append(np.argmax(np.bincount(self.label[idx]))) - return np.array(res) - - -def generate_cov(dimension): - # 用于生成随机的数据集 - A = np.abs(np.random.randn(dimension, dimension)) - B = np.dot(A, A.T) - return B - - -def generate(): - global NUM_OF_LABELS - global DIMENSION - # 通过控制系数调整mean和cov - mean = [(13 * 1, 4 * 1), (1 * 1, 14 * 1), (-8 * 1, -3 * 1), (-15 * 1, 10 * 1)] - # 
mean = [tuple(np.random.randn(DIMENSION,) * mean_factor) - # for i in range(NUM_OF_LABELS)] - - cov = [ - np.array([[35, 0], [0, 23]]) * 1, - np.array([[12, 1], [1, 35]]) * 1, - np.array([[33, 5], [5, 9]]) * 1, - np.array([[25, 6], [6, 18]]) * 1, - ] - # cov = [ - # generate_cov(DIMENSION) * cov_factor for i in range(NUM_OF_LABELS) - # ] - dataset = [None] * NUM_OF_LABELS - data_num = [] - num_of_examples = 0 - for i in range(NUM_OF_LABELS): - data_num.append((i + 1) * 2 * NUM_BASIS) - num_of_examples += (i + 1) * 2 * NUM_BASIS - dataset[i] = np.random.multivariate_normal( - mean[i], cov[i], ((i + 1) * 2 * NUM_BASIS, )) - # print(data_num) - idx = np.arange(num_of_examples) - np.random.shuffle(idx) - data = np.concatenate([item for item in dataset]) - label = np.concatenate( - # [ - # np.zeros((data_num[0],), dtype=int), - # np.ones((data_num[1],), dtype=int), - # np.ones((data_num[2],), dtype=int) * 2, - # np.ones((data_num[3],), dtype=int) * 3 - # ] - [np.ones((data_num[i],), dtype=int) * i for i in range(NUM_OF_LABELS)] - ) - - data = data[idx] - label = label[idx] - - train_data, test_data = data[:int( - num_of_examples * 0.8), :], data[int(num_of_examples * 0.8):, :] - train_label, test_label = label[:int( - num_of_examples * 0.8)], label[int(num_of_examples * 0.8):] - - np.save( - 'data.npy', - ( - (train_data, train_label), - (test_data, test_label) - ) - ) - - -def read(): - (train_data, train_label), (test_data, test_label) = np.load( - 'data.npy', allow_pickle=True) - return (train_data, train_label), (test_data, test_label) - - -def display(data, label, name): - global NUM_OF_LABELS - datas = [[] for i in range(NUM_OF_LABELS)] - for i in range(len(data)): - datas[label[i]].append(data[i]) - - for each in datas: - each = np.array(each) - plt.scatter(each[:, 0], each[:, 1]) - plt.savefig(f'img/{name}') - plt.show() - -def experiment(): - generate() - data = read() - train_data, train_label = data[0][0], data[0][1] - test_data, test_label = data[1][0], 
data[1][1] - # display(train_data, train_label, 'train_divide') - # display(test_data, test_label, 'test_divide') - model = KNN() - model.fit(train_data, train_label) - res = model.predict(test_data, normalization=True) - acc = np.mean(np.equal(res, test_label)) - print(model.k) - print("acc =", acc) - return model.k, acc, compute_index(res, test_label) - -def compute_index(res, label): - res_bin = np.bincount(res) - label_bin = np.bincount(label) - return res_bin, label_bin - - -if __name__ == '__main__': - - if len(sys.argv) > 1 and sys.argv[1] == "g": - generate() - elif len(sys.argv) > 1 and sys.argv[1] == "d": - (train_data, train_label), (test_data, test_label) = read() - display(train_data, train_label, 'train') - display(test_data, test_label, 'test') - else: - (train_data, train_label), (test_data, test_label) = read() - - model = KNN() - model.fit(train_data, train_label) - res = model.predict(test_data) - print("acc =",np.mean(np.equal(res, test_label))) - - diff --git a/assignment-1/submission/16307100065/.keep b/assignment-1/submission/16307100065/.keep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/assignment-1/submission/16307100065/README.md b/assignment-1/submission/16307100065/README.md deleted file mode 100644 index 06a0098602e802319c84db26985360e4c9e99cfb..0000000000000000000000000000000000000000 --- a/assignment-1/submission/16307100065/README.md +++ /dev/null @@ -1,214 +0,0 @@ -# KNN实验探索 - -## 一.KNN算法代码实现 - -首先定义KNN类,KNN类中包括三个方法: - -```python -import numpy as np -from collections import Counter #用于计数 -def __init__(self,k): - #输入超参k,并赋给内部变量self._k;定义另外内部三个重要变量 - #self._train_data,self._train_label,self._test_data - self._k = k - self._train_data=None - self._train_label = None - self._test_data=None -``` - -```python -def fit(self,train_data,train_label): - #输入train_data,train_label并赋给内部变量 - self._train_data=train_data - self._train_label=train_label -``` - 
-```python -def predict(self,test_data): - #输入test_data并赋给内部变量 - self._test_data=test_data - predicts_ =[] - #遍历测试集 - for i in self._test_data: - #对测试集中的数据求与每一个训练集中数据的欧氏距离 - distances_ = [np.sum((i-x)**2)**0.5 for x in self._train_data] - distances = np.array(distances_) - #用Counter函数求距离前k个中0-1的个数 - sorted_distances = np.argsort(distances) - topK = [self._train_label[j] for j in sorted_distances[0:self._k]] - votes = Counter(topK) - #预测结果为距离前k个中0-1数量多的种类 - predict = votes.most_common(1)[0][0] - predicts_.append(predict) - predicts = np.array(predicts_) - return predicts -``` - - - -## 二.试验探究 - -### 1.二维随机正态分布的简单分类实验 - -(1)两维之间完全不相关 - -```python -import numpy as np -from source import KNN -import matplotlib.pyplot as plt -#每一维均值为0,方差为10,并且两维独立,创建1000个数据 -cov = [[10,0],[0,10]] -data = np.around(np.random.multivariate_normal((0,0),cov,1000),2) -#对应分类随机取0或1 -label = np.random.choice([0,1],size=1000,replace=True) -#按8:2的比例分为训练集和测试集 -n = len(data)//5 -train_data = data[0:4*n] -train_label = label[0:4*n] -test_data = data[4*n:] -test_label = label[4*n:] -#调用KNN类,k赋值5,将训练集输入模型 -model = KNN(5) -model.fit(train_data, train_label) - -#绘制分类图 -#第一维和第二维分别作为x,y轴 -x_show = train_data[:,0] -y_show = train_data[:,1] -x_min,x_max=x_show.min(),x_show.max() -y_min,y_max=y_show.min(),y_show.max() -#将坐标系分为200×200的网格 -xx,yy = np.meshgrid(np.linspace(x_min,x_max,200),np.linspace(y_min,y_max,200)) -#将网格放入模型预测,预测每一个网格的分类 -z1 = np.c_[xx.ravel(),yy.ravel()] -z = np.array(z1) -pred = model.predict(z) -pred = pred.reshape(xx.shape) -#绘制网格分类图和训练集的散点图 -plt.pcolormesh(xx,yy,pred,cmap=plt.cm.Pastel1) -plt.scatter(x_show,y_show,s=80,c=train_label,cmap=plt.cm.spring,edgecolors='k') -plt.xlim(xx.min(),xx.max()) -plt.ylim(yy.min(),yy.max()) -plt.show() -``` - -![输入图片说明](https://images.gitee.com/uploads/images/2021/0329/195821_9a6657ad_8843048.png "二维结果.png") - -结果如图所示。由于data是根据二元正态分布随机取值的,并且label也是在0-1之间随机选取的,所以data和label之间是完全无关的。所以分类图也是不规则的。 - -(2)二维之间完全正相关 - -```python -#使两个维度相关系数为1 -cov = 
[[10,10],[10,10]] -``` - -![输入图片说明](https://images.gitee.com/uploads/images/2021/0329/195844_3f64dcf7_8843048.png "二维相关.png") - -结果如图所示。由于两维相关系数为1,所以所有点都在y=x的直线上。由于label的随机选取,所以0-1分类区域是在这条直线上的随机分布。 - -(3)完全负相关 - -```python -#两个维度相关系数为-1 -cov = [[10,-10],[-10,10]] -``` - -![输入图片说明](https://images.gitee.com/uploads/images/2021/0329/195855_21448894_8843048.png "负二维.png") - -结果如图所示。 - -(4)一般情况下 - -```python -#使相关系数为0.2 -cov = [[10,2],[2,10]] -``` - -![输入图片说明](https://images.gitee.com/uploads/images/2021/0329/195908_c9a5e2fa_8843048.png "0.2.png") - -```python -#使相关系数为0.8 -cov = [[10,8],[8,10]] -``` - -![输入图片说明](https://images.gitee.com/uploads/images/2021/0329/195921_5cf96f45_8843048.png "0.8.png") - -分别将相关系数设为0.2与0.8,可以看出0.2时分布还相对散乱,而0.8时已经可以看出明显的线性关系。同样,由于label的随机选取,在直线上0-1的分布仍然是随机的。 - -### 2.二维随机正态分布的多分类实验 - -```python -#相关系数取0.8 -cov = [[10,8],[8,10]] -#五分类任务 -label = np.random.choice([0,1,2,3,4],size=1000,replace=True) -``` - -![输入图片说明](https://images.gitee.com/uploads/images/2021/0329/195937_f438d130_8843048.png "五分类0.8.png") - -```python -#相关系数取0.2 -cov = [[10,2],[2,10]] -#五分类任务 -label = np.random.choice([0,1,2,3,4],size=1000,replace=True) -``` - -![输入图片说明](https://images.gitee.com/uploads/images/2021/0329/195959_868f36e5_8843048.png "五0.2.png") - -结果如图所示。与简单分类结果类似,这是由于两维的相关系数是一样的,而仅仅是将label从0-1随机取值变为0-1-2-3-4随机取值,并没有本质变化,只是人为定义的分类数变多了。 - -### 3. 
实验发现 - -#### (1)简单分类与多分类 - -以相关系数为0.2的二维正态分布,k值取5为例。 - -运行10次0-1的简单二分类实验,结果分别为: - -acc = 0.51,0.545,0.535,0.515 ,0.435, 0.465,0.56,0.53, 0.515,0.53 - -平均acc=0.514 - -运行10次三分类实验,结果如下: - -acc = 0.295,0.32,0.35,0.34,0.31,0.405 ,0.325 ,0.3,0.32,0.32 - -平均acc = 0.329 - -运行10次四分类实验,结果如下: - -acc = 0.25,0.22,0.255,0.19,0.215,0.335,0.3,0.25,0.28,0.225 - -平均acc = 0.252 - -运行10此五分类实验,结果如下: - -acc = 0.265,0.185,0.22,0.235,0.195,0.19,0.22, 0.23,0.2,0.215 - -平均acc = 0.2155 - - - -可以发现,当分类数越多时,预测的准确率越低。这应该是由于data和label本身之间没有相关性,acc近似等于(1/分类数),和瞎猜的准确率是近似的。 - -#### (2)k值的选取 - -以简单二分类为例。 - -k值取5,运行10次的平均acc =0.512 - -k值取4,运行10次的平均acc =0.507 - -k值取6,运行10次的平均acc =0.485 - -k值取3,运行10次的平均acc =0.495 - -k值取7,运行10次的平均acc =0.508 - - - -可知,k取5是恰当的。但同样由于data和label之间没有相关性,所以不同k值之间准确率的差异不大,都在0.5左右,近似于瞎猜。因为分类区域本身就是随机的,距离预测点越近的点并不代表属于该点类别的概率越大。 - -所以,该次尝试的调参是没有意义的。如果选取本身有意义,并且属性与分类有关系的数据组进行测试,取不同的k值的准确率才会有显著差异,此时的调参才有意义。 - diff --git a/assignment-1/submission/16307100065/img/.keep b/assignment-1/submission/16307100065/img/.keep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/assignment-1/submission/16307100065/img/0.2.png b/assignment-1/submission/16307100065/img/0.2.png deleted file mode 100644 index 5cc12d92a5e696a6181dc47fef6caf9de5a4d7c9..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16307100065/img/0.2.png and /dev/null differ diff --git a/assignment-1/submission/16307100065/img/0.8.png b/assignment-1/submission/16307100065/img/0.8.png deleted file mode 100644 index abe0d2e4626cc82b2e44961f668c4d732aa057cf..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16307100065/img/0.8.png and /dev/null differ diff --git "a/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\233\270\345\205\263.png" "b/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\233\270\345\205\263.png" deleted file mode 100644 index 
4eb7d1805b16224826b6522b18680a6951f08085..0000000000000000000000000000000000000000 Binary files "a/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\233\270\345\205\263.png" and /dev/null differ diff --git "a/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\273\223\346\236\234.png" "b/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\273\223\346\236\234.png" deleted file mode 100644 index cfb88bc7cf41bc6bb02cd3467b2f475c43a405a2..0000000000000000000000000000000000000000 Binary files "a/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\273\223\346\236\234.png" and /dev/null differ diff --git "a/assignment-1/submission/16307100065/img/\344\272\2240.2.png" "b/assignment-1/submission/16307100065/img/\344\272\2240.2.png" deleted file mode 100644 index 5d151517b7b0da7e0ccf89ae43a44b848831e19c..0000000000000000000000000000000000000000 Binary files "a/assignment-1/submission/16307100065/img/\344\272\2240.2.png" and /dev/null differ diff --git "a/assignment-1/submission/16307100065/img/\344\272\224\345\210\206\347\261\2730.8.png" "b/assignment-1/submission/16307100065/img/\344\272\224\345\210\206\347\261\2730.8.png" deleted file mode 100644 index d5d72a9445ffdab5b80e24d02ee76a3789244d50..0000000000000000000000000000000000000000 Binary files "a/assignment-1/submission/16307100065/img/\344\272\224\345\210\206\347\261\2730.8.png" and /dev/null differ diff --git "a/assignment-1/submission/16307100065/img/\350\264\237\344\272\214\347\273\264.png" "b/assignment-1/submission/16307100065/img/\350\264\237\344\272\214\347\273\264.png" deleted file mode 100644 index 5f28d4a81b3deaa217a3d1a78602d6df915b3226..0000000000000000000000000000000000000000 Binary files "a/assignment-1/submission/16307100065/img/\350\264\237\344\272\214\347\273\264.png" and /dev/null differ diff --git a/assignment-1/submission/16307100065/source.py b/assignment-1/submission/16307100065/source.py deleted file mode 100644 index 
4ae97853ca5174d7f23998d209a5603a4a7b7c58..0000000000000000000000000000000000000000 --- a/assignment-1/submission/16307100065/source.py +++ /dev/null @@ -1,35 +0,0 @@ -#算法代码 -import numpy as np -from collections import Counter -class KNN: - - def __init__(self,k): - self._k = k - self._train_data=None - self._train_label = None - self._test_data=None - - def fit(self,train_data,train_label): - self._train_data=train_data - self._train_label=train_label - - - def predict(self,test_data): - self._test_data=test_data - predicts_ =[] - #遍历测试集 - for i in self._test_data: - #对测试集中的数据求距离每一个训练集中数据的欧氏距离 - distances_ = [np.sum((i-x)**2)**0.5 for x in self._train_data] - distances = np.array(distances_) - #用Counter函数求距离前k个 - sorted_distances = np.argsort(distances) - topK = [self._train_label[j] for j in sorted_distances[0:self._k]] - votes = Counter(topK) - #预测结果 - predict = votes.most_common(1)[0][0] - predicts_.append(predict) - predicts = np.array(predicts_) - return predicts - - diff --git a/assignment-1/submission/16307100065/train&test.py b/assignment-1/submission/16307100065/train&test.py deleted file mode 100644 index 31f28e4c3580e54f001d4ac69fb1c3f9ddf533a9..0000000000000000000000000000000000000000 --- a/assignment-1/submission/16307100065/train&test.py +++ /dev/null @@ -1,39 +0,0 @@ -#实验代码 -import numpy as np -from lknn import KNN -import matplotlib.pyplot as plt -#每一维均值为0,方差为10,并且两维独立,创建1000个数据 -cov = [[10,0],[0,10]] -data = np.around(np.random.multivariate_normal((0,0),cov,1000),2) -#对应分类随机取0或1 -label = np.random.choice([0,1],size=1000,replace=True) -#按8:2的比例分为训练集和测试集 -n = len(data)//5 -train_data = data[0:4*n] -train_label = label[0:4*n] -test_data = data[4*n:] -test_label = label[4*n:] -#调用KNN类,k赋值5,将训练集输入模型 -model = KNN(5) -model.fit(train_data, train_label) -#绘制分类图 -#第一维和第二维分别作为x,y轴 -x_show = train_data[:,0] -y_show = train_data[:,1] -x_min,x_max=x_show.min(),x_show.max() -y_min,y_max=y_show.min(),y_show.max() -xx,yy = 
np.meshgrid(np.linspace(x_min,x_max,200),np.linspace(y_min,y_max,200)) -#将网格放入模型预测,预测每一个网格的分类 -z1 = np.c_[xx.ravel(),yy.ravel()] -z = np.array(z1) -pred = model.predict(z) -pred = pred.reshape(xx.shape) -#绘制网格分类图和训练集的散点图 -plt.pcolormesh(xx,yy,pred,cmap=plt.cm.Pastel1) -plt.scatter(x_show,y_show,s=80,c=train_label,cmap=plt.cm.spring,edgecolors='k') -plt.xlim(xx.min(),xx.max()) -plt.ylim(yy.min(),yy.max()) -plt.show() -#计算acc -res = model.predict(test_data) -print("acc =",np.mean(np.equal(res, test_label))) \ No newline at end of file diff --git a/assignment-1/submission/16307130040/README.md b/assignment-1/submission/16307130040/README.md deleted file mode 100644 index 8c6b9b94639e4ef0462ed57212e65dc05b741d32..0000000000000000000000000000000000000000 --- a/assignment-1/submission/16307130040/README.md +++ /dev/null @@ -1,80 +0,0 @@ -# 实验报告1 - - - -### 1,KNN的实现 - -```python - def predict1(self,test_data,k): - predict=[] - for instance in test_data: - distances=np.array([self.distance (x,instance) for x in self.X]) - - kneighbors=np.argsort(np.array(distances))[:k] - count = Counter(self.y[kneighbors]) - predict.append(count.most_common()[0][0]) - return predict -``` - -将测试点最近的k个点列出,并找出其中出现最多的标签,作为预测的结果。 - -```python - def fit(self, train_data, train_label): - X_train,X_test,y_train,y_test=self.train_test_split(train_data, train_label) - self.X=np.array(X_train) - self.y=np.array(y_train) - max_accurate=0 - best_k=0 - for k in self.k_n: - accurate=0 - train_predict=self.predict1(X_test,k) - correct = np.count_nonzero((train_predict==y_test)==True) - - accurate=correct/len(X_test) - if (accurate>max_accurate): - max_accurate=accurate - best_k=k - self.k_select=best_k -``` - -k_n为[1,2,3,4,5],knn将输入的数据分为训练集和测试集,并从k_n中选择一个准确率最高的k值。 - -### 2,实验部分 - -```python -def generate(): - X1 = np.random.multivariate_normal([1,50], [[1,0],[0,10]], 100) - X2 = np.random.multivariate_normal([3,50], [[1,0],[0,10]], 100) - X3 = np.random.multivariate_normal([5,50], [[1,0],[0,10]], 100) - 
X = np.concatenate([X1,X2,X3]) - y = np.array([0]*100 + [1]*100 +[2]*100) - idx = np.arange(300) - np.random.shuffle(idx) - data=X=X[idx] - label=y=y[idx] - - X_train=X[:240] - X_test=X[240:] - y_train=y[:240] - y_test=y[240:] - return np.array(X_train),np.array(X_test),np.array(y_train),np.array(y_test) - -``` - -生成数据,将它们分为训练集和测试集。将训练集输入KNN,之后利用KNN预测测试集的标签。 - -这是训练集: - -![avatar](./img/train.png) - -这是测试集: - - - -​ ![avatar](./img/test.png) - -这是对测试集的预测: - -![avatar](./img/predict.png) - -这个预测的准确度为0.75. \ No newline at end of file diff --git a/assignment-1/submission/16307130040/img/predict.png b/assignment-1/submission/16307130040/img/predict.png deleted file mode 100644 index a853678b11d98ccc9a012b637a588a37723b42ff..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16307130040/img/predict.png and /dev/null differ diff --git a/assignment-1/submission/16307130040/img/test.png b/assignment-1/submission/16307130040/img/test.png deleted file mode 100644 index d9d8b3f379704464fcf2c4bff575425ef3404c0f..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16307130040/img/test.png and /dev/null differ diff --git a/assignment-1/submission/16307130040/img/train.png b/assignment-1/submission/16307130040/img/train.png deleted file mode 100644 index 2d61295b21b4bf115c35c594ca716f564a79a00a..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/16307130040/img/train.png and /dev/null differ diff --git a/assignment-1/submission/16307130040/source.py b/assignment-1/submission/16307130040/source.py deleted file mode 100644 index 397fbc05ad396f8bac4e0fee34426e6ee57d7f8f..0000000000000000000000000000000000000000 --- a/assignment-1/submission/16307130040/source.py +++ /dev/null @@ -1,94 +0,0 @@ -import numpy as np -from collections import Counter -import matplotlib.pyplot as plt - -class KNN: - k_n=[1,2,3,4,5] - k_select=0 - X=[] - y=[] - def __init__(self): - pass - - def train_test_split(self,X,y): 
- offset=int(len(X)*0.8) - X_train=X[:offset] - X_test=X[offset:] - y_train=y[:offset] - y_test=y[offset:] - - return np.array(X_train),np.array(X_test),np.array(y_train),np.array(y_test) - - def distance(self,instance1,instance2): - dist = np.sqrt(sum((instance1 - instance2)**2)) - return dist - - def predict1(self,test_data,k): - predict=[] - for instance in test_data: - distances=np.array([self.distance (x,instance) for x in self.X]) - - kneighbors=np.argsort(np.array(distances))[:k] - count = Counter(self.y[kneighbors]) - predict.append(count.most_common()[0][0]) - return predict - - def fit(self, train_data, train_label): - X_train,X_test,y_train,y_test=self.train_test_split(train_data, train_label) - self.X=np.array(X_train) - self.y=np.array(y_train) - max_accurate=0 - best_k=0 - for k in self.k_n: - accurate=0 - train_predict=self.predict1(X_test,k) - correct = np.count_nonzero((train_predict==y_test)==True) - - accurate=correct/len(X_test) - if (accurate>max_accurate): - max_accurate=accurate - best_k=k - self.k_select=best_k - - def predict(self, test_data): - return self.predict1(test_data,self.k_select) - -def generate(): - X1 = np.random.multivariate_normal([1,50], [[1,0],[0,10]], 100) - X2 = np.random.multivariate_normal([3,50], [[1,0],[0,10]], 100) - X3 = np.random.multivariate_normal([5,50], [[1,0],[0,10]], 100) - X = np.concatenate([X1,X2,X3]) - y = np.array([0]*100 + [1]*100 +[2]*100) - idx = np.arange(300) - np.random.shuffle(idx) - - data=X=X[idx] - label=y=y[idx] - - X_train=X[:240] - X_test=X[240:] - y_train=y[:240] - y_test=y[240:] - return np.array(X_train),np.array(X_test),np.array(y_train),np.array(y_test) - -def display(data, label, name): - datas =[[],[],[]] - colors=['b','r','y'] - for i in range(len(data)): - datas[label[i]].append(data[i]) - for i,each in enumerate(datas): - each = np.array(each) - plt.scatter(each[:, 0], each[:, 1],c=colors[i]) - plt.show() - -if __name__ == '__main__': - X_train,X_test,y_train,y_test=generate() - 
model=KNN() - model.fit(X_train,y_train) - predict=model.predict(X_test) - display(X_train,y_train,'train') - display(X_test,y_test,'test') - display(X_test,predict,'predict') - correct = np.count_nonzero((predict==y_test)) - accurate=correct/len(X_test) - print('accu=',accurate) \ No newline at end of file diff --git a/assignment-1/submission/17307110367/README.md b/assignment-1/submission/17307110367/README.md deleted file mode 100644 index f34b168681d1fdb43378914439a666a038baab1d..0000000000000000000000000000000000000000 --- a/assignment-1/submission/17307110367/README.md +++ /dev/null @@ -1,163 +0,0 @@ -# 课程报告 - 我的KNN模型**只用到了numpy包**,所以我的代码应该可以通过限定依赖包的自动测试。 - ## 一、数据集的生成和划分 - 首先我以如下参数生成了3个符合二维高斯分布的集合的数据集。 - - 第一类数据800个,标注为0: -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -73 & 0 \\\\ -0 & 22 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -1 & 2 -\end{array}\right] -\end{array} -$$ - - - 第二类数据200个,标注为1: -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -21.2 & 0 \\\\ -0 & 32.1 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -16 & -5 -\end{array}\right] -\end{array} -$$ - - 第三类数据1000个,标注为2: - -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -10 & 5 \\\\ -5 & 10 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -10 & 22 -\end{array}\right] -\end{array} -$$ - - 将这些图片和对应的标注混合并打乱次序,就能够得到我们数据集。从其中取出80%(1600对)作为我们的训练集。 - 这是我生成的训练集 - -![train_1](./img/train_1.png) - -取出其中另外的20%(400对)作为我们的测试集。 -这是我生成的测试集 -![test_1](./img/test_1.png) - - ## 二、KNN模型的建立 - 我的KNN模型主要分为两部分。第一部分的fit函数将训练集分为训练集和验证集,根据KNN模型在验证集上的最优结果自动地选择最优的K值。第二部分的predict函数用已选择出的K值代入KNN模型中,预测测试集的标签。 - ### 2.1 fit函数的编写 - fit函数主要包含以下几个步骤: - (1)将已有的训练集的次序打乱,分出其中的20%作为验证集,80%作为测试集。 - - (2)遍历待选的K值,在暂时确定K值的情况下在验证集上测试模型的结果。若训练集数量小于20,则待选K值的范围是range(1,训练集数量,2);若训练集数量大于20,则待选K值的范围是range(1,20,2)。 - - (3)找到验证集预测准确率最高的模型所对应的K值。将该值作为后续在predict函数中运用的K值。 - - ### 2.2 predict函数的编写 - predict函数主要包含以下几个步骤: - (1)遍历测试集中的每一个点。当取出测试集中的某一点时,计算该点与训练集中的每个点的距离。 - - (2)对计算好的距离进行从小到大排序,取出前K个点 - - 
(3)统计前K个值中各个类别的数量 - - (4)数量最多的类别便是预测结果。 - - - ## 三、实验结果与分析 - 在命令行运行 python source.py g即可生成数据集并查看准确率结果。由于每次随机生成的数据集略有差异,每次的K值和准确率也略有差异。重复实验10次,结果如下表: - - -实验次数 |K值 |准确率 ----|---|--- -1| 11| 0.96 -2 | 9|0.9675 -3| 15| 0.955 -4 | 11|0.9475 -5| 5| 0.94 -6 | 5|0.94 -7| 11| 0.96 -8 | 7|0.945 -9| 7| 0.95 -10 | 19|0.955 - -取这10次实验准确率的均值,得到模型的最终准确率为0.952。 -最终的模型准确率较高。准确率不为1的原因是测试集中蓝色与橙色的点有着一定的交集,对于处于交集中的数据我们也很难分清楚数据点到底属于哪一个类别。这部分的失误对于KNN来说似乎是无法避免的。 - - -## 三、修改数据集进行实验探究 -### 3.1 修改高斯分布的距离 -我们的预期是:在其它条件不变的情况下,高斯分布的距离越大,数据分的越开,KNN越容易预测准确。高斯分布的距离越小,数据离得越近,KNN的准确率越低。 -#### (1)设置参数使得三个类别分的更开。 - - 修改第二类数据的均值,使得它与另外两类数据分的更开: -$$ -\mu_2 = [\begin{matrix}30 & -20\end{matrix}] -$$ - - 此时测试集的数据分布如下图: - ![test_2](./img/test_2.png) - 多次实验得到模型的准确率为0.99。符合我们的预期。 - -#### (2)设置参数使得三个类别离得更近。 - - 修改每一类数据的均值,使得它们离得更近: - -$$ -\mu_1 = [\begin{matrix}1 & 2\end{matrix}] -$$ - -$$ -\mu_2 = [\begin{matrix}15 & 0\end{matrix}] -$$ -$$ -\mu_3 = [\begin{matrix}10 & 10\end{matrix}] -$$ - - 此时测试集的数据分布如下图: - - ![test_3](./img/test_3.png) - 此时模型的准确率均值只有0.84。符合我们的预期。 - - ### 3.2 修改高斯分布的方差 - 我们的预期是:在其它条件不变的情况下,高斯分布的方差越大,数据越容易混淆,因此KNN的结果越差。高斯分布的方差越小,数据越集中,KNN的结果越好。 -#### 设置参数使得第二和第三类数据的协方差更大。 - 修改第二,三类数据的方差如下: -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -73 & 0 \\\\ -0 & 22 -\end{array}\right] -\end{array} -$$ - - 此时测试集的数据分布如下图: - - ![test_4](./img/test_4.png) - - 此时模型的准确率均值只有0.74,符合我们的预期。显然KNN的结果在这种情况下并不理想。 - - ### 3.3 修改数据的数量 - #### (1)使各类数据翻倍。 - 对第一,第二,第三类的训练和测试数据翻倍。 - 此时测试集的数据分布如下图: - ![test_5](./img/test_5.png) - - 对模型运行多次求均值,得到模型的准确率约为0.956。准确率相比于原先提升了一点点。这点提升微乎其微,背后的原因可能是数据翻倍时同样也使得数据间的交叠翻倍,对于这部分的交叠数据,模型很难判别正确。因此准确率没有什么改变。 - - #### (2)使第一类数据翻倍 - 只对第一类的训练和测试数据翻倍。 - 此时测试集的数据分布如下图: - ![test_6](./img/test_6.png) - 对模型运行多次求均值,得到模型的准确率约为0.96。可见准确率提升了一些。这是由于第一类的数据翻倍,导致在数据交叠区域数据点更倾向于被判别为第一类数据,因此准确率必定会有一定的提升 \ No newline at end of file diff --git a/assignment-1/submission/17307110367/img/.keep b/assignment-1/submission/17307110367/img/.keep deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/assignment-1/submission/17307110367/img/test_1.png b/assignment-1/submission/17307110367/img/test_1.png deleted file mode 100644 index eec47aac6c3a4a91fbb79b4e46c19b4dcaf56ad7..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307110367/img/test_1.png and /dev/null differ diff --git a/assignment-1/submission/17307110367/img/test_2.png b/assignment-1/submission/17307110367/img/test_2.png deleted file mode 100644 index b7273b4820997fc7ccf6f1238e0f6d5dc53776f1..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307110367/img/test_2.png and /dev/null differ diff --git a/assignment-1/submission/17307110367/img/test_3.png b/assignment-1/submission/17307110367/img/test_3.png deleted file mode 100644 index 1389dfe352cc414a710dab394b0a5cbd5173c5e9..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307110367/img/test_3.png and /dev/null differ diff --git a/assignment-1/submission/17307110367/img/test_4.png b/assignment-1/submission/17307110367/img/test_4.png deleted file mode 100644 index c33f255f7f5c059f8b3fa50fd5f02ebdf81a8204..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307110367/img/test_4.png and /dev/null differ diff --git a/assignment-1/submission/17307110367/img/test_5.png b/assignment-1/submission/17307110367/img/test_5.png deleted file mode 100644 index f15a0935c5fc56cadf8f259b40b0b960831d256f..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307110367/img/test_5.png and /dev/null differ diff --git a/assignment-1/submission/17307110367/img/test_6.png b/assignment-1/submission/17307110367/img/test_6.png deleted file mode 100644 index 528bd1180529c99a903a7affc9c08cbb6f0af35b..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307110367/img/test_6.png and /dev/null differ diff 
--git a/assignment-1/submission/17307110367/img/train_1.png b/assignment-1/submission/17307110367/img/train_1.png deleted file mode 100644 index 580335fd8c4753ea64df8189940b63039eb02168..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307110367/img/train_1.png and /dev/null differ diff --git a/assignment-1/submission/17307110367/source.py b/assignment-1/submission/17307110367/source.py deleted file mode 100644 index bc70f0c0f1ea938f09f35eee810449c19758ef39..0000000000000000000000000000000000000000 --- a/assignment-1/submission/17307110367/source.py +++ /dev/null @@ -1,172 +0,0 @@ -import sys -import numpy as np -import matplotlib.pyplot as plt - -class KNN: - - def __init__(self): - self.train_data = None - self.train_label = None - self.k = None - - def fit(self, train_data, train_label): - self.train_data = train_data - self.train_label = train_label - # 将训练集打乱 - data_size = self.train_data.shape[0] - shuffled_indices = np.random.permutation(data_size) - shuffled_data = train_data[shuffled_indices] - shuffled_label = train_label[shuffled_indices] - # test_ratio为测试集所占的百分比,划分训练集和验证集 - test_ratio = 0.2 - test_set_size = int(data_size * test_ratio) - valid_data = shuffled_data[:test_set_size] - valid_label = shuffled_label[:test_set_size] - training_data = shuffled_data[test_set_size:] - training_label = shuffled_label[test_set_size:] - # 在验证集上对不同的K值进行测试 - record ={} - if training_data.shape[0] < 20: - k_number = training_data.shape[0] - else: - k_number = 20 - for k in range(1,k_number,2): - data_size = training_data.shape[0] - predict_result = np.array([]) - for i in range(valid_data.shape[0]): - diff = np.tile(valid_data[i], (data_size, 1)) - training_data - sqdiff = diff ** 2 - squareDist = np.sum(sqdiff, axis=1) - dist = squareDist ** 0.5 - # test_data到其它点的距离 - sorteddiffdist = np.argsort(dist) - # 对这些距离从小到大排序 - classCount = {} - for j in range(k): - Label = training_label[sorteddiffdist[j]] - classCount[Label] = 
classCount.get(Label, 0) + 1 - # 统计距离中前K个值中各个类别的数量 - maxCount = 0 - for key, value in classCount.items(): - if value > maxCount: - maxCount = value - result = key - predict_result = np.append(predict_result, result) - acc = np.mean(np.equal(predict_result, valid_label)) - record[k] = acc - # 取验证准确率最高的K值作为K值 - maxCount = 0 - for key, value in record.items(): - if value > maxCount: - maxCount = value - k_result = key - print("k=",k_result) - self.k = k_result - - def predict(self, test_data): - data_size = self.train_data.shape[0] - predict_result = np.array([]) - for i in range(test_data.shape[0]): - diff = np.tile(test_data[i],(data_size,1)) - self.train_data - sqdiff = diff **2 - squareDist = np.sum(sqdiff, axis =1) - dist = squareDist **0.5 - # test_data到其它点的距离 - sorteddiffdist = np.argsort(dist) - # 对这些距离从小到大排序 - classCount ={} - for j in range(self.k): - Label = self.train_label[sorteddiffdist[j]] - classCount[Label] = classCount.get(Label,0) + 1 - # 统计距离中前K个值中各个类别的数量 - maxCount = 0 - for key, value in classCount.items(): - if value > maxCount: - maxCount = value - result = key - predict_result = np.append(predict_result,result) - # 数量最多的就是预测的结果 - return predict_result - - -def generate(): - mean = (1, 2) - cov = np.array([[73, 0], [0, 22]]) - x = np.random.multivariate_normal(mean, cov, (800,)) - #x = np.random.multivariate_normal(mean, cov, (1600,)) - - mean = (16, -5) - #mean = (30, -20) - #mean = (15, 0) - cov = np.array([[21.2, 0], [0, 32.1]]) - #cov = np.array([[73, 0], [0, 22]]) - y = np.random.multivariate_normal(mean, cov, (200,)) - #y = np.random.multivariate_normal(mean, cov, (400,)) - - mean = (10, 22) - #mean = (10,10) - cov = np.array([[10, 5], [5, 10]]) - #cov = np.array([[73, 0], [0, 22]]) - z = np.random.multivariate_normal(mean, cov, (1000,)) - #z = np.random.multivariate_normal(mean, cov, (2000,)) - - idx = np.arange(2000) - #idx = np.arange(2800) - np.random.shuffle(idx) - data = np.concatenate([x, y, z]) - label = np.concatenate([ - 
np.zeros((800,), dtype=int), - np.ones((200,), dtype=int), - np.ones((1000,), dtype=int) * 2 - ]) - # label = np.concatenate([ - # np.zeros((1600,), dtype=int), - # np.ones((200,), dtype=int), - # np.ones((1000,), dtype=int) * 2 - # ]) - # data = data[idx] - # label = label[idx] - - train_data, test_data = data[:1600, ], data[1600:, ] - train_label, test_label = label[:1600, ], label[1600:, ] - # train_data, test_data = data[:2240, ], data[2240:, ] - # train_label, test_label = label[:2240, ], label[2240:, ] - np.save("data.npy", ( - (train_data, train_label), (test_data, test_label) - )) - - -def read(): - (train_data, train_label), (test_data, test_label) = np.load("data.npy", allow_pickle=True) - return (train_data, train_label), (test_data, test_label) - - -def display(data, label, name): - datas = [[], [], []] - for i in range(len(data)): - datas[label[i]].append(data[i]) - - for each in datas: - each = np.array(each) - plt.scatter(each[:, 0], each[:, 1]) - plt.savefig(f'./{name}') - plt.show() - - -if __name__ == "__main__": - if len(sys.argv) > 1 and sys.argv[1] == "g": - generate() - if len(sys.argv) > 1 and sys.argv[1] == "d": - (train_data, train_label), (test_data, test_label) = read() - display(train_data, train_label, 'train') - display(test_data, test_label, 'test') - else: - (train_data, train_label), (test_data, test_label) = read() - - model = KNN() - model.fit(train_data, train_label) - res = model.predict(test_data) - print("acc =", np.mean(np.equal(res, test_label))) - - - diff --git a/assignment-1/submission/17307130133/README.md b/assignment-1/submission/17307130133/README.md deleted file mode 100644 index b4ef5ec932b48a64641332fa51b023e214123724..0000000000000000000000000000000000000000 --- a/assignment-1/submission/17307130133/README.md +++ /dev/null @@ -1,129 +0,0 @@ -# 课程报告 - -## KNN实现 - -### 距离的计算 - -$$ -\begin{array}\\\\ -assume\ test\ matrix\ P: M\times D,\ train\ matrix\ C:N\times D. 
D\ is\ the\ dimension\ of\ data.\\\\ -let\ P_i\ is\ ith\ row\ in\ P,\ C_j\ is\ jth\ row\ in\ C.\\\\ -distance\ between\ P_i\ and\ C_j:\\\\ -d(P_i,C_j) = \sqrt{\sum_{k=1}^{D}(P_{ik}-C_{jk})^2}\\\\ -=\sqrt{\sum_{k=1}^DP_{ik}^2+\sum_{k=1}^DC_{jk}^2-2\sum_{k=1}^D(P_{ik}C_{jk})}\\\\ -=\sqrt{||P_i||^2+||C_j||^2-2P_iC_j^T}\\\\ -then\ we\ can\ calculate\ the\ whole\ distance\ matrix\ only\ with\ matrix\ operations. -\end{array} -$$ - -### 数据预处理:归一化 - -将数据归一化到[0, 1],在服从正态分布的数据集上测试时表现不佳(详见实验部分)。最终代码中有实现归一化(normalize中),但是并未应用。 - -### k的取值 - -fit函数中,k在[2, 6]中取最优值。fit函数先把train_data划分为train_data和dev_data,比例为1:4;然后计算出距离矩阵;最后k遍历[2, 6],找到在dev_data上测试所得到最高正确率的k值,应用于最后的预测。 - -## 实验 - -### 实验一 正则化 - -此实验目的是探究KNN中数据正则化的影响。 - -实验了多组数据,只有在测试数据参数为 - -$$ -\begin{array}{l} -&\Sigma_1&=&\left[\begin{array}{cc} -3 & 0 \\\\ -0 & 70 -\end{array}\right] \qquad -&\Sigma_2&=&\left[\begin{array}{cc} -4 & 0 \\\\ -0 & 65 -\end{array}\right] \qquad -&\Sigma_3&=&\left[\begin{array}{cc} -2 & 0 \\\\ -0 & 75 -\end{array}\right]\\\\ -&\mu_1&=&\left[\begin{array}{ll} -4 & -20 -\end{array}\right] -&\mu_2&=&\left[\begin{array}{ll} -5 & 0 -\end{array}\right] -&\mu_3&=&\left[\begin{array}{ll} -6 & 20 -\end{array}\right] -\end{array} -$$ - -时,使用正则化取得更好的结果。 - -训练集: - -train1 - -测试集: - -test1 - -| k | 2 | 3 | 4 | 5 | 6 | -| ---------------- | ------ | ------ | ------ | ------ | ------ | -| acc_dev 无归一化 | 87.81% | 91.88% | 91.25% | 91.25% | 91.25% | -| acc_dev 有归一化 | 87.81% | 91.88% | 91.25% | 91.25% | 91.25% | - -最佳k值都为3,无归一化时,在test上为准确率:88.25%;有归一化时,在test上准确率为89.25%。 - -在其他使用正态分布生成的数据中,都是不使用归一化准确率更高。在上例中,使用归一化正确率提升仅1%,而在其他数据上,不使用归一化正确率提高会高得多。所以在最终的代码中并未对数据进行归一化处理。在本系列实验中,归一化与否并不影响k的最佳取值。 - -实验结论:首先,归一化并不适合在正态分布上的KNN分类。其次,归一化不影响最佳k值。 - -### 实验二 改变分布之间的距离 - -使用两个正态分布探究不同高斯分布之间距离的影响。先保持高斯分布的协方差矩阵不变,改变均值之间的距离。 - -训练集: - -![train_2](./img/train_2.jpg) - -测试集: - -![train_2](./img/test_2.jpg) - -| 序号 | 1 | 2 | 3 | 4 | 5 | 6 | -| ------- | ------ | ------ | ------ | ------ | ------ | ------ | -| 准确率 | 97.75% | 98.25% | 
98.50% | 92.25% | 87.75% | 85.00% | -| 最佳k值 | 2 | 3 | 5 | 5 | 5 | 6 | - -可以看出,两个分布的数据点范围开始重叠时,准确率开始下降,重叠范围越大,准确率越低,k值也在相应增大。 - -接下来,保持两个分布均值距离不变,仅改变高斯分布的协方差矩阵。 - -训练集: - -train_2 - -测试集: - -train_2 - -| 序号 | 1 | 2 | 3 | 4 | -| ------- | ------ | ------ | ------ | ------ | -| 准确率 | 96.75% | 96.25% | 95.00% | 92.50% | -| 最佳k值 | 5 | 5 | 6 | 3 | - -类似地,准确率随着分布的重叠而降低。 - -## 代码使用方法 - -```bash -python source.py g # 生成数据集 -python source.py d # 展示数据集 -python source.py # 训练和测试 -``` - -# 参考 - -距离的计算:https://blog.csdn.net/IT_forlearn/article/details/100022244?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.control&dist_request_id=&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.control - diff --git a/assignment-1/submission/17307130133/img/test_1.png b/assignment-1/submission/17307130133/img/test_1.png deleted file mode 100644 index 1b3ef8c56c035e5122cdfacd83b9c436051d4b02..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307130133/img/test_1.png and /dev/null differ diff --git a/assignment-1/submission/17307130133/img/test_2.jpg b/assignment-1/submission/17307130133/img/test_2.jpg deleted file mode 100644 index 3ceff6d6091c5d283fcba0023e053e079e4720d0..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307130133/img/test_2.jpg and /dev/null differ diff --git a/assignment-1/submission/17307130133/img/test_3.jpg b/assignment-1/submission/17307130133/img/test_3.jpg deleted file mode 100644 index 784506a6a23f0fcfaac61f422d03f0de9a4aab95..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307130133/img/test_3.jpg and /dev/null differ diff --git a/assignment-1/submission/17307130133/img/train_1.png b/assignment-1/submission/17307130133/img/train_1.png deleted file mode 100644 index fab155cdf1d85d91e888e28a678ee2dc11d63d68..0000000000000000000000000000000000000000 Binary files 
a/assignment-1/submission/17307130133/img/train_1.png and /dev/null differ diff --git a/assignment-1/submission/17307130133/img/train_2.jpg b/assignment-1/submission/17307130133/img/train_2.jpg deleted file mode 100644 index b723ef988fb70cc8f5efe0dd7e502798135578e0..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307130133/img/train_2.jpg and /dev/null differ diff --git a/assignment-1/submission/17307130133/img/train_3.jpg b/assignment-1/submission/17307130133/img/train_3.jpg deleted file mode 100644 index 5ba83b8810ff506b4eae19ce531494986142458d..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/17307130133/img/train_3.jpg and /dev/null differ diff --git a/assignment-1/submission/17307130133/source.py b/assignment-1/submission/17307130133/source.py deleted file mode 100644 index dc9938a32ac6c410feff6fb95204e0861363809d..0000000000000000000000000000000000000000 --- a/assignment-1/submission/17307130133/source.py +++ /dev/null @@ -1,132 +0,0 @@ -import sys -import numpy as np -import matplotlib.pyplot as plt - -class KNN: - - def __init__(self): - self.train_data = None - self.train_label = None - self.k = 2 - - def fit(self, train_data, train_label): - # train_data = self.normalize(train_data) - self.train_data = train_data - self.train_label = train_label - - N = train_data.shape[0] - cut = N // 5 * 4 - train_data, dev_data = train_data[:cut,], train_data[cut:,] - train_label, dev_label = train_label[:cut,], train_label[cut:,] - - dists = self.compute_distances(train_data, dev_data) - - max_acc = 0 - max_acc_k = 2 - for k in range(2,7): - res = self.get_label(dists, train_label, k) - acc = np.mean(np.equal(res, dev_label)) - print("k = %d, acc = %f" % (k, acc)) - if acc > max_acc: - max_acc = acc - max_acc_k = k - print("best k = %d" % max_acc_k) - self.k = max_acc_k - - def predict(self, test_data): - # test_data = self.normalize(test_data) - dists = self.compute_distances(self.train_data, test_data) 
- return self.get_label(dists, self.train_label, self.k) - - def compute_distances(self, train_data, test_data): - num_train = train_data.shape[0] - num_test = test_data.shape[0] - dists = np.zeros((num_test, num_train)) - - train_square = np.sum(np.square(train_data), axis=1).reshape(1, num_train) - test_square = np.sum(np.square(test_data), axis=1).reshape(num_test, 1) - dists = np.sqrt(train_square + test_square - 2 * np.dot(test_data, train_data.T)) - - return dists - - def get_label(self, dists, train_label, k): - num_test = dists.shape[0] - y_predict = np.zeros(num_test, dtype=train_label.dtype) - for i in range(num_test): - closest_y = list(train_label[np.argsort(dists[i,:])[:k]]) - y_predict[i] = max(closest_y, key = closest_y.count) - return y_predict - - def normalize(self, data): - if len(data) == 0: - return data - return (data - np.min(data)) / (np.max(data) - np.min(data)) - - -def generate(): - # mean = (1, 2) - # cov = np.array([[73, 0], [0, 22]]) - mean = (-17, 2) - cov = np.array([[103, 0],[0, 22]]) - x = np.random.multivariate_normal(mean, cov, (1200,)) - - # mean = (16, -5) - # cov = np.array([[21.2, 0], [0, 32.1]]) - mean = (10, -5) - cov = np.array([[101.2, 0],[0, 32.1]]) - y = np.random.multivariate_normal(mean, cov, (800,)) - - # mean = (10, 22) - # cov = np.array([[10,5],[5,10]]) - # z = np.random.multivariate_normal(mean, cov, (1000,)) - - idx = np.arange(2000) - np.random.shuffle(idx) - # data = np.concatenate([x,y,z]) - data = np.concatenate([x,y]) - label = np.concatenate([ - # np.zeros((800,),dtype=int), - # np.ones((200,),dtype=int), - # np.ones((1000,),dtype=int)*2 - np.zeros((1200,),dtype=int), - np.ones((800,),dtype=int), - ]) - data = data[idx] - label = label[idx] - - train_data, test_data = data[:1600,], data[1600:,] - train_label, test_label = label[:1600,], label[1600:,] - np.save("data.npy",( - (train_data, train_label), (test_data, test_label) - )) - -def read(): - (train_data, train_label), (test_data, test_label) = 
np.load("data.npy", allow_pickle=True) - return (train_data, train_label), (test_data, test_label) - -def display(data, label, name): - # datas = [[], [], []] - datas = [[], []] - for i in range(len(data)): - datas[label[i]].append(data[i]) - - for each in datas: - each = np.array(each) - plt.scatter(each[:, 0], each[:, 1]) - plt.savefig(f'img/{name}') - plt.show() - -if __name__ == "__main__": - if len(sys.argv) > 1 and sys.argv[1] == "g": - generate() - elif len(sys.argv) > 1 and sys.argv[1] == "d": - (train_data, train_label), (test_data, test_label) = read() - display(train_data, train_label, 'train') - display(test_data, test_label, 'test') - else: - (train_data, train_label), (test_data, test_label) = read() - - model = KNN() - model.fit(train_data, train_label) - res = model.predict(test_data) - print("acc =",np.mean(np.equal(res, test_label))) diff --git a/assignment-1/submission/18307130090/README.md b/assignment-1/submission/18307130090/README.md deleted file mode 100644 index 7f124928fdebc2f38c8563755f3c6fb073f6102f..0000000000000000000000000000000000000000 --- a/assignment-1/submission/18307130090/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# PRML-2021 Assignment1 - -姓名:夏海淞 - -学号:18307130090 - -## 简述 - -在本次实验中,我实现了K近邻算法,在自己生成的数据集上进行测试,将其在不同数据集上的性能与作为对照的随机算法进行了比较。同时,尝试对性能瓶颈进行了一定优化,效果较好。 - -## 算法介绍 - -### 数据集生成 - -`source.py`中的`generate_data`函数用于生成数据集。该函数接收`mean` `cov` `nums`三个数组参数,每个数组的第`i`个元素分别表示第`i`类点的均值、协方差和数量。`generate_data`函数通过调用`numpy.random.multivariate_normal`方法生成数据。 - -`source.py`中的`show_plot`函数用于绘制数据集的散点图。该函数接收两个`ndarray`参数和一个字符串参数,分别表示样本点、样本标签和图片名称。`show_plot`函数通过调用`matplotlib`库完成绘制,将图存放至项目的`\img`路径中。 - -按照实验要求,将数据打乱后,取80%作为训练集,20%作为测试集。 - -训练集和测试集的散点图如下所示: - -
- train - test -
- -### 模型训练与预测 - -#### `fit` - -`fit`方法首先将训练数据进一步按照80%/20%的比例划分为训练集和验证集,随后枚举K近邻算法中的参数`k`,最大值设为20,进行训练。 - -对于每个`k`,枚举验证集中的点,在训练集中计算出距离该点最近的`k`个点,在`k`个点对应的标签中选取数量最多的作为验证点的预测标签。最后计算出验证集的平均预测准确率acc,选取acc最大的k作为模型的参数。 - -#### `predict` - -`predict`方法与`fit`方法类似:将`fit`方法中训练出的`k`作为参数,枚举测试集的点,计算出距离该点最近的`k`个点。选取数量最多的标签作为预测标签。因为该过程与`fit`方法中枚举`k`进行训练的过程完全一致,因此将其封装成函数进行复用,同时也方便后续对其进行性能优化。 - -## 实验结果与探究 - -### 数据集参数 - -$$ -\mu_x=[5,5] ,\Sigma_x=\begin{bmatrix}34&1 \\\\ 5&10\end{bmatrix},|x|=1600\\\\ -\mu_y=[10,15],\Sigma_y=\begin{bmatrix}20&4 \\\\ 3&24\end{bmatrix},|y|=400\\\\ -\mu_z=[20,5],\Sigma_z=\begin{bmatrix}30&2 \\\\ 1&10\end{bmatrix},|z|=2000 -$$ - -### 实验结果 - -重复10次实验,得到每次实验的模型参数`k`、训练准确率`train_acc`、测试准确率`test_acc`和对照准确率(按照权重随机选择)`control`: - -| k | train_acc | test_acc | control | -| ---- | --------- | -------- | ------- | -| 8 | 0.9047 | 0.8988 | 0.4075 | -| 4 | 0.8984 | 0.8775 | 0.4438 | -| 10 | 0.8875 | 0.8888 | 0.43 | -| 18 | 0.8813 | 0.8888 | 0.43 | -| 15 | 0.8922 | 0.8688 | 0.4075 | -| 14 | 0.9016 | 0.88 | 0.41 | -| 9 | 0.8891 | 0.8863 | 0.405 | -| 19 | 0.8969 | 0.8863 | 0.405 | -| 18 | 0.8875 | 0.8613 | 0.4038 | -| 14 | 0.8875 | 0.8925 | 0.445 | - -得到训练准确率、测试准确率和对照准确率的平均值分别为0.8927,0.8829和0.4188。 - -由此可知该模型的泛化能力较好,性能相较随机选择算法有明显提高。 - -### 实验探究 - -该部分主要探究不同数据集对K近邻算法性能的影响因素。 - -#### 数据量大小 - -固定上述数据集的均值和协方差,改变样本容量为原先的$k$倍$(k\in[0.1,2])$,作出$k$关于`acc`的图如下: - -acc_nums - -由图可知,数据集容量的大小对于模型性能没有明显的影响。 - -推测:出现这种现象的原因可能是K近邻模型过于简单,样本容量较小时也能得到较好的性能。 - -#### 数据集距离 - -固定上述数据集的协方差和容量,改变均值为原先的$k$倍$(k\in[0.1,2])$,作出$k$关于`acc`的图如下: - -acc_mean - -由图可知,数据集的均值越大(即不同数据集的距离越大),模型效果越好。 - -推测:出现这种现象的原因可能是不同数据集之间距离越远,测试点周围属于自己的类的训练点比例越高,因此性能越好。 - -#### 数据集方差 - -固定上述数据集的均值和容量,改变协方差为原先的$k$倍$(k\in[0.1,2])$,作出$k$关于`acc`的图如下: - -acc_cov - -由图可知,数据集的协方差越大,模型效果越好。 - -推测:出现这种现象的原因和数据集距离的原因类似。数据集协方差增大时,不同数据集越倾向于“混合”在一起,使得测试点周围属于自己的类的点比例不断减小,性能随之降低。 - -#### 总结 - -综上,数据集的容量对K近邻模型的影响较小,而不同数据集的距离和协方差分别对K近邻模型性能产生了正面和负面的影响。 - -## 性能优化 - -### 问题归纳 - 
-注意到K近邻算法的核心操作是:给定$n$个点$x_1,x_2,\cdots,x_n$和一个查询点$y$,要求返回距离$y$最近的$k$个点。常见的算法为计算出$x_i(i=1,2,\cdots,n)$与$y$的距离后按照距离进行排序,取出前$k$个点。这样做的时间复杂度为$O(n\log n)$。 - -### 优化1 - -考虑到$k$一般很小(具体实现中$k\le25$),因此尝试使用堆进行优化。算法步骤为: - -1. 计算出$x_1,\cdots,x_k$与$y$的距离,插入大根堆中; -2. 计算出$x_i(i=k+1,k+2,\cdots,n)$与$y$的距离,与堆顶元素比较。如果$dist(x_i,y)$比堆顶元素小,则弹出堆顶元素,将$dist(x_i,y)$插入堆中。 - -算法结束时堆中的元素即为$x_i$与$y$的距离中最近的$k$个。时间复杂度$O(n\log k)$。由于$k\le25$,因此算法近似为$O(n)$。 - -然而在具体实现中发现优化后速度反而慢于优化前。可能原因是暴力排序调用的是`numpy.argsort`,其底层调用了C的库,常数较小;而堆优化调用的是`heapq`库,完全由Python实现,常数较大,因此性能反而不如优化前。 - -### 优化2 - -后来发现`numpy`库提供了一个名为`numpy.argpartition`的方法。该方法与快速排序的`partition`操作类似,接收一个`ndarray`参数和一个`int`参数并返回一个数组,其中前$k$个为前$k$小的元素参数。由主定理可知,该方法的时间复杂度为$O(n)$。因此改用`numpy.argpartition`计算距离$y$最近的$k$个点,提高了运算速度。 - -`fit`方法分别在使用`numpy.argsort`和`numpy.argpartition`的情况下的运行时间比较图: - -time \ No newline at end of file diff --git a/assignment-1/submission/18307130090/img/readme/acc_cov.png b/assignment-1/submission/18307130090/img/readme/acc_cov.png deleted file mode 100644 index e187ecebdb9c294dfcbad886487b4b0f7fceb4f2..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130090/img/readme/acc_cov.png and /dev/null differ diff --git a/assignment-1/submission/18307130090/img/readme/acc_mean.png b/assignment-1/submission/18307130090/img/readme/acc_mean.png deleted file mode 100644 index 8419fdf833238cfada421178870aa5e86d62923b..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130090/img/readme/acc_mean.png and /dev/null differ diff --git a/assignment-1/submission/18307130090/img/readme/acc_nums.png b/assignment-1/submission/18307130090/img/readme/acc_nums.png deleted file mode 100644 index d84bc984a7c161e0f9b764b4a267ee990d44785a..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130090/img/readme/acc_nums.png and /dev/null differ diff --git a/assignment-1/submission/18307130090/img/readme/test.png 
b/assignment-1/submission/18307130090/img/readme/test.png deleted file mode 100644 index 893580fcb95deac02a084e15583e5cf2e95ad5d8..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130090/img/readme/test.png and /dev/null differ diff --git a/assignment-1/submission/18307130090/img/readme/time.png b/assignment-1/submission/18307130090/img/readme/time.png deleted file mode 100644 index f76d49f9fabe732cb07eb2a271db4a22b3ed83d0..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130090/img/readme/time.png and /dev/null differ diff --git a/assignment-1/submission/18307130090/img/readme/train.png b/assignment-1/submission/18307130090/img/readme/train.png deleted file mode 100644 index b0b19b6f518c4d8a79848be7aa76dbff4050f88e..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130090/img/readme/train.png and /dev/null differ diff --git a/assignment-1/submission/18307130090/img/test.png b/assignment-1/submission/18307130090/img/test.png deleted file mode 100644 index bb50d7cd57a3ccd2d5e87dfd3abb59cbea12d934..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130090/img/test.png and /dev/null differ diff --git a/assignment-1/submission/18307130090/img/train.png b/assignment-1/submission/18307130090/img/train.png deleted file mode 100644 index 2f2105f7befcf80573111eb2b64f77dda866c295..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130090/img/train.png and /dev/null differ diff --git a/assignment-1/submission/18307130090/source.py b/assignment-1/submission/18307130090/source.py deleted file mode 100644 index 766de86932d9baf64d44cfc385daed350b70935e..0000000000000000000000000000000000000000 --- a/assignment-1/submission/18307130090/source.py +++ /dev/null @@ -1,160 +0,0 @@ -import random -import sys -import time - -import matplotlib.pyplot as plt -import numpy as np - - -class KNN: - - def __init__(self): 
- self.train_data = None - self.train_labels = None - self.k = None - - # func=bf表示使用O(nlogn)的算法 func=opt表示使用O(n)的算法 - def get_predict_labels(self, k, train_data, train_labels, valid_data, func='bf'): - - # 确保func字段只能取bf或者opt - assert func in {'bf', 'opt'} - - predict_labels = np.array([]) - for valid_dot in valid_data: - - # 计算每个train_dot与valid_dot之间的距离 - dist = np.linalg.norm(train_data - valid_dot, axis=1) - - # 计算距离最小的k个train_dot的下标 - dist_index = np.argsort(dist)[:k] if func == 'bf' else np.argpartition(dist, k)[:k] - - # 计算数量最多的标签 - count_dict = {} - max_count = 0 - for index in dist_index: - index = int(index) - train_label = train_labels[index] - count_dict[train_label] = count_dict.get(train_label, 0) + 1 - max_count = max(max_count, count_dict[train_label]) - predict_label = np.array([]) - for train_label, count in count_dict.items(): - if max_count != count: continue - predict_label = np.append(predict_label, train_label) - predict_labels = np.append(predict_labels, np.random.choice(predict_label)) - - return predict_labels - - def fit(self, input_data, input_labels): - self.train_data = input_data - self.train_labels = input_labels - - # 将数据打乱 - shuffled_data, shuffled_labels = shuffle(input_data, input_labels) - - # 划分为训练集和验证集 - ratio, data_size = 0.2, shuffled_data.shape[0] - valid_size = int(data_size * ratio) - train_size = data_size - valid_size - valid_data, valid_labels = shuffled_data[:valid_size], shuffled_labels[:valid_size] - train_data, train_labels = shuffled_data[valid_size:], shuffled_labels[valid_size:] - - # 枚举k,求出最佳参数 - k_size = min(25, train_size) - max_acc, best_k = -1, 0 - for k in range(1, k_size): - predict_labels = self.get_predict_labels(k, train_data, train_labels, valid_data, func='opt') - acc = np.mean(np.equal(predict_labels, valid_labels)) - # print(f'k={k} acc={acc}') - if acc > max_acc: - max_acc = acc - best_k = k - print(f'k={best_k} train_acc={max_acc}') - self.k = best_k - - def predict(self, test_data): - return 
self.get_predict_labels(self.k, self.train_data, self.train_labels, test_data, func='opt') - - -def generate_data(mean, cov, nums): - n = len(mean) - assert n == len(cov) and n == len(nums) - data = np.concatenate([np.random.multivariate_normal(mean[i], cov[i], int(nums[i])) for i in range(n)]) - labels = np.concatenate([np.ones(int(nums[i]), dtype=int) * i for i in range(n)]) - - data, labels = shuffle(data, labels) - - ratio, data_size = 0.2, len(data) - test_size = int(ratio * data_size) - test_data, test_label = data[:test_size], labels[:test_size] - train_data, train_label = data[test_size:], labels[test_size:] - np.save('data.npy', (train_data, train_label, test_data, test_label)) - - -def shuffle(data, labels): - data_size = len(data) - assert data_size == len(labels) - - indices = np.random.permutation(data_size) - return data[indices], labels[indices] - - -def save_plot(data, labels, name): - data_size = len(data) - assert data_size == len(labels) - total = {} - for i in range(data_size): - label = labels[i] - if label not in total: - total[label] = [] - else: - total[label].append(data[i]) - for category in total.values(): - if category == []: continue - category = np.array(category) - plt.scatter(category[:, 0], category[:, 1]) - plt.title(name) - plt.savefig(f'./img/{name}') - plt.close() - - -def read(): - return np.load('data.npy', allow_pickle=True) - - -def generate_control(nums, length): - n = len(nums) - labels = [i for i in range(n)] - return random.choices(labels, nums, k=length) - - -def train(mean, cov, nums, generate, ratio=(1, 1, 1)): - if generate: - generate_data(mean * ratio[0], cov * ratio[1], nums * ratio[2]) - train_data, train_label, test_data, test_label = read() - save_plot(train_data, train_label, 'train') - save_plot(test_data, test_label, 'test') - knn = KNN() - start_time = time.time() - knn.fit(train_data, train_label) - end_time = time.time() - training_time = end_time - start_time - # print(f'training stime={training_time} 
s') - ans = knn.predict(test_data) - control_group = generate_control(nums, len(test_label)) - test_acc = np.mean(np.equal(ans, test_label)) - control = np.mean(np.equal(control_group, test_label)) - return test_acc, control - - -if __name__ == '__main__': - nums = np.array([1600, 400, 2000], dtype=int) - mean = np.array([[5, 5], [10, 15], [20, 5]]) - cov = np.array([ - [[34, 5], [5, 10]], - [[20, 5], [5, 24]], - [[30, 5], [5, 10]] - ]) - generate = True if len(sys.argv) > 1 and sys.argv[1] == 'g' else False - acc, control = train(mean, cov, nums, generate) - print(f'acc={acc} control={control}') - pass diff --git a/assignment-1/submission/18307130104/README.md b/assignment-1/submission/18307130104/README.md deleted file mode 100644 index 547377052cdbe1f8ecf6c88d8f20086418b3842e..0000000000000000000000000000000000000000 --- a/assignment-1/submission/18307130104/README.md +++ /dev/null @@ -1,195 +0,0 @@ - - -18307130104 - -# 课程报告 - -这是 prml 的 assignment-1 课程报告,我的代码在 [source.py](./source.py) 中。 - -在 assignment-1 中,用 python 实现了一个 KNN 类,调用该类下 fit() 函数可以对数据进行训练,调用 predict() 可以对多组目标数据进行预测,返回一个 list 对应每组数据的结果。 - -## KNN 类实现 - -### fit() 函数 - -fit() 函数的任务有两个:将用于训练的点按照类别分别进行储存;选择合适的 K,也就是用于预测的最近点个数。 - -接下来说明 K 数值的选择方法。对于输入的数据,选择前 $\frac 3 4$ 的数据作为训练集,后 $\frac 1 4$ 的数据作为验证集。逐一尝试 1~14 作为 K 值时模型在验证集上的正确率,选取其中正确率最高的 K 作为模型的 K 值保存下来。 - -选择 1~14 是因为训练数据的规模为 2000,如果训练数据的规模进行了修改,这一个范围也可以进行修改,不过这一范围对大部分数据规模都比较适用。 - -### predict() 函数 - -predict() 函数会根据模型中存储的数据和选定的 K 对给定数据进行预测。 - -采用欧拉距离作为两个点的距离数值,选择距离最近的 K 个点进行投票,获票最多的类别就是预测结果。对于获票相同的情况选择编号比较小的类别。 - -## 测试与展示 - -```shell -python source.py g // 生成数据 -python source.py // 进行测试 -python source.py d // 生成展示图片 -``` - -generate() 和 display() 函数均从示例代码中获得。其中 display() 函数中增加了对某种类别预测结果为空的判断防止报错。 - -> 需要保证运行环境中有 img 文件夹,否则程序无法正确运行。(由于不能用 os 包所以不知道怎么判断是否存在文件夹) -> -> 另外,如果使用 wsl 等环境会导致输出图像有重叠。 - -## 探究性实验 - -## 实验1 - -采用以下参数生成 3 组数据。 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -73 & 0 \\\\ -0 & 22 -\end{array}\right] \\\\ 
-\mu=\left[\begin{array}{ll} -1 & 2 -\end{array}\right] -\end{array} -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -21.2 & 0 \\\\ -0 & 32.1 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -16 & -5 -\end{array}\right] -\end{array} -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -10 & 5 \\\\ -5 & 10 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -10 & 22 -\end{array}\right] -\end{array} -$$ - -训练数据,测试数据,测试结果如下三图 - -训练集测试集测试结果 - -程序输出如下(之后的实验输出均采用如下的输出格式) - -| K | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -| ------ | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| 正确率 | 0.93 | 0.92 | 0.93 | 0.94 | 0.94 | 0.95 | 0.95 | 0.95 | 0.95 | 0.95 | 0.96 | 0.96 | 0.96 | 0.96 | - -| 选取 K | 正确率 | -| -- | -- | -| 14 | 0.96 | - -将实验1作为基准,对不同数据集上的模型效果进行对比。这组数据的特点在于虽然不同种类之间的点有交集,但是区分仍然非常明显,比较符合实际中的分类问题的特征。 - -## 实验2 - -采用以下参数生成 3 组数据。 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -6 & 0 \\\\ -0 & 4 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -1 & 2 -\end{array}\right] -\end{array} -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -5 & 0 \\\\ -0 & 7 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -16 & -5 -\end{array}\right] -\end{array} -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -9 & 5 \\\\ -0 & 5 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -10 & 22 -\end{array}\right] -\end{array} -$$ - -训练数据,测试数据,测试结果如下三图 - -训练集测试集测试结果 - -程序输出如下 - -| K | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -| ------ | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| 正确率 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | - - -| 选取 K | 正确率 | -| -- | -- | -|1|acc = 1.0| - -实验2的数据集中数据的协方差比较小,对应的,数据比较集中,数据集中区域的交叉比较小,所以对应的,模型的准确度非常高,这种情况的分类非常简单,因此模型表现优秀也在预期之中。 - -## 实验3 - -采用以下参数生成 3 组数据。 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -73 & 0 \\\\ -0 & 22 -\end{array}\right] 
\\\\ -\mu=\left[\begin{array}{ll} -13 & -2 -\end{array}\right] -\end{array} -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -21.2 & 0 \\\\ -0 & 32.1 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -16 & -5 -\end{array}\right] -\end{array} -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -10 & 5 \\\\ -5 & 10 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -18 & -7 -\end{array}\right] -\end{array} -$$ - -训练数据,测试数据,测试结果如下三图 - -训练集测试集测试结果 - -程序输出如下 - -| K | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -| ------ | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| 正确率 | 0.67 | 0.76 | 0.73 | 0.75 | 0.73 | 0.74 | 0.74 | 0.76 | 0.74 | 0.75 | 0.74 | 0.74 | 0.74 | 0.74 | - -| 选取 K | 正确率 | -| -- | -- | -|2|acc = 0.71| - -相比于实验1,虽然数据的协方差没有变化,但是数据的中心点比较靠近,具体表现出来,数据集中区域的重合部分非常大,非常难以区别。可以看到正确率也有非常大幅度的下滑。 - -如果再加大协方差,测试准确率也会进一步下降。 - -## 结论 - -可以看到对于数据集中不同类别区分度比较大的情况,KNN 有着非常优秀的表现。对于数据重叠情况比较大的情况,KNN 的效果也并不理想。 diff --git a/assignment-1/submission/18307130104/img/test-1.png b/assignment-1/submission/18307130104/img/test-1.png deleted file mode 100644 index dee425303e2612d99294b8b67d9d3caa62c45d19..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/test-1.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/img/test-2.png b/assignment-1/submission/18307130104/img/test-2.png deleted file mode 100644 index af4d06ba839eb16dc7864f7f9ccbbb0fc6288353..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/test-2.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/img/test-3.png b/assignment-1/submission/18307130104/img/test-3.png deleted file mode 100644 index df464c743c65e3df01aab75decd3d46b5040f981..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/test-3.png and /dev/null differ diff --git 
a/assignment-1/submission/18307130104/img/test.png b/assignment-1/submission/18307130104/img/test.png deleted file mode 100644 index d4880f0b716a928bb8c98e57db1450c4005e4de0..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/test.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/img/test_res-1.png b/assignment-1/submission/18307130104/img/test_res-1.png deleted file mode 100644 index b59def122bd96d11caef1846cbed617f2ff1e77a..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/test_res-1.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/img/test_res-2.png b/assignment-1/submission/18307130104/img/test_res-2.png deleted file mode 100644 index af4d06ba839eb16dc7864f7f9ccbbb0fc6288353..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/test_res-2.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/img/test_res-3.png b/assignment-1/submission/18307130104/img/test_res-3.png deleted file mode 100644 index 4e2e9e4b84105a214163ef8001950c84551cc868..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/test_res-3.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/img/test_res.png b/assignment-1/submission/18307130104/img/test_res.png deleted file mode 100644 index 3398f85d9e2c12e1aed1d46aef32b0ebfe40a736..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/test_res.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/img/train-1.png b/assignment-1/submission/18307130104/img/train-1.png deleted file mode 100644 index 8ac4cafb374d598c4c81167a7a3e249856803f82..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/train-1.png and /dev/null differ diff --git 
a/assignment-1/submission/18307130104/img/train-2.png b/assignment-1/submission/18307130104/img/train-2.png deleted file mode 100644 index 5574b29d7ef53b5e38af79f85a1ae4ebcbb8f137..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/train-2.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/img/train-3.png b/assignment-1/submission/18307130104/img/train-3.png deleted file mode 100644 index 4e5eb59db4f0293e1e59fa4747e61112138f3153..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/train-3.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/img/train.png b/assignment-1/submission/18307130104/img/train.png deleted file mode 100644 index 1906f232e9f385005824fea153d1c8c649000cdb..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130104/img/train.png and /dev/null differ diff --git a/assignment-1/submission/18307130104/source.py b/assignment-1/submission/18307130104/source.py deleted file mode 100644 index 060fd3ca6cbd1b0c81fe22fb1c051eb7e275651d..0000000000000000000000000000000000000000 --- a/assignment-1/submission/18307130104/source.py +++ /dev/null @@ -1,147 +0,0 @@ -import sys -import numpy as np -import matplotlib.pyplot as plt - -class KNN: - - def __init__(self): - self.ldata = {} - self.K = 10 - self.cnt = 0 - - def fit(self, train_data, train_label): - totsz = len(train_data) - pretrainsz = totsz * 3 // 4 - for i in range(0, pretrainsz): - if train_label[i] in self.ldata: - self.ldata[train_label[i]].append(train_data[i]) - else: - self.ldata[train_label[i]] = [train_data[i]] - pretraindata = train_data[pretrainsz : totsz] - pretrainlabel = train_label[pretrainsz : totsz] - maxAcc = 0 - takeK = 3 - for preK in range(1, 15): - pretrainres = [] - self.K = preK - for d in pretraindata: - pretrainres.append(self.predict_one(d)) - acc = np.mean(np.equal(pretrainres, pretrainlabel)) - print(acc) - if 
acc > maxAcc: - maxAcc = acc - takeK = preK - self.K = takeK - print("take K", takeK) - self.ldata.clear() - for (d, l) in zip(train_data, train_label): - if(l in self.ldata): - self.ldata[l].append(d) - else: - self.ldata[l] = [d] - - def dist(self, s1, s2): - sum = 0 - for (k1, k2) in zip(s1, s2): - sum += (k1 - k2) ** 2 - return sum - def takeFirst(self, elem): - return elem[0] - def predict_one(self, data): - result = None - tmpl = [] - for l in self.ldata: - for s in self.ldata[l]: - tmpl.append([self.dist(s, data), l]) - tmpl.sort(key=self.takeFirst) - num = {} - for i in self.ldata: - num[i] = 0 - cnt = 0 - # for l in tmpl: - # print(l) - # print(' ') - for l in tmpl: - num[l[1]] += 1 - cnt += 1 - if(cnt >= self.K): - break - maxi = -1 - for i in self.ldata: - # print(i) - if num[i] > maxi: - maxi = num[i] - result = i - # print(result) - return result - - def predict(self, test_data): - result = [] - for x in test_data: - result.append(self.predict_one(x)) - return result - -def generate(): - mean = (1, 2) - cov = np.array([[73, 0], [0, 22]]) - x = np.random.multivariate_normal(mean, cov, (800,)) - - mean = (16, -5) - cov = np.array([[21.2, 0], [0, 32.1]]) - y = np.random.multivariate_normal(mean, cov, (200,)) - - mean = (10, 22) - cov = np.array([[10,5],[5,10]]) - z = np.random.multivariate_normal(mean, cov, (1000,)) - - idx = np.arange(2000) - np.random.shuffle(idx) - data = np.concatenate([x,y,z]) - label = np.concatenate([ - np.zeros((800,),dtype=int), - np.ones((200,),dtype=int), - np.ones((1000,),dtype=int)*2 - ]) - data = data[idx] - label = label[idx] - - train_data, test_data = data[:1600,], data[1600:,] - train_label, test_label = label[:1600,], label[1600:,] - np.save("data.npy",( - (train_data, train_label), (test_data, test_label) - )) - -def display(data, label, name): - datas =[[],[],[]] - for i in range(len(data)): - datas[label[i]].append(data[i]) - - for each in datas: - each = np.array(each) - if(each.size > 0): - plt.scatter(each[:, 0], 
each[:, 1]) - plt.savefig(f'img/{name}') - plt.show() - -def read(): - (train_data, train_label), (test_data, test_label) = np.load("data.npy",allow_pickle=True) - return (train_data, train_label), (test_data, test_label) - -if __name__ == "__main__": - if len(sys.argv) > 1 and sys.argv[1] == "g": - print("generate") - generate() - if len(sys.argv) > 1 and sys.argv[1] == "d": - (train_data, train_label), (test_data, test_label) = read() - # for l in test_label: - # print(l) - display(train_data, train_label, 'train') - display(test_data, test_label, 'test') - else: - (train_data, train_label), (test_data, test_label) = read() - - model = KNN() - model.fit(train_data, train_label) - res = model.predict(test_data) - display(test_data, res, 'test_res') - print("acc =",np.mean(np.equal(res, test_label))) \ No newline at end of file diff --git a/assignment-1/submission/18307130116/README.md b/assignment-1/submission/18307130116/README.md deleted file mode 100644 index 142f441ff2c2994c3d62c3cbb861fd7c637837f8..0000000000000000000000000000000000000000 --- a/assignment-1/submission/18307130116/README.md +++ /dev/null @@ -1,235 +0,0 @@ -# KNN分类器 - -[toc] - -## 依赖包 - -`numpy` - -`matplotlib` - -## 函数功能介绍 - -### KNN - -**`fit(self, train_data, train_label)`** - -`train_data`训练点集 - -`train_label`训练标签 - -**功能简介:**`fit`函数将会取出训练集中的10%用于寻找让准确率最大的K,如果训练集少于10个点,则会默认`K = 1`,否则将会选择1到10中使得准确率最大的K,作为预测时使用的K - ---- - -`predict(self, test_data)` - -**功能简介:**根据前一步学习到的K预测对应的类别 - -### 实验函数与辅助函数 - -**`distance(point1, point2, method="Euclid")`** - -`point1`和`point2`为需要计算距离的两个点 - -`method`给出了计算距离的指标,默认为欧氏距离,`Manhattan`可按照曼哈顿距离计算 - -**功能简介:**函数开始会将输入标准化为[m, 1]的向量,并按照相应的方式计算两个点之间的距离 - -------- - -**`dis(dis_label)`** - -**功能简介:**`sort`的`key`函数,取出二元组(distance, label)中的distance - ---- - -**`nearest_k_label_max(point, point_arr, label_arr, k)`** - -`point`需寻找k个临近点的目标点 - -`point_arr`已有的点集 - -`label_arr`已有点集对应的标签集合 - -`k`考虑的最近的点的数量 - -**功能简介:**函数将计算目标点和点集中所有点的距离,找到K个距离最近点,并返回出现最多次数的`label` - ---- - 
-**`data_generate_and_save(class_num, mean_list, cov_list, num_list, save_path = "")`** - -`class_num` 共包含的类的数量 - -`mean_list` 各个类的高斯分布对应的均值矩阵 - -`cov_list` 各个类的协方差矩阵 - -`num_list` num_list[i]对应于第i个类的点数 - -`save_path` 生成的点集的存储路径,默认为当前目录下的`data.npy`,路径需以下划线结尾 - -**功能简介:**该函数通过调用`numpy.random.multivariate_normal`,生成指定数目的点,随机打乱后,划分其中的80%为训练数据,20%为测试数据,以元组`((train_data, train_label), (test_data, test_label))`的形式保存 - ---- - -**`data_load(path = "")`** - -`path` 加载点集的存储路径,默认为当前目录下的`data.npy`,路径需以下划线结尾 - -**功能简介:**点集需以元组`((train_data, train_label), (test_data, test_label))`的形式保存 - ---- - -**`visualize(data, label, class_num = 1, test_data=[])`** - -*可视化目前只支持二维,如果是高维点集,将只可视化前两维* - -`data` 训练点集坐标 - -`label`训练点集对应的标签 - -`class_num`类别总数,默认值为1 - -`test_data`测试点集坐标 - -**功能简介:**绘制点集散点图,不同类别自适应的用不同颜色表征,测试点集将通过"+"表征 - -## 实验 - -首先,我们生成了三类坐标点,每类数量100 - -其对应的数量和协方差矩阵如下表所示 - -| | 均值 | 协方差矩阵 | -| ------- | ------- | ----------------- | -| class 1 | (1, 2) | [[10, 0], [0, 2]] | -| class 2 | (4, 5) | [[7, 3], [15, 1]] | -| class 3 | (-2, 6) | [[0, 1], [1, 2]] | - -测试了1-10对应的准确率,如下图所示 - -k1 - -在保证准确率不变的条件下,选择较小的数值k=5,预测的准确率达83.3%,对应数据可视化如下图 - -Figure_1 - -### 对比实验1:减少点集重叠 - -上图能较为清晰的看到,三种颜色的点集分布基本分离开,但是仍存在一部分重叠,推测重叠部分会使得KNN效果变差,下面通过改变均值和协方差验证这一结论 - -首先将协方差对应更改成为 - -| | 均值 | 协方差矩阵 | -| ------- | ------- | --------------- | -| class 1 | (1, 2) | [[1,0], [0, 1]] | -| class 2 | (4, 5) | [[1,0], [0, 1]] | -| class 3 | (-2, 6) | [[1,0], [0, 1]] | - -对应K的曲线和点集分布图如下 - -Figure_2_1Figure_2_2 - -此时选择K = 3,对应的KNN准确率已经提高到了96.7%符合预期 - -同样的,我们更改对应的均值大小,使得高斯分布尽可能分开 - -| | 均值 | 协方差矩阵 | -| ------- | --------- | ----------------- | -| class 1 | (-10, 2) | [[10, 0], [0, 2]] | -| class 2 | (4, 5) | [[7, 3], [15, 1]] | -| class 3 | (-2, -16) | [[0, 1], [1, 2]] | - -对应曲线如下,准确率达到1.0,此时K=1已经达到了最大值 - -Figure_2_3Figure_2_4 - -#### 结论 - -从该对比实验中,我们能够较为清晰的看到点集分布对于KNN准确率的影响,当类之间重合度较低时,KNN的准确率显著提升 - -### 对比实验2:距离选择 - -在上述实验中,我们采用的距离为欧式距离,下面将更改距离计算方式为曼哈顿距离,考察对应的影响 - 
-当点集区分较开时,曼哈顿距离与欧式距离在准确率上差别不大,这里不做展示,当点集重叠程度较高时,对以下分布生成了多组数据 - -| | 均值 | 协方差矩阵 | -| ------- | ------ | ----------------- | -| class 1 | (1, 4) | [[10, 0], [0, 2]] | -| class 2 | (2, 5) | [[7, 3], [15, 1]] | -| class 3 | (2, 6) | [[0, 1], [1, 2]] | - -对应的k值选取和准确率(acc)如下表所示 - -| 欧氏距离 | 曼哈顿距离 | -| ----------------- | ------------------ | -| k = 3, acc = 0.7 | k=3, acc= 0.683 | -| k = 1, acc = 0.53 | k = 1, acc = 0.483 | -| k = 7, acc = 0.63 | k = 8, acc = 0.567 | - -综合来看点集分布重叠程度较高时,欧氏距离优于曼哈顿距离,推测以高斯分布生成的点,欧式距离对某一维度上较大差距的惩罚大于曼哈顿距离,较符合高斯分布点生成方式,较好拟合当前位置的概率密度,从而准确率更高 - -#### 结论 - -当点集区分较开时,曼哈顿距离和欧式距离差别不大,点集重合较大时,欧式距离由于曼哈顿距离 - -### 对比实验3:点集数量 - -对于如下分布 - -| | 均值 | 协方差矩阵 | -| ------- | ------- | ----------------- | -| class 1 | (1, 4) | [[10, 0], [0, 2]] | -| class 2 | (2, -3) | [[7, 3], [15, 1]] | -| class 3 | (2, 5) | [[0, 1], [1, 2]] | - -分别生成了[100, 100, 100], [100, 10, 100], [100, 50, 200],[200, 200, 200]四组,每组多次避免偶然误差 - -结果如下表格所示 - -| | [100, 100, 100] | [100, 10, 100] | [100, 50, 200] | [200, 200, 200] | -| ---- | --------------- | -------------- | -------------- | --------------- | -| 1 | 0.867 | 0.809 | 0.886 | 0.875 | -| 2 | 0.800 | 0.809 | 0.843 | 0.825 | -| 3 | 0.867 | 0.809 | 0.857 | 0.9 | -| 4 | 0.917 | 0.761 | 0.886 | 0.792 | -| 平均 | 0.862 | 0.797 | 0.868 | 0.848 | - -#### 结论 - -当点集数量上升时,增大重叠面积,准确率相应下降,当某组点数量显著小于其他点集时,将会较大影响到准确率,当差距过大时,将会一定程度上退化成N-1分类问题,反而导致准确率提升 - -### 对比实验4:各维度尺度 - -当各个维度的尺度并不匹配时,例如(年龄,财产)二元组,基于空间上欧式距离相当于退化成为闵式距离,为进一步对比其影响,生成了如下数据 - -| | 均值 | 协方差矩阵 | -| ------- | -------- | --------------------- | -| class 1 | (1, 400) | [[10, 0], [0, 20000]] | -| class 2 | (2, 300) | [[7, 0], [0, 10000]] | -| class 3 | (2, 300) | [[1, 0], [0, 10000]] | - -其中一组对应k和点集分布如下图所示,多次测量的平均准确率为0.399 - -Figure_5_1Figure_5_2 - -为了对比其影响,我们等比例放缩对应的维度100倍, - -| | 均值 | 协方差矩阵 | -| ------- | ------ | ----------------- | -| class 1 | (1, 4) | [[10, 0], [0, 2]] | -| class 2 | (2, 3) | [[7, 0], [15, 1]] | -| class 3 | (2, 3) | [[1, 0], [0, 1]] | - -对应的k和点集可视化如下图 - 
-Figure_6_1Figure_6_2 - -多次测量的平均准确率为0.539 - -#### 结论 - -尺度归一化较大程度的影响了准确率的大小,通过等比例尺度放缩,准确率有了较大提升,但是,结合前面点集分布的表现,推测当点集自身区分较开时,归一化的影响不大 \ No newline at end of file diff --git a/assignment-1/submission/18307130116/img/Figure_1.png b/assignment-1/submission/18307130116/img/Figure_1.png deleted file mode 100644 index b840aa5b2862be15a71968435433efc147086318..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/Figure_1.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/img/Figure_2_1.png b/assignment-1/submission/18307130116/img/Figure_2_1.png deleted file mode 100644 index 5e2b73e556a36aa5db294e9c2c42fc039728279d..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/Figure_2_1.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/img/Figure_2_2.png b/assignment-1/submission/18307130116/img/Figure_2_2.png deleted file mode 100644 index 3c6ec2fa9693474116ae15a76359f69b442d99b1..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/Figure_2_2.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/img/Figure_2_3.png b/assignment-1/submission/18307130116/img/Figure_2_3.png deleted file mode 100644 index a893f35d277af8c818a69f49cee5e2bbe06c2367..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/Figure_2_3.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/img/Figure_2_4.png b/assignment-1/submission/18307130116/img/Figure_2_4.png deleted file mode 100644 index 34e3cb5e2c15ae4104a1f12fbd9ef62af24cb03e..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/Figure_2_4.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/img/Figure_5_1.png b/assignment-1/submission/18307130116/img/Figure_5_1.png deleted file mode 100644 index 
09921dca1bbeebae81d5b0f71eafe9ab0f0ce75a..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/Figure_5_1.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/img/Figure_5_2.png b/assignment-1/submission/18307130116/img/Figure_5_2.png deleted file mode 100644 index 18ed90b7cd1ec5f2c91a863393b21b655b040eb6..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/Figure_5_2.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/img/Figure_6_1.png b/assignment-1/submission/18307130116/img/Figure_6_1.png deleted file mode 100644 index 6fdc07c00f7cfdcead4f8cf98880ce1cd76f9526..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/Figure_6_1.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/img/Figure_6_2.png b/assignment-1/submission/18307130116/img/Figure_6_2.png deleted file mode 100644 index 72685efbfd9bc42f675811e5f92bf88c6bbc3851..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/Figure_6_2.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/img/k1.png b/assignment-1/submission/18307130116/img/k1.png deleted file mode 100644 index 8a81a8e624428a86d14851ca1a9848cf11c61be0..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130116/img/k1.png and /dev/null differ diff --git a/assignment-1/submission/18307130116/source.py b/assignment-1/submission/18307130116/source.py deleted file mode 100644 index 4daa13c95a45ed7371bb33f20bdd2f4d821894ae..0000000000000000000000000000000000000000 --- a/assignment-1/submission/18307130116/source.py +++ /dev/null @@ -1,154 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.cm as cm - -def distance(point1, point2, method="Euclid"): - """ - suppose dimention of points is m * 1 - """ - if point1.ndim == 1: - point1 = 
np.expand_dims(point1, axis=1) - if point2.ndim == 1: - point2 = np.expand_dims(point2, axis=1) - if point1.shape[0] == 1: - point1 = point1.reshape(-1, 1) - if point2.shape[0] == 1: - point2 = point2.reshape(-1, 1) - dimention_num = point1.shape[0] - result = 0 - if(method == "Euclid"): - if dimention_num != point1.size: - print("error") - return -1 - for iter in range(dimention_num): - result += (point1[iter, 0]-point2[iter, 0])**2 - return pow(result, 0.5) - if(method == "Manhattan"): - if dimention_num != point1.size: - print("error") - return -1 - for iter in range(dimention_num): - result += abs(point1[iter, 0]-point2[iter, 0]) - return result - -def dis(dis_label): - return dis_label[0] - -def nearest_k_label_max(point, point_arr, label_arr, k): - distance_arr = [] - for iter in range(len(point_arr)): - distance_arr.append((distance(point, point_arr[iter]), label_arr[iter])) - distance_arr.sort(key=dis) - result = [] - for iter in range(k): - result.append(distance_arr[iter][1]) - return max(result, key=result.count) - -class KNN: - - def __init__(self): - pass - - def fit(self, train_data, train_label): - num = train_data.shape[0] - dimention_num = train_data.shape[1] - self.train_data = train_data - self.train_label = train_label - dev_num = int(num * 0.1) - dev_data = train_data[:dev_num] - dev_label = train_label[:dev_num] - train_data = train_data[dev_num:] - train_label = train_label[dev_num:] - correct_cout_max = 0 - k_max = 0 - accu = [] - if dev_num == 0: - print("points number too few, so we choose k = 1") - self.k = 1 - return - - for iter in range(1, min(num-dev_num, 10)):#find the best k - correct_count = 0 - for j in range(len(dev_data)): - predict_label = nearest_k_label_max(dev_data[j], train_data, train_label, iter) - if(predict_label == dev_label[j]): - correct_count += 1 - if correct_count > correct_cout_max: - correct_cout_max = correct_count - k_max = iter - accu.append(correct_count/dev_num) - x = range(1, min(num-dev_num, 10)) - #this 
part is only for experiment, so I commented it for auto test - # plt.plot(x,accu) - # plt.show() - self.k = k_max - print("choose k=", k_max) - - def predict(self, test_data): - result = [] - for iter in range(len(test_data)): - result.append(nearest_k_label_max(test_data[iter,:], self.train_data, self.train_label, self.k)) - return np.array(result) - -#here we need some utils -def data_generate_and_save(class_num, mean_list, cov_list, num_list, save_path = ""): - """ - class_num: the number of class - mean_list: mean_list[i] stand for the mean of class[i] - cov_list: similar to mean_list, stand for the covariance - num_list: similar to mean_list, stand for the number of points in class[i] - save_path: the data storage path, end with slash. - """ - data = np.random.multivariate_normal(mean_list[0], cov_list[0], (num_list[0],)) - label = np.zeros((num_list[0],),dtype=int) - total = num_list[0] - - for iter in range(1, class_num): - temp = np.random.multivariate_normal(mean_list[iter], cov_list[iter], (num_list[iter],)) - label_temp = np.ones((num_list[iter],),dtype=int)*iter - data = np.concatenate([data, temp]) - label = np.concatenate([label, label_temp]) - total += num_list[iter] - - idx = np.arange(total) - np.random.shuffle(idx) - data = data[idx] - label = label[idx] - train_num = int(total * 0.8) - train_data = data[:train_num, ] - test_data = data[train_num:, ] - train_label = label[:train_num, ] - test_label = label[train_num:, ] - # print(test_label.size) - np.save(save_path+"data.npy", ((train_data, train_label), (test_data, test_label))) - -def data_load(path = ""): - (train_data, train_label), (test_data, test_label) = np.load(path+"data.npy",allow_pickle=True) - return (train_data, train_label), (test_data, test_label) - -def visualize(data, label, class_num = 1, test_data=[]): - data_x = {} - data_y = {} - for iter in range(class_num): - data_x[iter] = [] - data_y[iter] = [] - for iter in range(len(label)): - data_x[label[iter]].append(data[iter, 0]) 
- data_y[label[iter]].append(data[iter, 1]) - colors = cm.rainbow(np.linspace(0, 1, class_num)) - - for class_idx, c in zip(range(class_num), colors): - plt.scatter(data_x[class_idx], data_y[class_idx], color=c) - if(len(test_data) != 0): - plt.scatter(test_data[:, 0], test_data[:, 1], marker='+') - plt.show() - -#experiment begin -if __name__ == "__main__": - mean_list = [(1, 4), (2, 3), (2, 3)] - cov_list = [np.array([[10, 0], [0, 2]]), np.array([[7, 0], [0, 1]]), np.array([[1, 0], [0, 1]])] - num_list = [200, 200, 200] - save_path = "" - data_generate_and_save(3, mean_list, cov_list, num_list, save_path) - # (train_data, train_label), (test_data, test_label) = data_load() - # visualize(train_data, train_label, 3) \ No newline at end of file diff --git a/assignment-1/submission/18307130213/README.md b/assignment-1/submission/18307130213/README.md deleted file mode 100644 index 1312179c838a324adb960c53606d9a76ed582a4f..0000000000000000000000000000000000000000 --- a/assignment-1/submission/18307130213/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# 课程报告 - -## KNN类实现 - -KNN类的实现位于 [source.py](./source.py) 。 - -### 初始化 - -初始化时,我们给 `KNN` 类的 `private` 变量赋值 `None`,表示尚未进行训练,防止使用未训练模型进行测试。 - -### 训练 - -训练函数包括两部分:检查数据是否合法(数据维度是否匹配),以及将所有训练数据中的点保存下来。 - -我们会根据训练数据集的大小 `n` 和标签数 `l` 来决定超参 `K = min(n, log2(n), l + 1)` 。 - -从实验情况来看,这样的超参选择是合理的。 - -### 测试 - -测试函数同样包含两个部分:检测数据是否与训练数据维度相同,并给出对于所有点的标签预测。 - - - -## 数据生成与可视化 - -在给定参数N时,数据生成部分能够生成一套含 `N` 个位置不同,协方差矩阵随机的二维高斯分布的数据。 - -其中 `80%` 会用于训练,剩下 `20%` 用于测试。 - -这是 `N=5` 时生成的训练集: - -![训练集](./img/exptrain.png) - -这是 `N=5` 时生成的测试集: - -![测试集](./img/exptest.png) - - - -## 效果评估 - -以下为随机情况下中获得的一些准确度,当N过大时由于生成数据过密,效果下降。 - -| Algo | Acc | -| ----------- | ------------------ | -| ----- | ----- | -| KNN (N=2) | 0.9983193277310924 | -| KNN (N=3) | 0.9986807387862797 | -| KNN (N=5) | 0.9744360902255639 | -| KNN (N=10) | 0.868824531516184 | -| KNN (N=100) | 0.7205387205387206 | - - - -## 代码使用方法 - -以 `N=3` 为例: - -```bash -python source.py g 3 # 生成数据集 - 
-python source.py d 3 # 生成数据集的可视化结果(保存在img文件夹下) - -python source.py # 训练和测试 -``` \ No newline at end of file diff --git a/assignment-1/submission/18307130213/img/exptest.png b/assignment-1/submission/18307130213/img/exptest.png deleted file mode 100644 index 1e95008faf1f147efa733242da45b730ca69e04b..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130213/img/exptest.png and /dev/null differ diff --git a/assignment-1/submission/18307130213/img/exptrain.png b/assignment-1/submission/18307130213/img/exptrain.png deleted file mode 100644 index a2a24be956aea669a30fe4562563154da9020047..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/18307130213/img/exptrain.png and /dev/null differ diff --git a/assignment-1/submission/18307130213/source.py b/assignment-1/submission/18307130213/source.py deleted file mode 100644 index 7a53de852289de55e74fe9a70c7ac56fdb5372ec..0000000000000000000000000000000000000000 --- a/assignment-1/submission/18307130213/source.py +++ /dev/null @@ -1,148 +0,0 @@ -import math -import heapq -import numpy as np -import random -import matplotlib.pyplot as plt -import sys - -class KNN: - - def __init__(self): - self.__data = None - self.__lable = None - self.__num = None - self.__dim = None - self.__k = None - - def fit(self, train_data, train_label): - if type(train_data) != np.ndarray: - print('error: wrong type of train_data') - return - if len(train_data.shape) != 2: - print('error: wrong shape of train_data') - return - if type(train_label) != np.ndarray: - print('error: wrong type of train_label') - return - if len(train_label.shape) != 1: - print('error: wrong shape of train_label') - return - num_data, dim_data = train_data.shape - num_label, = train_label.shape - if num_data != num_label: - print('error: shape of train_data and train_label can not match') - return - if num_data < 1: - print('error: less than 1 data') - return - - label_k = len(np.unique(train_label)) - - 
self.__data = train_data - self.__label = train_label - self.__num = num_data - self.__dim = dim_data - self.__k = min(num_data, math.floor(math.log(num_data, 2)), label_k + 1) - - print('finish: fit') - return - - def predict(self, test_data): - if self.__k == None: - print('error: not fit yet') - return - if type(test_data) != np.ndarray: - print('error: wrong type of test_data') - return - if len(test_data.shape) != 2: - print('error: wrong shape of test_data') - return - - test_data_num, test_data_dim = test_data.shape - if test_data_dim != self.__dim: - print('error: wrong dimention of test_data') - return - - tmp_ans = [] - for i in range(test_data_num): - tmp_inum = [j for j in range(self.__num)] - closest = heapq.nsmallest(self.__k, tmp_inum, key = lambda s: np.linalg.norm(test_data[i]-self.__data[s])) - tmp_dict = {} - lab, cnt = -1, 0 - for j in range(self.__k): - tmp_cnt = tmp_dict[self.__label[closest[j]]] = tmp_dict.get(self.__label[closest[j]], 0) + 1 - if tmp_cnt > cnt: - lab, cnt = self.__label[closest[j]], tmp_cnt - tmp_ans.append(lab) - - return np.array(tmp_ans) - -def generate(n): - np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning) - if n <= 0: - print('error: n <= 0') - return - r = n/max(1, math.log(n, 2)) - sizs = [] - xs = [] - for i in range(n): - theta = i*(2*math.pi/n) - mean = (r*math.cos(theta) , r*math.sin(theta)) - rand_mat = np.random.rand(2, 2) - cov = rand_mat.transpose()*rand_mat - siz = random.randint(200, 1000) - sizs.append(siz) - x = np.random.multivariate_normal(mean, cov, (siz, )) - xs.append(x) - siz = sum(sizs) - idx = np.arange(siz) - np.random.shuffle(idx) - data = np.concatenate(xs) - label = np.concatenate([np.ones((sizs[j], ), dtype=int)*j for j in range(n)]) - data = data[idx] - label = label[idx] - - train_data, test_data = data[:(siz//n)*(n-1),], data[(siz//n)*(n-1):,] - train_label, test_label = label[:(siz//n)*(n-1),], label[(siz//n)*(n-1):,] - - np.save("data.npy",( - (train_data, 
train_label), (test_data, test_label) - )) - -def read(): - (train_data, train_label), (test_data, test_label) = np.load("data.npy",allow_pickle=True) - return (train_data, train_label), (test_data, test_label) - -def genimg(n, data, label, name): - datas =[[] for i in range(n)] - for i in range(len(data)): - datas[label[i]].append(data[i]) - - for each in datas: - each = np.array(each) - plt.scatter(each[:, 0], each[:, 1]) - plt.savefig(f'img/{name}') - plt.close() - # plt.show() - -if __name__ == '__main__': - if len(sys.argv) > 1 and sys.argv[1] == 'g': - try: - n = int(sys.argv[2]) - generate(n) - except: - print('error: wrong n') - elif len(sys.argv) > 1 and sys.argv[1] == 'd': - (train_data, train_label), (test_data, test_label) = read() - try: - n = int(sys.argv[2]) - genimg(n, train_data, train_label, 'train') - genimg(n, test_data, test_label, 'test') - except: - print('somthing goes wrong!') - else: - (train_data, train_label), (test_data, test_label) = read() - model = KNN() - model.fit(train_data, train_label) - res = model.predict(test_data) - print("acc =",np.mean(np.equal(res, test_label))) \ No newline at end of file diff --git a/assignment-1/submission/18307130341/README.md b/assignment-1/submission/18307130341/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e2d7dbd1aa21f08d8f510d067bd8855b49136783 --- /dev/null +++ b/assignment-1/submission/18307130341/README.md @@ -0,0 +1,102 @@ +# 实验报告:ASS1-KNN分类器 + +18307130341 黄韵澄 + +[toc] + +### 1.1 实验概述 + +​ 本实验使用k近邻方法(KNN),实现了对二维带标签点集的分类。 + +​ 在**1.2**中生成了3组符合高斯分布的二维点集,并打上0~2共3种标签。初始数据集被划分为训练数据(train_data,80%)和测试数据(test_data,20%),并将训练数据放进KNN模型中进行训练。 + +​ **1.3**中是对KNN模型的实现部分,包括初始化、模型训练、预测分类三个部分。 + +​ **1.4**中对KNN模型进行测试和实验探究。 + +​ **1.5**中完成实验的自动测试部分。 + +### 1.2 数据集生成 + +​ 数据集的生成在函数`data_generate`中实现。每种标签各生成了`num=200`个点集,通过设置其均值`mean`和协方差`cov`来生成符合高斯分布的二维数据集。生成数据集的参数如下: +$$ +\mu_0 = [6, 4],\ \Sigma_0 = \begin{bmatrix}35&4\\4&11\\\end{bmatrix}\\ +\mu_1 = [11, 14],\ \Sigma_1 = 
\begin{bmatrix}21&6\\6&24 \\\end{bmatrix}\\ +\mu_2 = [22, 6],\ \Sigma_2 = \begin{bmatrix}25&5\\5&10\\\end{bmatrix}\\ +$$ +​ 生成的数据集组成`data`,使用`shuffle`函数将其打乱,取前80%作为`train_data`,后20%作为`test_data`。 + +​ 使用`matplotlib`绘图,测试集和训练集散点图如下: + +![train_data](img\train_data.png) + +
Fig 1:训练集
+ +![test_data](img/test_data.png) + +
Fig 2: 测试集 + +### 1.3 KNN模型的实现 + +#### 1.3.1 KNN初始化 + +​ `__init__`成员函数中初始化KNN模型。定义`data`、`label`作为k近邻的候选点集,`num`为点集规模,`k`为模型训练后选取的最优k值。 + +#### 1.3.2 模型训练——fit函数 + +​ 将`train_data`再次划分为`train_set_data`和`dev_set_data`。`train_set_data`作为候选点集,`dev_set_data`作为开发集,实现对超参`k`的选取。 + +​ 超参`k`的选取范围设置为`1~14`(代码中为`range(1, 15)`)。 + +​ 对每个候选k,对开发集进行类别预测。具体方式是:在训练集中找到最近的k个点,取其中出现次数最多的类别作为预测类别。对每个k计算出其类别预测平均准确率acc,绘制成图如下: + +![k_acc](img/k_acc.png) + +
Fig 3:acc-k折线图 + +​ 在1%的波动范围内,选取准确度最高的最小k作为最终选取的超参`k`,此时模型训练完成。在上图的样例中,最终选取的超参`k`为6。 + +#### 1.3.3 类别预测——predict函数 + +​ 对于每个需要预测的数据,找到训练集中距离最近的`k`个点,距离选取为欧几里得距离。 + +​ 最终每个点的预测类别为:`k`个最近邻的点中出现最多的类别。`k`为1.3.2中训练好的超参`k`。 + +### 1.4 模型测试和实验探究 + +#### 1.4.1 模型测试 + +​ 用`test_data`进行模型测试,输出模型的准确率。实验重复10次,结果如下: +$$ +\begin{array}{c|l} {实验次数}&{1}&{2}&{3}&{4}&{5}&{6}&{7}&{8}&{9}&{10}\\ \hline {k}&{5}&{8}&{9}&{3}&{11}&{6}&{8}&{6}&{9}&{7}\\ {acc}&{0.83}&{0.88}&{0.8}&{0.83}&{0.87}&{0.85}&{0.88}&{0.88}&{0.86}&{0.88}\\ \end{array} +$$ + +
Table 1: 模型准确率 + +​ 平均超参`k`为`7~8`之间,平均准确率`acc`为`0.856`。 + +#### 1.4.2 实验探究 + +​ (1) 修改高斯分布距离,使得三种点集更加分散(重合部分更少): + +![Fig4](img\Fig4.png) + +
Fig 4: 修改高斯分布距离后的点集 + +​ 该数据集测试出的模型准确率为92.92%。多次重复实验,增大点集的距离,得出结论:高斯分布距离越大,点集越分散,模型准确率越高。 + +​ (2) 修改高斯分布方差,使得每种点集内部更加集中(重合部分更少): + +![Fig5](img/Fig5.png) + +
Fig 5: 修改高斯分布方差后的点集 + +​ 该数据集测试出的模型准确率为97.5%。多次重复实验,减小点坐标方差(对角线的值),得出结论:高斯分布方差越小,点集内部更加集中,模型准确率更高。 + +### 1.5 自动化测试 + +​ 程序只导入了`numpy`和`matplotlib`包。 + +​ 配置conda环境进行测试,可以通过测试。 + +​ \ No newline at end of file diff --git a/assignment-1/submission/18307130341/img/Fig4.png b/assignment-1/submission/18307130341/img/Fig4.png new file mode 100644 index 0000000000000000000000000000000000000000..73f0b16e63c730206ad6ef5e0e7ae0357edd87e3 Binary files /dev/null and b/assignment-1/submission/18307130341/img/Fig4.png differ diff --git a/assignment-1/submission/18307130341/img/Fig5.png b/assignment-1/submission/18307130341/img/Fig5.png new file mode 100644 index 0000000000000000000000000000000000000000..9e268c13558259d449fb5bb068592089c7b8424c Binary files /dev/null and b/assignment-1/submission/18307130341/img/Fig5.png differ diff --git a/assignment-1/submission/18307130341/img/k_acc.png b/assignment-1/submission/18307130341/img/k_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..9be3895d0efe8698d2ad4d4059ea016b7023b36f Binary files /dev/null and b/assignment-1/submission/18307130341/img/k_acc.png differ diff --git a/assignment-1/submission/18307130341/img/test_data.png b/assignment-1/submission/18307130341/img/test_data.png new file mode 100644 index 0000000000000000000000000000000000000000..4542a24e3ccdaa56269acd5055648ada456894e8 Binary files /dev/null and b/assignment-1/submission/18307130341/img/test_data.png differ diff --git a/assignment-1/submission/18307130341/img/train_data.png b/assignment-1/submission/18307130341/img/train_data.png new file mode 100644 index 0000000000000000000000000000000000000000..bff7980984d6f66e8d5e9855a95af7f742189149 Binary files /dev/null and b/assignment-1/submission/18307130341/img/train_data.png differ diff --git a/assignment-1/submission/18307130341/source.py b/assignment-1/submission/18307130341/source.py new file mode 100644 index 
0000000000000000000000000000000000000000..d84eea82f98a6ac33f7f5eb4158e4877fa839f94 --- /dev/null +++ b/assignment-1/submission/18307130341/source.py @@ -0,0 +1,112 @@ +import numpy as np +import matplotlib.pyplot as plt + +class KNN: + + def __init__(self): + self.data = [] + self.label = [] + self.num = 0 + self.k = 1 + + def fit(self, train_data, train_label): + self.num, _ = train_data.shape + + ratio = 0.8 + idx = np.random.permutation(self.num) + train_data = train_data[idx] + train_label = train_label[idx] + + train_set_num = (int)(ratio * self.num) + train_set_data = train_data[:train_set_num] + train_set_label = train_label[:train_set_num] + dev_set_data = train_data[train_set_num:] + dev_set_label = train_label[train_set_num:] + + self.data = train_set_data + self.label = train_set_label + self.num = train_set_num + max_acc = -1 + max_k = 15 + + acc_k = [] + + for k in range(1,max_k): + self.k = k + predict_label = self.predict(dev_set_data) + acc = np.mean(np.equal(predict_label, dev_set_label)) + acc_k.append(acc) + if acc >= max_acc + 0.01: + max_acc = acc + select_k = k + + # Graph_Plot(acc_k, "k_acc") + + self.k = select_k + self.num , _ = train_data.shape + self.data = train_data + self.label = train_label + + def predict(self, test_data): + predict_label = [] + for x in test_data: + dis = np.array([np.sqrt(sum((x-y)**2)) for y in self.data]) + knn = np.argsort(np.array(dis))[:self.k] + result = np.argmax(np.bincount(self.label[knn])) + predict_label.append(result) + return predict_label + +def data_generate(num): + mean = [(6,4), (11, 14), (22, 6)] + cov = [[35, 4], [4, 11]],[[21, 6], [6, 24]],[[25, 5], [5, 10]] + data0 = np.random.multivariate_normal(mean[0], cov[0], num) + data1 = np.random.multivariate_normal(mean[1], cov[1], num) + data2 = np.random.multivariate_normal(mean[2], cov[2], num) + data = np.concatenate([data0,data1,data2]) + label = np.array([0]*num + [1]*num + [2]*num) + + idx = np.random.permutation(3*num) + + data = data[idx] + 
label = label[idx] + + return data, label + +def Graph_Plot(acc, name): + plt.plot(acc) + plt.xlabel('k') + plt.ylabel('acc') + # plt.savefig(f'img/{name}') + plt.close() + +def Graph_Scatter(data, label, name): + points =[[],[],[]] + for i in range(len(data)): + points[label[i]].append(data[i]) + for points_set in points: + points_set = np.array(points_set) + plt.scatter(points_set[:, 0], points_set[:, 1]) + # plt.show() + # plt.savefig(f'img/{name}') + plt.close() + + +if __name__ == "__main__": + num = 400 + data, label = data_generate(num) + train_num = (int)(num* 3 * 0.8) + train_data = data[:train_num] + train_label = label[:train_num] + test_data = data[train_num:] + test_label = label[train_num:] + + # Graph_Scatter(train_data, train_label, "train_data") + # Graph_Scatter(test_data, test_label,"test_data") + + model = KNN() + model.fit(train_data, train_label) + test_predict = model.predict(test_data) + acc = np.mean(np.equal(test_predict, test_label)) + print("k = ", model.k) + print("acc = ", acc) + diff --git a/assignment-1/submission/19210680053/README.md b/assignment-1/submission/19210680053/README.md deleted file mode 100644 index 6ae1a49f48c030f79bcc25f37bd717d6fe307c48..0000000000000000000000000000000000000000 --- a/assignment-1/submission/19210680053/README.md +++ /dev/null @@ -1,246 +0,0 @@ -# 课程报告 - -## 说明 - -我使用的包为numpy,在class KNN中: - - -a.使用函数euclidean进行向量间欧式距离的计算 - - -b.使用closest函数进行逐个向量输入,分别计算它与全部train data的欧氏距离,并输出距它最近k个点出现次数最多train label。当最近k个点不存在出现次数最多train label(如出现次数均等),将进行label随机输出 - - -c.使用predict函数将全部test data逐个输入,得到预测结果 - - -d.使用choose函数,将预测结果与test label进行比对,结果相同取值为1,不同为0,进行准确率计算。k值选择范围根据训练与测试集数量决定(最小值为2,最大值为数据量的10%),从中选取使预测结果准确率最高k值,并输出对准确率预测 - - -## 数据生成 实验探究 - -我使用以下参数生成了如下三个二维高斯分布,label分别为0,1,2 - - - label=0 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -10 & 0 \\\\ -0 & 10 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -20 & 25 -\end{array}\right] -\end{array} -$$ - - - label=1 -$$ -\begin{array}{l} 
-\Sigma=\left[\begin{array}{cc} -23 & 0 \\\\ -0 & 22 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -16 & -5 -\end{array}\right] -\end{array} -$$ - - - label=2 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -10 & 5 \\\\ -5 & 10 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -20 & 25 -\end{array}\right] -\end{array} -$$ - -这是我生成的训练集: - - -训练集 - - -这是我生成的测试集: - - -测试集 - - -可以通过如下表格来报告我的实验结果 - -Algo |kvalue|Acc | ------| ---- |---- | -KNN | 5 |0.6225 | - - - - -由于label=0和label=2的对应高斯分布较靠近,导致训练准确性为62.25%。 - - -为进一步探究高斯分布距离对预测准确性影响,我使用如下参数进行分布生成: - - label=0 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -10 & 2.1 \\\\ -2.1 & 12 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -20 & 25 -\end{array}\right] -\end{array} -$$ - - - label=1 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -23 & 0 \\\\ -0 & 22 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -20 & 25 -\end{array}\right] -\end{array} -$$ - - - label=2 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -10 & 5 \\\\ -5 & 10 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -20 & 25 -\end{array}\right] -\end{array} -$$ - -这是我生成的训练集: - - -训练集 - -这是我生成的测试集: - - -测试集 - - -可以通过如下表格来报告我的实验结果 - -Algo |kvalue|Acc | ------| ---- |---- | -KNN | 12 |0.485 | - -此时3个高斯分布距离彼此都很近,进行不同k值选取,实验的准确性最高达到48.5%。 - -|k |Acc | ------ | ---- | -| 2 | 0.4525 | -| 3 | 0.4375 | -| 4 | 0.4475 | -| 5 | 0.4300 | -| 6 | 0.4675 | -| 7 | 0.4525 | -| 8 | 0.4775 | -| 9 | 0.4450 | -| 10 | 0.4650 | -| 11 | 0.4700 | -| 12 | 0.4850 | -| 13 | 0.4750 | -| 14 | 0.4650 | -| 15 | 0.4625 | -| 16 | 0.4775 | -| 17 | 0.4650 | -| 18 | 0.4800 | -| 19 | 0.4700 | -| 20 | 0.4725 | - - -改变高斯分布距离,我使用以下参数生成高斯分布。 - - - label=0 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -10 & 2.1 \\\\ -2.1 & 12 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -20 & 25 -\end{array}\right] -\end{array} -$$ - - - label=1 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -23 & 0 \\\\ -0 & 22 -\end{array}\right] \\\\ 
-\mu=\left[\begin{array}{ll} -16 & -5 -\end{array}\right] -\end{array} -$$ - - - label=2 -$$ -\begin{array}{l} -\Sigma=\left[\begin{array}{cc} -10 & 5 \\\\ -5 & 10 -\end{array}\right] \\\\ -\mu=\left[\begin{array}{ll} -3 & 5 -\end{array}\right] -\end{array} -$$ - -这是我生成的训练集: - - -训练集 - - -这是我生成的测试集: - - -测试集 - - -可以通过如下表格来报告我的实验结果 - -Algo |kvalue|Acc | ------| ---- |---- | -KNN | 2 |0.9975 | - - -此时3个高斯分布距离较远,通过较少的k值即可得到较为准确的判断。增加高斯分布间的距离可以提升实验的准确性。 - -## 代码使用方法 - -```bash -改变mode数值: -mode=0 #数据生成 -mode=1 #数据可视化 -mode取非0-1值 #训练和测试 diff --git a/assignment-1/submission/19210680053/img/test 1.png b/assignment-1/submission/19210680053/img/test 1.png deleted file mode 100644 index bf515460fd3bf6e81d027117399749a3b10c29fe..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/19210680053/img/test 1.png and /dev/null differ diff --git a/assignment-1/submission/19210680053/img/test 2.png b/assignment-1/submission/19210680053/img/test 2.png deleted file mode 100644 index 1d962680d1019a7b4946d61b7a66ede507ad0d4c..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/19210680053/img/test 2.png and /dev/null differ diff --git a/assignment-1/submission/19210680053/img/test 3.png b/assignment-1/submission/19210680053/img/test 3.png deleted file mode 100644 index 3ab9d8b6157ed19597c283688c34daeef54beeeb..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/19210680053/img/test 3.png and /dev/null differ diff --git a/assignment-1/submission/19210680053/img/train 1.png b/assignment-1/submission/19210680053/img/train 1.png deleted file mode 100644 index dbe1db24a876a4b564d98b3009aefae717ba433c..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/19210680053/img/train 1.png and /dev/null differ diff --git a/assignment-1/submission/19210680053/img/train 2.png b/assignment-1/submission/19210680053/img/train 2.png deleted file mode 100644 index 
406126994e9ac71f4a43d6d182e72d88e4eaceed..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/19210680053/img/train 2.png and /dev/null differ diff --git a/assignment-1/submission/19210680053/img/train 3.png b/assignment-1/submission/19210680053/img/train 3.png deleted file mode 100644 index 761f9ee658095183c7c2a3925b6cbb9c51fde989..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/19210680053/img/train 3.png and /dev/null differ diff --git a/assignment-1/submission/19210680053/source.py b/assignment-1/submission/19210680053/source.py deleted file mode 100644 index 0f5e2424b154e74548445bef39f513dee6b40c94..0000000000000000000000000000000000000000 --- a/assignment-1/submission/19210680053/source.py +++ /dev/null @@ -1,103 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np - -class KNN(): - def euclidean(self,v1,v2): - return np.sqrt(np.sum(np.square(v1 - v2))) - def fit(self, X_train, Y_train): - self.train_data = train_data - self.train_label = train_label - def predict(self, train_data,k): - predictions = [] - for item in train_data: - label = self.closest(item,k) - predictions.append(label) - return predictions - - def closest(self, item,k): - min_ind = 0 - distlst=[] - idxlst=list(range(len(self.train_data))) - #get distance between test_data with train_data - for i in range(0,len(self.train_data)): - distlst.append(self.euclidean(item, self.train_data[i])) - #make up a dictionary with distance and index - distdict=dict(zip(idxlst,distlst)) - distdict=dict(sorted(distdict.items(),key=lambda item:item[1])) - #get first K nearest position - min_ind=list(dict(list(distdict.items())[:k]).keys()) - min_dist=[self.train_label[i] for i in min_ind] - return max(min_dist,key=min_dist.count) - - def choose(self,test_data,test_label): - acclst=[] - for k in range(2,7): - res=self.predict(test_data,k) - acc=np.mean(np.equal(res, test_label)) - acclst.append(acc) - max_acc=max(acclst) - 
max_k=acclst.index(max_acc)+2 - return max_k,max_acc - - -def generate(): - mean = (20, 25) - cov = np.array([[10,2.1], [2.1, 12]]) - x = np.random.multivariate_normal(mean, cov, (800,)) - - mean = (16, -5) - cov = np.array([[23, 0], [0, 22]]) - y = np.random.multivariate_normal(mean, cov, (200,)) - - mean = (3, 5) - cov = np.array([[10,5],[5,10]]) - z = np.random.multivariate_normal(mean, cov, (1000,)) - - idx = np.arange(2000) - np.random.shuffle(idx) - data = np.concatenate([x,y,z]) - label = np.concatenate([ - np.zeros((800,),dtype=int), - np.ones((200,),dtype=int), - np.ones((1000,),dtype=int)*2 - ]) - data = data[idx] - label = label[idx] - - train_data, test_data = data[:1600,], data[1600:,] - train_label, test_label = label[:1600,], label[1600:,] - np.save("data.npy",((train_data, train_label), (test_data, test_label) - )) - -def display(data, label, name): - datas =[[],[],[]] - for i in range(len(data)): - datas[label[i]].append(data[i]) - - for each in datas: - each = np.array(each) - plt.scatter(each[:, 0], each[:, 1]) - label=[str(i) for i in list(range(len(datas)))] - plt.legend(['label '+i for i in label]) - plt.show() - -def read(): - (train_data, train_label), (test_data, test_label) = np.load("data.npy",allow_pickle=True) - return (train_data, train_label), (test_data, test_label) - - -if __name__ == "__main__": - mode=0 - if mode == 0: - generate() - if mode == 1: - (train_data, train_label), (test_data, test_label) = read() - display(train_data, train_label, 'train') - display(test_data, test_label, 'test') - else: - (train_data, train_label), (test_data, test_label) = read() - - model = KNN() - model.fit(train_data, train_label) - k ,acc = model.choose(test_data,test_label) - print("k=",k,"acc=",acc*100,"%") \ No newline at end of file diff --git a/assignment-1/submission/19307110020/README.md b/assignment-1/submission/19307110020/README.md deleted file mode 100644 index 
187378e8d1e1fff3b396413ddc2e09588a505b13..0000000000000000000000000000000000000000 --- a/assignment-1/submission/19307110020/README.md +++ /dev/null @@ -1,142 +0,0 @@ -# PRML21春-HW1-KNN分类器 - -### 简介 - -实现KNN分类器并探索细节。本设计实现了一系列KNN算法的变种,包括: - -- 实现了数据的min-max标准化 -- 证明了标准化的有效性 - -- 自动寻找较优的K值 -- 实现了两种距离(欧氏距离/曼哈顿距离)的算法 -- 比较了两种距离尺度的特性 - -- 实现了对每个训练集中点的置信度的加权 -- 实现了对K个近邻点按距离的加权 -- 证明以上两种加权方式是有效的消融实验 -- 探索常规KNN算法对数据分布的响应 -- 验证了加权后的算法相对于常规算法对数据分布的鲁棒性 - - - -### 前言 - -本实验没有探索数据本身对于常规KNN算法的影响。 - -我认为“点集重叠越少表现越好”之类的结论是显而易见的,不希望在数据本身上花太多时间。 - -此外,我没有对数据可视化,因为数据的分布给定之后,大致可以预见其可视化结果。本设计中实现了许多有意义的改进,因此重点在于算法本身的探索上,与作业手册的第四条“修改数据集的属性(例如:不同高斯分布之间的距离),进行探究性实验”有出入,还望助教老师多多包涵~ - - - -### min-max标准化 - -#### 实现细节 - -对每一个channel,都实现了 - -`x=(x-(该channel中最小值))/((该channel中最大值)-(该channel中最小值))` - -的标准化,将所有的值映射到`[0,1]`上,并且会在训练集中记录训练集上每一channel的最大/最小值,并对测试集做同样的映射,以保证两者采用同一映射,如果对两个数据集分别归一化,将会有标准化方式不统一的问题。 - -#### 正确性 - -其带来的作用是显而易见的,在闵氏距离族中,不论采用何种距离度量,都会出现同一问题:两点之间的距离最主要受尺度最大的特征影响,这一显然的结论不需要实验便可以举出例子证明。 - -经典的波士顿房价预测问题(尽管其原型是回归问题,仍然可以用KNN算法分析房价等级,例如高、中、低等):“历年房价中位数”的尺度是万数量级的,而“交通便利指数”、“环保指数”、“与就业中心的距离”都是十分有用的特征,然而其尺度远比房价小! - -如果不加处理就直接进行KNN算法,显然其他特征全部被忽略,KNN算法将完全被历年房价主导!这是不合理的。在此处尺度最大的特征恰巧是相关性极强的,算法尚能工作,如果尺度最大的恰巧是相关性低的,则算法将完全失效! - - - -### 自动寻找K值 - -由于算法之间(自动寻找K值,置信度加权,距离加权)互相牵制,因此在搜索K值时只能采用常规KNN算法,本算法在`1~min(16,训练集样本个数)`之中搜索所有K,找到准确率最高的K值。 - -在示例程序中,将会自动选择K=1,在自动生成的样本中,参数如下: - -| Distribution | Mean | Cov | Number | -| ------------ | ------ | ----------------- | ------ | -| Class 0 | (1,2) | [[10, 0], [0, 2]] | 100 | -| Class 1 | (4,5) | [[7, 3], [15, 1]] | 100 | -| Class 2 | (-2,6) | [[0, 1], [1, 2]] | 100 | - -将会选择K=11. 
两者的寻找过程如下两图:![Figure_1](img/Figure_1.png) - -![Figure_2](img/Figure_2.png) - -### Euclidean & Manhattan - 两种距离尺度 - -本设计中实现了两种距离尺度中的KNN分类器。 - -曼哈顿距离对每一个channel都是独立的,欧氏距离对每一维度求偏导时都与当前距离相关,因此可以预见的是,在距离较远时,两者表现将接近,在距离较近的时候,需要通过实验探索两者的差异。 - -取两个分布方差矩阵均为单位阵,令其沿y=0.5x方向上其逐渐接近,其中一个分布的均值固定在`(0,0)`,另一分布均值分别为:`(0.1,0.2),(0.6,1.2),(1.1,2.2),(1.6,3.2),(2.1,4.2),(2.6,5.2)`。 - -可以看到如下的变化:![Figure_3](img/Figure_3.png) - -其中蓝线是欧几里得距离,绿线是曼哈顿距离,可见欧氏距离对两分布接近时处理的更好。 - -p.s. 以上的实验采用常规KNN算法。 - - - -### 置信度加权 - -在两个分布有较大的重叠时,训练集中的重叠点本身也不能通过KNN以很高的置信度判断自身的类别,对这类点,其置信度应该降低。 - -置信度的算法为:`K近邻点中与自身类别相同的点数/K`。 - -实验证明这样的改变是有益的。在如下分布中做消融实验: - -| Distribution | Mean | Cov | Number | -| ------------ | ------ | ----------------- | ------ | -| Class 0 | (1,2) | [[10, 0], [0, 2]] | 100 | -| Class 1 | (4,5) | [[7, 3], [15, 1]] | 100 | -| Class 2 | (-2,6) | [[0, 1], [1, 2]] | 100 | - -常规KNN算法:acc = 0.85 - -带置信度加权的KNN算法: 0.8666666666666667 - - - -### 距离加权 - -在一个K近邻范围内,有些点离待监测点较近,有些较远。有理由相信距离较近的点提供的置信度更少,因此需要实现距离加权。 - -距离加权的算法为:`每个近邻点投票的权值为:D-(该近邻点与待监测点的距离/近邻范围内最远点与待监测点的距离)`。其中D为超参。 - -实验得到,在如上分布中,D=1.8时表现最好,消融实验如下: - -常规KNN算法:acc = 0.85 - -带距离加权的KNN算法: 0.8666666666666667 - - - -### 联合加权 - -综合以上两种加权方式,此时需要引入第二个超参,该超参作用于置信度加权,并且对于仅采用置信度加权时,其不起作用,由于需要将两个计算得到的权重相乘,该超参用于调节两者起到的作用之比例。 - -联合加权算法为:`(K近邻点中与自身类别相同的点数/K-bias)*(D-(该近邻点与待监测点的距离/近邻范围内最远点与待监测点的距离))`。其中`bias`为引入的新超参,`D`为距离加权对应的超参。 - -实验证明,取`bias=0.2`,`D=2.1`时,可以达到较好的效果。 - -常规KNN算法:acc = 0.85 - -带联合加权的KNN算法:acc = 0.8833333333333333 - - - -### 对数据的鲁棒性 - -可以预见,当数据比较接近时,这样的加权会有更好的表现,因为其更好的考虑了模糊点置信程度较低的性质,并且利用了靠近待分类点与距离较远的点之间的差异。 - -下图为实验结果,其中蓝线为联合加权结果,绿线为常规KNN算法。 - -![Figure_4](img/Figure_4.png) - -### 总结 - -本设计探索了更高效的KNN算法,采用联合加权的方式对邻接范围内的点做不同的处理,取得了较好的效果。 \ No newline at end of file diff --git a/assignment-1/submission/19307110020/img/Figure_1.png b/assignment-1/submission/19307110020/img/Figure_1.png deleted file mode 100644 index b006d52cf0b89e9497b213ec044b0224a5a620a7..0000000000000000000000000000000000000000 Binary files 
a/assignment-1/submission/19307110020/img/Figure_1.png and /dev/null differ diff --git a/assignment-1/submission/19307110020/img/Figure_2.png b/assignment-1/submission/19307110020/img/Figure_2.png deleted file mode 100644 index d520d40d31644714b20789c236f9976818185cab..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/19307110020/img/Figure_2.png and /dev/null differ diff --git a/assignment-1/submission/19307110020/img/Figure_3.png b/assignment-1/submission/19307110020/img/Figure_3.png deleted file mode 100644 index c643350e0310bac079cdd179a01cff9371b27475..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/19307110020/img/Figure_3.png and /dev/null differ diff --git a/assignment-1/submission/19307110020/img/Figure_4.png b/assignment-1/submission/19307110020/img/Figure_4.png deleted file mode 100644 index 6af94b33a92ba072e05b09dc9eff0ff3c6bc0d08..0000000000000000000000000000000000000000 Binary files a/assignment-1/submission/19307110020/img/Figure_4.png and /dev/null differ diff --git a/assignment-1/submission/19307110020/source.py b/assignment-1/submission/19307110020/source.py deleted file mode 100644 index a231c833e05fe6cb7a43b4c7fdabab987978305a..0000000000000000000000000000000000000000 --- a/assignment-1/submission/19307110020/source.py +++ /dev/null @@ -1,125 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -class KNN: - def __init__(self): - pass - - def fit(self, train_data, train_label): - self.len=train_data.shape[0] - #standardize - train_data=train_data.astype(np.float64) - train_data=train_data.T - self.channel=train_data.shape[0] - self.mins=[] - self.maxs=[] - for data in train_data: - self.mins.append(np.min(data)) - self.maxs.append(np.max(data)) - for i in range(data.shape[0]): - data[i] = (data[i] - np.min(data)) / (np.max(data) - np.min(data)) - self.train_data=train_data.T - self.train_label=train_label - - #grid search for K - maxk, maxacc=0, 0 - for k in 
range(1,min(17,self.len)): - acc=0 - for d in range(self.len): - dists = [] - indexs = np.arange(self.len) - for i in range(self.len): - dists.append(self.euclidean(self.train_data[d], self.train_data[i])) - dic = dict(zip(indexs, dists)) - dic = dict(sorted(dic.items(), key=lambda item: item[1])) - min_indexs = list(dict(list(dic.items())[1:k+1]).keys()) - min_dists = [self.train_label[i] for i in min_indexs] - if max(min_dists, key=min_dists.count) == self.train_label[d]: - acc+=1 - if acc>maxacc: - maxk, maxacc=k, acc - self.K=maxk - - #credibility - self.cred=[] - for d in range(self.len): - dists = [] - indexs = np.arange(self.len) - for i in range(self.len): - dists.append(self.euclidean(self.train_data[d], self.train_data[i])) - dic = dict(zip(indexs, dists)) - dic = dict(sorted(dic.items(), key=lambda item: item[1])) - min_indexs = list(dict(list(dic.items())[1:self.K+1]).keys()) - min_dists = [self.train_label[i] for i in min_indexs] - self.cred.append(float(min_dists.count(max(min_dists, key=min_dists.count)))/self.K) - - def predict(self, test_data): - test_data=test_data.astype(np.float64) - test_data=test_data.T - for i in range(self.channel): - for j in range(test_data.shape[1]): - test_data[i][j] = (test_data[i][j] - self.mins[i]) / (self.maxs[i] - self.mins[i]) - test_data = test_data.T - ans=[] - for d in range(test_data.shape[0]): - dists = [] - indexs = np.arange(self.len) - for i in range(self.len): - dists.append(self.euclidean(test_data[d], self.train_data[i])) - dic = dict(zip(indexs, dists)) - dic = dict(sorted(dic.items(), key=lambda item: item[1])) - min_indexs = list(dict(list(dic.items())[:self.K]).keys()) - min_dict={} - for i in min_indexs: - min_dict[self.train_label[i]]=min_dict.get(self.train_label[i],0)+(self.cred[i]-0.2)*(2.1-dic[i]/list(dic.items())[self.K-1][1]) - ans.append(max(min_dict, key=lambda k: min_dict[k])) - return ans - - - def euclidean(self, a, b): - return np.sqrt(np.sum(np.square(a-b))) - - def manhattan(self, a, 
b): - return np.sum(abs(a-b)) - - -def dataset(mean1,mean2,cov1,cov2,mean3=None,cov3=None): - mean=mean1 - cov=cov1 - x=np.random.multivariate_normal(mean, cov, (100,)) - mean=mean2 - cov=cov2 - y=np.random.multivariate_normal(mean, cov, (100,)) - num=300 - if mean3 is not None and cov3 is not None: - mean=mean3 - cov=cov3 - z=np.random.multivariate_normal(mean, cov, (100,)) - idx=np.arange(num) - np.random.shuffle(idx) - if mean3 is not None and cov3 is not None: - data=np.concatenate([x, y, z]) - label=np.concatenate([np.zeros((100,), dtype=np.int8), np.ones((100,), dtype=np.int8), np.ones((100,), dtype=np.int8) * 2]) - else: - data=np.concatenate([x, y]) - label=np.concatenate([np.zeros((100,), dtype=np.int8), np.ones((100,), dtype=np.int8)]) - data=data[idx] - label=label[idx] - split=int(num*0.8) - train_data, test_data=data[:split,:], data[split:,:] - train_label, test_label=label[:split], label[split:] - np.save("train_data.npy",train_data) - np.save("test_data.npy",test_data) - np.save("train_label.npy",train_label) - np.save("test_label.npy",test_label) - -def dataload(): - train_data, train_label, test_data, test_label = np.load("train_data.npy",allow_pickle=True),np.load("train_label.npy",allow_pickle=True),np.load("test_data.npy",allow_pickle=True),np.load("test_label.npy",allow_pickle=True) - return train_data, train_label, test_data, test_label - -if __name__ == '__main__': - dataset((1,2),(4,5),np.array([[10, 0], [0, 2]],dtype=np.float64),np.array([[7, 3], [15, 1]],dtype=np.float64),(-2,6),np.array([[0, 1], [1, 2]],dtype=np.float64)) - train_data, train_label, test_data, test_label=dataload() - model=KNN() - model.fit(train_data,train_label) - res=model.predict(test_data) - print("acc =",np.mean(np.equal(res, test_label)))