diff --git a/assignment-1/submission/17307130133/README.md b/assignment-1/submission/17307130133/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b4ef5ec932b48a64641332fa51b023e214123724
--- /dev/null
+++ b/assignment-1/submission/17307130133/README.md
@@ -0,0 +1,129 @@
+# Course Report
+
+## KNN Implementation
+
+### Distance Computation
+
+Assume a test matrix $P$ of shape $M \times D$ and a training matrix $C$ of shape $N \times D$, where $D$ is the dimension of the data. Let $P_i$ denote the $i$-th row of $P$ and $C_j$ the $j$-th row of $C$. The distance between $P_i$ and $C_j$ is
+
+$$
+d(P_i,C_j) = \sqrt{\sum_{k=1}^{D}(P_{ik}-C_{jk})^2}
+= \sqrt{\sum_{k=1}^{D}P_{ik}^2 + \sum_{k=1}^{D}C_{jk}^2 - 2\sum_{k=1}^{D}P_{ik}C_{jk}}
+= \sqrt{\|P_i\|^2 + \|C_j\|^2 - 2P_iC_j^T}
+$$
+
+so the whole distance matrix can be computed with matrix operations alone, without an explicit loop over pairs.
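+
+As an illustration, here is a minimal NumPy sketch of this vectorized computation (the names `pairwise_distances`, `P`, and `C` are illustrative, not taken from `source.py`; `compute_distances` there applies the same identity):
+
+```python
+import numpy as np
+
+def pairwise_distances(P, C):
+    # ||P_i||^2 as an (M, 1) column and ||C_j||^2 as a (1, N) row
+    P_sq = np.sum(P ** 2, axis=1)[:, np.newaxis]
+    C_sq = np.sum(C ** 2, axis=1)[np.newaxis, :]
+    # broadcasting yields all M x N squared distances in one step
+    sq = P_sq + C_sq - 2 * P @ C.T
+    # clip tiny negatives caused by floating-point error before the sqrt
+    return np.sqrt(np.maximum(sq, 0))
+```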
+
+### Data Preprocessing: Normalization
+
+Normalizing the data to [0, 1] performed poorly when tested on datasets drawn from normal distributions (see the experiments section). The final code implements normalization (in `normalize`) but does not apply it.
+
+### Choosing k
+
+In the `fit` function, the best k is chosen from [2, 6]. `fit` first splits `train_data` into a training part and a development part at a ratio of 4:1, then computes the distance matrix, and finally iterates k over [2, 6], keeping the value that achieves the highest accuracy on the development data for the final prediction.
+
+## Experiments
+
+### Experiment 1: Normalization
+
+This experiment investigates the effect of normalizing the data in KNN.
+
+Several groups of data were tested; only with the following test data parameters did normalization give better results:
+
+$$
+\Sigma_1=\begin{bmatrix}3 & 0\\0 & 70\end{bmatrix}\qquad
+\Sigma_2=\begin{bmatrix}4 & 0\\0 & 65\end{bmatrix}\qquad
+\Sigma_3=\begin{bmatrix}2 & 0\\0 & 75\end{bmatrix}
+$$
+
+$$
+\mu_1=\begin{bmatrix}4 & -20\end{bmatrix}\qquad
+\mu_2=\begin{bmatrix}5 & 0\end{bmatrix}\qquad
+\mu_3=\begin{bmatrix}6 & 20\end{bmatrix}
+$$
+
+Training set:
+
+![train_1](./img/train_1.png)
+
+Test set:
+
+![test_1](./img/test_1.png)
+
+| k                             | 2      | 3      | 4      | 5      | 6      |
+| ----------------------------- | ------ | ------ | ------ | ------ | ------ |
+| acc_dev without normalization | 87.81% | 91.88% | 91.25% | 91.25% | 91.25% |
+| acc_dev with normalization    | 87.81% | 91.88% | 91.25% | 91.25% | 91.25% |
+
+The best k is 3 in both cases. Without normalization, the accuracy on the test set is 88.25%; with normalization, it is 89.25%.
+
+On every other dataset generated from normal distributions, accuracy was higher without normalization. In the example above, normalization improves accuracy by only 1%, whereas on the other datasets the gain from skipping normalization is much larger. The final code therefore does not normalize the data. Throughout this series of experiments, normalization never changed the best value of k.
+
+Conclusions: first, min-max normalization is not a good fit for KNN classification on normally distributed data; second, normalization does not affect the best k.
+
+### Experiment 2: Varying the Distance Between Distributions
+
+Two normal distributions are used to study the effect of the distance between Gaussian distributions. First, the covariance matrices are held fixed while the distance between the means is varied.
+
+Training set:
+
+![train_2](./img/train_2.jpg)
+
+Test set:
+
+![test_2](./img/test_2.jpg)
+
+| Case     | 1      | 2      | 3      | 4      | 5      | 6      |
+| -------- | ------ | ------ | ------ | ------ | ------ | ------ |
+| Accuracy | 97.75% | 98.25% | 98.50% | 92.25% | 87.75% | 85.00% |
+| Best k   | 2      | 3      | 5      | 5      | 5      | 6      |
+
+Once the ranges of the two distributions begin to overlap, accuracy starts to drop: the larger the overlap, the lower the accuracy, and the best k grows correspondingly.
+
+Next, the distance between the means is held fixed while only the covariance matrices are varied.
+
+Training set:
+
+![train_3](./img/train_3.jpg)
+
+Test set:
+
+![test_3](./img/test_3.jpg)
+
+| Case     | 1      | 2      | 3      | 4      |
+| -------- | ------ | ------ | ------ | ------ |
+| Accuracy | 96.75% | 96.25% | 95.00% | 92.50% |
+| Best k   | 5      | 5      | 6      | 3      |
+
+Similarly, accuracy decreases as the distributions overlap.
+
+## Usage
+
+```bash
+python source.py g  # generate the dataset
+python source.py d  # display the dataset
+python source.py    # train and test
+```
+
+## References
+
+Distance computation: https://blog.csdn.net/IT_forlearn/article/details/100022244?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.control&dist_request_id=&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.control
diff --git a/assignment-1/submission/17307130133/img/test_1.png b/assignment-1/submission/17307130133/img/test_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b3ef8c56c035e5122cdfacd83b9c436051d4b02
Binary files /dev/null and b/assignment-1/submission/17307130133/img/test_1.png differ
diff --git a/assignment-1/submission/17307130133/img/test_2.jpg b/assignment-1/submission/17307130133/img/test_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3ceff6d6091c5d283fcba0023e053e079e4720d0
Binary files /dev/null and b/assignment-1/submission/17307130133/img/test_2.jpg differ
diff --git a/assignment-1/submission/17307130133/img/test_3.jpg b/assignment-1/submission/17307130133/img/test_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..784506a6a23f0fcfaac61f422d03f0de9a4aab95
Binary files /dev/null and b/assignment-1/submission/17307130133/img/test_3.jpg differ
diff --git a/assignment-1/submission/17307130133/img/train_1.png b/assignment-1/submission/17307130133/img/train_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..fab155cdf1d85d91e888e28a678ee2dc11d63d68
Binary files /dev/null and b/assignment-1/submission/17307130133/img/train_1.png differ
diff --git a/assignment-1/submission/17307130133/img/train_2.jpg b/assignment-1/submission/17307130133/img/train_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b723ef988fb70cc8f5efe0dd7e502798135578e0
Binary files /dev/null and b/assignment-1/submission/17307130133/img/train_2.jpg differ
diff --git a/assignment-1/submission/17307130133/img/train_3.jpg b/assignment-1/submission/17307130133/img/train_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5ba83b8810ff506b4eae19ce531494986142458d
Binary files /dev/null and b/assignment-1/submission/17307130133/img/train_3.jpg differ
diff --git a/assignment-1/submission/17307130133/source.py b/assignment-1/submission/17307130133/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc9938a32ac6c410feff6fb95204e0861363809d
--- /dev/null
+++ b/assignment-1/submission/17307130133/source.py
@@ -0,0 +1,132 @@
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+class KNN:
+
+    def __init__(self):
+        self.train_data = None
+        self.train_label = None
+        self.k = 2
+
+    def fit(self, train_data, train_label):
+        # train_data = self.normalize(train_data)
+        self.train_data = train_data
+        self.train_label = train_label
+
+        # hold out the last 20% of the training data as a dev set (4:1 split)
+        N = train_data.shape[0]
+        cut = N // 5 * 4
+        train_data, dev_data = train_data[:cut], train_data[cut:]
+        train_label, dev_label = train_label[:cut], train_label[cut:]
+
+        dists = self.compute_distances(train_data, dev_data)
+
+        # keep the k in [2, 6] with the highest dev accuracy
+        max_acc = 0
+        max_acc_k = 2
+        for k in range(2, 7):
+            res = self.get_label(dists, train_label, k)
+            acc = np.mean(np.equal(res, dev_label))
+            print("k = %d, acc = %f" % (k, acc))
+            if acc > max_acc:
+                max_acc = acc
+                max_acc_k = k
+        print("best k = %d" % max_acc_k)
+        self.k = max_acc_k
+
+    def predict(self, test_data):
+        # test_data = self.normalize(test_data)
+        dists = self.compute_distances(self.train_data, test_data)
+        return self.get_label(dists, self.train_label, self.k)
+
+    def compute_distances(self, train_data, test_data):
+        # d(x, y)^2 = ||x||^2 + ||y||^2 - 2 x.y, evaluated for all pairs at once
+        num_train = train_data.shape[0]
+        num_test = test_data.shape[0]
+        train_square = np.sum(np.square(train_data), axis=1).reshape(1, num_train)
+        test_square = np.sum(np.square(test_data), axis=1).reshape(num_test, 1)
+        squared = train_square + test_square - 2 * np.dot(test_data, train_data.T)
+        # clip tiny negatives caused by floating-point error before the sqrt
+        dists = np.sqrt(np.maximum(squared, 0))
+        return dists
+
+    def get_label(self, dists, train_label, k):
+        num_test = dists.shape[0]
+        y_predict = np.zeros(num_test, dtype=train_label.dtype)
+        for i in range(num_test):
+            # labels of the k nearest training points
+            closest_y = list(train_label[np.argsort(dists[i, :])[:k]])
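+            # majority vote: the most frequent label among the k neighbors wins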
+            y_predict[i] = max(closest_y, key=closest_y.count)
+        return y_predict
+
+    def normalize(self, data):
+        # global min-max scaling of all entries to [0, 1] (implemented but unused)
+        if len(data) == 0:
+            return data
+        return (data - np.min(data)) / (np.max(data) - np.min(data))
+
+
+def generate():
+    # mean = (1, 2)
+    # cov = np.array([[73, 0], [0, 22]])
+    mean = (-17, 2)
+    cov = np.array([[103, 0], [0, 22]])
+    x = np.random.multivariate_normal(mean, cov, (1200,))
+
+    # mean = (16, -5)
+    # cov = np.array([[21.2, 0], [0, 32.1]])
+    mean = (10, -5)
+    cov = np.array([[101.2, 0], [0, 32.1]])
+    y = np.random.multivariate_normal(mean, cov, (800,))
+
+    # mean = (10, 22)
+    # cov = np.array([[10, 5], [5, 10]])
+    # z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    # shuffle both classes together before the train/test split
+    idx = np.arange(2000)
+    np.random.shuffle(idx)
+    # data = np.concatenate([x, y, z])
+    data = np.concatenate([x, y])
+    label = np.concatenate([
+        # np.zeros((800,), dtype=int),
+        # np.ones((200,), dtype=int),
+        # np.ones((1000,), dtype=int) * 2
+        np.zeros((1200,), dtype=int),
+        np.ones((800,), dtype=int),
+    ])
+    data = data[idx]
+    label = label[idx]
+
+    train_data, test_data = data[:1600], data[1600:]
+    train_label, test_label = label[:1600], label[1600:]
+    # wrap in a dtype=object array so the ragged tuple can be pickled by np.save
+    np.save("data.npy", np.array(
+        ((train_data, train_label), (test_data, test_label)),
+        dtype=object,
+    ))
+
+
+def read():
+    (train_data, train_label), (test_data, test_label) = np.load("data.npy", allow_pickle=True)
+    return (train_data, train_label), (test_data, test_label)
+
+
+def display(data, label, name):
+    # group the points by class label
+    # datas = [[], [], []]
+    datas = [[], []]
+    for i in range(len(data)):
+        datas[label[i]].append(data[i])
+
+    for each in datas:
+        each = np.array(each)
+        plt.scatter(each[:, 0], each[:, 1])
+    plt.savefig(f'img/{name}')
+    plt.show()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "g":
+        generate()
+    elif len(sys.argv) > 1 and sys.argv[1] == "d":
+        (train_data, train_label), (test_data, test_label) = read()
+        display(train_data, train_label, 'train')
+        display(test_data, test_label, 'test')
+    else:
+        (train_data, train_label), (test_data, test_label) = read()
+
+        model = KNN()
+        model.fit(train_data, train_label)
+        res = model.predict(test_data)
+        print("acc =", np.mean(np.equal(res, test_label)))