diff --git a/assignment-1/submission/17307130133/README.md b/assignment-1/submission/17307130133/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b4ef5ec932b48a64641332fa51b023e214123724
--- /dev/null
+++ b/assignment-1/submission/17307130133/README.md
@@ -0,0 +1,129 @@
+# Course Report
+
+## KNN Implementation
+
+### Computing Distances
+
+Assume a test matrix $P$ of size $M\times D$ and a training matrix $C$ of size $N\times D$, where $D$ is the dimension of the data. Let $P_i$ denote the $i$-th row of $P$ and $C_j$ the $j$-th row of $C$. The distance between $P_i$ and $C_j$ is
+
+$$
+\begin{array}{l}
+d(P_i,C_j) = \sqrt{\sum_{k=1}^{D}(P_{ik}-C_{jk})^2}\\\\
+=\sqrt{\sum_{k=1}^{D}P_{ik}^2+\sum_{k=1}^{D}C_{jk}^2-2\sum_{k=1}^{D}P_{ik}C_{jk}}\\\\
+=\sqrt{||P_i||^2+||C_j||^2-2P_iC_j^T}
+\end{array}
+$$
+
+so the whole distance matrix can be computed using matrix operations alone.
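+
+A minimal NumPy sketch of this identity (it mirrors `compute_distances` in `source.py` below; the name `pairwise_dist` is only illustrative):
+
+```python
+import numpy as np
+
+def pairwise_dist(P, C):
+    # ||P_i||^2 as an (M, 1) column and ||C_j||^2 as a (1, N) row;
+    # broadcasting expands their sum to the full (M, N) matrix
+    p2 = np.sum(P ** 2, axis=1, keepdims=True)
+    c2 = np.sum(C ** 2, axis=1, keepdims=True).T
+    # clamp tiny negatives caused by floating-point cancellation before sqrt
+    return np.sqrt(np.maximum(p2 + c2 - 2 * P @ C.T, 0.0))
+```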
+
+### Data Preprocessing: Normalization
+
+Normalizing the data to [0, 1] performed poorly on datasets drawn from normal distributions (see the experiments below). The final code implements normalization (in `normalize`) but does not apply it.
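+
+The normalization in question is plain min-max scaling of the whole array, as in `normalize` from `source.py`:
+
+```python
+import numpy as np
+
+def normalize(data):
+    # min-max scaling of the whole array (not per-feature) into [0, 1]
+    return (data - np.min(data)) / (np.max(data) - np.min(data))
+```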
+
+### Choosing k
+
+In the `fit` function, k takes the best value in [2, 6]. `fit` first splits `train_data` into `train_data` and `dev_data` at a 4:1 ratio (the code keeps the first 80% for training); it then computes the distance matrix; finally it sweeps k over [2, 6] and keeps the k that achieves the highest accuracy on `dev_data` for the final prediction. A condensed sketch follows.
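+
+This is paraphrased from `fit`; `compute_distances` and `get_label` are the helpers defined in `source.py`:
+
+```python
+cut = len(train_data) // 5 * 4              # 4:1 train/dev split
+tr, dev = train_data[:cut], train_data[cut:]
+tr_y, dev_y = train_label[:cut], train_label[cut:]
+dists = compute_distances(tr, dev)          # (num_dev, num_train) distances
+best_k = max(range(2, 7),                   # k in [2, 6]
+             key=lambda k: np.mean(get_label(dists, tr_y, k) == dev_y))
+```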
+
+## Experiments
+
+### Experiment 1: Normalization
+
+This experiment investigates the effect of data normalization on KNN.
+
+Several groups of data were tried; only when the data-generation parameters were
+
+$$
+\begin{array}{lll}
+\Sigma_1=\left[\begin{array}{cc} 3 & 0 \\\\ 0 & 70 \end{array}\right] \qquad &
+\Sigma_2=\left[\begin{array}{cc} 4 & 0 \\\\ 0 & 65 \end{array}\right] \qquad &
+\Sigma_3=\left[\begin{array}{cc} 2 & 0 \\\\ 0 & 75 \end{array}\right] \\\\
+\mu_1=\left[\begin{array}{ll} 4 & -20 \end{array}\right] \qquad &
+\mu_2=\left[\begin{array}{ll} 5 & 0 \end{array}\right] \qquad &
+\mu_3=\left[\begin{array}{ll} 6 & 20 \end{array}\right]
+\end{array}
+$$
+
+did normalization give better results.
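+
+For reference, one class of such a dataset would be drawn as below; the per-class sample count of 400 is an assumption, since the report does not state it for this experiment:
+
+```python
+import numpy as np
+
+# class 1 of experiment 1 (mu_1, Sigma_1); 400 points is illustrative
+x1 = np.random.multivariate_normal([4, -20], [[3, 0], [0, 70]], 400)
+```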
+
+Training set:
+
+![training set 1](img/train_1.png)
+
+Test set:
+
+![test set 1](img/test_1.png)
+
+| k | 2 | 3 | 4 | 5 | 6 |
+| --- | --- | --- | --- | --- | --- |
+| dev accuracy, without normalization | 87.81% | 91.88% | 91.25% | 91.25% | 91.25% |
+| dev accuracy, with normalization | 87.81% | 91.88% | 91.25% | 91.25% | 91.25% |
+
+The best k is 3 in both cases. Without normalization, accuracy on the test set is 88.25%; with normalization, it is 89.25%.
+
+On the other datasets generated from normal distributions, accuracy was always higher without normalization. In the example above, normalization improved accuracy by only 1%, whereas on the other datasets skipping normalization helped by far more, so the final code does not normalize the data. Throughout this series of experiments, normalization never changed the best value of k.
+
+Conclusions: first, normalization is not a good fit for KNN classification on normally distributed data; second, normalization does not affect the best value of k.
+
+### Experiment 2: Varying the Distance Between Distributions
+
+Two normal distributions are used to study how the distance between Gaussians affects KNN. First, the covariance matrices are held fixed while the distance between the means is varied; a sketch of the protocol follows.
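+
+The exact means and covariances of these runs are not listed in the report; the sketch below only illustrates the protocol, with made-up separations and a fixed covariance:
+
+```python
+import numpy as np
+
+cov = [[10, 0], [0, 10]]            # hypothetical fixed covariance
+for d in (30, 25, 20, 15, 10, 5):   # hypothetical mean separations
+    a = np.random.multivariate_normal([0, 0], cov, 400)
+    b = np.random.multivariate_normal([0, d], cov, 400)
+    # ...split, fit, and evaluate as in source.py
+```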
+
+Training set:
+
+![training set 2](img/train_2.jpg)
+
+Test set:
+
+![test set 2](img/test_2.jpg)
+
+| Run | 1 | 2 | 3 | 4 | 5 | 6 |
+| --- | --- | --- | --- | --- | --- | --- |
+| Accuracy | 97.75% | 98.25% | 98.50% | 92.25% | 87.75% | 85.00% |
+| Best k | 2 | 3 | 5 | 5 | 5 | 6 |
+
+Accuracy starts to drop once the two distributions' point clouds begin to overlap: the larger the overlap, the lower the accuracy, and the best k grows accordingly.
+
+Next, the distance between the means is held fixed and only the covariance matrices are varied.
+
+Training set:
+
+![training set 3](img/train_3.jpg)
+
+Test set:
+
+![test set 3](img/test_3.jpg)
+
+| Run | 1 | 2 | 3 | 4 |
+| --- | --- | --- | --- | --- |
+| Accuracy | 96.75% | 96.25% | 95.00% | 92.50% |
+| Best k | 5 | 5 | 6 | 3 |
+
+Likewise, accuracy decreases as the distributions overlap.
+
+## Usage
+
+```bash
+python source.py g  # generate the dataset
+python source.py d  # visualize the dataset
+python source.py    # train and test
+```
+
+## References
+
+Distance computation: https://blog.csdn.net/IT_forlearn/article/details/100022244
+
diff --git a/assignment-1/submission/17307130133/img/test_1.png b/assignment-1/submission/17307130133/img/test_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b3ef8c56c035e5122cdfacd83b9c436051d4b02
Binary files /dev/null and b/assignment-1/submission/17307130133/img/test_1.png differ
diff --git a/assignment-1/submission/17307130133/img/test_2.jpg b/assignment-1/submission/17307130133/img/test_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3ceff6d6091c5d283fcba0023e053e079e4720d0
Binary files /dev/null and b/assignment-1/submission/17307130133/img/test_2.jpg differ
diff --git a/assignment-1/submission/17307130133/img/test_3.jpg b/assignment-1/submission/17307130133/img/test_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..784506a6a23f0fcfaac61f422d03f0de9a4aab95
Binary files /dev/null and b/assignment-1/submission/17307130133/img/test_3.jpg differ
diff --git a/assignment-1/submission/17307130133/img/train_1.png b/assignment-1/submission/17307130133/img/train_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..fab155cdf1d85d91e888e28a678ee2dc11d63d68
Binary files /dev/null and b/assignment-1/submission/17307130133/img/train_1.png differ
diff --git a/assignment-1/submission/17307130133/img/train_2.jpg b/assignment-1/submission/17307130133/img/train_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b723ef988fb70cc8f5efe0dd7e502798135578e0
Binary files /dev/null and b/assignment-1/submission/17307130133/img/train_2.jpg differ
diff --git a/assignment-1/submission/17307130133/img/train_3.jpg b/assignment-1/submission/17307130133/img/train_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5ba83b8810ff506b4eae19ce531494986142458d
Binary files /dev/null and b/assignment-1/submission/17307130133/img/train_3.jpg differ
diff --git a/assignment-1/submission/17307130133/source.py b/assignment-1/submission/17307130133/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc9938a32ac6c410feff6fb95204e0861363809d
--- /dev/null
+++ b/assignment-1/submission/17307130133/source.py
@@ -0,0 +1,132 @@
+import sys
+import numpy as np
+import matplotlib.pyplot as plt
+
+class KNN:
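+    """KNN classifier; fit() selects the best k in [2, 6] on a held-out dev split."""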
+
+ def __init__(self):
+ self.train_data = None
+ self.train_label = None
+ self.k = 2
+
+ def fit(self, train_data, train_label):
+ # train_data = self.normalize(train_data)
+ self.train_data = train_data
+ self.train_label = train_label
+
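+        # hold out the last 20% of the training data as a dev split (4:1)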
+ N = train_data.shape[0]
+ cut = N // 5 * 4
+        train_data, dev_data = train_data[:cut], train_data[cut:]
+        train_label, dev_label = train_label[:cut], train_label[cut:]
+
+ dists = self.compute_distances(train_data, dev_data)
+
+ max_acc = 0
+ max_acc_k = 2
+ for k in range(2,7):
+ res = self.get_label(dists, train_label, k)
+ acc = np.mean(np.equal(res, dev_label))
+ print("k = %d, acc = %f" % (k, acc))
+ if acc > max_acc:
+ max_acc = acc
+ max_acc_k = k
+ print("best k = %d" % max_acc_k)
+ self.k = max_acc_k
+
+ def predict(self, test_data):
+ # test_data = self.normalize(test_data)
+ dists = self.compute_distances(self.train_data, test_data)
+ return self.get_label(dists, self.train_label, self.k)
+
+ def compute_distances(self, train_data, test_data):
+ num_train = train_data.shape[0]
+ num_test = test_data.shape[0]
+        # d(x, c)^2 = ||x||^2 + ||c||^2 - 2 x.c, evaluated for all pairs at once
+        train_square = np.sum(np.square(train_data), axis=1).reshape(1, num_train)
+        test_square = np.sum(np.square(test_data), axis=1).reshape(num_test, 1)
+        # clamp tiny negatives from floating-point cancellation before the sqrt
+        cross = np.dot(test_data, train_data.T)
+        dists = np.sqrt(np.maximum(train_square + test_square - 2 * cross, 0))
+
+ return dists
+
+ def get_label(self, dists, train_label, k):
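+        # majority vote over the k nearest training labels; max(..., key=count)
+        # breaks ties in favor of the label encountered first, i.e. the nearer one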
+ num_test = dists.shape[0]
+ y_predict = np.zeros(num_test, dtype=train_label.dtype)
+ for i in range(num_test):
+ closest_y = list(train_label[np.argsort(dists[i,:])[:k]])
+ y_predict[i] = max(closest_y, key = closest_y.count)
+ return y_predict
+
+ def normalize(self, data):
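+        # min-max scaling of the whole array (not per-feature) into [0, 1]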
+ if len(data) == 0:
+ return data
+ return (data - np.min(data)) / (np.max(data) - np.min(data))
+
+
+def generate():
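+    # sample two 2-D Gaussian classes, shuffle them together,
+    # and save an 80/20 train/test split to data.npy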
+ # mean = (1, 2)
+ # cov = np.array([[73, 0], [0, 22]])
+ mean = (-17, 2)
+ cov = np.array([[103, 0],[0, 22]])
+ x = np.random.multivariate_normal(mean, cov, (1200,))
+
+ # mean = (16, -5)
+ # cov = np.array([[21.2, 0], [0, 32.1]])
+ mean = (10, -5)
+ cov = np.array([[101.2, 0],[0, 32.1]])
+ y = np.random.multivariate_normal(mean, cov, (800,))
+
+ # mean = (10, 22)
+ # cov = np.array([[10,5],[5,10]])
+ # z = np.random.multivariate_normal(mean, cov, (1000,))
+
+ idx = np.arange(2000)
+ np.random.shuffle(idx)
+ # data = np.concatenate([x,y,z])
+ data = np.concatenate([x,y])
+ label = np.concatenate([
+ # np.zeros((800,),dtype=int),
+ # np.ones((200,),dtype=int),
+ # np.ones((1000,),dtype=int)*2
+ np.zeros((1200,),dtype=int),
+ np.ones((800,),dtype=int),
+ ])
+ data = data[idx]
+ label = label[idx]
+
+    train_data, test_data = data[:1600], data[1600:]
+    train_label, test_label = label[:1600], label[1600:]
+    # dtype=object is needed because the nested tuple holds arrays of different shapes
+    np.save("data.npy", np.array((
+        (train_data, train_label), (test_data, test_label)
+    ), dtype=object))
+
+def read():
+ (train_data, train_label), (test_data, test_label) = np.load("data.npy", allow_pickle=True)
+ return (train_data, train_label), (test_data, test_label)
+
+def display(data, label, name):
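+    # group the points by label, scatter each class, and save the figure under img/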
+ # datas = [[], [], []]
+ datas = [[], []]
+ for i in range(len(data)):
+ datas[label[i]].append(data[i])
+
+ for each in datas:
+ each = np.array(each)
+ plt.scatter(each[:, 0], each[:, 1])
+ plt.savefig(f'img/{name}')
+ plt.show()
+
+if __name__ == "__main__":
+ if len(sys.argv) > 1 and sys.argv[1] == "g":
+ generate()
+ elif len(sys.argv) > 1 and sys.argv[1] == "d":
+ (train_data, train_label), (test_data, test_label) = read()
+ display(train_data, train_label, 'train')
+ display(test_data, test_label, 'test')
+ else:
+ (train_data, train_label), (test_data, test_label) = read()
+
+ model = KNN()
+ model.fit(train_data, train_label)
+ res = model.predict(test_data)
+ print("acc =",np.mean(np.equal(res, test_label)))