diff --git a/assignment-1/submission/18307130074/img/1test.png b/assignment-1/submission/18307130074/img/1test.png
new file mode 100644
index 0000000000000000000000000000000000000000..3d3172d1f183248a2d7c4443ea469d574a48e337
Binary files /dev/null and b/assignment-1/submission/18307130074/img/1test.png differ
diff --git a/assignment-1/submission/18307130074/img/1train.png b/assignment-1/submission/18307130074/img/1train.png
new file mode 100644
index 0000000000000000000000000000000000000000..cde346b5c7c06d576910a6df2a66029b0670b6e3
Binary files /dev/null and b/assignment-1/submission/18307130074/img/1train.png differ
diff --git a/assignment-1/submission/18307130074/img/2test.png b/assignment-1/submission/18307130074/img/2test.png
new file mode 100644
index 0000000000000000000000000000000000000000..6deaec8880ed4997651cf23c37e9e7abece9516c
Binary files /dev/null and b/assignment-1/submission/18307130074/img/2test.png differ
diff --git a/assignment-1/submission/18307130074/img/2train.png b/assignment-1/submission/18307130074/img/2train.png
new file mode 100644
index 0000000000000000000000000000000000000000..643e46c2343e7743d6cd6bb2a94ac58eda4fd3fe
Binary files /dev/null and b/assignment-1/submission/18307130074/img/2train.png differ
diff --git a/assignment-1/submission/18307130074/img/3test.png b/assignment-1/submission/18307130074/img/3test.png
new file mode 100644
index 0000000000000000000000000000000000000000..5e75e15ce5ea8f09568b3138dac23866c2423701
Binary files /dev/null and b/assignment-1/submission/18307130074/img/3test.png differ
diff --git a/assignment-1/submission/18307130074/img/3train.png b/assignment-1/submission/18307130074/img/3train.png
new file mode 100644
index 0000000000000000000000000000000000000000..34be5d5a681d0fa8a5ff9186e9a8a08034d21f21
Binary files /dev/null and b/assignment-1/submission/18307130074/img/3train.png differ
diff --git a/assignment-1/submission/18307130074/img/4test.png b/assignment-1/submission/18307130074/img/4test.png
new file mode 100644
index 0000000000000000000000000000000000000000..e3a4f0b2caf50db06112042356688b28465efab6
Binary files /dev/null and b/assignment-1/submission/18307130074/img/4test.png differ
diff --git a/assignment-1/submission/18307130074/img/4train.png b/assignment-1/submission/18307130074/img/4train.png
new file mode 100644
index 0000000000000000000000000000000000000000..b58b79676726ad8b34e82a2b4e2f918d5fce2975
Binary files /dev/null and b/assignment-1/submission/18307130074/img/4train.png differ
diff --git a/assignment-1/submission/18307130074/img/5test.png b/assignment-1/submission/18307130074/img/5test.png
new file mode 100644
index 0000000000000000000000000000000000000000..365c0cef40fa9d80536197fbde693acb25cd552b
Binary files /dev/null and b/assignment-1/submission/18307130074/img/5test.png differ
diff --git a/assignment-1/submission/18307130074/img/5train.png b/assignment-1/submission/18307130074/img/5train.png
new file mode 100644
index 0000000000000000000000000000000000000000..266f3fa6ea5f349d8705e4a8f22f92e6f5c2600c
Binary files /dev/null and b/assignment-1/submission/18307130074/img/5train.png differ
diff --git a/assignment-1/submission/18307130074/readme.md b/assignment-1/submission/18307130074/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..d830733fb3c4e45ed7398fedde30c850d6366d10
--- /dev/null
+++ b/assignment-1/submission/18307130074/readme.md
@@ -0,0 +1,259 @@
+# KNN Classifier
+
+[TOC]
+
+
+
+## 1. Code Introduction
+
+所有代码都在**source.py**中展示
+
+引用库包括**numpy**&**matplotlib**
+
+```
+import numpy as np
+import matplotlib.pyplot as mp
+```
+
+**KNN**类包括如下几个函数**partition**, **distance**, **predict_label**, **fit**, **predict**
+
+首先需要初始化KNN类中的成员变量
+
+```
+def __init__(self):
+
+ self.train_data_num = 0
+ self.valid_data_num = 0
+ self.train_data_dimension = 0
+
+ self.train_data = None
+ self.valid_data = None
+ self.train_label = None
+ self.valid_label = None
+
+ self.K = 20
+```
+
+其中train_data_num表示训练集数量,valid_data_num表示验证集数量,train_data_dimension表示数据维度(本次实验默认使用二维数据),(train_data,valid_data,train_label,valid_label)分别表示(训练集数据,验证集数据,训练集标签,验证集标签),K表示在验证集上选出的最优k值(初始值为20)
+
+**partition**函数输入的**参数**包括:数据集和标签集,**作用**是:将输入的数据和标签按同一随机顺序打乱后返回;取前80%作为训练集、其余20%作为验证集的划分在**fit**函数中完成。
+
+**distance**函数输入的**参数**包括:点1,点2,计算距离的方式(默认为Euclid即欧几里得距离,另支持Manhattan即曼哈顿距离和upper即上确界距离),**作用**是:以给定的方式计算两个点之间的距离。其中欧几里得距离的实现省略了开平方,返回的是平方距离,这不影响近邻的排序
+
+**predict_label**函数输入的**参数**包括:k值,数据集1,标签集1,数据集2,计算距离的方式, **作用**是:给定k值,以数据集1和标签集1作为已知的点集,通过k近邻算法计算并返回数据集2所对应的标签集
+
+**fit**函数输入的**参数**包括:数据集和标签集,**作用**是:首先利用**partition**函数将数据集和标签集随机打乱,并按照8:2的比例分为训练集和验证集,然后枚举k的值(从1到19,且小于训练集大小),通过**predict_label**函数以训练集为已知点集预测验证集的标签,并将该结果与给定的验证集标签对比,得到准确率。选取准确率最高的k值作为模型的k值。
+
+**predict**函数输入的**参数**包括:数据集,**作用**是:将该数据集作为测试集,利用已经训练好的KNN模型返回测试集的标签,一般配合**fit**函数使用。
+
+**一些其他的函数**:
+
+1. **data_maker**函数输入的**参数**包括:id,平均值,协方差矩阵,数量,**作用**是:通过np.random.multivariate_normal函数以及给定的参数生成数据,并将数据分为训练集和测试集,通过np.save储存
+
+2. **data_reader**函数输入的**参数**包括:id,**作用**是:通过np.load读取**data_maker**生成的数据
+
+3. **data_painter**函数输入的**参数**包括:id,数据集,标签集,name,**作用**是:通过matplotlib.pyplot.scatter来画出数据的散点图,便于直观观察数据的分布
+
+## 2. Experiment
+
+### 1. Initial Data
+
+数据集参数
+$$
+\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}30&5 \\\\ 5&10\end{bmatrix},|x|=400\\\\
+\mu_y=[5,15],\Sigma_y=\begin{bmatrix}20&4 \\\\ 4&10\end{bmatrix},|y|=400\\\\
+\mu_z=[15,5],\Sigma_z=\begin{bmatrix}20&3 \\\\ 3&4\end{bmatrix},|z|=400
+$$
+训练集和测试集散点图
+
+
+
+
+
+
+实验结果(重复实验5次;train列为**fit**函数在验证集上得到的最优准确率,test列为测试集上的准确率,下同)
+
+| k | train | test |
+| ------ | ------ | ------ |
+| 18 | 0.8490 | 0.8667 |
+| 16 | 0.8333 | 0.8542 |
+| 5 | 0.8021 | 0.8458 |
+| 8 | 0.8646 | 0.8333 |
+| 10 | 0.8490 | 0.8542 |
+| 平均值 | 0.8396 | 0.8508 |
+
+### 2. Research
+
+#### 1.研究重叠度对准确率的影响
+
+##### 重叠度增大
+
+数据集参数
+$$
+\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}40&5 \\\\ 5&40\end{bmatrix},|x|=400\\\\
+\mu_y=[5,15],\Sigma_y=\begin{bmatrix}40&4 \\\\ 4&40\end{bmatrix},|y|=400\\\\
+\mu_z=[15,5],\Sigma_z=\begin{bmatrix}40&3 \\\\ 3&40\end{bmatrix},|z|=400
+$$
+训练集和测试集散点图
+
+
+
+
+
+
+实验结果(重复实验5次)
+
+| k | train | test |
+| ------ | ------ | ------ |
+| 19 | 0.6146 | 0.5917 |
+| 14 | 0.6510 | 0.5708 |
+| 18 | 0.5990 | 0.5875 |
+| 11 | 0.6563 | 0.5875 |
+| 6 | 0.6719 | 0.575 |
+| 平均值 | 0.6386 | 0.5825 |
+
+##### 重叠度减小
+
+数据集参数
+$$
+\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=400\\\\
+\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=400\\\\
+\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=400
+$$
+训练集和测试集散点图
+
+
+
+
+
+
+实验结果(重复实验5次)
+
+| k | train | test |
+| ------ | ------ | ------ |
+| 12 | 0.9271 | 0.8875 |
+| 7 | 0.9167 | 0.875 |
+| 6 | 0.9375 | 0.8625 |
+| 6 | 0.9271 | 0.8792 |
+| 10 | 0.9635 | 0.8792 |
+| 平均值 | 0.9344 | 0.8767 |
+
+##### 结论
+
+随着不同标签的点集重叠度增大,KNN分类器的准确率降低
+
+#### 2.研究点集数量对准确率的影响
+
+##### 数据减少
+
+数据集参数
+$$
+\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=200\\\\
+\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=200\\\\
+\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=200
+$$
+训练集和测试集散点图
+
+
+
+
+
+
+实验结果(重复实验5次)
+
+| k | train | test |
+| ------ | ------ | ------ |
+| 6 | 0.8958 | 0.9 |
+| 8 | 0.9479 | 0.8833 |
+| 18 | 0.9063 | 0.8917 |
+| 7 | 0.9688 | 0.8833 |
+| 14 | 0.875 | 0.8833 |
+| 平均值 | 0.9188 | 0.8883 |
+
+##### 数据增多
+
+数据集参数
+$$
+\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=800\\\\
+\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=800\\\\
+\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=800
+$$
+训练集和测试集散点图
+
+
+
+
+
+
+实验结果(重复实验5次)
+
+| k | train | test |
+| ------ | ------ | ------ |
+| 9 | 0.8958 | 0.9188 |
+| 9 | 0.9115 | 0.925 |
+| 18 | 0.9323 | 0.9083 |
+| 10 | 0.9193 | 0.9208 |
+| 18 | 0.9089 | 0.9188 |
+| 平均值 | 0.9136 | 0.9183 |
+
+##### 结论
+
+数据量可能影响KNN分类器的准确率,但在本实验中影响并不明显:每类800个点时测试集平均准确率最高(0.9183),而每类200个点(0.8883)与每类400个点(0.8767)之间并未呈现单调提升;总体上准确率随着数据的增多略有提高
+
+#### 3.研究计算距离的方式对准确率的影响
+
+##### 曼哈顿距离
+
+数据集参数
+$$
+\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=800\\\\
+\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=800\\\\
+\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=800
+$$
+训练集和测试集散点图
+
+
+
+
+
+
+实验结果(重复实验5次)
+
+| k | train | test |
+| ------ | ------ | ------ |
+| 17 | 0.8906 | 0.9167 |
+| 7 | 0.9010 | 0.9083 |
+| 13 | 0.9167 | 0.9271 |
+| 9 | 0.9115 | 0.9208 |
+| 17 | 0.9219 | 0.9292 |
+| 平均值 | 0.9083 | 0.9204 |
+
+##### 上确界距离
+
+数据集参数
+$$
+\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=800\\\\
+\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=800\\\\
+\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=800
+$$
+训练集和测试集散点图
+
+
+
+
+
+
+实验结果(重复实验5次)
+
+| k | train | test |
+| ------ | ------ | ------ |
+| 13 | 0.9036 | 0.9083 |
+| 17 | 0.9115 | 0.9167 |
+| 16 | 0.9089 | 0.9229 |
+| 9 | 0.9089 | 0.9125 |
+| 14 | 0.9167 | 0.9229 |
+| 平均值 | 0.9099 | 0.9167 |
+
+##### 结论
+
+三种计算距离的方式所得到的结果相差不多,说明在数据量足够大且分类明确的情况下,欧几里得距离、上确界距离、曼哈顿距离的效果相近
\ No newline at end of file
diff --git a/assignment-1/submission/18307130074/source.py b/assignment-1/submission/18307130074/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..78932f83ad277b432007e9c4bfa61bad89f6d81b
--- /dev/null
+++ b/assignment-1/submission/18307130074/source.py
@@ -0,0 +1,188 @@
+import numpy as np
+import matplotlib.pyplot as mp
+
+class KNN:
+
+ def __init__(self):
+
+ self.train_data_num = 0
+ self.valid_data_num = 0
+ self.train_data_dimension = 0
+
+ self.train_data = None
+ self.valid_data = None
+ self.train_label = None
+ self.valid_label = None
+
+ self.K = 20
+
+
+ def partition(self, data, label):
+
+ length = len(data)
+ list1 = []
+ list2 = []
+ list3 = []
+
+ for i in range(0, length):
+ list1.append([data[i], label[i]])
+
+ np.random.shuffle(list1)
+
+ for each in list1:
+ list2.append(each[0])
+ list3.append(each[1])
+
+ list2 = np.array(list2)
+ list3 = np.array(list3)
+
+ return list2, list3
+
+
+ def distance(self, s, t, p='Euclid'):
+ """
+ 表示两个点之间的距离,s和t的类型应该为
+ p表示距离的种类,目前已实现的有曼哈顿距离(Manhattan),欧几里得(Euclid), 上确界距离(upper)
+ """
+ ans = 0
+
+ if p == 'Euclid':
+ for each in range(self.train_data_dimension):
+ ans += (s[each] - t[each])**2
+
+ elif p == 'Manhattan':
+ for each in range(self.train_data_dimension):
+ ans += abs(s[each] - t[each])
+
+ elif p == 'upper':
+ for each in range(self.train_data_dimension):
+ ans = max(abs(s[each] - t[each]), ans)
+
+ return ans
+
+
+ def predict_label(self, k, data1, label1, data2, p='Euclid'):
+
+ result = []
+ for point in data2:
+
+ dist = []
+
+ for i in range(len(data1)):
+ dist.append(self.distance(point, data1[i], p))
+
+ dist = np.array(dist)
+ indices = np.argpartition(dist, k)[:k]
+
+ counter = {}
+ temp = []
+ maxc = 0
+
+ for each in indices:
+ if label1[int(each)] in counter:
+ counter[label1[int(each)]] += 1
+ else:
+ counter[label1[int(each)]] = 1
+
+ if maxc < counter[label1[int(each)]]:
+ maxc = counter[label1[int(each)]]
+
+ for each in counter:
+ if counter[each] == maxc:
+ temp.append(each)
+
+ result.append(np.random.choice(temp))
+
+ result = np.array(result)
+ return result
+
+
+ def fit(self, train_data, train_label):
+
+ # 确定训练集和验证集的数量和数据维度
+ self.valid_data_num = max(1, train_data.shape[0] // 5)
+ self.train_data_num = train_data.shape[0] - self.valid_data_num
+ self.train_data_dimension = train_data.shape[1]
+
+ # 将数据集随机划分
+ temp_data, temp_label = self.partition(train_data, train_label)
+ self.train_data, self.train_label = temp_data[:self.train_data_num], temp_label[:self.train_data_num]
+ self.valid_data, self.valid_label = temp_data[self.train_data_num:], temp_label[self.train_data_num:]
+
+ max_re, best_k, max_k = 0, 0, min(20, self.train_data_num)
+
+ for i in range(1, max_k):
+ result = self.predict_label(i, self.train_data, self.train_label, self.valid_data)
+ re = np.mean(np.equal(result, self.valid_label))
+ if re > max_re:
+ max_re = re
+ best_k = i
+
+ self.K = best_k
+ print("acc =",max_re)
+ print("k=", best_k)
+
+
+ def predict(self, test_data):
+ return self.predict_label(self.K, self.train_data, self.train_label, test_data)
+
+
+def data_maker(id, mean, cov, num):
+
+ data = None
+ label = None
+
+ data = np.concatenate([np.random.multivariate_normal(mean[i], cov[i], (num[i],)) for i in range(3)])
+ label = np.concatenate([np.ones((num[i],), dtype=int) * i for i in range(3)])
+
+ length = len(data)
+ test_data_num = length // 5
+
+ data, label = KNN().partition(data, label)
+
+ test_data = data[:test_data_num]
+ test_label = label[:test_data_num]
+ train_data = data[test_data_num:]
+ train_label = label[test_data_num:]
+
+ np.save(str(id) + 'data.npy', (train_data, train_label, test_data, test_label))
+
+
+def data_reader(id):
+ return np.load(str(id) + 'data.npy', allow_pickle=True)
+
+
+def data_painter(id, data, label, name):
+
+ length = len(data)
+ painter = {}
+
+ for i in range(length):
+ if label[i] in painter:
+ painter[label[i]].append(data[i])
+ else:
+ painter[label[i]] = [data[i]]
+
+ for each in painter:
+ temp = np.array(painter[each])
+ mp.scatter(temp[:, 0], temp[:, 1])
+
+ mp.savefig(f'./{str(id) + name}')
+ mp.show()
+ mp.close()
+
+
+if __name__ == "__main__":
+ mean = [(10, 10), (5, 15), (15, 5)]
+ cov = [np.array([[10, 5], [5, 10]]), np.array([[10, 4], [4, 10]]), np.array([[10, 3], [3, 10]])]
+ num = [800, 800, 800]
+ id = 5
+ # data_maker(id, mean, cov, num)
+ train_data, train_label, test_data, test_label = data_reader(id)
+ # data_painter(id, train_data, train_label, 'train')
+ # data_painter(id, test_data, test_label, 'test')
+ model = KNN()
+ for i in range(5):
+ model.fit(train_data, train_label)
+ res = model.predict(test_data)
+ print("acc =",np.mean(np.equal(res, test_label)))
\ No newline at end of file