diff --git a/assignment-1/submission/19210680053/README.md b/assignment-1/submission/19210680053/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6ae1a49f48c030f79bcc25f37bd717d6fe307c48
--- /dev/null
+++ b/assignment-1/submission/19210680053/README.md
@@ -0,0 +1,246 @@
+# Course Report
+
+## Description
+
+The only package I use is numpy. Inside `class KNN`:
+
+a. The `euclidean` function computes the Euclidean distance between two vectors.
+
+b. The `closest` function takes one test vector at a time, computes its Euclidean distance to every training sample, and returns the training label that occurs most often among the k nearest points. When no single label is most frequent among the k nearest points (for example, the counts are tied), a label is output from the tied candidates.
+
+c. The `predict` function feeds every test sample through `closest` and collects the predicted labels.
+
+d. The `choose` function compares the predictions with the test labels (1 for a match, 0 otherwise) and computes the accuracy. The range of candidate k values is determined by the size of the training and test sets (minimum 2, maximum 10% of the data size); the k that gives the highest accuracy is selected and reported together with that accuracy.
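+As an illustration, here is a minimal sketch of steps a, b, and d. It is simplified NumPy, not the exact implementation in `source.py`; the helper names `knn_predict` and `choose_k` are illustrative only, and the inputs are assumed to be NumPy arrays with integer labels:
+
+```python
+import numpy as np
+
+def knn_predict(train_data, train_label, test_data, k):
+    """Predict a label for each test point by majority vote over its k nearest training points."""
+    preds = []
+    for item in test_data:
+        # Euclidean distance from this test point to every training point
+        dists = np.sqrt(np.sum((train_data - item) ** 2, axis=1))
+        nearest = np.argsort(dists)[:k]      # indices of the k nearest neighbours
+        votes = train_label[nearest]
+        # most frequent label among the k neighbours (ties fall to the smallest label here)
+        preds.append(np.bincount(votes).argmax())
+    return np.array(preds)
+
+def choose_k(train_data, train_label, test_data, test_label, k_max):
+    """Try k = 2 .. k_max and return the k with the highest accuracy."""
+    accs = {k: np.mean(knn_predict(train_data, train_label, test_data, k) == test_label)
+            for k in range(2, k_max + 1)}
+    best_k = max(accs, key=accs.get)
+    return best_k, accs[best_k]
+```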
+
+## Data Generation and Experiments
+
+I generated three two-dimensional Gaussian distributions with the following parameters, labelled 0, 1, and 2:
+
+ label=0
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+10 & 0 \\\\
+0 & 10
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+20 & 25
+\end{array}\right]
+\end{array}
+$$
+
+ label=1
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+23 & 0 \\\\
+0 & 22
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+16 & -5
+\end{array}\right]
+\end{array}
+$$
+
+ label=2
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+10 & 5 \\\\
+5 & 10
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+20 & 25
+\end{array}\right]
+\end{array}
+$$
+
+This is the training set I generated:
+
+Training set
+
+This is the test set I generated:
+
+Test set
+
+My experimental results are reported in the following table:
+
+| Algorithm | k | Accuracy |
+| --- | --- | --- |
+| KNN | 5 | 0.6225 |
+
+Because the Gaussian distributions for label=0 and label=2 lie close together (they share the same mean), the test accuracy is only 62.25%.
+
+To further investigate how the distance between the Gaussian distributions affects prediction accuracy, I generated distributions with the following parameters:
+
+ label=0
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+10 & 2.1 \\\\
+2.1 & 12
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+20 & 25
+\end{array}\right]
+\end{array}
+$$
+
+ label=1
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+23 & 0 \\\\
+0 & 22
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+20 & 25
+\end{array}\right]
+\end{array}
+$$
+
+ label=2
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+10 & 5 \\\\
+5 & 10
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+20 & 25
+\end{array}\right]
+\end{array}
+$$
+
+This is the training set I generated:
+
+Training set
+
+This is the test set I generated:
+
+Test set
+
+My experimental results are reported in the following table:
+
+| Algorithm | k | Accuracy |
+| --- | --- | --- |
+| KNN | 12 | 0.485 |
+
+Here all three Gaussian distributions are centred on the same mean and overlap heavily; trying different values of k, the best accuracy reached is 48.5%.
+
+| k | Accuracy |
+| --- | --- |
+| 2 | 0.4525 |
+| 3 | 0.4375 |
+| 4 | 0.4475 |
+| 5 | 0.4300 |
+| 6 | 0.4675 |
+| 7 | 0.4525 |
+| 8 | 0.4775 |
+| 9 | 0.4450 |
+| 10 | 0.4650 |
+| 11 | 0.4700 |
+| 12 | 0.4850 |
+| 13 | 0.4750 |
+| 14 | 0.4650 |
+| 15 | 0.4625 |
+| 16 | 0.4775 |
+| 17 | 0.4650 |
+| 18 | 0.4800 |
+| 19 | 0.4700 |
+| 20 | 0.4725 |
+
+Changing the distance between the Gaussian distributions once more, I generated them with the following parameters:
+
+ label=0
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+10 & 2.1 \\\\
+2.1 & 12
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+20 & 25
+\end{array}\right]
+\end{array}
+$$
+
+ label=1
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+23 & 0 \\\\
+0 & 22
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+16 & -5
+\end{array}\right]
+\end{array}
+$$
+
+ label=2
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+10 & 5 \\\\
+5 & 10
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+3 & 5
+\end{array}\right]
+\end{array}
+$$
+
+This is the training set I generated:
+
+Training set
+
+This is the test set I generated:
+
+Test set
+
+My experimental results are reported in the following table:
+
+| Algorithm | k | Accuracy |
+| --- | --- | --- |
+| KNN | 2 | 0.9975 |
+
+Here the three Gaussian distributions are far apart, and a small k already yields accurate predictions. Increasing the distance between the Gaussian distributions improves the classification accuracy.
+
+## How to Use the Code
+
+```bash
+# Change the value of `mode` at the bottom of source.py:
+mode=0   # data generation
+mode=1   # data visualization
+# any value other than 0 or 1: training and testing
+```
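+For example, after generating `data.npy` with `mode=0`, the training and testing flow roughly looks like this (a minimal sketch that imports the `KNN` class and `read` helper from `source.py`):
+
+```python
+from source import KNN, read
+
+# load the dataset previously written by generate() (mode=0)
+(train_data, train_label), (test_data, test_label) = read()
+
+model = KNN()
+model.fit(train_data, train_label)              # store the training samples
+k, acc = model.choose(test_data, test_label)    # search k and measure accuracy
+print("k=", k, "acc=", acc * 100, "%")
+```
+
+This mirrors the train-and-test branch of the `__main__` block in `source.py`.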
diff --git a/assignment-1/submission/19210680053/img/test 1.png b/assignment-1/submission/19210680053/img/test 1.png
new file mode 100644
index 0000000000000000000000000000000000000000..bf515460fd3bf6e81d027117399749a3b10c29fe
Binary files /dev/null and b/assignment-1/submission/19210680053/img/test 1.png differ
diff --git a/assignment-1/submission/19210680053/img/test 2.png b/assignment-1/submission/19210680053/img/test 2.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d962680d1019a7b4946d61b7a66ede507ad0d4c
Binary files /dev/null and b/assignment-1/submission/19210680053/img/test 2.png differ
diff --git a/assignment-1/submission/19210680053/img/test 3.png b/assignment-1/submission/19210680053/img/test 3.png
new file mode 100644
index 0000000000000000000000000000000000000000..3ab9d8b6157ed19597c283688c34daeef54beeeb
Binary files /dev/null and b/assignment-1/submission/19210680053/img/test 3.png differ
diff --git a/assignment-1/submission/19210680053/img/train 1.png b/assignment-1/submission/19210680053/img/train 1.png
new file mode 100644
index 0000000000000000000000000000000000000000..dbe1db24a876a4b564d98b3009aefae717ba433c
Binary files /dev/null and b/assignment-1/submission/19210680053/img/train 1.png differ
diff --git a/assignment-1/submission/19210680053/img/train 2.png b/assignment-1/submission/19210680053/img/train 2.png
new file mode 100644
index 0000000000000000000000000000000000000000..406126994e9ac71f4a43d6d182e72d88e4eaceed
Binary files /dev/null and b/assignment-1/submission/19210680053/img/train 2.png differ
diff --git a/assignment-1/submission/19210680053/img/train 3.png b/assignment-1/submission/19210680053/img/train 3.png
new file mode 100644
index 0000000000000000000000000000000000000000..761f9ee658095183c7c2a3925b6cbb9c51fde989
Binary files /dev/null and b/assignment-1/submission/19210680053/img/train 3.png differ
diff --git a/assignment-1/submission/19210680053/source.py b/assignment-1/submission/19210680053/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f5e2424b154e74548445bef39f513dee6b40c94
--- /dev/null
+++ b/assignment-1/submission/19210680053/source.py
@@ -0,0 +1,103 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+class KNN():
+    def euclidean(self, v1, v2):
+        # Euclidean distance between two vectors
+        return np.sqrt(np.sum(np.square(v1 - v2)))
+
+    def fit(self, X_train, Y_train):
+        # store the training samples and their labels
+        self.train_data = X_train
+        self.train_label = Y_train
+
+    def predict(self, test_data, k):
+        # predict a label for every test sample
+        predictions = []
+        for item in test_data:
+            label = self.closest(item, k)
+            predictions.append(label)
+        return predictions
+
+    def closest(self, item, k):
+        # distance from this test sample to every training sample
+        distlst = []
+        idxlst = list(range(len(self.train_data)))
+        for i in range(len(self.train_data)):
+            distlst.append(self.euclidean(item, self.train_data[i]))
+        # map training-sample index -> distance, sorted by distance
+        distdict = dict(zip(idxlst, distlst))
+        distdict = dict(sorted(distdict.items(), key=lambda pair: pair[1]))
+        # indices and labels of the k nearest training samples
+        min_ind = list(dict(list(distdict.items())[:k]).keys())
+        min_dist = [self.train_label[i] for i in min_ind]
+        # most frequent label among the k nearest neighbours
+        return max(min_dist, key=min_dist.count)
+
+    def choose(self, test_data, test_label):
+        # try several k values and keep the one with the highest accuracy
+        acclst = []
+        for k in range(2, 7):
+            res = self.predict(test_data, k)
+            acc = np.mean(np.equal(res, test_label))
+            acclst.append(acc)
+        max_acc = max(acclst)
+        max_k = acclst.index(max_acc) + 2
+        return max_k, max_acc
+
+
+def generate():
+    # three 2-D Gaussian clusters, labelled 0, 1 and 2
+    mean = (20, 25)
+    cov = np.array([[10, 2.1], [2.1, 12]])
+    x = np.random.multivariate_normal(mean, cov, (800,))
+
+    mean = (16, -5)
+    cov = np.array([[23, 0], [0, 22]])
+    y = np.random.multivariate_normal(mean, cov, (200,))
+
+    mean = (3, 5)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    # shuffle the samples and split 80% train / 20% test
+    idx = np.arange(2000)
+    np.random.shuffle(idx)
+    data = np.concatenate([x, y, z])
+    label = np.concatenate([
+        np.zeros((800,), dtype=int),
+        np.ones((200,), dtype=int),
+        np.ones((1000,), dtype=int) * 2
+    ])
+    data = data[idx]
+    label = label[idx]
+
+    train_data, test_data = data[:1600, ], data[1600:, ]
+    train_label, test_label = label[:1600, ], label[1600:, ]
+    # save as an object array so the ragged (data, label) tuples can be pickled
+    np.save("data.npy", np.array(
+        ((train_data, train_label), (test_data, test_label)), dtype=object))
+
+
+def display(data, label, name):
+    # group the samples by label and scatter-plot each group
+    datas = [[], [], []]
+    for i in range(len(data)):
+        datas[label[i]].append(data[i])
+
+    for each in datas:
+        each = np.array(each)
+        plt.scatter(each[:, 0], each[:, 1])
+    names = [str(i) for i in range(len(datas))]
+    plt.legend(['label ' + i for i in names])
+    plt.title(name)
+    plt.show()
+
+
+def read():
+    (train_data, train_label), (test_data, test_label) = np.load("data.npy", allow_pickle=True)
+    return (train_data, train_label), (test_data, test_label)
+
+
+if __name__ == "__main__":
+    mode = 0
+    if mode == 0:
+        generate()
+    elif mode == 1:
+        (train_data, train_label), (test_data, test_label) = read()
+        display(train_data, train_label, 'train')
+        display(test_data, test_label, 'test')
+    else:
+        (train_data, train_label), (test_data, test_label) = read()
+
+        model = KNN()
+        model.fit(train_data, train_label)
+        k, acc = model.choose(test_data, test_label)
+        print("k=", k, "acc=", acc * 100, "%")
\ No newline at end of file