diff --git a/assignment-1/submission/16307100065/.keep b/assignment-1/submission/16307100065/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-1/submission/16307100065/README.md b/assignment-1/submission/16307100065/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..06a0098602e802319c84db26985360e4c9e99cfb
--- /dev/null
+++ b/assignment-1/submission/16307100065/README.md
@@ -0,0 +1,214 @@
+# KNN Experiments
+
+## 1. KNN Algorithm Implementation
+
+First we define a KNN class with three methods (shown below as excerpts; the complete class is in `source.py`):
+
+```python
+import numpy as np
+from collections import Counter  # used to count votes
+
+def __init__(self, k):
+    # Store the hyperparameter k in self._k and define three other internal
+    # variables: self._train_data, self._train_label, self._test_data
+    self._k = k
+    self._train_data = None
+    self._train_label = None
+    self._test_data = None
+```
+
+```python
+def fit(self, train_data, train_label):
+    # Store train_data and train_label in internal variables
+    self._train_data = train_data
+    self._train_label = train_label
+```
+
+```python
+def predict(self, test_data):
+    # Store test_data in an internal variable
+    self._test_data = test_data
+    predicts_ = []
+    # Iterate over the test set
+    for i in self._test_data:
+        # Euclidean distance from this test point to every training point
+        distances_ = [np.sum((i - x) ** 2) ** 0.5 for x in self._train_data]
+        distances = np.array(distances_)
+        # Sort by distance and count the labels of the k nearest neighbours with Counter
+        sorted_distances = np.argsort(distances)
+        topK = [self._train_label[j] for j in sorted_distances[0:self._k]]
+        votes = Counter(topK)
+        # The prediction is the majority label among the k nearest neighbours
+        predict = votes.most_common(1)[0][0]
+        predicts_.append(predict)
+    predicts = np.array(predicts_)
+    return predicts
+```
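+
+A minimal usage sketch of the class (assuming the code above is saved as `source.py`, as in the experiment code below; the toy data here is made up purely for illustration):
+
+```python
+import numpy as np
+from source import KNN
+
+# Two small clusters, labelled 0 and 1
+train_data = np.array([[0, 0], [0, 1], [1, 0], [5, 5], [5, 6], [6, 5]])
+train_label = np.array([0, 0, 0, 1, 1, 1])
+
+model = KNN(3)
+model.fit(train_data, train_label)
+print(model.predict(np.array([[0.5, 0.5], [5.5, 5.5]])))  # expected: [0 1]
+```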
+
+## 2. Experiments
+
+### 1. Binary classification on a 2-D random normal distribution
+
+(1) The two dimensions are completely uncorrelated
+
+```python
+import numpy as np
+from source import KNN
+import matplotlib.pyplot as plt
+# Each dimension has mean 0 and variance 10, and the two dimensions are independent; draw 1000 samples
+cov = [[10, 0], [0, 10]]
+data = np.around(np.random.multivariate_normal((0, 0), cov, 1000), 2)
+# Each label is drawn at random from {0, 1}
+label = np.random.choice([0, 1], size=1000, replace=True)
+# Split into training and test sets at a ratio of 8:2
+n = len(data) // 5
+train_data = data[0:4*n]
+train_label = label[0:4*n]
+test_data = data[4*n:]
+test_label = label[4*n:]
+# Instantiate the KNN class with k = 5 and fit it on the training set
+model = KNN(5)
+model.fit(train_data, train_label)
+
+# Plot the decision regions
+# The first and second dimensions are the x and y axes
+x_show = train_data[:, 0]
+y_show = train_data[:, 1]
+x_min, x_max = x_show.min(), x_show.max()
+y_min, y_max = y_show.min(), y_show.max()
+# Divide the plane into a 200 x 200 grid
+xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
+# Predict the class of every grid point with the model
+z1 = np.c_[xx.ravel(), yy.ravel()]
+z = np.array(z1)
+pred = model.predict(z)
+pred = pred.reshape(xx.shape)
+# Plot the predicted regions together with a scatter plot of the training set
+plt.pcolormesh(xx, yy, pred, cmap=plt.cm.Pastel1)
+plt.scatter(x_show, y_show, s=80, c=train_label, cmap=plt.cm.spring, edgecolors='k')
+plt.xlim(xx.min(), xx.max())
+plt.ylim(yy.min(), yy.max())
+plt.show()
+```
+
+![uncorrelated dimensions](https://images.gitee.com/uploads/images/2021/0329/195821_9a6657ad_8843048.png "二维结果.png")
+
+The result is shown above. Since data is sampled from a bivariate normal distribution and label is chosen at random from {0, 1}, data and label are completely unrelated, so the decision regions are irregular.
+
+(2) The two dimensions are perfectly positively correlated
+
+```python
+# Set the correlation coefficient between the two dimensions to 1
+cov = [[10, 10], [10, 10]]
+```
+
+![correlation 1](https://images.gitee.com/uploads/images/2021/0329/195844_3f64dcf7_8843048.png "二维相关.png")
+
+The result is shown above. Because the correlation coefficient is 1, all points lie on the line y = x. Since the labels are chosen at random, the 0/1 regions are distributed randomly along this line.
+
+(3) Perfectly negatively correlated
+
+```python
+# Correlation coefficient of -1
+cov = [[10, -10], [-10, 10]]
+```
+
+![correlation -1](https://images.gitee.com/uploads/images/2021/0329/195855_21448894_8843048.png "负二维.png")
+
+The result is shown above: with a correlation coefficient of -1, all points lie on the line y = -x, and the 0/1 regions are again distributed randomly along it.
+
+(4) The general case
+
+```python
+# Correlation coefficient of 0.2
+cov = [[10, 2], [2, 10]]
+```
+
+![correlation 0.2](https://images.gitee.com/uploads/images/2021/0329/195908_c9a5e2fa_8843048.png "0.2.png")
+
+```python
+# Correlation coefficient of 0.8
+cov = [[10, 8], [8, 10]]
+```
+
+![correlation 0.8](https://images.gitee.com/uploads/images/2021/0329/195921_5cf96f45_8843048.png "0.8.png")
+
+With the correlation coefficient set to 0.2 and 0.8 respectively, the points are still fairly scattered at 0.2, while at 0.8 a clear linear relationship is already visible. As before, since the labels are chosen at random, the distribution of 0/1 along the line remains random.
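+
+For reference, the correlation coefficient here equals the off-diagonal covariance entry divided by the variance (2/10 = 0.2 and 8/10 = 0.8). A minimal sketch for checking that the sampled data really has roughly the intended correlation, using `np.corrcoef`:
+
+```python
+import numpy as np
+
+cov = [[10, 8], [8, 10]]  # intended correlation: 8 / 10 = 0.8
+data = np.random.multivariate_normal((0, 0), cov, 1000)
+# np.corrcoef returns the 2x2 correlation matrix of the two columns
+r = np.corrcoef(data[:, 0], data[:, 1])[0, 1]
+print(r)  # should be close to 0.8
+```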
"b/assignment-1/submission/16307100065/img/\344\272\2240.2.png" new file mode 100644 index 0000000000000000000000000000000000000000..5d151517b7b0da7e0ccf89ae43a44b848831e19c Binary files /dev/null and "b/assignment-1/submission/16307100065/img/\344\272\2240.2.png" differ diff --git "a/assignment-1/submission/16307100065/img/\344\272\224\345\210\206\347\261\2730.8.png" "b/assignment-1/submission/16307100065/img/\344\272\224\345\210\206\347\261\2730.8.png" new file mode 100644 index 0000000000000000000000000000000000000000..d5d72a9445ffdab5b80e24d02ee76a3789244d50 Binary files /dev/null and "b/assignment-1/submission/16307100065/img/\344\272\224\345\210\206\347\261\2730.8.png" differ diff --git "a/assignment-1/submission/16307100065/img/\350\264\237\344\272\214\347\273\264.png" "b/assignment-1/submission/16307100065/img/\350\264\237\344\272\214\347\273\264.png" new file mode 100644 index 0000000000000000000000000000000000000000..5f28d4a81b3deaa217a3d1a78602d6df915b3226 Binary files /dev/null and "b/assignment-1/submission/16307100065/img/\350\264\237\344\272\214\347\273\264.png" differ diff --git a/assignment-1/submission/16307100065/source.py b/assignment-1/submission/16307100065/source.py new file mode 100644 index 0000000000000000000000000000000000000000..4ae97853ca5174d7f23998d209a5603a4a7b7c58 --- /dev/null +++ b/assignment-1/submission/16307100065/source.py @@ -0,0 +1,35 @@ +#算法代码 +import numpy as np +from collections import Counter +class KNN: + + def __init__(self,k): + self._k = k + self._train_data=None + self._train_label = None + self._test_data=None + + def fit(self,train_data,train_label): + self._train_data=train_data + self._train_label=train_label + + + def predict(self,test_data): + self._test_data=test_data + predicts_ =[] + #遍历测试集 + for i in self._test_data: + #对测试集中的数据求距离每一个训练集中数据的欧氏距离 + distances_ = [np.sum((i-x)**2)**0.5 for x in self._train_data] + distances = np.array(distances_) + #用Counter函数求距离前k个 + sorted_distances = np.argsort(distances) + topK = [self._train_label[j] for j in sorted_distances[0:self._k]] + votes = Counter(topK) + #预测结果 + predict = votes.most_common(1)[0][0] + predicts_.append(predict) + predicts = np.array(predicts_) + return predicts + + diff --git a/assignment-1/submission/16307100065/train&test.py b/assignment-1/submission/16307100065/train&test.py new file mode 100644 index 0000000000000000000000000000000000000000..31f28e4c3580e54f001d4ac69fb1c3f9ddf533a9 --- /dev/null +++ b/assignment-1/submission/16307100065/train&test.py @@ -0,0 +1,39 @@ +#实验代码 +import numpy as np +from lknn import KNN +import matplotlib.pyplot as plt +#每一维均值为0,方差为10,并且两维独立,创建1000个数据 +cov = [[10,0],[0,10]] +data = np.around(np.random.multivariate_normal((0,0),cov,1000),2) +#对应分类随机取0或1 +label = np.random.choice([0,1],size=1000,replace=True) +#按8:2的比例分为训练集和测试集 +n = len(data)//5 +train_data = data[0:4*n] +train_label = label[0:4*n] +test_data = data[4*n:] +test_label = label[4*n:] +#调用KNN类,k赋值5,将训练集输入模型 +model = KNN(5) +model.fit(train_data, train_label) +#绘制分类图 +#第一维和第二维分别作为x,y轴 +x_show = train_data[:,0] +y_show = train_data[:,1] +x_min,x_max=x_show.min(),x_show.max() +y_min,y_max=y_show.min(),y_show.max() +xx,yy = np.meshgrid(np.linspace(x_min,x_max,200),np.linspace(y_min,y_max,200)) +#将网格放入模型预测,预测每一个网格的分类 +z1 = np.c_[xx.ravel(),yy.ravel()] +z = np.array(z1) +pred = model.predict(z) +pred = pred.reshape(xx.shape) +#绘制网格分类图和训练集的散点图 +plt.pcolormesh(xx,yy,pred,cmap=plt.cm.Pastel1) +plt.scatter(x_show,y_show,s=80,c=train_label,cmap=plt.cm.spring,edgecolors='k') 
+
+#### (2) Choice of k
+
+Take the binary classification task as an example.
+
+k = 5: mean acc over 10 runs = 0.512
+
+k = 4: mean acc over 10 runs = 0.507
+
+k = 6: mean acc over 10 runs = 0.485
+
+k = 3: mean acc over 10 runs = 0.495
+
+k = 7: mean acc over 10 runs = 0.508
+
+Judging only by these numbers, k = 5 works best. However, because data and label are unrelated, the accuracies for different k are all close to 0.5, i.e. close to random guessing: the decision regions themselves are random, so points closer to a query point are not actually more likely to share its class.
+
+Therefore this round of hyperparameter tuning is not meaningful. Only on a dataset where the features are genuinely related to the class labels would different values of k give noticeably different accuracies, and only then would tuning k make sense.
diff --git a/assignment-1/submission/16307100065/img/.keep b/assignment-1/submission/16307100065/img/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-1/submission/16307100065/img/0.2.png b/assignment-1/submission/16307100065/img/0.2.png
new file mode 100644
index 0000000000000000000000000000000000000000..5cc12d92a5e696a6181dc47fef6caf9de5a4d7c9
Binary files /dev/null and b/assignment-1/submission/16307100065/img/0.2.png differ
diff --git a/assignment-1/submission/16307100065/img/0.8.png b/assignment-1/submission/16307100065/img/0.8.png
new file mode 100644
index 0000000000000000000000000000000000000000..abe0d2e4626cc82b2e44961f668c4d732aa057cf
Binary files /dev/null and b/assignment-1/submission/16307100065/img/0.8.png differ
diff --git "a/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\233\270\345\205\263.png" "b/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\233\270\345\205\263.png"
new file mode 100644
index 0000000000000000000000000000000000000000..4eb7d1805b16224826b6522b18680a6951f08085
Binary files /dev/null and "b/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\233\270\345\205\263.png" differ
diff --git "a/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\273\223\346\236\234.png" "b/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\273\223\346\236\234.png"
new file mode 100644
index 0000000000000000000000000000000000000000..cfb88bc7cf41bc6bb02cd3467b2f475c43a405a2
Binary files /dev/null and "b/assignment-1/submission/16307100065/img/\344\272\214\347\273\264\347\273\223\346\236\234.png" differ
diff --git "a/assignment-1/submission/16307100065/img/\344\272\2240.2.png" "b/assignment-1/submission/16307100065/img/\344\272\2240.2.png"
new file mode 100644
index 0000000000000000000000000000000000000000..5d151517b7b0da7e0ccf89ae43a44b848831e19c
Binary files /dev/null and "b/assignment-1/submission/16307100065/img/\344\272\2240.2.png" differ
diff --git "a/assignment-1/submission/16307100065/img/\344\272\224\345\210\206\347\261\2730.8.png" "b/assignment-1/submission/16307100065/img/\344\272\224\345\210\206\347\261\2730.8.png"
new file mode 100644
index 0000000000000000000000000000000000000000..d5d72a9445ffdab5b80e24d02ee76a3789244d50
Binary files /dev/null and "b/assignment-1/submission/16307100065/img/\344\272\224\345\210\206\347\261\2730.8.png" differ
diff --git "a/assignment-1/submission/16307100065/img/\350\264\237\344\272\214\347\273\264.png" "b/assignment-1/submission/16307100065/img/\350\264\237\344\272\214\347\273\264.png"
new file mode 100644
index 0000000000000000000000000000000000000000..5f28d4a81b3deaa217a3d1a78602d6df915b3226
Binary files /dev/null and "b/assignment-1/submission/16307100065/img/\350\264\237\344\272\214\347\273\264.png" differ
diff --git a/assignment-1/submission/16307100065/source.py b/assignment-1/submission/16307100065/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ae97853ca5174d7f23998d209a5603a4a7b7c58
--- /dev/null
+++ b/assignment-1/submission/16307100065/source.py
@@ -0,0 +1,35 @@
+# Algorithm code
+import numpy as np
+from collections import Counter
+
+class KNN:
+
+    def __init__(self, k):
+        self._k = k
+        self._train_data = None
+        self._train_label = None
+        self._test_data = None
+
+    def fit(self, train_data, train_label):
+        self._train_data = train_data
+        self._train_label = train_label
+
+    def predict(self, test_data):
+        self._test_data = test_data
+        predicts_ = []
+        # Iterate over the test set
+        for i in self._test_data:
+            # Euclidean distance from this test point to every training point
+            distances_ = [np.sum((i - x) ** 2) ** 0.5 for x in self._train_data]
+            distances = np.array(distances_)
+            # Labels of the k nearest neighbours, counted with Counter
+            sorted_distances = np.argsort(distances)
+            topK = [self._train_label[j] for j in sorted_distances[0:self._k]]
+            votes = Counter(topK)
+            # Prediction: the majority label
+            predict = votes.most_common(1)[0][0]
+            predicts_.append(predict)
+        predicts = np.array(predicts_)
+        return predicts
diff --git a/assignment-1/submission/16307100065/train&test.py b/assignment-1/submission/16307100065/train&test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31f28e4c3580e54f001d4ac69fb1c3f9ddf533a9
--- /dev/null
+++ b/assignment-1/submission/16307100065/train&test.py
@@ -0,0 +1,39 @@
+# Experiment code
+import numpy as np
+from source import KNN
+import matplotlib.pyplot as plt
+# Each dimension has mean 0 and variance 10, and the two dimensions are independent; draw 1000 samples
+cov = [[10, 0], [0, 10]]
+data = np.around(np.random.multivariate_normal((0, 0), cov, 1000), 2)
+# Each label is drawn at random from {0, 1}
+label = np.random.choice([0, 1], size=1000, replace=True)
+# Split into training and test sets at a ratio of 8:2
+n = len(data) // 5
+train_data = data[0:4*n]
+train_label = label[0:4*n]
+test_data = data[4*n:]
+test_label = label[4*n:]
+# Instantiate the KNN class with k = 5 and fit it on the training set
+model = KNN(5)
+model.fit(train_data, train_label)
+# Plot the decision regions
+# The first and second dimensions are the x and y axes
+x_show = train_data[:, 0]
+y_show = train_data[:, 1]
+x_min, x_max = x_show.min(), x_show.max()
+y_min, y_max = y_show.min(), y_show.max()
+xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
+# Predict the class of every grid point with the model
+z1 = np.c_[xx.ravel(), yy.ravel()]
+z = np.array(z1)
+pred = model.predict(z)
+pred = pred.reshape(xx.shape)
+# Plot the predicted regions together with a scatter plot of the training set
+plt.pcolormesh(xx, yy, pred, cmap=plt.cm.Pastel1)
+plt.scatter(x_show, y_show, s=80, c=train_label, cmap=plt.cm.spring, edgecolors='k')
+plt.xlim(xx.min(), xx.max())
+plt.ylim(yy.min(), yy.max())
+plt.show()
+# Compute accuracy on the test set
+res = model.predict(test_data)
+print("acc =", np.mean(np.equal(res, test_label)))
\ No newline at end of file