diff --git a/assignment-1/submission/18307130074/img/1test.png b/assignment-1/submission/18307130074/img/1test.png new file mode 100644 index 0000000000000000000000000000000000000000..3d3172d1f183248a2d7c4443ea469d574a48e337 Binary files /dev/null and b/assignment-1/submission/18307130074/img/1test.png differ diff --git a/assignment-1/submission/18307130074/img/1train.png b/assignment-1/submission/18307130074/img/1train.png new file mode 100644 index 0000000000000000000000000000000000000000..cde346b5c7c06d576910a6df2a66029b0670b6e3 Binary files /dev/null and b/assignment-1/submission/18307130074/img/1train.png differ diff --git a/assignment-1/submission/18307130074/img/2test.png b/assignment-1/submission/18307130074/img/2test.png new file mode 100644 index 0000000000000000000000000000000000000000..6deaec8880ed4997651cf23c37e9e7abece9516c Binary files /dev/null and b/assignment-1/submission/18307130074/img/2test.png differ diff --git a/assignment-1/submission/18307130074/img/2train.png b/assignment-1/submission/18307130074/img/2train.png new file mode 100644 index 0000000000000000000000000000000000000000..643e46c2343e7743d6cd6bb2a94ac58eda4fd3fe Binary files /dev/null and b/assignment-1/submission/18307130074/img/2train.png differ diff --git a/assignment-1/submission/18307130074/img/3test.png b/assignment-1/submission/18307130074/img/3test.png new file mode 100644 index 0000000000000000000000000000000000000000..5e75e15ce5ea8f09568b3138dac23866c2423701 Binary files /dev/null and b/assignment-1/submission/18307130074/img/3test.png differ diff --git a/assignment-1/submission/18307130074/img/3train.png b/assignment-1/submission/18307130074/img/3train.png new file mode 100644 index 0000000000000000000000000000000000000000..34be5d5a681d0fa8a5ff9186e9a8a08034d21f21 Binary files /dev/null and b/assignment-1/submission/18307130074/img/3train.png differ diff --git a/assignment-1/submission/18307130074/img/4test.png b/assignment-1/submission/18307130074/img/4test.png new file 
mode 100644 index 0000000000000000000000000000000000000000..e3a4f0b2caf50db06112042356688b28465efab6 Binary files /dev/null and b/assignment-1/submission/18307130074/img/4test.png differ diff --git a/assignment-1/submission/18307130074/img/4train.png b/assignment-1/submission/18307130074/img/4train.png new file mode 100644 index 0000000000000000000000000000000000000000..b58b79676726ad8b34e82a2b4e2f918d5fce2975 Binary files /dev/null and b/assignment-1/submission/18307130074/img/4train.png differ diff --git a/assignment-1/submission/18307130074/img/5test.png b/assignment-1/submission/18307130074/img/5test.png new file mode 100644 index 0000000000000000000000000000000000000000..365c0cef40fa9d80536197fbde693acb25cd552b Binary files /dev/null and b/assignment-1/submission/18307130074/img/5test.png differ diff --git a/assignment-1/submission/18307130074/img/5train.png b/assignment-1/submission/18307130074/img/5train.png new file mode 100644 index 0000000000000000000000000000000000000000..266f3fa6ea5f349d8705e4a8f22f92e6f5c2600c Binary files /dev/null and b/assignment-1/submission/18307130074/img/5train.png differ diff --git a/assignment-1/submission/18307130074/readme.md b/assignment-1/submission/18307130074/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..d830733fb3c4e45ed7398fedde30c850d6366d10 --- /dev/null +++ b/assignment-1/submission/18307130074/readme.md @@ -0,0 +1,259 @@ +# KNN Classifier + +[TOC] + + + +## 1. 
Code Introduction + +所有代码都在**source.py**中展示 + +引用库包括**numpy**&**matplotlib** + +``` +import numpy as np +import matplotlib.pyplot as mp +``` + +**KNN**类包括如下几个函数**partition**, **distance**, **predict_label**, **fit**, **predict** + +首先需要初始化KNN类中的成员变量 + +``` +def __init__(self): + + self.train_data_num = 0 + self.valid_data_num = 0 + self.train_data_dimension = 0 + + self.train_data = None + self.valid_data = None + self.train_label = None + self.valid_label = None + + self.K = 20 +``` + +其中train_data_num表示训练集数量,valid_data_num表示验证集数量,train_data_dimension表示数据维度(在本次实验中我们默认你使用二维数据),(train_data,valid_data,train_label,valid_label)分别表示(训练集数据,验证集数据,训练集标签,验证集标签),K表示通过训练所得出来的最优K值 + +**partition**函数输入的**参数**包括:数据集和标签集,**作用**是:将输入的数据和标签按同一随机顺序打乱后返回(8:2的训练集/验证集划分由**fit**函数根据打乱后的结果完成)。 + +**distance**函数输入的**参数**包括:点1,点2,计算距离的方式(默认为Euclid即欧几里得距离),**作用**是:以给定的方式计算两个点之间的距离 + +**predict_label**函数输入的**参数**包括:k值,数据集1,标签集1,数据集2,计算距离的方式, **作用**是:给定k值,以数据集1和标签集1作为已知的点集,通过k近邻算法计算并返回数据集2所对应的标签集 + +**fit**函数输入的**参数**包括:数据集和标签集,**作用**是:首先利用**partition**函数将数据集和标签集随机打乱,并按照8:2的比例分为训练集和验证集,然后枚举k的值,并通过**predict_label**函数以训练集为已知点集预测验证集的标签,对比该结果和给定的验证集标签,得到准确率。选取准确率最高的k值作为模型的k值。 + +**predict**函数输入的**参数**包括:数据集,**作用**是:将该数据集作为测试集,利用已经训练好的KNN模型返回测试集的标签,一般配合**fit**函数使用。 + +**一些其他的函数**: + +1. **data_maker**函数输入的**参数**包括:id,平均值,协方差矩阵,数量,**作用**是:通过np.random.multivariate_normal函数以及给定的参数生成数据,并将数据分为训练集和测试集,通过np.save储存 + +2. **data_reader**函数输入的**参数**包括:id,**作用**是:通过np.load读取**data_maker**生成的数据 + +3. **data_painter**函数输入的**参数**包括:id,数据集,标签集,name,**作用**是:通过matplotlib.pyplot.scatter来画出数据的散点图,便于直观观察数据的分布 + +## 2. Experiment + +### 1. Initial Data + +数据集参数 +$$ +\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}30&5 \\\\ 5&10\end{bmatrix},|x|=400\\\\ +\mu_y=[5,15],\Sigma_y=\begin{bmatrix}20&4 \\\\ 4&10\end{bmatrix},|y|=400\\\\ +\mu_z=[15,5],\Sigma_z=\begin{bmatrix}20&3 \\\\ 3&4\end{bmatrix},|z|=400 +$$ +训练集和测试集散点图 + +
+ train + test +
+ +实验结果(重复实验5次) + +| k | train | test | +| ------ | ------ | ------ | +| 18 | 0.8490 | 0.8667 | +| 16 | 0.8333 | 0.8542 | +| 5 | 0.8021 | 0.8458 | +| 8 | 0.8646 | 0.8333 | +| 10 | 0.8490 | 0.8542 | +| 平均值 | 0.8396 | 0.8508 | + +### 2. Research + +#### 1.研究重叠度对准确率的影响 + +##### 重叠度增大 + +数据集参数 +$$ +\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}40&5 \\\\ 5&40\end{bmatrix},|x|=400\\\\ +\mu_y=[5,15],\Sigma_y=\begin{bmatrix}40&4 \\\\ 4&40\end{bmatrix},|y|=400\\\\ +\mu_z=[15,5],\Sigma_z=\begin{bmatrix}40&3 \\\\ 3&40\end{bmatrix},|z|=400 +$$ +训练集和测试集散点图 + +
+ train + test +
+ +实验结果(重复实验5次) + +| k | train | test | +| ------ | ------ | ------ | +| 19 | 0.6146 | 0.5917 | +| 14 | 0.6510 | 0.5708 | +| 18 | 0.5990 | 0.5875 | +| 11 | 0.6563 | 0.5875 | +| 6 | 0.6719 | 0.575 | +| 平均值 | 0.6386 | 0.5825 | + +##### 重叠度减小 + +数据集参数 +$$ +\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=400\\\\ +\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=400\\\\ +\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=400 +$$ +训练集和测试集散点图 + +
+ train + test +
+ +实验结果(重复实验5次) + +| k | train | test | +| ------ | ------ | ------ | +| 12 | 0.9271 | 0.8875 | +| 7 | 0.9167 | 0.875 | +| 6 | 0.9375 | 0.8625 | +| 6 | 0.9271 | 0.8792 | +| 10 | 0.9635 | 0.8792 | +| 平均值 | 0.9344 | 0.8767 | + +##### 结论 + +随着不同标签的点集重叠度增大,KNN分类器的准确率降低 + +#### 2.研究点集数量对准确率的影响 + +##### 数据减少 + +数据集参数 +$$ +\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=200\\\\ +\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=200\\\\ +\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=200 +$$ +训练集和测试集散点图 + +
+ train + test +
+ +实验结果(重复实验5次) + +| k | train | test | +| ------ | ------ | ------ | +| 6 | 0.8958 | 0.9 | +| 8 | 0.9479 | 0.8833 | +| 18 | 0.9063 | 0.8917 | +| 7 | 0.9688 | 0.8833 | +| 14 | 0.875 | 0.8833 | +| 平均值 | 0.9188 | 0.8883 | + +##### 数据增多 + +数据集参数 +$$ +\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=800\\\\ +\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=800\\\\ +\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=800 +$$ +训练集和测试集散点图 + +
+ train + test +
+ +实验结果(重复实验5次) + +| k | train | test | +| ------ | ------ | ------ | +| 9 | 0.8958 | 0.9188 | +| 9 | 0.9115 | 0.925 | +| 18 | 0.9323 | 0.9083 | +| 10 | 0.9193 | 0.9208 | +| 18 | 0.9089 | 0.9188 | +| 平均值 | 0.9136 | 0.9183 | + +##### 结论 + +数据的多少很有可能影响着KNN分类器的准确率,但是影响效果不明显,准确率随着数据的增多而提高 + +#### 3.研究计算距离的方式对准确率的影响 + +##### 曼哈顿距离 + +数据集参数 +$$ +\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=800\\\\ +\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=800\\\\ +\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=800 +$$ +训练集和测试集散点图 + +
+ train + test +
+ +实验结果(重复实验5次) + +| k | train | test | +| ------ | ------ | ------ | +| 17 | 0.8906 | 0.9167 | +| 7 | 0.9010 | 0.9083 | +| 13 | 0.9167 | 0.9271 | +| 9 | 0.9115 | 0.9208 | +| 17 | 0.9219 | 0.9292 | +| 平均值 | 0.9083 | 0.9204 | + +##### 上确界距离 + +数据集参数 +$$ +\mu_x=[10,10] ,\Sigma_x=\begin{bmatrix}10&5 \\\\ 5&10\end{bmatrix},|x|=800\\\\ +\mu_y=[5,15],\Sigma_y=\begin{bmatrix}10&4 \\\\ 4&10\end{bmatrix},|y|=800\\\\ +\mu_z=[15,5],\Sigma_z=\begin{bmatrix}10&3 \\\\ 3&10\end{bmatrix},|z|=800 +$$ +训练集和测试集散点图 + +
+ train + test +
import numpy as np

try:
    import matplotlib.pyplot as mp
except ImportError:  # plotting is optional; only data_painter needs it
    mp = None


class KNN:
    """k-nearest-neighbour classifier that selects k on a validation split.

    ``fit`` shuffles the samples, keeps roughly 80% as the reference
    (training) set and the rest as a validation set, then chooses the k
    with the best validation accuracy.  ``predict`` labels new points by
    majority vote among their k nearest reference points.
    """

    # Metric names accepted by distance / predict_label / fit.
    _METRICS = ('Euclid', 'Manhattan', 'upper')

    def __init__(self):
        self.train_data_num = 0        # number of samples in the training split
        self.valid_data_num = 0        # number of samples in the validation split
        self.train_data_dimension = 0  # number of features per sample

        self.train_data = None
        self.valid_data = None
        self.train_label = None
        self.valid_label = None

        self.K = 20                    # replaced by the selected k in fit()
        self.metric = 'Euclid'         # metric remembered by fit() for predict()

    @classmethod
    def _check_metric(cls, p):
        """Raise ValueError if *p* is not a supported metric name."""
        if p not in cls._METRICS:
            raise ValueError(
                f"unknown distance type {p!r}; expected one of {cls._METRICS}"
            )

    @classmethod
    def _distances(cls, points, target, p):
        """Return the distance from *target* to each row of *points*.

        *points* may be a single point (1-D) or a batch (2-D); the
        reduction runs over the last axis.
        """
        cls._check_metric(p)
        diff = np.abs(
            np.asarray(points, dtype=float) - np.asarray(target, dtype=float)
        )
        if p == 'Euclid':
            # Squared distance: the square root is skipped because it does
            # not change the neighbour ordering.
            return np.sum(diff ** 2, axis=-1)
        if p == 'Manhattan':
            return np.sum(diff, axis=-1)
        return np.max(diff, axis=-1, initial=0.0)

    def partition(self, data, label):
        """Shuffle *data* and *label* with one shared random permutation.

        Returns:
            A ``(data, label)`` pair of NumPy arrays in the new order.
            No train/validation split is made here.

        Raises:
            ValueError: if the two inputs have different lengths.
        """
        data = np.asarray(data)
        label = np.asarray(label)
        if len(data) != len(label):
            raise ValueError("data and label must have the same length")

        order = np.random.permutation(len(data))
        return data[order], label[order]

    def distance(self, s, t, p='Euclid'):
        """Return the distance between the points *s* and *t*.

        Args:
            s, t: coordinate sequences of equal length.
            p: ``'Euclid'`` (squared Euclidean distance), ``'Manhattan'``
                (sum of absolute differences) or ``'upper'`` (largest
                absolute difference).

        Raises:
            ValueError: if *p* is not one of the supported names.
        """
        return self._distances(s, t, p)

    def predict_label(self, k, data1, label1, data2, p='Euclid'):
        """Label every point of *data2* by k-nearest-neighbour voting.

        Args:
            k: number of neighbours, ``1 <= k <= len(data1)``.
            data1, label1: reference points and their labels.
            data2: points to classify.
            p: distance type, see ``distance``.

        Returns:
            NumPy array with one predicted label per point of *data2*.
            Ties between the most frequent labels are broken at random.

        Raises:
            ValueError: if *k* is out of range or *p* is unknown.
        """
        data1 = np.asarray(data1)
        label1 = np.asarray(label1)
        self._check_metric(p)
        if not 1 <= k <= len(data1):
            raise ValueError(
                f"k must be between 1 and {len(data1)}, got {k}"
            )

        result = []
        for point in data2:
            dist = self._distances(data1, point, p)
            # kth = k - 1 puts the k smallest distances in the first k
            # slots and stays valid when k == len(data1).
            nearest = np.argpartition(dist, k - 1)[:k]
            labels, counts = np.unique(label1[nearest], return_counts=True)
            winners = labels[counts == counts.max()]
            result.append(np.random.choice(winners))

        return np.array(result)

    def fit(self, train_data, train_label, p='Euclid'):
        """Split the samples, then choose the k with best validation accuracy.

        Args:
            train_data: 2-D array-like, one sample per row.
            train_label: labels, one per sample.
            p: distance type used for model selection and remembered
                for ``predict``.

        Raises:
            ValueError: on malformed input, fewer than two samples, or
                an unknown distance type.
        """
        train_data = np.asarray(train_data)
        train_label = np.asarray(train_label)
        self._check_metric(p)
        if train_data.ndim != 2:
            raise ValueError("train_data must be a 2-D array")
        if len(train_data) != len(train_label):
            raise ValueError("train_data and train_label must have the same length")
        if train_data.shape[0] < 2:
            raise ValueError("need at least two samples to train and validate")

        # Sizes of the training / validation splits and the feature count.
        self.valid_data_num = max(1, train_data.shape[0] // 5)
        self.train_data_num = train_data.shape[0] - self.valid_data_num
        self.train_data_dimension = train_data.shape[1]
        self.metric = p

        # Random split: shuffle once, then cut at the training size.
        shuffled_data, shuffled_label = self.partition(train_data, train_label)
        split = self.train_data_num
        self.train_data, self.train_label = shuffled_data[:split], shuffled_label[:split]
        self.valid_data, self.valid_label = shuffled_data[split:], shuffled_label[split:]

        # Candidates are k = 1..19, capped at the training-set size.
        max_k = min(20, self.train_data_num + 1)

        # Start below any reachable accuracy so a valid k is always chosen,
        # even when every candidate scores 0.
        best_accuracy, best_k = -1.0, 1
        for k in range(1, max_k):
            predicted = self.predict_label(
                k, self.train_data, self.train_label, self.valid_data, p
            )
            accuracy = np.mean(np.equal(predicted, self.valid_label))
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_k = k

        self.K = best_k
        print("acc =",best_accuracy)
        print("k=", best_k)

    def predict(self, test_data):
        """Return predicted labels for *test_data* using the fitted model.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.train_data is None:
            raise RuntimeError("fit() must be called before predict()")
        return self.predict_label(
            self.K, self.train_data, self.train_label, test_data, self.metric
        )


def data_maker(id, mean, cov, num):
    """Generate a Gaussian-mixture data set and save it to ``<id>data.npy``.

    Class ``i`` gets ``num[i]`` points drawn from a normal distribution
    with mean ``mean[i]`` and covariance ``cov[i]``.  The points are
    shuffled; the first fifth becomes the test set and the rest the
    training set.

    Raises:
        ValueError: if *mean*, *cov* and *num* differ in length.
    """
    class_count = len(mean)
    if not (len(cov) == len(num) == class_count):
        raise ValueError("mean, cov and num must have the same length")

    data = np.concatenate([
        np.random.multivariate_normal(mean[i], cov[i], (num[i],))
        for i in range(class_count)
    ])
    label = np.concatenate([
        np.full((num[i],), i, dtype=int) for i in range(class_count)
    ])

    test_data_num = len(data) // 5
    data, label = KNN().partition(data, label)

    parts = (
        data[test_data_num:],    # train_data
        label[test_data_num:],   # train_label
        data[:test_data_num],    # test_data
        label[:test_data_num],   # test_label
    )

    # The four arrays have different shapes.  Fill an object array
    # explicitly: recent NumPy refuses to build one implicitly from a
    # ragged tuple.
    bundle = np.empty(len(parts), dtype=object)
    for index, part in enumerate(parts):
        bundle[index] = part

    np.save(str(id) + 'data.npy', bundle)


def data_reader(id):
    """Load the four arrays written by ``data_maker`` for *id*.

    Returns an object array that unpacks as
    ``train_data, train_label, test_data, test_label``.
    """
    # NOTE: allow_pickle=True can execute code from the file; only load
    # files produced by data_maker or another trusted source.
    return np.load(str(id) + 'data.npy', allow_pickle=True)


def data_painter(id, data, label, name):
    """Scatter-plot the first two columns of *data*, one series per label.

    The figure is saved to ``./<id><name>`` and then shown.

    Raises:
        RuntimeError: if matplotlib is not installed.
    """
    if mp is None:
        raise RuntimeError("matplotlib is required for data_painter")

    # Group by label in first-appearance order so series order is stable.
    groups = {}
    for point, tag in zip(data, label):
        groups.setdefault(tag, []).append(point)

    for points in groups.values():
        points = np.array(points)
        mp.scatter(points[:, 0], points[:, 1])

    mp.savefig(f'./{str(id) + name}')
    mp.show()
    mp.close()


def main():
    """Fit and evaluate the classifier five times on stored data set 5."""
    mean = [(10, 10), (5, 15), (15, 5)]
    cov = [np.array([[10, 5], [5, 10]]), np.array([[10, 4], [4, 10]]), np.array([[10, 3], [3, 10]])]
    num = [800, 800, 800]
    dataset_id = 5
    # Uncomment to (re)generate the data file before the first run:
    # data_maker(dataset_id, mean, cov, num)
    train_data, train_label, test_data, test_label = data_reader(dataset_id)
    # Uncomment to save scatter plots of the two splits:
    # data_painter(dataset_id, train_data, train_label, 'train')
    # data_painter(dataset_id, test_data, test_label, 'test')
    model = KNN()
    for _ in range(5):
        model.fit(train_data, train_label)
        res = model.predict(test_data)
        print("acc =",np.mean(np.equal(res, test_label)))


if __name__ == "__main__":
    main()