diff --git a/assignment-1/submission/18307130213/README.md b/assignment-1/submission/18307130213/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1312179c838a324adb960c53606d9a76ed582a4f --- /dev/null +++ b/assignment-1/submission/18307130213/README.md @@ -0,0 +1,65 @@ +# 课程报告 + +## KNN类实现 + +KNN类的实现位于 [source.py](./source.py) 。 + +### 初始化 + +初始化时,我们给 `KNN` 类的 `private` 变量赋值 `None`,表示尚未进行训练,防止使用未训练模型进行测试。 + +### 训练 + +训练函数包括两部分:检查数据是否合法(数据维度是否匹配),以及将所有训练数据中的点保存下来。 + +我们会根据训练数据集的大小 `n` 和标签数 `l` 来决定超参 `K = min(n, floor(log2(n)), l + 1)` 。 + +从实验情况来看,这样的超参选择是合理的。 + +### 测试 + +测试函数同样包含两个部分:检查测试数据是否与训练数据维度相同,并给出对于所有点的标签预测。 + + + +## 数据生成与可视化 + +在给定参数 `N` 时,数据生成部分能够生成一套含 `N` 个位置不同、协方差矩阵随机的二维高斯分布的数据集。 + +其中 `80%` 会用于训练,剩下 `20%` 用于测试。 + +这是 `N=5` 时生成的训练集: + +![训练集](./img/exptrain.png) + +这是 `N=5` 时生成的测试集: + +![测试集](./img/exptest.png) + + + +## 效果评估 + +以下为在随机生成的数据上获得的一些准确度。当 `N` 过大时,由于生成的数据过于密集,效果有所下降。 + +| Algo | Acc | +| ----------- | ------------------ | +| KNN (N=2) | 0.9983193277310924 | +| KNN (N=3) | 0.9986807387862797 | +| KNN (N=5) | 0.9744360902255639 | +| KNN (N=10) | 0.868824531516184 | +| KNN (N=100) | 0.7205387205387206 | + + + +## 代码使用方法 + +以 `N=3` 为例: + +```bash +python source.py g 3 # 生成数据集 + +python source.py d 3 # 生成数据集的可视化结果(保存在img文件夹下) + +python source.py # 训练和测试 +``` \ No newline at end of file diff --git a/assignment-1/submission/18307130213/img/exptest.png b/assignment-1/submission/18307130213/img/exptest.png new file mode 100644 index 0000000000000000000000000000000000000000..1e95008faf1f147efa733242da45b730ca69e04b Binary files /dev/null and b/assignment-1/submission/18307130213/img/exptest.png differ diff --git a/assignment-1/submission/18307130213/img/exptrain.png b/assignment-1/submission/18307130213/img/exptrain.png new file mode 100644 index 0000000000000000000000000000000000000000..a2a24be956aea669a30fe4562563154da9020047 Binary files /dev/null and b/assignment-1/submission/18307130213/img/exptrain.png differ diff --git 
# File: assignment-1/submission/18307130213/source.py
"""K-nearest-neighbours classifier plus a small synthetic-data toolkit.

Command-line usage (``N`` is the number of Gaussian clusters)::

    python source.py g N   # generate a dataset and save it to data.npy
    python source.py d N   # plot the saved dataset into the img/ folder
    python source.py       # train on data.npy and print the test accuracy
"""
import math
import os
import random
import sys

import numpy as np


def _majority_label(labels):
    """Return the label that is first to reach the highest vote count.

    ``labels`` must be ordered from nearest to farthest neighbour, so that
    ties are resolved in favour of the label whose count got there first.
    """
    counts = {}
    best_label, best_count = -1, 0
    for label in labels:
        count = counts[label] = counts.get(label, 0) + 1
        if count > best_count:
            best_label, best_count = label, count
    return best_label


class KNN:
    """Brute-force k-nearest-neighbours classifier (Euclidean distance).

    Invalid input is reported by printing an ``error: ...`` message and
    returning ``None`` instead of raising, so callers must check the result
    of :meth:`predict` for ``None``.
    """

    def __init__(self):
        # Everything stays None until fit() succeeds; predict() uses
        # ``__k is None`` to detect an untrained model.
        self.__data = None
        self.__label = None
        self.__num = None
        self.__dim = None
        self.__k = None

    def fit(self, train_data, train_label):
        """Store the training set and choose ``k``.

        Args:
            train_data: ``np.ndarray`` of shape ``(num_samples, dim)``.
            train_label: ``np.ndarray`` of shape ``(num_samples,)``.

        ``k`` is ``min(n, floor(log2(n)), number_of_labels + 1)``, clamped
        to at least 1 so that a single-sample training set still votes.

        Returns:
            None. On invalid input an error is printed and the model is
            left unchanged.
        """
        if not isinstance(train_data, np.ndarray):
            print('error: wrong type of train_data')
            return
        if train_data.ndim != 2:
            print('error: wrong shape of train_data')
            return
        if not isinstance(train_label, np.ndarray):
            print('error: wrong type of train_label')
            return
        if train_label.ndim != 1:
            print('error: wrong shape of train_label')
            return
        num_data, dim_data = train_data.shape
        num_label, = train_label.shape
        if num_data != num_label:
            print('error: shape of train_data and train_label can not match')
            return
        if num_data < 1:
            print('error: less than 1 data')
            return

        label_k = len(np.unique(train_label))
        # bit_length() - 1 is an exact floor(log2(n)) for positive ints and
        # avoids the rounding error of math.log(n, 2).
        floor_log2 = num_data.bit_length() - 1

        self.__data = train_data
        self.__label = train_label
        self.__num = num_data
        self.__dim = dim_data
        self.__k = max(1, min(num_data, floor_log2, label_k + 1))

        print('finish: fit')
        return

    def predict(self, test_data):
        """Predict a label for every row of ``test_data``.

        Args:
            test_data: ``np.ndarray`` of shape ``(num_queries, dim)`` with
                the same ``dim`` as the training data.

        Returns:
            ``np.ndarray`` holding one predicted label per row, or ``None``
            (after printing an error) when the model is not fitted or the
            input is invalid.
        """
        if self.__k is None:
            print('error: not fit yet')
            return
        if not isinstance(test_data, np.ndarray):
            print('error: wrong type of test_data')
            return
        if test_data.ndim != 2:
            print('error: wrong shape of test_data')
            return

        if test_data.shape[1] != self.__dim:
            print('error: wrong dimention of test_data')
            return

        predictions = []
        for point in test_data:
            # One vectorised distance computation per query point.
            dists = np.linalg.norm(self.__data - point, axis=1)
            # A stable sort keeps equally distant neighbours in training
            # order, matching heapq.nsmallest's tie-breaking.
            nearest = np.argsort(dists, kind='stable')[:self.__k]
            predictions.append(_majority_label(self.__label[nearest]))

        return np.array(predictions)


def generate(n, train_ratio=0.8):
    """Generate ``n`` 2-D Gaussian clusters and save a train/test split.

    Cluster centres are spread evenly on a circle; each cluster gets a
    random covariance matrix and a random size between 200 and 1000
    samples. The shuffled samples are split into training and test parts
    and written to ``data.npy`` in the current directory.

    Args:
        n: Number of clusters (labels ``0 .. n-1``); must be positive.
        train_ratio: Fraction of samples used for training, strictly
            between 0 and 1.

    Returns:
        None. On invalid arguments an error is printed and nothing is
        written.
    """
    if n <= 0:
        print('error: n <= 0')
        return
    if not 0 < train_ratio < 1:
        print('error: train_ratio must be in (0, 1)')
        return

    radius = n / max(1, math.log2(n))
    sizes = []
    clusters = []
    for i in range(n):
        theta = i * (2 * math.pi / n)
        mean = (radius * math.cos(theta), radius * math.sin(theta))
        rand_mat = np.random.rand(2, 2)
        # R^T R (matrix product) is always symmetric positive
        # semi-definite; the element-wise product R^T * R is not.
        cov = rand_mat.T @ rand_mat
        size = random.randint(200, 1000)
        sizes.append(size)
        clusters.append(np.random.multivariate_normal(mean, cov, (size,)))

    total = sum(sizes)
    order = np.random.permutation(total)
    data = np.concatenate(clusters)[order]
    label = np.concatenate(
        [np.full((size,), j, dtype=int) for j, size in enumerate(sizes)]
    )[order]

    # Keep at least one sample on each side of the split.
    split = min(total - 1, max(1, int(total * train_ratio)))

    # The four arrays have different shapes, so they are stored in an
    # explicit object array; building it element by element avoids the
    # ragged-sequence error raised by np.array on recent NumPy versions.
    payload = np.empty((2, 2), dtype=object)
    payload[0, 0] = data[:split]
    payload[0, 1] = label[:split]
    payload[1, 0] = data[split:]
    payload[1, 1] = label[split:]
    np.save("data.npy", payload)


def read():
    """Load the dataset written by :func:`generate` from ``data.npy``.

    Returns:
        ``((train_data, train_label), (test_data, test_label))``.
    """
    # NOTE: allow_pickle=True executes pickled content while loading; only
    # use this on data.npy files you generated yourself.
    (train_data, train_label), (test_data, test_label) = np.load(
        "data.npy", allow_pickle=True
    )
    return (train_data, train_label), (test_data, test_label)


def genimg(n, data, label, name):
    """Scatter-plot ``data`` coloured by class and save it as ``img/<name>``.

    Args:
        n: Number of classes; labels ``0 .. n-1`` are drawn in that order
            so colours stay consistent between plots.
        data: Array of 2-D points, shape ``(num_samples, 2)``.
        label: Integer class label for every point.
        name: Output file name inside the ``img`` folder.
    """
    # Imported lazily so training and prediction work without matplotlib.
    import matplotlib.pyplot as plt

    data = np.asarray(data)
    label = np.asarray(label)
    os.makedirs('img', exist_ok=True)

    for cls in range(n):
        # A boolean mask yields an empty (0, 2) array for a class without
        # points, which scatter() accepts.
        points = data[label == cls]
        plt.scatter(points[:, 0], points[:, 1])
    plt.savefig(f'img/{name}')
    plt.close()


def _parse_cluster_count(argv):
    """Return ``int(argv[2])``, or ``None`` if it is missing or invalid."""
    try:
        return int(argv[2])
    except (IndexError, ValueError):
        return None


def main(argv=None):
    """Run the command-line interface described in the module docstring."""
    argv = sys.argv if argv is None else argv
    mode = argv[1] if len(argv) > 1 else None

    if mode == 'g':
        n = _parse_cluster_count(argv)
        if n is None:
            print('error: wrong n')
            return
        generate(n)
    elif mode == 'd':
        (train_data, train_label), (test_data, test_label) = read()
        n = _parse_cluster_count(argv)
        if n is None:
            print('somthing goes wrong!')
            return
        genimg(n, train_data, train_label, 'train')
        genimg(n, test_data, test_label, 'test')
    else:
        (train_data, train_label), (test_data, test_label) = read()
        model = KNN()
        model.fit(train_data, train_label)
        res = model.predict(test_data)
        if res is None:
            # fit() or predict() already printed the reason.
            return
        print("acc =", np.mean(np.equal(res, test_label)))


if __name__ == '__main__':
    main()