diff --git a/assignment-1/submission/18307130090/README.md b/assignment-1/submission/18307130090/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f124928fdebc2f38c8563755f3c6fb073f6102f
--- /dev/null
+++ b/assignment-1/submission/18307130090/README.md
@@ -0,0 +1,132 @@
+# PRML-2021 Assignment1
+
+Name: 夏海淞
+
+Student ID: 18307130090
+
+## Overview
+
+In this assignment I implemented the K-nearest-neighbor (KNN) algorithm, tested it on self-generated datasets, and compared its performance on different datasets against a random baseline. I also optimized the main performance bottleneck, with good results.
+
+## Algorithm
+
+### Dataset generation
+
+The `generate_data` function in `source.py` generates the dataset. It takes three array arguments, `mean`, `cov`, and `nums`, whose `i`-th elements give the mean, covariance, and sample count of the `i`-th class. `generate_data` draws the samples by calling `numpy.random.multivariate_normal`.
+
+The `save_plot` function in `source.py` draws a scatter plot of a dataset. It takes two `ndarray` arguments and one string argument: the sample points, the sample labels, and the figure name. `save_plot` renders the plot with `matplotlib` and saves it under the project's `img/` directory.
+
+As the assignment requires, the data is shuffled and then split into 80% training set and 20% test set.
+
+Scatter plots of the training and test sets are shown below:
+![train](img/readme/train.png)
+
+![test](img/readme/test.png)
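+For reference, a minimal way to generate and plot such a dataset with the functions above (the parameter values mirror the `__main__` block of `source.py`, and the import assumes `source.py` is importable):
+
+```python
+import numpy as np
+
+from source import generate_data, read, save_plot
+
+mean = np.array([[5, 5], [10, 15], [20, 5]])
+cov = np.array([[[34, 5], [5, 10]],
+                [[20, 5], [5, 24]],
+                [[30, 5], [5, 10]]])
+nums = np.array([1600, 400, 2000], dtype=int)
+
+generate_data(mean, cov, nums)  # writes data.npy with an 80/20 split
+train_data, train_label, test_data, test_label = read()
+save_plot(train_data, train_label, 'train')  # saved under ./img/
+save_plot(test_data, test_label, 'test')
+```
+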
+#### Dataset size
+
+Fixing the means and covariances of the dataset above and scaling the class sizes by a factor of $k$, the plot of `acc` against $k$ is shown below:
+
+![acc_nums](img/readme/acc_nums.png)
+
+As the figure shows, the size of the dataset has no obvious effect on the model's performance.
+
+Conjecture: this may be because the KNN model is simple enough that it reaches good performance even with a small sample size.
+
+#### Distance between classes
+
+Fixing the covariances and sizes of the dataset above and scaling the means by a factor of $k$ $(k\in[0.1,2])$, the plot of `acc` against $k$ is shown below:
+
+![acc_mean](img/readme/acc_mean.png)
+
+As the figure shows, the larger the means (i.e., the farther apart the classes), the better the model performs.
+
+Conjecture: this may be because the farther apart the classes are, the higher the proportion of training points around each test point that share its class, and hence the better the performance.
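+
+Both this sweep and the covariance sweep below can be driven through `train`'s `ratio` argument, which scales the means, covariances, and class sizes before the data is generated. A sketch, reusing `mean`, `cov`, and `nums` from the snippet above (the number of sweep points is arbitrary):
+
+```python
+import numpy as np
+
+from source import train
+
+ks = np.linspace(0.1, 2, 20)
+# ratio=(k, 1, 1) scales only the means; the covariance experiment
+# below is the same sweep with ratio=(1, k, 1)
+accs = [train(mean, cov, nums, generate=True, ratio=(k, 1, 1))[0] for k in ks]
+```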
+
+#### Dataset covariance
+
+Fixing the means and sizes of the dataset above and scaling the covariances by a factor of $k$ $(k\in[0.1,2])$, the plot of `acc` against $k$ is shown below:
+
+![acc_cov](img/readme/acc_cov.png)
+
+As the figure shows, the larger the covariances, the worse the model performs.
+
+Conjecture: the reason is similar to the class-distance case. As the covariances grow, the classes tend to "mix" together, so the proportion of same-class points around each test point keeps shrinking, and performance drops accordingly.
+
+#### Summary
+
+In summary, dataset size has little effect on the KNN model, while the distance between classes and the covariance have, respectively, a positive and a negative effect on KNN performance.
+
+## Performance optimization
+
+### Problem formulation
+
+The core operation of the KNN algorithm is: given $n$ points $x_1,x_2,\cdots,x_n$ and a query point $y$, return the $k$ points closest to $y$. The usual approach computes the distance from each $x_i$ $(i=1,2,\cdots,n)$ to $y$, sorts by distance, and takes the first $k$ points, at a time complexity of $O(n\log n)$.
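+
+A minimal sketch of this baseline (the helper name `k_nearest_sorted` is illustrative, not part of `source.py`; `get_predict_labels` does the same thing when `func='bf'`):
+
+```python
+import numpy as np
+
+
+def k_nearest_sorted(points, y, k):
+    # brute force: compute all n distances, sort them all, keep the first k
+    dist = np.linalg.norm(points - y, axis=1)
+    return np.argsort(dist)[:k]
+```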
+
+### Optimization 1
+
+Since $k$ is generally small ($k<25$ in this implementation), I tried using a heap to optimize this step. The algorithm is:
+
+1. Compute the distances from $x_1,\cdots,x_k$ to $y$ and insert them into a max-heap;
+2. For each remaining $x_i$ $(i=k+1,k+2,\cdots,n)$, compute its distance to $y$ and compare it with the heap top. If $dist(x_i,y)$ is smaller than the top, pop the top and insert $dist(x_i,y)$.
+
+When the algorithm terminates, the heap contains the $k$ smallest of the distances from the $x_i$ to $y$. The time complexity is $O(n\log k)$; since $k<25$, this is approximately $O(n)$.
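+
+A minimal sketch of this procedure (the helper `k_smallest_heap` is illustrative and assumes $1\le k\le n$; since `heapq` is a min-heap, distances are negated to simulate a max-heap):
+
+```python
+import heapq
+
+import numpy as np
+
+
+def k_smallest_heap(dist, k):
+    # max-heap (via negation) of the k smallest distances seen so far
+    heap = [(-dist[i], i) for i in range(k)]
+    heapq.heapify(heap)
+    for i in range(k, len(dist)):
+        # replace the current maximum whenever a smaller distance appears
+        if -dist[i] > heap[0][0]:
+            heapq.heapreplace(heap, (-dist[i], i))
+    # indices of the k nearest points, in no particular order
+    return [i for _, i in heap]
+```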
+
+In practice, however, the optimized version turned out to be slower than the original. A likely reason is that the brute-force approach calls `numpy.argsort`, which runs in C with a small constant factor, whereas the heap version loops over the elements one by one in Python through the `heapq` library, with a much larger per-element overhead; so it underperforms despite the better asymptotic bound.
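+
+One way to check this on a given machine, reusing the `k_smallest_heap` sketch above (array size and repeat count are arbitrary, and absolute timings will vary):
+
+```python
+import timeit
+
+import numpy as np
+
+dist = np.random.rand(100_000)
+print(timeit.timeit(lambda: np.argsort(dist)[:25], number=100))      # C-backed sort
+print(timeit.timeit(lambda: k_smallest_heap(dist, 25), number=100))  # Python-level loop
+```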
+
+### Optimization 2
+
+Later I found that `numpy` provides a method called `numpy.argpartition`. Analogous to quicksort's `partition` step, it takes an `ndarray` and an `int` $k$ and returns an index array whose first $k$ entries index the $k$ smallest elements (in no particular order). As with quickselect, the master theorem gives an expected time complexity of $O(n)$. Switching to `numpy.argpartition` to find the $k$ points closest to $y$ therefore improved the running speed.
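+
+A minimal self-contained comparison of the two calls (array size, seed, and $k$ are arbitrary):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+dist = rng.random(10_000)
+k = 25
+
+top_sort = np.argsort(dist)[:k]          # O(n log n): fully sorts all n distances
+top_part = np.argpartition(dist, k)[:k]  # O(n): only separates out the k smallest
+
+# both select the same k elements; argpartition just leaves them unordered
+assert set(top_sort) == set(top_part)
+```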
+
+A comparison of the running times of the `fit` method with `numpy.argsort` versus `numpy.argpartition`:
+
+![time](img/readme/time.png)
\ No newline at end of file
diff --git a/assignment-1/submission/18307130090/img/readme/acc_cov.png b/assignment-1/submission/18307130090/img/readme/acc_cov.png
new file mode 100644
index 0000000000000000000000000000000000000000..e187ecebdb9c294dfcbad886487b4b0f7fceb4f2
Binary files /dev/null and b/assignment-1/submission/18307130090/img/readme/acc_cov.png differ
diff --git a/assignment-1/submission/18307130090/img/readme/acc_mean.png b/assignment-1/submission/18307130090/img/readme/acc_mean.png
new file mode 100644
index 0000000000000000000000000000000000000000..8419fdf833238cfada421178870aa5e86d62923b
Binary files /dev/null and b/assignment-1/submission/18307130090/img/readme/acc_mean.png differ
diff --git a/assignment-1/submission/18307130090/img/readme/acc_nums.png b/assignment-1/submission/18307130090/img/readme/acc_nums.png
new file mode 100644
index 0000000000000000000000000000000000000000..d84bc984a7c161e0f9b764b4a267ee990d44785a
Binary files /dev/null and b/assignment-1/submission/18307130090/img/readme/acc_nums.png differ
diff --git a/assignment-1/submission/18307130090/img/readme/test.png b/assignment-1/submission/18307130090/img/readme/test.png
new file mode 100644
index 0000000000000000000000000000000000000000..893580fcb95deac02a084e15583e5cf2e95ad5d8
Binary files /dev/null and b/assignment-1/submission/18307130090/img/readme/test.png differ
diff --git a/assignment-1/submission/18307130090/img/readme/time.png b/assignment-1/submission/18307130090/img/readme/time.png
new file mode 100644
index 0000000000000000000000000000000000000000..f76d49f9fabe732cb07eb2a271db4a22b3ed83d0
Binary files /dev/null and b/assignment-1/submission/18307130090/img/readme/time.png differ
diff --git a/assignment-1/submission/18307130090/img/readme/train.png b/assignment-1/submission/18307130090/img/readme/train.png
new file mode 100644
index 0000000000000000000000000000000000000000..b0b19b6f518c4d8a79848be7aa76dbff4050f88e
Binary files /dev/null and b/assignment-1/submission/18307130090/img/readme/train.png differ
diff --git a/assignment-1/submission/18307130090/img/test.png b/assignment-1/submission/18307130090/img/test.png
new file mode 100644
index 0000000000000000000000000000000000000000..bb50d7cd57a3ccd2d5e87dfd3abb59cbea12d934
Binary files /dev/null and b/assignment-1/submission/18307130090/img/test.png differ
diff --git a/assignment-1/submission/18307130090/img/train.png b/assignment-1/submission/18307130090/img/train.png
new file mode 100644
index 0000000000000000000000000000000000000000..2f2105f7befcf80573111eb2b64f77dda866c295
Binary files /dev/null and b/assignment-1/submission/18307130090/img/train.png differ
diff --git a/assignment-1/submission/18307130090/source.py b/assignment-1/submission/18307130090/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..766de86932d9baf64d44cfc385daed350b70935e
--- /dev/null
+++ b/assignment-1/submission/18307130090/source.py
@@ -0,0 +1,160 @@
+import random
+import sys
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+class KNN:
+
+    def __init__(self):
+        self.train_data = None
+        self.train_labels = None
+        self.k = None
+
+    # func='bf' uses the O(n log n) sort-based algorithm;
+    # func='opt' uses the O(n) argpartition-based algorithm
+    def get_predict_labels(self, k, train_data, train_labels, valid_data, func='bf'):
+
+        # func may only take the value 'bf' or 'opt'
+        assert func in {'bf', 'opt'}
+
+        predict_labels = np.array([])
+        for valid_dot in valid_data:
+
+            # distance from valid_dot to every training point
+            dist = np.linalg.norm(train_data - valid_dot, axis=1)
+
+            # indices of the k training points closest to valid_dot
+            dist_index = np.argsort(dist)[:k] if func == 'bf' else np.argpartition(dist, k)[:k]
+
+            # find the most frequent label among the k neighbors
+            count_dict = {}
+            max_count = 0
+            for index in dist_index:
+                index = int(index)
+                train_label = train_labels[index]
+                count_dict[train_label] = count_dict.get(train_label, 0) + 1
+                max_count = max(max_count, count_dict[train_label])
+            # collect all labels tied for the highest count and pick one at random
+            predict_label = np.array([])
+            for train_label, count in count_dict.items():
+                if max_count != count:
+                    continue
+                predict_label = np.append(predict_label, train_label)
+            predict_labels = np.append(predict_labels, np.random.choice(predict_label))
+
+        return predict_labels
+
+    def fit(self, input_data, input_labels):
+        self.train_data = input_data
+        self.train_labels = input_labels
+
+        # shuffle the data
+        shuffled_data, shuffled_labels = shuffle(input_data, input_labels)
+
+        # split into a training set and a validation set
+        ratio, data_size = 0.2, shuffled_data.shape[0]
+        valid_size = int(data_size * ratio)
+        train_size = data_size - valid_size
+        valid_data, valid_labels = shuffled_data[:valid_size], shuffled_labels[:valid_size]
+        train_data, train_labels = shuffled_data[valid_size:], shuffled_labels[valid_size:]
+
+        # try k = 1 .. k_size-1 and keep the value with the best validation accuracy
+        k_size = min(25, train_size)
+        max_acc, best_k = -1, 0
+        for k in range(1, k_size):
+            predict_labels = self.get_predict_labels(k, train_data, train_labels, valid_data, func='opt')
+            acc = np.mean(np.equal(predict_labels, valid_labels))
+            # print(f'k={k} acc={acc}')
+            if acc > max_acc:
+                max_acc = acc
+                best_k = k
+        print(f'k={best_k} train_acc={max_acc}')
+        self.k = best_k
+
+    def predict(self, test_data):
+        return self.get_predict_labels(self.k, self.train_data, self.train_labels, test_data, func='opt')
+
+
+def generate_data(mean, cov, nums):
+    n = len(mean)
+    assert n == len(cov) and n == len(nums)
+    data = np.concatenate([np.random.multivariate_normal(mean[i], cov[i], int(nums[i])) for i in range(n)])
+    labels = np.concatenate([np.ones(int(nums[i]), dtype=int) * i for i in range(n)])
+
+    data, labels = shuffle(data, labels)
+
+    # hold out 20% as the test set and save all four arrays to disk
+    ratio, data_size = 0.2, len(data)
+    test_size = int(ratio * data_size)
+    test_data, test_label = data[:test_size], labels[:test_size]
+    train_data, train_label = data[test_size:], labels[test_size:]
+    np.save('data.npy', (train_data, train_label, test_data, test_label))
+
+
+def shuffle(data, labels):
+    data_size = len(data)
+    assert data_size == len(labels)
+
+    indices = np.random.permutation(data_size)
+    return data[indices], labels[indices]
+
+
+def save_plot(data, labels, name):
+    data_size = len(data)
+    assert data_size == len(labels)
+    # group the points by label
+    total = {}
+    for i in range(data_size):
+        label = labels[i]
+        if label not in total:
+            total[label] = []
+        total[label].append(data[i])
+    for category in total.values():
+        if not category:
+            continue
+        category = np.array(category)
+        plt.scatter(category[:, 0], category[:, 1])
+    plt.title(name)
+    plt.savefig(f'./img/{name}')
+    plt.close()
+
+
+def read():
+    return np.load('data.npy', allow_pickle=True)
+
+
+def generate_control(nums, length):
+    # control group: draw random labels with probabilities proportional to class sizes
+    n = len(nums)
+    labels = list(range(n))
+    return random.choices(labels, nums, k=length)
+
+
+def train(mean, cov, nums, generate, ratio=(1, 1, 1)):
+    # ratio scales the means, covariances, and class sizes for the parameter sweeps
+    if generate:
+        generate_data(mean * ratio[0], cov * ratio[1], nums * ratio[2])
+    train_data, train_label, test_data, test_label = read()
+    save_plot(train_data, train_label, 'train')
+    save_plot(test_data, test_label, 'test')
+    knn = KNN()
+    start_time = time.time()
+    knn.fit(train_data, train_label)
+    end_time = time.time()
+    training_time = end_time - start_time
+    # print(f'training time={training_time} s')
+    ans = knn.predict(test_data)
+    control_group = generate_control(nums, len(test_label))
+    test_acc = np.mean(np.equal(ans, test_label))
+    control = np.mean(np.equal(control_group, test_label))
+    return test_acc, control
+
+
+if __name__ == '__main__':
+    nums = np.array([1600, 400, 2000], dtype=int)
+    mean = np.array([[5, 5], [10, 15], [20, 5]])
+    cov = np.array([
+        [[34, 5], [5, 10]],
+        [[20, 5], [5, 24]],
+        [[30, 5], [5, 10]]
+    ])
+    # pass 'g' on the command line to regenerate the dataset
+    generate = len(sys.argv) > 1 and sys.argv[1] == 'g'
+    acc, control = train(mean, cov, nums, generate)
+    print(f'acc={acc} control={control}')