diff --git a/assignment-2/submission/19307110020/README.md b/assignment-2/submission/19307110020/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d887e343b664bf6c91ab616fa27361d7d030b605
--- /dev/null
+++ b/assignment-2/submission/19307110020/README.md
@@ -0,0 +1,282 @@
# HW2: A Feed-Forward Network Implemented in numpy

## 19307110020 贺正夫

### Overview

This assignment covers the following work:

- Derived the backward-propagation formulas and used numpy to implement a network with the structure below;

```python
x = x.reshape(-1, 28 * 28)
x = torch.relu(torch.matmul(x, self.W1))
x = torch.relu(torch.matmul(x, self.W2))
x = torch.matmul(x, self.W3)
x = torch.softmax(x, 1)
x = torch.log(x)
```

- Implemented a mini-batch function, the numpy counterpart of torch's DataLoader;
- Implemented Kaiming-uniform parameter initialization and surveyed the initialization strategies provided by torch and the scenarios they suit;
- Implemented Momentum and Adam optimizers, which give better results than the plain update.

### Deriving backward propagation

*This section derives the forward and backward formulas for the four operators used in the model. Since the forward passes of the other operators are straightforward, only a numerical trick used in softmax is discussed for the forward direction.*

#### Matmul

Write the matrix multiplication as Y = XW, with shapes (N × C) = (N × D)(D × C), so that
$$
Y_{i,j} = \sum_{k=1}^D X_{i,k}W_{k,j},
$$
which is just the element-wise expansion of the matrix product.

An entry of Y depends only on the corresponding row of X, so the partial derivative vanishes whenever the row indices differ, and otherwise
$$
\frac{\partial Y_{i,j}}{\partial X_{i,k}}=W_{k,j}.
$$
Writing G for the upstream gradient $\partial L/\partial Y$, the gradient of the weights is
$$
Grad(W)_{i,j} = \frac {\partial L}{\partial W_{i,j}} = \sum_{p\le N,\ q\le C}\frac{\partial L}{\partial{Y_{p,q}}}\frac{\partial{Y_{p,q}}}{\partial W_{i,j}}=\sum_{p\le N}G_{p,j}\frac{\partial{Y_{p,j}}}{\partial W_{i,j}}=\sum_{p\le N}X^T_{i,p}G_{p,j}=(X^TG)_{i,j},
$$
i.e. $Grad(W)=X^TG$, and by the same argument
$$
Grad(X)=G\,W^T.
$$

The backward pass is implemented as:

```python
grad_W = np.matmul(self.memory['x'].T, grad_y)
grad_x = np.matmul(grad_y, self.memory['W'].T)
return grad_x, grad_W
```

#### ReLU

ReLU acts element-wise, so the gradient also flows back element-wise: wherever the pre-activation value is positive the upstream gradient passes through unchanged, and elsewhere it is zeroed out.

It therefore suffices to mask the gradient using the x stored in memory:

```python
x = self.memory['x']
return grad_y * np.where(x > 0, 1, 0)
```

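As a sanity check on the backward rules derived above, one can compare them against a finite-difference approximation. The following is a minimal sketch (not part of the submitted code), assuming `numpy_fnn.py` is importable; the surrogate loss $L=\sum \text{grad\_y}\odot Y$ is chosen so that $\partial L/\partial Y$ equals the fixed upstream gradient.

```python
# Minimal finite-difference check of the Matmul backward rule.
# Assumes numpy_fnn.py (from this submission) is on the path.
import numpy as np
from numpy_fnn import Matmul

np.random.seed(0)
x, W = np.random.randn(4, 3), np.random.randn(3, 5)
grad_y = np.random.randn(4, 5)          # fixed upstream gradient dL/dY

op = Matmul()
op.forward(x, W)
grad_x, grad_W = op.backward(grad_y)

eps = 1e-6
num_grad_W = np.zeros_like(W)
for i in range(W.shape[0]):
    for j in range(W.shape[1]):
        W_pos, W_neg = W.copy(), W.copy()
        W_pos[i, j] += eps
        W_neg[i, j] -= eps
        # surrogate loss L = sum(grad_y * (X @ W)), so dL/dY = grad_y
        num_grad_W[i, j] = ((np.matmul(x, W_pos) * grad_y).sum()
                            - (np.matmul(x, W_neg) * grad_y).sum()) / (2 * eps)

print(np.abs(grad_W - num_grad_W).max())  # should be at most around 1e-9
```

The same pattern works for the other operators, as long as the perturbed points stay away from the ReLU kink at zero.
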
#### Softmax

##### Forward pass

The forward pass uses a "plus C" trick for numerical stability:

```python
self.memory['x'] = x
x = np.array(x)  # suit for torch
N = x.shape[0]
x = x - np.array([max(i) for i in x]).reshape(N, 1)  # plus C trick
x = np.exp(x)
out = x / np.sum(x, axis=1).reshape(N, 1)
self.memory['softmax'] = out
return out
```

The line marked `plus C trick` subtracts the maximum of each row from that row, which guards against overflow in the exponential. In exact arithmetic this does not change the softmax at all: for a C-dimensional vector z and any constant D,
$$
softmax(z_i)=\frac{e^{z_i-D}}{\sum_{c=1}^Ce^{z_c-D}},
$$
which merely multiplies numerator and denominator by $e^{-D}$. It does not affect the backward pass either, so the trick is pure gain.

##### Backward pass

For one-dimensional vectors x and y = softmax(x),
$$
\left\{
\begin{array}{lr}
\frac{\mathrm{d} y_j}{\mathrm{d}x_i}=y_j-(y_j)^2, &i = j \\
\frac{\mathrm{d} y_j}{\mathrm{d}x_i}=-y_i·y_j, & i \ne j\\
\end{array}
\right.
$$
To unify the two cases, use the fact that a diagonal matrix contributes only when the two indices coincide, and define
$$
D(y)=Diag(y)-y^T·y,
$$
so that
$$
\frac{\partial y_j}{\partial x_i}=D(y)_{i,j},
$$
and therefore
$$
Grad(x)=Grad(y)·D(y).
$$

The backward pass is implemented as:

```python
c = grad_y.shape[1]
softmax = self.memory['softmax']
tem = []
for i in softmax:
    tem.append(np.diag(i) - np.outer(i, i))
return np.matmul(grad_y.reshape(-1, 1, c), np.array(tem)).reshape(-1, c)
```

#### Log

Both directions of this operator are simple; the backward result is
$$
Grad(x)=Grad(y)\odot \frac{1}{x},
$$
i.e. the upstream gradient multiplied element-wise by the reciprocal of x, which follows directly from the derivative of log:

```python
x = self.memory['x']
return (1 / (x + self.epsilon)) * grad_y
```

### Mini-batch: a DataLoader in numpy

The code is as follows:

```python
def mini_batch(dataset, batch_size=128, numpy=False, shuffle=True):
    if not numpy:
        return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    import random
    datas = [(i[0].numpy(), i[1]) for i in dataset]
    if shuffle:
        random.shuffle(datas)
    data = [np.array([d[0] for d in datas[i: i + batch_size]]) for i in range(0, len(dataset), batch_size)]
    label = [np.array([d[1] for d in datas[i: i + batch_size]]) for i in range(0, len(dataset), batch_size)]
    return data, label
```

An extra `shuffle` argument controls whether the dataset is shuffled before batching. The implementation is nothing more than slicing and list comprehensions; Python makes this very convenient.

### Initialization schemes in torch

In this section, in denotes the number of input units of a layer and out the number of output units.

- Xavier uniform initialization draws from U(−a, a) with $a = gain \sqrt{\frac{6}{in+out}}$;
- Xavier normal initialization draws from a zero-mean normal distribution with standard deviation $gain \sqrt{\frac{2}{in+out}}$.

The gain is set according to the type of activation function.

Xavier initialization does not work well with ReLU layers, mainly because ReLU maps all negative inputs to zero and therefore changes the variance of the activations; Kaiming He proposed a correction for this.

- Kaiming uniform initialization draws from U(−bound, bound) with $bound = \sqrt{\frac{6}{(1+a^2)\,in}}$, where the hyperparameter a is the negative slope of the leaky ReLU;
- Kaiming normal initialization draws from a zero-mean normal distribution with standard deviation $\sqrt{\frac{2}{(1+a^2)\,in}}$.

- The remaining initialization strategies (constant initialization, uniform initialization over a user-specified range, normal initialization with a user-specified variance, and so on) are straightforward and not discussed further here.

The model in this report uses Kaiming uniform initialization, since it suits non-saturating activations such as ReLU; for saturating activations such as tanh or sigmoid, Xavier initialization is generally the recommended choice instead. The implementation uses lambda functions:

```python
ran = lambda x: (6 / x) ** 0.5
Kaiming = lambda x, y: np.random.uniform(low=-ran(x), high=ran(x), size=(x, y))
W1 = Kaiming(28 * 28, 256)
W2 = Kaiming(256, 64)
W3 = Kaiming(64, 10)
return W1, W2, W3
```

Since the model does not use leaky ReLU, the slope hyperparameter can be dropped from the Kaiming formula by treating ReLU as a leaky ReLU with negative slope 0.

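As a quick illustration of why this scaling matters (not part of the submitted code), the sketch below pushes a random batch through the three ReLU layers and compares the activation scale under plain standard-normal weights, as in `NumpyModel.__init__`, with the Kaiming uniform weights defined above.

```python
# Illustrative sketch: activation scale after each Linear + ReLU layer under
# standard-normal weights versus the Kaiming uniform weights defined above.
import numpy as np

ran = lambda fan_in: (6 / fan_in) ** 0.5
kaiming = lambda fan_in, fan_out: np.random.uniform(-ran(fan_in), ran(fan_in), size=(fan_in, fan_out))
normal = lambda fan_in, fan_out: np.random.normal(size=(fan_in, fan_out))

np.random.seed(0)
x = np.random.randn(128, 28 * 28)

for name, init in [('normal(0, 1)', normal), ('kaiming uniform', kaiming)]:
    h = x
    for fan_in, fan_out in [(28 * 28, 256), (256, 64), (64, 10)]:
        h = np.maximum(np.matmul(h, init(fan_in, fan_out)), 0)  # Linear + ReLU
        print('{}: std after {} units = {:.3f}'.format(name, fan_out, h.std()))
```

With standard-normal weights the scale blows up by roughly $\sqrt{in/2}$ per layer, while the Kaiming weights keep it near 1, which is exactly the variance argument above.
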
### Adam and Momentum optimizers

The model implements two optimization methods. Both are conceptually simple: in short, they use the history of past updates to adjust each step, so the derivations are not repeated here and the code is given directly.

Initialization of both optimizers with their default parameters:

```python
self.adam = {'lr': 0.001, 't': 0, 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-8,
             'm1': 0, 'm2': 0, 'm3': 0, 'v1': 0, 'v2': 0, 'v3': 0}
self.pre_grad1, self.pre_grad2, self.pre_grad3 = 0.0, 0.0, 0.0  # for Momentum
```

The update step itself:

```python
def optimize(self, learning_rate, optimizer='Adam'):
    if optimizer == 'Adam':
        self.adam['lr'] = learning_rate
        self.adam['t'] += 1
        lr = self.adam['lr'] * (1 - self.adam['beta2'] ** self.adam['t']) ** 0.5 / (
                1 - self.adam['beta1'] ** self.adam['t'])
        self.adam['m1'] = self.adam['beta1'] * self.adam['m1'] + (1 - self.adam['beta1']) * self.W1_grad
        self.adam['v1'] = self.adam['beta2'] * self.adam['v1'] + (1 - self.adam['beta2']) * (self.W1_grad * self.W1_grad)
        self.W1 -= lr * self.adam['m1'] / (self.adam['v1'] ** 0.5 + self.adam['epsilon'])
        self.adam['m2'] = self.adam['beta1'] * self.adam['m2'] + (1 - self.adam['beta1']) * self.W2_grad
        self.adam['v2'] = self.adam['beta2'] * self.adam['v2'] + (1 - self.adam['beta2']) * (self.W2_grad * self.W2_grad)
        self.W2 -= lr * self.adam['m2'] / (self.adam['v2'] ** 0.5 + self.adam['epsilon'])
        self.adam['m3'] = self.adam['beta1'] * self.adam['m3'] + (1 - self.adam['beta1']) * self.W3_grad
        self.adam['v3'] = self.adam['beta2'] * self.adam['v3'] + (1 - self.adam['beta2']) * (self.W3_grad * self.W3_grad)
        self.W3 -= lr * self.adam['m3'] / (self.adam['v3'] ** 0.5 + self.adam['epsilon'])
    else:  # momentum optimization
        step, discount = 0.2, 0.7
        self.pre_grad1 = self.pre_grad1 * discount + self.W1_grad * step
        self.W1 -= self.pre_grad1 * step
        self.pre_grad2 = self.pre_grad2 * discount + self.W2_grad * step
        self.W2 -= self.pre_grad2 * step
        self.pre_grad3 = self.pre_grad3 * discount + self.W3_grad * step
        self.W3 -= self.pre_grad3 * step
```

Both optimizers were adapted in small ways from reference implementations so that they fit this model, and both converge noticeably better than the plain update originally provided in the assignment files.

Amusingly, when the Adam learning rate was accidentally left at 0.1 instead of 0.001, the model tested only slightly better than random and the loss never moved, which traces back to overflow in the exponential. Adam's learning rate is not easy to tune here: 0.001 is already too large and makes the loss oscillate heavily, and 0.0003 finally gave good results.

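One line worth unpacking is the effective step size `lr` computed at the top of the Adam branch: instead of forming the bias-corrected moments explicitly, the correction factors are folded into the step size. Up to where $\epsilon$ enters, this matches the standard Adam update (a restatement for reference, not additional code):
$$
\hat m_t=\frac{m_t}{1-\beta_1^t},\qquad
\hat v_t=\frac{v_t}{1-\beta_2^t},\qquad
W \leftarrow W-\eta\,\frac{\hat m_t}{\sqrt{\hat v_t}+\epsilon}
\approx W-\underbrace{\eta\,\frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t}}_{\text{lr in the code}}\cdot\frac{m_t}{\sqrt{v_t}+\epsilon}.
$$
The only approximation is the placement of $\epsilon$; with $\epsilon=10^{-8}$ the difference is negligible, and passing `learning_rate = 0.0003` corresponds to $\eta=0.0003$ in the usual formulation.
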
### Experiments

The model gets a full score in tester_demo, and on numpy_mnist it reaches about 95.5% accuracy after 1 epoch and about 98% after 10 epochs.

Ablation results:

| First-epoch average loss | Numpy init | Torch init |
| ------------------------ | ---------- | ---------- |
| Test 0 | 0.2591 | 0.2469 |
| Test 1 | 0.1836 | 0.2759 |
| Test 2 | 0.2350 | 0.2836 |

The Kaiming initialization implemented here gives a lower first-epoch loss in two of the three runs, so it compares favourably with the torch initialization.

| Optimizer, epoch = 10 | Adam (lr = 0.0003) | Momentum (lr = 0.001) | Default update (lr = 0.1) |
| --------------------- | ------------------ | --------------------- | ------------------------- |
| Final Accuracy Test 0 | 0.9811 | 0.9816 | 0.9779 |
| Final Accuracy Test 1 | 0.9804 | 0.9803 | 0.9793 |

Loss curve with Adam:

![Figure_1](img/Figure_1.png)

Loss curve with Momentum:

![Figure_2](img/Figure_2.png)

Loss curve with the default update:

![Figure_3](img/Figure_3.png)

The loss curves do not show Adam being much stronger; Momentum clearly has the smoothest curve, and judged by final accuracy after 10 epochs both Adam and Momentum give a small improvement over the default update.

To explain why Adam performs about the same as the plain baseline here, some further reading was done. Adam is meant to be a hands-off optimizer: it escapes saddle points easily and needs little manual learning-rate scheduling, but it tends to keep oscillating around local minima, and on certain datasets its effective step size can suddenly spike and prevent convergence; in a sense it inherits both the strengths and the weaknesses of the methods it combines. The oscillation here may also be because the beta values chosen when initializing Adam make its step size decay slowly, so on a small dataset like MNIST it keeps oscillating even after it has essentially converged. Momentum, by contrast, decays its accumulated gradient geometrically with factor 0.7, which damps the oscillation quickly.

I later found that tuning Adam's hyperparameters did not really fix this; optimization is not my field, so I left it at that.
\ No newline at end of file
diff --git a/assignment-2/submission/19307110020/img/Figure_1.png b/assignment-2/submission/19307110020/img/Figure_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..d0877fb04eddded1955de63996db7c589d8ac5fa
Binary files /dev/null and b/assignment-2/submission/19307110020/img/Figure_1.png differ
diff --git a/assignment-2/submission/19307110020/img/Figure_2.png b/assignment-2/submission/19307110020/img/Figure_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..abcee7f5dc8032a38a45023fa9935ba5b3299b03
Binary files /dev/null and b/assignment-2/submission/19307110020/img/Figure_2.png differ
diff --git a/assignment-2/submission/19307110020/img/Figure_3.png b/assignment-2/submission/19307110020/img/Figure_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..c4b43e1dfd59ea375650bcf50c36fd859c131c6d
Binary files /dev/null and b/assignment-2/submission/19307110020/img/Figure_3.png differ
diff --git a/assignment-2/submission/19307110020/numpy_fnn.py b/assignment-2/submission/19307110020/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe98f61c89fc0f3496fbdcefeeea3cd47ff813ab
--- /dev/null
+++ b/assignment-2/submission/19307110020/numpy_fnn.py
@@ -0,0 +1,207 @@
import numpy as np


class NumpyOp:
    def __init__(self):
        self.memory = {}
        self.epsilon = 1e-12


class Matmul(NumpyOp):

    def forward(self, x, W):
        """
        x: shape(N, d)
        w: shape(d, d')
        """
        self.memory['x'] = x
        self.memory['W'] = W
        h = np.matmul(x, W)
        return h

    def backward(self, grad_y):
        """
        grad_y: shape(N, d')
        """

        ####################
        #      code 1      #
        ####################
        grad_W = np.matmul(self.memory['x'].T, grad_y)
        grad_x = np.matmul(grad_y, self.memory['W'].T)
        return grad_x, grad_W


class Relu(NumpyOp):

    def forward(self, x):
        self.memory['x'] = x
        return np.where(x > 0, x, np.zeros_like(x))

    def backward(self, grad_y):
        """
        grad_y: same shape as x
        """

        ####################
        #      code 2      #
        ####################
        x = self.memory['x']
        return grad_y * np.where(x > 0, 1, 0)


class Log(NumpyOp):

    def forward(self, x):
        """
        x: shape(N, c)
        """
        out = np.log(x + self.epsilon)
        self.memory['x'] = x
        return out

    def backward(self, grad_y):
        """
        grad_y: same shape as x
        """

        ####################
        #      code 3      #
        ####################
        x = self.memory['x']
        return (1 / (x + self.epsilon)) * grad_y


class Softmax(NumpyOp):
    """
    softmax over last dimension
    """

    def forward(self, x):
        """
        x: shape(N, c)
        """

        ####################
        #      code 4      #
        ####################
        self.memory['x'] = x
        x = np.array(x)  # suit for torch
        N = x.shape[0]
        x = x - np.array([max(i) for i in x]).reshape(N, 1)  # A plus C trick
        x = np.exp(x)
        out = x / np.sum(x, axis=1).reshape(N, 1)
        self.memory['softmax'] = out
        return out

    def backward(self, grad_y):
        """
        grad_y: same shape as x
        """

        ####################
        #      code 5      #
        ####################
        c = grad_y.shape[1]
        softmax = self.memory['softmax']
        tem = []
        for i in softmax:
            tem.append(np.diag(i) - np.outer(i, i))
        return np.matmul(grad_y.reshape(-1, 1, c), np.array(tem)).reshape(-1, c)


class NumpyLoss:
    def __init__(self):
        self.target = None

    def get_loss(self, pred, target):
        self.target = target
        return (-pred * target).sum(axis=1).mean()

    def backward(self):
        return - self.target / self.target.shape[0]


class NumpyModel:
    def __init__(self):
        self.W1 = np.random.normal(size=(28 * 28, 256))
        self.W2 = np.random.normal(size=(256, 64))
        self.W3 = np.random.normal(size=(64, 10))

        # operators used in forward and backward
        self.matmul_1 = Matmul()
        self.relu_1 = Relu()
        self.matmul_2 = Matmul()
        self.relu_2 = Relu()
        self.matmul_3 = Matmul()
        self.softmax = Softmax()
        self.log = Log()

        # gradients updated in backward
        self.x1_grad, self.W1_grad = None, None
        self.relu_1_grad = None
        self.x2_grad, self.W2_grad = None, None
        self.relu_2_grad = None
        self.x3_grad, self.W3_grad = None, None
        self.softmax_grad = None
        self.log_grad = None

        self.adam = {'lr': 0.001, 't': 0, 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-8,
                     'm1': 0, 'm2': 0, 'm3': 0, 'v1': 0, 'v2': 0, 'v3': 0}
        self.pre_grad1, self.pre_grad2, self.pre_grad3 = 0.0, 0.0, 0.0  # for Momentum

    def forward(self, x):
        x = x.reshape(-1, 28 * 28)
        ####################
        #      code 6      #
        ####################
        x = self.matmul_1.forward(x, self.W1)  # 256
        x = self.relu_1.forward(x)
        x = self.matmul_2.forward(x, self.W2)  # 64
        x = self.relu_2.forward(x)
        x = self.matmul_3.forward(x, self.W3)  # 10
        x = self.softmax.forward(x)
        x = self.log.forward(x)
        return x

    def backward(self, y):
        ####################
        #      code 7      #
        ####################
        self.log_grad = self.log.backward(y)
        self.softmax_grad = self.softmax.backward(self.log_grad)
        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)

    def optimize(self, learning_rate, optimizer='Adam'):
        if optimizer == 'Adam':
            self.adam['lr'] = learning_rate
            self.adam['t'] += 1
            lr = self.adam['lr'] * (1 - self.adam['beta2'] ** self.adam['t']) ** 0.5 / (
                    1 - self.adam['beta1'] ** self.adam['t'])
            self.adam['m1'] = self.adam['beta1'] * self.adam['m1'] + (1 - self.adam['beta1']) * self.W1_grad
            self.adam['v1'] = self.adam['beta2'] * self.adam['v1'] + (1 - self.adam['beta2']) * (
                    self.W1_grad * self.W1_grad)
            self.W1 -= lr * self.adam['m1'] / (self.adam['v1'] ** 0.5 + self.adam['epsilon'])
            self.adam['m2'] = self.adam['beta1'] * self.adam['m2'] + (1 - self.adam['beta1']) * self.W2_grad
            self.adam['v2'] = self.adam['beta2'] * self.adam['v2'] + (1 - self.adam['beta2']) * (
                    self.W2_grad * self.W2_grad)
            self.W2 -= lr * self.adam['m2'] / (self.adam['v2'] ** 0.5 + self.adam['epsilon'])
            self.adam['m3'] = self.adam['beta1'] * self.adam['m3'] + (1 - self.adam['beta1']) * self.W3_grad
            self.adam['v3'] = self.adam['beta2'] * self.adam['v3'] + (1 - self.adam['beta2']) * (
                    self.W3_grad * self.W3_grad)
            self.W3 -= lr * self.adam['m3'] / (self.adam['v3'] ** 0.5 + self.adam['epsilon'])
        else:  # momentum optimization
            step, discount = 0.2, 0.7
            self.pre_grad1 = self.pre_grad1 * discount + self.W1_grad * step
            self.W1 -= self.pre_grad1 * step
            self.pre_grad2 = self.pre_grad2 * discount + self.W2_grad * step
            self.W2 -= self.pre_grad2 * step
            self.pre_grad3 = self.pre_grad3 * discount + self.W3_grad * step
            self.W3 -= self.pre_grad3 * step
\ No newline at end of file
diff --git a/assignment-2/submission/19307110020/numpy_mnist.py b/assignment-2/submission/19307110020/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..5108516579af49fd53859ecaaa7faa2a2bbd28ad
--- /dev/null
+++ b/assignment-2/submission/19307110020/numpy_mnist.py
@@ -0,0 +1,61 @@
import numpy as np
import torch
from numpy_fnn import NumpyModel, NumpyLoss
from utils import download_mnist, batch, plot_curve, one_hot


def mini_batch(dataset, batch_size=128, numpy=False, shuffle=True):
    if not numpy:
        return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    import random
    # keep (image, label) pairs together so shuffling does not break the pairing
    datas = [(i[0].numpy(), i[1]) for i in dataset]
    if shuffle:
        random.shuffle(datas)
    data = [np.array([d[0] for d in datas[i: i + batch_size]]) for i in range(0, len(dataset), batch_size)]
    label = [np.array([d[1] for d in datas[i: i + batch_size]]) for i in range(0, len(dataset), batch_size)]
    return data, label


def get_torch_initialization(numpy=True):
    if numpy:
        ran = lambda x: (6 / x) ** 0.5
        Kaiming = lambda x, y: np.random.uniform(low=-ran(x), high=ran(x), size=(x, y))
        W1 = Kaiming(28 * 28, 256)
        W2 = Kaiming(256, 64)
        W3 = Kaiming(64, 10)
    else:
        fc1 = torch.nn.Linear(28 * 28, 256)
        fc2 = torch.nn.Linear(256, 64)
        fc3 = torch.nn.Linear(64, 10)
        W1 = fc1.weight.T.detach().clone().data
        W2 = fc2.weight.T.detach().clone().data
        W3 = fc3.weight.T.detach().clone().data
    return W1, W2, W3


def numpy_run():
    train_dataset, test_dataset = download_mnist()
    model = NumpyModel()
    numpy_loss = NumpyLoss()
    model.W1, model.W2, model.W3 = get_torch_initialization()

    train_loss = []

    epoch_number = 10
    learning_rate = 0.0003

    for epoch in range(epoch_number):
        for x, y in mini_batch(train_dataset):
            y = one_hot(y)
            y_pred = model.forward(x.numpy())
            loss = numpy_loss.get_loss(y_pred, y)
            model.backward(numpy_loss.backward())
            model.optimize(learning_rate)
            train_loss.append(loss.item())
        x, y = batch(test_dataset)[0]
        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))

    plot_curve(train_loss)


if __name__ == "__main__":
    numpy_run()
\ No newline at end of file