diff --git a/assignment-2/submission/18307130213/README.md b/assignment-2/submission/18307130213/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..44314b9df69abb8780c981955900b82f26b376e5
--- /dev/null
+++ b/assignment-2/submission/18307130213/README.md
@@ -0,0 +1,100 @@
+# Course Report
+
+## The NumpyModel Class
+
+The `NumpyModel` class is implemented in [numpy_fnn.py](./numpy_fnn.py).
+
+The implementation covers:
+
+1. The basic operator classes `Matmul`, `Relu`, `Log`, and `Softmax`, each supporting forward and backward propagation.
+2. The `forward` and `backward` methods of `NumpyModel`.
+
+## Model Training and Testing
+
+The model applies the new, non-PyTorch initialization described below, which already reaches better accuracy in the first epoch.
+
+Over the three epochs of a single run, accuracy fluctuates around 95.7%, 96.6%, and 97.2% respectively. One run produced:
+
+```
+[0] Accuracy: 0.9550
+[1] Accuracy: 0.9651
+[2] Accuracy: 0.9723
+```
+
+The corresponding loss curve:
+
+![](./img/numpy_minist_result.jpg)
+
+As training proceeds, the loss gradually converges to a small value.
+
+## Data Processing and Parameter Initialization
+
+The `mini_batch` and `get_torch_initialization` functions are implemented on top of NumPy alone in [numpy_mnist.py](./numpy_mnist.py).
+
+`get_torch_initialization` uses the Kaiming initialization proposed by **Kaiming He**, which is also PyTorch's default initialization for linear layers.
+
+Two considerations motivate this choice:
+
+1. If the initial weights are too small in magnitude, the signal decays layer by layer and the activations stay in their near-linear region.
+2. If the initial weights are too large in magnitude, the signal is amplified layer by layer and the activations saturate, which can lead to vanishing gradients.
+
+Kaiming initialization draws weights at an intermediate scale, which noticeably improves training.
+
+### The Kaiming Initialization Formula
+
+Compared with other schemes, Kaiming initialization performs better when the network uses `relu` or `leaky_relu`.
+
+Let `a` be the slope of the negative region of `leaky_relu`, preferably with $a < 1$; for `relu`, clearly $a = 0$.
+
+Kaiming initialization draws the parameter matrix from a uniform distribution `U(-bound, bound)`, where
+$$
+bound = \sqrt{\frac{6}{(1 + a^2) \times fan\_in}}
+$$
+and `fan_in` is the number of input connections of the layer.
+
+See `get_torch_initialization` for the concrete implementation.
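+
+As a concrete check (a worked example added here, not part of the original derivation): for the first layer of this network, `fan_in` $= 28 \times 28 = 784$ and $a = 0$ because `relu` is used, so
+$$
+bound = \sqrt{\frac{6}{784}} \approx 0.0875,
+$$
+i.e. $W_1$ is drawn from $U(-0.0875, 0.0875)$.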
+
+
+## Backward Propagation Derivations
+
+In this assignment, most operators require differentiating a matrix with respect to a matrix. The correct approach is to first vectorize the matrices and then take vector-by-vector derivatives.
+
+![](./img/formula_1.jpg)
+
+### The Matmul Operator
+
+![](./img/formula_2.jpg)
+
+### The Relu Operator
+
+![](./img/formula_3.jpg)
+
+### The Log Operator
+
+![](./img/formula_4.jpg)
+
+### The Softmax Operator
+
+![](./img/formula_5.jpg)
+
+## Summary
+
+Done: automated tests `60%`
+
+Done: following the code of `torch_mnist.py`, trained and tested the model in `numpy_mnist.py`, and described the experiment and its results in this report `20%`
+
+Done: implemented `mini_batch` in `numpy_mnist.py` using only NumPy, replacing the PyTorch-based `mini_batch` in `utils.py` `10%`
+
+Done: derived the backward-propagation formulas for the operators implemented in `numpy_fnn.py` in this report `10%`
+
+Done: investigated PyTorch's weight-initialization methods and reimplemented `get_torch_initialization` `10%`
+
+Done: general bug hunting
\ No newline at end of file
diff --git a/assignment-2/submission/18307130213/img/.keep b/assignment-2/submission/18307130213/img/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-2/submission/18307130213/img/formula_1.jpg b/assignment-2/submission/18307130213/img/formula_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d3c5d14dd27f77a3f0a7012d1dc25e82c1f84f5f
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_1.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_2.jpg b/assignment-2/submission/18307130213/img/formula_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..eca0921203b32a83a12f95005d2a5f1cb6fc7247
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_2.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_3.jpg b/assignment-2/submission/18307130213/img/formula_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2bd1069a8c4128fdb5633402ebda4492ac92fb9b
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_3.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_4.jpg b/assignment-2/submission/18307130213/img/formula_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d7bf4291c88813f3c5ed2c9790f1f1074922c1d8
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_4.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_5.jpg b/assignment-2/submission/18307130213/img/formula_5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7989af3b0414f5f593d8bb705563e54a2f9bd3e0
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_5.jpg differ
diff --git a/assignment-2/submission/18307130213/img/numpy_minist_result.jpg b/assignment-2/submission/18307130213/img/numpy_minist_result.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fb54da399984213c5c65d7b3e74f603161f99208
Binary files /dev/null and b/assignment-2/submission/18307130213/img/numpy_minist_result.jpg differ
diff --git a/assignment-2/submission/18307130213/numpy_fnn.py b/assignment-2/submission/18307130213/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..13e65fbb9150c6c4643cfeec3fa0c31e3eaf4005
--- /dev/null
+++ b/assignment-2/submission/18307130213/numpy_fnn.py
@@ -0,0 +1,161 @@
+import numpy as np
+
+
+class NumpyOp:
+
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        W: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        grad_x = np.matmul(grad_y, self.memory['W'].T)
+        grad_W = np.matmul(self.memory['x'].T, grad_y)
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        # The gradient passes through only where the input was positive.
+        grad_x = np.where(self.memory['x'] > 0, grad_y, np.zeros_like(grad_y))
+        return grad_x
+
+
+class Log(NumpyOp):
+
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+        return out
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        grad_x = np.divide(grad_y, self.memory['x'] + self.epsilon)
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over the last dimension
+    """
+
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        e_x = np.exp(x)
+        out = e_x / np.sum(e_x, axis=-1, keepdims=True)
+        self.memory['x'] = x
+        return out
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        # With s = softmax(x), the vector-Jacobian product is
+        #   grad_x = s * (grad_y - sum(s * grad_y, axis=-1)).
+        e_x = np.exp(self.memory['x'])
+        s = e_x / np.sum(e_x, axis=-1, keepdims=True)
+        grad_x = s * (grad_y - np.sum(s * grad_y, axis=-1, keepdims=True))
+        return grad_x
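+
+
+# A minimal finite-difference check of Softmax.backward, added for this
+# report as an illustration (it is not part of the assignment interface):
+# for loss(x) = sum(softmax(x) * g), the gradient returned by backward(g)
+# should match a central-difference approximation.
+def _check_softmax_grad(eps=1e-6):
+    rng = np.random.RandomState(0)
+    x, g = rng.randn(4, 10), rng.randn(4, 10)
+    op = Softmax()
+    op.forward(x)
+    analytic = op.backward(g)
+    numeric = np.zeros_like(x)
+    for idx in np.ndindex(*x.shape):
+        d = np.zeros_like(x)
+        d[idx] = eps
+        f_pos = (Softmax().forward(x + d) * g).sum()
+        f_neg = (Softmax().forward(x - d) * g).sum()
+        numeric[idx] = (f_pos - f_neg) / (2 * eps)
+    assert np.allclose(analytic, numeric, atol=1e-5)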
+
+
+class NumpyLoss:
+
+    def __init__(self):
+        self.target = None
+
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+        # The following operators are used in forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # The following variables are updated in backward
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+        x = self.softmax.forward(x)
+        x = self.log.forward(x)
+        return x
+
+    def backward(self, y):
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/18307130213/numpy_mnist.py b/assignment-2/submission/18307130213/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..315fbf32a5a00ec2eaaca9978fbd311f1392a0ae
--- /dev/null
+++ b/assignment-2/submission/18307130213/numpy_mnist.py
@@ -0,0 +1,68 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, plot_curve, one_hot
+
+
+def get_torch_initialization(numpy=True):
+
+    def kaiming_uniform(fan_in, fan_out, a=0.0):
+        # a: the negative slope of the rectifier used after this layer (0 for relu)
+        bound = (6.0 / ((1.0 + a ** 2) * fan_in)) ** 0.5
+        return np.random.uniform(low=-bound, high=bound, size=(fan_in, fan_out))
+
+    return kaiming_uniform(28 * 28, 256), kaiming_uniform(256, 64), kaiming_uniform(64, 10)
+
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+    data = []
+    label = []
+    for x in dataset:
+        data.append(np.array(x[0]))
+        label.append(x[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    # Shuffle, then split into full batches; a trailing incomplete batch is dropped.
+    size = data.shape[0]
+    index = np.arange(size)
+    np.random.shuffle(index)
+
+    batches = []
+    i = 0
+    while i + batch_size <= size:
+        batches.append((data[index[i:i + batch_size]], label[index[i:i + batch_size]]))
+        i += batch_size
+
+    return batches
+
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+
+    train_loss = []
+
+    epoch_number = 3
+    learning_rate = 0.1
+
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+
+            train_loss.append(loss.item())
+
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean(model.forward(x).argmax(axis=1) == y)
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
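+
+
+# Illustrative sanity check, added for this report (not called by numpy_run):
+# verifies the shapes produced by the two NumPy helpers above.
+def _sanity_check():
+    W1, W2, W3 = get_torch_initialization()
+    assert W1.shape == (28 * 28, 256)
+    assert W2.shape == (256, 64)
+    assert W3.shape == (64, 10)
+    train_dataset, _ = download_mnist()
+    x, y = mini_batch(train_dataset, batch_size=128)[0]
+    assert x.shape[0] == 128 and y.shape[0] == 128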