diff --git a/assignment-2/submission/16307130040/README.md b/assignment-2/submission/16307130040/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..75a75c96dba24c1fe572704318d3646659659437
--- /dev/null
+++ b/assignment-2/submission/16307130040/README.md
@@ -0,0 +1,148 @@
+# Lab Report 2
+
+### 1. Experiment Results
+
+After replacing the mini_batch function, the model trains without problems:
+
+
+
+```shell
+[0] Accuracy: 0.9453
+[1] Accuracy: 0.9656
+[2] Accuracy: 0.9689
+```
+
+### 2. Replacing mini_batch
+
+```python
+def mini_batch(dataset, batch_size=128, numpy=True):
+    # Collect the whole dataset into two arrays, as utils.batch does
+    data = []
+    label = []
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    # Shuffle samples and labels with the same random permutation
+    m = data.shape[0]
+    permutation = np.random.permutation(m)
+    data = data[permutation]
+    label = label[permutation]
+
+    # Slice the shuffled arrays into batches of size batch_size
+    n = m // batch_size
+    mini_batches = []
+    for i in range(n):
+        mini_batches.append([data[i * batch_size:(i + 1) * batch_size],
+                             label[i * batch_size:(i + 1) * batch_size]])
+
+    return mini_batches
+```
+
+Overall this follows the batch function in utils.py. The first half is the same as batch: it copies the data and labels from dataset into the data and label arrays. The two arrays are then shuffled with a shared random permutation, each mini-batch of data and labels is put into a pair, the pairs are appended to one big list, and that list is returned.
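+
+A minimal usage sketch (assuming `train_dataset` comes from `download_mnist()` in `utils.py`):
+
+```python
+batches = mini_batch(train_dataset, batch_size=128)
+x, y = batches[0]
+print(x.shape, y.shape)   # (128, 1, 28, 28) and (128,) for MNIST
+```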
+
+### 3. Deriving the Backward Formulas
+
+1. Matmul
+
+Let the output y be an l-dimensional vector, y = x·W.
+
+**dx:** for each element x_i of x, its partial derivatives with respect to y_1, y_2, ..., y_l are w_{i1}, w_{i2}, ..., w_{il}, so
+
+dL/dx_i = (dL/dy_1)·w_{i1} + (dL/dy_2)·w_{i2} + ... + (dL/dy_l)·w_{il}
+
+Hence dx = dy·W^T.
+
+**dW:** for an element w_{ij} of W, its partial derivative with respect to y_j is x_i, so
+
+dL/dw_{ij} = (dL/dy_j)·x_i
+
+Hence dW = x^T·dy. For a batch of samples this matrix product accumulates the per-sample contributions; the 1/N averaging factor is already carried by the incoming loss gradient.
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: shape(N, d')
+    """
+    # dx = dy · W^T,  dW = x^T · dy
+    grad_x = np.matmul(grad_y, self.memory['W'].T)
+    grad_W = np.matmul(self.memory['x'].T, grad_y)
+
+    return grad_x, grad_W
+```
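+
+As a quick sanity check (not part of the submitted code), the analytic formulas can be compared with a central finite-difference estimate of dW on a small random example:
+
+```python
+import numpy as np
+
+np.random.seed(0)
+x = np.random.randn(4, 3)
+W = np.random.randn(3, 2)
+grad_y = np.ones((4, 2))          # upstream gradient of L = sum(x @ W)
+
+grad_W = x.T @ grad_y             # analytic dW = x^T · dy
+
+eps = 1e-6
+num_grad_W = np.zeros_like(W)
+for i in range(W.shape[0]):
+    for j in range(W.shape[1]):
+        W_p, W_m = W.copy(), W.copy()
+        W_p[i, j] += eps
+        W_m[i, j] -= eps
+        num_grad_W[i, j] = ((x @ W_p).sum() - (x @ W_m).sum()) / (2 * eps)
+
+print(np.abs(num_grad_W - grad_W).max())   # agrees to ~1e-9 or better
+```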
+
+
+
+2. Relu
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: same shape as x
+    """
+    # pass the upstream gradient through where x > 0, zero it elsewhere
+    x = self.memory['x']
+    grad_x = grad_y.copy()
+    grad_x[x <= 0] = 0
+    return grad_x
+```
+
+When x_i <= 0, dy_i/dx_i = 0, so dL/dx_i = 0.
+
+When x_i > 0, dy_i/dx_i = 1, so dL/dx_i = dL/dy_i.
+
+So, as shown above, wherever x_i > 0 the corresponding entry of dx copies dy, and wherever x_i <= 0 the corresponding entry is set to 0.
+
+3. Log
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: same shape as x
+    """
+    # forward computes log(x + epsilon), so the local derivative is 1 / (x + epsilon)
+    x = self.memory['x']
+    grad_x = grad_y / (x + self.epsilon)
+
+    return grad_x
+```
+
+dy_i/dx_i = 1/(x_i + ε), so dL/dx_i = (dL/dy_i)·1/(x_i + ε).
+
+Hence dx = dy ⊙ 1/(x + ε), taken element-wise.
+
+4. Softmax
+
+
+
+Let x and y both be l-dimensional vectors (one row of the batch).
+
+For dy_j/dx_i: if i = j, then dy_j/dx_i = y_j - (y_j)^2; if i != j, then dy_j/dx_i = -y_i·y_j.
+
+Let D = diag(y) - y^T·y (a symmetric l×l matrix); then dy_j/dx_i = D_{ij}.
+
+By the chain rule, dL/dx_i = (dL/dy_1)·D_{i1} + (dL/dy_2)·D_{i2} + ... + (dL/dy_l)·D_{il}
+
+Hence dx = dy·D.
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: same shape as x
+    """
+    y = self.memory['y']
+    grad_x = []
+    # process the batch one row at a time: build D = diag(y) - y y^T and contract
+    for grad_y1, y1 in zip(grad_y, y):
+        D = np.diag(y1) - np.outer(y1, y1)
+        grad_x1 = np.dot(grad_y1, D)
+        grad_x.append(grad_x1)
+    grad_x = np.array(grad_x)
+    return grad_x
+```
+
+In the actual implementation, a batch contains more than one sample, so dx has to be produced sample by sample, one row at a time.
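+
+The per-sample loop can also be avoided entirely: contracting dy with D only needs the row sums of dy ⊙ y, so an equivalent vectorized sketch (not the submitted implementation) is:
+
+```python
+import numpy as np
+
+def softmax_backward_vectorized(grad_y, y):
+    # dx_j = y_j * (dy_j - sum_i dy_i * y_i), applied row-wise over the batch
+    s = np.sum(grad_y * y, axis=1, keepdims=True)
+    return y * (grad_y - s)
+```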
+
diff --git a/assignment-2/submission/16307130040/img/Figure_1.png b/assignment-2/submission/16307130040/img/Figure_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..dab6049f889917dcbf2e93d6203b3a6579908777
Binary files /dev/null and b/assignment-2/submission/16307130040/img/Figure_1.png differ
diff --git a/assignment-2/submission/16307130040/img/matmul.jpg b/assignment-2/submission/16307130040/img/matmul.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dd071796bbe85141e48275be4c38358eefa4112f
Binary files /dev/null and b/assignment-2/submission/16307130040/img/matmul.jpg differ
diff --git a/assignment-2/submission/16307130040/img/softmax.jpg b/assignment-2/submission/16307130040/img/softmax.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2daa913899b5f8401693ffb777ff3e27ed24cf09
Binary files /dev/null and b/assignment-2/submission/16307130040/img/softmax.jpg differ
diff --git a/assignment-2/submission/16307130040/numpy_fnn.py b/assignment-2/submission/16307130040/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..277f81a3f11fb44523777ef4bddcb998454bcdc3
--- /dev/null
+++ b/assignment-2/submission/16307130040/numpy_fnn.py
@@ -0,0 +1,184 @@
+import numpy as np
+
+
+class NumpyOp:
+    
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+    
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        w: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+    
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+
+        grad_x = np.matmul(grad_y, self.memory['W'].T)
+        grad_W = np.matmul(self.memory['x'].T, grad_y)
+
+
+        
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+    
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        x = self.memory['x']
+        grad_x = grad_y.copy()
+        grad_x[x <= 0] = 0
+        
+        return grad_x
+
+
+class Log(NumpyOp):
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+        
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        # forward computed log(x + epsilon), so the local derivative is 1 / (x + epsilon)
+        x = self.memory['x']
+        grad_x = grad_y / (x + self.epsilon)
+        
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        
+        shift_x = x - np.max(x, axis=1).reshape(-1, 1)
+        y = np.exp(shift_x) / np.sum(np.exp(shift_x), axis=1).reshape(-1, 1)
+        self.memory['y'] = y
+        
+        return y
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        y = self.memory['y']
+        grad_x = []
+        # per-sample Jacobian D = diag(y) - y y^T, contracted with the upstream gradient
+        for grad_y1, y1 in zip(grad_y, y):
+            D = np.diag(y1) - np.outer(y1, y1)
+            grad_x1 = np.dot(grad_y1, D)
+            grad_x.append(grad_x1)
+        grad_x = np.array(grad_x)
+        
+        return grad_x
+
+
+class NumpyLoss:
+    
+    def __init__(self):
+        self.target = None
+    
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+    
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+        
+        # The following operators are used in forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+        
+        # The following variables are updated in backward. softmax_grad, log_grad, etc. are the backpropagated gradients of each operator (partial derivatives of the loss w.r.t. that operator's input)
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+
+
+        x = self.softmax.forward(x)
+        x = self.log.forward(x)
+        
+        return x
+    
+    def backward(self, y):
+        
+        y = self.log.backward(y)
+        self.log_grad = y
+        y = self.softmax.backward(y)
+        self.softmax_grad = y
+        y, self.W3_grad = self.matmul_3.backward(y)
+        self.x3_grad = y
+        y = self.relu_2.backward(y)
+        y, self.W2_grad = self.matmul_2.backward(y)
+        self.x2_grad = y
+        y = self.relu_1.backward(y)
+        y, self.W1_grad = self.matmul_1.backward(y)
+        self.x1_grad = y
+        
+    
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/16307130040/numpy_mnist.py b/assignment-2/submission/16307130040/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..a688f7c64114bf150ffff2b903dfc74688bda4ad
--- /dev/null
+++ b/assignment-2/submission/16307130040/numpy_mnist.py
@@ -0,0 +1,59 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot
+
+def mini_batch(dataset, batch_size=128, numpy=True):
+    data = []
+    label = []
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    m = data.shape[0]
+    permutation = np.random.permutation(m)
+    data = data[permutation]
+    label = label[permutation]
+
+    n = m // batch_size
+    mini_batches = []
+    for i in range(n):
+        mini_batches.append([data[i * batch_size:(i + 1) * batch_size],
+                             label[i * batch_size:(i + 1) * batch_size]])
+
+    return mini_batches
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/16307130040/torch_mnist.py b/assignment-2/submission/16307130040/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a5649bbfa750b3520b4b895de7260c3aa8ea7cd
--- /dev/null
+++ b/assignment-2/submission/16307130040/torch_mnist.py
@@ -0,0 +1,64 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+    
+    def __init__(self):
+        self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+        self.W2 = torch.randn((256, 64), requires_grad=True)
+        self.W3 = torch.randn((64, 10), requires_grad=True)
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        x = torch.relu(torch.matmul(x, self.W1))
+        x = torch.relu(torch.matmul(x, self.W2))
+        x = torch.matmul(x, self.W3)
+        self.softmax = torch.softmax(x, 1)
+        self.log = torch.log(self.softmax)
+        self.softmax.retain_grad()  # for test only
+        self.log.retain_grad()      # for test only
+        return self.log
+    
+    def optimize(self, learning_rate):
+        with torch.no_grad():
+            self.W1 -= learning_rate * self.W1.grad
+            self.W2 -= learning_rate * self.W2.grad
+            self.W3 -= learning_rate * self.W3.grad
+            
+            self.W1.grad = None
+            self.W2.grad = None
+            self.W3.grad = None
+
+
+def torch_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = TorchModel()
+    model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False)
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset, numpy=False):
+            y = one_hot(y, numpy=False)
+            
+            y_pred = model.forward(x)
+            loss = (-y_pred * y).sum(dim=1).mean()
+            loss.backward()
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset, numpy=False)[0]
+        accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    torch_run()
diff --git a/assignment-2/submission/17307130331/README.md b/assignment-2/submission/17307130331/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..abd8de5834bacc838e1b813905da469a8d9168c3
--- /dev/null
+++ b/assignment-2/submission/17307130331/README.md
@@ -0,0 +1,343 @@
+# Lab Report
+
+陈疏桐   17307130331
+
+In this assignment I implemented the forward and backward computation of the four operators Matmul, Log, Softmax and Relu with numpy, built the classification model from these operators, passed the automated tests, and implemented the mini_batch function. I then trained and tested on the MNIST dataset with different learning rates and batch sizes to discuss how they affect training. Finally, I also implemented the Momentum, RMSProp and Adam optimizers and compared them with plain gradient descent.
+
+## Backward Propagation and Implementation of the Operators
+### Matmul
+
+Matmul is matrix multiplication; in the model it plays the role of a linear layer in pytorch. The forward computation is:
+
+$$ \mathrm{Y} = \mathrm{X}\mathrm{W} $$
+
+where $\mathrm{X}$ is the $N \times d$ input matrix, $\mathrm{W}$ is a $d \times d'$ matrix, and $\mathrm{Y}$ is the $N\times d'$ output matrix. The Matmul operator is equivalent to a fully connected linear layer with input dimension $d$ and output dimension $d'$.
+
+Taking partial derivatives of Matmul with respect to each of its inputs gives
+
+$$ \frac{\partial \mathrm{Y}}{\partial \mathrm{X}} = \frac{\partial \mathrm{X}\mathrm{W}}{\partial \mathrm{X}} = \mathrm{W}^T$$
+
+$$ \frac{\partial \mathrm{Y}}{\partial \mathrm{W}} = \frac{\partial \mathrm{X}\mathrm{W}}{\partial \mathrm{W}} = \mathrm{X}^T $$
+
+By the chain rule, the backward computation is:
+
+$$ \triangledown{\mathrm{X}} = \triangledown{\mathrm{Y}} \times \mathrm{W}^T $$
+$$ \triangledown{\mathrm{W}} = \mathrm{X}^T \times \triangledown{\mathrm{Y}} $$
+
+### Relu
+
+Relu applies the following rule to every element of the input:
+
+$$ \mathrm{Y}_{ij}=
+\begin{cases}
+\mathrm{X}_{ij} & \mathrm{X}_{ij} \ge 0 \\\\
+0 & \text{otherwise}
+\end{cases} 
+$$
+
+
+Each output $\mathrm{Y}_{ij}$ depends only on the input $\mathrm{X}_{ij}$, so the derivative for each element of $\mathrm{X}$ also depends only on the corresponding output:
+
+$$ \frac{\partial \mathrm{Y}_{ij}}{\partial \mathrm{X}_{ij}} = 
+\begin{cases}
+1 & \mathrm{X}_{ij} \ge 0 \\\\
+0 & \text{otherwise}
+\end{cases}$$ 
+
+Therefore, by the chain rule, the gradient of the input is:
+
+$$ \triangledown{\mathrm{X}_{ij}} = \triangledown{\mathrm{Y}_{ij}} \times \frac{\partial \mathrm{Y}_{ij}}{\partial \mathrm{X}_{ij}}$$
+
+### Log
+
+The Log function is:
+
+$$ \mathrm{Y}_{ij} = \log(\mathrm{X}_{ij} + \epsilon)$$
+
+$$ \frac{\partial \mathrm{Y}_{ij}}{\partial \mathrm{X}_{ij}} = \frac{1}{(\mathrm{X}_{ij} + \epsilon)} $$
+
+Similarly, the backward computation is:
+
+$$ \triangledown{\mathrm{X}_{ij}} = \triangledown{\mathrm{Y}_{ij}} \times \frac{\partial \mathrm{Y}_{ij}}{\partial \mathrm{X}_{ij}}$$
+
+### Softmax
+
+Softmax is computed over the last dimension of the input $\mathrm{X}$. The forward computation is:
+
+$$ \mathrm{Y}_{ij} = \frac{\exp^{\mathrm{X}_{ij}}}{\sum_{k} \exp ^ {\mathrm{X}_{ik}}}$$
+
+From the formula, each output row of Softmax is computed independently of the other rows, while within a row every output depends on every input element. Taking row $k$ as an example, the derivative of an output element with respect to an input element is:
+
+$$\frac{\partial Y_{ki}}{\partial X_{kj}} = \begin{cases}
+\frac{\exp ^ {X_{kj}} \times (\sum_{t \ne j}{\exp ^ {X_{kt}}}) }{(\sum_{t}{\exp ^ {X_{kt}}})^2} = Y_{kj}(1-Y_{kj}) & i = j \\\\
+-\frac{\exp^{X_{ki} }\exp^{X_{kj} }}{(\sum_t\exp^{X_{kt}})^2}=-Y_{ki} \times Y_{kj} & i\ne j
+\end{cases}$$
+
+This gives, for each row, the Jacobian matrix $\mathrm{J}_{k}$ of the output row $\mathrm{Y}_{k}$ with respect to the input row $\mathrm{X}_{k}$, with $\mathrm{J_{k}}_{ij} = \frac{\partial \mathrm{Y}_{ki}}{\partial \mathrm{X}_{kj}}$.
+
+The derivative of the loss with respect to an input $\mathrm{X}_{kj}$ collects the contributions of all output elements of that row, i.e. it sums over $i$ the derivatives $\frac{\partial \mathrm{Y}_{ki}}{\partial \mathrm{X}_{kj}}$, each weighted by the upstream gradient.
+
+Therefore, by the chain rule, the backward computation is:
+$$ \triangledown \mathrm{X}_{kj} = \sum_{i} {\triangledown \mathrm{Y}_{ki} \cdot \frac{\partial \mathrm{Y}_{ki}}{\partial \mathrm{X}_{kj}}}$$
+
+which, since $\mathrm{J}_{k}$ is symmetric, is equivalent to:
+
+$$ \triangledown \mathrm{X}_{k} = \triangledown \mathrm{Y}_{k} \times \mathrm{J}_{k} $$
+
+In the implementation, `numpy`'s `matmul` can multiply over the last two dimensions, so the per-row Jacobians are stacked and contracted with the upstream gradients in one call, and the resulting rows are stacked to form the final result.
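+
+A compact sketch of that batched contraction (assuming `y` is the saved softmax output of shape (N, c) and `grad_y` the upstream gradient of the same shape):
+
+```
+import numpy as np
+
+def softmax_backward(grad_y, y):
+    # stack one Jacobian diag(y_k) - y_k y_k^T per row, shape (N, c, c)
+    J = np.array([np.diag(r) - np.outer(r, r) for r in y])
+    # row-vector × Jacobian for every sample at once via batched matmul
+    return np.matmul(grad_y[:, np.newaxis, :], J).squeeze(1)
+```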
+
+
+## Model Construction and Training
+### Building the Model
+
+Following `torch_model` in `torch_mnist.py`, building the `numpy` model only requires replacing its operators with the ones we implemented:
+```
+def forward(self, x):
+    x = x.reshape(-1, 28 * 28)
+
+    x = self.relu_1.forward(self.matmul_1.forward(x, self.W1))
+    x = self.relu_2.forward(self.matmul_2.forward(x, self.W2))
+
+    x = self.matmul_3.forward(x, self.W3)
+
+    x = self.softmax.forward(x)
+    x = self.log.forward(x)
+
+    return x
+```
+
+The computation graph of the model is:
+
+
+From the computation graph, applying the chain rule gives the way to compute the gradients of the leaf variables ($\mathrm{W}_{1}, \mathrm{W}_{2}, \mathrm{W}_{3}, \mathrm{X}$) as well as of the intermediate variables.
+
+The computation graph of the backward pass is:
+
+
+The gradients can then be computed by following that graph:
+```
+def backward(self, y):
+    self.log_grad = self.log.backward(y)
+    self.softmax_grad = self.softmax.backward(self.log_grad)
+    self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+    self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+    self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+    self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+    self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+```
+
+### MiniBatch
+
+The `mini_batch` method in `utils` directly calls `pytorch`'s `DataLoader`. `DataLoader` is responsible for reading samples from a dataset and assembling them into batches. Using `DataLoader` out of the box makes it easy to prefetch data with multiple worker threads, which speeds up training and saves code. It also supports a custom `Sampler` to draw samples from the dataset in different ways, and a `BatchSampler` to combine the drawn samples into batches in a custom way, which makes operations such as zero-padding within a batch or controlling the positive/negative mix of a batch possible.
+
+Here we implement `mini_batch` by imitating the default behaviour of `DataLoader`.
+```
+def mini_batch(dataset, batch_size=128):
+    data = np.array([each[0].numpy() for each in dataset]) # materialize the data first
+    label = np.array([each[1] for each in dataset])
+    
+    data_size = data.shape[0]
+    idx = np.array([i for i in range(data_size)])
+    np.random.shuffle(idx)   # shuffle the order
+    
+    return [(data[idx[i: i+batch_size]], label[idx[i:i+batch_size]])  for i in range(0, data_size, batch_size)]  # plays the role of DataLoader's BatchSampler, but evaluated all at once
+```
+
+### Training the Model
+
+The model is built and trained with `epoch=10`, `learning_rate=0.1`, `batch_size=128`. Each step fits one batch of data: the forward pass computes the output, the loss is computed from that output, `loss.backward` gives the derivative of the loss with respect to the output (the gradient of the model output), then the model's `backward` carries out the backward computation, and finally the model's `optimize` updates the parameters.
+
+Training curve:
+
+ 
+Test accuracy per epoch:
+```
+[0] Test Accuracy: 0.9437
+[1] Test Accuracy: 0.9651
+[2] Test Accuracy: 0.9684
+[3] Test Accuracy: 0.9730
+[4] Test Accuracy: 0.9755
+[5] Test Accuracy: 0.9775
+[6] Test Accuracy: 0.9778
+[7] Test Accuracy: 0.9766
+[8] Test Accuracy: 0.9768
+[9] Test Accuracy: 0.9781
+```
+
+Raising `learning_rate` to 0.2 and retraining:
+
+
+Test accuracy per epoch:
+```
+[0] Test Accuracy: 0.9621
+[1] Test Accuracy: 0.9703
+[2] Test Accuracy: 0.9753
+[3] Test Accuracy: 0.9740
+[4] Test Accuracy: 0.9787
+[5] Test Accuracy: 0.9756
+[6] Test Accuracy: 0.9807
+[7] Test Accuracy: 0.9795
+[8] Test Accuracy: 0.9814
+[9] Test Accuracy: 0.9825
+```
+
+With a slightly higher learning rate, parameter updates early in training are larger, the loss drops faster and the model converges earlier. For the same number of iterations, the model now reaches a higher test accuracy.
+
+Raising `learning_rate` to 0.3 and retraining:
+
+
+```
+[0] Test Accuracy: 0.9554
+[1] Test Accuracy: 0.9715
+[2] Test Accuracy: 0.9744
+[3] Test Accuracy: 0.9756
+[4] Test Accuracy: 0.9782
+[5] Test Accuracy: 0.9795
+[6] Test Accuracy: 0.9801
+[7] Test Accuracy: 0.9816
+[8] Test Accuracy: 0.9828
+[9] Test Accuracy: 0.9778
+```
+
+After increasing the learning rate to 0.3, the loss drops at roughly the same speed as in the previous run early on, but late in training the overly large learning rate makes the weights move around a local minimum in steps that are too big, so they struggle to settle into the bottom: the loss oscillates and convergence becomes difficult. In this run the test accuracy first rises to 0.9828 and then drops.
+
+For a batch of size 128, 0.2 therefore seems to be a suitable learning rate.
+
+Next, keeping the learning rate at 0.2 and changing batch_size to 256, retraining gives:
+
+```
+[0] Test Accuracy: 0.9453
+[1] Test Accuracy: 0.9621
+[2] Test Accuracy: 0.9657
+[3] Test Accuracy: 0.9629
+[4] Test Accuracy: 0.9733
+[5] Test Accuracy: 0.9766
+[6] Test Accuracy: 0.9721
+[7] Test Accuracy: 0.9768
+[8] Test Accuracy: 0.9724
+[9] Test Accuracy: 0.9775
+```
+
+With a larger batch_size the parameters are updated once per batch, so updates are less frequent and convergence slows down somewhat; however, comparing the loss curve of this run with the previous ones, the oscillation is smaller.
+
+Reducing batch_size to 64 and rerunning:
+
+```
+[0] Test Accuracy: 0.9526
+[1] Test Accuracy: 0.9674
+[2] Test Accuracy: 0.9719
+[3] Test Accuracy: 0.9759
+[4] Test Accuracy: 0.9750
+[5] Test Accuracy: 0.9748
+[6] Test Accuracy: 0.9772
+[7] Test Accuracy: 0.9791
+[8] Test Accuracy: 0.9820
+[9] Test Accuracy: 0.9823
+```
+
+The loss now falls faster, but the oscillation becomes larger.
+
+Summary: within a certain range, increasing the learning rate speeds up convergence, and decreasing batch_size also speeds it up somewhat while increasing the oscillation. A learning rate that is too large makes the loss oscillate late in training and convergence difficult; a learning rate that is too small makes the loss fall too slowly and may even leave the model stuck in a local minimum, missing a better one.
+
+## Other Optimizers
+
+### momentum
+
+In plain gradient descent each parameter update depends only on the gradient of the current batch, so the update direction can be swayed by a few unusual inputs. Momentum introduces a velocity term, so the current update depends not only on the current gradient but also on earlier gradients, preserving the recent trend to some extent. The momentum update is:
+
+$$
+\begin{align}
+& v = \alpha v - \gamma \frac{\partial L}{\partial W} \\\\
+& W = W + v
+\end{align}
+$$
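+
+Applied to the three weight matrices, this is exactly what the `momentum` method added to `NumpyModel` in `numpy_fnn.py` does (the `v1`/`v2`/`v3` velocity buffers start at zero):
+
+```
+def momentum(self, learning_rate, alpha=0.9):
+    # v <- alpha * v - lr * grad ; W <- W + v
+    self.v1 = alpha * self.v1 - learning_rate * self.W1_grad
+    self.v2 = alpha * self.v2 - learning_rate * self.W2_grad
+    self.v3 = alpha * self.v3 - learning_rate * self.W3_grad
+    self.W1 += self.v1
+    self.W2 += self.v2
+    self.W3 += self.v3
+```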
+
+We implemented the Momentum optimizer in the model in `numpy_fnn.py`. With the learning rate set to 0.02 and batch_size 128, the experiment continues:
+
+```
+[0] Test Accuracy: 0.9586
+[1] Test Accuracy: 0.9717
+[2] Test Accuracy: 0.9743
+[3] Test Accuracy: 0.9769
+[4] Test Accuracy: 0.9778
+[5] Test Accuracy: 0.9786
+[6] Test Accuracy: 0.9782
+[7] Test Accuracy: 0.9809
+[8] Test Accuracy: 0.9790
+[9] Test Accuracy: 0.9818
+```
+
+Compared with plain gradient descent, momentum does not necessarily end up with a better result. With momentum, when the current gradient points in the same direction as the velocity, the parameters receive a larger adjustment, so the loss falls faster; early in training the velocity mostly keeps accumulating, and with too large a learning rate it easily overflows. Momentum therefore suits a learning rate about an order of magnitude smaller than plain gradient descent. And when the gradient direction is wrong, the added momentum keeps the parameters from adjusting in time, so the minimum can be missed.
+
+### RMSProp
+
+
+RMSProp introduces adaptive learning-rate adjustment. Early in training the learning rate should be high so the loss falls quickly, but as the iterations accumulate it should keep shrinking so the model can converge better. The basic idea is to adapt the learning rate from the gradients: the larger the gradients, the faster the learning rate decays; later, when the gradients shrink, the decay becomes slower.
+
+To avoid decaying the learning rate too fast early on, RMSProp also uses an exponential moving average that slowly forgets the old gradient history. The update is:
+
+$$
+\begin{align}
+& h = \rho h + (1-\rho) \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W} \\\\
+& W = W - \gamma \frac{1}{\sqrt{\delta + h}} \frac{\partial L}{\partial W}
+\end{align}$$
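+
+This matches the `RMSProp` method added to `NumpyModel` in `numpy_fnn.py`; a single-matrix sketch of the same rule (the helper name is only for illustration; the constant 1e-7 is the $\delta$ used in that method):
+
+```
+import numpy as np
+
+def rmsprop_step(W, grad, h, learning_rate=0.001, rho=0.99, delta=1e-7):
+    # h is an exponential moving average of the squared gradients
+    h = rho * h + (1 - rho) * grad * grad
+    W = W - learning_rate * grad / np.sqrt(h + delta)
+    return W, h
+```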
+
+Training and testing with the learning rate set to 0.001 and weight_decay set to 0.01:
+
+
+```
+[0] Test Accuracy: 0.9663
+[1] Test Accuracy: 0.9701
+[2] Test Accuracy: 0.9758
+[3] Test Accuracy: 0.9701
+[4] Test Accuracy: 0.9748
+[5] Test Accuracy: 0.9813
+[6] Test Accuracy: 0.9813
+[7] Test Accuracy: 0.9819
+[8] Test Accuracy: 0.9822
+[9] Test Accuracy: 0.9808
+```
+
+In the middle of training the loss oscillates less than with plain gradient descent. Early on the model converges faster, but later there is no clear advantage over plain gradient descent.
+
+### Adam
+
+Adam combines momentum with adaptive learning-rate adjustment. It first computes first- and second-moment estimates of the gradient, which correspond to the momentum part and the adaptive part respectively:
+
+$$
+\begin{align}
+& \mathrm{m} = \beta_1 \mathrm{m} + (1-\beta_1) \frac{\partial L}{\partial W} \\\\
+& \mathrm{v} = \beta_2 \mathrm{v} + (1-\beta_2) \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W}
+\end{align}
+$$
+
+Then the bias-corrected estimates are computed:
+
+$$
+\begin{align}
+& \mathrm{\hat{m}} = \frac{\mathrm{m}}{1-\beta_1 ^ t }\\\\
+& \mathrm{\hat{v}} = \frac{\mathrm{v}}{1-\beta_2 ^ t}
+\end{align}
+$$
+
+Finally, the parameters are updated as:
+$$ W = W - \gamma \frac{\mathrm{\hat m}}{\sqrt{\mathrm{\hat v}+ \delta}}$$
+
+
+Training with learning rate 0.001 and batch_size 128:
+
+```
+[0] Test Accuracy: 0.9611
+[1] Test Accuracy: 0.9701
+[2] Test Accuracy: 0.9735
+[3] Test Accuracy: 0.9752
+[4] Test Accuracy: 0.9787
+[5] Test Accuracy: 0.9788
+[6] Test Accuracy: 0.9763
+[7] Test Accuracy: 0.9790
+[8] Test Accuracy: 0.9752
+[9] Test Accuracy: 0.9806
+
+```
+
+Compared with plain gradient descent, the loss oscillation is slightly smaller and the loss falls slightly faster early on, but the final convergence speed is about the same.
\ No newline at end of file
diff --git a/assignment-2/submission/17307130331/img/backgraph.png b/assignment-2/submission/17307130331/img/backgraph.png
new file mode 100644
index 0000000000000000000000000000000000000000..c4a70b28e869708641bd01dba83730ed62ab9c4d
Binary files /dev/null and b/assignment-2/submission/17307130331/img/backgraph.png differ
diff --git a/assignment-2/submission/17307130331/img/compu_graph.png b/assignment-2/submission/17307130331/img/compu_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..74f02ff1b4c4795c99600fb2e358d23a170f11c1
Binary files /dev/null and b/assignment-2/submission/17307130331/img/compu_graph.png differ
diff --git a/assignment-2/submission/17307130331/img/momentum.png b/assignment-2/submission/17307130331/img/momentum.png
new file mode 100644
index 0000000000000000000000000000000000000000..152bfe4eda8bf98cb271e9e3af3801f223273ec2
Binary files /dev/null and b/assignment-2/submission/17307130331/img/momentum.png differ
diff --git a/assignment-2/submission/17307130331/img/rmsprop.png b/assignment-2/submission/17307130331/img/rmsprop.png
new file mode 100644
index 0000000000000000000000000000000000000000..d4c9f6d651ea0dcac312c3a7dcb38266a477679c
Binary files /dev/null and b/assignment-2/submission/17307130331/img/rmsprop.png differ
diff --git a/assignment-2/submission/17307130331/img/train.png b/assignment-2/submission/17307130331/img/train.png
new file mode 100644
index 0000000000000000000000000000000000000000..618816332b78c4f0498444a42dd2a5028df91ef1
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train.png differ
diff --git a/assignment-2/submission/17307130331/img/train02.png b/assignment-2/submission/17307130331/img/train02.png
new file mode 100644
index 0000000000000000000000000000000000000000..a2cbc7b9ccbf2f28955902b86881d7a640f50fa7
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train02.png differ
diff --git a/assignment-2/submission/17307130331/img/train03.png b/assignment-2/submission/17307130331/img/train03.png
new file mode 100644
index 0000000000000000000000000000000000000000..41dd8fd9060e6774b983375f3b025ee6335b9f66
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train03.png differ
diff --git a/assignment-2/submission/17307130331/img/train10.png b/assignment-2/submission/17307130331/img/train10.png
new file mode 100644
index 0000000000000000000000000000000000000000..a2056ba0d21f8f40fc0279e532fd6b9f1ff79cef
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train10.png differ
diff --git a/assignment-2/submission/17307130331/img/train256.png b/assignment-2/submission/17307130331/img/train256.png
new file mode 100644
index 0000000000000000000000000000000000000000..81aa1b2bcc7f708607f8c402f9f41d579793f9e1
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train256.png differ
diff --git a/assignment-2/submission/17307130331/img/train64.png b/assignment-2/submission/17307130331/img/train64.png
new file mode 100644
index 0000000000000000000000000000000000000000..8f34749c6fda428437ff3fe11292b0213eca0d7a
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train64.png differ
diff --git a/assignment-2/submission/17307130331/img/train_adam.png b/assignment-2/submission/17307130331/img/train_adam.png
new file mode 100644
index 0000000000000000000000000000000000000000..eefa8b27deb6485f895033add750f018fd14e293
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train_adam.png differ
diff --git a/assignment-2/submission/17307130331/img/trainloss.png b/assignment-2/submission/17307130331/img/trainloss.png
new file mode 100644
index 0000000000000000000000000000000000000000..b845297f03d5d6e6ae2b026b25554519a77f471b
Binary files /dev/null and b/assignment-2/submission/17307130331/img/trainloss.png differ
diff --git a/assignment-2/submission/17307130331/numpy_fnn.py b/assignment-2/submission/17307130331/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b32d95b7825b4787f5d226ac058c0039aee4bba
--- /dev/null
+++ b/assignment-2/submission/17307130331/numpy_fnn.py
@@ -0,0 +1,208 @@
+import numpy as np
+
+
+class NumpyOp:
+    
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+    
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        w: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+    
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        
+        ####################
+        #      code 1      #
+        grad_W = np.matmul(self.memory['x'].T, grad_y)
+        grad_x = np.matmul(grad_y, self.memory['W'].T)
+        ####################
+        
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+    
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 2      #
+        ####################
+        grad_x = np.where(self.memory['x'] > 0, np.ones_like(self.memory['x']), np.zeros_like(self.memory['x'])) * grad_y # element-wise product
+        
+        return grad_x
+
+
+class Log(NumpyOp):
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+        
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 3      #
+        ####################
+        grad_x = (1/(self.memory['x'] + self.epsilon)) * grad_y
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        
+        ####################
+        #      code 4      #
+        ####################
+        # subtract the row max before exponentiating for numerical stability
+        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
+        out = exp_x / np.sum(exp_x, axis=1, keepdims=True)
+        self.memory['x'] = x
+        self.memory['out'] = out
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        o = self.memory['out']
+        Jacob = np.array([np.diag(r) - np.outer(r, r) for r in o]) 
+        # i!=j  - oi* oj
+        # i==j  oi*(1-oi)
+        grad_y = grad_y[:, np.newaxis, :]
+        grad_x = np.matmul(grad_y, Jacob).squeeze(1)
+        return grad_x
+
+
+class NumpyLoss:
+    
+    def __init__(self):
+        self.target = None
+    
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+    
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+        
+        # The following operators are used in forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+        
+        # The following variables are updated in backward. softmax_grad, log_grad, etc. are the backpropagated gradients of each operator (partial derivatives of the loss w.r.t. that operator's input)
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+        
+        # The following buffers are used by momentum / RMSProp
+        self.v1 = np.zeros_like(self.W1)
+        self.v2 = np.zeros_like(self.W2)
+        self.v3 = np.zeros_like(self.W3)
+        
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        
+        x = self.relu_1.forward(self.matmul_1.forward(x, self.W1))
+        x = self.relu_2.forward(self.matmul_2.forward(x, self.W2))
+        
+        x = self.matmul_3.forward(x, self.W3)
+        
+        x = self.softmax.forward(x)
+        x = self.log.forward(x)
+        
+        return x
+    
+    def backward(self, y):
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+        
+    
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
+        
+    def momentum(self, learning_rate, alpha=0.9):
+        self.v1 = self.v1 * alpha - learning_rate * self.W1_grad
+        self.v2 = self.v2 * alpha - learning_rate * self.W2_grad
+        self.v3 = self.v3 * alpha - learning_rate * self.W3_grad
+        
+        self.W1 += self.v1
+        self.W2 += self.v2
+        self.W3 += self.v3
+    
+    def RMSProp(self, learning_rate, weight_decay = 0.99):
+        self.v1 = self.v1 * weight_decay + (1-weight_decay) * self.W1_grad * self.W1_grad
+        self.v2 = self.v2 * weight_decay + (1-weight_decay) * self.W2_grad * self.W2_grad
+        self.v3 = self.v3 * weight_decay + (1-weight_decay) * self.W3_grad * self.W3_grad
+        
+        self.W1 = self.W1 - learning_rate * self.W1_grad / np.sqrt( self.v1 + 1e-7)
+        self.W2 = self.W2 - learning_rate * self.W2_grad / np.sqrt( self.v2 + 1e-7)
+        self.W3 = self.W3 - learning_rate * self.W3_grad / np.sqrt( self.v3 + 1e-7)
+    
+    
+
+        
+        
+        
+        
\ No newline at end of file
diff --git a/assignment-2/submission/17307130331/numpy_mnist.py b/assignment-2/submission/17307130331/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..4187f01eeebbbcd6ab48bfacf8dedc37085e46e2
--- /dev/null
+++ b/assignment-2/submission/17307130331/numpy_mnist.py
@@ -0,0 +1,70 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot
+
+def mini_batch(dataset, batch_size=128):
+    data = np.array([each[0].numpy() for each in dataset])
+    label = np.array([each[1] for each in dataset])
+
+    data_size = data.shape[0]
+    idx = np.array([i for i in range(data_size)])
+    np.random.shuffle(idx)
+    
+    return [(data[idx[i: i+batch_size]], label[idx[i:i+batch_size]])  for i in range(0, data_size, batch_size)]
+
+class Adam():
+    def __init__(self, param, learning_rate=0.001, beta_1=0.9, beta_2=0.999):
+        self.param = param
+        self.iter = 0
+        self.m = 0
+        self.v = 0
+        self.beta1 = beta_1
+        self.beta2 = beta_2
+        self.lr = learning_rate
+    def optimize(self, grad):
+        self.iter+=1
+        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
+        self.v = self.beta2 * self.v + (1 - self.beta2) * grad * grad
+        m_hat = self.m / (1 - self.beta1 ** self.iter)
+        v_hat = self.v / (1 - self.beta2 ** self.iter)
+        self.param -= self.lr * m_hat / (v_hat ** 0.5 + 1e-8)
+        return self.param
+        
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+    
+    W1_opt, W2_opt, W3_opt = Adam(model.W1), Adam(model.W2), Adam(model.W3)
+    
+    train_loss = []
+    
+    epoch_number = 10
+    learning_rate = 0.0015
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset, batch_size=128):
+            y = one_hot(y)
+            
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            #model.Adam(learning_rate)
+            W1_opt.optimize(model.W1_grad)
+            W2_opt.optimize(model.W2_grad)
+            W3_opt.optimize(model.W3_grad)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Test Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+            
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/17307130331/tester_demo.py b/assignment-2/submission/17307130331/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..515b86c1240eebad83287461548530c944f23bc8
--- /dev/null
+++ b/assignment-2/submission/17307130331/tester_demo.py
@@ -0,0 +1,182 @@
+import numpy as np
+import torch
+from torch import matmul as torch_matmul, relu as torch_relu, softmax as torch_softmax, log as torch_log
+
+from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss
+from torch_mnist import TorchModel
+from utils import get_torch_initialization, one_hot
+
+err_epsilon = 1e-6
+err_p = 0.4
+
+
+def check_result(numpy_result, torch_result=None):
+    if isinstance(numpy_result, list) and torch_result is None:
+        flag = True
+        for (n, t) in numpy_result:
+            flag = flag and check_result(n, t)
+        return flag
+    # print((torch.from_numpy(numpy_result) - torch_result).abs().mean().item())
+    T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item()
+    direction = T / torch_result.numel() < err_p
+    return direction and ((torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon).item()
+
+
+def case_1():
+    x = np.random.normal(size=[5, 6])
+    W = np.random.normal(size=[6, 4])
+    
+    numpy_matmul = Matmul()
+    numpy_out = numpy_matmul.forward(x, W)
+    numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    torch_W = torch.from_numpy(W).clone().requires_grad_()
+    
+    torch_out = torch_matmul(torch_x, torch_W)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+        (numpy_W_grad, torch_W.grad)
+    ])
+
+
+def case_2():
+    x = np.random.normal(size=[5, 6])
+    
+    numpy_relu = Relu()
+    numpy_out = numpy_relu.forward(x)
+    numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_relu(torch_x)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_3():
+    x = np.random.uniform(low=0.0, high=1.0, size=[3, 4])
+    
+    numpy_log = Log()
+    numpy_out = numpy_log.forward(x)
+    numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_log(torch_x)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_4():
+    x = np.random.normal(size=[4, 5])
+    
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_softmax(torch_x, 1)
+    
+    return check_result(numpy_out, torch_out)
+
+
+def case_5():
+    x = np.random.normal(size=[20, 25])
+    
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+    numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+    torch_out = torch_softmax(torch_x, 1)
+    torch_out.sum().backward()
+
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def test_model():
+    try:
+        numpy_loss = NumpyLoss()
+        numpy_model = NumpyModel()
+        torch_model = TorchModel()
+        torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = get_torch_initialization(numpy=False)
+        numpy_model.W1 = torch_model.W1.detach().clone().numpy()
+        numpy_model.W2 = torch_model.W2.detach().clone().numpy()
+        numpy_model.W3 = torch_model.W3.detach().clone().numpy()
+        
+        x = torch.randn((10000, 28, 28))
+        y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000)
+        
+        y = one_hot(y, numpy=False)
+        x2 = x.numpy()
+        y_pred = torch_model.forward(x)
+        loss = (-y_pred * y).sum(dim=1).mean()
+        loss.backward()
+        
+        y_pred_numpy = numpy_model.forward(x2)
+        numpy_loss.get_loss(y_pred_numpy, y.numpy())
+        
+        check_flag_1 = check_result(y_pred_numpy, y_pred)
+        print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10))
+    except:
+        print("[Runtime Error in forward]")
+        print("+ {:12} {}/{}".format("forward", 0, 10))
+        return 0
+    
+    try:
+        
+        numpy_model.backward(numpy_loss.backward())
+        
+        check_flag_2 = [
+            check_result(numpy_model.log_grad, torch_model.log_input.grad),
+            check_result(numpy_model.softmax_grad, torch_model.softmax_input.grad),
+            check_result(numpy_model.W3_grad, torch_model.W3.grad),
+            check_result(numpy_model.W2_grad, torch_model.W2.grad),
+            check_result(numpy_model.W1_grad, torch_model.W1.grad)
+        ]
+        check_flag_2 = sum(check_flag_2) >= 4
+        print("+ {:12} {}/{}".format("backward", 20 * check_flag_2, 20))
+    except:
+        print("[Runtime Error in backward]")
+        print("+ {:12} {}/{}".format("backward", 0, 20))
+        check_flag_2 = False
+    
+    return 10 * check_flag_1 + 20 * check_flag_2
+
+
+if __name__ == "__main__":
+    testcases = [
+        ["matmul", case_1, 5],
+        ["relu", case_2, 5],
+        ["log", case_3, 5],
+        ["softmax_1", case_4, 5],
+        ["softmax_2", case_5, 10],
+    ]
+    score = 0
+    for case in testcases:
+        try:
+            res = case[2] if case[1]() else 0
+        except:
+            print("[Runtime Error in {}]".format(case[0]))
+            res = 0
+        score += res
+        print("+ {:12} {}/{}".format(case[0], res, case[2]))
+    score += test_model()
+    print("{:14} {}/60".format("FINAL SCORE", score))
diff --git a/assignment-2/submission/17307130331/torch_mnist.py b/assignment-2/submission/17307130331/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3e214c7606e3d43dac4b94554f942508afffb3
--- /dev/null
+++ b/assignment-2/submission/17307130331/torch_mnist.py
@@ -0,0 +1,73 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+    
+    def __init__(self):
+        self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+        self.W2 = torch.randn((256, 64), requires_grad=True)
+        self.W3 = torch.randn((64, 10), requires_grad=True)
+        self.softmax_input = None
+        self.log_input = None
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        x = torch.relu(torch.matmul(x, self.W1))
+        x = torch.relu(torch.matmul(x, self.W2))
+        x = torch.matmul(x, self.W3)
+        
+        self.softmax_input = x
+        self.softmax_input.retain_grad()
+        
+        x = torch.softmax(x, 1)
+        
+        self.log_input = x
+        self.log_input.retain_grad()
+        
+        x = torch.log(x)
+        
+        return x
+    
+    def optimize(self, learning_rate):
+        with torch.no_grad():
+            self.W1 -= learning_rate * self.W1.grad
+            self.W2 -= learning_rate * self.W2.grad
+            self.W3 -= learning_rate * self.W3.grad
+            
+            self.W1.grad = None
+            self.W2.grad = None
+            self.W3.grad = None
+
+
+def torch_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = TorchModel()
+    model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False)
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset, numpy=False):
+            y = one_hot(y, numpy=False)
+            
+            y_pred = model.forward(x)
+            loss = (-y_pred * y).sum(dim=1).mean()
+            loss.backward()
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset, numpy=False)[0]
+        accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    torch_run()
diff --git a/assignment-2/submission/17307130331/utils.py b/assignment-2/submission/17307130331/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..709220cfa7a924d914ec1c098c505f864bcd4cfc
--- /dev/null
+++ b/assignment-2/submission/17307130331/utils.py
@@ -0,0 +1,71 @@
+import torch
+import numpy as np
+from matplotlib import pyplot as plt
+
+
+def plot_curve(data):
+    plt.plot(range(len(data)), data, color='blue')
+    plt.legend(['loss_value'], loc='upper right')
+    plt.xlabel('step')
+    plt.ylabel('value')
+    plt.show()
+
+
+def download_mnist():
+    from torchvision import datasets, transforms
+    
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=(0.1307,), std=(0.3081,))
+    ])
+    
+    train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True)
+    test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True)
+    
+    return train_dataset, test_dataset
+
+
+def one_hot(y, numpy=True):
+    if numpy:
+        y_ = np.zeros((y.shape[0], 10))
+        y_[np.arange(y.shape[0], dtype=np.int32), y] = 1
+        return y_
+    else:
+        y_ = torch.zeros((y.shape[0], 10))
+        y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1
+    return y_
+
+
+def batch(dataset, numpy=True):
+    data = []
+    label = []
+    for each in dataset:
+        data.append(each[0])
+        label.append(each[1])
+    data = torch.stack(data)
+    label = torch.LongTensor(label)
+    if numpy:
+        return [(data.numpy(), label.numpy())]
+    else:
+        return [(data, label)]
+
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+
+def get_torch_initialization(numpy=True):
+    fc1 = torch.nn.Linear(28 * 28, 256)
+    fc2 = torch.nn.Linear(256, 64)
+    fc3 = torch.nn.Linear(64, 10)
+    
+    if numpy:
+        W1 = fc1.weight.T.detach().clone().numpy()
+        W2 = fc2.weight.T.detach().clone().numpy()
+        W3 = fc3.weight.T.detach().clone().numpy()
+    else:
+        W1 = fc1.weight.T.detach().clone().data
+        W2 = fc2.weight.T.detach().clone().data
+        W3 = fc3.weight.T.detach().clone().data
+    
+    return W1, W2, W3
diff --git a/assignment-2/submission/18307130090/README.md b/assignment-2/submission/18307130090/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..647eb99d08956f5fea84c6aa563ab3e1576cfcc6
--- /dev/null
+++ b/assignment-2/submission/18307130090/README.md
@@ -0,0 +1,276 @@
+# PRML-2021 Assignment2
+
+Name: 夏海淞
+
+Student ID: 18307130090
+
+## Overview
+
+In this assignment I implemented a simple feed-forward neural network with `NumPy`, including the backward passes of the operators in `numpy_fnn.py` and the construction of the feed-forward model. To validate the model I trained and tested it on the MNIST dataset. In addition, I implemented the `Momentum` and `Adam` optimization algorithms and compared their performance.
+
+## Backward Passes of the Operators
+
+### `Matmul`
+
+`Matmul` computes:
+$$
+Y=X\times W
+$$
+where $Y,X,W$ are matrices of shapes $n\times d'$, $n\times d$ and $d\times d'$ respectively.
+
+By formulas (B.20) and (B.21) in [Neural Networks and Deep Learning (Qiu Xipeng)](https://nndl.github.io/nndl-book.pdf),
+$$
+\frac{\partial Y}{\partial W}=\frac{\partial(X\times W)}{\partial W}=X^T\\\\
+\frac{\partial Y}{\partial X}=\frac{\partial(X\times W)}{\partial X}=W^T
+$$
+Combining the chain rule with the rules of matrix calculus gives
+$$
+\nabla_X=\nabla_Y\times W^T\\\\
+\nabla_W=X^T\times \nabla_Y
+$$
+
+### `Relu`
+
+`Relu` computes:
+$$
+Y_{ij}=\begin{cases}
+X_{ij}&X_{ij}\ge0\\\\
+0&\text{otherwise}
+\end{cases}
+$$
+Therefore
+$$
+\frac{\partial Y_{ij}}{\partial X_{ij}}=\begin{cases}
+1&X_{ij}>0\\\\
+0&\text{otherwise}
+\end{cases}
+$$
+Combining this with the chain rule gives the backward formula: $\nabla_{Xij}=\nabla_{Yij}\cdot\frac{\partial Y_{ij}}{\partial X_{ij}}$
+
+### `Log`
+
+`Log` computes
+$$
+Y_{ij}=\ln(X_{ij}+\epsilon),\epsilon=10^{-12}
+$$
+Therefore
+$$
+\frac{\partial Y_{ij}}{\partial X_{ij}}=\frac1{X_{ij}+\epsilon}
+$$
+Combining this with the chain rule gives the backward formula: $\nabla_{Xij}=\nabla_{Yij}\cdot\frac{\partial Y_{ij}}{\partial {X_{ij}}}$
+
+### `Softmax`
+
+`Softmax` computes
+$$
+Y_{ij}=\frac{\exp\{X_{ij} \}}{\sum_{k=1}^c\exp\{X_{ik} \}}
+$$
+where $Y,X$ are both $N\times c$ matrices. `Softmax` clearly operates on each row of $X$ independently, so for the row components $X_k,Y_k$ of $X,Y$ we have
+$$
+\frac{\partial Y_{ki}}{\partial X_{kj}}=\begin{cases}
+\frac{\exp\{X_{kj} \}(\sum_t\exp\{X_{kt}\})-\exp\{2X_{ki}\}}{(\sum_t\exp\{X_{kt}\})^2}=Y_{ki}(1-Y_{ki})&i=j\\\\
+-\frac{\exp\{X_{ki} \}\exp\{X_{kj} \}}{(\sum_t\exp\{X_{kt}\})^2}=-Y_{ki}Y_{kj}&i\not=j
+\end{cases}
+$$
+This gives the Jacobian matrix of $Y_k$ with respect to $X_k$, satisfying $J_{ij}=\frac{\partial Y_{ki}}{\partial X_{kj}}$. Combining it with the chain rule,
+$$
+\nabla_{X_k}=\nabla_{Y_k}\times J
+$$
+Stacking the row components back together gives the final result of the backward pass.
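+
+As a sketch of that stacking in `numpy` (assuming `y` holds the saved softmax outputs of shape $(N,c)$ and `grad_y` the upstream gradient), the per-row Jacobians can be built and contracted in one pass with `einsum`:
+
+```python
+import numpy as np
+
+def softmax_backward(grad_y, y):
+    # J[n] = diag(y[n]) - outer(y[n], y[n]); contract grad_y[n] with J[n] for every row n
+    J = np.einsum('ni,ij->nij', y, np.eye(y.shape[1])) - np.einsum('ni,nj->nij', y, y)
+    return np.einsum('ni,nij->nj', grad_y, J)
+```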
+
+## Model Construction and Training
+
+### Building the Model
+
+#### `forward`
+
+Following the model of the `TorchModel` class in `torch_mnist.py`, it is built with the following code:
+
+```python
+def forward(self, x):
+    x = x.reshape(-1, 28 * 28)
+
+    x = self.relu_1.forward(self.matmul_1.forward(x, self.W1))
+    x = self.relu_2.forward(self.matmul_2.forward(x, self.W2))
+    x = self.matmul_3.forward(x, self.W3)
+
+    x = self.log.forward(self.softmax.forward(x))
+    
+    return x
+```
+
+The computation graph of the model is as follows:
+
+
+
+#### `backward`
+
+Following the computation graph of the model, it suffices to call the backward method of each operator in reverse order.
+
+```python
+def backward(self, y):
+    self.log_grad = self.log.backward(y)
+    self.softmax_grad = self.softmax.backward(self.log_grad)
+    self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+    self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+    self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+    self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+    self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+    return self.x1_grad
+```
+
+#### `mini_batch`
+
+`mini_batch` speeds up training while still giving good optimization behaviour. Full-batch training computes the average loss over the whole dataset and then backpropagates the corresponding gradient; when the training set is large this severely slows training. Purely stochastic training computes the loss and gradient for every individual sample, so the dataset size no longer affects the speed of each update, but because of the randomness of single samples the parameters may fail to converge to the optimum and instead oscillate around it. A compromise is to split the dataset into a number of batches, which speeds up training while keeping good convergence.
+
+In this assignment I re-implemented `mini_batch` in `numpy_mnist.py`, following `mini_batch` in `utils.py`:
+
+```python
+def mini_batch(dataset, batch_size=128):
+    data = np.array([np.array(each[0]) for each in dataset])
+    label = np.array([each[1] for each in dataset])
+
+    size = data.shape[0]
+    index = np.arange(size)
+    np.random.shuffle(index)
+
+    return [(data[index[i:i + batch_size]], label[index[i:i + batch_size]]) for i in range(0, size, batch_size)]
+```
+
+### Training the Model
+
+With `learning_rate=0.1`, `batch_size=128` and `epoch_number=10`, the training results are:
+
+```
+[0] Accuracy: 0.9486
+[1] Accuracy: 0.9643
+[2] Accuracy: 0.9724
+[3] Accuracy: 0.9738
+[4] Accuracy: 0.9781
+[5] Accuracy: 0.9768
+[6] Accuracy: 0.9796
+[7] Accuracy: 0.9802
+[8] Accuracy: 0.9800
+[9] Accuracy: 0.9796
+```
+
+Shrinking `batch_size` to 64, the training results are:
+
+```
+[0] Accuracy: 0.9597
+[1] Accuracy: 0.9715
+[2] Accuracy: 0.9739
+[3] Accuracy: 0.9771
+[4] Accuracy: 0.9775
+[5] Accuracy: 0.9803
+[6] Accuracy: 0.9808
+[7] Accuracy: 0.9805
+[8] Accuracy: 0.9805
+[9] Accuracy: 0.9716
+```
+
+
+
+
+Lowering `learning_rate` to 0.01, the training results are:
+
+```
+[0] Accuracy: 0.8758
+[1] Accuracy: 0.9028
+[2] Accuracy: 0.9143
+[3] Accuracy: 0.9234
+[4] Accuracy: 0.9298
+[5] Accuracy: 0.9350
+[6] Accuracy: 0.9397
+[7] Accuracy: 0.9434
+[8] Accuracy: 0.9459
+[9] Accuracy: 0.9501
+```
+
+
+
+
+From these results the following can be concluded:
+
+With a suitable learning rate and batch size, the convergence speed of the parameters decreases as the learning rate decreases, and the oscillation of the parameters grows as the batch size decreases.
+
+## Improvements on Gradient Descent
+
+Plain gradient descent can be written as:
+$$
+w_{t+1}=w_t-\eta\cdot\nabla f(w_t)
+$$
+Although gradient descent is widely used as an optimizer, it still has some drawbacks, mainly:
+
+- the update direction is decided entirely by the current gradient, so with a high learning rate the parameters may oscillate around the optimum;
+- the learning rate cannot change with the training progress, so convergence is slow early in training and may fail later on.
+
+Many improved variants of gradient descent address these flaws; `Momentum` and `Adam` are among the most typical.
+
+### `Momentum`
+
+To address the problem that "the update direction is decided entirely by the current gradient", `Momentum` introduces the notion of momentum.
+
+By analogy with the real world, when a ball rolls downhill its direction of motion depends not only on how steep its current position is, but also on its current velocity, i.e. on how steep the earlier positions were. In the `Momentum` algorithm, the parameter update therefore does not depend on the current gradient alone but on an exponential moving average of the gradients over time:
+$$
+m_t=\beta\cdot m_{t-1}+(1-\beta)\cdot\nabla f(w_t)\\\\
+w_{t+1}=w_t-\eta\cdot m_t
+$$
+The exponential moving average captures the "inertia" of the parameter updates. When the update direction is correct, `Momentum` helps to speed up training and reduce oscillation; when the direction is wrong, `Momentum` loses some performance because it cannot change direction in time.
+
+The training results with `Momentum` are:
+
+```
+[0] Accuracy: 0.9444
+[1] Accuracy: 0.9627
+[2] Accuracy: 0.9681
+[3] Accuracy: 0.9731
+[4] Accuracy: 0.9765
+[5] Accuracy: 0.9755
+[6] Accuracy: 0.9768
+[7] Accuracy: 0.9790
+[8] Accuracy: 0.9794
+[9] Accuracy: 0.9819
+```
+
+
+
+
+It shows no clear advantage over plain gradient descent.
+
+### `Adam`
+
+To address the problem that "the learning rate cannot change with the training progress", `Adam` builds on `Momentum` and introduces a second-order moment.
+
+The idea behind `Adam` is that a neural network has a large number of parameters which are updated at different rates. For frequently updated parameters we would like to lower the learning rate somewhat to make convergence more likely, while for the others we would like to raise it to speed up convergence. Moreover, how often a parameter is adjusted can change over time, so the learning rate should adapt dynamically as well.
+
+Since the size of an update is directly tied to the current gradient, the sum of squared historical gradients is used to measure how frequently a parameter is updated: a large sum means the parameter is updated often and its learning rate should be lowered. Gradient descent is thus rewritten as:
+$$
+m_t=\beta\cdot m_{t-1}+(1-\beta)\cdot\nabla f(w_t)\\\\
+V_t=V_{t-1}+\nabla^2f(w_t)\\\\
+w_{t+1}=w_t-\frac\eta{\sqrt{V_t}}\cdot m_t
+$$
+However, since $V_t$ is monotonically increasing in $t$, the learning rate may become too small late in training and the parameters may fail to reach the optimum. Replacing $V_t$ with an exponential moving average as well avoids this:
+$$
+m_t=\beta_1\cdot m_{t-1}+(1-\beta_1)\cdot\nabla f(w_t)\\\\
+V_t=\beta_2\cdot V_{t-1}+(1-\beta_2)\cdot\nabla^2f(w_t)\\\\
+w_{t+1}=w_t-\frac\eta{\sqrt{V_t}}\cdot m_t
+$$
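+
+A minimal per-parameter sketch of this update rule (the helper function and the small constant `delta` added to the denominator are illustrative, not part of the submitted code):
+
+```python
+import numpy as np
+
+def adam_step(w, grad, m, V, lr=0.001, beta1=0.9, beta2=0.999, delta=1e-8):
+    # first- and second-moment exponential moving averages, as in the formulas above
+    m = beta1 * m + (1 - beta1) * grad
+    V = beta2 * V + (1 - beta2) * grad * grad
+    # parameter update; delta avoids division by zero
+    w = w - lr * m / (np.sqrt(V) + delta)
+    return w, m, V
+```
+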
+The training results with `Adam` are:
+
+```
+[0] Accuracy: 0.9657
+[1] Accuracy: 0.9724
+[2] Accuracy: 0.9759
+[3] Accuracy: 0.9769
+[4] Accuracy: 0.9788
+[5] Accuracy: 0.9778
+[6] Accuracy: 0.9775
+[7] Accuracy: 0.9759
+[8] Accuracy: 0.9786
+[9] Accuracy: 0.9779
+```
+
+
+
+
+Compared with plain gradient descent, the loss oscillates somewhat less, while the convergence speed is about the same.
\ No newline at end of file
diff --git a/assignment-2/submission/18307130090/img/Adam.png b/assignment-2/submission/18307130090/img/Adam.png
new file mode 100644
index 0000000000000000000000000000000000000000..fe0326ebad52ad9356bdd7410834d9d61e9e5152
Binary files /dev/null and b/assignment-2/submission/18307130090/img/Adam.png differ
diff --git a/assignment-2/submission/18307130090/img/SGDM.png b/assignment-2/submission/18307130090/img/SGDM.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba7ad91c5569f2605e7944afe3803863b8072b46
Binary files /dev/null and b/assignment-2/submission/18307130090/img/SGDM.png differ
diff --git a/assignment-2/submission/18307130090/img/SGD_batch_size.png b/assignment-2/submission/18307130090/img/SGD_batch_size.png
new file mode 100644
index 0000000000000000000000000000000000000000..328c4cc7bf90ef75a09f8c97ee8e9134d44a33dd
Binary files /dev/null and b/assignment-2/submission/18307130090/img/SGD_batch_size.png differ
diff --git a/assignment-2/submission/18307130090/img/SGD_learning_rate.png b/assignment-2/submission/18307130090/img/SGD_learning_rate.png
new file mode 100644
index 0000000000000000000000000000000000000000..7bca928d1aa569b08dad43d761da1b6e27e02942
Binary files /dev/null and b/assignment-2/submission/18307130090/img/SGD_learning_rate.png differ
diff --git a/assignment-2/submission/18307130090/img/SGD_normal.png b/assignment-2/submission/18307130090/img/SGD_normal.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6f3933e1bf979fa7b3b643d8f7fe823610109e9
Binary files /dev/null and b/assignment-2/submission/18307130090/img/SGD_normal.png differ
diff --git a/assignment-2/submission/18307130090/img/fnn_model.png b/assignment-2/submission/18307130090/img/fnn_model.png
new file mode 100644
index 0000000000000000000000000000000000000000..29ed50732a88ed1ca38a1cb3c6e82099a3d3e087
Binary files /dev/null and b/assignment-2/submission/18307130090/img/fnn_model.png differ
diff --git a/assignment-2/submission/18307130090/numpy_fnn.py b/assignment-2/submission/18307130090/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..7010cad4609f7ae31b8bdc0b19cedc005c5b950c
--- /dev/null
+++ b/assignment-2/submission/18307130090/numpy_fnn.py
@@ -0,0 +1,239 @@
+import numpy as np
+
+
+class NumpyOp:
+
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        w: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        x, W = self.memory['x'], self.memory['W']
+        grad_x = np.matmul(grad_y, W.T)
+        grad_W = np.matmul(x.T, grad_y)
+
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        x = self.memory['x']
+        grad_x = grad_y * np.where(x > 0, np.ones_like(x), np.zeros_like(x))
+
+        return grad_x
+
+
+class Log(NumpyOp):
+
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+
+        return out
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        x = self.memory['x']
+        grad_x = grad_y * np.reciprocal(x + self.epsilon)
+
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        exp_x = np.exp(x - x.max(axis=1, keepdims=True))  # 按行减去最大值,避免指数溢出
+        exp_sum = np.sum(exp_x, axis=1, keepdims=True)
+        out = exp_x / exp_sum
+        self.memory['x'] = x
+        self.memory['out'] = out
+
+        return out
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        sm = self.memory['out']
+        Jacobs = np.array([np.diag(r) - np.outer(r, r) for r in sm])
+
+        grad_y = grad_y[:, np.newaxis, :]
+        grad_x = np.matmul(grad_y, Jacobs).squeeze(axis=1)
+
+        return grad_x
+
+
+class NumpyLoss:
+
+    def __init__(self):
+        self.target = None
+
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+        # 以下算子会在 forward 和 backward 中使用
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # 以下变量需要在 backward 中更新。 softmax_grad, log_grad 等为算子反向传播的梯度( loss 关于算子输入的偏导)
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+
+        self.beta_1 = 0.9
+        self.beta_2 = 0.999
+        self.epsilon = 1e-8
+        self.is_first = True
+
+        self.W1_grad_mean = None
+        self.W2_grad_mean = None
+        self.W3_grad_mean = None
+
+        self.W1_grad_square_mean = None
+        self.W2_grad_square_mean = None
+        self.W3_grad_square_mean = None
+
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+
+        x = self.relu_1.forward(self.matmul_1.forward(x, self.W1))
+        x = self.relu_2.forward(self.matmul_2.forward(x, self.W2))
+        x = self.matmul_3.forward(x, self.W3)
+
+        x = self.log.forward(self.softmax.forward(x))
+
+        return x
+
+    def backward(self, y):
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+        return self.x1_grad
+
+    def optimize(self, learning_rate):
+        def SGD():
+            self.W1 -= learning_rate * self.W1_grad
+            self.W2 -= learning_rate * self.W2_grad
+            self.W3 -= learning_rate * self.W3_grad
+
+        def SGDM():
+            if self.is_first:
+                self.is_first = False
+
+                self.W1_grad_mean = self.W1_grad
+                self.W2_grad_mean = self.W2_grad
+                self.W3_grad_mean = self.W3_grad
+            else:
+                self.W1_grad_mean = self.beta_1 * self.W1_grad_mean + (1 - self.beta_1) * self.W1_grad
+                self.W2_grad_mean = self.beta_1 * self.W2_grad_mean + (1 - self.beta_1) * self.W2_grad
+                self.W3_grad_mean = self.beta_1 * self.W3_grad_mean + (1 - self.beta_1) * self.W3_grad
+
+            delta_1 = learning_rate * self.W1_grad_mean
+            delta_2 = learning_rate * self.W2_grad_mean
+            delta_3 = learning_rate * self.W3_grad_mean
+
+            self.W1 -= delta_1
+            self.W2 -= delta_2
+            self.W3 -= delta_3
+
+        def Adam(learning_rate=0.001):
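+            # 注:此实现未做 Adam 的偏置修正,首个 batch 直接以原始梯度及其平方作为一阶、二阶动量的初值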
+            if self.is_first:
+                self.is_first = False
+                self.W1_grad_mean = self.W1_grad
+                self.W2_grad_mean = self.W2_grad
+                self.W3_grad_mean = self.W3_grad
+
+                self.W1_grad_square_mean = np.square(self.W1_grad)
+                self.W2_grad_square_mean = np.square(self.W2_grad)
+                self.W3_grad_square_mean = np.square(self.W3_grad)
+
+                self.W1 -= learning_rate * self.W1_grad_mean
+                self.W2 -= learning_rate * self.W2_grad_mean
+                self.W3 -= learning_rate * self.W3_grad_mean
+            else:
+                self.W1_grad_mean = self.beta_1 * self.W1_grad_mean + (1 - self.beta_1) * self.W1_grad
+                self.W2_grad_mean = self.beta_1 * self.W2_grad_mean + (1 - self.beta_1) * self.W2_grad
+                self.W3_grad_mean = self.beta_1 * self.W3_grad_mean + (1 - self.beta_1) * self.W3_grad
+
+                self.W1_grad_square_mean = self.beta_2 * self.W1_grad_square_mean + (1 - self.beta_2) * np.square(
+                    self.W1_grad)
+                self.W2_grad_square_mean = self.beta_2 * self.W2_grad_square_mean + (1 - self.beta_2) * np.square(
+                    self.W2_grad)
+                self.W3_grad_square_mean = self.beta_2 * self.W3_grad_square_mean + (1 - self.beta_2) * np.square(
+                    self.W3_grad)
+
+                delta_1 = learning_rate * self.W1_grad_mean * np.reciprocal(
+                    np.sqrt(self.W1_grad_square_mean) + np.full_like(self.W1_grad_square_mean, self.epsilon))
+                delta_2 = learning_rate * self.W2_grad_mean * np.reciprocal(
+                    np.sqrt(self.W2_grad_square_mean) + np.full_like(self.W2_grad_square_mean, self.epsilon))
+                delta_3 = learning_rate * self.W3_grad_mean * np.reciprocal(
+                    np.sqrt(self.W3_grad_square_mean) + np.full_like(self.W3_grad_square_mean, self.epsilon))
+
+                self.W1 -= delta_1
+                self.W2 -= delta_2
+                self.W3 -= delta_3
+
+        # SGD()
+        # SGDM()
+        Adam()
diff --git a/assignment-2/submission/18307130090/numpy_mnist.py b/assignment-2/submission/18307130090/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d67f25824dabdc5791ae5cc96655affe8315e72
--- /dev/null
+++ b/assignment-2/submission/18307130090/numpy_mnist.py
@@ -0,0 +1,50 @@
+import numpy as np
+
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot
+
+
+def mini_batch(dataset, batch_size=128):
+    data = np.array([np.array(each[0]) for each in dataset])
+    label = np.array([each[1] for each in dataset])
+
+    size = data.shape[0]
+    index = np.arange(size)
+    np.random.shuffle(index)
+
+    return [(data[index[i:i + batch_size]], label[index[i:i + batch_size]]) for i in range(0, size, batch_size)]
+
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+
+    train_loss = []
+
+    epoch_number = 10
+    learning_rate = 0.1
+
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+
+            train_loss.append(loss.item())
+
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/18307130090/tester_demo.py b/assignment-2/submission/18307130090/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..504b3eef50a6df4d0aa433113136add50835e420
--- /dev/null
+++ b/assignment-2/submission/18307130090/tester_demo.py
@@ -0,0 +1,182 @@
+import numpy as np
+import torch
+from torch import matmul as torch_matmul, relu as torch_relu, softmax as torch_softmax, log as torch_log
+
+from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss
+from torch_mnist import TorchModel
+from utils import get_torch_initialization, one_hot
+
+err_epsilon = 1e-6
+err_p = 0.4
+
+
+def check_result(numpy_result, torch_result=None):
+    if isinstance(numpy_result, list) and torch_result is None:
+        flag = True
+        for (n, t) in numpy_result:
+            flag = flag and check_result(n, t)
+        return flag
+    # print((torch.from_numpy(numpy_result) - torch_result).abs().mean().item())
+    T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item()
+    direction = T / torch_result.numel() < err_p
+    return direction and ((torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon).item()
+
+
+def case_1():
+    x = np.random.normal(size=[5, 6])
+    W = np.random.normal(size=[6, 4])
+    
+    numpy_matmul = Matmul()
+    numpy_out = numpy_matmul.forward(x, W)
+    numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    torch_W = torch.from_numpy(W).clone().requires_grad_()
+    
+    torch_out = torch_matmul(torch_x, torch_W)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+        (numpy_W_grad, torch_W.grad)
+    ])
+
+
+def case_2():
+    x = np.random.normal(size=[5, 6])
+    
+    numpy_relu = Relu()
+    numpy_out = numpy_relu.forward(x)
+    numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_relu(torch_x)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_3():
+    x = np.random.uniform(low=0.0, high=1.0, size=[3, 4])
+    
+    numpy_log = Log()
+    numpy_out = numpy_log.forward(x)
+    numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_log(torch_x)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_4():
+    x = np.random.normal(size=[4, 5])
+    
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_softmax(torch_x, 1)
+    
+    return check_result(numpy_out, torch_out)
+
+
+def case_5():
+    x = np.random.normal(size=[20, 25])
+    
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+    numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_softmax(torch_x, 1)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def test_model():
+    try:
+        numpy_loss = NumpyLoss()
+        numpy_model = NumpyModel()
+        torch_model = TorchModel()
+        torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = get_torch_initialization(numpy=False)
+        numpy_model.W1 = torch_model.W1.detach().clone().numpy()
+        numpy_model.W2 = torch_model.W2.detach().clone().numpy()
+        numpy_model.W3 = torch_model.W3.detach().clone().numpy()
+        
+        x = torch.randn((10000, 28, 28))
+        y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000)
+        
+        y = one_hot(y, numpy=False)
+        x2 = x.numpy()
+        y_pred = torch_model.forward(x)
+        loss = (-y_pred * y).sum(dim=1).mean()
+        loss.backward()
+        
+        y_pred_numpy = numpy_model.forward(x2)
+        numpy_loss.get_loss(y_pred_numpy, y.numpy())
+        
+        check_flag_1 = check_result(y_pred_numpy, y_pred)
+        print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10))
+    except:
+        print("[Runtime Error in forward]")
+        print("+ {:12} {}/{}".format("forward", 0, 10))
+        return 0
+    
+    try:
+        
+        numpy_model.backward(numpy_loss.backward())
+        
+        check_flag_2 = [
+            check_result(numpy_model.log_grad, torch_model.log_input.grad),
+            check_result(numpy_model.softmax_grad, torch_model.softmax_input.grad),
+            check_result(numpy_model.W3_grad, torch_model.W3.grad),
+            check_result(numpy_model.W2_grad, torch_model.W2.grad),
+            check_result(numpy_model.W1_grad, torch_model.W1.grad)
+        ]
+        check_flag_2 = sum(check_flag_2) >= 4
+        print("+ {:12} {}/{}".format("backward", 20 * check_flag_2, 20))
+    except:
+        print("[Runtime Error in backward]")
+        print("+ {:12} {}/{}".format("backward", 0, 20))
+        check_flag_2 = False
+    
+    return 10 * check_flag_1 + 20 * check_flag_2
+
+
+if __name__ == "__main__":
+    testcases = [
+        ["matmul", case_1, 5],
+        ["relu", case_2, 5],
+        ["log", case_3, 5],
+        ["softmax_1", case_4, 5],
+        ["softmax_2", case_5, 10],
+    ]
+    score = 0
+    for case in testcases:
+        try:
+            res = case[2] if case[1]() else 0
+        except:
+            print("[Runtime Error in {}]".format(case[0]))
+            res = 0
+        score += res
+        print("+ {:12} {}/{}".format(case[0], res, case[2]))
+    score += test_model()
+    print("{:14} {}/60".format("FINAL SCORE", score))
diff --git a/assignment-2/submission/18307130090/torch_mnist.py b/assignment-2/submission/18307130090/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3e214c7606e3d43dac4b94554f942508afffb3
--- /dev/null
+++ b/assignment-2/submission/18307130090/torch_mnist.py
@@ -0,0 +1,73 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+    
+    def __init__(self):
+        self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+        self.W2 = torch.randn((256, 64), requires_grad=True)
+        self.W3 = torch.randn((64, 10), requires_grad=True)
+        self.softmax_input = None
+        self.log_input = None
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        x = torch.relu(torch.matmul(x, self.W1))
+        x = torch.relu(torch.matmul(x, self.W2))
+        x = torch.matmul(x, self.W3)
+        
+        self.softmax_input = x
+        self.softmax_input.retain_grad()
+        
+        x = torch.softmax(x, 1)
+        
+        self.log_input = x
+        self.log_input.retain_grad()
+        
+        x = torch.log(x)
+        
+        return x
+    
+    def optimize(self, learning_rate):
+        with torch.no_grad():
+            self.W1 -= learning_rate * self.W1.grad
+            self.W2 -= learning_rate * self.W2.grad
+            self.W3 -= learning_rate * self.W3.grad
+            
+            self.W1.grad = None
+            self.W2.grad = None
+            self.W3.grad = None
+
+
+def torch_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = TorchModel()
+    model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False)
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset, numpy=False):
+            y = one_hot(y, numpy=False)
+            
+            y_pred = model.forward(x)
+            loss = (-y_pred * y).sum(dim=1).mean()
+            loss.backward()
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset, numpy=False)[0]
+        accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    torch_run()
diff --git a/assignment-2/submission/18307130090/utils.py b/assignment-2/submission/18307130090/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..709220cfa7a924d914ec1c098c505f864bcd4cfc
--- /dev/null
+++ b/assignment-2/submission/18307130090/utils.py
@@ -0,0 +1,71 @@
+import torch
+import numpy as np
+from matplotlib import pyplot as plt
+
+
+def plot_curve(data):
+    plt.plot(range(len(data)), data, color='blue')
+    plt.legend(['loss_value'], loc='upper right')
+    plt.xlabel('step')
+    plt.ylabel('value')
+    plt.show()
+
+
+def download_mnist():
+    from torchvision import datasets, transforms
+    
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=(0.1307,), std=(0.3081,))
+    ])
+    
+    train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True)
+    test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True)
+    
+    return train_dataset, test_dataset
+
+
+def one_hot(y, numpy=True):
+    if numpy:
+        y_ = np.zeros((y.shape[0], 10))
+        y_[np.arange(y.shape[0], dtype=np.int32), y] = 1
+        return y_
+    else:
+        y_ = torch.zeros((y.shape[0], 10))
+        y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1
+    return y_
+
+
+def batch(dataset, numpy=True):
+    data = []
+    label = []
+    for each in dataset:
+        data.append(each[0])
+        label.append(each[1])
+    data = torch.stack(data)
+    label = torch.LongTensor(label)
+    if numpy:
+        return [(data.numpy(), label.numpy())]
+    else:
+        return [(data, label)]
+
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+
+def get_torch_initialization(numpy=True):
+    fc1 = torch.nn.Linear(28 * 28, 256)
+    fc2 = torch.nn.Linear(256, 64)
+    fc3 = torch.nn.Linear(64, 10)
+    
+    if numpy:
+        W1 = fc1.weight.T.detach().clone().numpy()
+        W2 = fc2.weight.T.detach().clone().numpy()
+        W3 = fc3.weight.T.detach().clone().numpy()
+    else:
+        W1 = fc1.weight.T.detach().clone().data
+        W2 = fc2.weight.T.detach().clone().data
+        W3 = fc3.weight.T.detach().clone().data
+    
+    return W1, W2, W3
diff --git a/assignment-2/submission/18307130104/README.md b/assignment-2/submission/18307130104/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d1d38cfc70c1a72658e9d0fa1cf8569687ab9e45
--- /dev/null
+++ b/assignment-2/submission/18307130104/README.md
@@ -0,0 +1,179 @@
+18307130104
+
+# 课程报告
+
+这是 prml 的 assignment-2 课程报告,我的代码见 numpy_fnn.py 中 code 1 ~ code 7 部分,以及 utils.py 中 mini_batch 函数 numpy == True 的部分。
+
+在 assignment-2 中,完成了 numpy_fnn.py 中各种算子的反向传播,以及一个简单的前馈神经网络构建(包括正向传播和反向传播)。修改了 mini_batch,在 numpy == True 的情况下,不使用 torch 中的 DataLoader 完成数据集的打乱和分批。
+
+## 模型实现
+
+为了区别矩阵乘法(np.matmul)和矩阵元素逐一做乘法(\*),下面用$\times$表示矩阵乘法,\*表示元素逐一相乘。
+
+### Matmul 算子的反向传播
+
+Matmul 算子输入一个 X 和权重 W,输出 $$[Y] = [X] \times [W]$$
+
+对于 Y 中的元素 $$Y_{ij}$$ 有$$Y_{ij}=\sum_{k}X_{ik} * W_{kj}$$
+
+在计算 grad_x 的时候,已知 grad_y,根据链式法则,可以得到 $gradx_{ij}=\sum_{k}\frac{\partial Y_{ik}}{\partial X_{ij}} * grady_{ik}$
+
+由 $Y_{ij}$的计算公式可以得到,$\frac{\partial Y_{ik}}{\partial X_{ij}}=W_{jk}$
+
+故 $gradx_{ij}=\sum_k W_{jk} *grady_{ik}$
+
+所以 $[gradx] = [grady] \times [W^T]$
+
+同理,可以得到$[gradW]=[x^T]\times [grady]$
+
+经过验证,矩阵的大小符合矩阵乘法规则。
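+
+下面用一小段示意代码验证上述两个梯度公式的形状是匹配的(其中的矩阵均为随机生成的示例数据,不属于提交的实现):
+
+```python
+import numpy as np
+
+N, d, d_out = 5, 6, 4
+x = np.random.randn(N, d)
+W = np.random.randn(d, d_out)
+grad_y = np.ones((N, d_out))        # 假设上游梯度全为 1
+
+grad_x = np.matmul(grad_y, W.T)     # 形状 (N, d),与 x 一致
+grad_W = np.matmul(x.T, grad_y)     # 形状 (d, d_out),与 W 一致
+assert grad_x.shape == x.shape and grad_W.shape == W.shape
+```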
+
+### Relu 算子的反向传播
+
+relu 函数的计算规则如下:
+
+$relu(x) = \begin{cases}0 & x < 0 \\\\ x & otherwise \end{cases}$
+
+求导可以得到
+
+$relu^{'}(x) = \begin{cases}0 & x < 0 \\\\ 1 & otherwise \end{cases}$
+
+故
+
+$[relugrad]=[grady]* [relu^{'}]$
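+
+上式在 numpy 中可以用布尔掩码一行写出,下面是一个小例子(x、grad_y 为假设的示例数据,并非提交的实现):
+
+```python
+import numpy as np
+
+x = np.array([[-1.0, 2.0], [3.0, -4.0]])
+grad_y = np.ones_like(x)
+grad_x = grad_y * (x > 0)   # x > 0 的布尔掩码即为 relu'(x)
+```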
+
+### Log 算子的反向传播
+
+$log(x) = \ln x$
+
+可以得到
+
+$log^{'}(x)=\frac 1 x$
+
+故
+
+$[loggrad]=[grady]* [log^{'}]$
+
+### softmax 算子的反向传播
+
+$softmax(x_i) = \frac {e^{x_i}}{\sum_j e^{x_j}}$
+
+在实现过程中,因为每一行代表一个测试数据点,所以以每一行为整体对每个元素进行 softmax 操作,从而达成对每个测试数据点进行分类的目的。
+
+采用 softmax 算子和交叉熵损失函数可以让损失函数的形式比较简单,但是遗憾的是实现的神经网络要求将两个算子的反向传播操作分开,因此没有办法投机取巧,只能分步进行计算。
+
+为了表达方便,不妨令 $a_i = softmax(x_i)$
+
+下面考虑$a_i$对$x_j$的反向传播。
+
+$a_i = \frac{e^{x_i}}{\sum_k e^{x_k}}$
+
+$\frac {\partial a_i}{\partial x_j}=\frac{\partial}{\partial x_j}(\frac{e^{x_i}}{\sum_k e^{x_k}})$
+
+接下来根据 i 和 j 是否相等分情况进行讨论。
+
+若 i == j,则 $\frac{\partial}{\partial x_j}(\frac{e^{x_i}}{\sum_k e^{x_k}})=\frac{e^{x_i}(\sum_k e^{x_k})-e^{x_i}e^{x_i}}{(\sum_k e^{x_k})^2}=a_i(1-a_i)$
+
+若 i != j,则$\frac{\partial}{\partial x_j}(\frac{e^{x_i}}{\sum_k e^{x_k}})=-\frac{e^{x_i}e^{x_j}}{(\sum_k e^{x_k})^2}=-a_ia_j$
+
+结合 grady,可以得到
+
+$gradx_{ij}=\sum_k \frac{\partial}{\partial x_j}(\frac{e^{x_k}}{\sum_w e^{x_w}}) grady_{ik}$
+
+由于这个梯度的计算需要进行分类讨论,我没有想到可以直接用 numpy 中函数进行计算的方法,所以首先计算出一个 list 再转换成 ndarray 进行返回。
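+
+作为参考,这里的分情况讨论也可以写成每个样本的雅可比矩阵 $diag(a)-aa^T$ 与上游梯度相乘的形式,下面是一个示意写法(softx 为 softmax 的输出、grad_y 为上游梯度,均为假设的变量名,并非本次提交的实现):
+
+```python
+import numpy as np
+
+def softmax_backward_vectorized(softx, grad_y):
+    # 每个样本的雅可比矩阵 J = diag(a) - a a^T
+    jacobians = np.array([np.diag(a) - np.outer(a, a) for a in softx])
+    # grad_x[n, j] = sum_i grad_y[n, i] * J[n, i, j]
+    return np.einsum('ni,nij->nj', grad_y, jacobians)
+```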
+
+### 模型正向传播
+
+模型每一层的输出作为下一层的输入,最后得到的是经过 Log 计算的 softmax 结果,这样就能很方便地进行交叉熵损失函数的计算。同时经过“模型反向传播”中的分析可以知道,这样设计使反向传播时的输入也非常简便。
+
+### 模型反向传播
+
+模型进行反向传播的时候会输入一个每行为一个独热向量的矩阵,表示每个样本的类别。初始代码中会将矩阵中所有元素都除以矩阵的大小,但是经过我的尝试,需要将所有元素除以训练数据的组数才能保证结果正确。~~同时,虽然通过了测试,但是 softmax 层的输出也和 torch 中的结果有不同,而后面层的输出是正确的。我认定我理解的 softmax 层和 torch 实现的 softmax 层有一定区别。~~
+
+在更改了测试代码之后,输出和 torch 层比较接近,可以认定是正确的。
+
+接下来推导反向传播时 Log 层的输入。
+
+交叉熵损失函数的形式为
+
+$Loss = -\sum_k t_k*\ln a_k$
+
+其中 $t_k$表示是否属于第 k 个类别,$a_k$为 softmax 层的输出,Log 层的输出为$\ln a_k$,则$\frac{\partial Loss}{\partial \ln a_k}=-t_k$
+
+因此,将输入到反向传播的矩阵 T 取反作为 Log 层的反向传播输入,然后将结果作为前一层的输入逐一反向传播。
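+
+按照上面的推导,反向传播入口处传给 Log 层的梯度可以示意地写成下面这样(T 为独热标签矩阵,除以样本数 N 对应损失中的取平均,变量名为假设):
+
+```python
+import numpy as np
+
+T = np.eye(10)[[3, 1, 7]]   # 3 个样本的独热标签,形状 (N, 10)
+N = T.shape[0]
+grad_log = -T / N           # 交叉熵对 ln a_k 的偏导为 -t_k,再对样本取平均
+```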
+
+## 模型训练
+
+随着训练轮数增长,训练的正确率如下
+
+learning_rate = 0.1    mini_batch = 128
+
+> [0] Accuracy: 0.9403
+> [1] Accuracy: 0.9641
+> [2] Accuracy: 0.9716
+> [3] Accuracy: 0.9751
+> [4] Accuracy: 0.9772
+> [5] Accuracy: 0.9782
+> [6] Accuracy: 0.9745
+> [7] Accuracy: 0.9807
+> [8] Accuracy: 0.9790
+> [9] Accuracy: 0.9811
+
+损失随训练轮数变化如下图所示
+
+(损失随训练轮数变化曲线图)
+可以看到,正确率随着训练稳步上升,在 6 轮之后,数字基本稳定,仅仅有略微的上下波动。
+
+learning_rate = 0.1    mini_batch = 32
+
+> [0] Accuracy: 0.9646
+> [1] Accuracy: 0.9726
+> [2] Accuracy: 0.9768
+> [3] Accuracy: 0.9788
+> [4] Accuracy: 0.9792
+> [5] Accuracy: 0.9770
+> [6] Accuracy: 0.9820
+> [7] Accuracy: 0.9808
+> [8] Accuracy: 0.9822
+> [9] Accuracy: 0.9835
+
+(损失曲线图)
+可以看到,由于 mini_batch 从 128 变成 32,损失随着轮数的变化会有比较大的起伏。
+
+learning_rate = 0.2    mini_batch = 128
+
+> [0] Accuracy: 0.9295
+> [1] Accuracy: 0.9688
+> [2] Accuracy: 0.9753
+> [3] Accuracy: 0.9734
+> [4] Accuracy: 0.9793
+> [5] Accuracy: 0.9777
+> [6] Accuracy: 0.9792
+> [7] Accuracy: 0.9807
+> [8] Accuracy: 0.9821
+> [9] Accuracy: 0.9815
+
+(损失曲线图)
+虽然调高了学习率,但是损失并没有因此产生比较大的起伏,仍然表现出非常好的效果。
+
+learning_rate = 0.05 mini_batch = 128
+
+> [0] Accuracy: 0.9310
+> [1] Accuracy: 0.9504
+> [2] Accuracy: 0.9601
+> [3] Accuracy: 0.9661
+> [4] Accuracy: 0.9691
+> [5] Accuracy: 0.9728
+> [6] Accuracy: 0.9749
+> [7] Accuracy: 0.9761
+> [8] Accuracy: 0.9768
+> [9] Accuracy: 0.9752
+
+(损失曲线图)
+降低了学习率之后,可以看到正确率的增长比较缓慢,但是经过几轮训练之后的结果和高学习率的时候差不多。
+
+综合来看,影响最终正确率的主要还是模型本身的学习能力,一定范围内修改学习率和 mini_batch 对结果的影响不大。采用 mini_batch 的方式训练有助于降低训练过程中损失的波动。
\ No newline at end of file
diff --git a/assignment-2/submission/18307130104/img/result-1.png b/assignment-2/submission/18307130104/img/result-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..11c6fba6be9d6f58a463830a5d8c006ad64af963
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-1.png differ
diff --git a/assignment-2/submission/18307130104/img/result-2.png b/assignment-2/submission/18307130104/img/result-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..3f9aa1a2ed643f738f7d9ff59ea1923891048166
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-2.png differ
diff --git a/assignment-2/submission/18307130104/img/result-3.png b/assignment-2/submission/18307130104/img/result-3.png
new file mode 100644
index 0000000000000000000000000000000000000000..1e7d29f9f43741b83d6ac43ecf4b6c448c8c1141
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-3.png differ
diff --git a/assignment-2/submission/18307130104/img/result-4.png b/assignment-2/submission/18307130104/img/result-4.png
new file mode 100644
index 0000000000000000000000000000000000000000..2a1f550db001bdcc1d3a3b9501dba56a13028e8e
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-4.png differ
diff --git a/assignment-2/submission/18307130104/img/result-5.png b/assignment-2/submission/18307130104/img/result-5.png
new file mode 100644
index 0000000000000000000000000000000000000000..7ee7df630e01d83559e9f316a937df107e98248d
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-5.png differ
diff --git a/assignment-2/submission/18307130104/img/result.png b/assignment-2/submission/18307130104/img/result.png
new file mode 100644
index 0000000000000000000000000000000000000000..0039ef8029c07eeb75caa2efd42c13aeba61ce5a
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result.png differ
diff --git a/assignment-2/submission/18307130104/numpy_fnn.py b/assignment-2/submission/18307130104/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba780e9edb71ec687ddf7d295973be810848ce79
--- /dev/null
+++ b/assignment-2/submission/18307130104/numpy_fnn.py
@@ -0,0 +1,214 @@
+import numpy as np
+
+
+class NumpyOp:
+    
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+    
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        w: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+    
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        
+        ####################
+        #      code 1      #
+        ####################
+        grad_x = np.matmul(grad_y, self.memory['W'].T)
+        grad_W = np.matmul(self.memory['x'].T, grad_y)
+        
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+    
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 2      #
+        ####################
+        grad_x = grad_y * np.where(self.memory['x'] > 0, np.ones_like(self.memory['x']), np.zeros_like(self.memory['x']))
+        
+        return grad_x
+
+
+class Log(NumpyOp):
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+        
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 3      #
+        ####################
+        grad_x = grad_y * np.reciprocal(self.memory['x'] + self.epsilon)
+        
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        
+        ####################
+        #      code 4      #
+        ####################
+        self.memory['x'] = x
+        expx = np.exp(x)
+        sumx = np.sum(expx, axis = 1, keepdims = True)
+        return (expx / sumx)
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 5      #
+        ####################
+        
+        x = self.memory['x']
+        softx = self.forward(x)
+        # print(sumx.shape)
+        [n, m] = x.shape
+        out = []
+        # print(grad_y)
+        for i in range(n):
+            out.append([])
+            for j in range(m):
+                out[i].append(0)
+                for k in range(m):
+                    if j == k:
+                        # print(softx[i][k], grad_y[i][k])
+                        out[i][j] += (1 - softx[i][k]) * softx[i][k] * grad_y[i][k]
+                    else:
+                        out[i][j] += -softx[i][j] * softx[i][k] * grad_y[i][k]
+        grad_x = np.array(out)
+
+        return grad_x
+
+
+class NumpyLoss:
+    
+    def __init__(self):
+        self.target = None
+    
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+    
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+        
+        # The following operators are used in both forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+        
+        # The following variables are updated in backward. softmax_grad, log_grad, etc. hold each
+        # operator's back-propagated gradient (the partial derivative of the loss w.r.t. that operator's input)
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        
+        ####################
+        #      code 6      #
+        ####################
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+        x = self.softmax.forward(x)
+        # print(x)
+        x = self.log.forward(x)
+        
+        return x
+    
+    def backward(self, y):
+        
+        ####################
+        #      code 7      #
+        ####################
+
+        y = self.log.backward(y)
+        self.log_grad = y
+
+        y = self.softmax.backward(y)
+        self.softmax_grad = y
+
+        y, self.W3_grad = self.matmul_3.backward(y)
+        self.x3_grad = y
+
+        y = self.relu_2.backward(y)
+        self.relu_2_grad = y
+
+        y, self.W2_grad = self.matmul_2.backward(y)
+        self.x2_grad = y
+
+        y = self.relu_1.backward(y)
+        self.relu_1_grad = y
+
+        y, self.W1_grad = self.matmul_1.backward(y)
+        self.x1_grad = y
+        return y
+    
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/18307130104/numpy_mnist.py b/assignment-2/submission/18307130104/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f7aaadd84d701b578d384df3d4976f5c76a5dfa
--- /dev/null
+++ b/assignment-2/submission/18307130104/numpy_mnist.py
@@ -0,0 +1,38 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, mini_batch, get_torch_initialization, plot_curve, one_hot
+
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset, 128, True):
+            y = one_hot(y)
+            
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/18307130104/tester_demo.py b/assignment-2/submission/18307130104/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..df4bb27bc0d8b9f28f5abd09faff7635d8347792
--- /dev/null
+++ b/assignment-2/submission/18307130104/tester_demo.py
@@ -0,0 +1,183 @@
+import numpy as np
+import torch
+from torch import matmul as torch_matmul, relu as torch_relu, softmax as torch_softmax, log as torch_log
+
+from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss
+from torch_mnist import TorchModel
+from utils import get_torch_initialization, one_hot
+
+err_epsilon = 1e-6
+err_p = 0.4
+
+
+def check_result(numpy_result, torch_result=None):
+    if isinstance(numpy_result, list) and torch_result is None:
+        flag = True
+        for (n, t) in numpy_result:
+            flag = flag and check_result(n, t)
+        return flag
+    # print((torch.from_numpy(numpy_result) - torch_result).abs().mean().item())
+    T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item()
+    direction = T / torch_result.numel() < err_p
+
+    return direction and ((torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon).item()
+
+
+def case_1():
+    x = np.random.normal(size=[5, 6])
+    W = np.random.normal(size=[6, 4])
+    
+    numpy_matmul = Matmul()
+    numpy_out = numpy_matmul.forward(x, W)
+    numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    torch_W = torch.from_numpy(W).clone().requires_grad_()
+    
+    torch_out = torch_matmul(torch_x, torch_W)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+        (numpy_W_grad, torch_W.grad)
+    ])
+
+
+def case_2():
+    x = np.random.normal(size=[5, 6])
+    
+    numpy_relu = Relu()
+    numpy_out = numpy_relu.forward(x)
+    numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_relu(torch_x)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_3():
+    x = np.random.uniform(low=0.0, high=1.0, size=[3, 4])
+    
+    numpy_log = Log()
+    numpy_out = numpy_log.forward(x)
+    numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_log(torch_x)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_4():
+    x = np.random.normal(size=[4, 5])
+    
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_softmax(torch_x, 1)
+    
+    return check_result(numpy_out, torch_out)
+
+
+def case_5():
+    x = np.random.normal(size=[20, 25])
+    
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+    numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out))
+    
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    
+    torch_out = torch_softmax(torch_x, 1)
+    torch_out.sum().backward()
+    
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def test_model():
+    try:
+        numpy_loss = NumpyLoss()
+        numpy_model = NumpyModel()
+        torch_model = TorchModel()
+        torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = get_torch_initialization(numpy=False)
+        numpy_model.W1 = torch_model.W1.detach().clone().numpy()
+        numpy_model.W2 = torch_model.W2.detach().clone().numpy()
+        numpy_model.W3 = torch_model.W3.detach().clone().numpy()
+        
+        x = torch.randn((10000, 28, 28))
+        y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000)
+        
+        y = one_hot(y, numpy=False)
+        x2 = x.numpy()
+        y_pred = torch_model.forward(x)
+        loss = (-y_pred * y).sum(dim=1).mean()
+        loss.backward()
+        
+        y_pred_numpy = numpy_model.forward(x2)
+        numpy_loss.get_loss(y_pred_numpy, y.numpy())
+        
+        check_flag_1 = check_result(y_pred_numpy, y_pred)
+        print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10))
+    except:
+        print("[Runtime Error in forward]")
+        print("+ {:12} {}/{}".format("forward", 0, 10))
+        return 0
+    
+    try:
+        
+        numpy_model.backward(numpy_loss.backward())
+        
+        check_flag_2 = [
+            check_result(numpy_model.log_grad, torch_model.log_input.grad),
+            check_result(numpy_model.softmax_grad, torch_model.softmax_input.grad),
+            check_result(numpy_model.W3_grad, torch_model.W3.grad),
+            check_result(numpy_model.W2_grad, torch_model.W2.grad),
+            check_result(numpy_model.W1_grad, torch_model.W1.grad)
+        ]
+        check_flag_2 = sum(check_flag_2) >= 4
+        print("+ {:12} {}/{}".format("backward", 20 * check_flag_2, 20))
+    except:
+        print("[Runtime Error in backward]")
+        print("+ {:12} {}/{}".format("backward", 0, 20))
+        check_flag_2 = False
+    
+    return 10 * check_flag_1 + 20 * check_flag_2
+
+
+if __name__ == "__main__":
+    testcases = [
+        ["matmul", case_1, 5],
+        ["relu", case_2, 5],
+        ["log", case_3, 5],
+        ["softmax_1", case_4, 5],
+        ["softmax_2", case_5, 10],
+    ]
+    score = 0
+    for case in testcases:
+        try:
+            res = case[2] if case[1]() else 0
+        except:
+            print("[Runtime Error in {}]".format(case[0]))
+            res = 0
+        score += res
+        print("+ {:12} {}/{}".format(case[0], res, case[2]))
+    score += test_model()
+    print("{:14} {}/60".format("FINAL SCORE", score))
diff --git a/assignment-2/submission/18307130104/torch_mnist.py b/assignment-2/submission/18307130104/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3e214c7606e3d43dac4b94554f942508afffb3
--- /dev/null
+++ b/assignment-2/submission/18307130104/torch_mnist.py
@@ -0,0 +1,73 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+    
+    def __init__(self):
+        self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+        self.W2 = torch.randn((256, 64), requires_grad=True)
+        self.W3 = torch.randn((64, 10), requires_grad=True)
+        self.softmax_input = None
+        self.log_input = None
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        x = torch.relu(torch.matmul(x, self.W1))
+        x = torch.relu(torch.matmul(x, self.W2))
+        x = torch.matmul(x, self.W3)
+        
+        self.softmax_input = x
+        self.softmax_input.retain_grad()
+        
+        x = torch.softmax(x, 1)
+        
+        self.log_input = x
+        self.log_input.retain_grad()
+        
+        x = torch.log(x)
+        
+        return x
+    
+    def optimize(self, learning_rate):
+        with torch.no_grad():
+            self.W1 -= learning_rate * self.W1.grad
+            self.W2 -= learning_rate * self.W2.grad
+            self.W3 -= learning_rate * self.W3.grad
+            
+            self.W1.grad = None
+            self.W2.grad = None
+            self.W3.grad = None
+
+
+def torch_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = TorchModel()
+    model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False)
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset, numpy=False):
+            y = one_hot(y, numpy=False)
+            
+            y_pred = model.forward(x)
+            loss = (-y_pred * y).sum(dim=1).mean()
+            loss.backward()
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset, numpy=False)[0]
+        accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    torch_run()
diff --git a/assignment-2/submission/18307130104/utils.py b/assignment-2/submission/18307130104/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..274566a51dc9718158d63b6aa59546381d939223
--- /dev/null
+++ b/assignment-2/submission/18307130104/utils.py
@@ -0,0 +1,83 @@
+import torch
+import numpy as np
+from matplotlib import pyplot as plt
+
+def plot_curve(data):
+    plt.plot(range(len(data)), data, color='blue')
+    plt.legend(['loss_value'], loc='upper right')
+    plt.xlabel('step')
+    plt.ylabel('value')
+    plt.xlim(-100,5000)
+    plt.savefig('./img/result.png')
+    plt.close()
+    plt.show()
+
+
+def download_mnist():
+    from torchvision import datasets, transforms
+    
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=(0.1307,), std=(0.3081,))
+    ])
+    
+    train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True)
+    test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True)
+    
+    return train_dataset, test_dataset
+
+
+def one_hot(y, numpy=True):
+    if numpy:
+        y_ = np.zeros((y.shape[0], 10))
+        y_[np.arange(y.shape[0], dtype=np.int32), y] = 1
+        return y_
+    else:
+        y_ = torch.zeros((y.shape[0], 10))
+        y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1
+    return y_
+
+
+def batch(dataset, numpy=True):
+    data = []
+    label = []
+    for each in dataset:
+        data.append(each[0])
+        label.append(each[1])
+    data = torch.stack(data)
+    label = torch.LongTensor(label)
+    if numpy:
+        return [(data.numpy(), label.numpy())]
+    else:
+        return [(data, label)]
+
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+    if numpy:
+        import random
+        datas = [(each[0].numpy(), each[1]) for each in dataset]
+        random.shuffle(datas)
+        datat = [each[0] for each in datas]
+        labelt = [each[1] for each in datas]
+        data = [np.array(datat[i: i + batch_size]) for i in range(0, len(datat), batch_size)]
+        label = [np.array(labelt[i: i + batch_size]) for i in range(0, len(datat), batch_size)]
+        return zip(data, label)
+    else:
+        return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+
+def get_torch_initialization(numpy=True):
+    fc1 = torch.nn.Linear(28 * 28, 256)
+    fc2 = torch.nn.Linear(256, 64)
+    fc3 = torch.nn.Linear(64, 10)
+    
+    if numpy:
+        W1 = fc1.weight.T.detach().clone().numpy()
+        W2 = fc2.weight.T.detach().clone().numpy()
+        W3 = fc3.weight.T.detach().clone().numpy()
+    else:
+        W1 = fc1.weight.T.detach().clone().data
+        W2 = fc2.weight.T.detach().clone().data
+        W3 = fc3.weight.T.detach().clone().data
+    
+    return W1, W2, W3
diff --git a/assignment-2/submission/18307130116/README.md b/assignment-2/submission/18307130116/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..60d6a7aaf412e4f028a1124ff7cc63b243e2c2d7
--- /dev/null
+++ b/assignment-2/submission/18307130116/README.md
@@ -0,0 +1,160 @@
+# FNN Implementation
+
+[toc]
+
+## Model implementation
+
+The implementation of each operator follows the operator-derivative derivations (see the [derivation section](##算子导数推导)); the network structure is shown in the figure below.
+
+
+
+Following the model in the figure above, the operators are chained together in order, and during back-propagation the gradient is passed back layer by layer starting from the loss. There were no major difficulties; the final model computes the function
+
+$\log(softmax(W_3\sigma(W_2\sigma(W_1X))))$
+
+## Model training
+
+Running the provided `numpy_mnist.py` for three epochs gives the following accuracy and loss behaviour:
+
+| epoch | Accuracy |
+| ----- | -------- |
+| 0     | 94.49%   |
+| 1     | 96.47%   |
+| 2     | 96.58%   |
+
+
+
+### Effect of learning rate and epoch count
+
+We observed that once the loss drops into a certain range it starts to oscillate, presumably because the learning rate is too large near the optimum. To obtain better performance I reduced the learning rate and increased the number of epochs, and also ran a comparison that only increases the number of epochs without changing the learning rate. In the table below, row i reports the median accuracy over epochs [(i-1)*5, i\*5), and row 20 is the final result.
+
+| epoch | Accuracy(learning_rate = 0.1) | Accuracy(learning_rate = 0.05) | Accuracy(learning_rate = 0.1+0.05) |
+| ----- | ----------------------------- | ------------------------------ | ---------------------------------- |
+| 0     | 97.27%                        | 95.85%                         | 96.59%                             |
+| 5     | 97.93%                        | 97.85%                         | 97.91%                             |
+| 10    | 98.03%                        | 98.03%                         | 98.18%                             |
+| 15    | 98.12%                        | 98.09%                         | 98.18%                             |
+| 20    | 98.12%                        | 98.19%                         | 98.18%                             |
+
+
+
+After lowering the learning rate, accuracy grows more slowly, but after a few more epochs the result is roughly the same as with the higher learning rate.
+
+Overall, the final accuracy is determined mainly by the capacity of the model itself; within a reasonable range, changing the learning rate or the mini_batch size has little effect on the result. Training with mini_batch does, however, help reduce the fluctuation of the loss during training.
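+
+The `0.1+0.05` schedule used in the last column can be written as a simple per-epoch rule; the sketch below only mirrors the commented-out schedule that appears in `numpy_mnist.py` further down in this diff (the function name is illustrative):
+
+```python
+def scheduled_lr(epoch, switch_epoch=10):
+    # start with a larger step, then halve it once the loss begins to oscillate
+    return 0.1 if epoch < switch_epoch else 0.05
+
+# model.optimize(scheduled_lr(epoch))
+```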
+
+## Optimizers
+
+### How Adam works
+
+Similar to the manual learning-rate adjustment in the experiments above, the Adam optimizer, which is used in a great many settings, stands out for adapting the learning rate automatically and has essentially become the default optimizer for many optimization problems; on the other hand, the choice of the initial learning rate still affects the optimization process.
+
+The basic Adam update is $\theta_t = \theta_{t-1}-\alpha*\hat m_t/(\sqrt{\hat v_t}+\epsilon)$, where $\hat m_t$ estimates the first moment of the gradient with an exponential moving average, and the $t$-th power of the hyperparameter $\beta_1$ corrects the bias caused by initializing the moment to 0. With $g_t$ denoting the gradient, the formulas are
+
+$\hat m_t = m_t/(1-\beta_1^t)$, $m_t = \beta_1 m_{t-1}+(1-\beta_1)g_t$
+
+and, analogously, $\hat v_t = v_t/(1-\beta_2^t)$, $v_t = \beta_2 v_{t-1}+(1-\beta_2)g_t^2$.
+
+$\epsilon$ is there to prevent division by zero.
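+
+From these formulas, one Adam step per parameter can be sketched as follows (an illustrative outline only; the actual `Adam` class used in the experiments is defined in `numpy_mnist.py` further down in this diff):
+
+```python
+import numpy as np
+
+def adam_step(theta, grad, state, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
+    # state is a dict {'m': 0.0, 'v': 0.0, 't': 0} kept separately for each parameter
+    state['t'] += 1
+    state['m'] = beta1 * state['m'] + (1 - beta1) * grad         # first-moment EMA
+    state['v'] = beta2 * state['v'] + (1 - beta2) * grad * grad  # second-moment EMA
+    m_hat = state['m'] / (1 - beta1 ** state['t'])               # bias correction
+    v_hat = state['v'] / (1 - beta2 ** state['t'])
+    return theta - lr * m_hat / (np.sqrt(v_hat) + eps)
+```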
+
+### How Momentum works
+
+Momentum follows a similar idea to Adam but does not let the second-moment (standard-deviation) estimate influence the learning rate. It likewise uses an exponentially weighted moving average of the gradient: the current gradient receives a relatively small weight, which smooths the oscillation of the gradient near an optimum and lets the iterates get closer to it.
+
+The update rule is
+
+$v_t = \beta v_{t-1}+(1-\beta)dW$
+
+$W = W - \alpha v_t$
+
+### Implementation
+
+Based on the formulas above, I implemented an Adam class and a Momentum class in `numpy_mnist.py`. Since `numpy_fnn.py` may not be modified, the overall approach is to create one optimizer instance per parameter; the instance records the state from the previous iteration in internal variables and returns the updated parameter. For example, Momentum is used in the following form:
+
+`model.W1 = W1_opt.optimize(model.W1, model.W1_grad)`
+
+that is, the new weights are computed and then assigned back to the model.
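+
+For reference, a Momentum optimizer consistent with this usage can be sketched as below (an outline of the same idea; the actual class is defined in `numpy_mnist.py` further down in this diff):
+
+```python
+class Momentum:
+    """v_t = beta * v_{t-1} + (1 - beta) * dW;  W <- W - lr * v_t"""
+
+    def __init__(self, lr=0.1, beta=0.9):
+        self.lr = lr
+        self.beta = beta
+        self.v = 0  # exponentially weighted running gradient
+
+    def optimize(self, weight, grad):
+        self.v = self.beta * self.v + (1 - self.beta) * grad
+        return weight - self.lr * self.v
+
+# W1_opt = Momentum(); model.W1 = W1_opt.optimize(model.W1, model.W1_grad)
+```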
+
+### Experimental comparison
+
+We compare the two optimizers with the best result obtained earlier, the `lr` = 0.1+0.05 schedule; the loss and accuracy evolve as follows.
+
+| epoch | Accuracy(learning_rate = 0.1+0.05) | Accuracy(Adam, $\alpha = 0.001$) | Accuracy(Momentum,$\alpha = 0.1$) |
+| ----- | ---------------------------------- | ---------------------------------- | --------------------------------- |
+| 0     | 96.59%                             | 97.46%                             | 97.01%                            |
+| 5     | 97.91%                             | 97.69%                             | 97.95%                            |
+| 10    | 98.18%                             | 97.80%                             | 98.07%                            |
+| 15    | 98.18%                             | 97.98%                             | 98.22%                            |
+| 20    | 98.18%                             | 98.04%                             | 98.36%                            |
+
+### Analysis
+
+Judging from the table and the loss curves, Momentum clearly outperforms manual learning-rate adjustment, while Adam performs even worse than a constant learning rate. After checking the algorithm against the paper I ruled out an implementation error, looked up related material, and found the following passage:
+
+[简单认识Adam]: https://www.jianshu.com/p/aebcaf8af76e "Adam的缺陷与改进"
+
+Although Adam has become the mainstream optimization algorithm, in many fields (such as object recognition in computer vision and machine translation in NLP) the best results are still obtained with SGD with momentum. The results of Wilson et al. show that for object recognition, character-level modeling, constituency parsing and similar tasks, adaptive learning-rate methods (including AdaGrad, AdaDelta, RMSProp, Adam, etc.) are usually worse than Momentum.
+
+According to this source, the handwritten-digit recognition in this assignment should be classified as object recognition, and adaptive learning-rate methods indeed perform worse here. Adam's advantage is that it works well for unstable objective functions. The takeaway is that the choice of optimizer should be weighed against the actual type of problem.
\ No newline at end of file
diff --git a/assignment-2/submission/18307130116/img/Adam.png b/assignment-2/submission/18307130116/img/Adam.png
new file mode 100644
index 0000000000000000000000000000000000000000..76c571e3ea0c18e00faf75a5f078350cb86a1159
Binary files /dev/null and b/assignment-2/submission/18307130116/img/Adam.png differ
diff --git a/assignment-2/submission/18307130116/img/Figure_1.png b/assignment-2/submission/18307130116/img/Figure_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..683414e2e126545f2a851da9a05be74eb5261b13
Binary files /dev/null and b/assignment-2/submission/18307130116/img/Figure_1.png differ
diff --git a/assignment-2/submission/18307130116/img/Figure_2.png b/assignment-2/submission/18307130116/img/Figure_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..bef71ab36ae8d83504f84243e3d64082b8fcab5d
Binary files /dev/null and b/assignment-2/submission/18307130116/img/Figure_2.png differ
diff --git a/assignment-2/submission/18307130116/img/Figure_3.png b/assignment-2/submission/18307130116/img/Figure_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..639051608449345a12b51083243e78dcfa6a4f70
Binary files /dev/null and b/assignment-2/submission/18307130116/img/Figure_3.png differ
diff --git a/assignment-2/submission/18307130116/img/Figure_4.png b/assignment-2/submission/18307130116/img/Figure_4.png
new file mode 100644
index 0000000000000000000000000000000000000000..fe141456a1e96e256569cdcb37a87e2d4b6f0e6b
Binary files /dev/null and b/assignment-2/submission/18307130116/img/Figure_4.png differ
diff --git a/assignment-2/submission/18307130116/img/matmul.png b/assignment-2/submission/18307130116/img/matmul.png
new file mode 100644
index 0000000000000000000000000000000000000000..e3e6d769ef44203d80817a2928a5b1ea2a533e06
Binary files /dev/null and b/assignment-2/submission/18307130116/img/matmul.png differ
diff --git a/assignment-2/submission/18307130116/img/model.png b/assignment-2/submission/18307130116/img/model.png
new file mode 100644
index 0000000000000000000000000000000000000000..72c73828f7d70be8ea8d3f010b27bc7ada0a4139
Binary files /dev/null and b/assignment-2/submission/18307130116/img/model.png differ
diff --git a/assignment-2/submission/18307130116/img/momentum.png b/assignment-2/submission/18307130116/img/momentum.png
new file mode 100644
index 0000000000000000000000000000000000000000..b9b0b145e362898c6a6cf5f379fe0459abb9fa28
Binary files /dev/null and b/assignment-2/submission/18307130116/img/momentum.png differ
diff --git a/assignment-2/submission/18307130116/img/softmax1.png b/assignment-2/submission/18307130116/img/softmax1.png
new file mode 100644
index 0000000000000000000000000000000000000000..56c1a6c77141e66a1970dc8d7d66d00c891a74d2
Binary files /dev/null and b/assignment-2/submission/18307130116/img/softmax1.png differ
diff --git a/assignment-2/submission/18307130116/img/softmax2.png b/assignment-2/submission/18307130116/img/softmax2.png
new file mode 100644
index 0000000000000000000000000000000000000000..277f06da303ed92389cc7620e89ee25bf5b1c7e1
Binary files /dev/null and b/assignment-2/submission/18307130116/img/softmax2.png differ
diff --git a/assignment-2/submission/18307130116/numpy_fnn.py b/assignment-2/submission/18307130116/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..13397e1977d0b8bf530900861e08a2176816f780
--- /dev/null
+++ b/assignment-2/submission/18307130116/numpy_fnn.py
@@ -0,0 +1,185 @@
+import numpy as np
+
+
+class NumpyOp:
+    
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+    
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        w: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+    
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        
+        ####################
+        #      code 1      #
+        ####################
+        grad_x = np.matmul(grad_y,self.memory['W'].T)
+        grad_W = np.matmul(self.memory['x'].T, grad_y)
+
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+    
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 2      #
+        ####################
+        grad_x = np.where(self.memory['x'] > 0, grad_y, np.zeros_like(self.memory['x']))
+        return grad_x
+
+
+class Log(NumpyOp):
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+        
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 3      #
+        ####################
+        grad_x =(1/(self.memory['x'] + self.epsilon)) *grad_y
+        
+        return grad_x
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        self.memory['x'] = x
+        ####################
+        #      code 4      #
+        ####################
+        exp = np.exp(self.memory['x'])
+        one = np.ones((self.memory['x'].shape[1], self.memory['x'].shape[1]))
+        h = 1./np.matmul(exp,one)
+        out = h * exp
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 5      #
+        ####################
+        exp = np.exp(self.memory['x'])
+        one = np.ones((self.memory['x'].shape[1], self.memory['x'].shape[1]))
+        h = 1./np.matmul(exp,one)
+        h_grad = -h * h
+        grad_x = grad_y* exp * h + np.matmul(grad_y * exp * h_grad, one) * exp
+        return grad_x
+
+
+class NumpyLoss:
+    
+    def __init__(self):
+        self.target = None
+    
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+    
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+
+        # The following operators are used in both forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # The following variables are updated in backward
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+        
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        
+        ####################
+        #      code 6      #
+        ####################
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+        x = self.softmax.forward(x)
+        x = self.log.forward(x)
+        return x
+    
+    def backward(self, y):
+        ####################
+        #      code 7      #
+        ####################
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+        
+        
+
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/18307130116/numpy_mnist.py b/assignment-2/submission/18307130116/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc5fdaa3b169f4a5ec77458993318b1b875ac400
--- /dev/null
+++ b/assignment-2/submission/18307130116/numpy_mnist.py
@@ -0,0 +1,97 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot
+
+def mini_batch(dataset, batch_size=128, numpy=False, drop_last=False):
+    data = []
+    label = []
+    dataset_num = dataset.__len__()
+    idx = np.arange(dataset_num)
+    np.random.shuffle(idx)
+    for each in dataset:
+        data.append(each[0].numpy())
+        label.append(each[1])
+    label_numpy = np.array(label)[idx]
+    data_numpy = np.array(data)[idx]
+
+    result = []
+    for iter in range(dataset_num // batch_size):
+        result.append((data_numpy[iter*batch_size:(iter+1)*batch_size], label_numpy[iter*batch_size:(iter+1)*batch_size]))
+    # keep the tail batch unless drop_last is set; skip it when the dataset divides evenly
+    if not drop_last and dataset_num % batch_size != 0:
+        tail = (dataset_num // batch_size) * batch_size
+        result.append((data_numpy[tail:dataset_num], label_numpy[tail:dataset_num]))
+    return result
+
+class Adam:
+    def __init__(self, weight, lr=0.0015, beta1=0.9, beta2=0.999, epsilon=1e-8):
+        self.theta = weight
+        self.lr = lr
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epislon = epsilon
+        self.m = 0
+        self.v = 0
+        self.t = 0
+
+    def optimize(self, grad):
+        self.t += 1
+        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
+        self.v = self.beta2 * self.v + (1 - self.beta2) * grad * grad
+        self.m_hat = self.m / (1 - self.beta1 ** self.t)
+        self.v_hat = self.v / (1 - self.beta2 ** self.t)
+        self.theta -= self.lr * self.m_hat / (self.v_hat ** 0.5 + self.epislon)
+        return self.theta
+
+class Momentum:
+    def __init__(self, lr=0.1, beta=0.9):
+        self.lr = lr
+        self.beta = beta
+        self.v = 0
+    
+    def optimize(self, weight, grad):
+        self.v = self.beta*self.v + (1-self.beta)*grad
+        weight -= self.lr * self.v
+        return weight
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+    W1_opt = Momentum()
+    W2_opt = Momentum()
+    W3_opt = Momentum()
+
+
+    train_loss = []
+    
+    epoch_number = 20
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+            
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            # if epoch >= 10:
+            #     learning_rate = 0.05
+            # else:
+            #     learning_rate = 0.1
+            # model.optimize(learning_rate)
+            model.W1 = W1_opt.optimize(model.W1, model.W1_grad)
+            model.W2 = W2_opt.optimize(model.W2, model.W2_grad)
+            model.W3 = W3_opt.optimize(model.W3, model.W3_grad)
+
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/18307130154/README.md b/assignment-2/submission/18307130154/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..51cea514496f5e1d5ba2fae3e49da09fe4afb6ca
--- /dev/null
+++ b/assignment-2/submission/18307130154/README.md
@@ -0,0 +1,488 @@
+# Assignment 2 (Topic 1) Report
+
+## Overview
+
+In this assignment I implemented several simple PyTorch-style operators, including both the forward computation and the backward propagation, and wrote down the derivation of the back-propagation formulas. I then built a simple model and tested it on the MNIST handwritten-digit dataset.
+
+**As an extension, I investigated PyTorch's weight-initialization schemes, Xavier initialization and Kaiming initialization, and implemented a numpyutils toolkit in NumPy to replace utils (it now lives in numpy_mnist).**
+
+## 算子及推导
+
+### Matmul
+
+此算子进行两个矩阵的求积运算
+
+**推导**
+
+设反向传播的开始节点为 **L**(即标量损失,计算图的根节点),下同。
+
+设正向计算中两个输入矩阵为 **P(m * k) , Q(k * n)**, 输出矩阵为 **O(m * n)**; 反向传播中输入的梯度为 **G(m * n)**。
+
+则有
+$$
+G_{ij} = \frac{\partial L}{\partial O_{ij}}
+$$
+此公式对后面的算子同样适用。
+
+**计算Q的梯度 GQ (k * n)**
+
+首先有
+$$
+\begin{aligned}
+	GQ_{ts}&= \frac{\partial L}{\partial Q_{ts}} \\\\
+		&=\sum_{i \leqslant m\\ j\leqslant n}   \frac{\partial L}{\partial O_{ij}}  \times   \frac{\partial O_{ij}}{\partial Q_{ts}}\\\\
+		&=\sum_{i \leqslant m\\ j = s}   G_{ij}  \times   \frac{\partial O_{ij}}{\partial Q_{ts}}
+		&(其余的 \frac{\partial O_{ij}}{\partial Q_{ts}} = 0)\\\\
+		&=\sum_{i \leqslant m}   G_{is}  \times   P_{it}\\\\
+		&=\sum_{i \leqslant m}   P_{ti}^T  \times   G_{is}  
+\end{aligned}
+$$
+所以写成矩阵乘法为
+$$
+GQ = P^T \times G
+$$
+同理,**P的梯度为**
+$$
+GP = G \times Q^T
+$$
+
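+上面的结论可以用数值梯度做一个快速检查。下面是一段示意性的验证代码(假设损失取 L = O.sum(),此时上游梯度 G 为全 1 矩阵;仅作演示,并非仓库中的代码):
+
+```python
+import numpy as np
+
+np.random.seed(0)
+P, Q = np.random.randn(3, 4), np.random.randn(4, 5)
+G = np.ones((3, 5))                         # L = (P @ Q).sum() 时,dL/dO 为全 1
+
+GP, GQ = G @ Q.T, P.T @ G                   # 按上面推导得到的解析梯度
+
+eps = 1e-6                                  # 对 P[0, 0] 做中心差分,与解析梯度对比
+dP = np.zeros_like(P); dP[0, 0] = eps
+num = (((P + dP) @ Q).sum() - ((P - dP) @ Q).sum()) / (2 * eps)
+print(abs(num - GP[0, 0]) < 1e-6)           # True
+```
+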
+### Relu
+
+设输入为**X**,输出为**Y**,Relu层的输出矩阵中每个元素**只与输入矩阵中对应位置的元素有关**,设对应位置元素为**x** , **y**。
+
+则有
+$$
+y = relu(x)=
+    \begin{cases}
+    0& x \leq 0\\\\
+    x& x > 0
+    \end{cases}
+$$
+和
+$$
+\frac{\partial Y_{ts}}{\partial X_{ij}} = 
+	\begin{cases}
+    1& t = i& and & s = j & and & X_{ij} > 0   \\\\
+    0& else
+    \end{cases}
+$$
+
+于是
+$$
+\frac{\partial L}{\partial X_{ij}} = \frac{\partial L}{\partial Y_{ij}} \times \frac{\partial Y_{ij}}{\partial X_{ij}}
+$$
+设M为X的掩码矩阵,其中M中元素m定义为:当X对应位置为正数时,m为1;否则m为0。那么上面的式子写成矩阵的形式:
+$$
+GX = GY * M
+$$
+其中 $*$ 表示矩阵的逐元素相乘(Hadamard 积),即对应位相乘。
+
+### Log
+
+设输入为**X**,输出为**Y**,Log层的输出矩阵中每个元素**只与输入矩阵中对应位置的元素有关**,设对应位置元素为**x** , **y**。
+
+则有
+$$
+\frac{dy}{dx} = \frac{1}{x}
+$$
+与Relu同理,设矩阵M定义为:M和X形状相同,且
+$$
+M_{ij} = \frac{1}{X_{ij}}
+$$
+则有
+$$
+GX = GY * M
+$$
+
+### Softmax
+
+设输入为**X**,输出为**Y**,其中**X**的第一维可以看成batch维,所以Softmax层的输出矩阵中每个元素只与输入矩阵中对应位置元素**所在行的元素**有关。方便起见,我们先考虑batch size为1的输入,即X (1 * n),并且用 
+$$
+X_i
+$$
+来简写 X 中第一行第 i 列的元素。
+
+输出Y 也是 1 * n 的矩阵,我们使用和 X 相同的表示规则。那么,正向计算公式为
+$$
+Y_i = \frac{e^{X_i}}{\sum_{k=1}^n e^{X_k}}
+$$
+梯度公式为:
+$$
+\frac{\partial Y_i}{\partial X_j} =
+	\begin{cases}
+    Y_i \times (1 - Y_i) & i = j\\\\
+    -Y_i \times Y_j & i \neq j
+    \end{cases}
+$$
+
+根据上面的公式可以计算出向量Y对向量X求导的雅可比矩阵**J (n * n)**, 定义如下
+$$
+J_{ij} = \frac{\partial Y_i}{\partial X_j}
+$$
+那么
+$$
+\begin{aligned}
+GX_{i} &= \frac{\partial L}{\partial X_i} \\\\
+	&=\sum_{k=1}^{n}\frac{\partial L}{\partial Y_k} \times \frac{\partial Y_k}{\partial X_i}\\\\
+	&=\sum_{k=1}^{n} GY_k \times J_{ki}\\\\
+	&=\sum_{k=1}^{n} GY_{1k} \times J_{ki}\\\\
+	&=GY \times J\\\\
+	&(其中GY为Y的梯度,是这一层反向传播的输入)
+\end{aligned}
+$$
+我们已经推出了在输入X的第一维为1的情况下的反向传播公式,事实上,当X的第一维(batch size)大于1时,只需要添加一个最高维,扩展 X, Y, GY, J, 并利用numpy的函数:
+
+```python
+numpy.matmul()
+```
+
+将自动执行张量计算,得到 GX。
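+
+例如,可以把每个样本的雅可比矩阵堆叠成形状为 (N, n, n) 的张量,再与形状为 (N, 1, n) 的 GY 做批量矩阵乘法(这也是后文 code 5 的实现思路)。下面是一个仅作演示的小例子:
+
+```python
+import numpy as np
+
+Y = np.array([[0.2, 0.3, 0.5],
+              [0.1, 0.6, 0.3]])                            # 两个样本的 softmax 输出
+GY = np.array([[1.0, 0.0, 0.0],
+               [0.0, 1.0, 0.0]])                           # 假设的上游梯度
+
+J = np.array([np.diag(y) - np.outer(y, y) for y in Y])     # (N, n, n) 批量雅可比矩阵
+GX = np.matmul(GY[:, None, :], J).squeeze(1)               # (N, 1, n) x (N, n, n) -> (N, n)
+print(GX.shape)                                            # (2, 3)
+```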
+
+## 模型训练与测试
+
+### 模型搭建
+
+首先按照 torch_mnist 搭建模型。
+
+**正向传播**
+
+```python
+x = self.matmul_1.forward(x, self.W1)
+x = self.relu_1.forward(x)
+x = self.matmul_2.forward(x, self.W2)
+x = self.relu_2.forward(x)
+x = self.matmul_3.forward(x, self.W3)
+x = self.softmax.forward(x)
+x = self.log.forward(x)
+```
+
+~~**反向传播**~~(这里由于后面测试例做了改动,这里的模型也随之变化,最终的模型在下面)
+
+这里有一点要注意,torch的反向传播以**标量(即损失,计算图的根节点)**为开始,但是我们定义的模型最后并没有输出标量的层,所以最高层的梯度要手动计算。看到测试例中torch使用的标量(Loss)为:
+
+```python
+loss = (-y_pred * y).sum(dim=1).mean() 
+```
+
+因为损失先按行求和、再对 batch 维求均值,所以损失对网络输出 y_pred 的梯度(也即最高层的梯度矩阵)为**- y / y.shape[0]**,但是在模型反向传播的函数中已经有这样一段代码:
+
+```python
+for size in y.shape:
+        y /= size
+```
+
+y的符号相反,并且多除了一个y.shape[1], 所以我在反向传播一开始,把这个弥补进顶层梯度里面了,最终的code 7:
+
+```python
+####################
+#      code 7      #
+####################
+
+#mulgrade = mulgrade3
+#x3_grade = mulgrade2
+#x2_grade = mulgrade1
+#x1_grade = input_grad
+
+y *= (-y.shape[1])  
+self.log_grad = y
+self.softmax_grad = self.log.backward(self.log_grad)
+
+mulgrade = self.softmax.backward(self.softmax_grad)
+self.relu_2_grad,self.W3_grad = self.matmul_3.backward(mulgrade)
+
+self.x3_grad = self.relu_2.backward(self.relu_2_grad)
+self.relu_1_grad,self.W2_grad = self.matmul_2.backward(self.x3_grad)
+
+self.x2_grad = self.relu_1.backward(self.relu_1_grad)
+self.x1_grad,self.W1_grad = self.matmul_1.backward(self.x2_grad)
+```
+
+**反向传播版本2**
+
+现在_grad 表示对应层的 input 的梯度,直接贴代码
+
+```python
+self.log_grad = self.log.backward(y)
+self.softmax_grad = self.softmax.backward(self.log_grad)
+
+mulgrade3,self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+self.relu_2_grad = self.relu_2.backward(mulgrade3)
+
+mulgrade2,self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+self.relu_1_grad = self.relu_1.backward(mulgrade2)
+
+self.x1_grad,self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+```
+
+### 用numpy实现mini_batch
+
+将数据集打乱,并根据batch_size分割
+
+```python
+def mini_batch(dataset, batch_size=128, numpy=False):
+    data = []
+    label = []
+    for x in dataset:
+        data.append(np.array(x[0]))
+        label.append(x[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    #索引随机打乱
+    siz = data.shape[0]
+    ind = np.arange(siz)
+    np.random.shuffle(ind)
+
+    #划分batch
+    res = []
+    con = 0
+    while con + batch_size <= siz:
+        data_batch = data[ind[con:con + batch_size]]
+        label_batch = label[ind[con:con + batch_size]]
+        res.append((data_batch,label_batch))
+        con += batch_size
+
+    return res
+```
+
+### 训练与测试
+
+这部分代码助教已经给出,使用的是mnist手写体数据集。下载数据集后,对每个epoch,按照batch_size将数据读入,并使用模型进行一次正向计算、反向传播、优化。主要部分:
+
+```python
+for epoch in range(epoch_number):
+    for x, y in mini_batch(train_dataset):
+
+        y = one_hot(y)
+
+        # y_pred = model.forward(x.numpy())
+        y_pred = model.forward(x)
+        loss = (-y_pred * y).sum(axis=1).mean()
+        model.backward(y)
+        model.optimize(learning_rate)
+
+        train_loss.append(loss.item())
+
+    x, y = batch(test_dataset)[0]
+    accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+    print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+```
+
+### 测试结果
+
+**损失函数**
+
+
+
+**每一轮epoch后正确率(共3轮)**
+
+```
+[0] Accuracy: 0.9459
+[1] Accuracy: 0.9635
+[2] Accuracy: 0.9713
+```
+
+
+
+## 扩展——Pytorch权重初始化方法
+
+### 结论
+
+结论写在前。Pytorch线性层采取的默认初始化方式是**Kaiming**初始化,这是由我国计算机视觉领域专家**何恺明**提出的。我的探究主要包括:
+
+* 为什么采取Kaiming初始化?
+* 考察Kaiming初始化的基础——Xavier初始化的公式
+* 考察Kaiming初始化的公式
+* 用Numpy实现一个简易的Kaiming初始化
+
+### 为什么采取Kaiming初始化?
+
+**采取固定的分布?**
+
+当考虑怎么初始化权重矩阵这个问题时,可以想到应该使得初始权重具有随机性。提到随机,自然的想法是使用**均匀分布或正态分布**,那么我们如果采用**与模型无关的固定分布**(例如标准正态分布(均值为0,方差为1))怎么样?下面我们分析如果对模型本身不加考虑,采取固定的分布,会有什么问题:
+
+* 如果权重的绝对值太小,在多层的神经网络的每一层,输入信号的方差会不断减小;当到达最终的输出层时,可以理解为输入信号的影响已经降低到微乎其微。一方面训练效果差,另一方面可能会有梯度消失等问题。(此处从略,参考https://zhuanlan.zhihu.com/p/25631496)
+* 如果权重的绝对值太大,同样道理,随着深度的加深,可能会使输入信号的方差过大,这会造成梯度爆炸或消失的问题。
+
+这里举一个例子,假如一个网络使用了多个sigmoid作为中间层(这个函数具有两边导数趋于0的特点):
+
+* 如果权重初始绝对值太小,随着深度的加深,输入信号的方差过小。当输入很小时,sigmoid函数接近线性,深层模型也失去了非线性性的优点。(**模型效果**)
+* 如果权重初始绝对值太大,随着深度的加深,输入信号的方差过大。绝对值过大的sigmoid输入意味着激活变得饱和,梯度将开始接近零。(**梯度消失**)
+
+### Xavier初始化
+
+前面的问题提示我们要根据模型的特点(维度,规模)决定使用的随机化方法(分布的均值、方差),**xavier初始化**应运而生,它可以使得输入值经过网络层后**方差不变**。pytorch中这一点是通过增益值gain来实现的,下面的函数用来获得特定层的gain:
+
+```python
+torch.nn.init.calculate_gain(nonlinearity, param=None)
+```
+
+增益值表(图片摘自https://blog.csdn.net/winycg/article/details/86649832)
+
+
+
+Xavier初始化可以采用均匀分布 **U(-a, a)**,其中a的计算公式为:
+$$
+a = gain \times \sqrt[]{\frac{6}{fan\_in+fan\_out}}
+$$
+Xavier初始化可以采用正态分布 **N(0, std)**,其中std的计算公式为:
+$$
+std = gain \times \sqrt[]{\frac{2}{fan\_in+fan\_out}}
+$$
+其中fan_in和fan_out分别是输入神经元和输出神经元的数量,在全连接层中,就等于输入输出的feature数。
+
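+按照上面的公式,可以用 numpy 写一个简易的 Xavier 均匀分布初始化作为对照(仅为示意,gain 取 1;函数名为自拟,并非仓库中实际使用的代码):
+
+```python
+import numpy as np
+
+def xavier_uniform(fan_in, fan_out, gain=1.0):
+    # a = gain * sqrt(6 / (fan_in + fan_out)),再从 U(-a, a) 采样
+    a = gain * (6.0 / (fan_in + fan_out)) ** 0.5
+    return np.random.uniform(low=-a, high=a, size=(fan_in, fan_out))
+
+W1 = xavier_uniform(28 * 28, 256)
+print(W1.std())   # 应接近 gain * sqrt(2 / (fan_in + fan_out)) ≈ 0.0439
+```
+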
+### Kaiming初始化
+
+Xavier初始化在Relu层表现不好,主要原因是relu会将负数映射到0,改变了输出的整体方差。所以**何恺明**对此做了改进,提出了Kaiming初始化,最初主要应用于计算机视觉中的卷积网络。
+
+Kaiming均匀分布的初始化采用**U(-bound, bound)**,其中bound的计算公式为:(a 的概念下面再说)
+$$
+bound = \sqrt[]{\frac{6}{(1 + a ^2) \times fan\_in}}
+$$
+这里补充一点,pytorch中这个公式也通过gain作为中间变量实现,也就是:
+$$
+bound = gain \times \sqrt[]{\frac{3}{ fan\_in}}
+$$
+其中:
+$$
+gain = \sqrt{\frac{2}{1 + a^2}}
+$$
+Kaiming正态分布的初始化采用**N(0,std)**,其中std的计算公式为:
+$$
+std = \sqrt[]{\frac{2}{(1 + a ^2) \times fan\_in}}
+$$
+这里稍微解释一下a的含义,源码中的解释为
+
+```
+the negative slope of the rectifier used after this layer
+```
+
+简单说,是用来衡量这一层中负数比例的,负数越多,Relu层会将越多的输入“抹平”为0,a用来平衡这种“抹平”对于方差的影响。
+
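+可以用一小段代码验证两种写法等价,即 $gain \times \sqrt{3 / fan\_in}$ 与 $\sqrt{6 / ((1 + a^2) \times fan\_in)}$ 给出相同的 bound(仅为示意性检查,fan_in 取本实验第一层的 784):
+
+```python
+import math
+
+a, fan_in = math.sqrt(5), 28 * 28
+gain = math.sqrt(2.0 / (1 + a ** 2))                      # gain = sqrt(2 / (1 + a^2))
+bound_via_gain = gain * math.sqrt(3.0 / fan_in)           # 通过 gain 的写法
+bound_direct = math.sqrt(6.0 / ((1 + a ** 2) * fan_in))   # 直接代入的写法
+print(math.isclose(bound_via_gain, bound_direct))         # True
+```
+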
+### 我们使用的初始化
+
+看一下我们现在使用的get_torch_initialization函数,可以看到是通过调用pytorch的线性层进行的默认初始化:
+
+```python
+fc1 = torch.nn.Linear(28 * 28, 256)
+```
+
+在Linear类中通过
+
+```python
+self.reset_parameters()
+```
+
+这个函数来完成随机初始化的过程,后者使用的是
+
+```python
+init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+```
+
+可见是我们前面提到的Kaiming均匀分布的初始化方式,这个函数的内容和前面的公式相符(使用gain作为中间变量):
+
+```python
+fan = _calculate_correct_fan(tensor, mode)
+gain = calculate_gain(nonlinearity, a)
+std = gain / math.sqrt(fan)
+bound = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
+with torch.no_grad():
+    return tensor.uniform_(-bound, bound)
+```
+
+~~同时将参数a 的值设置为5。~~
+
+同时将参数a 的值设置为根号5。
+
+### ~~使用numpy完成get_torch_initialization~~    修正
+
+简单起见,我没有按照pytorch的封装方法分层实现初始化过程,后者主要为了提供多种不同的初始化方式。我直接按照线性层默认的初始方式——Kaiming均匀分布的公式用numpy实现了get_torch_initialization,其中a值取5, 代码如下:
+
+```python
+def get_torch_initialization(numpy = True):
+
+    a = 5
+
+    def Kaiming_uniform(fan_in,fan_out,a):
+        bound = 6.0 / (1 + a * a) / fan_in
+        bound = bound ** 0.5
+        W = np.random.uniform(low=-bound, high=bound, size=(fan_in,fan_out))
+        return W
+
+    W1 = Kaiming_uniform(28 * 28, 256, a)
+    W2 = Kaiming_uniform(256, 64, a)
+    W3 = Kaiming_uniform(64, 10, a)
+    return W1,W2,W3
+```
+
+顺便,我将utils的其它函数(包括之前的mini_batch)转化为numpy版本,~~写在了numpyutils中~~现在全放在了numpy_mnist中。这样,运行numpy_mnist就可以不再依赖torch包。特别指出的是,download_mnist依然需要使用torchvision这个包下载数据集。
+
+### ~~测试~~  修正
+
+在numpy_mnist替换了工具包之后重新运行,正确率和之前基本一致。
+
+```
+[0] Accuracy: 0.9340
+[1] Accuracy: 0.9584
+[2] Accuracy: 0.9684
+```
+
+## 4月27日  对初始化方式的修正
+
+之前提交的版本中采取和Linear层默认初始化方式相同的方式进行初始化,今天发现存在以下两方面的问题(特别感谢**彭润宇**同学的提醒):
+
+* Pytorch线性层采取默认初始化中,假定非线性层为**Leaky Relu**,并设置a值默认为**根号5**,而非5。前面我公式中采用了5,会造成很不好的效果。
+* 如**何恺明**论文中所述,a值代表leaky relu层负斜率,我们采用relu层,理论上a值应该取0才符合Kaiming初始化设计初衷。
+
+本次修正针对上面两处问题进行修改,并补充探讨a值的选取。
+
+### 修改
+
+修改后的get_torch_initialization将a作为入参,并设置默认值为0,作为Relu层的Kaiming初始化方法。
+
+```python
+def get_torch_initialization(numpy = True,a = 0):
+    def Kaiming_uniform(fan_in,fan_out,a):
+        bound = 6.0 / (1 + a * a) / fan_in
+        bound = bound ** 0.5
+        W = np.random.uniform(low=-bound, high=bound, size=(fan_in,fan_out))
+        return W
+
+    W1 = Kaiming_uniform(28 * 28, 256, a)
+    W2 = Kaiming_uniform(256, 64, a)
+    W3 = Kaiming_uniform(64, 10, a)
+    return W1,W2,W3
+```
+
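+可以顺带比较一下不同 a 值对应的 bound 大小(以第一层 fan_in = 784 为例,示意计算):
+
+```python
+import math
+
+fan_in = 28 * 28
+for a in (0.0, math.sqrt(5), 5.0):
+    bound = math.sqrt(6.0 / ((1 + a ** 2) * fan_in))
+    print(round(a, 3), round(bound, 4))
+# a 越大 bound 越小:a=0 约 0.0875,a=sqrt(5) 约 0.0357,a=5 约 0.0172
+```
+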
+### 对a值选取进行测试
+
+Pytorch的Linear层默认非线性激活层为Leaky Relu,并将a设置为根号5的做法发人深思。为了比较a值选择对效果的影响,我选取不同的a值在原数据集上进行了测试(a从0到6,间隔为0.3,同时统计第1、2、3次迭代后的正确率)。但结果区分度不大:不同的权重初始化对3轮迭代后的正确率影响很不明显,即使只看第一轮迭代后的正确率也是如此。可以想见的原因包括:
+
+* 我们的模型及数据不会产生**梯度消失**或**神经元死亡**的问题
+* batch的随机性,测试次数少
+
+我在img中保留了测试结果。但是对于我们的模型,还是按照何恺明在论文中指出的规则,对于Relu层使用a = 0。
+
+### 一点问题
+
+Pytorch对线性层的默认初始化中a值的选取令人困惑,按照何恺明指出,a值应该选择Leaky Relu层的**负斜率**,这个值应该是小于1 的正数(pytorch下层源码中是这样使用的,如下图)
+
+
+
+但在linear层中将其默认值设置为根号5:
+
+```python
+init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+```
+
+这两者存在矛盾,使得默认的线性层初始化中会将a=$\sqrt{5}$代入公式:
+$$
+bound = \sqrt[]{\frac{6}{(1 + a ^2) \times fan\_in}}
+$$
+得到一个较小的bound。
+
+曾有多名国内外网友提及这个问题,目前我没有看到这个问题合理的解释,其中一个讨论的地址:
+
+https://github.com/pytorch/pytorch/issues/15314
+
+我认为这有可能是Pytorch(version 3)的一处歧义甚至错误。
\ No newline at end of file
diff --git a/assignment-2/submission/18307130154/img/20190125144412278.png b/assignment-2/submission/18307130154/img/20190125144412278.png
new file mode 100644
index 0000000000000000000000000000000000000000..fcbc3a2982c4162900790d4e3d479717765b743f
Binary files /dev/null and b/assignment-2/submission/18307130154/img/20190125144412278.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210425011755375.png b/assignment-2/submission/18307130154/img/image-20210425011755375.png
new file mode 100644
index 0000000000000000000000000000000000000000..62a58dedaff524c0d49407a1103b4ac0d7e8d022
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210425011755375.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210425230057530.png b/assignment-2/submission/18307130154/img/image-20210425230057530.png
new file mode 100644
index 0000000000000000000000000000000000000000..7779533c9222baca603aab11e54f32d58054bb90
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210425230057530.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210425230119977.png b/assignment-2/submission/18307130154/img/image-20210425230119977.png
new file mode 100644
index 0000000000000000000000000000000000000000..70f10047ed945ea6ac69f36d9a80195e11de4967
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210425230119977.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427200512951.png b/assignment-2/submission/18307130154/img/image-20210427200512951.png
new file mode 100644
index 0000000000000000000000000000000000000000..43189faca346fc18e2938d53d39691aea37c954e
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427200512951.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427203245993.png b/assignment-2/submission/18307130154/img/image-20210427203245993.png
new file mode 100644
index 0000000000000000000000000000000000000000..52cfc7d3907638f1502a6a89866f44a6af6b73bd
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427203245993.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427203300617.png b/assignment-2/submission/18307130154/img/image-20210427203300617.png
new file mode 100644
index 0000000000000000000000000000000000000000..24b35eed4c9f022a11991135806034b706dec21c
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427203300617.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427203337433.png b/assignment-2/submission/18307130154/img/image-20210427203337433.png
new file mode 100644
index 0000000000000000000000000000000000000000..912b1ca130c033a9ba33e0f0b30254843241c5bc
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427203337433.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427205224362.png b/assignment-2/submission/18307130154/img/image-20210427205224362.png
new file mode 100644
index 0000000000000000000000000000000000000000..1bb5da48837686d89da73925b935accbe5454c17
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427205224362.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427205245840.png b/assignment-2/submission/18307130154/img/image-20210427205245840.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ec5e96e75e7987a6d12d4977a49205c03ca923a
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427205245840.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427205308848.png b/assignment-2/submission/18307130154/img/image-20210427205308848.png
new file mode 100644
index 0000000000000000000000000000000000000000..060021006b29d7907d064146375f28b30079459e
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427205308848.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427212809776.png b/assignment-2/submission/18307130154/img/image-20210427212809776.png
new file mode 100644
index 0000000000000000000000000000000000000000..d0e834c5023e6ce211c264c0c386a97af8e21172
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427212809776.png differ
diff --git a/assignment-2/submission/18307130154/numpy_fnn.py b/assignment-2/submission/18307130154/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eb8a954dc83f8c8125a655602af0cac7933c4de
--- /dev/null
+++ b/assignment-2/submission/18307130154/numpy_fnn.py
@@ -0,0 +1,215 @@
+import numpy as np
+
+
+class NumpyOp:
+    
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+    
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        w: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+    
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        
+        ####################
+        #      code 1      #
+        ####################
+        x = self.memory['x']
+        W = self.memory['W']
+
+        grad_W = np.matmul(x.T,grad_y)
+        grad_x = np.matmul(grad_y,W.T)
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+    
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 2      #
+        ####################
+        x = self.memory['x']
+        x1 = np.where(x > 0, 1, 0)
+        grad_x = x1 * grad_y
+        return grad_x
+
+
+class Log(NumpyOp):
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+        
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 3      #
+        ####################
+        x = self.memory['x']
+        grad_x = 1/(x + self.epsilon)
+        grad_x = grad_x * grad_y
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+
+        ####################
+        #      code 4      #
+        ####################
+        self.memory['x'] = x
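+        # 注意:这里直接对 x 取指数,没有先减去每行的最大值,x 取值很大时可能出现数值溢出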
+        ex = np.exp(x)
+        rowsum = np.sum(ex,axis=1)
+        rowsum = rowsum[:,np.newaxis]
+        softmax = ex / rowsum
+        self.memory['softmax'] = softmax
+        return softmax
+
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 5      #
+        ####################
+        sm = self.memory['softmax']
+        Jacobs = []
+        for i in range(sm.shape[0]):
+            r = sm[i]
+            #对每一行求雅各比矩阵(因为导数只与本行有关)
+            J = np.diag(r) - np.outer(r, r)
+            Jacobs.append(J)
+        Jacobs = np.array(Jacobs)
+
+        grad_y = grad_y[:,np.newaxis,:]
+        grad_x = np.matmul(grad_y,Jacobs)
+        grad_x = np.squeeze(grad_x,axis=1)
+
+        return grad_x
+
+class NumpyLoss:
+    
+    def __init__(self):
+        self.target = None
+    
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+    
+    def backward(self):
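+        # 损失 L = (-pred * target).sum(axis=1).mean(),故对 pred 的梯度为 -target / N(N 为 batch 大小)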
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+
+        # 以下算子会在 forward 和 backward 中使用
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # 以下变量需要在 backward 中更新
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+        
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        
+        ####################
+        #      code 6      #
+        ####################
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+        x = self.softmax.forward(x)
+        x = self.log.forward(x)
+
+        return x
+    
+    def backward(self, y):
+        
+
+            
+        ####################
+        #      code 7      #
+        ####################
+        
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+
+        mulgrade3,self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(mulgrade3)
+
+        mulgrade2,self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(mulgrade2)
+
+        self.x1_grad,self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+    
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
+
+
+
+    
\ No newline at end of file
diff --git a/assignment-2/submission/18307130154/numpy_mnist.py b/assignment-2/submission/18307130154/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..1abc1e73eef32967faa94c5f1d93f20f8ae96d2d
--- /dev/null
+++ b/assignment-2/submission/18307130154/numpy_mnist.py
@@ -0,0 +1,112 @@
+from numpy_fnn import NumpyModel, NumpyLoss
+
+import numpy as np
+from matplotlib import pyplot as plt
+
+def get_torch_initialization(numpy = True,a=0):
+
+
+    def Kaiming_uniform(fan_in,fan_out,a):
+        bound = 6.0 / (1 + a * a) / fan_in
+        bound = bound ** 0.5
+        W = np.random.uniform(low=-bound, high=bound, size=(fan_in,fan_out))
+        return W
+
+    W1 = Kaiming_uniform(28 * 28, 256, a)
+    W2 = Kaiming_uniform(256, 64, a)
+    W3 = Kaiming_uniform(64, 10, a)
+    return W1,W2,W3
+    
+def plot_curve(data):
+    plt.plot(range(len(data)), data, color='blue')
+    plt.legend(['loss_value'], loc='upper right')
+    plt.xlabel('step')
+    plt.ylabel('value')
+    plt.show()
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+    data = []
+    label = []
+    for x in dataset:
+        data.append(np.array(x[0]))
+        label.append(x[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    #索引随机打乱
+    siz = data.shape[0]
+    ind = np.arange(siz)
+    np.random.shuffle(ind)
+
+    #划分batch
+    res = []
+    con = 0
+    while con + batch_size <= siz:
+        data_batch = data[ind[con:con + batch_size]]
+        label_batch = label[ind[con:con + batch_size]]
+        res.append((data_batch,label_batch))
+        con += batch_size
+
+    return res
+
+def batch(dataset, numpy=True):
+    data = []
+    label = []
+    for x in dataset:
+        data.append(np.array(x[0]))
+        label.append(x[1])
+    data = np.array(data)
+    label = np.array(label)
+    return [(data, label)]
+
+def one_hot(y, numpy=True):
+    y_ = np.zeros((y.shape[0], 10))
+    y_[np.arange(y.shape[0], dtype=np.int32), y] = 1
+    return y_
+
+def download_mnist():
+    from torchvision import datasets, transforms
+    
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=(0.1307,), std=(0.3081,))
+    ])
+    
+    train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True)
+    test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True)
+    
+    return train_dataset, test_dataset
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+            
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/18307130213/README.md b/assignment-2/submission/18307130213/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..44314b9df69abb8780c981955900b82f26b376e5
--- /dev/null
+++ b/assignment-2/submission/18307130213/README.md
@@ -0,0 +1,100 @@
+# 课程报告
+
+## NumpyModel 类实现
+
+  `NumpyModel` 类的实现位于 [numpy_fnn.py](./numpy_fnn.py) 。
+
+具体内容包括:
+
+1. 实现 `Matmul, Relu, Log, Softmax` 等支持前向传播和反向传播的基础算子类。
+2. 完善 `NumpyModel` 的前向传播函数 `forward` 和反向传播函数 `backward` 。
+
+
+
+## 模型训练与测试
+
+此模型应用了自行实现的初始化方法(即非 `PyTorch` 版的初始化),在第一个 `epoch` 就能达到更好的效果。
+
+单次实验的三个 `epoch` 中,模型的准确率分别在 95.7%, 96.6%, 97.2% 附近波动,以下为某次实验的结果:
+
+```
+[0] Accuracy: 0.9550
+[1] Accuracy: 0.9651
+[2] Accuracy: 0.9723
+```
+
+对应的图像为:
+
+
+
+可以看到,随着模型训练过程 `Loss` 逐渐收敛于某个较小值。
+
+
+
+## 数据处理和参数初始化
+
+在 `NumPy` 库基础上实现了 `mini_batch` 函数和 `get_torch_initialization` 函数,位于[numpy_mnist.py](./numpy_mnist.py) 。
+
+其中 `get_torch_initialization` 函数使用了**何恺明**提出的 `Kaiming` 初始化方法。这也是 `PyTorch` 线性层默认的初始化方法。
+
+究其原因可能有以下两方面的考量:
+
+1. 若权重初始绝对值过小,会导致信号逐层衰减,激活函数工作在近似线性的区域,削弱网络的非线性能力。
+2. 若权重初始绝对值过大,会导致信号逐层放大,激活函数进入饱和区,可能造成梯度消失等后果。
+
+使用 `Kaiming` 初始化可以得到一个适中的随机分布值,有效地加强训练效果。
+
+### Kaiming初始化公式
+
+ `Kaiming` 初始化方法相较于其他方法可以在使用 `relu` 或 `leaky_relu` 时取得更好的效果。
+
+令 `a` 为 `leaky_relu` 负区域所对应的斜率,且一般保证 $a<1$;显然对于 `relu` 有 $a = 0$。
+
+ `Kaiming` 初始化即使用某个均匀分布 `U(-bound, bound)` 对参数矩阵进行初始化。
+
+其中 `bound` 的计算公式为
+$$
+bound = \sqrt[]{\frac{6}{(1 + a ^2) \times fan\_in}}
+$$
+ `fan_in` 为扇入部分的参数个数。
+
+此方法的具体实现见 `get_torch_initialization` 函数。
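+
+以本实验的第一层为例(`fan_in = 784`,对 `relu` 取 $a = 0$),可以粗略验证采样结果的范围与公式一致(仅作示意,并非仓库中的代码):
+
+```python
+import numpy as np
+
+fan_in, a = 28 * 28, 0.0
+bound = (6.0 / ((1 + a ** 2) * fan_in)) ** 0.5        # 约 0.0875
+W = np.random.uniform(-bound, bound, size=(fan_in, 256))
+print(W.min() >= -bound, W.max() <= bound)            # True True
+```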
+
+
+
+## 反向传播算子公式推导
+
+在本实验中,大部分算子要求进行矩阵对矩阵求导,正确的求导方式应先将矩阵向量化,进行向量对向量的求导。
+
+
+
+
+### Matmul算子
+
+
+
+### Relu算子
+
+
+
+### log算子
+
+
+
+### softmax算子
+
+
+
+## 总结
+
+已完成:自动测试 `60%`
+
+已完成:模仿 `torch_mnist.py` 的代码,在 `numpy_mnist.py` 中进行模型的训练和测试,并在报告中介绍你的实验过程与结果 `20%`
+
+ 已完成:在 `numpy_mnist.py` 中只用 `NumPy` 实现 `mini_batch` 函数,替换 `utils.py` 中使用 `PyTorch` 实现的 `mini_batch` 函数 `10%`
+
+已完成:在报告中推导 `numpy_fnn.py` 中实现算子的反向传播计算公式 `10%`
+
+已完成:调研 `PyTorch` 中权重初始化的方法,并实现代码替换 `get_torch_initialization` 函数 `10%`
+
+已完成:相关 `bug` 查杀工作
\ No newline at end of file
diff --git a/assignment-2/submission/18307130213/img/.keep b/assignment-2/submission/18307130213/img/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-2/submission/18307130213/img/formula_1.jpg b/assignment-2/submission/18307130213/img/formula_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d3c5d14dd27f77a3f0a7012d1dc25e82c1f84f5f
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_1.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_2.jpg b/assignment-2/submission/18307130213/img/formula_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..eca0921203b32a83a12f95005d2a5f1cb6fc7247
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_2.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_3.jpg b/assignment-2/submission/18307130213/img/formula_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2bd1069a8c4128fdb5633402ebda4492ac92fb9b
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_3.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_4.jpg b/assignment-2/submission/18307130213/img/formula_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d7bf4291c88813f3c5ed2c9790f1f1074922c1d8
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_4.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_5.jpg b/assignment-2/submission/18307130213/img/formula_5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7989af3b0414f5f593d8bb705563e54a2f9bd3e0
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_5.jpg differ
diff --git a/assignment-2/submission/18307130213/img/numpy_minist_result.jpg b/assignment-2/submission/18307130213/img/numpy_minist_result.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fb54da399984213c5c65d7b3e74f603161f99208
Binary files /dev/null and b/assignment-2/submission/18307130213/img/numpy_minist_result.jpg differ
diff --git a/assignment-2/submission/18307130213/numpy_fnn.py b/assignment-2/submission/18307130213/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..13e65fbb9150c6c4643cfeec3fa0c31e3eaf4005
--- /dev/null
+++ b/assignment-2/submission/18307130213/numpy_fnn.py
@@ -0,0 +1,161 @@
+import numpy as np
+
+
+class NumpyOp:
+    
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+    
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        w: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+    
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        grad_x = np.matmul(grad_y, self.memory['W'].T)
+        grad_W = np.matmul(self.memory['x'].T, grad_y)
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+    
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        grad_x = np.where(self.memory['x']>0, grad_y, np.zeros_like(grad_y))
+        return grad_x
+
+
+class Log(NumpyOp):
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x        
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        grad_x = np.divide(grad_y, self.memory['x'] + self.epsilon)
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        N, c = x.shape
+        e_x = np.exp(x)
+        sum_e_x = np.repeat(np.expand_dims(np.sum(e_x, axis=-1), axis=1), c, axis=1)
+        out = np.divide(e_x, sum_e_x)
+        self.memory['x'] = x
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        # 重新计算 softmax 输出 s,再按 grad_x = s * (grad_y - sum_k grad_y_k * s_k) 逐行求梯度
+        N, c = self.memory['x'].shape
+        e_x = np.exp(self.memory['x'])
+        sum_e_x = np.repeat(np.expand_dims(np.sum(e_x, axis=-1), axis=1), c, axis=1)
+        fout = np.divide(e_x, sum_e_x)      # softmax 输出 s
+        e_g = e_x * grad_y
+        sum_e_g = np.repeat(np.expand_dims(np.sum(e_g, axis=-1), axis=1), c, axis=1)
+        grad_x = fout * (grad_y - np.divide(sum_e_g, sum_e_x))
+        return grad_x
+
+class NumpyLoss:
+    
+    def __init__(self):
+        self.target = None
+    
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+    
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+
+        # 以下算子会在 forward 和 backward 中使用
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # 以下变量需要在 backward 中更新
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+        
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+        x = self.softmax.forward(x)
+        x = self.log.forward(x)
+        return x
+
+
+    def backward(self, y):
+
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+    
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/18307130213/numpy_mnist.py b/assignment-2/submission/18307130213/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..315fbf32a5a00ec2eaaca9978fbd311f1392a0ae
--- /dev/null
+++ b/assignment-2/submission/18307130213/numpy_mnist.py
@@ -0,0 +1,68 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, plot_curve, one_hot
+
+
+def get_torch_initialization(numpy=True):
+
+    def kaiming_uniform(fan_in, fan_out, a = 0.0):
+        # a: the negative slope of the rectifier used after this layer, specially 0 for relu
+        bound = (6.0 / ((1.0 + a**2) * fan_in))**0.5
+        return np.random.uniform(low = -bound, high = bound, size = (fan_in, fan_out))
+
+    return kaiming_uniform(28 * 28, 256), kaiming_uniform(256, 64), kaiming_uniform(64, 10)
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+    data = []
+    label = []
+    for x in dataset:
+        data.append(np.array(x[0]))
+        label.append(x[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    size = data.shape[0]
+    index = np.arange(size)
+    np.random.shuffle(index)
+
+    batches = []
+    i = 0
+    while i + batch_size <= size:
+        batches.append((data[index[i:i + batch_size]], label[index[i:i + batch_size]]))
+        i += batch_size
+
+    return batches
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+            
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
+
diff --git a/assignment-2/submission/18307130116/numpy_mnist.py b/assignment-2/submission/18307130116/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc5fdaa3b169f4a5ec77458993318b1b875ac400
--- /dev/null
+++ b/assignment-2/submission/18307130116/numpy_mnist.py
@@ -0,0 +1,97 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot
+
+def mini_batch(dataset, batch_size=128, numpy=False, drop_last=False):
+    data = []
+    label = []
+    dataset_num = dataset.__len__()
+    idx = np.arange(dataset_num)
+    np.random.shuffle(idx)
+    for each in dataset:
+        data.append(each[0].numpy())
+        label.append(each[1])
+    label_numpy = np.array(label)[idx]
+    data_numpy = np.array(data)[idx]
+
+    result = []
+    for iter in range(dataset_num // batch_size):
+        result.append((data_numpy[iter*batch_size:(iter+1)*batch_size], label_numpy[iter*batch_size:(iter+1)*batch_size]))
+    if drop_last == False:
+        result.append((data_numpy[(iter+1)*batch_size:dataset_num], label_numpy[(iter+1)*batch_size:dataset_num]))
+    return result
+
+class Adam:
+    def __init__(self, weight, lr=0.0015, beta1=0.9, beta2=0.999, epsilon=1e-8):
+        self.theta = weight
+        self.lr = lr
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epislon = epsilon
+        self.m = 0
+        self.v = 0
+        self.t = 0
+
+    def optimize(self, grad):
+        self.t += 1
+        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
+        self.v = self.beta2 * self.v + (1 - self.beta2) * grad * grad
+        self.m_hat = self.m / (1 - self.beta1 ** self.t)
+        self.v_hat = self.v / (1 - self.beta2 ** self.t)
+        self.theta -= self.lr * self.m_hat / (self.v_hat ** 0.5 + self.epislon)
+        return self.theta
+
+class Momentum:
+    def __init__(self, lr=0.1, beta=0.9):
+        self.lr = lr
+        self.beta = beta
+        self.v = 0
+    
+    def optimize(self, weight, grad):
+        self.v = self.beta*self.v + (1-self.beta)*grad
+        weight -= self.lr * self.v
+        return weight
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+    W1_opt = Momentum()
+    W2_opt = Momentum()
+    W3_opt = Momentum()
+
+
+    train_loss = []
+    
+    epoch_number = 20
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+            
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            # if epoch >= 10:
+            #     learning_rate = 0.05
+            # else:
+            #     learning_rate = 0.1
+            # model.optimize(learning_rate)
+            model.W1 = W1_opt.optimize(model.W1, model.W1_grad)
+            model.W2 = W2_opt.optimize(model.W2, model.W2_grad)
+            model.W3 = W3_opt.optimize(model.W3, model.W3_grad)
+
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/18307130154/README.md b/assignment-2/submission/18307130154/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..51cea514496f5e1d5ba2fae3e49da09fe4afb6ca
--- /dev/null
+++ b/assignment-2/submission/18307130154/README.md
@@ -0,0 +1,488 @@
+# Assignment 2——选题1  报告
+
+## 概述
+
+本次实验实现了简单的几个Pytorch算子,包括正向计算和反向传播,同时记录了反向传播的公式推导。然后搭建了简单的模型,在Mnist手写体数据集上进行了测试。
+
+**作为扩展,我调研了Pytorch的权重初始化方法——Xavier初始化和Kaiming初始化,用Numpy实现了numpyutils工具包替代utils(现在放在numpy_mnist中了)。**
+
+## 算子及推导
+
+### Matmul
+
+此算子进行两个矩阵的求积运算
+
+**推导**
+
+设反向传播的开始节点(叶节点)为 **L**,这是一个标量,下同。
+
+设正向计算中两个输入矩阵为 **P(m * k) , Q(k * n)**, 输出矩阵为 **O(m * n)**; 反向传播中输入的梯度为 **G(m * n)**。
+
+则有
+$$
+G_{ij} = \frac{\partial L}{\partial O_{ij}}
+$$
+此公式对后面的算子同样适用。
+
+**计算Q的梯度 GQ (k * n)**
+
+首先有
+$$
+\begin{aligned}
+	GQ {ts}&= \frac{\partial L}{\partial Q_{ts}} \\\\
+		&=\sum_{i \leqslant m\\ j\leqslant n}   \frac{\partial L}{\partial O_{ij}}  \times   \frac{\partial O_{ij}}{\partial Q_{ts}}\\\\
+		&=\sum_{i \leqslant m\\ j = s}   G_{ij}  \times   \frac{\partial O_{ij}}{\partial Q_{ts}}
+		&(其余的 \frac{\partial O_{ij}}{\partial Q_{ts}} = 0)\\\\
+		&=\sum_{i \leqslant m}   G_{is}  \times   P_{it}\\\\
+		&=\sum_{i \leqslant m}   P_{ti}^T  \times   G_{is}  
+\end{aligned}
+$$
+所以写成矩阵乘法为
+$$
+GQ {ts} = P^T \times G
+$$
+同理,**P的梯度为**
+$$
+GP {ts} = G \times Q^T
+$$
+
+### Relu
+
+设输入为**X**,输出为**Y**,Relu层的输出矩阵中每个元素**只与输入矩阵中对应位置的元素有关**,设对应位置元素为**x** , **y**。
+
+则有
+$$
+y = relu(x)=
+    \begin{cases}
+    0& x \leq 0\\\\
+    x& x \geq 0
+    \end{cases}
+$$
+和
+$$
+\frac{\partial Y_{ts}}{\partial X_{ij}} = 
+	\begin{cases}
+    1& t = i& and & s = j & and & X_{ij} > 0   \\\\
+    0& else
+    \end{cases}
+$$
+
+于是
+$$
+\frac{\partial L}{\partial X_{ij}} = \frac{\partial L}{\partial Y_{ij}} \times \frac{\partial Y_{ij}}{\partial X_{ij}}
+$$
+设M为X的掩码矩阵,其中M中元素m定义为:当X对应位置为正数时,m为1;否则m为0。那么上面的式子写成矩阵的形式:
+$$
+GX = GY * M
+$$
+其中 $*$ 表示矩阵的点乘,即对应位相乘。
+
+### Log
+
+设输入为**X**,输出为**Y**,Log层的输出矩阵中每个元素**只与输入矩阵中对应位置的元素有关**,设对应位置元素为**x** , **y**。
+
+则有
+$$
+\frac{dy}{dx} = \frac{1}{x}
+$$
+与Relu同理,设矩阵M定义为:M和X形状相同,且
+$$
+M_{ij} = \frac{1}{X_{ij}}
+$$
+则有
+$$
+GX = GY * M
+$$
+
+### Softmax
+
+设输入为**X**,输出为**Y**,其中**X**的第一维可以看成batch维,所以Softmax层的输出矩阵中每个元素只与输入矩阵中对应位置元素**所在行的元素**有关。方便起见,我们先考虑batch size为1的输入,即X (1 * n),并且用 
+$$
+X_i
+$$
+来简写 X 中第一行第 i 列的元素。
+
+输出Y 也是 1 * n 的矩阵,我们使用和 X 相同的表示规则。那么,正向计算公式为
+$$
+Y_i = \frac{e^{X_i}}{\sum_{k=1}^n e^{X_k}}
+$$
+梯度公式为:
+$$
+\frac{\partial Y_i}{\partial X_j} =
+	\begin{cases}
+    Y_i \times (1 - Y_i) & i = j\\\\
+    -Y_i \times Y_j & i \neq j
+    \end{cases}
+$$
+
+根据上面的公式可以计算出向量Y对向量X求导的雅各比矩阵**J (n * n)**, 定义如下
+$$
+J_{ij} = \frac{\partial Y_i}{\partial X_j}
+$$
+那么
+$$
+\begin{aligned}
+GX_{i} &= \frac{\partial L}{\partial X_i} \\\\
+	&=\sum_{k=1}^{n}\frac{\partial L}{\partial Y_k} \times \frac{\partial Y_k}{\partial X_i}\\\\
+	&=\sum_{k=1}^{n} GY_k \times J_{ki}\\\\
+	&=\sum_{k=1}^{n} GY_{1k} \times J_{ki}\\\\
+	&=GY \times J\\\\
+	&(其中GY为Y的梯度,是这一层反向传播的输入)
+\end{aligned}
+$$
+我们已经推出了在输入X的第一维为1的情况下的反向传播公式,事实上,当X的第一维(batch size)大于1时,只需要添加一个最高维,扩展 X, Y, GY, J, 并利用numpy的函数:
+
+```python
+numpy.matmul()
+```
+
+将自动执行张量计算,得到 GX。
+
+## 模型训练与测试
+
+### 模型搭建
+
+首先按照 torch_mnist 搭建模型。
+
+**正向传播**
+
+```python
+x = self.matmul_1.forward(x, self.W1)
+x = self.relu_1.forward(x)
+x = self.matmul_2.forward(x, self.W2)
+x = self.relu_2.forward(x)
+x = self.matmul_3.forward(x, self.W3)
+x = self.softmax.forward(x)
+x = self.log.forward(x)
+```
+
+~~**反向传播**~~(这里由于后面测试例做了改动,这里的模型也随之变化,最终的模型在下面)
+
+这里有一点要注意,torch的反向传播以**标量(叶子结点)**为开始,但是我们定义的模型没有最后的激活为标量的层,所以最高层的梯度要手动计算。看到测试例中torch使用的标量(Loss)为:
+
+```python
+loss = (-y_pred * y).sum(dim=1).mean() 
+```
+
+因为有一个对列求均值的操作,所以激活层的权重矩阵(也即最高层的梯度矩阵),为**- y / y.shape[0]**,但是在模型反向传播的函数中已经有这样一段代码:
+
+```python
+for size in y.shape:
+        y /= size
+```
+
+y的符号相反,并且多除了一个y.shape[1], 所以我在反向传播一开始,把这个弥补进顶层梯度里面了,最终的code 7:
+
+```python
+####################
+#      code 7      #
+####################
+
+#mulgrade = mulgrade3
+#x3_grade = mulgrade2
+#x2_grade = mulgrade1
+#x1_grade = input_grad
+
+y *= (-y.shape[1])  
+self.log_grad = y
+self.softmax_grad = self.log.backward(self.log_grad)
+
+mulgrade = self.softmax.backward(self.softmax_grad)
+self.relu_2_grad,self.W3_grad = self.matmul_3.backward(mulgrade)
+
+self.x3_grad = self.relu_2.backward(self.relu_2_grad)
+self.relu_1_grad,self.W2_grad = self.matmul_2.backward(self.x3_grad)
+
+self.x2_grad = self.relu_1.backward(self.relu_1_grad)
+self.x1_grad,self.W1_grad = self.matmul_1.backward(self.x2_grad)
+```
+
+**反向传播版本2**
+
+现在_grad 表示对应层的 input 的梯度,直接贴代码
+
+```python
+self.log_grad = self.log.backward(y)
+self.softmax_grad = self.softmax.backward(self.log_grad)
+
+mulgrade3,self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+self.relu_2_grad = self.relu_2.backward(mulgrade3)
+
+mulgrade2,self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+self.relu_1_grad = self.relu_1.backward(mulgrade2)
+
+self.x1_grad,self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+```
+
+### 用numpy实现mini_batch
+
+将数据集打乱,并根据batch_size分割
+
+```python
+def mini_batch(dataset, batch_size=128, numpy=False):
+    data = []
+    label = []
+    for x in dataset:
+        data.append(np.array(x[0]))
+        label.append(x[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    #索引随机打乱
+    siz = data.shape[0]
+    ind = np.arange(siz)
+    np.random.shuffle(ind)
+
+    #划分batch
+    res = []
+    con = 0
+    while con + batch_size <= siz:
+        data_batch = data[ind[con:con + batch_size]]
+        label_batch = label[ind[con:con + batch_size]]
+        res.append((data_batch,label_batch))
+        con += batch_size
+
+    return res
+```
+
+### 训练与测试
+
+这部分代码助教已经给出,使用的是mnist手写体数据集。下载数据集后,对每个epoch,按照batch_size将数据读入,并使用模型进行一次正向计算、反向传播、优化。主要部分:
+
+```python
+for epoch in range(epoch_number):
+    for x, y in mini_batch(train_dataset):
+
+        y = one_hot(y)
+
+        # y_pred = model.forward(x.numpy())
+        y_pred = model.forward(x)
+        loss = (-y_pred * y).sum(axis=1).mean()
+        model.backward(y)
+        model.optimize(learning_rate)
+
+        train_loss.append(loss.item())
+
+    x, y = batch(test_dataset)[0]
+    accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+    print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+```
+
+### 测试结果
+
+**损失函数**
+
+
+
+**每一轮epoch后正确率(共3轮)**
+
+```
+[0] Accuracy: 0.9459
+[1] Accuracy: 0.9635
+[2] Accuracy: 0.9713
+```
+
+
+
+## 扩展——Pytorch权重初始化方法
+
+### 结论
+
+结论写在前。Pytorch线性层采取的默认初始化方式是**Kaiming**初始化,这是由我国计算机视觉领域专家**何恺明**提出的。我的探究主要包括:
+
+* 为什么采取Kaiming初始化?
+* 考察Kaiming初始化的基础——Xavier初始化的公式
+* 考察Kaiming初始化的公式
+* 用Numpy实现一个简易的Kaiming初始化
+
+### 为什么采取Kaiming初始化?
+
+**采取固定的分布?**
+
+当考虑怎么初始化权重矩阵这个问题时,可以想到应该使得初始权重具有随机性。提到随机,自然的想法是使用**均匀分布或正态分布**,那么我们如果采用**与模型无关的固定分布**(例如标准正态分布(均值为0,方差为1))怎么样?下面我们分析如果对模型本身不加考虑,采取固定的分布,会有什么问题:
+
+* 如果权重的绝对值太小,在多层的神经网络的每一层,输入信号的方差会不断减小;当到达最终的输出层时,可以理解为输入信号的影响已经降低到微乎其微。一方面训练效果差,另一方面可能会有梯度消失等问题。(此处从略,参考https://zhuanlan.zhihu.com/p/25631496)
+* 如果权重的绝对值太大,同样道理,随着深度的加深,可能会使输入信号的方差过大,这会造成梯度爆炸或消失的问题。
+
+这里举一个例子,假如一个网络使用了多个sigmoid作为中间层(这个函数具有两边导数趋于0的特点):
+
+* 如果权重初始绝对值太小,随着深度的加深,输入信号的方差过小。当输入很小时,sigmoid函数接近线性,深层模型也失去了非线性性的优点。(**模型效果**)
+* 如果权重初始绝对值太大,随着深度的加深,输入信号的方差过大。绝对值过大的sigmoid输入意味着激活变得饱和,梯度将开始接近零。(**梯度消失**)
+
+### Xavier初始化
+
+前面的问题提示我们要根据模型的特点(维度,规模)决定使用的随机化方法(分布的均值、方差),**xavier初始化**应运而生,它可以使得输入值经过网络层后**方差不变**。pytorch中这一点是通过增益值gain来实现的,下面的函数用来获得特定层的gain:
+
+```python
+torch.nn.init.calculate_gain(nonlinearity, param=None)
+```
+
+增益值表(图片摘自https://blog.csdn.net/winycg/article/details/86649832)
+
+
+
+Xavier初始化可以采用均匀分布 **U(-a, a)**,其中a的计算公式为:
+$$
+a = gain \times \sqrt[]{\frac{6}{fan\_in+fan\_out}}
+$$
+Xavier初始化可以采用正态分布 **N(0, std)**,其中std的计算公式为:
+$$
+std = gain \times \sqrt[]{\frac{2}{fan\_in+fan\_out}}
+$$
+其中fan_in和fan_out分别是输入神经元和输出神经元的数量,在全连接层中,就等于输入输出的feature数。
+
+### Kaiming初始化
+
+Xavier初始化在Relu层表现不好,主要原因是relu层会将负数映射到0,影响整体方差。所以**何恺明**在对此做了改进提出Kaiming初始化,一开始主要应用于计算机视觉、卷积网络。
+
+Kaiming均匀分布的初始化采用**U(-bound, bound)**,其中bound的计算公式为:(a 的概念下面再说)
+$$
+bound = \sqrt[]{\frac{6}{(1 + a ^2) \times fan\_in}}
+$$
+这里补充一点,pytorch中这个公式也通过gain作为中间变量实现,也就是:
+$$
+bound = gain \times \sqrt[]{\frac{3}{ fan\_in}}
+$$
+其中:
+$$
+gain = \sqrt{\frac{2}{1 + a^2}}
+$$
+Kaiming正态分布的初始化采用**N(0,std)**,其中std的计算公式为:
+$$
+std = \sqrt[]{\frac{2}{(1 + a ^2) \times fan\_in}}
+$$
+这里稍微解释一下a的含义,源码中的解释为
+
+```
+the negative slope of the rectifier used after this layer
+```
+
+简单说,是用来衡量这一层中负数比例的,负数越多,Relu层会将越多的输入“抹平”为0,a用来平衡这种“抹平”对于方差的影响。
+
+### The initialization we currently use
+
+Looking at the get_torch_initialization function we use now, the default initialization is obtained by constructing a PyTorch linear layer:
+
+```python
+fc1 = torch.nn.Linear(28 * 28, 256)
+```
+
+Inside the Linear class, the call
+
+```python
+self.reset_parameters()
+```
+
+performs the random initialization, and it in turn calls
+
+```python
+init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+```
+
+This is exactly the Kaiming uniform initialization described above, and the body of the function matches the earlier formula (using gain as the intermediate value):
+
+```python
+fan = _calculate_correct_fan(tensor, mode)
+gain = calculate_gain(nonlinearity, a)
+std = gain / math.sqrt(fan)
+bound = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
+with torch.no_grad():
+    return tensor.uniform_(-bound, bound)
+```
+
+~~It also sets the parameter a to 5.~~
+
+It also sets the parameter a to √5.
+
+### ~~Implementing get_torch_initialization with NumPy~~  (revised)
+
+For simplicity I did not reproduce PyTorch's layered wrappers, which mainly exist to offer many different initialization schemes. Instead I implemented get_torch_initialization in NumPy directly from the linear layer's default scheme, the Kaiming uniform formula, with a set to 5. The code:
+
+```python
+def get_torch_initialization(numpy = True):
+
+    a = 5
+
+    def Kaiming_uniform(fan_in,fan_out,a):
+        bound = 6.0 / (1 + a * a) / fan_in
+        bound = bound ** 0.5
+        W = np.random.uniform(low=-bound, high=bound, size=(fan_in,fan_out))
+        return W
+
+    W1 = Kaiming_uniform(28 * 28, 256, a)
+    W2 = Kaiming_uniform(256, 64, a)
+    W3 = Kaiming_uniform(64, 10, a)
+    return W1,W2,W3
+```
+
+In passing, I also converted the other functions in utils (including the earlier mini_batch) to NumPy versions, ~~placed in numpyutils~~ now all placed in numpy_mnist. With these utilities, numpy_mnist can run without the torch package. Note, however, that download_mnist still needs the torchvision package to download the dataset.
+
+### ~~Testing~~  (revised)
+
+Rerunning numpy_mnist with these replacement utilities, the accuracy is essentially the same as before.
+
+```
+[0] Accuracy: 0.9340
+[1] Accuracy: 0.9584
+[2] Accuracy: 0.9684
+```
+
+## April 27: correction to the initialization scheme
+
+The previously submitted version initialized the weights exactly as the Linear layer does by default. Today I found two problems with that (many thanks to **Peng Runyu (彭润宇)** for pointing them out):
+
+* In its default initialization, PyTorch's linear layer assumes the non-linearity is a **leaky ReLU** and sets a to **√5** by default, not 5. Using 5 in the formula, as I did above, gives noticeably worse results.
+* As stated in **Kaiming He**'s paper, a is the negative slope of the leaky ReLU; since we use plain ReLU layers, a should in principle be 0 to match the intent of Kaiming initialization.
+
+This revision fixes both issues and adds a short study of how the choice of a matters.
+
+### The fix
+
+The revised get_torch_initialization takes a as an argument with a default value of 0, i.e. Kaiming initialization for ReLU layers.
+
+```python
+def get_torch_initialization(numpy = True,a = 0):
+    def Kaiming_uniform(fan_in,fan_out,a):
+        bound = 6.0 / (1 + a * a) / fan_in
+        bound = bound ** 0.5
+        W = np.random.uniform(low=-bound, high=bound, size=(fan_in,fan_out))
+        return W
+
+    W1 = Kaiming_uniform(28 * 28, 256, a)
+    W2 = Kaiming_uniform(256, 64, a)
+    W3 = Kaiming_uniform(64, 10, a)
+    return W1,W2,W3
+```
+
+### Testing the choice of a
+
+The fact that PyTorch's Linear layer assumes a leaky ReLU and sets a to √5 by default is thought-provoking. To measure how much the choice of a actually matters, I tested different values of a on the same dataset (a from 0 to 6 in steps of 0.3, recording the accuracy after the 1st, 2nd and 3rd epoch; a sketch of the sweep is given at the end of this subsection). The results were inconclusive: the initialization had very little visible effect on the accuracy after 3 epochs, and not much even after the first epoch. Plausible reasons include:
+
+* our model and data do not run into **vanishing gradients** or **dead neurons**
+* the randomness of the batches and the small number of trials
+
+I kept the test results in img. For our model, however, I follow the rule stated in Kaiming He's paper and use a = 0 for ReLU layers.
+
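+For reference, a sketch of how such a sweep can be set up (accuracy_for_a is my own helper name; everything else reuses the functions defined in this submission's numpy_mnist.py and numpy_fnn.py, and note that it retrains the model 21 times, so it is slow):
+
+```python
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from numpy_mnist import (download_mnist, get_torch_initialization,
+                         mini_batch, batch, one_hot)
+
+def accuracy_for_a(a, train_dataset, test_dataset, epochs=3, lr=0.1):
+    model, loss_fn = NumpyModel(), NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization(a=a)
+    history = []
+    for _ in range(epochs):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+            loss_fn.get_loss(model.forward(x), y)
+            model.backward(loss_fn.backward())
+            model.optimize(lr)
+        x, y = batch(test_dataset)[0]
+        history.append(np.mean(model.forward(x).argmax(axis=1) == y))
+    return history          # accuracy after the 1st, 2nd and 3rd epoch
+
+train_dataset, test_dataset = download_mnist()
+for a in np.arange(0.0, 6.01, 0.3):
+    print(a, accuracy_for_a(a, train_dataset, test_dataset))
+```
+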
+### An open question
+
+The choice of a in PyTorch's default linear-layer initialization is puzzling. According to Kaiming He, a should be the **negative slope** of the leaky ReLU, a positive number smaller than 1 (and that is how the lower-level PyTorch source uses it, as shown below):
+
+
+
+Yet the Linear layer sets its default value to √5:
+
+```python
+init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+```
+
+The two are inconsistent, so the default linear-layer initialization substitutes a = $\sqrt{5}$ into the formula:
+$$
+bound = \sqrt[]{\frac{6}{(1 + a ^2) \times fan\_in}}
+$$
+and ends up with a noticeably smaller bound.
+
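+To quantify this for the first layer of our model (fan_in = 784), a small arithmetic check of my own:
+
+```python
+import math
+
+fan_in = 28 * 28
+for a in (0.0, math.sqrt(5)):
+    print(a, math.sqrt(6.0 / ((1 + a ** 2) * fan_in)))
+# a = 0       -> bound ≈ 0.0875
+# a = sqrt(5) -> bound ≈ 0.0357, i.e. sqrt(6) ≈ 2.45 times smaller
+```
+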
+Several people, in China and abroad, have raised this question, and I have not yet seen a convincing explanation. One of the discussions:
+
+https://github.com/pytorch/pytorch/issues/15314
+
+I think this may be an ambiguity, or even a mistake, in PyTorch.
\ No newline at end of file
diff --git a/assignment-2/submission/18307130154/img/20190125144412278.png b/assignment-2/submission/18307130154/img/20190125144412278.png
new file mode 100644
index 0000000000000000000000000000000000000000..fcbc3a2982c4162900790d4e3d479717765b743f
Binary files /dev/null and b/assignment-2/submission/18307130154/img/20190125144412278.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210425011755375.png b/assignment-2/submission/18307130154/img/image-20210425011755375.png
new file mode 100644
index 0000000000000000000000000000000000000000..62a58dedaff524c0d49407a1103b4ac0d7e8d022
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210425011755375.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210425230057530.png b/assignment-2/submission/18307130154/img/image-20210425230057530.png
new file mode 100644
index 0000000000000000000000000000000000000000..7779533c9222baca603aab11e54f32d58054bb90
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210425230057530.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210425230119977.png b/assignment-2/submission/18307130154/img/image-20210425230119977.png
new file mode 100644
index 0000000000000000000000000000000000000000..70f10047ed945ea6ac69f36d9a80195e11de4967
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210425230119977.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427200512951.png b/assignment-2/submission/18307130154/img/image-20210427200512951.png
new file mode 100644
index 0000000000000000000000000000000000000000..43189faca346fc18e2938d53d39691aea37c954e
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427200512951.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427203245993.png b/assignment-2/submission/18307130154/img/image-20210427203245993.png
new file mode 100644
index 0000000000000000000000000000000000000000..52cfc7d3907638f1502a6a89866f44a6af6b73bd
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427203245993.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427203300617.png b/assignment-2/submission/18307130154/img/image-20210427203300617.png
new file mode 100644
index 0000000000000000000000000000000000000000..24b35eed4c9f022a11991135806034b706dec21c
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427203300617.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427203337433.png b/assignment-2/submission/18307130154/img/image-20210427203337433.png
new file mode 100644
index 0000000000000000000000000000000000000000..912b1ca130c033a9ba33e0f0b30254843241c5bc
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427203337433.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427205224362.png b/assignment-2/submission/18307130154/img/image-20210427205224362.png
new file mode 100644
index 0000000000000000000000000000000000000000..1bb5da48837686d89da73925b935accbe5454c17
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427205224362.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427205245840.png b/assignment-2/submission/18307130154/img/image-20210427205245840.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ec5e96e75e7987a6d12d4977a49205c03ca923a
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427205245840.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427205308848.png b/assignment-2/submission/18307130154/img/image-20210427205308848.png
new file mode 100644
index 0000000000000000000000000000000000000000..060021006b29d7907d064146375f28b30079459e
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427205308848.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427212809776.png b/assignment-2/submission/18307130154/img/image-20210427212809776.png
new file mode 100644
index 0000000000000000000000000000000000000000..d0e834c5023e6ce211c264c0c386a97af8e21172
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427212809776.png differ
diff --git a/assignment-2/submission/18307130154/numpy_fnn.py b/assignment-2/submission/18307130154/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eb8a954dc83f8c8125a655602af0cac7933c4de
--- /dev/null
+++ b/assignment-2/submission/18307130154/numpy_fnn.py
@@ -0,0 +1,215 @@
+import numpy as np
+
+
+class NumpyOp:
+    
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+    
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        w: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+    
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        
+        ####################
+        #      code 1      #
+        ####################
+        x = self.memory['x']
+        W = self.memory['W']
+
+        grad_W = np.matmul(x.T,grad_y)
+        grad_x = np.matmul(grad_y,W.T)
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+    
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 2      #
+        ####################
+        x = self.memory['x']
+        x1 = np.where(x > 0, 1, 0)
+        grad_x = x1 * grad_y
+        return grad_x
+
+
+class Log(NumpyOp):
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+        
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 3      #
+        ####################
+        x = self.memory['x']
+        grad_x = 1/(x + self.epsilon)
+        grad_x = grad_x * grad_y
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+
+        ####################
+        #      code 4      #
+        ####################
+        self.memory['x'] = x
+        ex = np.exp(x)
+        rowsum = np.sum(ex,axis=1)
+        rowsum = rowsum[:,np.newaxis]
+        softmax = ex / rowsum
+        self.memory['softmax'] = softmax
+        return softmax
+
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        
+        ####################
+        #      code 5      #
+        ####################
+        sm = self.memory['softmax']
+        Jacobs = []
+        for i in range(sm.shape[0]):
+            r = sm[i]
+            # Jacobian of the softmax for this row (the derivative only involves the same row)
+            J = np.diag(r) - np.outer(r, r)
+            Jacobs.append(J)
+        Jacobs = np.array(Jacobs)
+
+        grad_y = grad_y[:,np.newaxis,:]
+        grad_x = np.matmul(grad_y,Jacobs)
+        grad_x = np.squeeze(grad_x,axis=1)
+
+        return grad_x
+
+class NumpyLoss:
+    
+    def __init__(self):
+        self.target = None
+    
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+    
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+
+        # The following operators are used in forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # The following variables are updated in backward
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+        
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        
+        ####################
+        #      code 6      #
+        ####################
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+        x = self.softmax.forward(x)
+        x = self.log.forward(x)
+
+        return x
+    
+    def backward(self, y):
+        ####################
+        #      code 7      #
+        ####################
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+    
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
+
+
+
+    
\ No newline at end of file
diff --git a/assignment-2/submission/18307130154/numpy_mnist.py b/assignment-2/submission/18307130154/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..1abc1e73eef32967faa94c5f1d93f20f8ae96d2d
--- /dev/null
+++ b/assignment-2/submission/18307130154/numpy_mnist.py
@@ -0,0 +1,112 @@
+from numpy_fnn import NumpyModel, NumpyLoss
+
+import numpy as np
+from matplotlib import pyplot as plt
+
+def get_torch_initialization(numpy = True,a=0):
+
+
+    def Kaiming_uniform(fan_in,fan_out,a):
+        bound = 6.0 / (1 + a * a) / fan_in
+        bound = bound ** 0.5
+        W = np.random.uniform(low=-bound, high=bound, size=(fan_in,fan_out))
+        return W
+
+    W1 = Kaiming_uniform(28 * 28, 256, a)
+    W2 = Kaiming_uniform(256, 64, a)
+    W3 = Kaiming_uniform(64, 10, a)
+    return W1,W2,W3
+    
+def plot_curve(data):
+    plt.plot(range(len(data)), data, color='blue')
+    plt.legend(['loss_value'], loc='upper right')
+    plt.xlabel('step')
+    plt.ylabel('value')
+    plt.show()
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+    data = []
+    label = []
+    for x in dataset:
+        data.append(np.array(x[0]))
+        label.append(x[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    # shuffle the indices randomly
+    siz = data.shape[0]
+    ind = np.arange(siz)
+    np.random.shuffle(ind)
+
+    # split into batches
+    res = []
+    con = 0
+    while con + batch_size <= siz:
+        data_batch = data[ind[con:con + batch_size]]
+        label_batch = label[ind[con:con + batch_size]]
+        res.append((data_batch,label_batch))
+        con += batch_size
+
+    return res
+
+def batch(dataset, numpy=True):
+    data = []
+    label = []
+    for x in dataset:
+        data.append(np.array(x[0]))
+        label.append(x[1])
+    data = np.array(data)
+    label = np.array(label)
+    return [(data, label)]
+
+def one_hot(y, numpy=True):
+    y_ = np.zeros((y.shape[0], 10))
+    y_[np.arange(y.shape[0], dtype=np.int32), y] = 1
+    return y_
+
+def download_mnist():
+    from torchvision import datasets, transforms
+    
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=(0.1307,), std=(0.3081,))
+    ])
+    
+    train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True)
+    test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True)
+    
+    return train_dataset, test_dataset
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+            
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/18307130213/README.md b/assignment-2/submission/18307130213/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..44314b9df69abb8780c981955900b82f26b376e5
--- /dev/null
+++ b/assignment-2/submission/18307130213/README.md
@@ -0,0 +1,100 @@
+# Course Report
+
+## NumpyModel implementation
+
+The `NumpyModel` class is implemented in [numpy_fnn.py](./numpy_fnn.py).
+
+The work covers:
+
+1. Implementing the basic operator classes `Matmul, Relu, Log, Softmax`, each supporting forward and backward propagation.
+2. Completing `NumpyModel`'s forward function `forward` and backward function `backward`.
+
+
+
+## Model training and testing
+
+The model uses the new, non-`PyTorch` initialization, which already gives better results in the first `epoch`.
+
+Over the three `epoch`s of a single run, the accuracy fluctuates around 95.7%, 96.6% and 97.2% respectively; the output of one run:
+
+```
+[0] Accuracy: 0.9550
+[1] Accuracy: 0.9651
+[2] Accuracy: 0.9723
+```
+
+The corresponding curve:
+
+
+
+As training proceeds, the `Loss` gradually converges to a small value.
+
+
+
+## Data handling and parameter initialization
+
+The `mini_batch` and `get_torch_initialization` functions are implemented on top of `NumPy`, in [numpy_mnist.py](./numpy_mnist.py).
+
+`get_torch_initialization` uses the `Kaiming` initialization proposed by **Kaiming He**, which is also the default initialization of `PyTorch` linear layers.
+
+The motivation comes from two considerations:
+
+1. If the initial weights are too small in magnitude, the signal decays layer by layer and the activation function behaves almost linearly.
+2. If the initial weights are too large in magnitude, the signal is amplified layer by layer, the activation function saturates, and gradients may vanish.
+
+`Kaiming` initialization produces a random distribution of moderate scale, which noticeably helps training.
+
+### The Kaiming initialization formula
+
+Compared with other methods, `Kaiming` initialization works better when `relu` or `leaky_relu` is used.
+
+Let `a` be the slope of `leaky_relu` on its negative side, normally with $a<1$; for plain `relu`, clearly $a = 0$.
+
+`Kaiming` initialization draws the parameter matrix from a uniform distribution `U(-bound, bound)`,
+
+where `bound` is computed as
+$$
+bound = \sqrt[]{\frac{6}{(1 + a ^2) \times fan\_in}}
+$$
+`fan_in` is the number of parameters on the fan-in side.
+
+See the `get_torch_initialization` function for the concrete implementation.
+
+
+
+## Derivation of the backward formulas
+
+In this assignment most operators require differentiating a matrix with respect to a matrix; the correct approach is to vectorize the matrices first and then differentiate vector against vector.
+
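+Before the per-operator derivations, each backward implementation can also be sanity-checked against finite differences; below is a minimal sketch of such a check (the numeric_grad helper is my own, not part of the submitted code), using the Matmul operator from numpy_fnn.py:
+
+```python
+import numpy as np
+from numpy_fnn import Matmul
+
+def numeric_grad(f, x, eps=1e-6):
+    # central-difference gradient of the scalar function f at x
+    # (x is perturbed in place and restored)
+    g = np.zeros_like(x)
+    for idx in np.ndindex(*x.shape):
+        old = x[idx]
+        x[idx] = old + eps
+        f_plus = f(x)
+        x[idx] = old - eps
+        f_minus = f(x)
+        x[idx] = old
+        g[idx] = (f_plus - f_minus) / (2 * eps)
+    return g
+
+np.random.seed(0)
+x, W = np.random.randn(4, 3), np.random.randn(3, 2)
+
+op = Matmul()
+op.forward(x, W)                               # caches x and W in op.memory
+grad_x, grad_W = op.backward(np.ones((4, 2)))  # dL/dy = 1 for the loss L = y.sum()
+
+print(np.allclose(grad_x, numeric_grad(lambda x_: np.matmul(x_, W).sum(), x)))
+print(np.allclose(grad_W, numeric_grad(lambda W_: np.matmul(x, W_).sum(), W)))
+```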
+
+
+
+### Matmul operator
+
+
+
+### Relu operator
+
+
+
+### Log operator
+
+
+
+### Softmax operator
+
+
+
+## Summary
+
+Done: automated tests `60%`
+
+Done: following `torch_mnist.py`, train and test the model in `numpy_mnist.py` and describe the experiments and results in the report `20%`
+
+Done: implement the `mini_batch` function in `numpy_mnist.py` using only `NumPy`, replacing the `PyTorch`-based `mini_batch` in `utils.py` `10%`
+
+Done: derive in the report the backward formulas for the operators implemented in `numpy_fnn.py` `10%`
+
+Done: investigate the weight initialization methods in `PyTorch` and replace `get_torch_initialization` with my own implementation `10%`
+
+Done: related `bug` hunting
\ No newline at end of file
diff --git a/assignment-2/submission/18307130213/img/.keep b/assignment-2/submission/18307130213/img/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-2/submission/18307130213/img/formula_1.jpg b/assignment-2/submission/18307130213/img/formula_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d3c5d14dd27f77a3f0a7012d1dc25e82c1f84f5f
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_1.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_2.jpg b/assignment-2/submission/18307130213/img/formula_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..eca0921203b32a83a12f95005d2a5f1cb6fc7247
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_2.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_3.jpg b/assignment-2/submission/18307130213/img/formula_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2bd1069a8c4128fdb5633402ebda4492ac92fb9b
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_3.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_4.jpg b/assignment-2/submission/18307130213/img/formula_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d7bf4291c88813f3c5ed2c9790f1f1074922c1d8
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_4.jpg differ
diff --git a/assignment-2/submission/18307130213/img/formula_5.jpg b/assignment-2/submission/18307130213/img/formula_5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7989af3b0414f5f593d8bb705563e54a2f9bd3e0
Binary files /dev/null and b/assignment-2/submission/18307130213/img/formula_5.jpg differ
diff --git a/assignment-2/submission/18307130213/img/numpy_minist_result.jpg b/assignment-2/submission/18307130213/img/numpy_minist_result.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fb54da399984213c5c65d7b3e74f603161f99208
Binary files /dev/null and b/assignment-2/submission/18307130213/img/numpy_minist_result.jpg differ
diff --git a/assignment-2/submission/18307130213/numpy_fnn.py b/assignment-2/submission/18307130213/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..13e65fbb9150c6c4643cfeec3fa0c31e3eaf4005
--- /dev/null
+++ b/assignment-2/submission/18307130213/numpy_fnn.py
@@ -0,0 +1,161 @@
+import numpy as np
+
+
+class NumpyOp:
+    
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+    
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        w: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+    
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        grad_x = np.matmul(grad_y, self.memory['W'].T)
+        grad_W = np.matmul(self.memory['x'].T, grad_y)
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+    
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        grad_x = np.where(self.memory['x']>0, grad_y, np.zeros_like(grad_y))
+        return grad_x
+
+
+class Log(NumpyOp):
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x        
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        grad_x = np.divide(grad_y, self.memory['x'] + self.epsilon)
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+    
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        N, c = x.shape
+        e_x = np.exp(x)
+        sum_e_x = np.repeat(np.expand_dims(np.sum(e_x, axis=-1), axis=1), c, axis=1)
+        out = np.divide(e_x, sum_e_x)
+        self.memory['x'] = x
+        return out
+    
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        N, c = self.memory['x'].shape
+        e_x = np.power(np.e, self.memory['x'])
+        sum_e_x = np.repeat(np.expand_dims(np.sum(e_x, axis=-1), axis=1), c, axis=1)
+        fout = np.divide(e_x, sum_e_x)
+        e_g = e_x * grad_y
+        sum_e_g = np.repeat(np.expand_dims(np.sum(e_g, axis=-1), axis=1), c, axis=1)
+        grad_x = fout * (grad_y - np.divide(sum_e_g, sum_e_x))
+        return grad_x
+
+class NumpyLoss:
+    
+    def __init__(self):
+        self.target = None
+    
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+    
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+
+        # The following operators are used in forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # The following variables are updated in backward
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+        
+    
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+        x = self.softmax.forward(x)
+        x = self.log.forward(x)
+        return x
+
+
+    def backward(self, y):
+
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+    
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/18307130213/numpy_mnist.py b/assignment-2/submission/18307130213/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..315fbf32a5a00ec2eaaca9978fbd311f1392a0ae
--- /dev/null
+++ b/assignment-2/submission/18307130213/numpy_mnist.py
@@ -0,0 +1,68 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, plot_curve, one_hot
+
+
+def get_torch_initialization(numpy=True):
+
+    def kaiming_uniform(fan_in, fan_out, a = 0.0):
+        # a: the negative slope of the rectifier used after this layer, specially 0 for relu
+        bound = (6.0 / ((1.0 + a**2) * fan_in))**0.5
+        return np.random.uniform(low = -bound, high = bound, size = (fan_in, fan_out))
+
+    return kaiming_uniform(28 * 28, 256), kaiming_uniform(256, 64), kaiming_uniform(64, 10)
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+    data = []
+    label = []
+    for x in dataset:
+        data.append(np.array(x[0]))
+        label.append(x[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    size = data.shape[0]
+    index = np.arange(size)
+    np.random.shuffle(index)
+
+    batches = []
+    i = 0
+    while i + batch_size <= size:
+        batches.append((data[index[i:i + batch_size]], label[index[i:i + batch_size]]))
+        i += batch_size
+
+    return batches
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+    
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+    
+    train_loss = []
+    
+    epoch_number = 3
+    learning_rate = 0.1
+    
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+            
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+            
+            train_loss.append(loss.item())
+        
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+    
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()