diff --git a/assignment-2/submission/16307130040/README.md b/assignment-2/submission/16307130040/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..74274e457388b56b4227c995ac2586137ddf1d19
--- /dev/null
+++ b/assignment-2/submission/16307130040/README.md
@@ -0,0 +1,148 @@
+# Lab Report 2
+
+### 1. Results
+
+After replacing the `mini_batch` function, the model trains successfully:
+
+![](./img/Figure_1.png)
+
+```shell
+[0] Accuracy: 0.9453
+[1] Accuracy: 0.9656
+[2] Accuracy: 0.9689
+```
+
+### 2. Replacing mini_batch
+
+```python
+def mini_batch(dataset, batch_size=128, numpy=True):
+    data = []
+    label = []
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    # shuffle samples and labels with the same random permutation
+    m = data.shape[0]
+    permutation = np.random.permutation(m)
+    data = data[permutation]
+    label = label[permutation]
+
+    # slice into full batches of batch_size; the remainder is dropped
+    n = m // batch_size
+    mini_batches = []
+    for i in range(n):
+        mini_batches.append([data[i * batch_size:(i + 1) * batch_size],
+                             label[i * batch_size:(i + 1) * batch_size]])
+
+    return mini_batches
+```
+
+The overall structure follows the `batch` function in `utils.py`. The first half is the same as `batch`: the samples and labels in `dataset` are collected into `data` and `label`. The two arrays are then shuffled with a shared random permutation and sliced into batches of `batch_size`; each batch of data and labels is appended to one large list, which is returned. Samples that do not fill a complete final batch are dropped.
+
+### 3. Deriving the backpropagation formulas
+
+#### 1. Matmul
+
+![](./img/matmul.jpg)
+
+Let the output y = x·W be an l-dimensional row vector for a single d-dimensional sample x, so W has shape d×l.
+
+**dx:** For each element x_i of x, the partial derivatives of y_1, y_2, ..., y_l with respect to x_i are w_i1, w_i2, ..., w_il, so
+
+dL/dx_i = (dL/dy_1)·w_i1 + (dL/dy_2)·w_i2 + … + (dL/dy_l)·w_il
+
+Therefore dx = dy · W^T.
+
+**dW:** For an element w_ij of W, the partial derivative of y_j with respect to w_ij is x_i, so
+
+dL/dw_ij = (dL/dy_j)·x_i
+
+Therefore dW = x^T · dy. For a batch of samples, x^T · dy sums the contribution of every sample; the 1/N averaging over the batch is already contained in the loss gradient.
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: shape(N, d')
+    """
+    # dx = dy · W^T, dW = x^T · dy
+    grad_x = np.matmul(grad_y, self.memory['W'].T)
+    grad_W = np.matmul(self.memory['x'].T, grad_y)
+
+    return grad_x, grad_W
+```
+
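+As a quick sanity check on these formulas, the analytic gradients can be compared against finite differences. The following is only a minimal illustrative sketch (it assumes `numpy` and that the `Matmul` class from `numpy_fnn.py` is importable), not part of the graded implementation:
+
+```python
+import numpy as np
+from numpy_fnn import Matmul
+
+np.random.seed(0)
+x = np.random.randn(4, 3)
+W = np.random.randn(3, 5)
+
+op = Matmul()
+y = op.forward(x, W)
+grad_y = np.random.randn(*y.shape)      # stand-in upstream gradient dL/dy
+grad_x, grad_W = op.backward(grad_y)
+
+# numerical dL/dW[0, 0] for the scalar L = sum(grad_y * y)
+eps = 1e-6
+W_eps = W.copy()
+W_eps[0, 0] += eps
+num = (np.sum(grad_y * np.matmul(x, W_eps)) - np.sum(grad_y * y)) / eps
+print(np.allclose(num, grad_W[0, 0], atol=1e-4))   # expected: True
+```
+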
+#### 2. ReLU
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: same shape as x
+    """
+    x = self.memory['x']
+    grad_x = grad_y.copy()
+    grad_x[x <= 0] = 0
+    return grad_x
+```
+
+When x_i <= 0, dy_i/dx_i = 0, so dL/dx_i = 0.
+
+When x_i > 0, dy_i/dx_i = 1, so dL/dx_i = dL/dy_i.
+
+As shown above, wherever x_i > 0 the corresponding entry of dx simply copies dy, and wherever x_i <= 0 the corresponding entry of dx is set to 0.
+
+#### 3. Log
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: same shape as x
+    """
+    # clamp x away from 0 before dividing; np.maximum returns a copy,
+    # so the cached input is not modified in place
+    x = np.maximum(self.memory['x'], self.epsilon)
+    grad_x = grad_y * (1 / x)
+
+    return grad_x
+```
+
+dy_i/dx_i = 1/x_i, so dL/dx_i = (dL/dy_i) · 1/x_i.
+
+Therefore dx = dy · (1/x), element-wise.
+
+#### 4. Softmax
+
+![](./img/softmax.jpg)
+
+Suppose x and y = softmax(x) are both l-dimensional vectors (one sample).
+
+For dy_j/dx_i: if i = j, then dy_j/dx_i = y_j - (y_j)^2; if i != j, then dy_j/dx_i = -y_i·y_j.
+
+Let D = diag(y) - y^T·y, where the second term is the outer product of y with itself (`np.outer(y, y)`); then dy_j/dx_i = D_ij.
+
+It follows that dL/dx_i = (dL/dy_1)·D_i1 + (dL/dy_2)·D_i2 + … + (dL/dy_l)·D_il
+
+Therefore dx = dy · D.
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: same shape as x
+    """
+    y = self.memory['y']
+    grad_x = []
+    for grad_y1, y1 in zip(grad_y, y):
+        # per-sample Jacobian: D = diag(y) - outer(y, y)
+        D = np.diag(y1) - np.outer(y1, y1)
+        grad_x1 = np.dot(grad_y1, D)
+        grad_x.append(grad_x1)
+    grad_x = np.array(grad_x)
+    return grad_x
+```
+
+In the actual implementation, a batch contains more than one sample, so dx is generated sample by sample in a loop, as shown above.
+
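+For reference, the per-sample loop can also be replaced by a vectorized version that builds all the Jacobians at once. This is only an equivalent sketch of the derivation above, not the submitted implementation:
+
+```python
+import numpy as np
+
+def softmax_backward_vectorized(grad_y, y):
+    # batched Jacobians D_n = diag(y_n) - outer(y_n, y_n), shape (N, c, c)
+    N, c = y.shape
+    D = np.zeros((N, c, c))
+    D[:, np.arange(c), np.arange(c)] = y
+    D -= np.einsum('ni,nj->nij', y, y)
+    # dx_n = dy_n · D_n for every sample n
+    return np.einsum('nj,njk->nk', grad_y, D)
+```
+
+Because D is symmetric, dy · D can also be expanded to the closed form y ⊙ (dy − (dy·y)), where dy·y is the per-sample scalar sum_j (dL/dy_j)·y_j; that form avoids building the Jacobians at all.
+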
diff --git a/assignment-2/submission/16307130040/img/Figure_1.png b/assignment-2/submission/16307130040/img/Figure_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..dab6049f889917dcbf2e93d6203b3a6579908777
Binary files a/assignment-2/submission/16307130040/img/Figure_1.png and b/assignment-2/submission/16307130040/img/Figure_1.png differ
diff --git a/assignment-2/submission/16307130040/img/matmul.jpg b/assignment-2/submission/16307130040/img/matmul.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dd071796bbe85141e48275be4c38358eefa4112f
Binary files a/assignment-2/submission/16307130040/img/matmul.jpg and b/assignment-2/submission/16307130040/img/matmul.jpg differ
diff --git a/assignment-2/submission/16307130040/img/softmax.jpg b/assignment-2/submission/16307130040/img/softmax.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2daa913899b5f8401693ffb777ff3e27ed24cf09
Binary files a/assignment-2/submission/16307130040/img/softmax.jpg and b/assignment-2/submission/16307130040/img/softmax.jpg differ
diff --git a/assignment-2/submission/16307130040/numpy_fnn.py b/assignment-2/submission/16307130040/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..277f81a3f11fb44523777ef4bddcb998454bcdc3
--- /dev/null
+++ b/assignment-2/submission/16307130040/numpy_fnn.py
@@ -0,0 +1,184 @@
+import numpy as np
+
+
+class NumpyOp:
+
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        W: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+        # dx = dy · W^T, dW = x^T · dy
+        grad_x = np.matmul(grad_y, self.memory['W'].T)
+        grad_W = np.matmul(self.memory['x'].T, grad_y)
+
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        # pass the gradient through where x > 0, zero it elsewhere
+        x = self.memory['x']
+        grad_x = grad_y.copy()
+        grad_x[x <= 0] = 0
+
+        return grad_x
+
+
+class Log(NumpyOp):
+
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+
+        return out
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        # clamp x away from 0 before dividing; np.maximum returns a copy,
+        # so the cached input is not modified in place
+        x = np.maximum(self.memory['x'], self.epsilon)
+        grad_x = grad_y * (1 / x)
+
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+        # subtract the row-wise max for numerical stability
+        shift_x = x - np.max(x, axis=1).reshape(-1, 1)
+        y = np.exp(shift_x) / np.sum(np.exp(shift_x), axis=1).reshape(-1, 1)
+        self.memory['y'] = y
+
+        return y
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+        y = self.memory['y']
+        grad_x = []
+        for grad_y1, y1 in zip(grad_y, y):
+            # per-sample Jacobian: D = diag(y) - outer(y, y)
+            D = np.diag(y1) - np.outer(y1, y1)
+            grad_x1 = np.dot(grad_y1, D)
+            grad_x.append(grad_x1)
+        grad_x = np.array(grad_x)
+
+        return grad_x
+
+
+class NumpyLoss:
+
+    def __init__(self):
+        self.target = None
+
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+        # operators used in forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # updated in backward; softmax_grad, log_grad, etc. hold the gradients of
+        # the loss with respect to the corresponding operator's input
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+
+        x = self.softmax.forward(x)
+        x = self.log.forward(x)
+
+        return x
+
+    def backward(self, y):
+        # propagate the loss gradient back through the operators in reverse order
+        y = self.log.backward(y)
+        self.log_grad = y
+        y = self.softmax.backward(y)
+        self.softmax_grad = y
+        y, self.W3_grad = self.matmul_3.backward(y)
+        self.x3_grad = y
+        y = self.relu_2.backward(y)
+        y, self.W2_grad = self.matmul_2.backward(y)
+        self.x2_grad = y
+        y = self.relu_1.backward(y)
+        y, self.W1_grad = self.matmul_1.backward(y)
+        self.x1_grad = y
+
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/16307130040/numpy_mnist.py b/assignment-2/submission/16307130040/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..a688f7c64114bf150ffff2b903dfc74688bda4ad
--- /dev/null
+++ b/assignment-2/submission/16307130040/numpy_mnist.py
@@ -0,0 +1,59 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot
+
+
+def mini_batch(dataset, batch_size=128, numpy=True):
+    # replacement for utils.mini_batch (see README, section 2)
+    data = []
+    label = []
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    # shuffle samples and labels with the same random permutation
+    m = data.shape[0]
+    permutation = np.random.permutation(m)
+    data = data[permutation]
+    label = label[permutation]
+
+    # slice into full batches of batch_size; the remainder is dropped
+    n = m // batch_size
+    mini_batches = []
+    for i in range(n):
+        mini_batches.append([data[i * batch_size:(i + 1) * batch_size],
+                             label[i * batch_size:(i + 1) * batch_size]])
+
+    return mini_batches
+
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+
+    train_loss = []
+
+    epoch_number = 3
+    learning_rate = 0.1
+
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+
+            train_loss.append(loss.item())
+
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean(model.forward(x).argmax(axis=1) == y)
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/16307130040/torch_mnist.py b/assignment-2/submission/16307130040/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a5649bbfa750b3520b4b895de7260c3aa8ea7cd
--- /dev/null
+++ b/assignment-2/submission/16307130040/torch_mnist.py
@@ -0,0 +1,64 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+
+    def __init__(self):
+        self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+        self.W2 = torch.randn((256, 64), requires_grad=True)
+        self.W3 = torch.randn((64, 10), requires_grad=True)
+
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+        x = torch.relu(torch.matmul(x, self.W1))
+        x = torch.relu(torch.matmul(x, self.W2))
+        x = torch.matmul(x, self.W3)
+        self.softmax = torch.softmax(x, 1)
+        self.log = torch.log(self.softmax)
+        self.softmax.retain_grad()  # for test only
+        self.log.retain_grad()  # for test only
+        return self.log
+
+    def optimize(self, learning_rate):
+        with torch.no_grad():
+            self.W1 -= learning_rate * self.W1.grad
+            self.W2 -= learning_rate * self.W2.grad
+            self.W3 -= learning_rate * self.W3.grad
+
+        self.W1.grad = None
+        self.W2.grad = None
+        self.W3.grad = None
+
+
+def torch_run():
+    train_dataset, test_dataset = download_mnist()
+
+    model = TorchModel()
+    model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False)
+
+    train_loss = []
+
+    epoch_number = 3
+    learning_rate = 0.1
+
+    for epoch in range(epoch_number):
+        for x, y in mini_batch(train_dataset, numpy=False):
+            y = one_hot(y, numpy=False)
+
+            y_pred = model.forward(x)
+            loss = (-y_pred * y).sum(dim=1).mean()
+            loss.backward()
+            model.optimize(learning_rate)
+
+            train_loss.append(loss.item())
+
+        x, y = batch(test_dataset, numpy=False)[0]
+        accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    torch_run()