diff --git a/test/test_network_ops/test_fast_gelu.py b/test/test_network_ops/test_fast_gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..0606e0fe7e09dd92bce05aaa93c55c5babd24848
--- /dev/null
+++ b/test/test_network_ops/test_fast_gelu.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestFastGelu(TestCase):
+    def npu_op_exec(self, input1):
+        input1.requires_grad = True
+        output = torch_npu.fast_gelu(input1)
+        output.backward(torch.ones_like(output))
+        output_grad = input1.grad
+        output_grad = output_grad.to("cpu")
+        output_grad = output_grad.detach().numpy()
+        output = output.cpu().detach().numpy()
+        return output_grad, output
+
+    def test_fastgelu(self, device):
+        input1 = torch.tensor([1., 2., 3., 4.]).npu()
+        exoutputgrad = torch.tensor([1.0677795, 1.0738151, 1.0245483, 1.0064018])
+        exoutput = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956])
+        outputgrad, output = self.npu_op_exec(input1)
+        self.assertRtolEqual(exoutputgrad.numpy(), outputgrad)
+        self.assertRtolEqual(exoutput.numpy(), output)
+
+instantiate_device_type_tests(TestFastGelu, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_fast_gelu_backward.py b/test/test_network_ops/test_fast_gelu_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..d07b54795f68150ed267e3aaa20ead5ed398048f
--- /dev/null
+++ b/test/test_network_ops/test_fast_gelu_backward.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestFastGelu(TestCase):
+    def npu_op_exec(self, input1):
+        input1.requires_grad = True
+        output = torch_npu.fast_gelu(input1)
+        output.backward(torch.ones_like(output))
+        output_grad = input1.grad
+        output_grad = output_grad.to("cpu")
+        output_grad = output_grad.detach().numpy()
+        output = output.cpu().detach().numpy()
+        return output_grad, output
+
+    def test_fastgelu(self, device):
+        input1 = torch.tensor([1., 2., 3., 4.]).npu()
+        exoutputgrad = torch.tensor([1.0677795, 1.0738151, 1.0245483, 1.0064018])
+        exoutput = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956])
+        outputgrad, output = self.npu_op_exec(input1)
+        self.assertRtolEqual(exoutputgrad.numpy(), outputgrad)
+        self.assertRtolEqual(exoutput.numpy(), output)
+
+instantiate_device_type_tests(TestFastGelu, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index 1f08a26654f2b0c9ba138a375290349f87213c44..3fca45b39430e264e74eccdc12908c78fe978ce1 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -1890,6 +1890,10 @@ custom:
     variants: function, method
   - func: one_(Tensor(a!) self) -> Tensor(a!)
     variants: method, function
+  - func: fast_gelu_backward(Tensor grad, Tensor self) -> Tensor
+    variants: function, method
+  - func: npu_bert_apply_adam(Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay) -> (Tensor var, Tensor m, Tensor v)
+  - func: npu_bert_apply_adam.out(Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay, *, Tensor(a!) var, Tensor(b!) m, Tensor(c!) v) -> (Tensor(a!), Tensor(b!), Tensor(c!))
   - func: npu_conv_transpose2d_backward(Tensor input, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   - func: npu_conv_transpose3d_backward(Tensor input, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   - func: npu_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[] stride, int[] padding, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
@@ -1911,6 +1915,9 @@
   - func: npu_slice.out(Tensor self, int[] offsets, int[] size, *, Tensor(a!) out) -> Tensor(a!)
   - func: npu_indexing(Tensor self, int[] begin, int[] end, int[] strides, int begin_mask=0, int end_mask=0, int ellipsis_mask=0, int new_axis_mask=0, int shrink_axis_mask=0) -> Tensor
   - func: npu_indexing.out(Tensor self, int[] begin, int[] end, int[] strides, int begin_mask=0, int end_mask=0, int ellipsis_mask=0, int new_axis_mask=0, int shrink_axis_mask=0, *, Tensor(a!) out) -> Tensor(a!)
+  - func: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor
+    variants: function, method
 custom_autograd:
   - func: npu_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor
   - func: npu_convolution_transpose(Tensor input, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor
+  - func: fast_gelu(Tensor self) -> Tensor
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/FastGeluKernelNpu.cpp b/torch_npu/csrc/aten/ops/FastGeluKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..67c19226f40c0b7e3e37c168943cb4f55f03e4f4
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/FastGeluKernelNpu.cpp
@@ -0,0 +1,98 @@
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <torch/csrc/autograd/custom_function.h>
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+namespace {
+at::Tensor fast_gelu_npu_nocheck(at::Tensor& result, const at::Tensor& self) {
+  // dispatch the FastGelu operator to the NPU, writing into result
+  OpCommand cmd;
+  cmd.Name("FastGelu")
+      .Input(self)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+} // namespace
+
+namespace {
+at::Tensor& fast_gelu_backward_npu_nocheck(
+    at::Tensor& grad_input,
+    const at::Tensor& grad,
+    const at::Tensor& self) {
+  // constructs the input and output NPUTensorDesc
+  OpCommand cmd;
+  cmd.Name("FastGeluGrad")
+      .Input(grad)
+      .Input(self)
+      .Output(grad_input)
+      .Run();
+
+  return grad_input;
+}
+} // namespace
+
+class NPUFastGeluFunction : public torch::autograd::Function<NPUFastGeluFunction> {
+public:
+  static at::Tensor forward(torch::autograd::AutogradContext *ctx,
+      const at::Tensor& self) {
+    at::AutoNonVariableTypeMode g;
+    ctx->save_for_backward({self});
+    auto outputSize = input_same_output_size(self);
+    at::Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+
+    return fast_gelu_npu_nocheck(result, self);
+  }
+
+  static torch::autograd::tensor_list backward(torch::autograd::AutogradContext *ctx,
+      torch::autograd::tensor_list grad_outputs) {
+    auto saved = ctx->get_saved_variables();
+    auto input = saved[0];
+
+    at::Tensor result = NPUNativeFunctions::fast_gelu_backward(grad_outputs[0], input);
+    torch::autograd::tensor_list output = {result};
+    return output;
+  }
+};
+
+at::Tensor NPUNativeFunctions::fast_gelu_backward(
+    const at::Tensor& grad,
+    const at::Tensor& self) {
+  // calculate the output size
+  auto outputSize = input_same_output_size(self);
+
+  // construct the output tensor of the NPU
+  at::Tensor grad_input = at::empty_with_format(
+      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+
+  // calculate the output result of the NPU
+  fast_gelu_backward_npu_nocheck(grad_input, grad, self);
+
+  return grad_input;
+}
+
+at::Tensor NPUNativeFunctions::fast_gelu(const at::Tensor& self) {
+  return NPUFastGeluFunction::apply(self);
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
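
Reviewer note (not part of the patch): the expected forward and backward values hard-coded in the tests above are consistent with the sigmoid approximation of GELU, fast_gelu(x) = x * sigmoid(1.702 * x). The sketch below is a minimal host-side NumPy reference, under that assumption, that reproduces those constants; the exact formula used by the Ascend FastGelu/FastGeluGrad kernels is not shown in this diff and may differ in numerical detail.

import numpy as np

def fast_gelu_ref(x):
    # x * sigmoid(1.702 * x); matches the expected outputs in the tests
    # ([0.8458, 1.9357, 2.9819, 3.9956]) to about 4 decimal places.
    return x / (1.0 + np.exp(-1.702 * x))

def fast_gelu_grad_ref(x):
    # d/dx [x * sigmoid(1.702 x)] = s + 1.702 * x * s * (1 - s), with s = sigmoid(1.702 x);
    # matches the expected gradients ([1.0678, 1.0738, 1.0245, 1.0064]).
    s = 1.0 / (1.0 + np.exp(-1.702 * x))
    return s + 1.702 * x * s * (1.0 - s)

x = np.array([1., 2., 3., 4.], dtype=np.float32)
print(fast_gelu_ref(x))
print(fast_gelu_grad_ref(x))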