diff --git a/test/test_network_ops/test_ge.py b/test/test_network_ops/test_ge.py new file mode 100644 index 0000000000000000000000000000000000000000..5a9011620d456096f85442c0633a1e513dc2132e --- /dev/null +++ b/test/test_network_ops/test_ge.py @@ -0,0 +1,299 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestGe(TestCase): + def generate_scalar(self, min1, max1): + scalar = np.random.uniform(min1, max1) + return scalar + + def cpu_op_exec(self, input1, input2): + output = torch.ge(input1, input2) + output = output.numpy() + return output + + def cpu_op_exec_out(self, input1, input2, input3): + torch.ge(input1, input2, out = input3) + output = input3.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.ge(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, input3): + torch.ge(input1, input2, out = input3) + output = input3.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec(self, input1, input2): + output = input1.ge_(input2) + output = input1 + output = output.numpy() + return output + + def npu_op_inplace_exec(self, input1, input2): + output = input1.ge_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_scalar(self, input1, scalar): + output = torch.ge(input1, scalar) + output = output.numpy() + return output + + def cpu_op_exec_scalar_out(self, input1, scalar, input2): + torch.ge(input1, scalar, out = input2) + output = input2.numpy() + return output + + def npu_op_exec_scalar(self, input1, scalar): + output = torch.ge(input1, scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar_out(self, input1, scalar, input2): + torch.ge(input1, scalar, out = input2) + output = input2.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.ge_(scalar) + output = output.numpy() + return output + + def npu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.ge_(scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def ge_tensor_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) + cpu_input3 = torch.randn(item[1][2]) < 0 + npu_input3 = cpu_input3.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + if cpu_input3.dtype == torch.float16: + cpu_input3 = 
cpu_input3.to(torch.float32) + cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) + cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_ge_tensor_out(self, device): + shape_format = [ + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [2, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [128, 232, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.ge_tensor_out_result(shape_format) + + def ge_scalar_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2 = torch.randn(item[1][2]) < 0 + npu_input2 = cpu_input2.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2) + npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2) + cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_ge_scalar_out(self, device): + shape_format = [ + [[np.float16, 0, [4, 4, 128, 128]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [12, 10, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [16, 3, 1111, 1212]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [16, 16, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [1313, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.ge_scalar_out_result(shape_format) + + def test_ge_bool(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + scalar_list = [True, False] + shape_format = [ + [[np.int32, i, j], k] for i in format_list for j in shape_list + for k in scalar_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100) + cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1]) + npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1]) + cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50) + npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50) + + self.assertRtolEqual(cpu_output1, npu_output1) + self.assertRtolEqual(cpu_output2, npu_output2) + + def test_ge_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for 
j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_scalar_int32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.int32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_tensor_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_tensor_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_inplace_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_inplace_float16(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_inplace_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, 
scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_inplace_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_mix_dtype(self, device): + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestGe, globals(), except_for="cpu") +if __name__ == '__main__': + run_tests() diff --git a/test/test_network_ops/test_le.py b/test/test_network_ops/test_le.py new file mode 100644 index 0000000000000000000000000000000000000000..5b7d933b5bf781347c066bf902dc171fe144e2eb --- /dev/null +++ b/test/test_network_ops/test_le.py @@ -0,0 +1,318 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestLe(TestCase): + def generate_scalar(self, min1, max1): + scalar = np.random.uniform(min1, max1) + return scalar + + def cpu_op_exec(self, input1, input2): + output = torch.le(input1, input2) + output = output.numpy() + return output + + def cpu_op_exec_out(self, input1, input2, input3): + torch.le(input1, input2, out = input3) + output = input3.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.le(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec(self, input1, input2): + output = input1.le_(input2) + output = input1 + output = output.numpy() + return output + + def npu_op_inplace_exec(self, input1, input2): + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, output): + torch.le(input1, input2, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_scalar(self, input1, scalar): + output = torch.le(input1, scalar) + output = output.numpy() + return output + + def cpu_op_exec_scalar_out(self, input1, scalar, input2): + torch.le(input1, scalar, out = input2) + output = input2.numpy() + return output + + def npu_op_exec_scalar(self, input1, scalar): + output = torch.le(input1, scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.le_(scalar) + output = output.numpy() + return output + + def npu_op_inplace_exec_scalar(self, input1, scalar): + input1 = input1.to("npu") + output = input1.le_(scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar_out(self, input1, scalar, output): + torch.le(input1, scalar, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_stride_exec(self, input1, input2): + input1 = input1.as_strided([2, 2], [1, 2], 1) + input2 = input2.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.numpy() + return output + + def npu_op_inplace_stride_exec(self, input1, input2): + input1 = input1.to("npu") + input2 = input2.to("npu") + input1 = input1.as_strided([2, 2], [1, 2], 1) + input2 = input2.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_stride_scalar_exec(self, input1, input2): + input1 = input1.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.numpy() + return output + + def npu_op_inplace_stride_scalar_exec(self, input1, input2): + input1 = input1.to("npu") + input1 = input1.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def le_tensor_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) + cpu_input3 = torch.randn(item[1][2]) < 0 + npu_input3 = cpu_input3.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + 
cpu_input2 = cpu_input2.to(torch.float32) + if cpu_input3.dtype == torch.float16: + cpu_input3 = cpu_input3.to(torch.float32) + cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) + if cpu_input1.dtype == torch.float16: + cpu_output_out = cpu_output_out.astype(np.float16) + + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_le_tensor_out(self, device): + shape_format = [ + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [2, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [128, 232, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.le_tensor_out_result(shape_format) + + def le_scalar_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2 = torch.randn(item[1][2]) < 0 + npu_input2 = cpu_input2.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2) + npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2) + if cpu_input1.dtype == torch.float16: + cpu_output_out = cpu_output_out.astype(np.float16) + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_le_scalar_out(self, device): + shape_format = [ + [[np.float16, 0, [12, 4, 12, 121]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [12, 10, 14, 111]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 2, [16, 3, 11, 121, 21]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [16, 16, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 2, [1313, 3, 3, 3, 121]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.le_scalar_out_result(shape_format) + + def test_le_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_scalar_int32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.int32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input 
= cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_tensor_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_tensor_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_inplace_float32(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_inplace_float16(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_inplace_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + scalar1 = copy.deepcopy(scalar) + ncpu_input = copy.deepcopy(cpu_input) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_inplace_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = 
self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_mix_dtype(self, device): + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestLe, globals(), except_for="cpu") +if __name__ == '__main__': + run_tests() diff --git a/test/test_network_ops/test_ne.py b/test/test_network_ops/test_ne.py new file mode 100644 index 0000000000000000000000000000000000000000..a403a208d6c46a77aa8e66ff93fe5a254fbd9c28 --- /dev/null +++ b/test/test_network_ops/test_ne.py @@ -0,0 +1,109 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestNe(TestCase): + def cpu_op_exec(self, input1, input2): + output = torch.ne(input1, input2) + output = output.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.ne(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2): + input3 = torch.empty(0).bool().npu() + torch.ne(input1, input2, out=input3) + output = input3.to("cpu") + output = output.numpy() + return output + + def test_ne_shape_format_fp32(self, device): + dtype_list = [np.float32] + format_list = [0, 3] + shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] + shape_format = [ + [d, i, j] for d in dtype_list for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) + cpu_input2, npu_input2 = create_common_tensor(item, 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ne_shape_format_fp16(self, device): + dtype_list = [np.float16] + format_list = [0, 3] + shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] + shape_format = [ + [d, i, j] for d in dtype_list for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) + cpu_input2, npu_input2 = create_common_tensor(item, 1, 100) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input1.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = 
self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ne_out_shape_format_fp32(self, device): + dtype_list = [np.float32] + format_list = [0] + shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] + shape_format = [ + [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10) + cpu_input2, npu_input2 = create_common_tensor(item[0], -10, 10) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + self.assertRtolEqual(cpu_output, npu_output_out) + + def test_ne_scalar_out_shape_format_fp32(self, device): + dtype_list = [np.float32] + format_list = [0] + shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] + shape_format = [ + [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10) + npu_output_out = self.npu_op_exec_out(npu_input1, 5) + cpu_output = self.cpu_op_exec(cpu_input1, 5) + self.assertRtolEqual(cpu_output, npu_output_out) + + def test_ne_mix_dtype(self, device): + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestNe, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_nonzero.py b/test/test_network_ops/test_nonzero.py new file mode 100644 index 0000000000000000000000000000000000000000..84c204a86fc4b2c4774be98ab9d5a14602e32d6a --- /dev/null +++ b/test/test_network_ops/test_nonzero.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestNonzero(TestCase): + def cpu_op_exec(self, input1): + output = torch.nonzero(input1) + output = output.numpy().astype(np.int32) + return output + + def npu_op_exec(self, input1): + output = torch.nonzero(input1) + output = output.to("cpu") + output = output.numpy().astype(np.int32) + return output + + def test_nonzero_shape_format(self, device): + dtype_list = [np.float32, np.float16, np.int32, np.int64] + format_list = [0] + shape_list = [[256,10], [256,256,100],[5,256,256,100]] + + shape_format = [ + [[i, j, k]] for i in dtype_list for j in format_list for k in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_exec(npu_input) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestNonzero, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_rsub.py b/test/test_network_ops/test_rsub.py new file mode 100644 index 0000000000000000000000000000000000000000..9b2167d78f65e871adda07d8e4daedb14cb8d53f --- /dev/null +++ b/test/test_network_ops/test_rsub.py @@ -0,0 +1,171 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestRsub(TestCase): + def cpu_op_exec(self, input1, input2): + output = input2 - input1 + output = output.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = input2 - input1 + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar(self, input1, input2): + output = input1 - input2 + output = output.to("cpu") + output = output.numpy() + output = -output + return output + + def rsub_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def rsub_scalar_result(self, shape_format): + for item in shape_format: + scalar = np.random.uniform(0, 100) + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, scalar) + npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar) + + cpu_output = cpu_output.astype(npu_output_scalar.dtype) + self.assertRtolEqual(cpu_output, npu_output_scalar) + + def test_sub_shape_format_fp16_1d(self, device): + format_list = [-1, 0, 3] + shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp32_1d(self, device): + format_list = [-1, 0, 3] + shape_format = [[[np.float32, i, [32]], [np.float32, i, [32]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp16_2d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp32_2d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float32, i, [5, 3]], [np.float32, i, [5, 3]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp16_3d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp32_3d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float32, i, [256, 480, 14]], [np.float32, i, [256, 480, 14]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp16_4d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp32_4d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, [32, 3, 3, 3]]] for i in format_list] + self.rsub_result(shape_format) + + # int------------------------------------------------------------------------------- + def
test_sub_shape_format_int32_1d(self, device): + format_list = [-1, 0] + shape_format = [[[np.int32, i, [32]], [np.int32, i, [32]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_int32_2d(self, device): + format_list = [-1, 0] + shape_format = [[[np.int32, i, [5, 3]], [np.int32, i, [5, 3]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_int32_3d(self, device): + format_list = [-1, 0] + shape_format = [[[np.int32, i, [256, 480, 14]], [np.int32, i, [256, 480, 14]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_int32_4d(self, device): + format_list = [-1, 0] + shape_format = [[[np.int32, i, [32, 3, 3, 3]], [np.int32, i, [32, 3, 3, 3]]] for i in format_list] + self.rsub_result(shape_format) + + # scalar---------------------------------------------------------------------------- + def test_sub_scalar_shape_format_fp16_1d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float16, i, [32]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp32_1d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float32, i, [32]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp16_2d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float16, i, [32, 64]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp32_2d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float32, i, [32, 64]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp16_3d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp32_3d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float32, i, [32, 64, 128]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp16_4d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp32_4d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float32, i, [32, 64, 128, 28]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + +instantiate_device_type_tests(TestRsub, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_tanh.py b/test/test_network_ops/test_tanh.py new file mode 100644 index 0000000000000000000000000000000000000000..28c8c0789fcc21e214d57dcce013778c6d60fe92 --- /dev/null +++ b/test/test_network_ops/test_tanh.py @@ -0,0 +1,139 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestTanh(TestCase): + def cpu_op_exec(self, input1): + output = torch.tanh(input1) + output = output.numpy() + return output + + def npu_op_exec(self, input1): + input1 = input1.to("npu") + output = torch.tanh(input1) + output = output.to("cpu") + output = output.numpy() + return output + + def test_tanh_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (4, 3, 3)], 1, 100], + [[np.float32, -1, (7,5,5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746], + [[np.float32, -1, (1024, 448, 448)], 200, 300], + [[np.float32, -1, (128, 3, 5)], 0.3219780311757745 , 92 ], + [[np.float32, -1, (8, 7, 7)], 0.4820305734500543 , 28], + [[np.float32, -1, (15, 8, 8)],0.8563874665918477 , 98], + [[np.float32, -1, (11, 6, 6)], 0.0694198357720135 , 50], + [[np.float32, -1, (24, 24, 3)], -2,-2], + [[np.float32, -1, (6, 10, 10)], 0.6447298684351989 , 95], + [[np.float32, -1, (3, 9, 9)], 0.8723538084975545 , 85], + [[np.float32, -1, (5, 5, 5)], 0.8283759153463854 , 71], + [[np.float32, -1, (5, 1, 1)], 0.24718684227306953 , 25], + [[np.float32, -1, (14, 7, 7)], 0.3989186243492233 , 7 ], + [[np.float32, -1, (4, 10, 10)], 0.7866457165672994 , 5], + [[np.float32, -1, (3, 7, 7)], 0.3793216987112159 , 39], + [[np.float32, -1, (2, 8, 8)], 0.9662927186969077 , 5 ], + [[np.float32, -1, (3, 7, 7)], 0.9956475043306917 , 28], + [[np.float32, -1, (7, 10, 10)], 0.769565434387681 , 9], + [[np.float32, -1, (54, 93, 3)],0.6447298684351989 , 95], + [[np.float32, -1, (6, 3, 3)], 0.03133650248813469 , 37 ], + [[np.float32, -1, (65500, 1, 1)], 95, 100], + [[np.float32, -1, (6, 3, 10)], 0.03133650248813469 , 37], + + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1): + input1 = input1.to(torch.float32) + output = torch.tanh(input1) + output = output.numpy() + output = output.astype(np.float16) + return output + + shape_format = [ + [[np.float16, -1, (65500, 1)], 212,225], + [[np.float16, -1, (1024,448,448)], 200, 300], + [[np.float16, -1, (16,16)], -1000, -100], + [[np.float16, -1, (4,1)], -1.1754943508e-38,-1.1754943508e-38], + [[np.float16, -1, (7, 5, 5)], 21474836,21474837], + [[np.float16, -1, (4, 44, 44)], 3450,34020], + [[np.float16, -1, (65500, 3, 3)], -214748,-214746], + [[np.float16, -1, (64, 4, 4)], -9.313225746154785e-10,9.313225746154785e-10], + [[np.float16, -1, (128, 3, 5)], -0.000000000000000000000000000000000000011754943508,0.000000000000000000000000000000000000011754943508], + [[np.float16, -1, (1, 1, 1)], 0.9283381566708346 , 16], + [[np.float16, -1, (6, 3, 10)], 0.03133650248813469 , 37], + [[np.float16, -1, (65500, 1, 1)], 95, 100 ], + [[np.float16, -1, (13, 5, 5)], 0.9790231845699171 , 41], + [[np.float16, -1, (5, 7, 7)], 0.7852605507867441 , 87 ], + [[np.float16, -1, (13, 2, 2)],0.8758750778305631 , 82], + [[np.float16, -1, (14, 6, 6)],0.6963691068720794 , 92], + [[np.float16, -1, (5, 6, 6)], 0.7570129172808612 , 21], + [[np.float16, -1, (1, 10, 10)], 0.990800730328874 , 86], + 
[[np.float16, -1, (4, 5, 5)], 0.7349293532899402 , 35], + [[np.float16, -1, (6, 4, 4)], 0.7349293532899402, 35], + [[np.float16, -1, (5, 8, 8)],0.9583309378850908 , 60], + + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_exec_fp16(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_inplace_common_shape_format(self, device): + def cpu_op_inplace_exec(input1): + output = torch.tanh_(input1) + output = output.numpy() + return output + + def npu_op_inplace_exec(input1): + input1 = input1.to("npu") + output = torch.tanh_(input1) + output = output.to("cpu") + output = output.numpy() + return output + + shape_format = [ + [[np.float32, -1, (4, 3, 3)], 1, 100], + [[np.float32, -1, (7,5,5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746] + + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_inplace_exec(cpu_input1) + npu_output = npu_op_inplace_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestTanh, globals(), except_for='cpu') + +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_tanh_backward.py b/test/test_network_ops/test_tanh_backward.py new file mode 100644 index 0000000000000000000000000000000000000000..1e108d1960a8eaad461210ea4b5fb355935edc11 --- /dev/null +++ b/test/test_network_ops/test_tanh_backward.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestTanhBackward(TestCase): + + def cpu_op_exec(self, input1): + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad.numpy() + return output + + def npu_op_exec(self, input1): + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad + output = output.to("cpu") + output = output.numpy() + return output + + def test_tanh_backward_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (4, 3)], 1, 100], + [[np.float32, -1, (7, 5, 5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746], + [[np.float32, -1, (1024, 448, 448)], 200, 300], + [[np.float32, -1, (24, 24, 3)], -2,-2], + [[np.float32, -1, (3, 7, 7)], 0.3793216987112159, 1], + [[np.float32, -1, (2, 8, 8)], 0.9662927186969077, 1], + [[np.float32, -1, (3, 7, 7)], 0.9956475043306917, 2], + [[np.float32, -1, (7, 10, 10)], 0.769565434387681, 3], + [[np.float32, -1, (65500, 1, 1)], 95, 100], + [[np.float32, -1, (6, 3, 10)], 0.03133650248813469 , 2], + [[np.float32, -1, (4, 3, 3, 3, 3, 3, 3, 3)], 0, 1], + [[np.float32, -1, (5,)], 0, 1], + [[np.float32, -1, (5,5,5,5,5,5)], 1, 2], + [[np.float32, -1, (5,5,5,5,5,5)], 2, 3], + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_backward_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1): + input1 = input1.to(torch.float32) + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad.numpy() + output = output.astype(np.float16) + return output + + shape_format = [ + [[np.float16, -1, (65500, 1)], 212, 225], + [[np.float16, -1, (1024, 448, 448)], 200, 300], + [[np.float16, -1, (16, 16)], -1000, -100], + [[np.float16, -1, (4, 1)], -1.1754943508e-38, -1.1754943508e-38], + [[np.float16, -1, (7, 5, 5)], 21474836, 21474837], + [[np.float16, -1, (4, 44, 44)], 3450, 34020], + [[np.float16, -1, (65500, 3, 3)], -214748, -214746], + [[np.float16, -1, (64, 4, 4)], -9.313225746154785e-10,9.313225746154785e-10], + [[np.float16, -1, (128, 3, 5)], + -0.000000000000000000000000000000000000011754943508, + 0.000000000000000000000000000000000000011754943508], + [[np.float16, -1, (65500, 1, 1)], 95, 100], + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_exec_fp16(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestTanhBackward, globals(), except_for='cpu') + +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_uniform_.py b/test/test_network_ops/test_uniform_.py new file mode 100644 index 0000000000000000000000000000000000000000..893adf140e34b82bb03b8732ecf7c9becf3224e4 --- /dev/null +++ b/test/test_network_ops/test_uniform_.py @@ -0,0 +1,50 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestUniform(TestCase): + def test_uniform(self, device): + shape_format = [ + [(20,300), -100, 100, torch.float32], + [(20,300), -100, 100, torch.float16] + ] + + for item in shape_format: + input1 = torch.zeros(item[0], dtype=item[3]).npu() + input1.uniform_(item[1], item[2]) + self.assertTrue(item[1] <= input1.min()) + self.assertTrue(item[2] >= input1.max()) + + def test_uniform_trans(self, device): + shape_format = [ + [(20,300), -100, 100, torch.float32], + ] + + for item in shape_format: + input1 = torch.zeros(item[0], dtype=item[3]).npu() + input1 = input1.npu_format_cast(3) + input1.uniform_(item[1], item[2]) + self.assertTrue(item[1] <= input1.min()) + self.assertTrue(item[2] >= input1.max()) + + +instantiate_device_type_tests(TestUniform, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/csrc/aten/ops/GeKernelNpu.cpp b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a71323f79b633929cc7f021e2c08d83975b12ea --- /dev/null +++ b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& ge_out_npu_nocheck(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor selfCast = self; + at::Tensor otherCast = other; + if (self.dtype() == at::ScalarType::Int || other.dtype() == at::ScalarType::Int + || self.dtype() == at::ScalarType::Bool || other.dtype() == at::ScalarType::Bool) { + selfCast = self.to(at::ScalarType::Float); + otherCast = other.to(at::ScalarType::Float); + } + auto unified_result = OpPreparation::comparison_op_check(result, selfCast, otherCast, true); + OpCommand cmd; + cmd.Name("GreaterEqual") + .Expect(unified_result) + .Input(selfCast) + .Input(otherCast) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ge_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor& ge_out_npu_nocheck(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor selfCast = self; + if (self.dtype() == at::ScalarType::Int || self.dtype() == at::ScalarType::Bool) { + selfCast = self.to(at::ScalarType::Float); + } + OpCommand cmd; + cmd.Name("GreaterEqual") + .Input(selfCast) + .Input(other, selfCast.scalar_type()) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ge_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + auto outputSize = formatCastOfSelf.sizes(); + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + ge_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + outputSize, + formatCastOfSelf.options().dtype(at::kBool), + ACL_FORMAT_ND); + ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, at::Scalar other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + formatCastOfSelf.sizes(), + formatCastOfSelf.options().dtype(at::kBool), + ACL_FORMAT_ND); + ge_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor& NPUNativeFunctions::ge_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, ori_other}, {self}); + + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + + if (!NpuUtils::check_match(&self)) { + at::Tensor 
contiguousSelf = NpuUtils::format_contiguous(self); + ge_out_npu_nocheck(contiguousSelf, ori_other, result); + } else { + ge_out_npu_nocheck(self, ori_other, result); + } + self.copy_(result); + return self; +} + +at::Tensor& NPUNativeFunctions::ge_(at::Tensor& self, at::Scalar other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckMemory({self}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + ge_out_npu_nocheck(contiguousSelf, other, result); + } else { + ge_out_npu_nocheck(self, other, result); + } + self.copy_(result); + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/LeKernelNpu.cpp b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..44404cf8f9a63adec7aa1ec1abdc31e9b9ff2bad --- /dev/null +++ b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp @@ -0,0 +1,134 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& le_out_npu_nocheck(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + OpCommand cmd; + cmd.Name("LessEqual") + .Input(self) + .Input(other, self.scalar_type()) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::le_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + auto outputSize = formatCastOfSelf.sizes(); + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + le_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor& le_out_npu_nocheck(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + auto unified_result = OpPreparation::comparison_op_check(result, self, other, true); + OpCommand cmd; + cmd.Name("LessEqual") + .Expect(unified_result) + .Input(self) + .Input(other) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::le_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor NPUNativeFunctions::le(const at::Tensor& self, at::Scalar other) { + at::Tensor formatCastOfSelf = 
OpPreparation::CastBackToOriFormat(self); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + formatCastOfSelf.sizes(), + formatCastOfSelf.options().dtype(at::kBool), + ACL_FORMAT_ND); + le_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor NPUNativeFunctions::le(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + outputSize, + formatCastOfSelf.options().dtype(at::kBool), + ACL_FORMAT_ND); + + le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor& NPUNativeFunctions::le_(at::Tensor& self, at::Scalar other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckMemory({self}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + le_out_npu_nocheck(contiguousSelf, other, result); + } else { + le_out_npu_nocheck(self, other, result); + } + self.copy_(result); + return self; +} + +at::Tensor& NPUNativeFunctions::le_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, ori_other}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + le_out_npu_nocheck(contiguousSelf, ori_other, result); + } else { + le_out_npu_nocheck(self, ori_other, result); + } + self.copy_(result); + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/NeKernelNpu.cpp b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b95c86a50da19535dcfb20563efed18dbecc572c --- /dev/null +++ b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, const at::Tensor& other) { + at::Tensor selfCast = self; + at::Tensor otherCast = other; + if(self.dtype() == at::ScalarType::Int || other.dtype() == at::ScalarType::Int){ + selfCast = self.to(at::ScalarType::Float); + otherCast = other.to(at::ScalarType::Float); + } + auto unified_result = OpPreparation::comparison_op_check(result, selfCast, otherCast, true); + if(self.scalar_type() == at::kLong) { + TORCH_WARN_ONCE("The oprator of ne is executed, Currently High Accuracy but Low Performance OP with 64-bit has been used," + "Please Do Some Cast at Python Functions with 32-bit for Better Performance!"); + } + OpCommand cmd; + cmd.Name("NotEqual") + .Expect(unified_result) + .Input(selfCast) + .Input(otherCast) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, at::Scalar other) { + at::Tensor selfCast = self; + if(self.dtype() == at::ScalarType::Int){ + selfCast = self.to(at::ScalarType::Float); + } + if(self.scalar_type() == at::kLong) { + TORCH_WARN_ONCE("The oprator of ne is executed, Currently High Accuracy but Low Performance OP with 64-bit has been used," + "Please Do Some Cast at Python Functions with 32-bit for Better Performance!"); + } + OpCommand cmd; + cmd.Name("NotEqual") + .Input(selfCast) + .Input(other, selfCast.scalar_type()) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(self, other); + OpPreparation::CheckOut( + {self, other}, + result, + CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + at::ScalarType::Bool, + at::IntArrayRef(outputSize)); + ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther); + return result; +} + +at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckOut( + {self}, + result, + CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + at::ScalarType::Bool, + formatCastOfSelf.sizes()); + ne_out_npu_nocheck(result, formatCastOfSelf, other); + return result; +} + +at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensor( + outputSize, + formatCastOfSelf.options().dtype(at::kBool), + formatCastOfSelf); + + ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther); + return result; +} + +at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, at::Scalar other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + + at::Tensor result = OpPreparation::ApplyTensor( + formatCastOfSelf, + formatCastOfSelf.options().dtype(at::kBool)); + + ne_out_npu_nocheck(result, formatCastOfSelf, other); + return 
result; +} + +at::Tensor& NPUNativeFunctions::ne_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, other}, {self}); + + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + ne_out_npu_nocheck(result, contiguousSelf, other); + } else { + ne_out_npu_nocheck(result, self, other); + } + + self.copy_(result); + + return self; +} + +at::Tensor& NPUNativeFunctions::ne_(at::Tensor& self, at::Scalar other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckMemory({self}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + ne_out_npu_nocheck(result, contiguousSelf, other); + } else { + ne_out_npu_nocheck(result, self, other); + } + + self.copy_(result); + + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..640564e8ddf2f2a80ba9294cd567d98c67c03736 --- /dev/null +++ b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp @@ -0,0 +1,63 @@ +// Copyright (c) 2020, Huawei Technologies. +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& nonzero_out_npu_nocheck(at::Tensor& result, const at::Tensor& self) { + OpCommand cmd; + cmd.Name("NonZero") + .Input(self) + .Output(result) + .Attr("transpose", false) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::nonzero_out(const at::Tensor& self, at::Tensor& result) { + auto outputSize = nonzero_npu_output_size(self); + OpPreparation::CheckOut( + {self}, + result, + CalcuOpUtil::get_tensor_npu_format(self), + at::ScalarType::Long, + outputSize); + + OpPipeWithDefinedOut pipe; + return pipe.CheckMemory({self}, {result}) + .Func([&self](at::Tensor& result){nonzero_out_npu_nocheck(result, self);}) + .Call(result); +} + +at::Tensor NPUNativeFunctions::nonzero(const at::Tensor& self) { + // calculate the output size + auto outputSize = nonzero_npu_output_size(self); + + // construct the output tensor of the NPU + at::Tensor result = OpPreparation::ApplyTensor( + outputSize, self.options().dtype(at::kLong), self); + + // calculate the output result of the NPU + nonzero_out_npu_nocheck(result, self); + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp b/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4c8200b57732fab4147600e4be98cc0633b02dc3 --- /dev/null +++ b/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp @@ -0,0 +1,96 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor rsub_dest_output(const at::Tensor& self, const at::Tensor& other) { + bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self); + + return isSelfWrapped ? 
other : self; +} + +at::Tensor& rsub_out_npu_nocheck( + at::Tensor& result, + const at::Tensor& self, + const at::Tensor& other, + at::Scalar alpha) { + // self * alpha (rsub computes other - self * alpha) + at::Tensor selfMulResult; + if (!CalcuOpUtil::is_scalar_one(alpha)) { + selfMulResult = at::mul(self, alpha); + } + + OpCommand cmd; + if (selfMulResult.defined()) { + cmd.Name("Sub") + .Input(other) + .Input(selfMulResult) + .Output(result) + .Run(); + } else { + cmd.Name("Sub") + .Input(other) + .Input(self) + .Output(result) + .Run(); + } + + return result; +} + +at::Tensor& rsub_out_npu_nocheck( + at::Tensor& result, + const at::Tensor& self, + at::Scalar other, + at::Scalar alpha) { + // self * alpha (rsub computes other - self * alpha) + at::Tensor scalarValue(at::mul(self, alpha)); + + OpCommand cmd; + cmd.Name("Sub") + .Input(other, self.scalar_type()) + .Input(scalarValue) + .Output(result) + .Run(); + + return result; +} + +at::Tensor NPUNativeFunctions::rsub(const at::Tensor& self, const at::Tensor& other, at::Scalar alpha) { + at::Tensor outputTensor = rsub_dest_output(self, other); + auto outputSize = broadcast_ops_npu_output_size(self, other); + + at::Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize); + + rsub_out_npu_nocheck(result, self, other, alpha); + + return result; +} + +at::Tensor NPUNativeFunctions::rsub(const at::Tensor& self, at::Scalar other, at::Scalar alpha) { + at::Tensor result = OpPreparation::ApplyTensor(self); + + rsub_out_npu_nocheck(result, self, other, alpha); + + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e0abc60d25c187d4c6a14b26980cababf730a453 --- /dev/null +++ b/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp @@ -0,0 +1,54 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& tanh_backward_out_npu_nocheck( + at::Tensor& result, + const at::Tensor& grad_output, + const at::Tensor& self) { + OpCommand cmd; + cmd.Name("TanhGrad") + .Input(self) + .Input(grad_output) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::tanh_backward_out( + const at::Tensor& grad_output, + const at::Tensor& self, + at::Tensor& result) { + OpPreparation::CheckOut({grad_output, self}, result, self); + tanh_backward_out_npu_nocheck(result, grad_output, self); + return result; +} + +at::Tensor NPUNativeFunctions::tanh_backward(const at::Tensor& grad_output, const at::Tensor& self) { + at::Tensor result = OpPreparation::ApplyTensor(self); + tanh_backward_out_npu_nocheck(result, grad_output, self); + + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp b/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5ec2311735064899ec0674b1d59a82f62acd309f --- /dev/null +++ b/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp @@ -0,0 +1,55 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& NPUNativeFunctions::tanh_out(const at::Tensor& self, at::Tensor& result) { + OpCommand cmd; + cmd.Name("Tanh") + .Input(self) + .Output(result) + .Run(); + + return result; +} + +at::Tensor NPUNativeFunctions::tanh(const at::Tensor& self) { + at::Tensor result = OpPreparation::ApplyTensor(self); + // calculate the output result of the NPU + NPUNativeFunctions::tanh_out(self, result); + + return result; +} + +at::Tensor& NPUNativeFunctions::tanh_(at::Tensor& self) { + OpPreparation::CheckMemory({self}, {self}); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + at::Tensor result = NPUNativeFunctions::tanh_out(contiguousSelf, contiguousSelf); + NpuUtils::format_fresh_view(self, result); + } else { + NPUNativeFunctions::tanh_out(self, self); + } + + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..71a12cb5aaea916005229d4de6975d25b605c9c8 --- /dev/null +++ b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp @@ -0,0 +1,58 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& uniform_out_npu( + const at::Tensor& self, + double from, + double to, + c10::optional<at::Generator> gen_, + at::Tensor& result) { + OpCommand cmd; + cmd.Name("Uniform") + .Input(self) + .Output(result) + .Attr("from", static_cast<float>(from)) + .Attr("to", static_cast<float>(to)) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::uniform_(at::Tensor& self, double from, double to, c10::optional<at::Generator> gen_) { + // The operator needs to use fp32 for calculation. + at::Tensor selfCopy = self; + if (self.scalar_type() == at::ScalarType::Half) { + selfCopy = self.to(at::ScalarType::Float); + } + + if (!NpuUtils::check_match(&selfCopy)) { + at::Tensor selfContiguous = NpuUtils::format_contiguous(selfCopy); + at::Tensor result = uniform_out_npu(selfContiguous, from, to, gen_, selfContiguous); + NpuUtils::format_fresh_view(selfCopy, result); + } else { + uniform_out_npu(selfCopy, from, to, gen_, selfCopy); + } + self.copy_(selfCopy); + + return self; +} + +} // namespace native +} // namespace at_npu