From 62dd4e5d1f56b165cf81391f2336346949712543 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 9 Feb 2022 18:34:33 +0800 Subject: [PATCH 1/4] ge, index, le, masked_fill, ne, nonzero, rsub, tanh, uniform --- test/test_network_ops/test_ge.py | 301 +++++++++++++++++ test/test_network_ops/test_index.py | 204 +++++++++++ test/test_network_ops/test_le.py | 318 ++++++++++++++++++ test/test_network_ops/test_masked_fill.py | 145 ++++++++ test/test_network_ops/test_ne.py | 110 ++++++ test/test_network_ops/test_nonzero.py | 54 +++ test/test_network_ops/test_rsub.py | 171 ++++++++++ test/test_network_ops/test_tanh.py | 139 ++++++++ test/test_network_ops/test_tanh_backward.py | 101 ++++++ test/test_network_ops/test_uniform_.py | 50 +++ torch_npu/csrc/aten/ops/GeKernelNpu.cpp | 145 ++++++++ torch_npu/csrc/aten/ops/IndexKernelNpu.cpp | 74 ++++ torch_npu/csrc/aten/ops/LeKernelNpu.cpp | 134 ++++++++ .../csrc/aten/ops/MaskedFillKernelNpu.cpp | 117 +++++++ torch_npu/csrc/aten/ops/NeKernelNpu.cpp | 157 +++++++++ torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp | 63 ++++ torch_npu/csrc/aten/ops/RsubKernelNpu.cpp | 96 ++++++ .../csrc/aten/ops/TanhBackwardKernelNpu.cpp | 54 +++ torch_npu/csrc/aten/ops/TanhKernelNpu.cpp | 55 +++ torch_npu/csrc/aten/ops/UniformKernelNpu.cpp | 58 ++++ 20 files changed, 2546 insertions(+) create mode 100644 test/test_network_ops/test_ge.py create mode 100644 test/test_network_ops/test_index.py create mode 100644 test/test_network_ops/test_le.py create mode 100644 test/test_network_ops/test_masked_fill.py create mode 100644 test/test_network_ops/test_ne.py create mode 100644 test/test_network_ops/test_nonzero.py create mode 100644 test/test_network_ops/test_rsub.py create mode 100644 test/test_network_ops/test_tanh.py create mode 100644 test/test_network_ops/test_tanh_backward.py create mode 100644 test/test_network_ops/test_uniform_.py create mode 100644 torch_npu/csrc/aten/ops/GeKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/IndexKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/LeKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/NeKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/RsubKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/TanhKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/UniformKernelNpu.cpp diff --git a/test/test_network_ops/test_ge.py b/test/test_network_ops/test_ge.py new file mode 100644 index 0000000000..ff5ea46a02 --- /dev/null +++ b/test/test_network_ops/test_ge.py @@ -0,0 +1,301 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
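+#
+# Convention used by these tests: a shape_format entry is [dtype, npu_format,
+# shape], and create_common_tensor(entry, min, max) returns a matching
+# (cpu_tensor, npu_tensor) pair with values drawn from [min, max]. The
+# npu_format field appears to select the NPU private storage format
+# (-1 leaves the default layout).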
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestGe(TestCase): + def generate_scalar(self, min, max): + scalar = np.random.uniform(min, max) + return scalar + + def cpu_op_exec(self, input1, input2): + output = torch.ge(input1, input2) + output = output.numpy() + return output + + def cpu_op_exec_out(self, input1, input2, input3): + torch.ge(input1, input2, out = input3) + output = input3.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.ge(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, input3): + torch.ge(input1, input2, out = input3) + output = input3.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec(self, input1, input2): + output = input1.ge_(input2) + output = input1 + output = output.numpy() + return output + + def npu_op_inplace_exec(self, input1, input2): + output = input1.ge_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_scalar(self, input, scalar): + output = torch.ge(input, scalar) + output = output.numpy() + return output + + def cpu_op_exec_scalar_out(self, input1, scalar, input2): + torch.ge(input1, scalar, out = input2) + output = input2.numpy() + return output + + def npu_op_exec_scalar(self, input, scalar): + output = torch.ge(input, scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar_out(self, input1, scalar, input2): + torch.ge(input1, scalar, out = input2) + output = input2.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec_scalar(self, input, scalar): + output = input.ge_(scalar) + output = output.numpy() + return output + + def npu_op_inplace_exec_scalar(self, input, scalar): + output = input.ge_(scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def ge_tensor_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) + cpu_input3 = torch.randn(item[1][2])<0 + npu_input3 = cpu_input3.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + if cpu_input3.dtype == torch.float16: + cpu_input3 = cpu_input3.to(torch.float32) + cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) + cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_ge_tensor_out(self, device): + shape_format = [ + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [2, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [128, 232, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.ge_tensor_out_result(shape_format) + + def ge_scalar_out_result(self, shape_format): + for item in 
shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2 = torch.randn(item[1][2])<0 + npu_input2 = cpu_input2.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2) + npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2) + cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_ge_scalar_out(self, device): + shape_format = [ + [[np.float16, 0, [4, 4, 128, 128]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [12, 10, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [16, 3, 1111, 1212]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [16, 16, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [1313, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.ge_scalar_out_result(shape_format) + + def test_ge_bool(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + scalar_list = [True, False] + shape_format = [ + [[np.int32, i, j], k] for i in format_list for j in shape_list + for k in scalar_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100) + cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1]) + npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1]) + cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50) + npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50) + self.assertEqual(cpu_output1, npu_output1) + self.assertEqual(cpu_output2, npu_output2) + + def test_ge_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertEqual(cpu_output, npu_output) + + def test_ge_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_ge_scalar_int32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.int32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertEqual(cpu_output, npu_output) + + def test_ge_tensor_float32(self, device): + 
format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertEqual(cpu_output, npu_output) + + def test_ge_tensor_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_ge_inplace_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + self.assertEqual(cpu_output, npu_output) + + def test_ge_inplace_float16(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_ge_inplace_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + self.assertEqual(cpu_output, npu_output) + + def test_ge_inplace_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_ge_mix_dtype(self, device): + npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + npu_input3, npu_input4 = 
create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(npu_input1, npu_input3) + npu_output = self.npu_op_exec(npu_input2, npu_input4) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestGe, globals(), except_for="cpu") +if __name__ == '__main__': + run_tests() diff --git a/test/test_network_ops/test_index.py b/test/test_network_ops/test_index.py new file mode 100644 index 0000000000..723706ccde --- /dev/null +++ b/test/test_network_ops/test_index.py @@ -0,0 +1,204 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestIndex(TestCase): + def generate_index_data_bool(self, shape): + cpu_input = torch.randn(shape)>0 + npu_input = cpu_input.to("npu") + return cpu_input, npu_input + + def cpu_op_exec(self, input1, index): + output = input1[index] + output = output.numpy() + return output + + def npu_op_exec(self, input1, index): + output = input1[index] + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_ellip(self, input1, index): + output = input1[index, ..., index] + output = output.numpy() + return output + + def npu_op_exec_ellip(self, input1, index): + output = input1[index, ..., index] + output = output.cpu().numpy() + return output + + def cpu_op_exec_semi(self, input1, index): + output = input1[index, :, index] + output = output.numpy() + return output + + def npu_op_exec_semi(self, input1, index): + output = input1[index, :, index] + output = output.cpu().numpy() + return output + + def test_index_ellip(self, device): + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[5, 256, 256, 100]] + shape_format_tensor = [ + [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 0, 2) + cpu_output = self.cpu_op_exec_ellip(cpu_input1, cpu_index1) + npu_output = self.npu_op_exec_ellip(npu_input1, npu_index1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_semi(self, device): + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[5, 256, 256, 100]] + shape_format_tensor = [ + [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 0, 2) + cpu_output = self.cpu_op_exec_semi(cpu_input1, cpu_index1) + npu_output = self.npu_op_exec_semi(npu_input1, npu_index1) + self.assertRtolEqual(cpu_output, 
npu_output) + + def test_index_shape_format_tensor(self, device): + #test index is tensor + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_tensor = [ + [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_index1) + npu_output = self.npu_op_exec(npu_input1, npu_index1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_tensor_x(self, device): + #test index is [tensor, x] , (x=1,bool,range) + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_tensor = [ + [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) + for i in [1, range(2), True]: + cpu_output = self.cpu_op_exec(cpu_input1, (cpu_index1, i)) + npu_output = self.npu_op_exec(npu_input1, (npu_index1, i)) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_tensor_tensor(self, device): + #test index is [tensor, tensor] + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 1000]] + shape_format_multiTensor = [ + [[i, j, k], [np.int64, 0, [1,2]]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_multiTensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) + cpu_index2, npu_index2 = create_common_tensor(item[1], 1, 3) + cpu_output = self.cpu_op_exec(cpu_input1, (cpu_index1,cpu_index2)) + npu_output = self.npu_op_exec(npu_input1, (npu_index1, npu_index2)) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_list(self, device): + #test index is list + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_list = [ + [[i, j, k], (0,1)] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_list: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, item[1]) + npu_output = self.npu_op_exec(npu_input1, item[1]) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_list_x(self, device): + #test index is [list, x], (x=1,bool,range) + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_list = [ + [[i, j, k], (0,1)] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_list: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + for i in [1, range(2), (0, 1), True]: + cpu_output = self.cpu_op_exec(cpu_input1, (item[1], i)) + npu_output = self.npu_op_exec(npu_input1, (item[1], i)) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_tensor_bool(self, device): + #test index is bool tensor + dtype_list = [np.float32, np.float16, np.int32] + 
format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_tensor_bool = [ + [[i, j, k],k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor_bool: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index ,npu_index = self.generate_index_data_bool(item[1]) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_index) + npu_output = self.npu_op_exec(npu_input1, npu_index) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_bool_x(self, device): + #test index is [bool, x] , (x=1,bool,range) + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + index_list = [(True), (False), (True, 1), (True,range(4)), (True,False)] + shape_format_tensor_bool_list = [ + [[i, j, k], l] for i in dtype_list for j in format_list for k in shape_list for l in index_list + ] + + for item in shape_format_tensor_bool_list: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, item[1]) + npu_output = self.npu_op_exec(npu_input1, item[1]) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestIndex, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_le.py b/test/test_network_ops/test_le.py new file mode 100644 index 0000000000..b5bcbacb61 --- /dev/null +++ b/test/test_network_ops/test_le.py @@ -0,0 +1,318 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
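+#
+# Note: the half-precision cases below build the CPU reference in float32 and
+# cast the result back to float16 before comparison, because CPU support for
+# float16 reference ops is limited; the NPU side runs float16 directly.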
+import copy +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestLe(TestCase): + def generate_scalar(self, min, max): + scalar = np.random.uniform(min, max) + return scalar + + def cpu_op_exec(self, input1, input2): + output = torch.le(input1, input2) + output = output.numpy() + return output + + def cpu_op_exec_out(self, input1, input2, input3): + torch.le(input1, input2, out = input3) + output = input3.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.le(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec(self, input1, input2): + output = input1.le_(input2) + output = input1 + output = output.numpy() + return output + + def npu_op_inplace_exec(self, input1, input2): + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, output): + torch.le(input1, input2, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_scalar(self, input, scalar): + output = torch.le(input, scalar) + output = output.numpy() + return output + + def cpu_op_exec_scalar_out(self, input1, scalar, input2): + torch.le(input1, scalar, out = input2) + output = input2.numpy() + return output + + def npu_op_exec_scalar(self, input, scalar): + output = torch.le(input, scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec_scalar(self, input, scalar): + output = input.le_(scalar) + output = output.numpy() + return output + + def npu_op_inplace_exec_scalar(self, input, scalar): + input = input.to("npu") + output = input.le_(scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar_out(self, input, scalar, output): + torch.le(input, scalar, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_stride_exec(self, input1, input2): + input1 = input1.as_strided([2, 2], [1, 2], 1) + input2 = input2.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.numpy() + return output + + def npu_op_inplace_stride_exec(self, input1, input2): + input1 = input1.to("npu") + input2 = input2.to("npu") + input1 = input1.as_strided([2, 2], [1, 2], 1) + input2 = input2.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_stride_scalar_exec(self, input1, input2): + input1 = input1.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.numpy() + return output + + def npu_op_inplace_stride_scalar_exec(self, input1, input2): + input1 = input1.to("npu") + input1 = input1.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def le_tensor_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) + cpu_input3 = torch.randn(item[1][2])<0 + npu_input3 = cpu_input3.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = 
cpu_input2.to(torch.float32)
+            if cpu_input3.dtype == torch.float16:
+                cpu_input3 = cpu_input3.to(torch.float32)
+            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_le_tensor_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [2, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [128, 232, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.le_tensor_out_result(shape_format)
+
+    def le_scalar_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2 = torch.randn(item[1][2])<0
+            npu_input2 = cpu_input2.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
+            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_le_scalar_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [12, 4, 12, 121]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [12, 10, 14, 111]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 2, [16, 3, 11, 121, 21]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [16, 16, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 2, [1313, 3, 3, 3, 121]], [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.le_scalar_out_result(shape_format)
+
+    def test_le_scalar_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_scalar_int32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.int32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_scalar_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output
= self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_le_tensor_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertEqual(cpu_output, npu_output) + + def test_le_tensor_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_le_inplace_float32(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + self.assertEqual(cpu_output, npu_output) + + def test_le_inplace_float16(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_le_inplace_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + scalar1 = copy.deepcopy(scalar) + ncpu_input = copy.deepcopy(cpu_input) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar1) + self.assertEqual(cpu_output, npu_output) + + def test_le_inplace_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = 
self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_le_mix_dtype(self, device): + npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(npu_input1, npu_input3) + npu_output = self.npu_op_exec(npu_input2, npu_input4) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestLe, globals(), except_for="cpu") +if __name__ == '__main__': + run_tests() diff --git a/test/test_network_ops/test_masked_fill.py b/test/test_network_ops/test_masked_fill.py new file mode 100644 index 0000000000..256e366b29 --- /dev/null +++ b/test/test_network_ops/test_masked_fill.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestMaskedFill(TestCase): + def create_bool_tensor(self, shape, minValue, maxValue): + input1 = np.random.uniform(minValue, maxValue, shape) + input1 = input1 > 0.5 + cpu_input = torch.from_numpy(input1) + npu_input = torch.from_numpy(input1).to("npu") + return cpu_input, npu_input + + def cpu_op_exec(self, input1, mask, value): + output = torch.masked_fill(input1, mask, value) + output = output.numpy() + return output + + def npu_op_exec(self, input1, mask, value): + output = torch.masked_fill(input1, mask, value) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_inp_op_exec(self, input1, mask, value): + output = input1.masked_fill_(mask, value) + output = output.numpy() + return output + + def npu_inp_op_exec(self, input1, mask, value): + output = input1.masked_fill_(mask, value) + output = output.to("cpu") + output = output.numpy() + return output + + def test_masked_fill_shape_format_fp16(self, device): + format_list = [0] + shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] + value_list = [1.25, + torch.tensor(1.25, dtype=torch.float32), + torch.tensor(5, dtype=torch.int32), + torch.tensor(5, dtype=torch.int64)] + + shape_format = [[[np.float16, i, j], v] for i in format_list for j in shape_list for v in value_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) + cpu_input1 = cpu_input1.to(torch.float32) + + cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) + cpu_output1 = cpu_output1.astype(npu_output1.dtype) + self.assertRtolEqual(cpu_output1, npu_output1) + + cpu_output2 = 
self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) + cpu_output2 = cpu_output2.astype(npu_output2.dtype) + self.assertRtolEqual(cpu_output2, npu_output2) + + def test_masked_fill_shape_format_fp32(self, device): + format_list = [0] + shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] + value_list = [1.25, + torch.tensor(1.25, dtype=torch.float32), + torch.tensor(5, dtype=torch.int32), + torch.tensor(5, dtype=torch.int64)] + + shape_format = [[[np.float32, i, j], v] for i in format_list for j in shape_list for v in value_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) + + cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) + self.assertRtolEqual(cpu_output1, npu_output1) + + cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) + self.assertRtolEqual(cpu_output2, npu_output2) + + def test_masked_fill_shape_format_int32(self, device): + format_list = [0] + shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] + value_list = [1.25, + torch.tensor(1.25, dtype=torch.float32), + torch.tensor(5, dtype=torch.int32), + torch.tensor(5, dtype=torch.int64)] + + shape_format = [[[np.int32, i, j], v] for i in format_list for j in shape_list for v in value_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) + + cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) + self.assertRtolEqual(cpu_output1, npu_output1) + + cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) + self.assertRtolEqual(cpu_output2, npu_output2) + + def test_masked_fill_shape_format_int64(self, device): + format_list = [0] + shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] + value_list = [1.25, + torch.tensor(1.25, dtype=torch.float32), + torch.tensor(5, dtype=torch.int32), + torch.tensor(5, dtype=torch.int64)] + + shape_format = [[[np.int64, i, j], v] for i in format_list for j in shape_list for v in value_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) + + cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) + cpu_output1 = cpu_output1.astype(np.int32) + npu_output1 = npu_output1.astype(np.int32) + self.assertRtolEqual(cpu_output1, npu_output1) + + cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) + cpu_output2 = cpu_output2.astype(np.int32) + npu_output2 = npu_output2.astype(np.int32) + self.assertRtolEqual(cpu_output2, npu_output2) + +instantiate_device_type_tests(TestMaskedFill, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_ne.py b/test/test_network_ops/test_ne.py new file mode 100644 index 0000000000..b22864a65e --- /dev/null +++ b/test/test_network_ops/test_ne.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020, Huawei 
Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestNe(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.ne(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.ne(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        input3 = torch.empty(0).bool().npu()
+        torch.ne(input1, input2, out=input3)
+        output = input3.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_ne_shape_format_fp32(self, device):
+        dtype_list = [np.float32]
+        format_list = [0, 3]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [d, i, j] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ne_shape_format_fp16(self, device):
+        dtype_list = [np.float16]
+        format_list = [0, 3]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [d, i, j] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            # cast each half-precision input to float32 independently for the CPU reference
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ne_out_shape_format_fp32(self, device):
+        dtype_list = [np.float32]
+        format_list = [0]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -10, 10)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def test_ne_scalar_out_shape_format_fp32(self, device):
+        dtype_list = [np.float32]
+        format_list = [0]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10)
+            npu_output_out = self.npu_op_exec_out(npu_input1, 5)
+            npu_output = self.npu_op_exec(npu_input1, 5)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def test_ne_mix_dtype(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
+        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input2, npu_input4)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestNe, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_network_ops/test_nonzero.py b/test/test_network_ops/test_nonzero.py
new file mode 100644
index 0000000000..116c3dbb52
--- /dev/null
+++ b/test/test_network_ops/test_nonzero.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor

+class TestNonzero(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.nonzero(input1)
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.nonzero(input1)
+        output = output.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def test_nonzero_shape_format(self, device):
+        dtype_list = [np.float32, np.float16, np.int32, np.int64]
+        format_list = [0]
+        shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]]
+
+        shape_format = [
+            [[i, j, k]] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestNonzero, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_rsub.py b/test/test_network_ops/test_rsub.py
new file mode 100644
index 0000000000..9b2167d78f
--- /dev/null
+++ b/test/test_network_ops/test_rsub.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+
+class TestRsub(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = input2 - input1
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = input2 - input1
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        output = input1 - input2
+        output = output.to("cpu")
+        output = output.numpy()
+        output = -output
+        return output
+
+    def rsub_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def rsub_scalar_result(self, shape_format):
+        for item in shape_format:
+            scalar = np.random.uniform(0, 100)
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, scalar)
+            npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar)
+
+            cpu_output = cpu_output.astype(npu_output_scalar.dtype)
+            self.assertRtolEqual(cpu_output, npu_output_scalar)
+
+    def test_rsub_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float32, i, [32]], [np.float32, i, [32]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [5, 3]], [np.float32, i, [5, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [256, 480, 14]], [np.float32, i, [256, 480, 14]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, [32, 3, 3, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
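+    # torch.rsub(input, other) computes other - input; the helpers above
+    # express it through plain subtraction (input2 - input1, or negating
+    # input - scalar to recover scalar - input in the scalar case).
+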
+    # int-------------------------------------------------------------------------------
+    def test_rsub_shape_format_int32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [32]], [np.int32, i, [32]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_int32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [5, 3]], [np.int32, i, [5, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_int32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [256, 480, 14]], [np.int32, i, [256, 480, 14]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_int32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [32, 3, 3, 3]], [np.int32, i, [32, 3, 3, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    # scalar----------------------------------------------------------------------------
+    def test_rsub_scalar_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [32]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32, 64]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [32, 64]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [32, 64, 128]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [32, 64, 128, 28]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+
+instantiate_device_type_tests(TestRsub, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_network_ops/test_tanh.py b/test/test_network_ops/test_tanh.py
new file mode 100644
index 0000000000..28c8c0789f
--- /dev/null
+++ b/test/test_network_ops/test_tanh.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
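+#
+# tanh saturates quickly: for |x| beyond roughly 20 the result is within
+# floating-point rounding of +/-1, so the extreme input ranges below (e.g.
+# 21474836 or -214748) exercise the saturated region, while the small
+# fractional ranges probe the near-linear region around zero.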
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestTanh(TestCase): + def cpu_op_exec(self, input1): + output = torch.tanh(input1) + output = output.numpy() + return output + + def npu_op_exec(self, input1): + input1 = input1.to("npu") + output = torch.tanh(input1) + output = output.to("cpu") + output = output.numpy() + return output + + def test_tanh_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (4, 3, 3)], 1, 100], + [[np.float32, -1, (7,5,5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746], + [[np.float32, -1, (1024, 448, 448)], 200, 300], + [[np.float32, -1, (128, 3, 5)], 0.3219780311757745 , 92 ], + [[np.float32, -1, (8, 7, 7)], 0.4820305734500543 , 28], + [[np.float32, -1, (15, 8, 8)],0.8563874665918477 , 98], + [[np.float32, -1, (11, 6, 6)], 0.0694198357720135 , 50], + [[np.float32, -1, (24, 24, 3)], -2,-2], + [[np.float32, -1, (6, 10, 10)], 0.6447298684351989 , 95], + [[np.float32, -1, (3, 9, 9)], 0.8723538084975545 , 85], + [[np.float32, -1, (5, 5, 5)], 0.8283759153463854 , 71], + [[np.float32, -1, (5, 1, 1)], 0.24718684227306953 , 25], + [[np.float32, -1, (14, 7, 7)], 0.3989186243492233 , 7 ], + [[np.float32, -1, (4, 10, 10)], 0.7866457165672994 , 5], + [[np.float32, -1, (3, 7, 7)], 0.3793216987112159 , 39], + [[np.float32, -1, (2, 8, 8)], 0.9662927186969077 , 5 ], + [[np.float32, -1, (3, 7, 7)], 0.9956475043306917 , 28], + [[np.float32, -1, (7, 10, 10)], 0.769565434387681 , 9], + [[np.float32, -1, (54, 93, 3)],0.6447298684351989 , 95], + [[np.float32, -1, (6, 3, 3)], 0.03133650248813469 , 37 ], + [[np.float32, -1, (65500, 1, 1)], 95, 100], + [[np.float32, -1, (6, 3, 10)], 0.03133650248813469 , 37], + + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1): + input1 = input1.to(torch.float32) + output = torch.tanh(input1) + output = output.numpy() + output = output.astype(np.float16) + return output + + shape_format = [ + [[np.float16, -1, (65500, 1)], 212,225], + [[np.float16, -1, (1024,448,448)], 200, 300], + [[np.float16, -1, (16,16)], -1000, -100], + [[np.float16, -1, (4,1)], -1.1754943508e-38,-1.1754943508e-38], + [[np.float16, -1, (7, 5, 5)], 21474836,21474837], + [[np.float16, -1, (4, 44, 44)], 3450,34020], + [[np.float16, -1, (65500, 3, 3)], -214748,-214746], + [[np.float16, -1, (64, 4, 4)], -9.313225746154785e-10,9.313225746154785e-10], + [[np.float16, -1, (128, 3, 5)], -0.000000000000000000000000000000000000011754943508,0.000000000000000000000000000000000000011754943508], + [[np.float16, -1, (1, 1, 1)], 0.9283381566708346 , 16], + [[np.float16, -1, (6, 3, 10)], 0.03133650248813469 , 37], + [[np.float16, -1, (65500, 1, 1)], 95, 100 ], + [[np.float16, -1, (13, 5, 5)], 0.9790231845699171 , 41], + [[np.float16, -1, (5, 7, 7)], 0.7852605507867441 , 87 ], + [[np.float16, -1, (13, 2, 2)],0.8758750778305631 , 82], + [[np.float16, -1, (14, 6, 6)],0.6963691068720794 , 92], + [[np.float16, -1, (5, 6, 6)], 0.7570129172808612 , 21], + [[np.float16, -1, (1, 10, 10)], 0.990800730328874 , 86], + 
[[np.float16, -1, (4, 5, 5)], 0.7349293532899402 , 35], + [[np.float16, -1, (6, 4, 4)], 0.7349293532899402, 35], + [[np.float16, -1, (5, 8, 8)],0.9583309378850908 , 60], + + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_exec_fp16(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_inplace_common_shape_format(self, device): + def cpu_op_inplace_exec(input1): + output = torch.tanh_(input1) + output = output.numpy() + return output + + def npu_op_inplace_exec(input1): + input1 = input1.to("npu") + output = torch.tanh_(input1) + output = output.to("cpu") + output = output.numpy() + return output + + shape_format = [ + [[np.float32, -1, (4, 3, 3)], 1, 100], + [[np.float32, -1, (7,5,5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746] + + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_inplace_exec(cpu_input1) + npu_output = npu_op_inplace_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestTanh, globals(), except_for='cpu') + +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_tanh_backward.py b/test/test_network_ops/test_tanh_backward.py new file mode 100644 index 0000000000..1e108d1960 --- /dev/null +++ b/test/test_network_ops/test_tanh_backward.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
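test_tanh_backward, added next, validates the gradient through autograd. The TanhGrad
kernel later in this patch needs only the forward output, exploiting the identity
d tanh(x)/dx = 1 - tanh(x)^2. A quick CPU-only check of that identity:

    import torch

    x = torch.randn(8, requires_grad=True)
    y = torch.tanh(x)
    y.backward(torch.ones_like(y))
    # grad_output is all ones here, so x.grad equals 1 - tanh(x)^2
    print(torch.allclose(x.grad, 1 - y.detach() ** 2))  # True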
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestTanhBackward(TestCase): + + def cpu_op_exec(self, input1): + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad.numpy() + return output + + def npu_op_exec(self, input1): + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad + output = output.to("cpu") + output = output.numpy() + return output + + def test_tanh_backward_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (4, 3)], 1, 100], + [[np.float32, -1, (7, 5, 5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746], + [[np.float32, -1, (1024, 448, 448)], 200, 300], + [[np.float32, -1, (24, 24, 3)], -2,-2], + [[np.float32, -1, (3, 7, 7)], 0.3793216987112159, 1], + [[np.float32, -1, (2, 8, 8)], 0.9662927186969077, 1], + [[np.float32, -1, (3, 7, 7)], 0.9956475043306917, 2], + [[np.float32, -1, (7, 10, 10)], 0.769565434387681, 3], + [[np.float32, -1, (65500, 1, 1)], 95, 100], + [[np.float32, -1, (6, 3, 10)], 0.03133650248813469 , 2], + [[np.float32, -1, (4, 3, 3, 3, 3, 3, 3, 3)], 0, 1], + [[np.float32, -1, (5,)], 0, 1], + [[np.float32, -1, (5,5,5,5,5,5)], 1, 2], + [[np.float32, -1, (5,5,5,5,5,5)], 2, 3], + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_backward_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1): + input1 = input1.to(torch.float32) + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad.numpy() + output = output.astype(np.float16) + return output + + shape_format = [ + [[np.float16, -1, (65500, 1)], 212, 225], + [[np.float16, -1, (1024, 448, 448)], 200, 300], + [[np.float16, -1, (16, 16)], -1000, -100], + [[np.float16, -1, (4, 1)], -1.1754943508e-38, -1.1754943508e-38], + [[np.float16, -1, (7, 5, 5)], 21474836, 21474837], + [[np.float16, -1, (4, 44, 44)], 3450, 34020], + [[np.float16, -1, (65500, 3, 3)], -214748, -214746], + [[np.float16, -1, (64, 4, 4)], -9.313225746154785e-10,9.313225746154785e-10], + [[np.float16, -1, (128, 3, 5)], + -0.000000000000000000000000000000000000011754943508, + 0.000000000000000000000000000000000000011754943508], + [[np.float16, -1, (65500, 1, 1)], 95, 100], + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_exec_fp16(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestTanhBackward, globals(), except_for='cpu') + +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_uniform_.py b/test/test_network_ops/test_uniform_.py new file mode 100644 index 0000000000..893adf140e --- /dev/null +++ b/test/test_network_ops/test_uniform_.py @@ -0,0 +1,50 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. 
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestUniform(TestCase):
+    def test_uniform(self, device):
+        shape_format = [
+            [(20,300), -100, 100, torch.float32],
+            [(20,300), -100, 100, torch.float16]
+        ]
+
+        for item in shape_format:
+            input1 = torch.zeros(item[0], dtype=item[3]).npu()
+            input1.uniform_(item[1], item[2])
+            self.assertTrue(item[1] <= input1.min())
+            self.assertTrue(item[2] >= input1.max())
+
+    def test_uniform_trans(self, device):
+        shape_format = [
+            [(20,300), -100, 100, torch.float32],
+        ]
+
+        for item in shape_format:
+            input1 = torch.zeros(item[0], dtype=item[3]).npu()
+            input1 = input1.npu_format_cast(3)
+            input1.uniform_(item[1], item[2])
+            self.assertTrue(item[1] <= input1.min())
+            self.assertTrue(item[2] >= input1.max())
+
+
+instantiate_device_type_tests(TestUniform, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch_npu/csrc/aten/ops/GeKernelNpu.cpp b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp
new file mode 100644
index 0000000000..65448746f6
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp
@@ -0,0 +1,145 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
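Because uniform_ draws random values, test_uniform_ above can only assert range
containment rather than exact results; note that UniformKernelNpu.cpp later in this
patch does not consume the generator argument, so draws come from the device-side
Uniform operator. A sketch of the containment check (assumes a reachable NPU device):

    import torch
    import torch_npu

    t = torch.zeros(20, 300, dtype=torch.float32).npu()
    t.uniform_(-100, 100)  # in-place draw from U[-100, 100)
    assert -100 <= t.min().item() and t.max().item() <= 100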
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& ge_out_npu_nocheck(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor selfCast = self; + at::Tensor otherCast = other; + if (self.dtype() == at::ScalarType::Int || other.dtype() == at::ScalarType::Int + || self.dtype() == at::ScalarType::Bool || other.dtype() == at::ScalarType::Bool) { + selfCast = self.to(at::ScalarType::Float); + otherCast = other.to(at::ScalarType::Float); + } + auto unified_result = OpPreparation::comparison_op_check(result, selfCast, otherCast, true); + OpCommand cmd; + cmd.Name("GreaterEqual") + .Expect(unified_result) + .Input(selfCast) + .Input(otherCast) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ge_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor& ge_out_npu_nocheck(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor selfCast = self; + if (self.dtype() == at::ScalarType::Int || self.dtype() == at::ScalarType::Bool) { + selfCast = self.to(at::ScalarType::Float); + } + OpCommand cmd; + cmd.Name("GreaterEqual") + .Input(selfCast) + .Input(other, selfCast.scalar_type()) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ge_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + auto outputSize = formatCastOfSelf.sizes(); + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + ge_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + outputSize, + formatCastOfSelf.options().dtype(kBool), + ACL_FORMAT_ND); + ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, Scalar other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + formatCastOfSelf.sizes(), + formatCastOfSelf.options().dtype(kBool), + ACL_FORMAT_ND); + ge_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor& NPUNativeFunctions::ge_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, ori_other}, {self}); + + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = 
NpuUtils::format_contiguous(self);
+    ge_out_npu_nocheck(contiguousSelf, ori_other, result);
+  } else {
+    ge_out_npu_nocheck(self, ori_other, result);
+  }
+  self.copy_(result);
+  return self;
+}
+
+at::Tensor& NPUNativeFunctions::ge_(at::Tensor& self, at::Scalar other) {
+  OpPreparation::CastBackToOriFormat(self);
+  OpPreparation::CheckMemory({self}, {self});
+  at::Tensor result = OpPreparation::ApplyTensor(
+      self,
+      self.options().dtype(at::ScalarType::Byte));
+  if (!NpuUtils::check_match(&self)) {
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    ge_out_npu_nocheck(contiguousSelf, other, result);
+  } else {
+    ge_out_npu_nocheck(self, other, result);
+  }
+  self.copy_(result);
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
new file mode 100644
index 0000000000..72d7984b0e
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
@@ -0,0 +1,74 @@
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/framework/utils/NpuUtils.h"
+#include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& index_out_nocheck_npu(
+    const at::Tensor& self,
+    const at::Tensor& masksTensor,
+    const at::TensorList& allDefinedIndices,
+    at::Tensor& result) {
+  OpCommand cmd;
+  cmd.Name("Index")
+      .Input(self)
+      .Input(masksTensor);
+  for (int i = 0; i < allDefinedIndices.size(); i++) {
+    cmd.Input(allDefinedIndices[i]);
+  }
+  cmd.Output(result)
+      .Run();
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::index(const at::Tensor& self, const torch::List<c10::optional<at::Tensor>>& orig) {
+  checkIndexTensorTypes(orig);
+  // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
+  auto indices = expandTensors(self, orig);
+  at::Tensor formatCastOfSelf = self.npu_format_cast(ACL_FORMAT_ND);
+
+  // calculate the output size
+  auto outputSize = index_npu_output_size(formatCastOfSelf, indices);
+
+  // construct the output tensor of the NPU
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(formatCastOfSelf, outputSize, ACL_FORMAT_ND);
+
+  // masks corresponds to indices. 0 indicates undefined tensor.
+  SmallVector<int64_t, N> masks;
+  std::vector<at::Tensor> allDefinedIndices;
+  for (int64_t i = 0; i < indices.size(); i++) {
+    if (indices[i].defined()) {
+      masks.emplace_back(1);
+      allDefinedIndices.emplace_back(indices[i]);
+    } else {
+      masks.emplace_back(0);
+    }
+  }
+
+  at::Tensor masksTensor = CalcuOpUtil::copy_tensor_host_to_device(
+      from_blob(masks.data(), {masks.size()}, dtype(at::ScalarType::Long)));
+
+  // calculate the output result of the NPU
+  index_out_nocheck_npu(formatCastOfSelf, masksTensor, allDefinedIndices, result);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/LeKernelNpu.cpp b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp
new file mode 100644
index 0000000000..dfd9c4d584
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp
@@ -0,0 +1,134 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& le_out_npu_nocheck(const at::Tensor& self, at::Scalar other, at::Tensor& result) {
+  OpCommand cmd;
+  cmd.Name("LessEqual")
+      .Input(self)
+      .Input(other, self.scalar_type())
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::le_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  auto outputSize = formatCastOfSelf.sizes();
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      ACL_FORMAT_ND,
+      result.scalar_type(),
+      outputSize);
+
+  le_out_npu_nocheck(formatCastOfSelf, other, result);
+  return result;
+}
+
+at::Tensor& le_out_npu_nocheck(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) {
+  auto unified_result = OpPreparation::comparison_op_check(result, self, other, true);
+  OpCommand cmd;
+  cmd.Name("LessEqual")
+      .Expect(unified_result)
+      .Input(self)
+      .Input(other)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::le_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other);
+  auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
+
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      ACL_FORMAT_ND,
+      result.scalar_type(),
+      outputSize);
+
+  le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result);
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::le(const at::Tensor& self, at::Scalar other) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
+      formatCastOfSelf.sizes(),
+      formatCastOfSelf.options().dtype(kBool),
+      ACL_FORMAT_ND);
+  le_out_npu_nocheck(formatCastOfSelf, other, result);
+  return result;
+} + +at::Tensor NPUNativeFunctions::le(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + outputSize, + formatCastOfSelf.options().dtype(kBool), + ACL_FORMAT_ND); + + le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor& NPUNativeFunctions::le_(at::Tensor& self, at::Scalar other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckMemory({self}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + le_out_npu_nocheck(contiguousSelf, other, result); + } else { + le_out_npu_nocheck(self, other, result); + } + self.copy_(result); + return self; +} + +at::Tensor& NPUNativeFunctions::le_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, ori_other}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + le_out_npu_nocheck(contiguousSelf, ori_other, result); + } else { + le_out_npu_nocheck(self, ori_other, result); + } + self.copy_(result); + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp new file mode 100644 index 0000000000..847896ec3a --- /dev/null +++ b/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp @@ -0,0 +1,117 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
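Returning to the Index kernel added earlier in this patch: masks records, per position
of the index list, whether a real index tensor was supplied (1) or the dimension was
skipped (0), so x[:, idx] is encoded as masks = [0, 1] with allDefinedIndices = [idx].
The eager-mode behaviour being reproduced (CPU-only sketch):

    import torch

    x = torch.arange(12).reshape(3, 4)
    idx = torch.tensor([0, 2])
    # dim 0 has no index tensor (mask 0); dim 1 is indexed by idx (mask 1)
    print(torch.equal(x[:, idx], x.index_select(1, idx)))  # True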
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& NPUNativeFunctions::masked_fill_out(const at::Tensor& self, const at::Tensor& mask, const at::Tensor& value, at::Tensor& result) { + OpPreparation::CheckOut( + {self}, + result, + self); + at::Tensor maskBool = mask; + int64_t dimOfSelf = self.dim(); + + /* Avoid the problem that the TBE operator does not support 0-dimensional tensor input */ + if (dimOfSelf == 0) { + self.unsqueeze_(0); + } + + if ((mask.dtype() != at::kBool)) { + maskBool = mask.to(at::kBool); + } + at::Tensor valueTensor = value; + if (value.dtype() != self.dtype()) { + valueTensor = valueTensor.to(self.dtype()); + } + + OpCommand cmd; + cmd.Name("MaskedFill") + .Input(self) + .Input(maskBool) + .Input(valueTensor) + .Output(result) + .Run(); + + if (dimOfSelf == 0) { + result.squeeze_(0); + } + + return result; +} + +at::Tensor& NPUNativeFunctions::masked_fill_out(const at::Tensor& self, const at::Tensor& mask, at::Scalar value, at::Tensor& result) { + OpPreparation::CheckOut( + {self}, + result, + CalcuOpUtil::get_tensor_npu_format(self), + self.scalar_type(), + self.sizes()); + at::Tensor maskBool = mask; + int64_t dimOfSelf = self.dim(); + + /* Avoid the problem that the TBE operator does not support 0-dimensional tensor input */ + if (dimOfSelf == 0) { + self.unsqueeze_(0); + } + + if (!(mask.dtype() == at::kBool)) { + maskBool = mask.to(at::kBool); + } + + OpCommand cmd; + cmd.Name("MaskedFill") + .Input(self) + .Input(maskBool) + .Input(value, self.scalar_type()) + .Output(result) + .Run(); + + if (dimOfSelf == 0) { + result.squeeze_(0); + } + return result; +} + +at::Tensor& NPUNativeFunctions::masked_fill_(at::Tensor& self, const at::Tensor& mask, const at::Tensor& value) { + // OpPreparation::CheckMemory({self, mask, value}, {self}); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + at::Tensor result = masked_fill_out(contiguousSelf, mask, value, contiguousSelf); + self.copy_(result); + } else { + masked_fill_out(self, mask, value, self); + } + return self; +} + +at::Tensor& NPUNativeFunctions::masked_fill_(at::Tensor& self, const at::Tensor& mask, at::Scalar value) { + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + at::Tensor result = masked_fill_out(contiguousSelf, mask, value, contiguousSelf); + self.copy_(result); + } else { + masked_fill_out(self, mask, value, self); + } + + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/NeKernelNpu.cpp b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp new file mode 100644 index 0000000000..d26ae3c535 --- /dev/null +++ b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, const at::Tensor& other) {
+  at::Tensor selfCast = self;
+  at::Tensor otherCast = other;
+  if (self.dtype() == at::ScalarType::Int || other.dtype() == at::ScalarType::Int) {
+    selfCast = self.to(at::ScalarType::Float);
+    otherCast = other.to(at::ScalarType::Float);
+  }
+  auto unified_result = OpPreparation::comparison_op_check(result, selfCast, otherCast, true);
+  if (self.scalar_type() == at::kLong) {
+    TORCH_WARN_ONCE("The operator ne was executed with an int64 input; a high-accuracy but "
+        "low-performance 64-bit implementation is currently used. Please cast to a 32-bit "
+        "dtype on the Python side for better performance!");
+  }
+  OpCommand cmd;
+  cmd.Name("NotEqual")
+      .Expect(unified_result)
+      .Input(selfCast)
+      .Input(otherCast)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, at::Scalar other) {
+  at::Tensor selfCast = self;
+  if (self.dtype() == at::ScalarType::Int) {
+    selfCast = self.to(at::ScalarType::Float);
+  }
+  if (self.scalar_type() == at::kLong) {
+    TORCH_WARN_ONCE("The operator ne was executed with an int64 input; a high-accuracy but "
+        "low-performance 64-bit implementation is currently used. Please cast to a 32-bit "
+        "dtype on the Python side for better performance!");
+  }
+  OpCommand cmd;
+  cmd.Name("NotEqual")
+      .Input(selfCast)
+      .Input(other, selfCast.scalar_type())
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other);
+  auto outputSize = broadcast_ops_npu_output_size(self, other);
+  OpPreparation::CheckOut(
+      {self, other},
+      result,
+      CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf),
+      ScalarType::Bool,
+      IntArrayRef(outputSize));
+  ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther);
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf),
+      ScalarType::Bool,
+      formatCastOfSelf.sizes());
+  ne_out_npu_nocheck(result, formatCastOfSelf, other);
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, const at::Tensor& other) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other);
+
+  auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
+  at::Tensor result = OpPreparation::ApplyTensor(
+      outputSize,
+      formatCastOfSelf.options().dtype(kBool),
+      formatCastOfSelf);
+
+  ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther);
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, at::Scalar other) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+
+  at::Tensor result = OpPreparation::ApplyTensor(
+      formatCastOfSelf,
+      
formatCastOfSelf.options().dtype(kBool));
+
+  ne_out_npu_nocheck(result, formatCastOfSelf, other);
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::ne_(at::Tensor& self, const at::Tensor& other) {
+  OpPreparation::CastBackToOriFormat(self);
+  at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other);
+  OpPreparation::CheckMemory({self, ori_other}, {self});
+
+  at::Tensor result = OpPreparation::ApplyTensor(
+      self,
+      self.options().dtype(at::ScalarType::Byte));
+
+  if (!NpuUtils::check_match(&self)) {
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    ne_out_npu_nocheck(result, contiguousSelf, ori_other);
+  } else {
+    ne_out_npu_nocheck(result, self, ori_other);
+  }
+
+  self.copy_(result);
+
+  return self;
+}
+
+at::Tensor& NPUNativeFunctions::ne_(at::Tensor& self, at::Scalar other) {
+  OpPreparation::CastBackToOriFormat(self);
+  OpPreparation::CheckMemory({self}, {self});
+  at::Tensor result = OpPreparation::ApplyTensor(
+      self,
+      self.options().dtype(at::ScalarType::Byte));
+
+  if (!NpuUtils::check_match(&self)) {
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    ne_out_npu_nocheck(result, contiguousSelf, other);
+  } else {
+    ne_out_npu_nocheck(result, self, other);
+  }
+
+  self.copy_(result);
+
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp
new file mode 100644
index 0000000000..c2f86b65a6
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp
@@ -0,0 +1,63 @@
+// Copyright (c) 2020, Huawei Technologies.
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
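As in the ge kernel, ne temporarily promotes int32 inputs to float before invoking
NotEqual, and warns on int64 input; the comparison result itself is boolean. Expected
eager behaviour (assumes a reachable NPU device):

    import torch
    import torch_npu

    a = torch.tensor([1, 2, 3], dtype=torch.int32).npu()
    b = torch.tensor([1, 0, 3], dtype=torch.int32).npu()
    out = torch.ne(a, b)
    print(out.dtype)  # torch.bool
    print(out.cpu())  # tensor([False,  True, False])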
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& nonzero_out_npu_nocheck(at::Tensor& result, const at::Tensor& self) { + OpCommand cmd; + cmd.Name("NonZero") + .Input(self) + .Output(result) + .Attr("transpose", false) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::nonzero_out(const at::Tensor& self, at::Tensor& result) { + auto outputSize = nonzero_npu_output_size(self); + OpPreparation::CheckOut( + {self}, + result, + CalcuOpUtil::get_tensor_npu_format(self), + ScalarType::Long, + outputSize); + + OpPipeWithDefinedOut pipe; + return pipe.CheckMemory({self}, {result}) + .Func([&self](at::Tensor& result){nonzero_out_npu_nocheck(result, self);}) + .Call(result); +} + +at::Tensor NPUNativeFunctions::nonzero(const at::Tensor& self) { + // calculate the output size + auto outputSize = nonzero_npu_output_size(self); + + // construct the output tensor of the NPU + at::Tensor result = OpPreparation::ApplyTensor( + outputSize, self.options().dtype(at::kLong), self); + + // calculate the output result of the NPU + nonzero_out_npu_nocheck(result, self); + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp b/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp new file mode 100644 index 0000000000..4c8200b577 --- /dev/null +++ b/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp @@ -0,0 +1,96 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor rsub_dest_output(const at::Tensor& self, const at::Tensor& other) { + bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self); + + return isSelfWrapped ? 
other : self;
+}
+
+at::Tensor& rsub_out_npu_nocheck(
+    at::Tensor& result,
+    const at::Tensor& self,
+    const at::Tensor& other,
+    at::Scalar alpha) {
+  // scale self by alpha (rsub computes other - alpha * self)
+  at::Tensor otherMulResult;
+  if (!CalcuOpUtil::is_scalar_one(alpha)) {
+    otherMulResult = at::mul(self, alpha);
+  }
+
+  OpCommand cmd;
+  if (otherMulResult.defined()) {
+    cmd.Name("Sub")
+        .Input(other)
+        .Input(otherMulResult)
+        .Output(result)
+        .Run();
+  } else {
+    cmd.Name("Sub")
+        .Input(other)
+        .Input(self)
+        .Output(result)
+        .Run();
+  }
+
+  return result;
+}
+
+at::Tensor& rsub_out_npu_nocheck(
+    at::Tensor& result,
+    const at::Tensor& self,
+    at::Scalar other,
+    at::Scalar alpha) {
+  // scale self by alpha, then subtract it from the scalar other
+  at::Tensor scalarValue(at::mul(self, alpha));
+
+  OpCommand cmd;
+  cmd.Name("Sub")
+      .Input(other, self.scalar_type())
+      .Input(scalarValue)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::rsub(const at::Tensor& self, const at::Tensor& other, at::Scalar alpha) {
+  at::Tensor outputTensor = rsub_dest_output(self, other);
+  auto outputSize = broadcast_ops_npu_output_size(self, other);
+
+  at::Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize);
+
+  rsub_out_npu_nocheck(result, self, other, alpha);
+
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::rsub(const at::Tensor& self, at::Scalar other, at::Scalar alpha) {
+  at::Tensor result = OpPreparation::ApplyTensor(self);
+
+  rsub_out_npu_nocheck(result, self, other, alpha);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp
new file mode 100644
index 0000000000..e0abc60d25
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp
@@ -0,0 +1,54 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
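rsub_dest_output above exists because a Python scalar wrapped into a 0-d tensor must
not dictate the output's metadata; the result follows the real tensor operand, matching
eager type promotion:

    import torch

    t = torch.tensor([0.5, 1.5], dtype=torch.float16)
    # 1 - t dispatches to rsub(t, 1); the wrapped scalar adopts the tensor dtype
    print((1 - t).dtype)  # torch.float16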
+ +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& tanh_backward_out_npu_nocheck( + at::Tensor& result, + const at::Tensor& grad_output, + const at::Tensor& self) { + OpCommand cmd; + cmd.Name("TanhGrad") + .Input(self) + .Input(grad_output) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::tanh_backward_out( + const at::Tensor& grad_output, + const at::Tensor& self, + at::Tensor& result) { + OpPreparation::CheckOut({grad_output, self}, result, self); + tanh_backward_out_npu_nocheck(result, grad_output, self); + return result; +} + +at::Tensor NPUNativeFunctions::tanh_backward(const at::Tensor& grad_output, const at::Tensor& self) { + at::Tensor result = OpPreparation::ApplyTensor(self); + tanh_backward_out_npu_nocheck(result, grad_output, self); + + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp b/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp new file mode 100644 index 0000000000..5ec2311735 --- /dev/null +++ b/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp @@ -0,0 +1,55 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& NPUNativeFunctions::tanh_out(const at::Tensor& self, at::Tensor& result) { + OpCommand cmd; + cmd.Name("Tanh") + .Input(self) + .Output(result) + .Run(); + + return result; +} + +at::Tensor NPUNativeFunctions::tanh(const at::Tensor& self) { + at::Tensor result = OpPreparation::ApplyTensor(self); + // calculate the output result of the NPU + NPUNativeFunctions::tanh_out(self, result); + + return result; +} + +at::Tensor& NPUNativeFunctions::tanh_(at::Tensor& self) { + OpPreparation::CheckMemory({self}, {self}); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + at::Tensor result = NPUNativeFunctions::tanh_out(contiguousSelf, contiguousSelf); + NpuUtils::format_fresh_view(self, result); + } else { + NPUNativeFunctions::tanh_out(self, self); + } + + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp new file mode 100644 index 0000000000..3dd2834a35 --- /dev/null +++ b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp @@ -0,0 +1,58 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
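tanh_ above follows the repository's standard in-place pattern: a tensor that fails
check_match is materialized with format_contiguous, the kernel writes into that copy,
and format_fresh_view propagates the data back. Conceptually, ignoring NPU formats:

    import torch

    t = torch.randn(4, 4).t()  # a non-contiguous view
    # compute on a contiguous copy, then write the result back into the view
    t.copy_(torch.tanh(t.contiguous()))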
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& uniform_out_npu(
+    const at::Tensor& self,
+    double from,
+    double to,
+    c10::optional<at::Generator> gen_,
+    at::Tensor& result) {
+  OpCommand cmd;
+  cmd.Name("Uniform")
+      .Input(self)
+      .Output(result)
+      .Attr("from", static_cast<float>(from))
+      .Attr("to", static_cast<float>(to))
+      .Run();
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::uniform_(at::Tensor& self, double from, double to, c10::optional<at::Generator> gen_) {
+  // TODO(Ascend): The operator needs to use fp32 for calculation.
+  at::Tensor selfCopy = self;
+  if (self.scalar_type() == at::ScalarType::Half) {
+    selfCopy = self.to(at::ScalarType::Float);
+  }
+
+  if (!NpuUtils::check_match(&selfCopy)) {
+    at::Tensor selfContiguous = NpuUtils::format_contiguous(selfCopy);
+    at::Tensor result = uniform_out_npu(selfContiguous, from, to, gen_, selfContiguous);
+    NpuUtils::format_fresh_view(selfCopy, result);
+  } else {
+    uniform_out_npu(selfCopy, from, to, gen_, selfCopy);
+  }
+  self.copy_(selfCopy);
+
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
--
Gitee

From 388945c11ff7811aca54d66d2481282cc8981905 Mon Sep 17 00:00:00 2001
From: wangxiao
Date: Wed, 9 Feb 2022 19:24:07 +0800
Subject: [PATCH 2/4] fix bugs

---
 test/test_network_ops/test_ge.py             |  38 ++--
 test/test_network_ops/test_index.py          | 204 ------------------
 test/test_network_ops/test_le.py             |  36 ++--
 test/test_network_ops/test_masked_fill.py    | 145 -------------
 test/test_network_ops/test_ne.py             |  23 +-
 test/test_network_ops/test_nonzero.py        |   1 -
 torch_npu/csrc/aten/ops/GeKernelNpu.cpp      |   6 +-
 torch_npu/csrc/aten/ops/IndexKernelNpu.cpp   |  74 -------
 torch_npu/csrc/aten/ops/LeKernelNpu.cpp      |   4 +-
 .../csrc/aten/ops/MaskedFillKernelNpu.cpp    | 117 ----------
 torch_npu/csrc/aten/ops/NeKernelNpu.cpp      |  12 +-
 torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp |   2 +-
 12 files changed, 59 insertions(+), 603 deletions(-)
 delete mode 100644 test/test_network_ops/test_index.py
 delete mode 100644 test/test_network_ops/test_masked_fill.py
 delete mode 100644 torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
 delete mode 100644 torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp

diff --git a/test/test_network_ops/test_ge.py b/test/test_network_ops/test_ge.py
index ff5ea46a02..1afd23a188 100644
--- a/test/test_network_ops/test_ge.py
+++ b/test/test_network_ops/test_ge.py
@@ -107,7 +107,6 @@ class TestGe(TestCase):
             cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
             npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
             cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-
             self.assertRtolEqual(cpu_output_out, npu_output_out)
 
     def test_ge_tensor_out(self, device):
@@ -163,8 +162,9 @@ class TestGe(TestCase):
             npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1])
             cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50)
             npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50)
-            self.assertEqual(cpu_output1, npu_output1)
-            self.assertEqual(cpu_output2, npu_output2)
+
+            
self.assertRtolEqual(cpu_output1, npu_output1) + self.assertRtolEqual(cpu_output2, npu_output2) def test_ge_scalar_float32(self, device): format_list = [0] @@ -177,7 +177,7 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_scalar_float16(self, device): format_list = [0] @@ -191,8 +191,7 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_scalar_int32(self, device): format_list = [0] @@ -205,7 +204,7 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_tensor_float32(self, device): format_list = [0] @@ -217,7 +216,7 @@ class TestGe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_tensor_float16(self, device): format_list = [0] @@ -231,8 +230,7 @@ class TestGe(TestCase): cpu_input2 = cpu_input2.to(torch.float32) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_inplace_float32(self, device): format_list = [0] @@ -244,7 +242,7 @@ class TestGe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_inplace_float16(self, device): format_list = [0, 3] @@ -258,8 +256,8 @@ class TestGe(TestCase): cpu_input2 = cpu_input2.to(torch.float32) cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_inplace_scalar_float32(self, device): format_list = [0] @@ -272,7 +270,7 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_inplace_scalar_float16(self, device): format_list = [0] @@ -286,14 +284,14 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + cpu_output = cpu_output.astype(npu_output.dtype) 
+ self.assertRtolEqual(cpu_output, npu_output) def test_ge_mix_dtype(self, device): - npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) - npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) - cpu_output = self.cpu_op_exec(npu_input1, npu_input3) - npu_output = self.npu_op_exec(npu_input2, npu_input4) + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) instantiate_device_type_tests(TestGe, globals(), except_for="cpu") diff --git a/test/test_network_ops/test_index.py b/test/test_network_ops/test_index.py deleted file mode 100644 index 723706ccde..0000000000 --- a/test/test_network_ops/test_index.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import torch -import torch_npu -import numpy as np - -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor - -class TestIndex(TestCase): - def generate_index_data_bool(self, shape): - cpu_input = torch.randn(shape)>0 - npu_input = cpu_input.to("npu") - return cpu_input, npu_input - - def cpu_op_exec(self, input1, index): - output = input1[index] - output = output.numpy() - return output - - def npu_op_exec(self, input1, index): - output = input1[index] - output = output.to("cpu") - output = output.numpy() - return output - - def cpu_op_exec_ellip(self, input1, index): - output = input1[index, ..., index] - output = output.numpy() - return output - - def npu_op_exec_ellip(self, input1, index): - output = input1[index, ..., index] - output = output.cpu().numpy() - return output - - def cpu_op_exec_semi(self, input1, index): - output = input1[index, :, index] - output = output.numpy() - return output - - def npu_op_exec_semi(self, input1, index): - output = input1[index, :, index] - output = output.cpu().numpy() - return output - - def test_index_ellip(self, device): - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[5, 256, 256, 100]] - shape_format_tensor = [ - [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 0, 2) - cpu_output = self.cpu_op_exec_ellip(cpu_input1, cpu_index1) - npu_output = self.npu_op_exec_ellip(npu_input1, npu_index1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_semi(self, device): - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[5, 256, 256, 100]] - shape_format_tensor = [ - [[i, 
j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 0, 2) - cpu_output = self.cpu_op_exec_semi(cpu_input1, cpu_index1) - npu_output = self.npu_op_exec_semi(npu_input1, npu_index1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_tensor(self, device): - #test index is tensor - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_tensor = [ - [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) - cpu_output = self.cpu_op_exec(cpu_input1, cpu_index1) - npu_output = self.npu_op_exec(npu_input1, npu_index1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_tensor_x(self, device): - #test index is [tensor, x] , (x=1,bool,range) - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_tensor = [ - [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) - for i in [1, range(2), True]: - cpu_output = self.cpu_op_exec(cpu_input1, (cpu_index1, i)) - npu_output = self.npu_op_exec(npu_input1, (npu_index1, i)) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_tensor_tensor(self, device): - #test index is [tensor, tensor] - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 1000]] - shape_format_multiTensor = [ - [[i, j, k], [np.int64, 0, [1,2]]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_multiTensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) - cpu_index2, npu_index2 = create_common_tensor(item[1], 1, 3) - cpu_output = self.cpu_op_exec(cpu_input1, (cpu_index1,cpu_index2)) - npu_output = self.npu_op_exec(npu_input1, (npu_index1, npu_index2)) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_list(self, device): - #test index is list - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_list = [ - [[i, j, k], (0,1)] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_list: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = self.cpu_op_exec(cpu_input1, item[1]) - npu_output = self.npu_op_exec(npu_input1, item[1]) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_list_x(self, device): - #test index is [list, x], (x=1,bool,range) - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_list = [ - [[i, j, k], (0,1)] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_list: - 
cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - for i in [1, range(2), (0, 1), True]: - cpu_output = self.cpu_op_exec(cpu_input1, (item[1], i)) - npu_output = self.npu_op_exec(npu_input1, (item[1], i)) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_tensor_bool(self, device): - #test index is bool tensor - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_tensor_bool = [ - [[i, j, k],k] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor_bool: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index ,npu_index = self.generate_index_data_bool(item[1]) - cpu_output = self.cpu_op_exec(cpu_input1, cpu_index) - npu_output = self.npu_op_exec(npu_input1, npu_index) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_bool_x(self, device): - #test index is [bool, x] , (x=1,bool,range) - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - index_list = [(True), (False), (True, 1), (True,range(4)), (True,False)] - shape_format_tensor_bool_list = [ - [[i, j, k], l] for i in dtype_list for j in format_list for k in shape_list for l in index_list - ] - - for item in shape_format_tensor_bool_list: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = self.cpu_op_exec(cpu_input1, item[1]) - npu_output = self.npu_op_exec(npu_input1, item[1]) - self.assertRtolEqual(cpu_output, npu_output) - -instantiate_device_type_tests(TestIndex, globals(), except_for="cpu") -if __name__ == "__main__": - run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_le.py b/test/test_network_ops/test_le.py index b5bcbacb61..5bdb1617b7 100644 --- a/test/test_network_ops/test_le.py +++ b/test/test_network_ops/test_le.py @@ -138,7 +138,8 @@ class TestLe(TestCase): cpu_input3 = cpu_input3.to(torch.float32) cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) - cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + if cpu_input1.dtype == torch.float16: + cpu_output_out = cpu_output_out.astype(np.float16) self.assertRtolEqual(cpu_output_out, npu_output_out) @@ -165,7 +166,8 @@ class TestLe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2) npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2) - cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + if cpu_input1.dtype == torch.float16: + cpu_output_out = cpu_output_out.astype(np.float16) self.assertRtolEqual(cpu_output_out, npu_output_out) def test_le_scalar_out(self, device): @@ -191,7 +193,7 @@ class TestLe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_scalar_int32(self, device): format_list = [0] @@ -204,9 +206,9 @@ class TestLe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) - def 
test_gt_scalar_float16(self, device): + def test_le_scalar_float16(self, device): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -218,8 +220,7 @@ class TestLe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_tensor_float32(self, device): format_list = [0] @@ -231,7 +232,7 @@ class TestLe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_tensor_float16(self, device): format_list = [0] @@ -245,8 +246,7 @@ class TestLe(TestCase): cpu_input2 = cpu_input2.to(torch.float32) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_inplace_float32(self, device): format_list = [0, 3] @@ -258,7 +258,7 @@ class TestLe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_inplace_float16(self, device): format_list = [0, 3] @@ -273,7 +273,7 @@ class TestLe(TestCase): cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_inplace_scalar_float32(self, device): format_list = [0] @@ -288,7 +288,7 @@ class TestLe(TestCase): ncpu_input = copy.deepcopy(cpu_input) cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar1) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_inplace_scalar_float16(self, device): format_list = [0] @@ -303,13 +303,13 @@ class TestLe(TestCase): cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_mix_dtype(self, device): - npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) - npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) - cpu_output = self.cpu_op_exec(npu_input1, npu_input3) - npu_output = self.npu_op_exec(npu_input2, npu_input4) + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) diff --git a/test/test_network_ops/test_masked_fill.py b/test/test_network_ops/test_masked_fill.py deleted file mode 100644 index 256e366b29..0000000000 --- 
a/test/test_network_ops/test_masked_fill.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import torch -import torch_npu -import numpy as np - -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor - -class TestMaskedFill(TestCase): - def create_bool_tensor(self, shape, minValue, maxValue): - input1 = np.random.uniform(minValue, maxValue, shape) - input1 = input1 > 0.5 - cpu_input = torch.from_numpy(input1) - npu_input = torch.from_numpy(input1).to("npu") - return cpu_input, npu_input - - def cpu_op_exec(self, input1, mask, value): - output = torch.masked_fill(input1, mask, value) - output = output.numpy() - return output - - def npu_op_exec(self, input1, mask, value): - output = torch.masked_fill(input1, mask, value) - output = output.to("cpu") - output = output.numpy() - return output - - def cpu_inp_op_exec(self, input1, mask, value): - output = input1.masked_fill_(mask, value) - output = output.numpy() - return output - - def npu_inp_op_exec(self, input1, mask, value): - output = input1.masked_fill_(mask, value) - output = output.to("cpu") - output = output.numpy() - return output - - def test_masked_fill_shape_format_fp16(self, device): - format_list = [0] - shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] - value_list = [1.25, - torch.tensor(1.25, dtype=torch.float32), - torch.tensor(5, dtype=torch.int32), - torch.tensor(5, dtype=torch.int64)] - - shape_format = [[[np.float16, i, j], v] for i in format_list for j in shape_list for v in value_list] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) - mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) - cpu_input1 = cpu_input1.to(torch.float32) - - cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) - cpu_output1 = cpu_output1.astype(npu_output1.dtype) - self.assertRtolEqual(cpu_output1, npu_output1) - - cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) - cpu_output2 = cpu_output2.astype(npu_output2.dtype) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_masked_fill_shape_format_fp32(self, device): - format_list = [0] - shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] - value_list = [1.25, - torch.tensor(1.25, dtype=torch.float32), - torch.tensor(5, dtype=torch.int32), - torch.tensor(5, dtype=torch.int64)] - - shape_format = [[[np.float32, i, j], v] for i in format_list for j in shape_list for v in value_list] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) - mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) - - cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) - 
npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) - self.assertRtolEqual(cpu_output1, npu_output1) - - cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_masked_fill_shape_format_int32(self, device): - format_list = [0] - shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] - value_list = [1.25, - torch.tensor(1.25, dtype=torch.float32), - torch.tensor(5, dtype=torch.int32), - torch.tensor(5, dtype=torch.int64)] - - shape_format = [[[np.int32, i, j], v] for i in format_list for j in shape_list for v in value_list] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) - mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) - - cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) - self.assertRtolEqual(cpu_output1, npu_output1) - - cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_masked_fill_shape_format_int64(self, device): - format_list = [0] - shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] - value_list = [1.25, - torch.tensor(1.25, dtype=torch.float32), - torch.tensor(5, dtype=torch.int32), - torch.tensor(5, dtype=torch.int64)] - - shape_format = [[[np.int64, i, j], v] for i in format_list for j in shape_list for v in value_list] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) - mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) - - cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) - cpu_output1 = cpu_output1.astype(np.int32) - npu_output1 = npu_output1.astype(np.int32) - self.assertRtolEqual(cpu_output1, npu_output1) - - cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) - cpu_output2 = cpu_output2.astype(np.int32) - npu_output2 = npu_output2.astype(np.int32) - self.assertRtolEqual(cpu_output2, npu_output2) - -instantiate_device_type_tests(TestMaskedFill, globals(), except_for="cpu") -if __name__ == "__main__": - run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_ne.py b/test/test_network_ops/test_ne.py index b22864a65e..a403a208d6 100644 --- a/test/test_network_ops/test_ne.py +++ b/test/test_network_ops/test_ne.py @@ -50,7 +50,7 @@ class TestNe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item, 1, 100) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ne_shape_format_fp16(self, device): dtype_list = [np.float16] @@ -67,9 +67,8 @@ class TestNe(TestCase): if cpu_input1.dtype == torch.float16: cpu_input2 = cpu_input2.to(torch.float32) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) - npu_output = self.npu_op_exec(npu_input1, npu_input2) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) def test_ne_out_shape_format_fp32(self, device): dtype_list = [np.float32] @@ -82,8 +81,8 @@ class 
TestNe(TestCase): cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10) cpu_input2, npu_input2 = create_common_tensor(item[0], -10, 10) npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2) - npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertEqual(npu_output_out, npu_output) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + self.assertRtolEqual(cpu_output, npu_output_out) def test_ne_scalar_out_shape_format_fp32(self, device): dtype_list = [np.float32] @@ -95,14 +94,14 @@ class TestNe(TestCase): for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10) npu_output_out = self.npu_op_exec_out(npu_input1, 5) - npu_output = self.npu_op_exec(npu_input1, 5) - self.assertEqual(npu_output_out, npu_output) + cpu_output = self.cpu_op_exec(cpu_input1, 5) + self.assertRtolEqual(cpu_output, npu_output_out) def test_ne_mix_dtype(self, device): - npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) - npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) - cpu_output = self.cpu_op_exec(npu_input1, npu_input3) - npu_output = self.npu_op_exec(npu_input2, npu_input4) + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) instantiate_device_type_tests(TestNe, globals(), except_for="cpu") diff --git a/test/test_network_ops/test_nonzero.py b/test/test_network_ops/test_nonzero.py index 116c3dbb52..3973186d65 100644 --- a/test/test_network_ops/test_nonzero.py +++ b/test/test_network_ops/test_nonzero.py @@ -42,7 +42,6 @@ class TestNonzero(TestCase): [[i, j, k]] for i in dtype_list for j in format_list for k in shape_list ] for item in shape_format: - print(item) cpu_input, npu_input = create_common_tensor(item[0], 1, 100) cpu_output = self.cpu_op_exec(cpu_input) npu_output = self.npu_op_exec(npu_input) diff --git a/torch_npu/csrc/aten/ops/GeKernelNpu.cpp b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp index 65448746f6..6a71323f79 100644 --- a/torch_npu/csrc/aten/ops/GeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp @@ -90,17 +90,17 @@ at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, const at::Tensor& othe auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); return result; } -at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, Scalar other) { +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, at::Scalar other) { at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); at::Tensor result = OpPreparation::ApplyTensorWithFormat( formatCastOfSelf.sizes(), - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); ge_out_npu_nocheck(formatCastOfSelf, other, result); return result; diff --git a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp deleted file mode 100644 index 72d7984b0e..0000000000 --- a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2020, Huawei Technologies.All rights reserved. 
-// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "torch_npu/csrc/framework/utils/OpAdapter.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" -#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" - -namespace at_npu { -namespace native { - -at::Tensor& index_out_nocheck_npu( - const at::Tensor& self, - const at::Tensor& masksTensor, - const at::TensorList& allDefinedIndices, - at::Tensor& result) { - OpCommand cmd; - cmd.Name("Index") - .Input(self) - .Input(masksTensor); - for (int i = 0; i < allDefinedIndices.size(); i++) { - cmd.Input(allDefinedIndices[i]); - } - cmd.Output(result) - .Run(); - return result; -} - -at::Tensor NPUNativeFunctions::index(const at::Tensor& self, const torch::List<c10::optional<at::Tensor>>& orig) { - checkIndexTensorTypes(orig); - // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors - auto indices = expandTensors(self, orig); - at::Tensor formatCastOfSelf = self.npu_format_cast(ACL_FORMAT_ND); - - // calculate the output size - auto outputSize = index_npu_output_size(formatCastOfSelf, indices); - - // construct the output tensor of the NPU - at::Tensor result = OpPreparation::ApplyTensorWithFormat(formatCastOfSelf, outputSize, ACL_FORMAT_ND); - - // masks corresponds to indices. 0 indicates undefined tensor.
- SmallVector<int64_t, N_SIZE> masks; - std::vector<at::Tensor> allDefinedIndices; - for (int64_t i = 0; i < indices.size(); i++) { - if (indices[i].defined()) { - masks.emplace_back(1); - allDefinedIndices.emplace_back(indices[i]); - } else { - masks.emplace_back(0); - } - } - - at::Tensor masksTensor = CalcuOpUtil::copy_tensor_host_to_device( - from_blob(masks.data(), {masks.size()}, dtype(at::ScalarType::Long))); - - // calculate the output result of the NPU - index_out_nocheck_npu(formatCastOfSelf, masksTensor, allDefinedIndices, result); - - return result; -} - -} // namespace native -} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/LeKernelNpu.cpp b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp index dfd9c4d584..44404cf8f9 100644 --- a/torch_npu/csrc/aten/ops/LeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp @@ -77,7 +77,7 @@ at::Tensor NPUNativeFunctions::le(const at::Tensor& self, at::Scalar other) { at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); at::Tensor result = OpPreparation::ApplyTensorWithFormat( formatCastOfSelf.sizes(), - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); le_out_npu_nocheck(formatCastOfSelf, other, result); return result; @@ -90,7 +90,7 @@ at::Tensor NPUNativeFunctions::le(const at::Tensor& self, const at::Tensor& othe auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); diff --git a/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp deleted file mode 100644 index 847896ec3a..0000000000 --- a/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2020 Huawei Technologies Co., Ltd -// Copyright (c) 2019, Facebook CORPORATION. -// All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
-#include "torch_npu/csrc/framework/utils/OpAdapter.h" -#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" - -namespace at_npu { -namespace native { - -at::Tensor& NPUNativeFunctions::masked_fill_out(const at::Tensor& self, const at::Tensor& mask, const at::Tensor& value, at::Tensor& result) { - OpPreparation::CheckOut( - {self}, - result, - self); - at::Tensor maskBool = mask; - int64_t dimOfSelf = self.dim(); - - /* Avoid the problem that the TBE operator does not support 0-dimensional tensor input */ - if (dimOfSelf == 0) { - self.unsqueeze_(0); - } - - if ((mask.dtype() != at::kBool)) { - maskBool = mask.to(at::kBool); - } - at::Tensor valueTensor = value; - if (value.dtype() != self.dtype()) { - valueTensor = valueTensor.to(self.dtype()); - } - - OpCommand cmd; - cmd.Name("MaskedFill") - .Input(self) - .Input(maskBool) - .Input(valueTensor) - .Output(result) - .Run(); - - if (dimOfSelf == 0) { - result.squeeze_(0); - } - - return result; -} - -at::Tensor& NPUNativeFunctions::masked_fill_out(const at::Tensor& self, const at::Tensor& mask, at::Scalar value, at::Tensor& result) { - OpPreparation::CheckOut( - {self}, - result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); - at::Tensor maskBool = mask; - int64_t dimOfSelf = self.dim(); - - /* Avoid the problem that the TBE operator does not support 0-dimensional tensor input */ - if (dimOfSelf == 0) { - self.unsqueeze_(0); - } - - if (!(mask.dtype() == at::kBool)) { - maskBool = mask.to(at::kBool); - } - - OpCommand cmd; - cmd.Name("MaskedFill") - .Input(self) - .Input(maskBool) - .Input(value, self.scalar_type()) - .Output(result) - .Run(); - - if (dimOfSelf == 0) { - result.squeeze_(0); - } - return result; -} - -at::Tensor& NPUNativeFunctions::masked_fill_(at::Tensor& self, const at::Tensor& mask, const at::Tensor& value) { - // OpPreparation::CheckMemory({self, mask, value}, {self}); - if (!NpuUtils::check_match(&self)) { - at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); - at::Tensor result = masked_fill_out(contiguousSelf, mask, value, contiguousSelf); - self.copy_(result); - } else { - masked_fill_out(self, mask, value, self); - } - return self; -} - -at::Tensor& NPUNativeFunctions::masked_fill_(at::Tensor& self, const at::Tensor& mask, at::Scalar value) { - if (!NpuUtils::check_match(&self)) { - at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); - at::Tensor result = masked_fill_out(contiguousSelf, mask, value, contiguousSelf); - self.copy_(result); - } else { - masked_fill_out(self, mask, value, self); - } - - return self; -} - -} // namespace native -} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/NeKernelNpu.cpp b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp index d26ae3c535..b95c86a50d 100644 --- a/torch_npu/csrc/aten/ops/NeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp @@ -62,7 +62,7 @@ at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, at::S return result; } -at::Tensor& NPUNativeFunctions::ne(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { +at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); auto outputSize = broadcast_ops_npu_output_size(self, other); @@ -70,8 +70,8 @@ at::Tensor& NPUNativeFunctions::ne(const at::Tensor& self, const 
at::Tensor& oth {self, other}, result, CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), - ScalarType::Bool, - IntArrayRef(outputSize)); + at::ScalarType::Bool, + at::IntArrayRef(outputSize)); ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther); return result; } @@ -82,7 +82,7 @@ at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, at::Scalar other, {self}, result, CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), - ScalarType::Bool, + at::ScalarType::Bool, formatCastOfSelf.sizes()); ne_out_npu_nocheck(result, formatCastOfSelf, other); return result; @@ -95,7 +95,7 @@ at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, const at::Tensor& othe auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); at::Tensor result = OpPreparation::ApplyTensor( outputSize, - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), formatCastOfSelf); ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther); @@ -107,7 +107,7 @@ at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, at::Scalar other) { at::Tensor result = OpPreparation::ApplyTensor( formatCastOfSelf, - formatCastOfSelf.options().dtype(kBool)); + formatCastOfSelf.options().dtype(at::kBool)); ne_out_npu_nocheck(result, formatCastOfSelf, other); return result; diff --git a/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp index c2f86b65a6..640564e8dd 100644 --- a/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp @@ -37,7 +37,7 @@ at::Tensor& NPUNativeFunctions::nonzero_out(const at::Tensor& self, at::Tensor& {self}, result, CalcuOpUtil::get_tensor_npu_format(self), - ScalarType::Long, + at::ScalarType::Long, outputSize); OpPipeWithDefinedOut pipe; -- Gitee From ab7c6d45e0d434bdb61fec5ceeebffcba6d08fad Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 9 Feb 2022 19:52:30 +0800 Subject: [PATCH 3/4] ge, le, nonzero ut clean code --- test/test_network_ops/test_ge.py | 24 ++++++++++----------- test/test_network_ops/test_le.py | 30 +++++++++++++-------------- test/test_network_ops/test_nonzero.py | 8 +++---- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/test/test_network_ops/test_ge.py b/test/test_network_ops/test_ge.py index 1afd23a188..5a9011620d 100644 --- a/test/test_network_ops/test_ge.py +++ b/test/test_network_ops/test_ge.py @@ -21,8 +21,8 @@ from torch_npu.testing.util_test import create_common_tensor class TestGe(TestCase): - def generate_scalar(self, min, max): - scalar = np.random.uniform(min, max) + def generate_scalar(self, min1, max1): + scalar = np.random.uniform(min1, max1) return scalar def cpu_op_exec(self, input1, input2): @@ -59,8 +59,8 @@ class TestGe(TestCase): output = output.numpy() return output - def cpu_op_exec_scalar(self, input, scalar): - output = torch.ge(input, scalar) + def cpu_op_exec_scalar(self, input1, scalar): + output = torch.ge(input1, scalar) output = output.numpy() return output @@ -69,8 +69,8 @@ class TestGe(TestCase): output = input2.numpy() return output - def npu_op_exec_scalar(self, input, scalar): - output = torch.ge(input, scalar) + def npu_op_exec_scalar(self, input1, scalar): + output = torch.ge(input1, scalar) output = output.to("cpu") output = output.numpy() return output @@ -81,13 +81,13 @@ class TestGe(TestCase): output = output.numpy() return output - def cpu_op_inplace_exec_scalar(self, input, scalar): - output = input.ge_(scalar) + def cpu_op_inplace_exec_scalar(self, input1, scalar): 
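+ # the in-place ge_ writes the boolean result back into input1 and returns it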
+ output = input1.ge_(scalar) output = output.numpy() return output - def npu_op_inplace_exec_scalar(self, input, scalar): - output = input.ge_(scalar) + def npu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.ge_(scalar) output = output.to("cpu") output = output.numpy() return output @@ -96,7 +96,7 @@ class TestGe(TestCase): for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) - cpu_input3 = torch.randn(item[1][2])<0 + cpu_input3 = torch.randn(item[1][2]) < 0 npu_input3 = cpu_input3.npu() if cpu_input1.dtype == torch.float16: cpu_input1 = cpu_input1.to(torch.float32) @@ -123,7 +123,7 @@ class TestGe(TestCase): def ge_scalar_out_result(self, shape_format): for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) - cpu_input2 = torch.randn(item[1][2])<0 + cpu_input2 = torch.randn(item[1][2]) < 0 npu_input2 = cpu_input2.npu() if cpu_input1.dtype == torch.float16: cpu_input1 = cpu_input1.to(torch.float32) diff --git a/test/test_network_ops/test_le.py b/test/test_network_ops/test_le.py index 5bdb1617b7..5b7d933b5b 100644 --- a/test/test_network_ops/test_le.py +++ b/test/test_network_ops/test_le.py @@ -21,8 +21,8 @@ from torch_npu.testing.common_device_type import instantiate_device_type_tests from torch_npu.testing.util_test import create_common_tensor class TestLe(TestCase): - def generate_scalar(self, min, max): - scalar = np.random.uniform(min, max) + def generate_scalar(self, min1, max1): + scalar = np.random.uniform(min1, max1) return scalar def cpu_op_exec(self, input1, input2): @@ -59,8 +59,8 @@ class TestLe(TestCase): output = output.numpy() return output - def cpu_op_exec_scalar(self, input, scalar): - output = torch.le(input, scalar) + def cpu_op_exec_scalar(self, input1, scalar): + output = torch.le(input1, scalar) output = output.numpy() return output @@ -69,26 +69,26 @@ class TestLe(TestCase): output = input2.numpy() return output - def npu_op_exec_scalar(self, input, scalar): - output = torch.le(input, scalar) + def npu_op_exec_scalar(self, input1, scalar): + output = torch.le(input1, scalar) output = output.to("cpu") output = output.numpy() return output - def cpu_op_inplace_exec_scalar(self, input, scalar): - output = input.le_(scalar) + def cpu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.le_(scalar) output = output.numpy() return output - def npu_op_inplace_exec_scalar(self, input, scalar): - input = input.to("npu") - output = input.le_(scalar) + def npu_op_inplace_exec_scalar(self, input1, scalar): + input1 = input1.to("npu") + output = input1.le_(scalar) output = output.to("cpu") output = output.numpy() return output - def npu_op_exec_scalar_out(self, input, scalar, output): - torch.le(input, scalar, out=output) + def npu_op_exec_scalar_out(self, input1, scalar, output): + torch.le(input1, scalar, out=output) output = output.to("cpu") output = output.numpy() return output @@ -128,7 +128,7 @@ class TestLe(TestCase): for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) - cpu_input3 = torch.randn(item[1][2])<0 + cpu_input3 = torch.randn(item[1][2]) < 0 npu_input3 = cpu_input3.npu() if cpu_input1.dtype == torch.float16: cpu_input1 = cpu_input1.to(torch.float32) @@ -157,7 +157,7 @@ class TestLe(TestCase): def le_scalar_out_result(self, shape_format): for item in shape_format: cpu_input1, npu_input1 = 
create_common_tensor(item[0], -100, 100) - cpu_input2 = torch.randn(item[1][2])<0 + cpu_input2 = torch.randn(item[1][2]) < 0 npu_input2 = cpu_input2.npu() if cpu_input1.dtype == torch.float16: cpu_input1 = cpu_input1.to(torch.float32) diff --git a/test/test_network_ops/test_nonzero.py b/test/test_network_ops/test_nonzero.py index 3973186d65..84c204a86f 100644 --- a/test/test_network_ops/test_nonzero.py +++ b/test/test_network_ops/test_nonzero.py @@ -22,13 +22,13 @@ from torch_npu.testing.common_device_type import instantiate_device_type_tests from torch_npu.testing.util_test import create_common_tensor class TestNonzero(TestCase): - def cpu_op_exec(self, input): - output = torch.nonzero(input) + def cpu_op_exec(self, input1): + output = torch.nonzero(input1) output = output.numpy().astype(np.int32) return output - def npu_op_exec(self, input): - output = torch.nonzero(input) + def npu_op_exec(self, input1): + output = torch.nonzero(input1) output = output.to("cpu") output = output.numpy().astype(np.int32) return output -- Gitee From 343d35ff466da2bcf2dccfcc26c9fcff0b37491f Mon Sep 17 00:00:00 2001 From: wangxiao Date: Thu, 10 Feb 2022 10:11:05 +0800 Subject: [PATCH 4/4] uniform_ rm redundant note --- torch_npu/csrc/aten/ops/UniformKernelNpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp index 3dd2834a35..71a12cb5aa 100644 --- a/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp @@ -36,7 +36,7 @@ at::Tensor& uniform_out_npu( } at::Tensor& NPUNativeFunctions::uniform_(at::Tensor& self, double from, double to, c10::optional<at::Generator> gen_) { - // TODO(Ascend): The operator needs to use fp32 for calculation. + // The operator needs to use fp32 for calculation. at::Tensor selfCopy = self; if (self.scalar_type() == at::ScalarType::Half) { selfCopy = self.to(at::ScalarType::Float); -- Gitee
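A minimal usage sketch of the fp16 path that the uniform_ comment above describes (not part of the patch series; it assumes an Ascend device and an installed torch_npu wheel, and the shape and bounds are illustrative):

    import torch
    import torch_npu  # registers the "npu" device and the NPU uniform_ kernel

    # Half-precision storage: the NPU kernel upcasts to fp32, samples, then
    # casts the result back to fp16, so the Python call site is unchanged.
    x = torch.empty(8, 8, dtype=torch.float16).npu()
    x.uniform_(0.0, 1.0)

    assert x.dtype == torch.float16
    # Samples come from [0, 1); fp32 -> fp16 rounding may land exactly on 1.0.
    assert 0.0 <= float(x.min()) and float(x.max()) <= 1.0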