From 62dd4e5d1f56b165cf81391f2336346949712543 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 9 Feb 2022 18:34:33 +0800 Subject: [PATCH 1/4] ge, index, le, masked_fill, ne, nonzero, rsub, tanh, uniform --- test/test_network_ops/test_ge.py | 301 +++++++++++++++++ test/test_network_ops/test_index.py | 204 +++++++++++ test/test_network_ops/test_le.py | 318 ++++++++++++++++++ test/test_network_ops/test_masked_fill.py | 145 ++++++++ test/test_network_ops/test_ne.py | 110 ++++++ test/test_network_ops/test_nonzero.py | 54 +++ test/test_network_ops/test_rsub.py | 171 ++++++++++ test/test_network_ops/test_tanh.py | 139 ++++++++ test/test_network_ops/test_tanh_backward.py | 101 ++++++ test/test_network_ops/test_uniform_.py | 50 +++ torch_npu/csrc/aten/ops/GeKernelNpu.cpp | 145 ++++++++ torch_npu/csrc/aten/ops/IndexKernelNpu.cpp | 74 ++++ torch_npu/csrc/aten/ops/LeKernelNpu.cpp | 134 ++++++++ .../csrc/aten/ops/MaskedFillKernelNpu.cpp | 117 +++++++ torch_npu/csrc/aten/ops/NeKernelNpu.cpp | 157 +++++++++ torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp | 63 ++++ torch_npu/csrc/aten/ops/RsubKernelNpu.cpp | 96 ++++++ .../csrc/aten/ops/TanhBackwardKernelNpu.cpp | 54 +++ torch_npu/csrc/aten/ops/TanhKernelNpu.cpp | 55 +++ torch_npu/csrc/aten/ops/UniformKernelNpu.cpp | 58 ++++ 20 files changed, 2546 insertions(+) create mode 100644 test/test_network_ops/test_ge.py create mode 100644 test/test_network_ops/test_index.py create mode 100644 test/test_network_ops/test_le.py create mode 100644 test/test_network_ops/test_masked_fill.py create mode 100644 test/test_network_ops/test_ne.py create mode 100644 test/test_network_ops/test_nonzero.py create mode 100644 test/test_network_ops/test_rsub.py create mode 100644 test/test_network_ops/test_tanh.py create mode 100644 test/test_network_ops/test_tanh_backward.py create mode 100644 test/test_network_ops/test_uniform_.py create mode 100644 torch_npu/csrc/aten/ops/GeKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/IndexKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/LeKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/NeKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/RsubKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/TanhKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/UniformKernelNpu.cpp diff --git a/test/test_network_ops/test_ge.py b/test/test_network_ops/test_ge.py new file mode 100644 index 0000000000..ff5ea46a02 --- /dev/null +++ b/test/test_network_ops/test_ge.py @@ -0,0 +1,301 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
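+#
+# Convention used by these tests: a shape_format entry is [dtype, npu_format,
+# shape], and create_common_tensor(entry, min, max) returns a matching
+# (cpu_tensor, npu_tensor) pair with values drawn from [min, max]. The
+# npu_format field appears to select the NPU private storage format
+# (-1 leaves the default layout).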
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestGe(TestCase): + def generate_scalar(self, min, max): + scalar = np.random.uniform(min, max) + return scalar + + def cpu_op_exec(self, input1, input2): + output = torch.ge(input1, input2) + output = output.numpy() + return output + + def cpu_op_exec_out(self, input1, input2, input3): + torch.ge(input1, input2, out = input3) + output = input3.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.ge(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, input3): + torch.ge(input1, input2, out = input3) + output = input3.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec(self, input1, input2): + output = input1.ge_(input2) + output = input1 + output = output.numpy() + return output + + def npu_op_inplace_exec(self, input1, input2): + output = input1.ge_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_scalar(self, input, scalar): + output = torch.ge(input, scalar) + output = output.numpy() + return output + + def cpu_op_exec_scalar_out(self, input1, scalar, input2): + torch.ge(input1, scalar, out = input2) + output = input2.numpy() + return output + + def npu_op_exec_scalar(self, input, scalar): + output = torch.ge(input, scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar_out(self, input1, scalar, input2): + torch.ge(input1, scalar, out = input2) + output = input2.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec_scalar(self, input, scalar): + output = input.ge_(scalar) + output = output.numpy() + return output + + def npu_op_inplace_exec_scalar(self, input, scalar): + output = input.ge_(scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def ge_tensor_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) + cpu_input3 = torch.randn(item[1][2])<0 + npu_input3 = cpu_input3.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + if cpu_input3.dtype == torch.float16: + cpu_input3 = cpu_input3.to(torch.float32) + cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) + cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_ge_tensor_out(self, device): + shape_format = [ + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [2, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [128, 232, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.ge_tensor_out_result(shape_format) + + def ge_scalar_out_result(self, shape_format): + for item in 
shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2 = torch.randn(item[1][2])<0 + npu_input2 = cpu_input2.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2) + npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2) + cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_ge_scalar_out(self, device): + shape_format = [ + [[np.float16, 0, [4, 4, 128, 128]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [12, 10, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [16, 3, 1111, 1212]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [16, 16, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [1313, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.ge_scalar_out_result(shape_format) + + def test_ge_bool(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + scalar_list = [True, False] + shape_format = [ + [[np.int32, i, j], k] for i in format_list for j in shape_list + for k in scalar_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100) + cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1]) + npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1]) + cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50) + npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50) + self.assertEqual(cpu_output1, npu_output1) + self.assertEqual(cpu_output2, npu_output2) + + def test_ge_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertEqual(cpu_output, npu_output) + + def test_ge_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_ge_scalar_int32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.int32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertEqual(cpu_output, npu_output) + + def test_ge_tensor_float32(self, device): + 
format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertEqual(cpu_output, npu_output) + + def test_ge_tensor_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_ge_inplace_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + self.assertEqual(cpu_output, npu_output) + + def test_ge_inplace_float16(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_ge_inplace_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + self.assertEqual(cpu_output, npu_output) + + def test_ge_inplace_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_ge_mix_dtype(self, device): + npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + npu_input3, npu_input4 = 
create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(npu_input1, npu_input3) + npu_output = self.npu_op_exec(npu_input2, npu_input4) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestGe, globals(), except_for="cpu") +if __name__ == '__main__': + run_tests() diff --git a/test/test_network_ops/test_index.py b/test/test_network_ops/test_index.py new file mode 100644 index 0000000000..723706ccde --- /dev/null +++ b/test/test_network_ops/test_index.py @@ -0,0 +1,204 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestIndex(TestCase): + def generate_index_data_bool(self, shape): + cpu_input = torch.randn(shape)>0 + npu_input = cpu_input.to("npu") + return cpu_input, npu_input + + def cpu_op_exec(self, input1, index): + output = input1[index] + output = output.numpy() + return output + + def npu_op_exec(self, input1, index): + output = input1[index] + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_ellip(self, input1, index): + output = input1[index, ..., index] + output = output.numpy() + return output + + def npu_op_exec_ellip(self, input1, index): + output = input1[index, ..., index] + output = output.cpu().numpy() + return output + + def cpu_op_exec_semi(self, input1, index): + output = input1[index, :, index] + output = output.numpy() + return output + + def npu_op_exec_semi(self, input1, index): + output = input1[index, :, index] + output = output.cpu().numpy() + return output + + def test_index_ellip(self, device): + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[5, 256, 256, 100]] + shape_format_tensor = [ + [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 0, 2) + cpu_output = self.cpu_op_exec_ellip(cpu_input1, cpu_index1) + npu_output = self.npu_op_exec_ellip(npu_input1, npu_index1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_semi(self, device): + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[5, 256, 256, 100]] + shape_format_tensor = [ + [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 0, 2) + cpu_output = self.cpu_op_exec_semi(cpu_input1, cpu_index1) + npu_output = self.npu_op_exec_semi(npu_input1, npu_index1) + self.assertRtolEqual(cpu_output, 
npu_output) + + def test_index_shape_format_tensor(self, device): + #test index is tensor + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_tensor = [ + [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_index1) + npu_output = self.npu_op_exec(npu_input1, npu_index1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_tensor_x(self, device): + #test index is [tensor, x] , (x=1,bool,range) + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_tensor = [ + [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) + for i in [1, range(2), True]: + cpu_output = self.cpu_op_exec(cpu_input1, (cpu_index1, i)) + npu_output = self.npu_op_exec(npu_input1, (npu_index1, i)) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_tensor_tensor(self, device): + #test index is [tensor, tensor] + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 1000]] + shape_format_multiTensor = [ + [[i, j, k], [np.int64, 0, [1,2]]] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_multiTensor: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) + cpu_index2, npu_index2 = create_common_tensor(item[1], 1, 3) + cpu_output = self.cpu_op_exec(cpu_input1, (cpu_index1,cpu_index2)) + npu_output = self.npu_op_exec(npu_input1, (npu_index1, npu_index2)) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_list(self, device): + #test index is list + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_list = [ + [[i, j, k], (0,1)] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_list: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, item[1]) + npu_output = self.npu_op_exec(npu_input1, item[1]) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_list_x(self, device): + #test index is [list, x], (x=1,bool,range) + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_list = [ + [[i, j, k], (0,1)] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_list: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + for i in [1, range(2), (0, 1), True]: + cpu_output = self.cpu_op_exec(cpu_input1, (item[1], i)) + npu_output = self.npu_op_exec(npu_input1, (item[1], i)) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_tensor_bool(self, device): + #test index is bool tensor + dtype_list = [np.float32, np.float16, np.int32] + 
format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + shape_format_tensor_bool = [ + [[i, j, k],k] for i in dtype_list for j in format_list for k in shape_list + ] + + for item in shape_format_tensor_bool: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_index ,npu_index = self.generate_index_data_bool(item[1]) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_index) + npu_output = self.npu_op_exec(npu_input1, npu_index) + self.assertRtolEqual(cpu_output, npu_output) + + def test_index_shape_format_bool_x(self, device): + #test index is [bool, x] , (x=1,bool,range) + dtype_list = [np.float32, np.float16, np.int32] + format_list = [0] + shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] + index_list = [(True), (False), (True, 1), (True,range(4)), (True,False)] + shape_format_tensor_bool_list = [ + [[i, j, k], l] for i in dtype_list for j in format_list for k in shape_list for l in index_list + ] + + for item in shape_format_tensor_bool_list: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, item[1]) + npu_output = self.npu_op_exec(npu_input1, item[1]) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestIndex, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_le.py b/test/test_network_ops/test_le.py new file mode 100644 index 0000000000..b5bcbacb61 --- /dev/null +++ b/test/test_network_ops/test_le.py @@ -0,0 +1,318 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
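+#
+# Note: the half-precision cases below build the CPU reference in float32 and
+# cast the result back to float16 before comparison, because CPU support for
+# float16 reference ops is limited; the NPU side runs float16 directly.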
+import copy +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestLe(TestCase): + def generate_scalar(self, min, max): + scalar = np.random.uniform(min, max) + return scalar + + def cpu_op_exec(self, input1, input2): + output = torch.le(input1, input2) + output = output.numpy() + return output + + def cpu_op_exec_out(self, input1, input2, input3): + torch.le(input1, input2, out = input3) + output = input3.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.le(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec(self, input1, input2): + output = input1.le_(input2) + output = input1 + output = output.numpy() + return output + + def npu_op_inplace_exec(self, input1, input2): + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, output): + torch.le(input1, input2, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_scalar(self, input, scalar): + output = torch.le(input, scalar) + output = output.numpy() + return output + + def cpu_op_exec_scalar_out(self, input1, scalar, input2): + torch.le(input1, scalar, out = input2) + output = input2.numpy() + return output + + def npu_op_exec_scalar(self, input, scalar): + output = torch.le(input, scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec_scalar(self, input, scalar): + output = input.le_(scalar) + output = output.numpy() + return output + + def npu_op_inplace_exec_scalar(self, input, scalar): + input = input.to("npu") + output = input.le_(scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar_out(self, input, scalar, output): + torch.le(input, scalar, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_stride_exec(self, input1, input2): + input1 = input1.as_strided([2, 2], [1, 2], 1) + input2 = input2.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.numpy() + return output + + def npu_op_inplace_stride_exec(self, input1, input2): + input1 = input1.to("npu") + input2 = input2.to("npu") + input1 = input1.as_strided([2, 2], [1, 2], 1) + input2 = input2.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_stride_scalar_exec(self, input1, input2): + input1 = input1.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.numpy() + return output + + def npu_op_inplace_stride_scalar_exec(self, input1, input2): + input1 = input1.to("npu") + input1 = input1.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def le_tensor_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) + cpu_input3 = torch.randn(item[1][2])<0 + npu_input3 = cpu_input3.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = 
cpu_input2.to(torch.float32)
+            if cpu_input3.dtype == torch.float16:
+                cpu_input3 = cpu_input3.to(torch.float32)
+            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_le_tensor_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [2, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [128, 232, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.le_tensor_out_result(shape_format)
+
+    def le_scalar_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2 = torch.randn(item[1][2])<0
+            npu_input2 = cpu_input2.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
+            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_le_scalar_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [12, 4, 12, 121]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [12, 10, 14, 111]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 2, [16, 3, 11, 121, 21]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [16, 16, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 2, [1313, 3, 3, 3, 121]], [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.le_scalar_out_result(shape_format)
+
+    def test_le_scalar_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_scalar_int32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.int32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_scalar_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output
= self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_le_tensor_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertEqual(cpu_output, npu_output) + + def test_le_tensor_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_le_inplace_float32(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + self.assertEqual(cpu_output, npu_output) + + def test_le_inplace_float16(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_le_inplace_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + scalar1 = copy.deepcopy(scalar) + ncpu_input = copy.deepcopy(cpu_input) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar1) + self.assertEqual(cpu_output, npu_output) + + def test_le_inplace_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = 
self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertEqual(cpu_output, npu_output) + + def test_le_mix_dtype(self, device): + npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(npu_input1, npu_input3) + npu_output = self.npu_op_exec(npu_input2, npu_input4) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestLe, globals(), except_for="cpu") +if __name__ == '__main__': + run_tests() diff --git a/test/test_network_ops/test_masked_fill.py b/test/test_network_ops/test_masked_fill.py new file mode 100644 index 0000000000..256e366b29 --- /dev/null +++ b/test/test_network_ops/test_masked_fill.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestMaskedFill(TestCase): + def create_bool_tensor(self, shape, minValue, maxValue): + input1 = np.random.uniform(minValue, maxValue, shape) + input1 = input1 > 0.5 + cpu_input = torch.from_numpy(input1) + npu_input = torch.from_numpy(input1).to("npu") + return cpu_input, npu_input + + def cpu_op_exec(self, input1, mask, value): + output = torch.masked_fill(input1, mask, value) + output = output.numpy() + return output + + def npu_op_exec(self, input1, mask, value): + output = torch.masked_fill(input1, mask, value) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_inp_op_exec(self, input1, mask, value): + output = input1.masked_fill_(mask, value) + output = output.numpy() + return output + + def npu_inp_op_exec(self, input1, mask, value): + output = input1.masked_fill_(mask, value) + output = output.to("cpu") + output = output.numpy() + return output + + def test_masked_fill_shape_format_fp16(self, device): + format_list = [0] + shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] + value_list = [1.25, + torch.tensor(1.25, dtype=torch.float32), + torch.tensor(5, dtype=torch.int32), + torch.tensor(5, dtype=torch.int64)] + + shape_format = [[[np.float16, i, j], v] for i in format_list for j in shape_list for v in value_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) + cpu_input1 = cpu_input1.to(torch.float32) + + cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) + cpu_output1 = cpu_output1.astype(npu_output1.dtype) + self.assertRtolEqual(cpu_output1, npu_output1) + + cpu_output2 = 
self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) + cpu_output2 = cpu_output2.astype(npu_output2.dtype) + self.assertRtolEqual(cpu_output2, npu_output2) + + def test_masked_fill_shape_format_fp32(self, device): + format_list = [0] + shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] + value_list = [1.25, + torch.tensor(1.25, dtype=torch.float32), + torch.tensor(5, dtype=torch.int32), + torch.tensor(5, dtype=torch.int64)] + + shape_format = [[[np.float32, i, j], v] for i in format_list for j in shape_list for v in value_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) + + cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) + self.assertRtolEqual(cpu_output1, npu_output1) + + cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) + self.assertRtolEqual(cpu_output2, npu_output2) + + def test_masked_fill_shape_format_int32(self, device): + format_list = [0] + shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] + value_list = [1.25, + torch.tensor(1.25, dtype=torch.float32), + torch.tensor(5, dtype=torch.int32), + torch.tensor(5, dtype=torch.int64)] + + shape_format = [[[np.int32, i, j], v] for i in format_list for j in shape_list for v in value_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) + + cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) + self.assertRtolEqual(cpu_output1, npu_output1) + + cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) + self.assertRtolEqual(cpu_output2, npu_output2) + + def test_masked_fill_shape_format_int64(self, device): + format_list = [0] + shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] + value_list = [1.25, + torch.tensor(1.25, dtype=torch.float32), + torch.tensor(5, dtype=torch.int32), + torch.tensor(5, dtype=torch.int64)] + + shape_format = [[[np.int64, i, j], v] for i in format_list for j in shape_list for v in value_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) + + cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) + cpu_output1 = cpu_output1.astype(np.int32) + npu_output1 = npu_output1.astype(np.int32) + self.assertRtolEqual(cpu_output1, npu_output1) + + cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) + npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) + cpu_output2 = cpu_output2.astype(np.int32) + npu_output2 = npu_output2.astype(np.int32) + self.assertRtolEqual(cpu_output2, npu_output2) + +instantiate_device_type_tests(TestMaskedFill, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_ne.py b/test/test_network_ops/test_ne.py new file mode 100644 index 0000000000..b22864a65e --- /dev/null +++ b/test/test_network_ops/test_ne.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020, Huawei 
Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestNe(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.ne(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.ne(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        input3 = torch.empty(0).bool().npu()
+        torch.ne(input1, input2, out=input3)
+        output = input3.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_ne_shape_format_fp32(self, device):
+        dtype_list = [np.float32]
+        format_list = [0, 3]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [d, i, j] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ne_shape_format_fp16(self, device):
+        dtype_list = [np.float16]
+        format_list = [0, 3]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [d, i, j] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            # cast each half-precision input to float32 independently for the CPU reference
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ne_out_shape_format_fp32(self, device):
+        dtype_list = [np.float32]
+        format_list = [0]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -10, 10)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def test_ne_scalar_out_shape_format_fp32(self, device):
+        dtype_list = [np.float32]
+        format_list = [0]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10)
+            npu_output_out = self.npu_op_exec_out(npu_input1, 5)
+            npu_output = self.npu_op_exec(npu_input1, 5)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def test_ne_mix_dtype(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
+        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input2, npu_input4)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestNe, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_network_ops/test_nonzero.py b/test/test_network_ops/test_nonzero.py
new file mode 100644
index 0000000000..116c3dbb52
--- /dev/null
+++ b/test/test_network_ops/test_nonzero.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor

+class TestNonzero(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.nonzero(input1)
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.nonzero(input1)
+        output = output.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def test_nonzero_shape_format(self, device):
+        dtype_list = [np.float32, np.float16, np.int32, np.int64]
+        format_list = [0]
+        shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]]
+
+        shape_format = [
+            [[i, j, k]] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestNonzero, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_rsub.py b/test/test_network_ops/test_rsub.py
new file mode 100644
index 0000000000..9b2167d78f
--- /dev/null
+++ b/test/test_network_ops/test_rsub.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+
+class TestRsub(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = input2 - input1
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = input2 - input1
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        output = input1 - input2
+        output = output.to("cpu")
+        output = output.numpy()
+        output = -output
+        return output
+
+    def rsub_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def rsub_scalar_result(self, shape_format):
+        for item in shape_format:
+            scalar = np.random.uniform(0, 100)
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, scalar)
+            npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar)
+
+            cpu_output = cpu_output.astype(npu_output_scalar.dtype)
+            self.assertRtolEqual(cpu_output, npu_output_scalar)
+
+    def test_rsub_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float32, i, [32]], [np.float32, i, [32]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [5, 3]], [np.float32, i, [5, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [256, 480, 14]], [np.float32, i, [256, 480, 14]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, [32, 3, 3, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
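+    # torch.rsub(input, other) computes other - input; the helpers above
+    # express it through plain subtraction (input2 - input1, or negating
+    # input - scalar to recover scalar - input in the scalar case).
+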
+    # int-------------------------------------------------------------------------------
+    def test_rsub_shape_format_int32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [32]], [np.int32, i, [32]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_int32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [5, 3]], [np.int32, i, [5, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_int32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [256, 480, 14]], [np.int32, i, [256, 480, 14]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_rsub_shape_format_int32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [32, 3, 3, 3]], [np.int32, i, [32, 3, 3, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    # scalar----------------------------------------------------------------------------
+    def test_rsub_scalar_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [32]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32, 64]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [32, 64]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [32, 64, 128]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_rsub_scalar_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [32, 64, 128, 28]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+
+instantiate_device_type_tests(TestRsub, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_network_ops/test_tanh.py b/test/test_network_ops/test_tanh.py
new file mode 100644
index 0000000000..28c8c0789f
--- /dev/null
+++ b/test/test_network_ops/test_tanh.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
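+#
+# tanh saturates quickly: for |x| beyond roughly 20 the result is within
+# floating-point rounding of +/-1, so the extreme input ranges below (e.g.
+# 21474836 or -214748) exercise the saturated region, while the small
+# fractional ranges probe the near-linear region around zero.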
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestTanh(TestCase): + def cpu_op_exec(self, input1): + output = torch.tanh(input1) + output = output.numpy() + return output + + def npu_op_exec(self, input1): + input1 = input1.to("npu") + output = torch.tanh(input1) + output = output.to("cpu") + output = output.numpy() + return output + + def test_tanh_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (4, 3, 3)], 1, 100], + [[np.float32, -1, (7,5,5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746], + [[np.float32, -1, (1024, 448, 448)], 200, 300], + [[np.float32, -1, (128, 3, 5)], 0.3219780311757745 , 92 ], + [[np.float32, -1, (8, 7, 7)], 0.4820305734500543 , 28], + [[np.float32, -1, (15, 8, 8)],0.8563874665918477 , 98], + [[np.float32, -1, (11, 6, 6)], 0.0694198357720135 , 50], + [[np.float32, -1, (24, 24, 3)], -2,-2], + [[np.float32, -1, (6, 10, 10)], 0.6447298684351989 , 95], + [[np.float32, -1, (3, 9, 9)], 0.8723538084975545 , 85], + [[np.float32, -1, (5, 5, 5)], 0.8283759153463854 , 71], + [[np.float32, -1, (5, 1, 1)], 0.24718684227306953 , 25], + [[np.float32, -1, (14, 7, 7)], 0.3989186243492233 , 7 ], + [[np.float32, -1, (4, 10, 10)], 0.7866457165672994 , 5], + [[np.float32, -1, (3, 7, 7)], 0.3793216987112159 , 39], + [[np.float32, -1, (2, 8, 8)], 0.9662927186969077 , 5 ], + [[np.float32, -1, (3, 7, 7)], 0.9956475043306917 , 28], + [[np.float32, -1, (7, 10, 10)], 0.769565434387681 , 9], + [[np.float32, -1, (54, 93, 3)],0.6447298684351989 , 95], + [[np.float32, -1, (6, 3, 3)], 0.03133650248813469 , 37 ], + [[np.float32, -1, (65500, 1, 1)], 95, 100], + [[np.float32, -1, (6, 3, 10)], 0.03133650248813469 , 37], + + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1): + input1 = input1.to(torch.float32) + output = torch.tanh(input1) + output = output.numpy() + output = output.astype(np.float16) + return output + + shape_format = [ + [[np.float16, -1, (65500, 1)], 212,225], + [[np.float16, -1, (1024,448,448)], 200, 300], + [[np.float16, -1, (16,16)], -1000, -100], + [[np.float16, -1, (4,1)], -1.1754943508e-38,-1.1754943508e-38], + [[np.float16, -1, (7, 5, 5)], 21474836,21474837], + [[np.float16, -1, (4, 44, 44)], 3450,34020], + [[np.float16, -1, (65500, 3, 3)], -214748,-214746], + [[np.float16, -1, (64, 4, 4)], -9.313225746154785e-10,9.313225746154785e-10], + [[np.float16, -1, (128, 3, 5)], -0.000000000000000000000000000000000000011754943508,0.000000000000000000000000000000000000011754943508], + [[np.float16, -1, (1, 1, 1)], 0.9283381566708346 , 16], + [[np.float16, -1, (6, 3, 10)], 0.03133650248813469 , 37], + [[np.float16, -1, (65500, 1, 1)], 95, 100 ], + [[np.float16, -1, (13, 5, 5)], 0.9790231845699171 , 41], + [[np.float16, -1, (5, 7, 7)], 0.7852605507867441 , 87 ], + [[np.float16, -1, (13, 2, 2)],0.8758750778305631 , 82], + [[np.float16, -1, (14, 6, 6)],0.6963691068720794 , 92], + [[np.float16, -1, (5, 6, 6)], 0.7570129172808612 , 21], + [[np.float16, -1, (1, 10, 10)], 0.990800730328874 , 86], + 
[[np.float16, -1, (4, 5, 5)], 0.7349293532899402 , 35], + [[np.float16, -1, (6, 4, 4)], 0.7349293532899402, 35], + [[np.float16, -1, (5, 8, 8)],0.9583309378850908 , 60], + + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_exec_fp16(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_inplace_common_shape_format(self, device): + def cpu_op_inplace_exec(input1): + output = torch.tanh_(input1) + output = output.numpy() + return output + + def npu_op_inplace_exec(input1): + input1 = input1.to("npu") + output = torch.tanh_(input1) + output = output.to("cpu") + output = output.numpy() + return output + + shape_format = [ + [[np.float32, -1, (4, 3, 3)], 1, 100], + [[np.float32, -1, (7,5,5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746] + + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_inplace_exec(cpu_input1) + npu_output = npu_op_inplace_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestTanh, globals(), except_for='cpu') + +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_tanh_backward.py b/test/test_network_ops/test_tanh_backward.py new file mode 100644 index 0000000000..1e108d1960 --- /dev/null +++ b/test/test_network_ops/test_tanh_backward.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
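test_tanh_backward, added next, validates the gradient through autograd. The TanhGrad
kernel later in this patch needs only the forward output, exploiting the identity
d tanh(x)/dx = 1 - tanh(x)^2. A quick CPU-only check of that identity:

    import torch

    x = torch.randn(8, requires_grad=True)
    y = torch.tanh(x)
    y.backward(torch.ones_like(y))
    # grad_output is all ones here, so x.grad equals 1 - tanh(x)^2
    print(torch.allclose(x.grad, 1 - y.detach() ** 2))  # True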
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestTanhBackward(TestCase): + + def cpu_op_exec(self, input1): + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad.numpy() + return output + + def npu_op_exec(self, input1): + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad + output = output.to("cpu") + output = output.numpy() + return output + + def test_tanh_backward_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (4, 3)], 1, 100], + [[np.float32, -1, (7, 5, 5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746], + [[np.float32, -1, (1024, 448, 448)], 200, 300], + [[np.float32, -1, (24, 24, 3)], -2,-2], + [[np.float32, -1, (3, 7, 7)], 0.3793216987112159, 1], + [[np.float32, -1, (2, 8, 8)], 0.9662927186969077, 1], + [[np.float32, -1, (3, 7, 7)], 0.9956475043306917, 2], + [[np.float32, -1, (7, 10, 10)], 0.769565434387681, 3], + [[np.float32, -1, (65500, 1, 1)], 95, 100], + [[np.float32, -1, (6, 3, 10)], 0.03133650248813469 , 2], + [[np.float32, -1, (4, 3, 3, 3, 3, 3, 3, 3)], 0, 1], + [[np.float32, -1, (5,)], 0, 1], + [[np.float32, -1, (5,5,5,5,5,5)], 1, 2], + [[np.float32, -1, (5,5,5,5,5,5)], 2, 3], + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_backward_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1): + input1 = input1.to(torch.float32) + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad.numpy() + output = output.astype(np.float16) + return output + + shape_format = [ + [[np.float16, -1, (65500, 1)], 212, 225], + [[np.float16, -1, (1024, 448, 448)], 200, 300], + [[np.float16, -1, (16, 16)], -1000, -100], + [[np.float16, -1, (4, 1)], -1.1754943508e-38, -1.1754943508e-38], + [[np.float16, -1, (7, 5, 5)], 21474836, 21474837], + [[np.float16, -1, (4, 44, 44)], 3450, 34020], + [[np.float16, -1, (65500, 3, 3)], -214748, -214746], + [[np.float16, -1, (64, 4, 4)], -9.313225746154785e-10,9.313225746154785e-10], + [[np.float16, -1, (128, 3, 5)], + -0.000000000000000000000000000000000000011754943508, + 0.000000000000000000000000000000000000011754943508], + [[np.float16, -1, (65500, 1, 1)], 95, 100], + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_exec_fp16(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestTanhBackward, globals(), except_for='cpu') + +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_uniform_.py b/test/test_network_ops/test_uniform_.py new file mode 100644 index 0000000000..893adf140e --- /dev/null +++ b/test/test_network_ops/test_uniform_.py @@ -0,0 +1,50 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. 
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestUniform(TestCase):
+    def test_uniform(self, device):
+        shape_format = [
+            [(20,300), -100, 100, torch.float32],
+            [(20,300), -100, 100, torch.float16]
+        ]
+
+        for item in shape_format:
+            input1 = torch.zeros(item[0], dtype=item[3]).npu()
+            input1.uniform_(item[1], item[2])
+            self.assertTrue(item[1] <= input1.min())
+            self.assertTrue(item[2] >= input1.max())
+
+    def test_uniform_trans(self, device):
+        shape_format = [
+            [(20,300), -100, 100, torch.float32],
+        ]
+
+        for item in shape_format:
+            input1 = torch.zeros(item[0], dtype=item[3]).npu()
+            input1 = input1.npu_format_cast(3)
+            input1.uniform_(item[1], item[2])
+            self.assertTrue(item[1] <= input1.min())
+            self.assertTrue(item[2] >= input1.max())
+
+
+instantiate_device_type_tests(TestUniform, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch_npu/csrc/aten/ops/GeKernelNpu.cpp b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp
new file mode 100644
index 0000000000..65448746f6
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp
@@ -0,0 +1,145 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
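Because uniform_ draws random values, test_uniform_ above can only assert range
containment rather than exact results; note that UniformKernelNpu.cpp later in this
patch does not consume the generator argument, so draws come from the device-side
Uniform operator. A sketch of the containment check (assumes a reachable NPU device):

    import torch
    import torch_npu

    t = torch.zeros(20, 300, dtype=torch.float32).npu()
    t.uniform_(-100, 100)  # in-place draw from U[-100, 100)
    assert -100 <= t.min().item() and t.max().item() <= 100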
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& ge_out_npu_nocheck(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor selfCast = self; + at::Tensor otherCast = other; + if (self.dtype() == at::ScalarType::Int || other.dtype() == at::ScalarType::Int + || self.dtype() == at::ScalarType::Bool || other.dtype() == at::ScalarType::Bool) { + selfCast = self.to(at::ScalarType::Float); + otherCast = other.to(at::ScalarType::Float); + } + auto unified_result = OpPreparation::comparison_op_check(result, selfCast, otherCast, true); + OpCommand cmd; + cmd.Name("GreaterEqual") + .Expect(unified_result) + .Input(selfCast) + .Input(otherCast) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ge_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor& ge_out_npu_nocheck(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor selfCast = self; + if (self.dtype() == at::ScalarType::Int || self.dtype() == at::ScalarType::Bool) { + selfCast = self.to(at::ScalarType::Float); + } + OpCommand cmd; + cmd.Name("GreaterEqual") + .Input(selfCast) + .Input(other, selfCast.scalar_type()) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ge_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + auto outputSize = formatCastOfSelf.sizes(); + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + ge_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + outputSize, + formatCastOfSelf.options().dtype(kBool), + ACL_FORMAT_ND); + ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, Scalar other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + formatCastOfSelf.sizes(), + formatCastOfSelf.options().dtype(kBool), + ACL_FORMAT_ND); + ge_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor& NPUNativeFunctions::ge_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, ori_other}, {self}); + + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = 
NpuUtils::format_contiguous(self);
+    ge_out_npu_nocheck(contiguousSelf, ori_other, result);
+  } else {
+    ge_out_npu_nocheck(self, ori_other, result);
+  }
+  self.copy_(result);
+  return self;
+}
+
+at::Tensor& NPUNativeFunctions::ge_(at::Tensor& self, at::Scalar other) {
+  OpPreparation::CastBackToOriFormat(self);
+  OpPreparation::CheckMemory({self}, {self});
+  at::Tensor result = OpPreparation::ApplyTensor(
+      self,
+      self.options().dtype(at::ScalarType::Byte));
+  if (!NpuUtils::check_match(&self)) {
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    ge_out_npu_nocheck(contiguousSelf, other, result);
+  } else {
+    ge_out_npu_nocheck(self, other, result);
+  }
+  self.copy_(result);
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
new file mode 100644
index 0000000000..72d7984b0e
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
@@ -0,0 +1,74 @@
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/framework/utils/NpuUtils.h"
+#include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& index_out_nocheck_npu(
+    const at::Tensor& self,
+    const at::Tensor& masksTensor,
+    const at::TensorList& allDefinedIndices,
+    at::Tensor& result) {
+  OpCommand cmd;
+  cmd.Name("Index")
+      .Input(self)
+      .Input(masksTensor);
+  for (int i = 0; i < allDefinedIndices.size(); i++) {
+    cmd.Input(allDefinedIndices[i]);
+  }
+  cmd.Output(result)
+      .Run();
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::index(const at::Tensor& self, const torch::List<c10::optional<at::Tensor>>& orig) {
+  checkIndexTensorTypes(orig);
+  // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
+  auto indices = expandTensors(self, orig);
+  at::Tensor formatCastOfSelf = self.npu_format_cast(ACL_FORMAT_ND);
+
+  // calculate the output size
+  auto outputSize = index_npu_output_size(formatCastOfSelf, indices);
+
+  // construct the output tensor of the NPU
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(formatCastOfSelf, outputSize, ACL_FORMAT_ND);
+
+  // masks corresponds to indices. 0 indicates undefined tensor.
+  SmallVector<int64_t, N> masks;
+  std::vector<at::Tensor> allDefinedIndices;
+  for (int64_t i = 0; i < indices.size(); i++) {
+    if (indices[i].defined()) {
+      masks.emplace_back(1);
+      allDefinedIndices.emplace_back(indices[i]);
+    } else {
+      masks.emplace_back(0);
+    }
+  }
+
+  at::Tensor masksTensor = CalcuOpUtil::copy_tensor_host_to_device(
+      from_blob(masks.data(), {masks.size()}, dtype(at::ScalarType::Long)));
+
+  // calculate the output result of the NPU
+  index_out_nocheck_npu(formatCastOfSelf, masksTensor, allDefinedIndices, result);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/LeKernelNpu.cpp b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp
new file mode 100644
index 0000000000..dfd9c4d584
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp
@@ -0,0 +1,134 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& le_out_npu_nocheck(const at::Tensor& self, at::Scalar other, at::Tensor& result) {
+  OpCommand cmd;
+  cmd.Name("LessEqual")
+      .Input(self)
+      .Input(other, self.scalar_type())
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::le_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  auto outputSize = formatCastOfSelf.sizes();
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      ACL_FORMAT_ND,
+      result.scalar_type(),
+      outputSize);
+
+  le_out_npu_nocheck(formatCastOfSelf, other, result);
+  return result;
+}
+
+at::Tensor& le_out_npu_nocheck(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) {
+  auto unified_result = OpPreparation::comparison_op_check(result, self, other, true);
+  OpCommand cmd;
+  cmd.Name("LessEqual")
+      .Expect(unified_result)
+      .Input(self)
+      .Input(other)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::le_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other);
+  auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
+
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      ACL_FORMAT_ND,
+      result.scalar_type(),
+      outputSize);
+
+  le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result);
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::le(const at::Tensor& self, at::Scalar other) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
+      formatCastOfSelf.sizes(),
+      formatCastOfSelf.options().dtype(kBool),
+      ACL_FORMAT_ND);
+  le_out_npu_nocheck(formatCastOfSelf, other, result);
+  return result;
+} + +at::Tensor NPUNativeFunctions::le(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + outputSize, + formatCastOfSelf.options().dtype(kBool), + ACL_FORMAT_ND); + + le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor& NPUNativeFunctions::le_(at::Tensor& self, at::Scalar other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckMemory({self}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + le_out_npu_nocheck(contiguousSelf, other, result); + } else { + le_out_npu_nocheck(self, other, result); + } + self.copy_(result); + return self; +} + +at::Tensor& NPUNativeFunctions::le_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, ori_other}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + le_out_npu_nocheck(contiguousSelf, ori_other, result); + } else { + le_out_npu_nocheck(self, ori_other, result); + } + self.copy_(result); + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp new file mode 100644 index 0000000000..847896ec3a --- /dev/null +++ b/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp @@ -0,0 +1,117 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
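Returning to the Index kernel added earlier in this patch: masks records, per position
of the index list, whether a real index tensor was supplied (1) or the dimension was
skipped (0), so x[:, idx] is encoded as masks = [0, 1] with allDefinedIndices = [idx].
The eager-mode behaviour being reproduced (CPU-only sketch):

    import torch

    x = torch.arange(12).reshape(3, 4)
    idx = torch.tensor([0, 2])
    # dim 0 has no index tensor (mask 0); dim 1 is indexed by idx (mask 1)
    print(torch.equal(x[:, idx], x.index_select(1, idx)))  # True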
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& NPUNativeFunctions::masked_fill_out(const at::Tensor& self, const at::Tensor& mask, const at::Tensor& value, at::Tensor& result) { + OpPreparation::CheckOut( + {self}, + result, + self); + at::Tensor maskBool = mask; + int64_t dimOfSelf = self.dim(); + + /* Avoid the problem that the TBE operator does not support 0-dimensional tensor input */ + if (dimOfSelf == 0) { + self.unsqueeze_(0); + } + + if ((mask.dtype() != at::kBool)) { + maskBool = mask.to(at::kBool); + } + at::Tensor valueTensor = value; + if (value.dtype() != self.dtype()) { + valueTensor = valueTensor.to(self.dtype()); + } + + OpCommand cmd; + cmd.Name("MaskedFill") + .Input(self) + .Input(maskBool) + .Input(valueTensor) + .Output(result) + .Run(); + + if (dimOfSelf == 0) { + result.squeeze_(0); + } + + return result; +} + +at::Tensor& NPUNativeFunctions::masked_fill_out(const at::Tensor& self, const at::Tensor& mask, at::Scalar value, at::Tensor& result) { + OpPreparation::CheckOut( + {self}, + result, + CalcuOpUtil::get_tensor_npu_format(self), + self.scalar_type(), + self.sizes()); + at::Tensor maskBool = mask; + int64_t dimOfSelf = self.dim(); + + /* Avoid the problem that the TBE operator does not support 0-dimensional tensor input */ + if (dimOfSelf == 0) { + self.unsqueeze_(0); + } + + if (!(mask.dtype() == at::kBool)) { + maskBool = mask.to(at::kBool); + } + + OpCommand cmd; + cmd.Name("MaskedFill") + .Input(self) + .Input(maskBool) + .Input(value, self.scalar_type()) + .Output(result) + .Run(); + + if (dimOfSelf == 0) { + result.squeeze_(0); + } + return result; +} + +at::Tensor& NPUNativeFunctions::masked_fill_(at::Tensor& self, const at::Tensor& mask, const at::Tensor& value) { + // OpPreparation::CheckMemory({self, mask, value}, {self}); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + at::Tensor result = masked_fill_out(contiguousSelf, mask, value, contiguousSelf); + self.copy_(result); + } else { + masked_fill_out(self, mask, value, self); + } + return self; +} + +at::Tensor& NPUNativeFunctions::masked_fill_(at::Tensor& self, const at::Tensor& mask, at::Scalar value) { + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + at::Tensor result = masked_fill_out(contiguousSelf, mask, value, contiguousSelf); + self.copy_(result); + } else { + masked_fill_out(self, mask, value, self); + } + + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/NeKernelNpu.cpp b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp new file mode 100644 index 0000000000..d26ae3c535 --- /dev/null +++ b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, const at::Tensor& other) {
+  at::Tensor selfCast = self;
+  at::Tensor otherCast = other;
+  if (self.dtype() == at::ScalarType::Int || other.dtype() == at::ScalarType::Int) {
+    selfCast = self.to(at::ScalarType::Float);
+    otherCast = other.to(at::ScalarType::Float);
+  }
+  auto unified_result = OpPreparation::comparison_op_check(result, selfCast, otherCast, true);
+  if (self.scalar_type() == at::kLong) {
+    TORCH_WARN_ONCE("The operator ne was executed with an int64 input; a high-accuracy but "
+        "low-performance 64-bit implementation is currently used. Please cast to a 32-bit "
+        "dtype on the Python side for better performance!");
+  }
+  OpCommand cmd;
+  cmd.Name("NotEqual")
+      .Expect(unified_result)
+      .Input(selfCast)
+      .Input(otherCast)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, at::Scalar other) {
+  at::Tensor selfCast = self;
+  if (self.dtype() == at::ScalarType::Int) {
+    selfCast = self.to(at::ScalarType::Float);
+  }
+  if (self.scalar_type() == at::kLong) {
+    TORCH_WARN_ONCE("The operator ne was executed with an int64 input; a high-accuracy but "
+        "low-performance 64-bit implementation is currently used. Please cast to a 32-bit "
+        "dtype on the Python side for better performance!");
+  }
+  OpCommand cmd;
+  cmd.Name("NotEqual")
+      .Input(selfCast)
+      .Input(other, selfCast.scalar_type())
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other);
+  auto outputSize = broadcast_ops_npu_output_size(self, other);
+  OpPreparation::CheckOut(
+      {self, other},
+      result,
+      CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf),
+      ScalarType::Bool,
+      IntArrayRef(outputSize));
+  ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther);
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf),
+      ScalarType::Bool,
+      formatCastOfSelf.sizes());
+  ne_out_npu_nocheck(result, formatCastOfSelf, other);
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, const at::Tensor& other) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other);
+
+  auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther);
+  at::Tensor result = OpPreparation::ApplyTensor(
+      outputSize,
+      formatCastOfSelf.options().dtype(kBool),
+      formatCastOfSelf);
+
+  ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther);
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, at::Scalar other) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+
+  at::Tensor result = OpPreparation::ApplyTensor(
+      formatCastOfSelf,
+      
formatCastOfSelf.options().dtype(kBool));
+
+  ne_out_npu_nocheck(result, formatCastOfSelf, other);
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::ne_(at::Tensor& self, const at::Tensor& other) {
+  OpPreparation::CastBackToOriFormat(self);
+  at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other);
+  OpPreparation::CheckMemory({self, ori_other}, {self});
+
+  at::Tensor result = OpPreparation::ApplyTensor(
+      self,
+      self.options().dtype(at::ScalarType::Byte));
+
+  if (!NpuUtils::check_match(&self)) {
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    ne_out_npu_nocheck(result, contiguousSelf, ori_other);
+  } else {
+    ne_out_npu_nocheck(result, self, ori_other);
+  }
+
+  self.copy_(result);
+
+  return self;
+}
+
+at::Tensor& NPUNativeFunctions::ne_(at::Tensor& self, at::Scalar other) {
+  OpPreparation::CastBackToOriFormat(self);
+  OpPreparation::CheckMemory({self}, {self});
+  at::Tensor result = OpPreparation::ApplyTensor(
+      self,
+      self.options().dtype(at::ScalarType::Byte));
+
+  if (!NpuUtils::check_match(&self)) {
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    ne_out_npu_nocheck(result, contiguousSelf, other);
+  } else {
+    ne_out_npu_nocheck(result, self, other);
+  }
+
+  self.copy_(result);
+
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp
new file mode 100644
index 0000000000..c2f86b65a6
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp
@@ -0,0 +1,63 @@
+// Copyright (c) 2020, Huawei Technologies.
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
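As in the ge kernel, ne temporarily promotes int32 inputs to float before invoking
NotEqual, and warns on int64 input; the comparison result itself is boolean. Expected
eager behaviour (assumes a reachable NPU device):

    import torch
    import torch_npu

    a = torch.tensor([1, 2, 3], dtype=torch.int32).npu()
    b = torch.tensor([1, 0, 3], dtype=torch.int32).npu()
    out = torch.ne(a, b)
    print(out.dtype)  # torch.bool
    print(out.cpu())  # tensor([False,  True, False])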
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& nonzero_out_npu_nocheck(at::Tensor& result, const at::Tensor& self) { + OpCommand cmd; + cmd.Name("NonZero") + .Input(self) + .Output(result) + .Attr("transpose", false) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::nonzero_out(const at::Tensor& self, at::Tensor& result) { + auto outputSize = nonzero_npu_output_size(self); + OpPreparation::CheckOut( + {self}, + result, + CalcuOpUtil::get_tensor_npu_format(self), + ScalarType::Long, + outputSize); + + OpPipeWithDefinedOut pipe; + return pipe.CheckMemory({self}, {result}) + .Func([&self](at::Tensor& result){nonzero_out_npu_nocheck(result, self);}) + .Call(result); +} + +at::Tensor NPUNativeFunctions::nonzero(const at::Tensor& self) { + // calculate the output size + auto outputSize = nonzero_npu_output_size(self); + + // construct the output tensor of the NPU + at::Tensor result = OpPreparation::ApplyTensor( + outputSize, self.options().dtype(at::kLong), self); + + // calculate the output result of the NPU + nonzero_out_npu_nocheck(result, self); + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp b/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp new file mode 100644 index 0000000000..4c8200b577 --- /dev/null +++ b/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp @@ -0,0 +1,96 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor rsub_dest_output(const at::Tensor& self, const at::Tensor& other) { + bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self); + + return isSelfWrapped ? 
other : self;
+}
+
+at::Tensor& rsub_out_npu_nocheck(
+    at::Tensor& result,
+    const at::Tensor& self,
+    const at::Tensor& other,
+    at::Scalar alpha) {
+  // scale self by alpha (rsub computes other - alpha * self)
+  at::Tensor otherMulResult;
+  if (!CalcuOpUtil::is_scalar_one(alpha)) {
+    otherMulResult = at::mul(self, alpha);
+  }
+
+  OpCommand cmd;
+  if (otherMulResult.defined()) {
+    cmd.Name("Sub")
+        .Input(other)
+        .Input(otherMulResult)
+        .Output(result)
+        .Run();
+  } else {
+    cmd.Name("Sub")
+        .Input(other)
+        .Input(self)
+        .Output(result)
+        .Run();
+  }
+
+  return result;
+}
+
+at::Tensor& rsub_out_npu_nocheck(
+    at::Tensor& result,
+    const at::Tensor& self,
+    at::Scalar other,
+    at::Scalar alpha) {
+  // scale self by alpha, then subtract it from the scalar other
+  at::Tensor scalarValue(at::mul(self, alpha));
+
+  OpCommand cmd;
+  cmd.Name("Sub")
+      .Input(other, self.scalar_type())
+      .Input(scalarValue)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::rsub(const at::Tensor& self, const at::Tensor& other, at::Scalar alpha) {
+  at::Tensor outputTensor = rsub_dest_output(self, other);
+  auto outputSize = broadcast_ops_npu_output_size(self, other);
+
+  at::Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize);
+
+  rsub_out_npu_nocheck(result, self, other, alpha);
+
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::rsub(const at::Tensor& self, at::Scalar other, at::Scalar alpha) {
+  at::Tensor result = OpPreparation::ApplyTensor(self);
+
+  rsub_out_npu_nocheck(result, self, other, alpha);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp
new file mode 100644
index 0000000000..e0abc60d25
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp
@@ -0,0 +1,54 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
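rsub_dest_output above exists because a Python scalar wrapped into a 0-d tensor must
not dictate the output's metadata; the result follows the real tensor operand, matching
eager type promotion:

    import torch

    t = torch.tensor([0.5, 1.5], dtype=torch.float16)
    # 1 - t dispatches to rsub(t, 1); the wrapped scalar adopts the tensor dtype
    print((1 - t).dtype)  # torch.float16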
+ +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& tanh_backward_out_npu_nocheck( + at::Tensor& result, + const at::Tensor& grad_output, + const at::Tensor& self) { + OpCommand cmd; + cmd.Name("TanhGrad") + .Input(self) + .Input(grad_output) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::tanh_backward_out( + const at::Tensor& grad_output, + const at::Tensor& self, + at::Tensor& result) { + OpPreparation::CheckOut({grad_output, self}, result, self); + tanh_backward_out_npu_nocheck(result, grad_output, self); + return result; +} + +at::Tensor NPUNativeFunctions::tanh_backward(const at::Tensor& grad_output, const at::Tensor& self) { + at::Tensor result = OpPreparation::ApplyTensor(self); + tanh_backward_out_npu_nocheck(result, grad_output, self); + + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp b/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp new file mode 100644 index 0000000000..5ec2311735 --- /dev/null +++ b/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp @@ -0,0 +1,55 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& NPUNativeFunctions::tanh_out(const at::Tensor& self, at::Tensor& result) { + OpCommand cmd; + cmd.Name("Tanh") + .Input(self) + .Output(result) + .Run(); + + return result; +} + +at::Tensor NPUNativeFunctions::tanh(const at::Tensor& self) { + at::Tensor result = OpPreparation::ApplyTensor(self); + // calculate the output result of the NPU + NPUNativeFunctions::tanh_out(self, result); + + return result; +} + +at::Tensor& NPUNativeFunctions::tanh_(at::Tensor& self) { + OpPreparation::CheckMemory({self}, {self}); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + at::Tensor result = NPUNativeFunctions::tanh_out(contiguousSelf, contiguousSelf); + NpuUtils::format_fresh_view(self, result); + } else { + NPUNativeFunctions::tanh_out(self, self); + } + + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp new file mode 100644 index 0000000000..3dd2834a35 --- /dev/null +++ b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp @@ -0,0 +1,58 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
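tanh_ above follows the repository's standard in-place pattern: a tensor that fails
check_match is materialized with format_contiguous, the kernel writes into that copy,
and format_fresh_view propagates the data back. Conceptually, ignoring NPU formats:

    import torch

    t = torch.randn(4, 4).t()  # a non-contiguous view
    # compute on a contiguous copy, then write the result back into the view
    t.copy_(torch.tanh(t.contiguous()))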
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& uniform_out_npu(
+    const at::Tensor& self,
+    double from,
+    double to,
+    c10::optional<at::Generator> gen_,
+    at::Tensor& result) {
+  OpCommand cmd;
+  cmd.Name("Uniform")
+      .Input(self)
+      .Output(result)
+      .Attr("from", static_cast<float>(from))
+      .Attr("to", static_cast<float>(to))
+      .Run();
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::uniform_(at::Tensor& self, double from, double to, c10::optional<at::Generator> gen_) {
+  // TODO(Ascend): The operator needs to use fp32 for calculation.
+  at::Tensor selfCopy = self;
+  if (self.scalar_type() == at::ScalarType::Half) {
+    selfCopy = self.to(at::ScalarType::Float);
+  }
+
+  if (!NpuUtils::check_match(&selfCopy)) {
+    at::Tensor selfContiguous = NpuUtils::format_contiguous(selfCopy);
+    at::Tensor result = uniform_out_npu(selfContiguous, from, to, gen_, selfContiguous);
+    NpuUtils::format_fresh_view(selfCopy, result);
+  } else {
+    uniform_out_npu(selfCopy, from, to, gen_, selfCopy);
+  }
+  self.copy_(selfCopy);
+
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
--
Gitee

From 388945c11ff7811aca54d66d2481282cc8981905 Mon Sep 17 00:00:00 2001
From: wangxiao
Date: Wed, 9 Feb 2022 19:24:07 +0800
Subject: [PATCH 2/4] fix bugs

---
 test/test_network_ops/test_ge.py             |  38 ++--
 test/test_network_ops/test_index.py          | 204 ------------------
 test/test_network_ops/test_le.py             |  36 ++--
 test/test_network_ops/test_masked_fill.py    | 145 -------------
 test/test_network_ops/test_ne.py             |  23 +-
 test/test_network_ops/test_nonzero.py        |   1 -
 torch_npu/csrc/aten/ops/GeKernelNpu.cpp      |   6 +-
 torch_npu/csrc/aten/ops/IndexKernelNpu.cpp   |  74 -------
 torch_npu/csrc/aten/ops/LeKernelNpu.cpp      |   4 +-
 .../csrc/aten/ops/MaskedFillKernelNpu.cpp    | 117 ----------
 torch_npu/csrc/aten/ops/NeKernelNpu.cpp      |  12 +-
 torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp |   2 +-
 12 files changed, 59 insertions(+), 603 deletions(-)
 delete mode 100644 test/test_network_ops/test_index.py
 delete mode 100644 test/test_network_ops/test_masked_fill.py
 delete mode 100644 torch_npu/csrc/aten/ops/IndexKernelNpu.cpp
 delete mode 100644 torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp

diff --git a/test/test_network_ops/test_ge.py b/test/test_network_ops/test_ge.py
index ff5ea46a02..1afd23a188 100644
--- a/test/test_network_ops/test_ge.py
+++ b/test/test_network_ops/test_ge.py
@@ -107,7 +107,6 @@ class TestGe(TestCase):
             cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
             npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
             cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-
             self.assertRtolEqual(cpu_output_out, npu_output_out)
 
     def test_ge_tensor_out(self, device):
@@ -163,8 +162,9 @@ class TestGe(TestCase):
             npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1])
             cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50)
             npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50)
-            self.assertEqual(cpu_output1, npu_output1)
-            self.assertEqual(cpu_output2, npu_output2)
+
+            
self.assertRtolEqual(cpu_output1, npu_output1) + self.assertRtolEqual(cpu_output2, npu_output2) def test_ge_scalar_float32(self, device): format_list = [0] @@ -177,7 +177,7 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_scalar_float16(self, device): format_list = [0] @@ -191,8 +191,7 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_scalar_int32(self, device): format_list = [0] @@ -205,7 +204,7 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_tensor_float32(self, device): format_list = [0] @@ -217,7 +216,7 @@ class TestGe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_tensor_float16(self, device): format_list = [0] @@ -231,8 +230,7 @@ class TestGe(TestCase): cpu_input2 = cpu_input2.to(torch.float32) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_inplace_float32(self, device): format_list = [0] @@ -244,7 +242,7 @@ class TestGe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_inplace_float16(self, device): format_list = [0, 3] @@ -258,8 +256,8 @@ class TestGe(TestCase): cpu_input2 = cpu_input2.to(torch.float32) cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_inplace_scalar_float32(self, device): format_list = [0] @@ -272,7 +270,7 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ge_inplace_scalar_float16(self, device): format_list = [0] @@ -286,14 +284,14 @@ class TestGe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + cpu_output = cpu_output.astype(npu_output.dtype) 
+ self.assertRtolEqual(cpu_output, npu_output) def test_ge_mix_dtype(self, device): - npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) - npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) - cpu_output = self.cpu_op_exec(npu_input1, npu_input3) - npu_output = self.npu_op_exec(npu_input2, npu_input4) + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) instantiate_device_type_tests(TestGe, globals(), except_for="cpu") diff --git a/test/test_network_ops/test_index.py b/test/test_network_ops/test_index.py deleted file mode 100644 index 723706ccde..0000000000 --- a/test/test_network_ops/test_index.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import torch -import torch_npu -import numpy as np - -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor - -class TestIndex(TestCase): - def generate_index_data_bool(self, shape): - cpu_input = torch.randn(shape)>0 - npu_input = cpu_input.to("npu") - return cpu_input, npu_input - - def cpu_op_exec(self, input1, index): - output = input1[index] - output = output.numpy() - return output - - def npu_op_exec(self, input1, index): - output = input1[index] - output = output.to("cpu") - output = output.numpy() - return output - - def cpu_op_exec_ellip(self, input1, index): - output = input1[index, ..., index] - output = output.numpy() - return output - - def npu_op_exec_ellip(self, input1, index): - output = input1[index, ..., index] - output = output.cpu().numpy() - return output - - def cpu_op_exec_semi(self, input1, index): - output = input1[index, :, index] - output = output.numpy() - return output - - def npu_op_exec_semi(self, input1, index): - output = input1[index, :, index] - output = output.cpu().numpy() - return output - - def test_index_ellip(self, device): - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[5, 256, 256, 100]] - shape_format_tensor = [ - [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 0, 2) - cpu_output = self.cpu_op_exec_ellip(cpu_input1, cpu_index1) - npu_output = self.npu_op_exec_ellip(npu_input1, npu_index1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_semi(self, device): - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[5, 256, 256, 100]] - shape_format_tensor = [ - [[i, 
j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 0, 2) - cpu_output = self.cpu_op_exec_semi(cpu_input1, cpu_index1) - npu_output = self.npu_op_exec_semi(npu_input1, npu_index1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_tensor(self, device): - #test index is tensor - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_tensor = [ - [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) - cpu_output = self.cpu_op_exec(cpu_input1, cpu_index1) - npu_output = self.npu_op_exec(npu_input1, npu_index1) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_tensor_x(self, device): - #test index is [tensor, x] , (x=1,bool,range) - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_tensor = [ - [[i, j, k], [np.int64, 0, (1,2)]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) - for i in [1, range(2), True]: - cpu_output = self.cpu_op_exec(cpu_input1, (cpu_index1, i)) - npu_output = self.npu_op_exec(npu_input1, (npu_index1, i)) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_tensor_tensor(self, device): - #test index is [tensor, tensor] - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 1000]] - shape_format_multiTensor = [ - [[i, j, k], [np.int64, 0, [1,2]]] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_multiTensor: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index1, npu_index1 = create_common_tensor(item[1], 1, 3) - cpu_index2, npu_index2 = create_common_tensor(item[1], 1, 3) - cpu_output = self.cpu_op_exec(cpu_input1, (cpu_index1,cpu_index2)) - npu_output = self.npu_op_exec(npu_input1, (npu_index1, npu_index2)) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_list(self, device): - #test index is list - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_list = [ - [[i, j, k], (0,1)] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_list: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = self.cpu_op_exec(cpu_input1, item[1]) - npu_output = self.npu_op_exec(npu_input1, item[1]) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_list_x(self, device): - #test index is [list, x], (x=1,bool,range) - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_list = [ - [[i, j, k], (0,1)] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_list: - 
cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - for i in [1, range(2), (0, 1), True]: - cpu_output = self.cpu_op_exec(cpu_input1, (item[1], i)) - npu_output = self.npu_op_exec(npu_input1, (item[1], i)) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_tensor_bool(self, device): - #test index is bool tensor - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - shape_format_tensor_bool = [ - [[i, j, k],k] for i in dtype_list for j in format_list for k in shape_list - ] - - for item in shape_format_tensor_bool: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_index ,npu_index = self.generate_index_data_bool(item[1]) - cpu_output = self.cpu_op_exec(cpu_input1, cpu_index) - npu_output = self.npu_op_exec(npu_input1, npu_index) - self.assertRtolEqual(cpu_output, npu_output) - - def test_index_shape_format_bool_x(self, device): - #test index is [bool, x] , (x=1,bool,range) - dtype_list = [np.float32, np.float16, np.int32] - format_list = [0] - shape_list = [[256, 10], [256, 256, 100], [5, 256, 256, 100]] - index_list = [(True), (False), (True, 1), (True,range(4)), (True,False)] - shape_format_tensor_bool_list = [ - [[i, j, k], l] for i in dtype_list for j in format_list for k in shape_list for l in index_list - ] - - for item in shape_format_tensor_bool_list: - cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) - cpu_output = self.cpu_op_exec(cpu_input1, item[1]) - npu_output = self.npu_op_exec(npu_input1, item[1]) - self.assertRtolEqual(cpu_output, npu_output) - -instantiate_device_type_tests(TestIndex, globals(), except_for="cpu") -if __name__ == "__main__": - run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_le.py b/test/test_network_ops/test_le.py index b5bcbacb61..5bdb1617b7 100644 --- a/test/test_network_ops/test_le.py +++ b/test/test_network_ops/test_le.py @@ -138,7 +138,8 @@ class TestLe(TestCase): cpu_input3 = cpu_input3.to(torch.float32) cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) - cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + if cpu_input1.dtype == torch.float16: + cpu_output_out = cpu_output_out.astype(np.float16) self.assertRtolEqual(cpu_output_out, npu_output_out) @@ -165,7 +166,8 @@ class TestLe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2) npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2) - cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + if cpu_input1.dtype == torch.float16: + cpu_output_out = cpu_output_out.astype(np.float16) self.assertRtolEqual(cpu_output_out, npu_output_out) def test_le_scalar_out(self, device): @@ -191,7 +193,7 @@ class TestLe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_scalar_int32(self, device): format_list = [0] @@ -204,9 +206,9 @@ class TestLe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) - def 
test_gt_scalar_float16(self, device): + def test_le_scalar_float16(self, device): format_list = [0] shape_list = [(5, 3), (2, 3, 4)] shape_format = [ @@ -218,8 +220,7 @@ class TestLe(TestCase): scalar = self.generate_scalar(0, 100) cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_exec_scalar(npu_input, scalar) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_tensor_float32(self, device): format_list = [0] @@ -231,7 +232,7 @@ class TestLe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_tensor_float16(self, device): format_list = [0] @@ -245,8 +246,7 @@ class TestLe(TestCase): cpu_input2 = cpu_input2.to(torch.float32) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_inplace_float32(self, device): format_list = [0, 3] @@ -258,7 +258,7 @@ class TestLe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_inplace_float16(self, device): format_list = [0, 3] @@ -273,7 +273,7 @@ class TestLe(TestCase): cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_inplace_scalar_float32(self, device): format_list = [0] @@ -288,7 +288,7 @@ class TestLe(TestCase): ncpu_input = copy.deepcopy(cpu_input) cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar1) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_inplace_scalar_float16(self, device): format_list = [0] @@ -303,13 +303,13 @@ class TestLe(TestCase): cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_le_mix_dtype(self, device): - npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) - npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) - cpu_output = self.cpu_op_exec(npu_input1, npu_input3) - npu_output = self.npu_op_exec(npu_input2, npu_input4) + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) diff --git a/test/test_network_ops/test_masked_fill.py b/test/test_network_ops/test_masked_fill.py deleted file mode 100644 index 256e366b29..0000000000 --- 
a/test/test_network_ops/test_masked_fill.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import torch -import torch_npu -import numpy as np - -from torch_npu.testing.common_utils import TestCase, run_tests -from torch_npu.testing.common_device_type import instantiate_device_type_tests -from torch_npu.testing.util_test import create_common_tensor - -class TestMaskedFill(TestCase): - def create_bool_tensor(self, shape, minValue, maxValue): - input1 = np.random.uniform(minValue, maxValue, shape) - input1 = input1 > 0.5 - cpu_input = torch.from_numpy(input1) - npu_input = torch.from_numpy(input1).to("npu") - return cpu_input, npu_input - - def cpu_op_exec(self, input1, mask, value): - output = torch.masked_fill(input1, mask, value) - output = output.numpy() - return output - - def npu_op_exec(self, input1, mask, value): - output = torch.masked_fill(input1, mask, value) - output = output.to("cpu") - output = output.numpy() - return output - - def cpu_inp_op_exec(self, input1, mask, value): - output = input1.masked_fill_(mask, value) - output = output.numpy() - return output - - def npu_inp_op_exec(self, input1, mask, value): - output = input1.masked_fill_(mask, value) - output = output.to("cpu") - output = output.numpy() - return output - - def test_masked_fill_shape_format_fp16(self, device): - format_list = [0] - shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] - value_list = [1.25, - torch.tensor(1.25, dtype=torch.float32), - torch.tensor(5, dtype=torch.int32), - torch.tensor(5, dtype=torch.int64)] - - shape_format = [[[np.float16, i, j], v] for i in format_list for j in shape_list for v in value_list] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) - mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) - cpu_input1 = cpu_input1.to(torch.float32) - - cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) - cpu_output1 = cpu_output1.astype(npu_output1.dtype) - self.assertRtolEqual(cpu_output1, npu_output1) - - cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) - cpu_output2 = cpu_output2.astype(npu_output2.dtype) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_masked_fill_shape_format_fp32(self, device): - format_list = [0] - shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] - value_list = [1.25, - torch.tensor(1.25, dtype=torch.float32), - torch.tensor(5, dtype=torch.int32), - torch.tensor(5, dtype=torch.int64)] - - shape_format = [[[np.float32, i, j], v] for i in format_list for j in shape_list for v in value_list] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) - mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) - - cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) - 
npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) - self.assertRtolEqual(cpu_output1, npu_output1) - - cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_masked_fill_shape_format_int32(self, device): - format_list = [0] - shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] - value_list = [1.25, - torch.tensor(1.25, dtype=torch.float32), - torch.tensor(5, dtype=torch.int32), - torch.tensor(5, dtype=torch.int64)] - - shape_format = [[[np.int32, i, j], v] for i in format_list for j in shape_list for v in value_list] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) - mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) - - cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) - self.assertRtolEqual(cpu_output1, npu_output1) - - cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_masked_fill_shape_format_int64(self, device): - format_list = [0] - shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]] - value_list = [1.25, - torch.tensor(1.25, dtype=torch.float32), - torch.tensor(5, dtype=torch.int32), - torch.tensor(5, dtype=torch.int64)] - - shape_format = [[[np.int64, i, j], v] for i in format_list for j in shape_list for v in value_list] - for item in shape_format: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) - mask_cpu, mask_npu = self.create_bool_tensor(item[0][2], 0, 1) - - cpu_output1 = self.cpu_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output1 = self.npu_op_exec(npu_input1, mask_npu, item[1]) - cpu_output1 = cpu_output1.astype(np.int32) - npu_output1 = npu_output1.astype(np.int32) - self.assertRtolEqual(cpu_output1, npu_output1) - - cpu_output2 = self.cpu_inp_op_exec(cpu_input1, mask_cpu, item[1]) - npu_output2 = self.npu_inp_op_exec(npu_input1, mask_npu, item[1]) - cpu_output2 = cpu_output2.astype(np.int32) - npu_output2 = npu_output2.astype(np.int32) - self.assertRtolEqual(cpu_output2, npu_output2) - -instantiate_device_type_tests(TestMaskedFill, globals(), except_for="cpu") -if __name__ == "__main__": - run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_ne.py b/test/test_network_ops/test_ne.py index b22864a65e..a403a208d6 100644 --- a/test/test_network_ops/test_ne.py +++ b/test/test_network_ops/test_ne.py @@ -50,7 +50,7 @@ class TestNe(TestCase): cpu_input2, npu_input2 = create_common_tensor(item, 1, 100) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output) def test_ne_shape_format_fp16(self, device): dtype_list = [np.float16] @@ -67,9 +67,8 @@ class TestNe(TestCase): if cpu_input1.dtype == torch.float16: cpu_input2 = cpu_input2.to(torch.float32) cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) - npu_output = self.npu_op_exec(npu_input1, npu_input2) - cpu_output = cpu_output.astype(np.float16) - self.assertEqual(cpu_output, npu_output) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) def test_ne_out_shape_format_fp32(self, device): dtype_list = [np.float32] @@ -82,8 +81,8 @@ class 
TestNe(TestCase): cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10) cpu_input2, npu_input2 = create_common_tensor(item[0], -10, 10) npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2) - npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertEqual(npu_output_out, npu_output) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + self.assertRtolEqual(cpu_output, npu_output_out) def test_ne_scalar_out_shape_format_fp32(self, device): dtype_list = [np.float32] @@ -95,14 +94,14 @@ class TestNe(TestCase): for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10) npu_output_out = self.npu_op_exec_out(npu_input1, 5) - npu_output = self.npu_op_exec(npu_input1, 5) - self.assertEqual(npu_output_out, npu_output) + cpu_output = self.cpu_op_exec(cpu_input1, 5) + self.assertRtolEqual(cpu_output, npu_output_out) def test_ne_mix_dtype(self, device): - npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) - npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) - cpu_output = self.cpu_op_exec(npu_input1, npu_input3) - npu_output = self.npu_op_exec(npu_input2, npu_input4) + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) self.assertRtolEqual(cpu_output, npu_output) instantiate_device_type_tests(TestNe, globals(), except_for="cpu") diff --git a/test/test_network_ops/test_nonzero.py b/test/test_network_ops/test_nonzero.py index 116c3dbb52..3973186d65 100644 --- a/test/test_network_ops/test_nonzero.py +++ b/test/test_network_ops/test_nonzero.py @@ -42,7 +42,6 @@ class TestNonzero(TestCase): [[i, j, k]] for i in dtype_list for j in format_list for k in shape_list ] for item in shape_format: - print(item) cpu_input, npu_input = create_common_tensor(item[0], 1, 100) cpu_output = self.cpu_op_exec(cpu_input) npu_output = self.npu_op_exec(npu_input) diff --git a/torch_npu/csrc/aten/ops/GeKernelNpu.cpp b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp index 65448746f6..6a71323f79 100644 --- a/torch_npu/csrc/aten/ops/GeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp @@ -90,17 +90,17 @@ at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, const at::Tensor& othe auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); return result; } -at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, Scalar other) { +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, at::Scalar other) { at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); at::Tensor result = OpPreparation::ApplyTensorWithFormat( formatCastOfSelf.sizes(), - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); ge_out_npu_nocheck(formatCastOfSelf, other, result); return result; diff --git a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp deleted file mode 100644 index 72d7984b0e..0000000000 --- a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2020, Huawei Technologies.All rights reserved. 
-// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "torch_npu/csrc/framework/utils/OpAdapter.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" -#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" - -namespace at_npu { -namespace native { - -at::Tensor& index_out_nocheck_npu( - const at::Tensor& self, - const at::Tensor& masksTensor, - const at::TensorList& allDefinedIndices, - at::Tensor& result) { - OpCommand cmd; - cmd.Name("Index") - .Input(self) - .Input(masksTensor); - for (int i = 0; i < allDefinedIndices.size(); i++) { - cmd.Input(allDefinedIndices[i]); - } - cmd.Output(result) - .Run(); - return result; -} - -at::Tensor NPUNativeFunctions::index(const at::Tensor& self, const torch::List<c10::optional<at::Tensor>>& orig) { - checkIndexTensorTypes(orig); - // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors - auto indices = expandTensors(self, orig); - at::Tensor formatCastOfSelf = self.npu_format_cast(ACL_FORMAT_ND); - - // calculate the output size - auto outputSize = index_npu_output_size(formatCastOfSelf, indices); - - // construct the output tensor of the NPU - at::Tensor result = OpPreparation::ApplyTensorWithFormat(formatCastOfSelf, outputSize, ACL_FORMAT_ND); - - // masks corresponds to indices. 0 indicates undefined tensor.
- SmallVector<int64_t, N_SIZE> masks; - std::vector<at::Tensor> allDefinedIndices; - for (int64_t i = 0; i < indices.size(); i++) { - if (indices[i].defined()) { - masks.emplace_back(1); - allDefinedIndices.emplace_back(indices[i]); - } else { - masks.emplace_back(0); - } - } - - at::Tensor masksTensor = CalcuOpUtil::copy_tensor_host_to_device( - from_blob(masks.data(), {masks.size()}, dtype(at::ScalarType::Long))); - - // calculate the output result of the NPU - index_out_nocheck_npu(formatCastOfSelf, masksTensor, allDefinedIndices, result); - - return result; -} - -} // namespace native -} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/LeKernelNpu.cpp b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp index dfd9c4d584..44404cf8f9 100644 --- a/torch_npu/csrc/aten/ops/LeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp @@ -77,7 +77,7 @@ at::Tensor NPUNativeFunctions::le(const at::Tensor& self, at::Scalar other) { at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); at::Tensor result = OpPreparation::ApplyTensorWithFormat( formatCastOfSelf.sizes(), - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); le_out_npu_nocheck(formatCastOfSelf, other, result); return result; @@ -90,7 +90,7 @@ at::Tensor NPUNativeFunctions::le(const at::Tensor& self, const at::Tensor& othe auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); diff --git a/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp b/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp deleted file mode 100644 index 847896ec3a..0000000000 --- a/torch_npu/csrc/aten/ops/MaskedFillKernelNpu.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2020 Huawei Technologies Co., Ltd -// Copyright (c) 2019, Facebook CORPORATION. -// All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
-#include "torch_npu/csrc/framework/utils/OpAdapter.h" -#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" - -namespace at_npu { -namespace native { - -at::Tensor& NPUNativeFunctions::masked_fill_out(const at::Tensor& self, const at::Tensor& mask, const at::Tensor& value, at::Tensor& result) { - OpPreparation::CheckOut( - {self}, - result, - self); - at::Tensor maskBool = mask; - int64_t dimOfSelf = self.dim(); - - /* Avoid the problem that the TBE operator does not support 0-dimensional tensor input */ - if (dimOfSelf == 0) { - self.unsqueeze_(0); - } - - if ((mask.dtype() != at::kBool)) { - maskBool = mask.to(at::kBool); - } - at::Tensor valueTensor = value; - if (value.dtype() != self.dtype()) { - valueTensor = valueTensor.to(self.dtype()); - } - - OpCommand cmd; - cmd.Name("MaskedFill") - .Input(self) - .Input(maskBool) - .Input(valueTensor) - .Output(result) - .Run(); - - if (dimOfSelf == 0) { - result.squeeze_(0); - } - - return result; -} - -at::Tensor& NPUNativeFunctions::masked_fill_out(const at::Tensor& self, const at::Tensor& mask, at::Scalar value, at::Tensor& result) { - OpPreparation::CheckOut( - {self}, - result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); - at::Tensor maskBool = mask; - int64_t dimOfSelf = self.dim(); - - /* Avoid the problem that the TBE operator does not support 0-dimensional tensor input */ - if (dimOfSelf == 0) { - self.unsqueeze_(0); - } - - if (!(mask.dtype() == at::kBool)) { - maskBool = mask.to(at::kBool); - } - - OpCommand cmd; - cmd.Name("MaskedFill") - .Input(self) - .Input(maskBool) - .Input(value, self.scalar_type()) - .Output(result) - .Run(); - - if (dimOfSelf == 0) { - result.squeeze_(0); - } - return result; -} - -at::Tensor& NPUNativeFunctions::masked_fill_(at::Tensor& self, const at::Tensor& mask, const at::Tensor& value) { - // OpPreparation::CheckMemory({self, mask, value}, {self}); - if (!NpuUtils::check_match(&self)) { - at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); - at::Tensor result = masked_fill_out(contiguousSelf, mask, value, contiguousSelf); - self.copy_(result); - } else { - masked_fill_out(self, mask, value, self); - } - return self; -} - -at::Tensor& NPUNativeFunctions::masked_fill_(at::Tensor& self, const at::Tensor& mask, at::Scalar value) { - if (!NpuUtils::check_match(&self)) { - at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); - at::Tensor result = masked_fill_out(contiguousSelf, mask, value, contiguousSelf); - self.copy_(result); - } else { - masked_fill_out(self, mask, value, self); - } - - return self; -} - -} // namespace native -} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/NeKernelNpu.cpp b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp index d26ae3c535..b95c86a50d 100644 --- a/torch_npu/csrc/aten/ops/NeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp @@ -62,7 +62,7 @@ at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, at::S return result; } -at::Tensor& NPUNativeFunctions::ne(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { +at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); auto outputSize = broadcast_ops_npu_output_size(self, other); @@ -70,8 +70,8 @@ at::Tensor& NPUNativeFunctions::ne(const at::Tensor& self, const 
at::Tensor& oth {self, other}, result, CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), - ScalarType::Bool, - IntArrayRef(outputSize)); + at::ScalarType::Bool, + at::IntArrayRef(outputSize)); ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther); return result; } @@ -82,7 +82,7 @@ at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, at::Scalar other, {self}, result, CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), - ScalarType::Bool, + at::ScalarType::Bool, formatCastOfSelf.sizes()); ne_out_npu_nocheck(result, formatCastOfSelf, other); return result; @@ -95,7 +95,7 @@ at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, const at::Tensor& othe auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); at::Tensor result = OpPreparation::ApplyTensor( outputSize, - formatCastOfSelf.options().dtype(kBool), + formatCastOfSelf.options().dtype(at::kBool), formatCastOfSelf); ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther); @@ -107,7 +107,7 @@ at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, at::Scalar other) { at::Tensor result = OpPreparation::ApplyTensor( formatCastOfSelf, - formatCastOfSelf.options().dtype(kBool)); + formatCastOfSelf.options().dtype(at::kBool)); ne_out_npu_nocheck(result, formatCastOfSelf, other); return result; diff --git a/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp index c2f86b65a6..640564e8dd 100644 --- a/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp @@ -37,7 +37,7 @@ at::Tensor& NPUNativeFunctions::nonzero_out(const at::Tensor& self, at::Tensor& {self}, result, CalcuOpUtil::get_tensor_npu_format(self), - ScalarType::Long, + at::ScalarType::Long, outputSize); OpPipeWithDefinedOut pipe; -- Gitee From ab7c6d45e0d434bdb61fec5ceeebffcba6d08fad Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 9 Feb 2022 19:52:30 +0800 Subject: [PATCH 3/4] ge, le, nonzero ut clean code --- test/test_network_ops/test_ge.py | 24 ++++++++++----------- test/test_network_ops/test_le.py | 30 +++++++++++++-------------- test/test_network_ops/test_nonzero.py | 8 +++---- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/test/test_network_ops/test_ge.py b/test/test_network_ops/test_ge.py index 1afd23a188..5a9011620d 100644 --- a/test/test_network_ops/test_ge.py +++ b/test/test_network_ops/test_ge.py @@ -21,8 +21,8 @@ from torch_npu.testing.util_test import create_common_tensor class TestGe(TestCase): - def generate_scalar(self, min, max): - scalar = np.random.uniform(min, max) + def generate_scalar(self, min1, max1): + scalar = np.random.uniform(min1, max1) return scalar def cpu_op_exec(self, input1, input2): @@ -59,8 +59,8 @@ class TestGe(TestCase): output = output.numpy() return output - def cpu_op_exec_scalar(self, input, scalar): - output = torch.ge(input, scalar) + def cpu_op_exec_scalar(self, input1, scalar): + output = torch.ge(input1, scalar) output = output.numpy() return output @@ -69,8 +69,8 @@ class TestGe(TestCase): output = input2.numpy() return output - def npu_op_exec_scalar(self, input, scalar): - output = torch.ge(input, scalar) + def npu_op_exec_scalar(self, input1, scalar): + output = torch.ge(input1, scalar) output = output.to("cpu") output = output.numpy() return output @@ -81,13 +81,13 @@ class TestGe(TestCase): output = output.numpy() return output - def cpu_op_inplace_exec_scalar(self, input, scalar): - output = input.ge_(scalar) + def cpu_op_inplace_exec_scalar(self, input1, scalar): 
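+ # the in-place ge_ writes the boolean result back into input1 and returns it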
+ output = input1.ge_(scalar) output = output.numpy() return output - def npu_op_inplace_exec_scalar(self, input, scalar): - output = input.ge_(scalar) + def npu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.ge_(scalar) output = output.to("cpu") output = output.numpy() return output @@ -96,7 +96,7 @@ class TestGe(TestCase): for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) - cpu_input3 = torch.randn(item[1][2])<0 + cpu_input3 = torch.randn(item[1][2]) < 0 npu_input3 = cpu_input3.npu() if cpu_input1.dtype == torch.float16: cpu_input1 = cpu_input1.to(torch.float32) @@ -123,7 +123,7 @@ class TestGe(TestCase): def ge_scalar_out_result(self, shape_format): for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) - cpu_input2 = torch.randn(item[1][2])<0 + cpu_input2 = torch.randn(item[1][2]) < 0 npu_input2 = cpu_input2.npu() if cpu_input1.dtype == torch.float16: cpu_input1 = cpu_input1.to(torch.float32) diff --git a/test/test_network_ops/test_le.py b/test/test_network_ops/test_le.py index 5bdb1617b7..5b7d933b5b 100644 --- a/test/test_network_ops/test_le.py +++ b/test/test_network_ops/test_le.py @@ -21,8 +21,8 @@ from torch_npu.testing.common_device_type import instantiate_device_type_tests from torch_npu.testing.util_test import create_common_tensor class TestLe(TestCase): - def generate_scalar(self, min, max): - scalar = np.random.uniform(min, max) + def generate_scalar(self, min1, max1): + scalar = np.random.uniform(min1, max1) return scalar def cpu_op_exec(self, input1, input2): @@ -59,8 +59,8 @@ class TestLe(TestCase): output = output.numpy() return output - def cpu_op_exec_scalar(self, input, scalar): - output = torch.le(input, scalar) + def cpu_op_exec_scalar(self, input1, scalar): + output = torch.le(input1, scalar) output = output.numpy() return output @@ -69,26 +69,26 @@ class TestLe(TestCase): output = input2.numpy() return output - def npu_op_exec_scalar(self, input, scalar): - output = torch.le(input, scalar) + def npu_op_exec_scalar(self, input1, scalar): + output = torch.le(input1, scalar) output = output.to("cpu") output = output.numpy() return output - def cpu_op_inplace_exec_scalar(self, input, scalar): - output = input.le_(scalar) + def cpu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.le_(scalar) output = output.numpy() return output - def npu_op_inplace_exec_scalar(self, input, scalar): - input = input.to("npu") - output = input.le_(scalar) + def npu_op_inplace_exec_scalar(self, input1, scalar): + input1 = input1.to("npu") + output = input1.le_(scalar) output = output.to("cpu") output = output.numpy() return output - def npu_op_exec_scalar_out(self, input, scalar, output): - torch.le(input, scalar, out=output) + def npu_op_exec_scalar_out(self, input1, scalar, output): + torch.le(input1, scalar, out=output) output = output.to("cpu") output = output.numpy() return output @@ -128,7 +128,7 @@ class TestLe(TestCase): for item in shape_format: cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) - cpu_input3 = torch.randn(item[1][2])<0 + cpu_input3 = torch.randn(item[1][2]) < 0 npu_input3 = cpu_input3.npu() if cpu_input1.dtype == torch.float16: cpu_input1 = cpu_input1.to(torch.float32) @@ -157,7 +157,7 @@ class TestLe(TestCase): def le_scalar_out_result(self, shape_format): for item in shape_format: cpu_input1, npu_input1 = 
create_common_tensor(item[0], -100, 100) - cpu_input2 = torch.randn(item[1][2])<0 + cpu_input2 = torch.randn(item[1][2]) < 0 npu_input2 = cpu_input2.npu() if cpu_input1.dtype == torch.float16: cpu_input1 = cpu_input1.to(torch.float32) diff --git a/test/test_network_ops/test_nonzero.py b/test/test_network_ops/test_nonzero.py index 3973186d65..84c204a86f 100644 --- a/test/test_network_ops/test_nonzero.py +++ b/test/test_network_ops/test_nonzero.py @@ -22,13 +22,13 @@ from torch_npu.testing.common_device_type import instantiate_device_type_tests from torch_npu.testing.util_test import create_common_tensor class TestNonzero(TestCase): - def cpu_op_exec(self, input): - output = torch.nonzero(input) + def cpu_op_exec(self, input1): + output = torch.nonzero(input1) output = output.numpy().astype(np.int32) return output - def npu_op_exec(self, input): - output = torch.nonzero(input) + def npu_op_exec(self, input1): + output = torch.nonzero(input1) output = output.to("cpu") output = output.numpy().astype(np.int32) return output -- Gitee From 343d35ff466da2bcf2dccfcc26c9fcff0b37491f Mon Sep 17 00:00:00 2001 From: wangxiao Date: Thu, 10 Feb 2022 10:11:05 +0800 Subject: [PATCH 4/4] uniform_ rm redundant note --- torch_npu/csrc/aten/ops/UniformKernelNpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp index 3dd2834a35..71a12cb5aa 100644 --- a/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp @@ -36,7 +36,7 @@ at::Tensor& uniform_out_npu( } at::Tensor& NPUNativeFunctions::uniform_(at::Tensor& self, double from, double to, c10::optional<at::Generator> gen_) { - // TODO(Ascend): The operator needs to use fp32 for calculation. + // The operator needs to use fp32 for calculation. at::Tensor selfCopy = self; if (self.scalar_type() == at::ScalarType::Half) { selfCopy = self.to(at::ScalarType::Float); -- Gitee
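A minimal usage sketch of the fp16 path that the uniform_ comment above describes (not part of the patch series; it assumes an Ascend device and an installed torch_npu wheel, and the shape and bounds are illustrative):

    import torch
    import torch_npu  # registers the "npu" device and the NPU uniform_ kernel

    # Half-precision storage: the NPU kernel upcasts to fp32, samples, then
    # casts the result back to fp16, so the Python call site is unchanged.
    x = torch.empty(8, 8, dtype=torch.float16).npu()
    x.uniform_(0.0, 1.0)

    assert x.dtype == torch.float16
    # Samples come from [0, 1); fp32 -> fp16 rounding may land exactly on 1.0.
    assert 0.0 <= float(x.min()) and float(x.max()) <= 1.0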