diff --git a/test/test_network_ops/test_ge.py b/test/test_network_ops/test_ge.py new file mode 100644 index 0000000000000000000000000000000000000000..5a9011620d456096f85442c0633a1e513dc2132e --- /dev/null +++ b/test/test_network_ops/test_ge.py @@ -0,0 +1,299 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestGe(TestCase): + def generate_scalar(self, min1, max1): + scalar = np.random.uniform(min1, max1) + return scalar + + def cpu_op_exec(self, input1, input2): + output = torch.ge(input1, input2) + output = output.numpy() + return output + + def cpu_op_exec_out(self, input1, input2, input3): + torch.ge(input1, input2, out = input3) + output = input3.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.ge(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, input3): + torch.ge(input1, input2, out = input3) + output = input3.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec(self, input1, input2): + output = input1.ge_(input2) + output = input1 + output = output.numpy() + return output + + def npu_op_inplace_exec(self, input1, input2): + output = input1.ge_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_scalar(self, input1, scalar): + output = torch.ge(input1, scalar) + output = output.numpy() + return output + + def cpu_op_exec_scalar_out(self, input1, scalar, input2): + torch.ge(input1, scalar, out = input2) + output = input2.numpy() + return output + + def npu_op_exec_scalar(self, input1, scalar): + output = torch.ge(input1, scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar_out(self, input1, scalar, input2): + torch.ge(input1, scalar, out = input2) + output = input2.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.ge_(scalar) + output = output.numpy() + return output + + def npu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.ge_(scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def ge_tensor_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) + cpu_input3 = torch.randn(item[1][2]) < 0 + npu_input3 = cpu_input3.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + if cpu_input3.dtype == torch.float16: + cpu_input3 = 
cpu_input3.to(torch.float32) + cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) + cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_ge_tensor_out(self, device): + shape_format = [ + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [2, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [128, 232, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.ge_tensor_out_result(shape_format) + + def ge_scalar_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2 = torch.randn(item[1][2]) < 0 + npu_input2 = cpu_input2.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2) + npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2) + cpu_output_out = cpu_output_out.astype(npu_output_out.dtype) + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_ge_scalar_out(self, device): + shape_format = [ + [[np.float16, 0, [4, 4, 128, 128]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [12, 10, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [16, 3, 1111, 1212]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [16, 16, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [1313, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.ge_scalar_out_result(shape_format) + + def test_ge_bool(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + scalar_list = [True, False] + shape_format = [ + [[np.int32, i, j], k] for i in format_list for j in shape_list + for k in scalar_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100) + cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1]) + npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1]) + cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50) + npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50) + + self.assertRtolEqual(cpu_output1, npu_output1) + self.assertRtolEqual(cpu_output2, npu_output2) + + def test_ge_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for 
j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_scalar_int32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.int32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_tensor_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_tensor_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_inplace_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_inplace_float16(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_inplace_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, 
scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_inplace_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ge_mix_dtype(self, device): + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestGe, globals(), except_for="cpu") +if __name__ == '__main__': + run_tests() diff --git a/test/test_network_ops/test_le.py b/test/test_network_ops/test_le.py new file mode 100644 index 0000000000000000000000000000000000000000..5b7d933b5bf781347c066bf902dc171fe144e2eb --- /dev/null +++ b/test/test_network_ops/test_le.py @@ -0,0 +1,318 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestLe(TestCase): + def generate_scalar(self, min1, max1): + scalar = np.random.uniform(min1, max1) + return scalar + + def cpu_op_exec(self, input1, input2): + output = torch.le(input1, input2) + output = output.numpy() + return output + + def cpu_op_exec_out(self, input1, input2, input3): + torch.le(input1, input2, out = input3) + output = input3.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.le(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec(self, input1, input2): + output = input1.le_(input2) + output = input1 + output = output.numpy() + return output + + def npu_op_inplace_exec(self, input1, input2): + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, output): + torch.le(input1, input2, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_scalar(self, input1, scalar): + output = torch.le(input1, scalar) + output = output.numpy() + return output + + def cpu_op_exec_scalar_out(self, input1, scalar, input2): + torch.le(input1, scalar, out = input2) + output = input2.numpy() + return output + + def npu_op_exec_scalar(self, input1, scalar): + output = torch.le(input1, scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec_scalar(self, input1, scalar): + output = input1.le_(scalar) + output = output.numpy() + return output + + def npu_op_inplace_exec_scalar(self, input1, scalar): + input1 = input1.to("npu") + output = input1.le_(scalar) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar_out(self, input1, scalar, output): + torch.le(input1, scalar, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_stride_exec(self, input1, input2): + input1 = input1.as_strided([2, 2], [1, 2], 1) + input2 = input2.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.numpy() + return output + + def npu_op_inplace_stride_exec(self, input1, input2): + input1 = input1.to("npu") + input2 = input2.to("npu") + input1 = input1.as_strided([2, 2], [1, 2], 1) + input2 = input2.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_stride_scalar_exec(self, input1, input2): + input1 = input1.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.numpy() + return output + + def npu_op_inplace_stride_scalar_exec(self, input1, input2): + input1 = input1.to("npu") + input1 = input1.as_strided([2, 2], [1, 2], 1) + output = input1.le_(input2) + output = output.to("cpu") + output = output.numpy() + return output + + def le_tensor_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100) + cpu_input3 = torch.randn(item[1][2]) < 0 + npu_input3 = cpu_input3.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + 
cpu_input2 = cpu_input2.to(torch.float32) + if cpu_input3.dtype == torch.float16: + cpu_input3 = cpu_input3.to(torch.float32) + cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) + if cpu_input1.dtype == torch.float16: + cpu_output_out = cpu_output_out.astype(np.float16) + + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_le_tensor_out(self, device): + shape_format = [ + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 0, [2, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [128, 232, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.le_tensor_out_result(shape_format) + + def le_scalar_out_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100) + cpu_input2 = torch.randn(item[1][2]) < 0 + npu_input2 = cpu_input2.npu() + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2) + npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2) + if cpu_input1.dtype == torch.float16: + cpu_output_out = cpu_output_out.astype(np.float16) + self.assertRtolEqual(cpu_output_out, npu_output_out) + + def test_le_scalar_out(self, device): + shape_format = [ + [[np.float16, 0, [12, 4, 12, 121]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 0, [12, 10, 14, 111]], [np.float16, 0, [256, 116, 1, 1]]], + [[np.float16, 2, [16, 3, 11, 121, 21]], [np.float16, 0, [3, 3, 3]]], + [[np.float16, 0, [16, 16, 14]], [np.float16, 0, [128, 116, 14, 14]]], + [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]], + [[np.float32, 2, [1313, 3, 3, 3, 121]], [np.float32, 0, [3, 1, 3]]], + [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]], + ] + self.le_scalar_out_result(shape_format) + + def test_le_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_scalar_int32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.int32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input 
= cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_exec_scalar(npu_input, scalar) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_tensor_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_tensor_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_inplace_float32(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float32, i, j], [np.float32, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_inplace_float16(self, device): + format_list = [0, 3] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [[[np.float16, i, j], [np.float16, i, j]] + for i in format_list for j in shape_list] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_inplace_scalar_float32(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + scalar = self.generate_scalar(0, 100) + scalar1 = copy.deepcopy(scalar) + ncpu_input = copy.deepcopy(cpu_input) + cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_inplace_scalar_float16(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 0, 100) + cpu_input = cpu_input.to(torch.float32) + scalar = self.generate_scalar(0, 100) + cpu_output = 
self.cpu_op_inplace_exec_scalar(cpu_input, scalar) + npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar) + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + def test_le_mix_dtype(self, device): + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestLe, globals(), except_for="cpu") +if __name__ == '__main__': + run_tests() diff --git a/test/test_network_ops/test_ne.py b/test/test_network_ops/test_ne.py new file mode 100644 index 0000000000000000000000000000000000000000..a403a208d6c46a77aa8e66ff93fe5a254fbd9c28 --- /dev/null +++ b/test/test_network_ops/test_ne.py @@ -0,0 +1,109 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestNe(TestCase): + def cpu_op_exec(self, input1, input2): + output = torch.ne(input1, input2) + output = output.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.ne(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2): + input3 = torch.empty(0).bool().npu() + torch.ne(input1, input2, out=input3) + output = input3.to("cpu") + output = output.numpy() + return output + + def test_ne_shape_format_fp32(self, device): + dtype_list = [np.float32] + format_list = [0, 3] + shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] + shape_format = [ + [d, i, j] for d in dtype_list for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) + cpu_input2, npu_input2 = create_common_tensor(item, 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ne_shape_format_fp16(self, device): + dtype_list = [np.float16] + format_list = [0, 3] + shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] + shape_format = [ + [d, i, j] for d in dtype_list for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 1, 100) + cpu_input2, npu_input2 = create_common_tensor(item, 1, 100) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input1.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = 
self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_ne_out_shape_format_fp32(self, device): + dtype_list = [np.float32] + format_list = [0] + shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] + shape_format = [ + [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10) + cpu_input2, npu_input2 = create_common_tensor(item[0], -10, 10) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + self.assertRtolEqual(cpu_output, npu_output_out) + + def test_ne_scalar_out_shape_format_fp32(self, device): + dtype_list = [np.float32] + format_list = [0] + shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]] + shape_format = [ + [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10) + npu_output_out = self.npu_op_exec_out(npu_input1, 5) + cpu_output = self.cpu_op_exec(cpu_input1, 5) + self.assertRtolEqual(cpu_output, npu_output_out) + + def test_ne_mix_dtype(self, device): + cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100) + cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestNe, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_nonzero.py b/test/test_network_ops/test_nonzero.py new file mode 100644 index 0000000000000000000000000000000000000000..84c204a86fc4b2c4774be98ab9d5a14602e32d6a --- /dev/null +++ b/test/test_network_ops/test_nonzero.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestNonzero(TestCase): + def cpu_op_exec(self, input1): + output = torch.nonzero(input1) + output = output.numpy().astype(np.int32) + return output + + def npu_op_exec(self, input1): + output = torch.nonzero(input1) + output = output.to("cpu") + output = output.numpy().astype(np.int32) + return output + + def test_nonzero_shape_format(self, device): + dtype_list = [np.float32, np.float16, np.int32, np.int64] + format_list = [0] + shape_list = [[256,10], [256,256,100],[5,256,256,100]] + + shape_format = [ + [[i, j, k]] for i in dtype_list for j in format_list for k in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 1, 100) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_exec(npu_input) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestNonzero, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_rsub.py b/test/test_network_ops/test_rsub.py new file mode 100644 index 0000000000000000000000000000000000000000..9b2167d78f65e871adda07d8e4daedb14cb8d53f --- /dev/null +++ b/test/test_network_ops/test_rsub.py @@ -0,0 +1,171 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestRsub(TestCase): + def cpu_op_exec(self, input1, input2): + output = input2 - input1 + output = output.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = input2 - input1 + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_scalar(self, input1, input2): + output = input1 - input2 + output = output.to("cpu") + output = output.numpy() + output = -output + return output + + def rsub_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def rsub_scalar_result(self, shape_format): + for item in shape_format: + scalar = np.random.uniform(0, 100) + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, scalar) + npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar) + + cpu_output = cpu_output.astype(npu_output_scalar.dtype) + self.assertRtolEqual(cpu_output, npu_output_scalar) + + def test_sub_shape_format_fp16_1d(self, device): + format_list = [-1, 0, 3] + shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp32_1d(self, device): + format_list = [-1, 0, 3] + shape_format = [[[np.float32, i, [32]], [np.float32, i, [32]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp16_2d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp32_2d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float32, i, [5, 3]], [np.float32, i, [5, 3]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp16_3d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp32_3d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float32, i, [256, 480, 14]], [np.float32, i, [256, 480, 14]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp16_4d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_fp32_4d(self, device): + format_list = [-1, 0, 3, 29] + shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, [32, 3, 3, 3]]] for i in format_list] + self.rsub_result(shape_format) + + # int------------------------------------------------------------------------------- + def
test_sub_shape_format_int32_1d(self, device): + format_list = [-1, 0] + shape_format = [[[np.int32, i, [32]], [np.int32, i, [32]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_int32_2d(self, device): + format_list = [-1, 0] + shape_format = [[[np.int32, i, [5, 3]], [np.int32, i, [5, 3]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_int32_3d(self, device): + format_list = [-1, 0] + shape_format = [[[np.int32, i, [256, 480, 14]], [np.int32, i, [256, 480, 14]]] for i in format_list] + self.rsub_result(shape_format) + + def test_sub_shape_format_int32_4d(self, device): + format_list = [-1, 0] + shape_format = [[[np.int32, i, [32, 3, 3, 3]], [np.int32, i, [32, 3, 3, 3]]] for i in format_list] + self.rsub_result(shape_format) + + # scalar---------------------------------------------------------------------------- + def test_sub_scalar_shape_format_fp16_1d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float16, i, [32]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp32_1d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float32, i, [32]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp16_2d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float16, i, [32, 64]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp32_2d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float32, i, [32, 64]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp16_3d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp32_3d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float32, i, [32, 64, 128]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp16_4d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + def test_sub_scalar_shape_format_fp32_4d(self, device): + format_list = [-1, 0] + shape_format = [[[np.float32, i, [32, 64, 128, 28]]] for i in format_list] + self.rsub_scalar_result(shape_format) + + +instantiate_device_type_tests(TestRsub, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_tanh.py b/test/test_network_ops/test_tanh.py new file mode 100644 index 0000000000000000000000000000000000000000..28c8c0789fcc21e214d57dcce013778c6d60fe92 --- /dev/null +++ b/test/test_network_ops/test_tanh.py @@ -0,0 +1,139 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestTanh(TestCase): + def cpu_op_exec(self, input1): + output = torch.tanh(input1) + output = output.numpy() + return output + + def npu_op_exec(self, input1): + input1 = input1.to("npu") + output = torch.tanh(input1) + output = output.to("cpu") + output = output.numpy() + return output + + def test_tanh_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (4, 3, 3)], 1, 100], + [[np.float32, -1, (7,5,5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746], + [[np.float32, -1, (1024, 448, 448)], 200, 300], + [[np.float32, -1, (128, 3, 5)], 0.3219780311757745 , 92 ], + [[np.float32, -1, (8, 7, 7)], 0.4820305734500543 , 28], + [[np.float32, -1, (15, 8, 8)],0.8563874665918477 , 98], + [[np.float32, -1, (11, 6, 6)], 0.0694198357720135 , 50], + [[np.float32, -1, (24, 24, 3)], -2,-2], + [[np.float32, -1, (6, 10, 10)], 0.6447298684351989 , 95], + [[np.float32, -1, (3, 9, 9)], 0.8723538084975545 , 85], + [[np.float32, -1, (5, 5, 5)], 0.8283759153463854 , 71], + [[np.float32, -1, (5, 1, 1)], 0.24718684227306953 , 25], + [[np.float32, -1, (14, 7, 7)], 0.3989186243492233 , 7 ], + [[np.float32, -1, (4, 10, 10)], 0.7866457165672994 , 5], + [[np.float32, -1, (3, 7, 7)], 0.3793216987112159 , 39], + [[np.float32, -1, (2, 8, 8)], 0.9662927186969077 , 5 ], + [[np.float32, -1, (3, 7, 7)], 0.9956475043306917 , 28], + [[np.float32, -1, (7, 10, 10)], 0.769565434387681 , 9], + [[np.float32, -1, (54, 93, 3)],0.6447298684351989 , 95], + [[np.float32, -1, (6, 3, 3)], 0.03133650248813469 , 37 ], + [[np.float32, -1, (65500, 1, 1)], 95, 100], + [[np.float32, -1, (6, 3, 10)], 0.03133650248813469 , 37], + + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1): + input1 = input1.to(torch.float32) + output = torch.tanh(input1) + output = output.numpy() + output = output.astype(np.float16) + return output + + shape_format = [ + [[np.float16, -1, (65500, 1)], 212,225], + [[np.float16, -1, (1024,448,448)], 200, 300], + [[np.float16, -1, (16,16)], -1000, -100], + [[np.float16, -1, (4,1)], -1.1754943508e-38,-1.1754943508e-38], + [[np.float16, -1, (7, 5, 5)], 21474836,21474837], + [[np.float16, -1, (4, 44, 44)], 3450,34020], + [[np.float16, -1, (65500, 3, 3)], -214748,-214746], + [[np.float16, -1, (64, 4, 4)], -9.313225746154785e-10,9.313225746154785e-10], + [[np.float16, -1, (128, 3, 5)], -0.000000000000000000000000000000000000011754943508,0.000000000000000000000000000000000000011754943508], + [[np.float16, -1, (1, 1, 1)], 0.9283381566708346 , 16], + [[np.float16, -1, (6, 3, 10)], 0.03133650248813469 , 37], + [[np.float16, -1, (65500, 1, 1)], 95, 100 ], + [[np.float16, -1, (13, 5, 5)], 0.9790231845699171 , 41], + [[np.float16, -1, (5, 7, 7)], 0.7852605507867441 , 87 ], + [[np.float16, -1, (13, 2, 2)],0.8758750778305631 , 82], + [[np.float16, -1, (14, 6, 6)],0.6963691068720794 , 92], + [[np.float16, -1, (5, 6, 6)], 0.7570129172808612 , 21], + [[np.float16, -1, (1, 10, 10)], 0.990800730328874 , 86], + 
[[np.float16, -1, (4, 5, 5)], 0.7349293532899402 , 35], + [[np.float16, -1, (6, 4, 4)], 0.7349293532899402, 35], + [[np.float16, -1, (5, 8, 8)],0.9583309378850908 , 60], + + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_exec_fp16(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_inplace_common_shape_format(self, device): + def cpu_op_inplace_exec(input1): + output = torch.tanh_(input1) + output = output.numpy() + return output + + def npu_op_inplace_exec(input1): + input1 = input1.to("npu") + output = torch.tanh_(input1) + output = output.to("cpu") + output = output.numpy() + return output + + shape_format = [ + [[np.float32, -1, (4, 3, 3)], 1, 100], + [[np.float32, -1, (7,5,5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746] + + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_inplace_exec(cpu_input1) + npu_output = npu_op_inplace_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestTanh, globals(), except_for='cpu') + +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_tanh_backward.py b/test/test_network_ops/test_tanh_backward.py new file mode 100644 index 0000000000000000000000000000000000000000..1e108d1960a8eaad461210ea4b5fb355935edc11 --- /dev/null +++ b/test/test_network_ops/test_tanh_backward.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + + +class TestTanhBackward(TestCase): + + def cpu_op_exec(self, input1): + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad.numpy() + return output + + def npu_op_exec(self, input1): + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad + output = output.to("cpu") + output = output.numpy() + return output + + def test_tanh_backward_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (4, 3)], 1, 100], + [[np.float32, -1, (7, 5, 5)], 21474836,21474837], + [[np.float32, -1, (4, 44, 44)], 3450,34020], + [[np.float32, -1, (65500,3,3)], -214748,-214746], + [[np.float32, -1, (1024, 448, 448)], 200, 300], + [[np.float32, -1, (24, 24, 3)], -2,-2], + [[np.float32, -1, (3, 7, 7)], 0.3793216987112159, 1], + [[np.float32, -1, (2, 8, 8)], 0.9662927186969077, 1], + [[np.float32, -1, (3, 7, 7)], 0.9956475043306917, 2], + [[np.float32, -1, (7, 10, 10)], 0.769565434387681, 3], + [[np.float32, -1, (65500, 1, 1)], 95, 100], + [[np.float32, -1, (6, 3, 10)], 0.03133650248813469 , 2], + [[np.float32, -1, (4, 3, 3, 3, 3, 3, 3, 3)], 0, 1], + [[np.float32, -1, (5,)], 0, 1], + [[np.float32, -1, (5,5,5,5,5,5)], 1, 2], + [[np.float32, -1, (5,5,5,5,5,5)], 2, 3], + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_tanh_backward_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1): + input1 = input1.to(torch.float32) + input1.requires_grad = True + input1_tanh = torch.tanh(input1) + input1_tanh.backward(torch.ones_like(input1_tanh)) + output = input1.grad.numpy() + output = output.astype(np.float16) + return output + + shape_format = [ + [[np.float16, -1, (65500, 1)], 212, 225], + [[np.float16, -1, (1024, 448, 448)], 200, 300], + [[np.float16, -1, (16, 16)], -1000, -100], + [[np.float16, -1, (4, 1)], -1.1754943508e-38, -1.1754943508e-38], + [[np.float16, -1, (7, 5, 5)], 21474836, 21474837], + [[np.float16, -1, (4, 44, 44)], 3450, 34020], + [[np.float16, -1, (65500, 3, 3)], -214748, -214746], + [[np.float16, -1, (64, 4, 4)], -9.313225746154785e-10,9.313225746154785e-10], + [[np.float16, -1, (128, 3, 5)], + -0.000000000000000000000000000000000000011754943508, + 0.000000000000000000000000000000000000011754943508], + [[np.float16, -1, (65500, 1, 1)], 95, 100], + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[1],item[2]) + cpu_output = cpu_op_exec_fp16(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestTanhBackward, globals(), except_for='cpu') + +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_uniform_.py b/test/test_network_ops/test_uniform_.py new file mode 100644 index 0000000000000000000000000000000000000000..893adf140e34b82bb03b8732ecf7c9becf3224e4 --- /dev/null +++ b/test/test_network_ops/test_uniform_.py @@ -0,0 +1,50 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestUniform(TestCase): + def test_uniform(self, device): + shape_format = [ + [(20,300), -100, 100, torch.float32], + [(20,300), -100, 100, torch.float16] + ] + + for item in shape_format: + input1 = torch.zeros(item[0], dtype=item[3]).npu() + input1.uniform_(item[1], item[2]) + self.assertTrue(item[1] <= input1.min()) + self.assertTrue(item[2] >= input1.max()) + + def test_uniform_trans(self, device): + shape_format = [ + [(20,300), -100, 100, torch.float32], + ] + + for item in shape_format: + input1 = torch.zeros(item[0], dtype=item[3]).npu() + input1 = input1.npu_format_cast(3) + input1.uniform_(item[1], item[2]) + self.assertTrue(item[1] <= input1.min()) + self.assertTrue(item[2] >= input1.max()) + + +instantiate_device_type_tests(TestUniform, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/csrc/aten/ops/GeKernelNpu.cpp b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a71323f79b633929cc7f021e2c08d83975b12ea --- /dev/null +++ b/torch_npu/csrc/aten/ops/GeKernelNpu.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& ge_out_npu_nocheck(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor selfCast = self; + at::Tensor otherCast = other; + if (self.dtype() == at::ScalarType::Int || other.dtype() == at::ScalarType::Int + || self.dtype() == at::ScalarType::Bool || other.dtype() == at::ScalarType::Bool) { + selfCast = self.to(at::ScalarType::Float); + otherCast = other.to(at::ScalarType::Float); + } + auto unified_result = OpPreparation::comparison_op_check(result, selfCast, otherCast, true); + OpCommand cmd; + cmd.Name("GreaterEqual") + .Expect(unified_result) + .Input(selfCast) + .Input(otherCast) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ge_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor& ge_out_npu_nocheck(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor selfCast = self; + if (self.dtype() == at::ScalarType::Int || self.dtype() == at::ScalarType::Bool) { + selfCast = self.to(at::ScalarType::Float); + } + OpCommand cmd; + cmd.Name("GreaterEqual") + .Input(selfCast) + .Input(other, selfCast.scalar_type()) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ge_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + auto outputSize = formatCastOfSelf.sizes(); + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + ge_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + outputSize, + formatCastOfSelf.options().dtype(at::kBool), + ACL_FORMAT_ND); + ge_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor NPUNativeFunctions::ge(const at::Tensor& self, at::Scalar other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + formatCastOfSelf.sizes(), + formatCastOfSelf.options().dtype(at::kBool), + ACL_FORMAT_ND); + ge_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor& NPUNativeFunctions::ge_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, ori_other}, {self}); + + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + + if (!NpuUtils::check_match(&self)) { + at::Tensor 
contiguousSelf = NpuUtils::format_contiguous(self); + ge_out_npu_nocheck(contiguousSelf, ori_other, result); + } else { + ge_out_npu_nocheck(self, ori_other, result); + } + self.copy_(result); + return self; +} + +at::Tensor& NPUNativeFunctions::ge_(at::Tensor& self, at::Scalar other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckMemory({self}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + ge_out_npu_nocheck(contiguousSelf, other, result); + } else { + ge_out_npu_nocheck(self, other, result); + } + self.copy_(result); + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/LeKernelNpu.cpp b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..44404cf8f9a63adec7aa1ec1abdc31e9b9ff2bad --- /dev/null +++ b/torch_npu/csrc/aten/ops/LeKernelNpu.cpp @@ -0,0 +1,134 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& le_out_npu_nocheck(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + OpCommand cmd; + cmd.Name("LessEqual") + .Input(self) + .Input(other, self.scalar_type()) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::le_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + auto outputSize = formatCastOfSelf.sizes(); + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + le_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor& le_out_npu_nocheck(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + auto unified_result = OpPreparation::comparison_op_check(result, self, other, true); + OpCommand cmd; + cmd.Name("LessEqual") + .Expect(unified_result) + .Input(self) + .Input(other) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::le_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + + OpPreparation::CheckOut( + {self}, + result, + ACL_FORMAT_ND, + result.scalar_type(), + outputSize); + + le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor NPUNativeFunctions::le(const at::Tensor& self, at::Scalar other) { + at::Tensor formatCastOfSelf = 
OpPreparation::CastBackToOriFormat(self); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + formatCastOfSelf.sizes(), + formatCastOfSelf.options().dtype(at::kBool), + ACL_FORMAT_ND); + le_out_npu_nocheck(formatCastOfSelf, other, result); + return result; +} + +at::Tensor NPUNativeFunctions::le(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + outputSize, + formatCastOfSelf.options().dtype(at::kBool), + ACL_FORMAT_ND); + + le_out_npu_nocheck(formatCastOfSelf, formatCastOfOther, result); + return result; +} + +at::Tensor& NPUNativeFunctions::le_(at::Tensor& self, at::Scalar other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckMemory({self}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + le_out_npu_nocheck(contiguousSelf, other, result); + } else { + le_out_npu_nocheck(self, other, result); + } + self.copy_(result); + return self; +} + +at::Tensor& NPUNativeFunctions::le_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + at::Tensor ori_other = OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, ori_other}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + le_out_npu_nocheck(contiguousSelf, ori_other, result); + } else { + le_out_npu_nocheck(self, ori_other, result); + } + self.copy_(result); + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/NeKernelNpu.cpp b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b95c86a50da19535dcfb20563efed18dbecc572c --- /dev/null +++ b/torch_npu/csrc/aten/ops/NeKernelNpu.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, const at::Tensor& other) { + at::Tensor selfCast = self; + at::Tensor otherCast = other; + if(self.dtype() == at::ScalarType::Int || other.dtype() == at::ScalarType::Int){ + selfCast = self.to(at::ScalarType::Float); + otherCast = other.to(at::ScalarType::Float); + } + auto unified_result = OpPreparation::comparison_op_check(result, selfCast, otherCast, true); + if(self.scalar_type() == at::kLong) { + TORCH_WARN_ONCE("The oprator of ne is executed, Currently High Accuracy but Low Performance OP with 64-bit has been used," + "Please Do Some Cast at Python Functions with 32-bit for Better Performance!"); + } + OpCommand cmd; + cmd.Name("NotEqual") + .Expect(unified_result) + .Input(selfCast) + .Input(otherCast) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& ne_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, at::Scalar other) { + at::Tensor selfCast = self; + if(self.dtype() == at::ScalarType::Int){ + selfCast = self.to(at::ScalarType::Float); + } + if(self.scalar_type() == at::kLong) { + TORCH_WARN_ONCE("The oprator of ne is executed, Currently High Accuracy but Low Performance OP with 64-bit has been used," + "Please Do Some Cast at Python Functions with 32-bit for Better Performance!"); + } + OpCommand cmd; + cmd.Name("NotEqual") + .Input(selfCast) + .Input(other, selfCast.scalar_type()) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + auto outputSize = broadcast_ops_npu_output_size(self, other); + OpPreparation::CheckOut( + {self, other}, + result, + CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + at::ScalarType::Bool, + at::IntArrayRef(outputSize)); + ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther); + return result; +} + +at::Tensor& NPUNativeFunctions::ne_out(const at::Tensor& self, at::Scalar other, at::Tensor& result) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckOut( + {self}, + result, + CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf), + at::ScalarType::Bool, + formatCastOfSelf.sizes()); + ne_out_npu_nocheck(result, formatCastOfSelf, other); + return result; +} + +at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, const at::Tensor& other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); + + auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); + at::Tensor result = OpPreparation::ApplyTensor( + outputSize, + formatCastOfSelf.options().dtype(at::kBool), + formatCastOfSelf); + + ne_out_npu_nocheck(result, formatCastOfSelf, formatCastOfOther); + return result; +} + +at::Tensor NPUNativeFunctions::ne(const at::Tensor& self, at::Scalar other) { + at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); + + at::Tensor result = OpPreparation::ApplyTensor( + formatCastOfSelf, + formatCastOfSelf.options().dtype(at::kBool)); + + ne_out_npu_nocheck(result, formatCastOfSelf, other); + return 
result; +} + +at::Tensor& NPUNativeFunctions::ne_(at::Tensor& self, const at::Tensor& other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CastBackToOriFormat(other); + OpPreparation::CheckMemory({self, other}, {self}); + + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + ne_out_npu_nocheck(result, contiguousSelf, other); + } else { + ne_out_npu_nocheck(result, self, other); + } + + self.copy_(result); + + return self; +} + +at::Tensor& NPUNativeFunctions::ne_(at::Tensor& self, at::Scalar other) { + OpPreparation::CastBackToOriFormat(self); + OpPreparation::CheckMemory({self}, {self}); + at::Tensor result = OpPreparation::ApplyTensor( + self, + self.options().dtype(at::ScalarType::Byte)); + + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + ne_out_npu_nocheck(result, contiguousSelf, other); + } else { + ne_out_npu_nocheck(result, self, other); + } + + self.copy_(result); + + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..640564e8ddf2f2a80ba9294cd567d98c67c03736 --- /dev/null +++ b/torch_npu/csrc/aten/ops/NonzeroKernelNpu.cpp @@ -0,0 +1,63 @@ +// Copyright (c) 2020, Huawei Technologies. +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& nonzero_out_npu_nocheck(at::Tensor& result, const at::Tensor& self) { + OpCommand cmd; + cmd.Name("NonZero") + .Input(self) + .Output(result) + .Attr("transpose", false) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::nonzero_out(const at::Tensor& self, at::Tensor& result) { + auto outputSize = nonzero_npu_output_size(self); + OpPreparation::CheckOut( + {self}, + result, + CalcuOpUtil::get_tensor_npu_format(self), + at::ScalarType::Long, + outputSize); + + OpPipeWithDefinedOut pipe; + return pipe.CheckMemory({self}, {result}) + .Func([&self](at::Tensor& result){nonzero_out_npu_nocheck(result, self);}) + .Call(result); +} + +at::Tensor NPUNativeFunctions::nonzero(const at::Tensor& self) { + // calculate the output size + auto outputSize = nonzero_npu_output_size(self); + + // construct the output tensor of the NPU + at::Tensor result = OpPreparation::ApplyTensor( + outputSize, self.options().dtype(at::kLong), self); + + // calculate the output result of the NPU + nonzero_out_npu_nocheck(result, self); + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp b/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4c8200b57732fab4147600e4be98cc0633b02dc3 --- /dev/null +++ b/torch_npu/csrc/aten/ops/RsubKernelNpu.cpp @@ -0,0 +1,96 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor rsub_dest_output(const at::Tensor& self, const at::Tensor& other) { + bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self); + + return isSelfWrapped ? 
other : self; +} + +at::Tensor& rsub_out_npu_nocheck( + at::Tensor& result, + const at::Tensor& self, + const at::Tensor& other, + at::Scalar alpha) { + // self * alpha (rsub computes other - self * alpha) + at::Tensor selfMulResult; + if (!CalcuOpUtil::is_scalar_one(alpha)) { + selfMulResult = at::mul(self, alpha); + } + + OpCommand cmd; + if (selfMulResult.defined()) { + cmd.Name("Sub") + .Input(other) + .Input(selfMulResult) + .Output(result) + .Run(); + } else { + cmd.Name("Sub") + .Input(other) + .Input(self) + .Output(result) + .Run(); + } + + return result; +} + +at::Tensor& rsub_out_npu_nocheck( + at::Tensor& result, + const at::Tensor& self, + at::Scalar other, + at::Scalar alpha) { + // self * alpha (rsub computes other - self * alpha) + at::Tensor scalarValue(at::mul(self, alpha)); + + OpCommand cmd; + cmd.Name("Sub") + .Input(other, self.scalar_type()) + .Input(scalarValue) + .Output(result) + .Run(); + + return result; +} + +at::Tensor NPUNativeFunctions::rsub(const at::Tensor& self, const at::Tensor& other, at::Scalar alpha) { + at::Tensor outputTensor = rsub_dest_output(self, other); + auto outputSize = broadcast_ops_npu_output_size(self, other); + + at::Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize); + + rsub_out_npu_nocheck(result, self, other, alpha); + + return result; +} + +at::Tensor NPUNativeFunctions::rsub(const at::Tensor& self, at::Scalar other, at::Scalar alpha) { + at::Tensor result = OpPreparation::ApplyTensor(self); + + rsub_out_npu_nocheck(result, self, other, alpha); + + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e0abc60d25c187d4c6a14b26980cababf730a453 --- /dev/null +++ b/torch_npu/csrc/aten/ops/TanhBackwardKernelNpu.cpp @@ -0,0 +1,54 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& tanh_backward_out_npu_nocheck( + at::Tensor& result, + const at::Tensor& grad_output, + const at::Tensor& self) { + OpCommand cmd; + cmd.Name("TanhGrad") + .Input(self) + .Input(grad_output) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::tanh_backward_out( + const at::Tensor& grad_output, + const at::Tensor& self, + at::Tensor& result) { + OpPreparation::CheckOut({grad_output, self}, result, self); + tanh_backward_out_npu_nocheck(result, grad_output, self); + return result; +} + +at::Tensor NPUNativeFunctions::tanh_backward(const at::Tensor& grad_output, const at::Tensor& self) { + at::Tensor result = OpPreparation::ApplyTensor(self); + tanh_backward_out_npu_nocheck(result, grad_output, self); + + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp b/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5ec2311735064899ec0674b1d59a82f62acd309f --- /dev/null +++ b/torch_npu/csrc/aten/ops/TanhKernelNpu.cpp @@ -0,0 +1,55 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& NPUNativeFunctions::tanh_out(const at::Tensor& self, at::Tensor& result) { + OpCommand cmd; + cmd.Name("Tanh") + .Input(self) + .Output(result) + .Run(); + + return result; +} + +at::Tensor NPUNativeFunctions::tanh(const at::Tensor& self) { + at::Tensor result = OpPreparation::ApplyTensor(self); + // calculate the output result of the NPU + NPUNativeFunctions::tanh_out(self, result); + + return result; +} + +at::Tensor& NPUNativeFunctions::tanh_(at::Tensor& self) { + OpPreparation::CheckMemory({self}, {self}); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + at::Tensor result = NPUNativeFunctions::tanh_out(contiguousSelf, contiguousSelf); + NpuUtils::format_fresh_view(self, result); + } else { + NPUNativeFunctions::tanh_out(self, self); + } + + return self; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..71a12cb5aaea916005229d4de6975d25b605c9c8 --- /dev/null +++ b/torch_npu/csrc/aten/ops/UniformKernelNpu.cpp @@ -0,0 +1,58 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& uniform_out_npu( + const at::Tensor& self, + double from, + double to, + c10::optional<at::Generator> gen_, + at::Tensor& result) { + OpCommand cmd; + cmd.Name("Uniform") + .Input(self) + .Output(result) + .Attr("from", static_cast<float>(from)) + .Attr("to", static_cast<float>(to)) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::uniform_(at::Tensor& self, double from, double to, c10::optional<at::Generator> gen_) { + // The operator needs to use fp32 for calculation. + at::Tensor selfCopy = self; + if (self.scalar_type() == at::ScalarType::Half) { + selfCopy = self.to(at::ScalarType::Float); + } + + if (!NpuUtils::check_match(&selfCopy)) { + at::Tensor selfContiguous = NpuUtils::format_contiguous(selfCopy); + at::Tensor result = uniform_out_npu(selfContiguous, from, to, gen_, selfContiguous); + NpuUtils::format_fresh_view(selfCopy, result); + } else { + uniform_out_npu(selfCopy, from, to, gen_, selfCopy); + } + self.copy_(selfCopy); + + return self; +} + +} // namespace native +} // namespace at_npu