From 22e4f868edf8b0f8b66c8a9309dc445df8feda51 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Sat, 26 Feb 2022 16:08:21 +0800 Subject: [PATCH 1/3] eq, floordivide, sigmoid, sigmoid_backward, stack, zero, zeros --- test/test_network_ops/test_eq.py | 111 ++++++++++++ test/test_network_ops/test_floordivide.py | 148 ++++++++++++++++ test/test_network_ops/test_sigmoid.py | 124 ++++++++++++++ .../test_network_ops/test_sigmoid_backward.py | 99 +++++++++++ test/test_network_ops/test_stack.py | 159 ++++++++++++++++++ test/test_network_ops/test_zero.py | 109 ++++++++++++ test/test_network_ops/test_zeros.py | 151 +++++++++++++++++ torch_npu/csrc/aten/ops/EqKernelNpu.cpp | 4 +- .../csrc/aten/ops/FloorDivideKernelNpu.cpp | 152 +++++++++++++++++ .../aten/ops/SigmoidBackwardKernelNpu.cpp | 61 +++++++ torch_npu/csrc/aten/ops/SigmoidKernelNpu.cpp | 57 +++++++ torch_npu/csrc/aten/ops/StackKernelNpu.cpp | 85 ++++++++++ torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp | 52 ++++++ 13 files changed, 1310 insertions(+), 2 deletions(-) create mode 100644 test/test_network_ops/test_eq.py create mode 100644 test/test_network_ops/test_floordivide.py create mode 100644 test/test_network_ops/test_sigmoid.py create mode 100644 test/test_network_ops/test_sigmoid_backward.py create mode 100644 test/test_network_ops/test_stack.py create mode 100644 test/test_network_ops/test_zero.py create mode 100644 test/test_network_ops/test_zeros.py create mode 100644 torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/SigmoidBackwardKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/SigmoidKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/StackKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp diff --git a/test/test_network_ops/test_eq.py b/test/test_network_ops/test_eq.py new file mode 100644 index 00000000000..771c9623351 --- /dev/null +++ b/test/test_network_ops/test_eq.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestEqual(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.eq(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.eq(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        input3 = torch.empty(0).bool().npu()
+        torch.eq(input1, input2, out=input3)
+        output = input3.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_equal_shape_format_fp32(self, device):
+        dtype_list = [np.float32]
+        format_list = [0, 3]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [d, i, j] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_equal_shape_format_fp16(self, device):
+        dtype_list = [np.float16]
+        format_list = [0, 3]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [d, i, j] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_equal_out_shape_format_fp32(self, device):
+        dtype_list = [np.float32]
+        format_list = [0]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -10, 10)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def test_equal_scalar_out_shape_format_fp32(self, device):
+        dtype_list = [np.float32]
+        format_list = [0]
+        shape_list = [[1024], [8, 128], [2, 8, 128], [2, 8, 128, 512]]
+        shape_format = [
+            [[d, i, j]] for d in dtype_list for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -10, 10)
+            npu_output_out = self.npu_op_exec_out(npu_input1, 5)
+            npu_output = self.npu_op_exec(npu_input1, 5)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def test_equal_mix_dtype(self, device):
+        cpu_input1, npu_input1 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
+        cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+
self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestEqual, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_floordivide.py b/test/test_network_ops/test_floordivide.py new file mode 100644 index 00000000000..e9b3086b014 --- /dev/null +++ b/test/test_network_ops/test_floordivide.py @@ -0,0 +1,148 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestFloorDivide(TestCase): + + def generate_data(self, min, max, shape, dtype): + input1 = np.random.uniform(min, max, shape).astype(dtype) + input2 = np.random.uniform(min, max, shape).astype(dtype) + + #modify from numpy.ndarray to torch.tensor + npu_input1 = torch.from_numpy(input1) + npu_input2 = torch.from_numpy(input2) + + return npu_input1, npu_input2 + + def generate_three_data(self, min, max, shape, dtype): + input1 = np.random.uniform(min, max, shape).astype(dtype) + input2 = np.random.uniform(min, max, shape).astype(dtype) + input3 = np.random.uniform(min, max, shape).astype(dtype) + + #modify from numpy.ndarray to torch.tensor + npu_input1 = torch.from_numpy(input1) + npu_input2 = torch.from_numpy(input2) + npu_input3 = torch.from_numpy(input3) + + return npu_input1, npu_input2, npu_input3 + + def cpu_op_exec(self, input1, input2): + output = torch.floor_divide(input1,input2) + output = output.numpy() + return output + + def npu_op_exec(self, input1, input2): + input1 = input1.to("npu") + input2 = input2.to("npu") + output = torch.floor_divide(input1,input2) + output = output.to("cpu") + output = output.numpy() + return output + + + def npu_op_exec_scalar(self, input1, input2): # + input1 = input1.to("npu") + output = torch.floor_divide(input1,input2) + output = output.to("cpu") + output = output.numpy() + return output + + + def npu_op_exec_out(self, input1, input2, input3): # + input1 = input1.to("npu") + input2 = input2.to("npu") + output = input3.to("npu") + torch.floor_divide(input1, input2, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + + def test_floor_divide_float32(self, device): + npu_input1, npu_input2 = self.generate_data(1, 100, (1, 2), np.float32) + cpu_output = self.cpu_op_exec(npu_input1, npu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + + def test_floor_divide_float32_out(self, device): + npu_input1, npu_input2, npu_input3 = self.generate_three_data(1, 100, (1,2), np.float32) + cpu_output = self.cpu_op_exec(npu_input1, npu_input2) + npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) + self.assertRtolEqual(cpu_output, npu_output) + + + def 
test_floor_divide_int32(self, device): + npu_input1, npu_input2 = self.generate_data(1, 100, (1,2), np.int32) + cpu_output = self.cpu_op_exec(npu_input1, npu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_floor_divide_int8(self, device): + npu_input1, npu_input2 = self.generate_data(1, 100, (1,2), np.int8) + cpu_output = self.cpu_op_exec(npu_input1, npu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_floor_divide_uint8(self, device): + npu_input1, npu_input2 = self.generate_data(1, 100, (1,3), np.uint8) + cpu_output = self.cpu_op_exec(npu_input1, npu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + self.assertRtolEqual(cpu_output, npu_output) + + def test_floor_divide_scalar_float32(self, device): + npu_input1, _= self.generate_data(1, 100, (1,3), np.float32) + cpu_output = self.cpu_op_exec(npu_input1, 1) + npu_output = self.npu_op_exec_scalar(npu_input1, 1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_floor_divide_scalar_bool(self, device): + npu_input1, _= self.generate_data(1, 10, (2, 5), np.float32) + cpu_output = self.cpu_op_exec(npu_input1 > 5, 1.0) + npu_output = self.npu_op_exec_scalar(npu_input1 > 5, 1.0) + self.assertRtolEqual(cpu_output, npu_output) + + def npu_uncontiguous_op_exec_scalar(self, input1, input2): # + input1 = input1.to("npu") + input1 = input1.as_strided([2,2], [1,2], 1) + output = torch.floor_divide(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_uncontiguous_op_exec_scalar(self, input1, input2): # + input1 = input1.as_strided([2,2], [1,2], 1) + output = torch.floor_divide(input1, input2) + output = output.numpy() + return output + + def test_floor_divide_uncontiguous_float32_scalar(self, device): + npu_input1, npu_input2 = self.generate_data(1, 100, (4,3), np.float32) + cpu_input1 = copy.deepcopy(npu_input1) + cpu_output = self.cpu_uncontiguous_op_exec_scalar(cpu_input1, 2) + npu_output = self.npu_uncontiguous_op_exec_scalar(npu_input1, 2) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestFloorDivide, globals(), except_for='cpu') +if __name__ == '__main__': + run_tests() + diff --git a/test/test_network_ops/test_sigmoid.py b/test/test_network_ops/test_sigmoid.py new file mode 100644 index 00000000000..e7714507950 --- /dev/null +++ b/test/test_network_ops/test_sigmoid.py @@ -0,0 +1,124 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append('..') +import torch +import numpy as np +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +class TestSigmoid(TestCase): + @dtypes(torch.float) + def test_sigmoid(self, device, dtype): + # TODO: why not simulate math.sigmoid like with rsqrt? 
+ inputValues = [-1000, -1, 0, 0.5, 1, 2, 1000] + expectedOutput = [0.0000, 0.2689, 0.5, 0.6225, 0.7311, 0.8808, 1.000] + precision_4dps = 0.0002 + + self.assertEqual( + torch.tensor( + inputValues, dtype=dtype, device=device).sigmoid().cpu(), + torch.tensor( + expectedOutput, + dtype=dtype, device=device).cpu(), precision_4dps) + + def cpu_op_exec(self, input): + output = torch.sigmoid(input) + output = output.numpy() + return output + + def npu_op_exec(self, input): + output = torch.sigmoid(input) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_out_exec(self, input, output): + torch.sigmoid(input, out = output) + output = output.to("cpu").numpy() + return output + + def test_sigmoid_shape_format_fp16(self, device): + format_list = [0] + shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 1, 100) + cpu_input = cpu_input.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_exec(npu_input) + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + def test_sigmoid_shape_format_fp32(self, device): + format_list = [0, 3, 4, 29] + shape_list = [1, (32, 32, 3, 3), (256, 2048, 7, 7)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 1, 100) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_exec(npu_input) + self.assertRtolEqual(cpu_output, npu_output) + + def test_sigmoid_out_float32_shape_format(self, device): + shape_format = [ + [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]], + [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [128, 32, 8, 10]]], + [[np.float32, 0, [512, 32]], [np.float32, 0, [1024, 20]]], + [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]], + [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]], + [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]], + [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]], + [[np.float32, 3, [1024]], [np.float32, 3, [1024]]], + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 1, 100) + cpu_output, npu_output = create_common_tensor(item[1], -1, 1) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_out_exec(npu_input, npu_output) + self.assertRtolEqual(cpu_output, npu_output) + + def test_sigmoid_out_float16_shape_format(self, device): + shape_format = [ + [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]], + [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [128, 32, 8, 10]]], + [[np.float16, 0, [510, 32]], [np.float16, 0, [1024, 20]]], + [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]], + [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]], + [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]], + [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]], + [[np.float16, 3, [1024]], [np.float16, 3, [128]]], + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 1, 100) + cpu_output, npu_output = create_common_tensor(item[1], -1, 1) + if item[0][0] == np.float16: + cpu_input = cpu_input.to(torch.float32) + cpu_output = cpu_output.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_out_exec(npu_input, 
npu_output) + if item[0][0] == np.float16: + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestSigmoid, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_sigmoid_backward.py b/test/test_network_ops/test_sigmoid_backward.py new file mode 100644 index 00000000000..3c4b6b51b34 --- /dev/null +++ b/test/test_network_ops/test_sigmoid_backward.py @@ -0,0 +1,99 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append('..') +import torch +import numpy as np +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + + +def input_grad_hook(grad): + global input_grad + input_grad = grad + input_grad = input_grad.numpy() + + +def npu_input_grad_hook(grad): + global npu_input_grad + npu_input_grad = grad.to("cpu") + npu_input_grad = npu_input_grad.numpy() + + +class TestSigmoidBackward(TestCase): + def cpu_op_exec(self, input, is_contiguous = True): + if is_contiguous is False : + input = input.as_strided([2,2], [1,2], 1) + input.requires_grad = True + input.register_hook(input_grad_hook) + output = torch.sigmoid(input) + z = output.sum() + z.backward() + + def npu_op_exec(self, input, is_contiguous = True): + if is_contiguous is False : + input = input.as_strided([2,2], [1,2], 1) + input.requires_grad = True + input.register_hook(npu_input_grad_hook) + + output = torch.sigmoid(input) + z = output.sum() + z.backward() + input = input.cpu() + + def test_sigmoid_backward_shape_format_fp16(self, device): + format_list = [0] + shape_list = [5,(64, 10),(32, 3, 3),(256, 2048, 7, 7)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + input1, npu_input1 = create_common_tensor(item, 1, 100) + input2, npu_input2 = create_common_tensor(item, 1, 100) + input1 = input1.to(torch.float32) + input2 = input2.to(torch.float32) + self.cpu_op_exec(input1) + self.npu_op_exec(npu_input1) + global input_grad + input_grad = input_grad.astype(npu_input_grad.dtype) + self.assertRtolEqual(input_grad, npu_input_grad) + + self.cpu_op_exec(input2, False) + self.npu_op_exec(npu_input2, False) + input_grad = input_grad.astype(np.float16) + self.assertRtolEqual(input_grad, npu_input_grad) + + def test_sigmoid_backward_shape_format_fp32(self, device): + format_list = [0, 3, 4, 29] + shape_list = [(256, 2048, 7, 7)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + input1, npu_input1 = create_common_tensor(item, 1, 100) + input2, npu_input2 = create_common_tensor(item, 1, 100) + self.cpu_op_exec(input1) + self.npu_op_exec(npu_input1) + self.assertRtolEqual(input_grad, npu_input_grad) + + self.cpu_op_exec(input2, False) + self.npu_op_exec(npu_input2, False) + self.assertRtolEqual(input_grad, 
npu_input_grad) + + +instantiate_device_type_tests(TestSigmoidBackward, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_stack.py b/test/test_network_ops/test_stack.py new file mode 100644 index 00000000000..135cb870906 --- /dev/null +++ b/test/test_network_ops/test_stack.py @@ -0,0 +1,159 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestStack(TestCase): + def cpu_op_exec(self, input1, input2, dim): + cpu_output = torch.stack((input1, input2), dim) + cpu_output = cpu_output.numpy() + return cpu_output + + def npu_op_exec(self, input1, input2, dim): + output = torch.stack((input1, input2), dim) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_exec_out(self, input1, input2, dim, input3): + torch.stack((input1, input2), dim, out=input3) + output = input3.numpy() + return output + + def npu_op_exec_out(self, input1, input2, dim, input3): + torch.stack((input1, input2), dim, out=input3) + output = input3.to("cpu") + output = output.numpy() + return output + + def npu_output_size(self, inputs = [], dim = 0): + shape = [] + for i in range(dim): + shape.append(inputs[0].size(i)) + shape.append(len(inputs)) + for i in range(dim, inputs[0].dim()): + shape.append(inputs[0].size(i)) + + return shape + + def stack_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100) + shape = self.npu_output_size([npu_input1,npu_input2], item[1]) + npu_input3 = torch.ones(shape, dtype = cpu_input1.dtype).npu() + cpu_input3 = torch.ones(shape, dtype = cpu_input1.dtype) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + cpu_input2 = cpu_input2.to(torch.float32) + cpu_input3 = cpu_input3.to(torch.float32) + + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, item[1]) + npu_output = self.npu_op_exec(npu_input1, npu_input2, item[1]) + cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, item[1], cpu_input3) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, item[1], npu_input3) + + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output_out) + + def test_stack_shape_format_fp16_1d(self, device): + format_list = [0, 3] + shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_fp16_2d(self, device): + format_list = [0, 3, 29] + shape_format = [[[np.float16, i, [5, 256]], np.random.randint(0, 2)] for i in format_list] + 
self.stack_result(shape_format) + + def test_stack_shape_format_fp16_3d(self, device): + format_list = [0, 3, 29] + shape_format = [[[np.float16, i, [32, 3, 3]], np.random.randint(0, 3)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_fp16_4d(self, device): + format_list = [0, 3, 29] + shape_format = [[[np.float16, i, [32, 32, 3, 3]], np.random.randint(0, 4)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_fp32_1d(self, device): + format_list = [0, 3] + shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_fp32_2d(self, device): + format_list = [0, 3, 29] + shape_format = [[[np.float32, i, [5, 256]], np.random.randint(0, 2)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_fp32_3d(self, device): + format_list = [0, 3, 29] + shape_format = [[[np.float32, i, [32, 3, 3]], np.random.randint(0, 3)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_fp32_4d(self, device): + format_list = [0, 3, 29] + shape_format = [[[np.float32, i, [32, 32, 3, 3]], np.random.randint(0, 4)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_int32_1d(self, device): + format_list = [0] + shape_format = [[[np.int32, i, [18]], np.random.randint(0, 1)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_int32_2d(self, device): + format_list = [0] + shape_format = [[[np.int32, i, [5, 256]], np.random.randint(0, 2)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_int32_3d(self, device): + format_list = [0] + shape_format = [[[np.int32, i, [32, 3, 3]], np.random.randint(0, 3)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_shape_format_int32_4d(self, device): + format_list = [-1] + shape_format = [[[np.int32, i, [32, 32, 3, 3]], np.random.randint(0, 4)] for i in format_list] + self.stack_result(shape_format) + + def test_stack_size_dim(self, device): + def cpu_op_exec(input1): + output = torch.stack((input1, input1, input1, input1, input1, input1, input1, input1, input1)) + return output.numpy() + + def npu_op_exec(input1): + output = torch.stack((input1, input1, input1, input1, input1, input1, input1, input1, input1)) + output = output.to("cpu") + return output.numpy() + shape_format = [ + [[np.int32, 0, ()]], + [[np.float32, 0, ()]], + [[np.float16, 0, ()]], + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + cpu_output = cpu_op_exec(cpu_input1) + npu_output = npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestStack, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_zero.py b/test/test_network_ops/test_zero.py new file mode 100644 index 00000000000..a8466832968 --- /dev/null +++ b/test/test_network_ops/test_zero.py @@ -0,0 +1,109 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestZero(TestCase): + def cpu_op_exec(self, input1): + torch.zero_(input1) + output = input1.numpy() + return output + + def npu_op_exec(self, input1): + torch.zero_(input1) + output = input1.to("cpu") + output = output.numpy() + return output + + def zero_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def test_zero_shape_format_fp16_1d(self, device): + format_list = [0, 3, 29] + shape_format = [[np.float16, i, [18]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_fp16_2d(self, device): + format_list = [0, 3, 29] + shape_format = [[np.float16, i, [5, 256]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_fp16_3d(self, device): + format_list = [0, 3, 29] + shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_fp16_4d(self, device): + format_list = [0, 3, 29] + shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_fp32_1d(self, device): + format_list = [0, 3, 29] + shape_format = [[np.float32, i, [18]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_fp32_2d(self, device): + format_list = [0, 3, 29] + shape_format = [[np.float32, i, [5, 256]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_fp32_3d(self, device): + format_list = [0, 3, 29] + shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_fp32_4d(self, device): + format_list = [0, 3, 29] + shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_int32_1d(self, device): + format_list = [-1, 0] + shape_format = [[np.int32, i, [18]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_int32_2d(self, device): + format_list = [-1, 0] + shape_format = [[np.int32, i, [5, 256]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_int32_3d(self, device): + format_list = [-1, 0] + shape_format = [[np.int32, i, [32, 3, 3]] for i in format_list] + self.zero_result(shape_format) + + def test_zero_shape_format_int32_4d(self, device): + format_list = [-1, 0] + shape_format = [[np.int32, i, [64, 112, 7, 7]] for i in format_list] + self.zero_result(shape_format) + + +instantiate_device_type_tests(TestZero, 
globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_zeros.py b/test/test_network_ops/test_zeros.py new file mode 100644 index 00000000000..d49d942ab4b --- /dev/null +++ b/test/test_network_ops/test_zeros.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestZeros(TestCase): + def cpu_op_exec(self, input1, dtype): + output = torch.zeros(input1.size(), dtype=dtype, device="cpu") + output = output.numpy() + return output + + def npu_op_exec(self, input1, dtype): + output = torch.zeros(input1.size(), dtype=dtype, device="npu") + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, dtype): + torch.zeros(input1.size(), dtype=dtype, device="npu", out=input2) + output = input2.to("cpu") + output = output.numpy() + return output + + def zeros_result(self, shape_format): + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + npu_input2 = copy.deepcopy(cpu_input1) + npu_input2 = npu_input2.to(item[1]).to('npu') + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + + cpu_output = self.cpu_op_exec(cpu_input1, item[1]) + npu_output = self.npu_op_exec(npu_input1, item[1]) + npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, item[1]) + cpu_output = cpu_output.astype(npu_output.dtype) + + self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output_out) + + def test_zeros_shape_format_names(self, device): + format_list = [0, 3, 29] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.float32, i, [18, 24, 8, 8]], j] for i in format_list for j in dtype_list] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + + cpu_output = torch.zeros(cpu_input1.size(), names=('N', 'C', 'H', 'W'), dtype=item[1], device="cpu") + cpu_output = cpu_output.numpy() + npu_output = torch.zeros(cpu_input1.size(), names=('N', 'C', 'H', 'W'), dtype=item[1], device="npu") + npu_output = npu_output.to("cpu") + npu_output = npu_output.numpy() + cpu_output = cpu_output.astype(npu_output.dtype) + + self.assertRtolEqual(cpu_output, npu_output) + + def test_zeros_shape_format_fp16_1d(self, device): + format_list = [0, 3, 29] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.float16, i, [18]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_fp16_2d(self, device): + format_list = [0, 3, 29] + dtype_list = 
[torch.float16, torch.float32, torch.int32] + shape_format = [[[np.float16, i, [5, 256]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_fp16_3d(self, device): + format_list = [0, 3, 29] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.float16, i, [32, 3, 3]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_fp16_4d(self, device): + format_list = [0, 3, 29] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.float16, i, [64, 112, 7, 7]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_fp32_1d(self, device): + format_list = [0, 3, 29] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.float32, i, [18]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_fp32_2d(self, device): + format_list = [0, 3, 29] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.float32, i, [5, 256]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_fp32_3d(self, device): + format_list = [0, 3, 29] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.float32, i, [32, 3, 3]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_fp32_4d(self, device): + format_list = [0, 3, 29] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.float32, i, [64, 112, 7, 7]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_int32_1d(self, device): + format_list = [-1, 0] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.int32, i, [18]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_int32_2d(self, device): + format_list = [-1, 0] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.int32, i, [5, 256]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_int32_3d(self, device): + format_list = [-1, 0] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.int32, i, [32, 3, 3]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + def test_zeros_shape_format_int32_4d(self, device): + format_list = [-1, 0] + dtype_list = [torch.float16, torch.float32, torch.int32] + shape_format = [[[np.int32, i, [64, 112, 7, 7]], j] for i in format_list for j in dtype_list] + self.zeros_result(shape_format) + + +instantiate_device_type_tests(TestZeros, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp index fa27bc376a4..0b05e8361dd 100644 --- a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp @@ -123,7 +123,7 @@ namespace at_npu return result; } - at::Tensor &eq_npu_(at::Tensor &self, const at::Tensor &other) + at::Tensor& NPUNativeFunctions::eq_(at::Tensor &self, const at::Tensor &other) { OpPreparation::CastBackToOriFormat(self); at::Tensor formatCastOfOther = OpPreparation::CastBackToOriFormat(other); @@ -152,7 +152,7 @@ namespace at_npu return self; } - 
at::Tensor &eq_npu_(at::Tensor &self, at::Scalar other)
+  at::Tensor& NPUNativeFunctions::eq_(at::Tensor &self, at::Scalar other)
   {
     OpPreparation::CastBackToOriFormat(self);
     c10::SmallVector<at::Tensor, N> inputs = {self};
diff --git a/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp b/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp
new file mode 100644
index 00000000000..a1e90de796a
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp
@@ -0,0 +1,152 @@
+// Copyright (c) 2022 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& floor_divide_out_npu_nocheck(at::Tensor& result, const at::Tensor& self, at::Scalar other) {
+  OpCommand cmd;
+  cmd.Name("FloorDiv")
+     .Input(self)
+     .Input(other, self.scalar_type())
+     .Output(result)
+     .Run();
+  return result;
+}
+
+at::Tensor& floor_divide_out_scalar_npu(const at::Tensor& self, at::Scalar other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  auto outputSize = formatCastOfSelf.sizes();
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      ACL_FORMAT_ND,
+      result.scalar_type(),
+      outputSize);
+
+  floor_divide_out_npu_nocheck(result, formatCastOfSelf, other);
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::floor_divide_out(const at::Tensor& self, const at::Tensor& other, at::Tensor& result) {
+  at::Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  auto outputSize = formatCastOfSelf.sizes();
+  OpPreparation::CheckOut(
+      {self, other},
+      result,
+      ACL_FORMAT_ND,
+      result.scalar_type(),
+      outputSize);
+  // executing the NPU operator
+  if (other.dim() == 0) {
+    floor_divide_out_npu_nocheck(result, self, other.item());
+  } else {
+    OpCommand cmd;
+    cmd.Name("FloorDiv")
+       .Input(self)
+       .Input(other)
+       .Output(result)
+       .Run();
+  }
+
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::floor_divide(const at::Tensor& self, const at::Tensor& other) {
+  at::Tensor selfCast = self;
+  if (self.dtype() == at::ScalarType::Bool) {
+    selfCast = selfCast.to(at::ScalarType::Float);
+  }
+  at::Tensor otherCast = other;
+  if (other.scalar_type() == at::ScalarType::Double) {
+    otherCast = otherCast.to(at::ScalarType::Float);
+  }
+  if (other.scalar_type() == at::ScalarType::Long) {
+    otherCast = otherCast.to(at::ScalarType::Int);
+  }
+
+  // calculate the output size
+  bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(selfCast);
+  at::Tensor outputTensor = isSelfWrapped ? otherCast : selfCast;
+
+  auto outputSize = broadcast_ops_npu_output_size(selfCast, otherCast);
+
+  // construct the output tensor of the NPU
+
+  at::Tensor result = OpPreparation::ApplyTensorWithFormat(
+      outputSize,
+      outputTensor.options(),
+      CalcuOpUtil::get_tensor_npu_format(selfCast));
+
+  // calculate the output result of the NPU
+  NPUNativeFunctions::floor_divide_out(selfCast, otherCast, result);
+
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::floor_divide(const at::Tensor& self, at::Scalar other) {
+
+  // calculate the output size
+  auto outputSize = input_same_output_size(self);
+
+  // construct the output tensor of the NPU
+  at::Tensor result = OpPreparation::ApplyTensor(self, outputSize);
+
+  // calculate the output result of the NPU
+  floor_divide_out_scalar_npu(self, other, result);
+
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::floor_divide_(at::Tensor& self, const at::Tensor& other) {
+  Tensor otherCast = other;
+  if (other.scalar_type() == at::ScalarType::Double) {
+    otherCast = otherCast.to(at::ScalarType::Float);
+  }
+  if (other.scalar_type() == at::ScalarType::Long) {
+    otherCast = otherCast.to(at::ScalarType::Int);
+  }
+  SmallVector<at::Tensor, N> inputs = {self, otherCast};
+  SmallVector<at::Tensor, N> outputs = {self};
+  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+
+  if (!NpuUtils::check_match(&self)) {
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    at::Tensor result = NPUNativeFunctions::floor_divide_out(contiguousSelf, otherCast, contiguousSelf);
+    NpuUtils::format_fresh_view(self, result);
+  } else {
+    NPUNativeFunctions::floor_divide_out(self, otherCast, self);
+  }
+
+  return self;
+}
+
+at::Tensor& NPUNativeFunctions::floor_divide_(at::Tensor& self, at::Scalar other) {
+  if (!NpuUtils::check_match(&self)) {
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    floor_divide_out_scalar_npu(contiguousSelf, other, contiguousSelf);
+    NpuUtils::format_fresh_view(self, contiguousSelf);
+  } else {
+    floor_divide_out_scalar_npu(self, other, self);
+  }
+  return self;
+}
+
+} // namespace native
+} // at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/SigmoidBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/SigmoidBackwardKernelNpu.cpp
new file mode 100644
index 00000000000..a304d82ee83
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/SigmoidBackwardKernelNpu.cpp
@@ -0,0 +1,61 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& sigmoid_backward_out_npu_nocheck( + at::Tensor& result, + const at::Tensor& grad_output, + const at::Tensor& output) { + // output'format must be same with grad_output + if (CalcuOpUtil::get_tensor_npu_format(output) != CalcuOpUtil::get_tensor_npu_format(grad_output)) { + output.npu_format_cast_(CalcuOpUtil::get_tensor_npu_format(grad_output)); + } + + auto unified_result = OpPreparation::binary_op_check(result, output, grad_output, true); + OpCommand cmd; + cmd.Name("SigmoidGrad") + .Expect(unified_result) + .Input(output) + .Input(grad_output) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::sigmoid_backward_out( + const at::Tensor& grad_output, + const at::Tensor& output, + at::Tensor& result) { + OpPreparation::CheckOut({grad_output, output}, result, grad_output); + sigmoid_backward_out_npu_nocheck(result, grad_output, output); + return result; +} + +at::Tensor NPUNativeFunctions::sigmoid_backward(const at::Tensor& grad_output, const at::Tensor& output) { + at::Tensor grad_input = OpPreparation::ApplyTensor(grad_output); + sigmoid_backward_out_npu_nocheck(grad_input, grad_output, output); + + return grad_input; +} + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/aten/ops/SigmoidKernelNpu.cpp b/torch_npu/csrc/aten/ops/SigmoidKernelNpu.cpp new file mode 100644 index 00000000000..19f6ffb5e1f --- /dev/null +++ b/torch_npu/csrc/aten/ops/SigmoidKernelNpu.cpp @@ -0,0 +1,57 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& sigmoid_out_npu_nocheck(at::Tensor& result, const at::Tensor& self) { + OpCommand cmd; + cmd.Name("Sigmoid") + .Input(self) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::sigmoid_out(const at::Tensor& self, at::Tensor& result) { + OpPreparation::CheckOut( + {self}, + result, + self); + + OpPipeWithDefinedOut pipe; + return pipe.CheckMemory({self}, {result}) + .Func([&self](at::Tensor& result){sigmoid_out_npu_nocheck(result, self);}) + .Call(result); +} + +at::Tensor& NPUNativeFunctions::sigmoid_(at::Tensor& self) { + NPUNativeFunctions::sigmoid_out(self, self); + + return self; +} + +at::Tensor NPUNativeFunctions::sigmoid(const at::Tensor& self) { + at::Tensor result = OpPreparation::ApplyTensor(self); + sigmoid_out_npu_nocheck(result, self); + return result; +} + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/aten/ops/StackKernelNpu.cpp b/torch_npu/csrc/aten/ops/StackKernelNpu.cpp new file mode 100644 index 00000000000..1b776a6308e --- /dev/null +++ b/torch_npu/csrc/aten/ops/StackKernelNpu.cpp @@ -0,0 +1,85 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +SmallVector stack_npu_output_size( + at::TensorList tensors, + int64_t dim) { + dim = CalcuOpUtil::make_wrap_dim(dim, tensors[0].dim() + 1); + SmallVector shape; + for (int i = 0; i < dim; i++) { + shape.emplace_back(tensors[0].size(i)); + } + shape.emplace_back(tensors.size()); + for (int i = dim; i < tensors[0].dim(); i++) { + shape.emplace_back(tensors[0].size(i)); + } + + return shape; +} + +at::Tensor& stack_out_npu_nocheck(at::TensorList tensors, int64_t dim, at::Tensor& result) { + auto inputTensors = CalcuOpUtil::ConvertTensorListToSmallVector(tensors); + + OpCommand cmd; + cmd.Name("Pack"); + for (int i = 0; i < inputTensors.size(); i++) { + string inputName = "x" + to_string(i); + cmd.Input(inputTensors[i],inputName); + } + cmd.Output(result) + .Attr("N", (int64_t)tensors.size()) + .Attr("axis", dim) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::stack_out(at::TensorList tensors, int64_t dim, at::Tensor& result) { + auto outputSize = stack_npu_output_size(tensors, dim); + + OpPreparation::CheckOut( + {tensors[0]}, + result, + ACL_FORMAT_ND, + tensors[0].scalar_type(), + outputSize); + + stack_out_npu_nocheck(tensors, dim, result); + + return result; +} + +at::Tensor NPUNativeFunctions::stack(at::TensorList tensors, int64_t dim) { + auto outputSize = stack_npu_output_size(tensors, dim); + + at::Tensor result = OpPreparation::ApplyTensorWithFormat( + outputSize, + tensors[0].options(), + ACL_FORMAT_ND); + + stack_out_npu_nocheck(tensors, dim, result); + + return result; +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp b/torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp new file mode 100644 index 00000000000..00e37458c1b --- /dev/null +++ b/torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp @@ -0,0 +1,52 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& NPUNativeFunctions::zeros_out(at::IntArrayRef size, at::Tensor& result) { + result.resize_(size); + return result.zero_(); +} + +at::Tensor NPUNativeFunctions::zeros(at::IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { + at::TensorOptions option = option.dtype(dtype_opt) + .layout(layout_opt) + .device(device_opt) + .pinned_memory(pin_memory_opt); + at::Tensor result = OpPreparation::ApplyTensorWithFormat(size, option, ACL_FORMAT_ND); + return result.zero_(); +} + +at::Tensor NPUNativeFunctions::zeros( + IntArrayRef size, + optional names, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { + return NPUNativeFunctions::zeros(size, dtype_opt, layout_opt, device_opt, pin_memory_opt); +} + +} // namespace native +} // namespace at_npu \ No newline at end of file -- Gitee From 0875ab0356bdf3ecbb0bafb1b6858000707c5cac Mon Sep 17 00:00:00 2001 From: wangxiao Date: Sat, 26 Feb 2022 18:37:59 +0800 Subject: [PATCH 2/3] fix bugs of floordivide, sigmoid, sigmoid_backward, stack, zeros --- test/test_network_ops/test_floordivide.py | 18 +++++------ test/test_network_ops/test_sigmoid.py | 25 +++++++-------- .../test_network_ops/test_sigmoid_backward.py | 32 +++++++++---------- test/test_network_ops/test_stack.py | 2 +- .../csrc/aten/ops/FloorDivideKernelNpu.cpp | 2 +- .../aten/ops/SigmoidBackwardKernelNpu.cpp | 5 --- torch_npu/csrc/aten/ops/StackKernelNpu.cpp | 4 +-- torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp | 4 +-- 8 files changed, 42 insertions(+), 50 deletions(-) diff --git a/test/test_network_ops/test_floordivide.py b/test/test_network_ops/test_floordivide.py index e9b3086b014..8c477d15ae7 100644 --- a/test/test_network_ops/test_floordivide.py +++ b/test/test_network_ops/test_floordivide.py @@ -23,9 +23,9 @@ from torch_npu.testing.util_test import create_common_tensor class TestFloorDivide(TestCase): - def generate_data(self, min, max, shape, dtype): - input1 = np.random.uniform(min, max, shape).astype(dtype) - input2 = np.random.uniform(min, max, shape).astype(dtype) + def generate_data(self, min1, max1, shape, dtype): + input1 = np.random.uniform(min1, max1, shape).astype(dtype) + input2 = np.random.uniform(min1, max1, shape).astype(dtype) #modify from numpy.ndarray to torch.tensor npu_input1 = torch.from_numpy(input1) @@ -33,10 +33,10 @@ class TestFloorDivide(TestCase): return npu_input1, npu_input2 - def generate_three_data(self, min, max, shape, dtype): - input1 = np.random.uniform(min, max, shape).astype(dtype) - input2 = np.random.uniform(min, max, shape).astype(dtype) - input3 = np.random.uniform(min, max, shape).astype(dtype) + def generate_three_data(self, min1, max1, shape, dtype): + input1 = np.random.uniform(min1, max1, shape).astype(dtype) + input2 = np.random.uniform(min1, max1, shape).astype(dtype) + input3 = np.random.uniform(min1, max1, shape).astype(dtype) #modify from numpy.ndarray to torch.tensor npu_input1 = torch.from_numpy(input1) @@ -110,13 +110,13 @@ class TestFloorDivide(TestCase): self.assertRtolEqual(cpu_output, npu_output) def test_floor_divide_scalar_float32(self, device): - npu_input1, _= self.generate_data(1, 100, (1,3), np.float32) + npu_input1, _ = self.generate_data(1, 100, (1,3), np.float32) cpu_output = self.cpu_op_exec(npu_input1, 1) npu_output = 
self.npu_op_exec_scalar(npu_input1, 1) self.assertRtolEqual(cpu_output, npu_output) def test_floor_divide_scalar_bool(self, device): - npu_input1, _= self.generate_data(1, 10, (2, 5), np.float32) + npu_input1, _ = self.generate_data(1, 10, (2, 5), np.float32) cpu_output = self.cpu_op_exec(npu_input1 > 5, 1.0) npu_output = self.npu_op_exec_scalar(npu_input1 > 5, 1.0) self.assertRtolEqual(cpu_output, npu_output) diff --git a/test/test_network_ops/test_sigmoid.py b/test/test_network_ops/test_sigmoid.py index e7714507950..41cdaf60abd 100644 --- a/test/test_network_ops/test_sigmoid.py +++ b/test/test_network_ops/test_sigmoid.py @@ -11,17 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import sys -sys.path.append('..') import torch +import torch_npu import numpy as np -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor class TestSigmoid(TestCase): - @dtypes(torch.float) + @Dtypes(torch.float) def test_sigmoid(self, device, dtype): # TODO: why not simulate math.sigmoid like with rsqrt? inputValues = [-1000, -1, 0, 0.5, 1, 2, 1000] @@ -35,19 +34,19 @@ class TestSigmoid(TestCase): expectedOutput, dtype=dtype, device=device).cpu(), precision_4dps) - def cpu_op_exec(self, input): - output = torch.sigmoid(input) + def cpu_op_exec(self, input1): + output = torch.sigmoid(input1) output = output.numpy() return output - def npu_op_exec(self, input): - output = torch.sigmoid(input) + def npu_op_exec(self, input1): + output = torch.sigmoid(input1) output = output.to("cpu") output = output.numpy() return output - def npu_op_out_exec(self, input, output): - torch.sigmoid(input, out = output) + def npu_op_out_exec(self, input1, output): + torch.sigmoid(input1, out = output) output = output.to("cpu").numpy() return output diff --git a/test/test_network_ops/test_sigmoid_backward.py b/test/test_network_ops/test_sigmoid_backward.py index 3c4b6b51b34..98f6257b6b5 100644 --- a/test/test_network_ops/test_sigmoid_backward.py +++ b/test/test_network_ops/test_sigmoid_backward.py @@ -11,15 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import sys -sys.path.append('..') import torch +import torch_npu import numpy as np -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor def input_grad_hook(grad): global input_grad @@ -34,25 +32,25 @@ def npu_input_grad_hook(grad): class TestSigmoidBackward(TestCase): - def cpu_op_exec(self, input, is_contiguous = True): + def cpu_op_exec(self, input1, is_contiguous = True): if is_contiguous is False : - input = input.as_strided([2,2], [1,2], 1) - input.requires_grad = True - input.register_hook(input_grad_hook) - output = torch.sigmoid(input) + input1 = input1.as_strided([2,2], [1,2], 1) + input1.requires_grad = True + input1.register_hook(input_grad_hook) + output = torch.sigmoid(input1) z = output.sum() z.backward() - def npu_op_exec(self, input, is_contiguous = True): + def npu_op_exec(self, input1, is_contiguous = True): if is_contiguous is False : - input = input.as_strided([2,2], [1,2], 1) - input.requires_grad = True - input.register_hook(npu_input_grad_hook) + input1 = input1.as_strided([2,2], [1,2], 1) + input1.requires_grad = True + input1.register_hook(npu_input_grad_hook) - output = torch.sigmoid(input) + output = torch.sigmoid(input1) z = output.sum() z.backward() - input = input.cpu() + input1 = input1.cpu() def test_sigmoid_backward_shape_format_fp16(self, device): format_list = [0] diff --git a/test/test_network_ops/test_stack.py b/test/test_network_ops/test_stack.py index 135cb870906..14b32b8a681 100644 --- a/test/test_network_ops/test_stack.py +++ b/test/test_network_ops/test_stack.py @@ -42,7 +42,7 @@ class TestStack(TestCase): output = output.numpy() return output - def npu_output_size(self, inputs = [], dim = 0): + def npu_output_size(self, inputs, dim = 0): shape = [] for i in range(dim): shape.append(inputs[0].size(i)) diff --git a/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp b/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp index a1e90de796a..5e3d393ed32 100644 --- a/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp @@ -115,7 +115,7 @@ at::Tensor NPUNativeFunctions::floor_divide(const at::Tensor& self, at::Scalar o } at::Tensor& NPUNativeFunctions::floor_divide_(at::Tensor& self, const at::Tensor& other) { - Tensor otherCast = other; + at::Tensor otherCast = other; if (other.scalar_type() == at::ScalarType::Double) { otherCast = otherCast.to(at::ScalarType::Float); } diff --git a/torch_npu/csrc/aten/ops/SigmoidBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/SigmoidBackwardKernelNpu.cpp index a304d82ee83..fdd644c05bd 100644 --- a/torch_npu/csrc/aten/ops/SigmoidBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SigmoidBackwardKernelNpu.cpp @@ -24,11 +24,6 @@ at::Tensor& sigmoid_backward_out_npu_nocheck( at::Tensor& result, const at::Tensor& grad_output, const at::Tensor& output) { - // output'format must be same with grad_output - if (CalcuOpUtil::get_tensor_npu_format(output) != CalcuOpUtil::get_tensor_npu_format(grad_output)) { - output.npu_format_cast_(CalcuOpUtil::get_tensor_npu_format(grad_output)); - } - auto unified_result = OpPreparation::binary_op_check(result, output, grad_output, true); OpCommand cmd; cmd.Name("SigmoidGrad") diff --git 
a/torch_npu/csrc/aten/ops/StackKernelNpu.cpp b/torch_npu/csrc/aten/ops/StackKernelNpu.cpp
index 1b776a6308e..7ad05515d45 100644
--- a/torch_npu/csrc/aten/ops/StackKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/StackKernelNpu.cpp
@@ -20,11 +20,11 @@
 namespace at_npu {
 namespace native {
 
-SmallVector<int64_t, SIZE> stack_npu_output_size(
+at::SmallVector<int64_t, SIZE> stack_npu_output_size(
     at::TensorList tensors,
     int64_t dim) {
   dim = CalcuOpUtil::make_wrap_dim(dim, tensors[0].dim() + 1);
-  SmallVector<int64_t, SIZE> shape;
+  at::SmallVector<int64_t, SIZE> shape;
   for (int i = 0; i < dim; i++) {
     shape.emplace_back(tensors[0].size(i));
   }
diff --git a/torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp b/torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp
index 00e37458c1b..173ec561023 100644
--- a/torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ZerosKernelNpu.cpp
@@ -39,8 +39,8 @@ at::Tensor NPUNativeFunctions::zeros(
-    IntArrayRef size,
-    optional<at::DimnameList> names,
+    at::IntArrayRef size,
+    c10::optional<at::DimnameList> names,
     c10::optional<at::ScalarType> dtype_opt,
     c10::optional<at::Layout> layout_opt,
     c10::optional<at::Device> device_opt,
--
Gitee


From 43fd218dc26e8d3a86c3b9c7022a2956c5d45faa Mon Sep 17 00:00:00 2001
From: wangxiao
Date: Mon, 28 Feb 2022 10:18:12 +0800
Subject: [PATCH 3/3] fix bugs of floordiv, stack

---
 torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp | 4 ++--
 torch_npu/csrc/aten/ops/StackKernelNpu.cpp       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp b/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp
index 5e3d393ed32..0e8bc5d1c27 100644
--- a/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/FloorDivideKernelNpu.cpp
@@ -122,8 +122,8 @@ at::Tensor& NPUNativeFunctions::floor_divide_(at::Tensor& self, const at::Tensor
   if (other.scalar_type() == at::ScalarType::Long) {
     otherCast = otherCast.to(at::ScalarType::Int);
   }
-  SmallVector<at::Tensor, N> inputs = {self, otherCast};
-  SmallVector<at::Tensor, N> outputs = {self};
+  at::SmallVector<at::Tensor, N> inputs = {self, otherCast};
+  at::SmallVector<at::Tensor, N> outputs = {self};
   CalcuOpUtil::check_memory_over_laps(inputs, outputs);
 
   if (!NpuUtils::check_match(&self)) {
     at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
diff --git a/torch_npu/csrc/aten/ops/StackKernelNpu.cpp b/torch_npu/csrc/aten/ops/StackKernelNpu.cpp
index 7ad05515d45..cf54ed20af7 100644
--- a/torch_npu/csrc/aten/ops/StackKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/StackKernelNpu.cpp
@@ -42,7 +42,7 @@ at::Tensor& stack_out_npu_nocheck(at::TensorList tensors, int64_t dim, at::Tensor
   OpCommand cmd;
   cmd.Name("Pack");
   for (int i = 0; i < inputTensors.size(); i++) {
-    string inputName = "x" + to_string(i);
+    string inputName = "x" + std::to_string(i);
     cmd.Input(inputTensors[i],inputName);
   }
   cmd.Output(result)
--
Gitee
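
Reviewer note (not part of the series): a quick end-to-end smoke test for the ops this series adds. This is only a sketch under assumptions — it needs a torch_npu build that already dispatches these kernels, a reachable default NPU device, the Tensor.npu()/Tensor.cpu() helpers and the "npu" device string that torch_npu registers, and the 1e-4 tolerance is arbitrary.

import torch
import torch_npu  # noqa: F401  # registers the NPU backend and Tensor.npu()

x = torch.rand(2, 8, 128)
y = torch.rand(2, 8, 128) + 0.5  # keep the divisor away from zero

# eq, floor_divide, sigmoid and stack: compare CPU against NPU results
checks = {
    "eq": lambda a, b: torch.eq(a, b),
    "floor_divide": lambda a, b: torch.floor_divide(a, b),
    "sigmoid": lambda a, b: torch.sigmoid(a),
    "stack": lambda a, b: torch.stack((a, b), 1),
}
for name, fn in checks.items():
    cpu_out = fn(x, y)
    npu_out = fn(x.npu(), y.npu()).cpu()
    # float() so the boolean eq output also goes through allclose
    assert torch.allclose(cpu_out.float(), npu_out.float(), atol=1e-4), name

# sigmoid_backward is reached through autograd
g = x.npu().requires_grad_()
torch.sigmoid(g).sum().backward()
assert g.grad is not None

# zeros (functional) and zero_ (in-place) should agree
z = torch.zeros(2, 3, dtype=torch.float32, device="npu")
z_ = torch.ones(2, 3).npu().zero_()
assert torch.eq(z.cpu(), z_.cpu()).all()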