From 4e1831ef1e10261037a2ebf5da7f2e02d11be070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E8=B6=85?= Date: Tue, 22 Mar 2022 21:23:50 +0800 Subject: [PATCH] bugfix for asstride in small-amount data copying --- .../test_special_cases_copy_to_contiguous.py | 21 ++++++++++++------- torch_npu/csrc/aten/common/CopyKernel.cpp | 3 +-- torch_npu/csrc/aten/common/CopyKernelNpu.cpp | 2 ++ .../csrc/aten/ops/AsStridedKernelNpu.cpp | 7 +++++++ 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py b/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py index b4a6b62ebd..0d311808c8 100644 --- a/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py @@ -23,13 +23,20 @@ os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization class TestSpecialCasesCopyToContiguous(TestCase): - def test_expand_copy_to_slice_tensor(self, device="npu"): - cpu_input = torch.zeros((2, 10)).bool() - cpu_out = cpu_input - cpu_out[0, :3] = True - npu_out = cpu_input.npu() - npu_out[0, :3] = True - self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) + def test_expand_copy_to_slice_discontiguous_tensor(self, device="npu"): + dtype_list = [np.bool, np.int8, np.int16, np.float16, np.float32, np.int32, np.int64] + index_list = [3, 8, 16, 32] + shape_format = [ + [i, j] for i in dtype_list for j in index_list + ] + for item in shape_format: + np_input = np.zeros(40).astype(item[0]) + cpu_input = torch.from_numpy(np_input) + cpu_out = cpu_input + cpu_out[:item[1]] = 1 + npu_out = cpu_input.npu() + npu_out[:item[1]] = 1 + self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) if __name__ == "__main__": diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index 03dd5cc719..54496ce706 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ 
b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -64,7 +64,6 @@ void copy_d2d_last_method( bool same_type, bool non_blocking) { // general copy method but Low performance - RECORD_FUNCTION("d2dCopyWithPTCopy", std::vector({src})); copy_kernel_npu(self, src, non_blocking); } @@ -85,7 +84,7 @@ void copy_d2d_dtype_baseformat( if (TransContiguous::ContiguousOptimizeWithBaseFormat(self, src)) { // Optimized trans-contiguous method return; - } else if (StorageDescHelper::MetaDataAreMatch(&self)) { + } else { // General trans-contiguous method // Note: AsStrided do not support unmatched tensor input NPUNativeFunctions::npu_stride_copy_out(src, src.sizes(), src.strides(), src.storage_offset(), self); diff --git a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp index 2fe3560a7d..bc285234be 100644 --- a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp @@ -18,6 +18,7 @@ #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h" #include +#include namespace at_npu { namespace native { @@ -46,6 +47,7 @@ void copy_kernel_npu( at::Tensor& self, const at::Tensor& src, bool non_blocking) { + RECORD_FUNCTION("d2dCopyWithPTCopy", std::vector({src})); const int64_t HEAD_FLAG = 0x6461656800000000; const int64_t FIXED_LEN = 9; // head, len, version, two tensors' numel, offset and strides lens diff --git a/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp b/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp index 3fb30d4b22..248e4985e6 100644 --- a/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp @@ -15,6 +15,8 @@ #include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include +#include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h" +#include "torch_npu/csrc/framework/StorageDescHelper.h" namespace at_npu { namespace native { @@ -25,6 +27,11 
@@ at::Tensor& stride_copy_out_npu_nocheck( at::IntArrayRef shape, at::IntArrayRef stride, at::Scalar storage_offset) { + if ((result.nbytes() < 32) && (!StorageDescHelper::MetaDataAreMatch(&result))) { + // [Operator constraint] AsStrided does not yet support 1. copying less than one block (32B) of data 2. a result whose NPU storage metadata does not match; fall back to PTCopy instead. + copy_kernel_npu(result, self, false); + return result; + } RECORD_FUNCTION("npuAsStrided", std::vector({self})); OpCommand cmd; cmd.Name("AsStrided") -- Gitee