diff --git a/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py b/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py
index b4a6b62ebd73368d3d446e4db9901a283a9a4ef3..0d311808c8a6b2b1db3a862070e917389b36c404 100644
--- a/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py
+++ b/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py
@@ -23,13 +23,20 @@ os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization
 
 
 class TestSpecialCasesCopyToContiguous(TestCase):
-    def test_expand_copy_to_slice_tensor(self, device="npu"):
-        cpu_input = torch.zeros((2, 10)).bool()
-        cpu_out = cpu_input
-        cpu_out[0, :3] = True
-        npu_out = cpu_input.npu()
-        npu_out[0, :3] = True
-        self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy())
+    def test_expand_copy_to_slice_discontiguous_tensor(self, device="npu"):
+        dtype_list = [np.bool, np.int8, np.int16, np.float16, np.float32, np.int32, np.int64]
+        index_list = [3, 8, 16, 32]
+        shape_format = [
+            [i, j] for i in dtype_list for j in index_list
+        ]
+        for item in shape_format:
+            np_input = np.zeros(40).astype(item[0])
+            cpu_input = torch.from_numpy(np_input)
+            cpu_out = cpu_input
+            cpu_out[:item[1]] = 1
+            npu_out = cpu_input.npu()
+            npu_out[:item[1]] = 1
+            self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy())
 
 
 if __name__ == "__main__":
diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp
index 03dd5cc71934965f8f625b9ab2c53e2a010ddac5..54496ce7068dd942a4b46f736ef53230a7deca8b 100644
--- a/torch_npu/csrc/aten/common/CopyKernel.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernel.cpp
@@ -64,7 +64,6 @@ void copy_d2d_last_method(
     bool same_type,
     bool non_blocking) {
   // general copy method but Low performance
-  RECORD_FUNCTION("d2dCopyWithPTCopy", std::vector<c10::IValue>({src}));
   copy_kernel_npu(self, src, non_blocking);
 }
 
@@ -85,7 +84,7 @@ void copy_d2d_dtype_baseformat(
   if (TransContiguous::ContiguousOptimizeWithBaseFormat(self, src)) {
     // Optimized trans-contiguous method
     return;
-  } else if (StorageDescHelper::MetaDataAreMatch(&self)) {
+  } else {
     // General trans-contiguous method
     // Note: AsStrided do not support unmatched tensor input
     NPUNativeFunctions::npu_stride_copy_out(src, src.sizes(), src.strides(), src.storage_offset(), self);
diff --git a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp
index 2fe3560a7de42caff7e479f916a0ac42cc754756..bc285234be0c98f9346148fdc8c3499e334ab92c 100644
--- a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp
@@ -18,6 +18,7 @@
 #include "torch_npu/csrc/framework/StorageDescHelper.h"
 #include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h"
 #include <ATen/ATen.h>
+#include <ATen/record_function.h>
 
 namespace at_npu {
 namespace native {
@@ -46,6 +47,7 @@ void copy_kernel_npu(
     at::Tensor& self,
     const at::Tensor& src,
     bool non_blocking) {
+  RECORD_FUNCTION("d2dCopyWithPTCopy", std::vector<c10::IValue>({src}));
   const int64_t HEAD_FLAG = 0x6461656800000000;
   const int64_t FIXED_LEN = 9; // head, len, version,
   // two tensors' numel, offset and strides lens
diff --git a/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp b/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp
index 3fb30d4b22dc379b82dfb5b0ff8cddba21523f9e..248e4985e611f6927d5c5f2ef25316fb88e46f08 100644
--- a/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp
@@ -15,6 +15,8 @@
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include <ATen/record_function.h>
+#include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h"
+#include "torch_npu/csrc/framework/StorageDescHelper.h"
 
 namespace at_npu {
 namespace native {
@@ -25,6 +27,11 @@ at::Tensor& stride_copy_out_npu_nocheck(
     at::IntArrayRef shape,
     at::IntArrayRef stride,
     at::Scalar storage_offset) {
+  if ((result.nbytes() < 32) && (!StorageDescHelper::MetaDataAreMatch(&result))) {
+    // [Operator constraint] AsStrided does not yet support 1. data moves smaller than one block (32B) with 2. a result whose metadata does not match; fall back to PTCopy.
+    copy_kernel_npu(result, self, false);
+    return result;
+  }
   RECORD_FUNCTION("npuAsStrided", std::vector<c10::IValue>({self}));
   OpCommand cmd;
   cmd.Name("AsStrided")