From 4e1831ef1e10261037a2ebf5da7f2e02d11be070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E8=B6=85?= Date: Tue, 22 Mar 2022 21:23:50 +0800 Subject: [PATCH] bugfix for asstride in small-amount data copying --- .../test_special_cases_copy_to_contiguous.py | 21 ++++++++++++------- torch_npu/csrc/aten/common/CopyKernel.cpp | 3 +-- torch_npu/csrc/aten/common/CopyKernelNpu.cpp | 2 ++ .../csrc/aten/ops/AsStridedKernelNpu.cpp | 7 +++++++ 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py b/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py index b4a6b62ebd..0d311808c8 100644 --- a/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py +++ b/test/test_trans_contiguous/test_special_cases_copy_to_contiguous.py @@ -23,13 +23,20 @@ os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization class TestSpecialCasesCopyToContiguous(TestCase): - def test_expand_copy_to_slice_tensor(self, device="npu"): - cpu_input = torch.zeros((2, 10)).bool() - cpu_out = cpu_input - cpu_out[0, :3] = True - npu_out = cpu_input.npu() - npu_out[0, :3] = True - self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) + def test_expand_copy_to_slice_discontiguous_tensor(self, device="npu"): + dtype_list = [np.bool, np.int8, np.int16, np.float16, np.float32, np.int32, np.int64] + index_list = [3, 8, 16, 32] + shape_format = [ + [i, j] for i in dtype_list for j in index_list + ] + for item in shape_format: + np_input = np.zeros(40).astype(item[0]) + cpu_input = torch.from_numpy(np_input) + cpu_out = cpu_input + cpu_out[:item[1]] = 1 + npu_out = cpu_input.npu() + npu_out[:item[1]] = 1 + self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) if __name__ == "__main__": diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index 03dd5cc719..54496ce706 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ 
b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -64,7 +64,6 @@ void copy_d2d_last_method( bool same_type, bool non_blocking) { // general copy method but Low performance - RECORD_FUNCTION("d2dCopyWithPTCopy", std::vector({src})); copy_kernel_npu(self, src, non_blocking); } @@ -85,7 +84,7 @@ void copy_d2d_dtype_baseformat( if (TransContiguous::ContiguousOptimizeWithBaseFormat(self, src)) { // Optimized trans-contiguous method return; - } else if (StorageDescHelper::MetaDataAreMatch(&self)) { + } else { // General trans-contiguous method // Note: AsStrided do not support unmatched tensor input NPUNativeFunctions::npu_stride_copy_out(src, src.sizes(), src.strides(), src.storage_offset(), self); diff --git a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp index 2fe3560a7d..bc285234be 100644 --- a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp @@ -18,6 +18,7 @@ #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h" #include +#include namespace at_npu { namespace native { @@ -46,6 +47,7 @@ void copy_kernel_npu( at::Tensor& self, const at::Tensor& src, bool non_blocking) { + RECORD_FUNCTION("d2dCopyWithPTCopy", std::vector({src})); const int64_t HEAD_FLAG = 0x6461656800000000; const int64_t FIXED_LEN = 9; // head, len, version, two tensors' numel, offset and strides lens diff --git a/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp b/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp index 3fb30d4b22..248e4985e6 100644 --- a/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AsStridedKernelNpu.cpp @@ -15,6 +15,8 @@ #include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include +#include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h" +#include "torch_npu/csrc/framework/StorageDescHelper.h" namespace at_npu { namespace native { @@ -25,6 +27,11 
@@ at::Tensor& stride_copy_out_npu_nocheck( at::IntArrayRef shape, at::IntArrayRef stride, at::Scalar storage_offset) { + if ((result.nbytes() < 32) && (!StorageDescHelper::MetaDataAreMatch(&result))) { + // [Operator constraint] AsStrided does not yet support 1. copying less than one block (32B) of data 2. a result whose NPU storage metadata does not match; fall back to PTCopy instead. + copy_kernel_npu(result, self, false); + return result; + } RECORD_FUNCTION("npuAsStrided", std::vector({self})); OpCommand cmd; cmd.Name("AsStrided") -- Gitee