diff --git a/test/test_network_ops/test_grid_sampler_2d.py b/test/test_network_ops/test_grid_sampler_2d.py
index 9ea285d514a0c4a80c652d0c28d915cac455538b..eba453672825b78c8080c18f1657515fb3bf00ac 100644
--- a/test/test_network_ops/test_grid_sampler_2d.py
+++ b/test/test_network_ops/test_grid_sampler_2d.py
@@ -65,12 +65,11 @@ class TestGridSampler2D(TestCase):
         ]
         for item in shape_format:
             cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
-            cpu_grid, npu_grid = create_common_tensor(item[1], -1, 1)
+            cpu_grid, npu_grid = create_common_tensor(item[1], -3, 3)
            cpu_output = self.cpu_op_fp16_exec(cpu_input, cpu_grid)
             npu_output = self.npu_op_exec(npu_input, npu_grid)
             self.assertRtolEqual(cpu_output, npu_output)
 
-
 if __name__ == "__main__":
     run_tests()
\ No newline at end of file
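Note: widening the grid range from [-1, 1] to [-3, 3] makes the fp16 case sample coordinates outside the normalized input square, so the test now exercises padding handling rather than only interior bilinear lookups. A minimal sketch of the behavior under test (illustrative shapes, plain torch.nn.functional instead of the test harness):

```python
import torch
import torch.nn.functional as F

x = torch.arange(16, dtype=torch.float32).reshape(1, 1, 4, 4)
# One row of three sample points; the first and last lie outside [-1, 1].
grid = torch.tensor([[[[-3.0, -3.0], [0.0, 0.0], [3.0, 3.0]]]])
y = F.grid_sample(x, grid, mode="bilinear", padding_mode="zeros", align_corners=True)
# Out-of-range points are resolved by padding_mode instead of a raw gather.
```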
diff --git a/test/test_network_ops/test_scatter.py b/test/test_network_ops/test_scatter.py
index 1936d3761b2035b2c6073afe29bee2c083a15739..f377284bc0d87a6452a1ccaeecfee6d250aca096 100644
--- a/test/test_network_ops/test_scatter.py
+++ b/test/test_network_ops/test_scatter.py
@@ -51,7 +51,7 @@ class TestScatter(TestCase):
         input1 = input1.cpu()
         return input1.numpy()
 
-    def test_scatter_shape_format(self, device="npu"):
+    def test_scatter_shape_format(self):
         shape_format = [
             [0, [3, 5], [np.float32, 0, [2, 5]]],
             [0, [3, 5], [np.float32, 3, [2, 5]]],
@@ -86,6 +86,20 @@ class TestScatter(TestCase):
                 npu_output = self.npu_op_exec_inplace(item[1], item[0], index, 1.23, False)
             self.assertRtolEqual(cpu_output, npu_output)
 
+    def test_scatter_debug(self):
+        a = np.random.uniform(-2, 2, (31, 43, 41, 97)).astype(np.float16)
+        b = np.random.uniform(0, 30, (31, 43, 41, 97)).astype(np.int32)
+        c = np.random.uniform(-2, 2, (31, 43, 41, 97)).astype(np.float16)
+        ca = torch.from_numpy(a)
+        cb = torch.from_numpy(b).long()
+        cc = torch.from_numpy(c)
+        na = ca.npu()
+        nb = cb.npu()
+        nc = cc.npu()
+        dim = 0
+        cpu_output = torch.scatter(ca, dim, cb, cc)
+        npu_output = torch.scatter(na, dim, nb, nc)
+        self.assertRtolEqual(cpu_output, npu_output.cpu())
 
 if __name__ == "__main__":
     run_tests()
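Note: test_scatter_debug drives torch.scatter through a large 4-D fp16/int64 case. For reference, the identity it relies on for dim=0 is out[index[i][j][k][l]][j][k][l] = src[i][j][k][l]; a tiny sketch with made-up values:

```python
import torch

src = torch.arange(6, dtype=torch.float32).reshape(2, 3)
index = torch.tensor([[1, 0, 1], [0, 1, 0]])
out = torch.scatter(torch.zeros(2, 3), 0, index, src)
# out[index[i][j]][j] = src[i][j], so out == [[3., 1., 5.], [0., 4., 2.]]
```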
diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp
index 13191fa133597d53e774aec194a99dc40d2e68be..03dd5cc71934965f8f625b9ab2c53e2a010ddac5 100644
--- a/torch_npu/csrc/aten/common/CopyKernel.cpp
+++ b/torch_npu/csrc/aten/common/CopyKernel.cpp
@@ -249,7 +249,7 @@ void copy_d2h_baseformat(at::Tensor& dst, const at::Tensor& src, bool non_blocki
 void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) {
   if (!FormatHelper::IsBaseFormatType(self)) {
-    at::Tensor dst = OpPreparation::ApplyTensor(self);
+    at::Tensor dst = OpPreparation::ApplyTensorWithSizes(self.sizes(), self.options());
     copy_h2d_baseformat(dst, src, non_blocking, true);
     NPUNativeFunctions::npu_format_cast_(self, dst);
     return;
diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp
index 756018645026337bac14019313689dc1d0cb5ebb..b71f65b9c41ea6125cfafcfb829c445d7f02a43b 100644
--- a/torch_npu/csrc/aten/common/TensorFactories.cpp
+++ b/torch_npu/csrc/aten/common/TensorFactories.cpp
@@ -309,12 +309,10 @@ namespace at_npu
       c10::optional<c10::MemoryFormat> optional_memory_format)
   {
-    c10::TensorOptions options;
-    auto device = device_or_default(device_opt);
-    options = options.dtype(dtype_opt)
-        .device(device)
-        .layout(layout_opt)
-        .pinned_memory(pin_memory_opt);
+    c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt)
+        .device(device_opt)
+        .layout(layout_opt)
+        .pinned_memory(pin_memory_opt);
 
     return at_npu::native::empty_like_npu(self, options, optional_memory_format);
   }
@@ -419,12 +417,10 @@ namespace at_npu
       int64_t dst_format)
   {
     caffe2::TypeMeta dtype = c10::scalarTypeToTypeMeta(dtype_or_default(dtype_opt));
-    c10::TensorOptions options;
-    auto device = device_or_default(device_opt);
-    options = options.dtype(dtype_opt)
-        .device(device)
-        .layout(layout_opt)
-        .pinned_memory(pin_memory_opt);
+    c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt)
+        .device(device_opt)
+        .layout(layout_opt)
+        .pinned_memory(pin_memory_opt);
     at::Tensor result = OpPreparation::ApplyTensorWithFormat(size, options, dst_format);
     if (names.has_value()) {
@@ -514,12 +510,10 @@ namespace at_npu
       c10::optional<bool> pin_memory_opt)
   {
-    c10::TensorOptions options;
-    auto device = device_or_default(device_opt);
-    options = options.dtype(dtype_opt)
-        .device(device)
-        .layout(layout_opt)
-        .pinned_memory(pin_memory_opt);
+    c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt)
+        .device(device_opt)
+        .layout(layout_opt)
+        .pinned_memory(pin_memory_opt);
 
     window_function_checks("blackman_window", options, window_length);
     if (window_length == 0)
@@ -558,12 +552,10 @@ namespace at_npu
       c10::optional<bool> pin_memory_opt)
   {
-    c10::TensorOptions options;
-    auto device = device_or_default(device_opt);
-    options = options.dtype(dtype_opt)
-        .device(device)
-        .layout(layout_opt)
-        .pinned_memory(pin_memory_opt);
+    c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt)
+        .device(device_opt)
+        .layout(layout_opt)
+        .pinned_memory(pin_memory_opt);
 
     window_function_checks("bartlett_window", options, window_length);
     if (window_length == 0)
@@ -604,12 +596,10 @@ namespace at_npu
       c10::optional<bool> pin_memory_opt)
   {
-    c10::TensorOptions options;
-    auto device = device_or_default(device_opt);
-    options = options.dtype(dtype_opt)
-        .device(device)
-        .layout(layout_opt)
-        .pinned_memory(pin_memory_opt);
+    c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt)
+        .device(device_opt)
+        .layout(layout_opt)
+        .pinned_memory(pin_memory_opt);
 
     window_function_checks("hann_window", options, window_length);
     return at::hamming_window(window_length, periodic, 0.5, 0.5, options);
@@ -636,12 +626,10 @@ namespace at_npu
       c10::optional<bool> pin_memory_opt)
   {
-    c10::TensorOptions options;
-    auto device = device_or_default(device_opt);
-    options = options.dtype(dtype_opt)
-        .device(device)
-        .layout(layout_opt)
-        .pinned_memory(pin_memory_opt);
+    c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt)
+        .device(device_opt)
+        .layout(layout_opt)
+        .pinned_memory(pin_memory_opt);
 
     window_function_checks("hamming_window", options, window_length);
     if (window_length == 0)
@@ -753,12 +741,10 @@ namespace at_npu
       c10::optional<at::Device> device_opt,
       c10::optional<bool> pin_memory_opt)
   {
-    c10::TensorOptions options;
-    auto device = device_or_default(device_opt);
-    options = options.dtype(dtype_opt)
-        .device(device)
-        .layout(layout_opt)
-        .pinned_memory(pin_memory_opt);
+    c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt)
+        .device(device_opt)
+        .layout(layout_opt)
+        .pinned_memory(pin_memory_opt);
 
     TORCH_CHECK(
         options.layout() != at::kSparse,
         "full(...) is not implemented for sparse layout");
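Note: the TensorOptions changes above are intended to be behavior-preserving; chaining the optionals through c10::TensorOptions() defers device and dtype defaulting to TensorOptions itself instead of resolving the device eagerly with device_or_default. A quick sanity sketch from the Python side, assuming a reachable NPU device ("npu:0" is illustrative):

```python
import torch
import torch_npu  # registers the NPU backend

w = torch.hann_window(8, device="npu:0")  # hann_window path refactored above
f = torch.full((2, 3), 1.0, dtype=torch.float16, device="npu:0")  # full() path refactored above
```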
diff --git a/torch_npu/csrc/aten/ops/ArangeKernelNpu.cpp b/torch_npu/csrc/aten/ops/ArangeKernelNpu.cpp
index 494594327d438733d23a9a6ebb0005c747e13e20..31e6acafc9ae8f104f4f18ee0ecf0a2fb4b32a53 100644
--- a/torch_npu/csrc/aten/ops/ArangeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ArangeKernelNpu.cpp
@@ -58,12 +58,10 @@ at::Tensor NPUNativeFunctions::arange(
     c10::optional<at::Device> device_opt,
     c10::optional<bool> pin_memory_opt) {
-  auto device = device_or_default(device_opt);
-  at::TensorOptions option;
-  option = option.dtype(dtype_opt)
-      .layout(layout_opt)
-      .device(device)
-      .pinned_memory(pin_memory_opt);
+  c10::TensorOptions option = c10::TensorOptions().dtype(dtype_opt)
+      .device(device_opt)
+      .layout(layout_opt)
+      .pinned_memory(pin_memory_opt);
 
   float start_value = CalcuOpUtil::get_scalar_float_value(start);
   float end_value = CalcuOpUtil::get_scalar_float_value(end);
diff --git a/torch_npu/csrc/aten/ops/FullKernelNpu.cpp b/torch_npu/csrc/aten/ops/FullKernelNpu.cpp
index 8f644cbc0f9e59dd62593602c1b3505e305dfc07..f2fe8936c329c022eb12d94d0832380b714a5afe 100644
--- a/torch_npu/csrc/aten/ops/FullKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/FullKernelNpu.cpp
@@ -37,12 +37,10 @@ at::Tensor NPUNativeFunctions::full(
     c10::optional<at::Layout> layout_opt,
     c10::optional<at::Device> device_opt,
     c10::optional<bool> pin_memory_opt) {
-  c10::TensorOptions option;
-  auto device = device_or_default(device_opt);
-  option = option.dtype(dtype_opt)
-      .device(device)
-      .layout(layout_opt)
-      .pinned_memory(pin_memory_opt);
+  c10::TensorOptions option = c10::TensorOptions().dtype(dtype_opt)
+      .device(device_opt)
+      .layout(layout_opt)
+      .pinned_memory(pin_memory_opt);
   at::Tensor result = OpPreparation::ApplyTensorWithSizes(size, option);
   return result.fill_(fill_value);
 }
diff --git a/torch_npu/csrc/aten/ops/GridSampler2dKernelNpu.cpp b/torch_npu/csrc/aten/ops/GridSampler2dKernelNpu.cpp
index 674479dbe23b3eb768c83d7018e2c789e3a8118f..38e32044c8822b1177aab2c7b8fb50e23c989369 100644
--- a/torch_npu/csrc/aten/ops/GridSampler2dKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/GridSampler2dKernelNpu.cpp
@@ -49,16 +49,13 @@ at::Tensor NPUNativeFunctions::grid_sampler_2d(
   at::Tensor result = OpPreparation::ApplyTensorWithFormat(dtypeCastOfSelf, outputSize, ACL_FORMAT_ND);
 
-  c10::SmallVector<string, SIZE> interMode = {"bilinear", "nearest", "bicubic"};
-  c10::SmallVector<string, SIZE> paddingMode = {"zeros", "border", "reflection"};
-
   OpCommand cmd;
   cmd.Name("GridSampler2D")
       .Input(dtypeCastOfSelf)
       .Input(dtypeCastOfGrid)
       .Output(result)
-      .Attr("interpolation_mode", interMode[interpolation_mode])
-      .Attr("padding_mode", paddingMode[padding_mode])
+      .Attr("interpolation_mode", interpolation_mode)
+      .Attr("padding_mode", padding_mode)
       .Attr("align_corners", align_corners)
       .Run();
@@ -68,6 +65,5 @@ at::Tensor NPUNativeFunctions::grid_sampler_2d(
   }
   return result;
 }
-
 } // namespace native
 } // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/RangeKernelNpu.cpp b/torch_npu/csrc/aten/ops/RangeKernelNpu.cpp
index df65f925d39f9a84819d0c68a1199f45a1bc5451..c8ea4f2e408cdf6ef0e26cf1bd3c164551647280 100644
--- a/torch_npu/csrc/aten/ops/RangeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/RangeKernelNpu.cpp
@@ -55,12 +55,10 @@ at::Tensor NPUNativeFunctions::range(
     c10::optional<at::Layout> layout_opt,
     c10::optional<at::Device> device_opt,
     c10::optional<bool> pin_memory_opt) {
-  auto device = device_or_default(device_opt);
-  c10::TensorOptions option;
-  option = option.dtype(dtype_opt)
-      .device(device)
-      .layout(layout_opt)
-      .pinned_memory(pin_memory_opt);
+  c10::TensorOptions option = c10::TensorOptions().dtype(dtype_opt)
+      .device(device_opt)
+      .layout(layout_opt)
+      .pinned_memory(pin_memory_opt);
 
   return at::range(start, end, 1, option);
 }
@@ -72,12 +70,10 @@ at::Tensor NPUNativeFunctions::range(
     c10::optional<at::Layout> layout_opt,
     c10::optional<at::Device> device_opt,
     c10::optional<bool> pin_memory_opt) {
-  auto device = device_or_default(device_opt);
-  c10::TensorOptions option;
-  option = option.dtype(dtype_opt)
-      .device(device)
-      .layout(layout_opt)
-      .pinned_memory(pin_memory_opt);
+  c10::TensorOptions option = c10::TensorOptions().dtype(dtype_opt)
+      .device(device_opt)
+      .layout(layout_opt)
+      .pinned_memory(pin_memory_opt);
 
   float start_value = CalcuOpUtil::get_scalar_float_value(start);
   float end_value = CalcuOpUtil::get_scalar_float_value(end);
diff --git a/torch_npu/csrc/aten/ops/ScatterKernelNpu.cpp b/torch_npu/csrc/aten/ops/ScatterKernelNpu.cpp
index 0fad28bcd3ea1eaa3c6dfd40822d6489924ba8c0..8a32d4da902920e72d87bd547c51d8299ac1a18d 100644
--- a/torch_npu/csrc/aten/ops/ScatterKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/ScatterKernelNpu.cpp
@@ -41,7 +41,7 @@ at::Tensor& scatter_npu_src_impl(
     at::Tensor& self,
     int64_t dim,
     const at::Tensor& index_ex,
-    const at::Tensor& src) {
+    const at::Tensor& src_ex) {
   at::ScalarType selfType = self.scalar_type();
   if (selfType == at::ScalarType::Half) {
     self = NPUNativeFunctions::npu_dtype_cast(self, at::ScalarType::Float);
@@ -52,6 +52,11 @@ at::Tensor& scatter_npu_src_impl(
     index = NPUNativeFunctions::npu_dtype_cast(index, at::ScalarType::Float);
   }
 
+  at::Tensor src(src_ex);
+  if (src.scalar_type() != self.scalar_type()) {
+    src = NPUNativeFunctions::npu_dtype_cast(src, self.scalar_type());
+  }
+
   if (!NpuUtils::check_match(&self)) {
     at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
@@ -64,7 +69,6 @@ at::Tensor& scatter_npu_src_impl(
   if(self.scalar_type() != selfType){
     self = NPUNativeFunctions::npu_dtype_cast(self, at::ScalarType::Half);
   }
-
   return self;
 }
@@ -74,10 +78,6 @@ at::Tensor& NPUNativeFunctions::scatter_(
     const at::Tensor& index_ex,
     const at::Tensor& src_ex) {
   at::Tensor src(src_ex);
-  if (src.scalar_type() != self.scalar_type()) {
-    src = NPUNativeFunctions::npu_dtype_cast(src, self.scalar_type());
-  }
-
   scatter_npu_src_impl(self, dim, index_ex, src);
   return self;
 }
@@ -90,11 +90,6 @@ at::Tensor& NPUNativeFunctions::scatter_(
   at::Tensor srcTensor = scalar_to_tensor(src).to(at::ScalarType::Float);
   srcTensor = CalcuOpUtil::copy_tensor_host_to_device(srcTensor);
   at::Tensor srcTensor_broadcast = NPUNativeFunctions::npu_broadcast(srcTensor, array_to_small_vector(index_ex.sizes()));
-
-  if (srcTensor_broadcast.scalar_type() != self.scalar_type()) {
-    srcTensor_broadcast = NPUNativeFunctions::npu_dtype_cast(srcTensor_broadcast, self.scalar_type());
-  }
-
   scatter_npu_src_impl(self, dim, index_ex, srcTensor_broadcast);
   return self;
 }
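Note: moving the src dtype cast into scatter_npu_src_impl lets both scatter_ overloads (tensor src and scalar src) share it; the scalar overload always materializes a float32 tensor first, so the shared cast is what makes an fp16 destination work. A hedged repro sketch, assuming an available NPU device:

```python
import torch
import torch_npu

x = torch.zeros(3, 5, dtype=torch.float16).npu()
index = torch.tensor([[0, 1, 2, 0, 2]]).npu()
# The scalar src becomes a float32 tensor internally; the shared impl casts it to float16.
x.scatter_(0, index, 1.23)
```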
diff --git a/torch_npu/csrc/aten/ops/TakeKernelNpu.cpp b/torch_npu/csrc/aten/ops/TakeKernelNpu.cpp
index f00ecc6d046d7713e72bd65a27580c17b15e63a4..f088f621d8d4b74f60e1726036389c4f77149072 100644
--- a/torch_npu/csrc/aten/ops/TakeKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/TakeKernelNpu.cpp
@@ -19,42 +19,40 @@ namespace at_npu {
 namespace native {
 
-c10::SmallVector<NPUTensorDesc, N> take_npu_input(
-    const c10::SmallVector<at::Tensor, N>& inputTensor) {
-  at::Tensor contiguousTensor;
-  c10::SmallVector<NPUTensorDesc, N> inputs;
-
-  for (int i = 0; i < inputTensor.size(); i++) {
-    if (i == 0) {
-      int64_t input_size = 1;
-      at::Tensor input_tensor = inputTensor[i].reshape(-1);
-      contiguousTensor = NpuUtils::format_contiguous(input_tensor);
-    } else {
-      contiguousTensor = NpuUtils::format_contiguous(inputTensor[i]);
-    }
-    inputs.emplace_back(NPUTensorDesc(contiguousTensor));
-  }
-  return inputs;
-}
-
-c10::SmallVector<NPUTensorDesc, N> take_npu_output(const c10::SmallVector<at::Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
+at::Tensor& take_out_nocheck(const at::Tensor& self, const at::Tensor& index, at::Tensor& result) {
+  at::Tensor input_tensor = self.reshape(-1);
+  at::Tensor contiguousSelf = NpuUtils::format_contiguous(input_tensor);
+  at::Tensor contiguousIndex = NpuUtils::format_contiguous(index);
 
-c10::SmallVector<NPUAttrDesc, N> take_npu_attr(const at::Tensor& self) {
-  NPUAttrDesc npuAttrValidateIndices = NPUAttrDesc("validate_indices", false);
-  c10::SmallVector<NPUAttrDesc, N> attrs = {npuAttrValidateIndices};
-  return attrs;
+  OpCommand cmd;
+  cmd.Name("Gather")
+      .Input(contiguousSelf)
+      .Input(contiguousIndex)
+      .Output(result)
+      .Attr("validate_indices", false)
+      .Run();
+
+  return result;
 }
 
 at::Tensor& NPUNativeFunctions::take_out(const at::Tensor& self, const at::Tensor& index, at::Tensor& result) {
-  // constructs the input and output NPUTensorDesc
-  auto inputs = take_npu_input({self,index});
-  auto outputs = take_npu_output({result});
-  // constructs the attr of the NPUAttrDesc
-  auto attrs = take_npu_attr(self);
-  // executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("Gather", inputs, outputs, attrs);
+  // calculate the output size
+  auto outputSize = input_same_output_size(index);
+
+  OpPreparation::CheckOut(
+      {self, index},
+      result,
+      self,
+      outputSize);
+
+  if (!NpuUtils::check_match(&result)) {
+    at::Tensor contiguousResult = NpuUtils::format_contiguous(result);
+    take_out_nocheck(self, index, contiguousResult);
+    NpuUtils::format_fresh_view(result, contiguousResult);
+  } else {
+    take_out_nocheck(self, index, result);
+  }
+
   return result;
 }
@@ -67,7 +65,8 @@ at::Tensor NPUNativeFunctions::take(const at::Tensor& self, const at::Tensor& index) {
       outputSize,
       self.options());
 
-  NPUNativeFunctions::take_out(self, index, result);
+  take_out_nocheck(self, index, result);
+
   return result;
 }
 } // namespace native
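Note: the reshape(-1) + Gather lowering in take_out_nocheck mirrors torch.take semantics: the input is treated as one flat 1-D tensor and indexed elementwise. A standard CPU illustration:

```python
import torch

x = torch.tensor([[4, 3], [2, 1]])
idx = torch.tensor([0, 3])
torch.take(x, idx)   # tensor([4, 1])
x.reshape(-1)[idx]   # the equivalent flat-view computation
```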
diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp
index 304138f66428410ca01b2130795a5329f2cbc18c..967b92859911b3cd88c2b5b47bab1ae1aff166e0 100644
--- a/torch_npu/csrc/framework/OpParamMaker.cpp
+++ b/torch_npu/csrc/framework/OpParamMaker.cpp
@@ -375,7 +375,16 @@ namespace at_npu
       }
     }
 
-    void ReleaseFunc(void* ptr, c10::npu::ReleaseQueue& releaseQueue)
+    void ReleaseFunc(void* ptr, c10::npu::ReleaseQueue& releaseQueue) {
+      auto queueParam = static_cast<c10::npu::queue::QueueParas*>(ptr);
+      auto type = queueParam->paramType;
+      if (type == c10::npu::queue::COMPILE_AND_EXECUTE) {
+        auto cur_paras = static_cast<ExecuteParas*>(queueParam->paramVal);
+        cur_paras->Release();
+      }
+    }
+
+    void ReleaseFunc_(void* ptr, c10::npu::ReleaseQueue& releaseQueue)
     {
       releaseQueue.PushToReleaseQueue(ptr);
     }
diff --git a/torch_npu/utils/tensor_methods.py b/torch_npu/utils/tensor_methods.py
index 0839be9d8e3be927a58e91577a2557658c1b1b9f..0bdbe7e685429b8131ee72d4730051320b8b3be6 100644
--- a/torch_npu/utils/tensor_methods.py
+++ b/torch_npu/utils/tensor_methods.py
@@ -52,6 +52,10 @@ def one_(self):
     warnings.warn(warning_str.format("one_"))
     return torch_npu.one_(self)
 
+def npu_confusion_transpose(self, perm, shape, transpose_first):
+    warnings.warn(warning_str.format("npu_confusion_transpose"))
+    return torch_npu.npu_confusion_transpose(self, perm, shape, transpose_first)
+
 def add_tensor_methods():
     torch.Tensor.npu_format_cast_ = npu_format_cast_
@@ -59,4 +63,5 @@ def add_tensor_methods():
     torch.Tensor.npu_dtype_cast = npu_dtype_cast
     torch.Tensor.npu_dtype_cast_ = npu_dtype_cast_
     torch.Tensor.copy_memory_ = copy_memory_
-    torch.Tensor.one_ = one_
\ No newline at end of file
+    torch.Tensor.one_ = one_
+    torch.Tensor.npu_confusion_transpose = npu_confusion_transpose
\ No newline at end of file
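Note: with the new tensor method registered, npu_confusion_transpose is callable directly on tensors. A usage sketch (shapes illustrative; per the Ascend op's documented semantics, transpose_first=True permutes before reshaping and False reshapes before permuting, so treat that ordering as an assumption to verify):

```python
import torch
import torch_npu

x = torch.rand(2, 3, 4).npu()
# Permute to (3, 2, 4), then reshape to (3, 8).
y = x.npu_confusion_transpose([1, 0, 2], (3, 8), True)
```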