diff --git a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md index 48ddd588986a17becfa7645d7a5c905a05cb490b..9ca63313afc7b4d46165f502210713f8b033e924 100644 --- a/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md +++ b/docs/en/PyTorch Operator Development Guide/PyTorch Operator Development Guide.md @@ -515,7 +515,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a Scalar other_c1_offset( other.storage_offset() / (other.size(2) * other.size(3) * c0_len)); Scalar stride_len(self.size(1) / c0_len); - Tensor result = at::npu_stride_add( + Tensor result = NPUNativeFunctions::npu_stride_add( self_use, other_use, self_c1_offset, other_c1_offset, stride_len); return result; } @@ -524,7 +524,8 @@ The following uses the torch.add\(\) operator as an example to describe how to a auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( + at::Tensor result = (self, outputSize, npu_format); + Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -541,7 +542,7 @@ The following uses the torch.add\(\) operator as an example to describe how to a // calculate the output size auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( + Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" index f48ac3586507cd1602f7a8259e1508bbe3d2cac9..e2f6a2c9fadac7965ee26db268036d50df760b6b 100644 --- "a/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" +++ "b/docs/zh/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227/PyTorch\347\256\227\345\255\220\345\274\200\345\217\221\346\214\207\345\215\227.md" @@ -515,7 +515,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 Scalar other_c1_offset( other.storage_offset() / (other.size(2) * other.size(3) * c0_len)); Scalar stride_len(self.size(1) / c0_len); - Tensor result = at::npu_stride_add( + Tensor result = NPUNativeFunctions::npu_stride_add( self_use, other_use, self_c1_offset, other_c1_offset, stride_len); return result; } @@ -524,7 +524,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( + Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -541,7 +541,7 @@ PyTorch算子开发包含TBE算子开发和PyTorch框架下的算子适配。 // calculate the output size auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( + Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py index 7f9e5e4d0a11d208f3547c9043b071551011f634..52585e231104ab9daa514f5ea9831110172df118 100644 --- a/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py +++ b/test/test_network_ops/test_batchnorm_gather_stats_with_counts.py @@ -47,7 +47,7 @@ class TestBatchNormGatherStatsWithCounts(TestCase): input1 = np.array(data).astype(dtype) npu_counts = torch.from_numpy(input1).to("npu:0") if npu_format != -1: - npu_counts = npu_counts.npu_format_cast(npu_format) + npu_counts = torch_npu.npu_format_cast(npu_counts, npu_format) return npu_counts def create_counts_tensor16(self, item): @@ -58,7 +58,7 @@ class TestBatchNormGatherStatsWithCounts(TestCase): input1 = np.array(data).astype(dtype) npu_counts = torch.from_numpy(input1).to("npu:0") if npu_format != -1: - npu_counts = npu_counts.npu_format_cast(npu_format) + npu_counts = torch_npu.npu_format_cast(npu_counts, npu_format) return npu_counts def test_batch_norm_gather_stats_with_counts(self, device): diff --git a/test/test_network_ops/test_uniform_.py b/test/test_network_ops/test_uniform_.py index 893adf140e34b82bb03b8732ecf7c9becf3224e4..de4a3a96697a825ce0f10b9cfddb1d427f283de6 100644 --- a/test/test_network_ops/test_uniform_.py +++ b/test/test_network_ops/test_uniform_.py @@ -39,7 +39,7 @@ class TestUniform(TestCase): for item in shape_format: input1 = torch.zeros(item[0], dtype=item[3]).npu() - input1.npu_format_cast(3) + input1 = torch_npu.npu_format_cast(input1, 3) input1.uniform_(item[1], item[2]) self.assertTrue(item[1] <= input1.min()) self.assertTrue(item[2] >= input1.max()) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index cedd54404fc7b8ddf019ccc4259a5594e2b8dc4b..de976de73043c85eb346f6eeb33614fe4c9cd821 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -24,6 +24,8 @@ import torch_npu.npu.amp import torch_npu.distributed import torch_npu._C +from torch_npu.utils import nn_monkey_patches, apply_module_patch + from .version import __version__ as __version__ __all__ = [] @@ -35,16 +37,20 @@ for name in dir(torch_npu._C._VariableFunctions): globals()[name] = getattr(torch_npu._C._VariableFunctions, name) __all__.append(name) +all_monkey_patches = [ + ["npu", torch_npu.npu], + ["npu.amp", torch_npu.npu.amp], + ["autograd.profiler", torch_npu.npu.profiler], + ["distributed", torch_npu.distributed], + ["distributed.distributed_c10d", torch_npu.distributed.distributed_c10d], + ["nn.parallel.distributed._get_default_group", torch_npu.distributed.distributed_c10d._get_default_group] +] + +all_monkey_patches += nn_monkey_patches + -def _apply_patches(): - monkey_patches = [ - ["npu", torch_npu.npu], - ["npu.amp", torch_npu.npu.amp], - ["autograd.profiler", torch_npu.npu.profiler], - ["distributed", torch_npu.distributed], - ["distributed.distributed_c10d", torch_npu.distributed.distributed_c10d], - ["nn.parallel.distributed._get_default_group", torch_npu.distributed.distributed_c10d._get_default_group] - ] +def _apply_patches(monkey_patches): + def _getattr(module_list, root_module=torch): if len(module_list) <= 1: return root_module @@ -54,7 +60,7 @@ def _apply_patches(): else: empty_module_name = f'{root_module.__name__}.{module_list[0]}' sys.modules[empty_module_name] = types.ModuleType(empty_module_name) - setattr(root_module, module_list[0], sys.modules[empty_module_name]) + setattr(root_module, module_list[0], sys.modules.get(empty_module_name)) return _getattr(module_list[1:], getattr(root_module, module_list[0])) for patch_pair in monkey_patches: @@ -76,7 +82,8 @@ def _apply_patches(): setattr(dest_module, attr, getattr(patch, attr)) # Apply monkey-patches. -_apply_patches() +_apply_patches(all_monkey_patches) +apply_module_patch() # NPU exit, need to synchronize devices def _npu_shutdown(): diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index 6bc360ae5bc1360064c35195fda6f612c1116470..e511fb30dd288f0ed08752de8fce9ba72d3ddc97 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -188,7 +188,7 @@ void copy_d2d_dtype_format(at::Tensor& self, const at::Tensor& src, bool non_blo at::Tensor src_4D = FormatCastHelper::ApplyBaseFormatTensorBy(src); at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self); copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking); - self.npu_format_cast_(dst_4D); + NPUNativeFunctions::npu_format_cast_(self, dst_4D); return; } copy_d2d_dtype_baseformat(self, src, non_blocking); @@ -312,7 +312,7 @@ void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) { if (!FormatHelper::IsBaseFormatType(self)) { at::Tensor dst = OpPreparation::ApplyTensor(self); copy_h2d_baseformat(dst, src, non_blocking, true); - self.npu_format_cast_(dst); + NPUNativeFunctions::npu_format_cast_(self, dst); return; } copy_h2d_baseformat(self, src, non_blocking); @@ -363,7 +363,7 @@ void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking) } at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self); copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking); - self.npu_format_cast_(dst_4D); + NPUNativeFunctions::npu_format_cast_(self, dst_4D); return; } copy_d2d_dtype_format(self, src, non_blocking); diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp index aa98978e8a5cba9188bf2c7fd1b8877a99a209a4..13d82c3f5f83c0720db21367ce6e089248f36b87 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp +++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp @@ -26,9 +26,9 @@ bool FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor& return FormatHelper::GetBaseFormat(src_format) == FormatHelper::GetBaseFormat(dst_format); } -void FormatCastHelper::base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src) { +void FormatCastHelper::base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src) { dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); - dst.copy_memory_(src, true); + NPUNativeFunctions::copy_memory_(dst, src, true); } void FormatCastHelper::format_cast_as_base_format(const at::Tensor& src, aclFormat format) { diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.h b/torch_npu/csrc/aten/common/FormatCastHelper.h index 91e9b78182ab2c6af387d62799601ad134ac8d4c..ea2b6ab507a036c88be8e08f2fc24ae097d64270 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.h +++ b/torch_npu/csrc/aten/common/FormatCastHelper.h @@ -33,7 +33,7 @@ public: static at::Tensor& CovertSelfToBaseFormat(at::Tensor& src); private: // help function of format_cast_between_group - static void base_format_cast_nocheck(const at::Tensor& dst, const at::Tensor& src); + static void base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src); }; // class FormatCastHelper } // namespace native diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp index 2139cfdb4a9082ec7f2965db6e667198b466c981..c518156b0a4c281420fab2e229102037e8292ad6 100644 --- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp @@ -76,7 +76,7 @@ at::Tensor NPUNativeFunctions::npu_format_cast( TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half, "can not cast format when src is not float32 or float16"); - at::Tensor dst = at::empty_with_format( + at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, src.options(), acl_format); // calculate the output result of the NPU @@ -105,7 +105,7 @@ at::Tensor& NPUNativeFunctions::npu_format_cast_( TORCH_CHECK(src.scalar_type() == at::ScalarType::Float || src.scalar_type() == at::ScalarType::Half, "can not cast format when src is not float32 or float16"); - at::Tensor dst = at::empty_with_format( + at::Tensor dst = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, src.options(), acl_format); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/common/NpuFastReshape.cpp b/torch_npu/csrc/aten/common/NpuFastReshape.cpp index e81d4f0c2984afb07f9eadd7fc0854a62682820a..e7df99dc6a30f7214f612cd28fc6d6470649d040 100644 --- a/torch_npu/csrc/aten/common/NpuFastReshape.cpp +++ b/torch_npu/csrc/aten/common/NpuFastReshape.cpp @@ -17,6 +17,7 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/InferFormat.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace at_npu { namespace native { @@ -45,7 +46,7 @@ void npu_fast_reshape_(at::Tensor& tensor) { // refresh matadata to input tensor StorageDescHelper::ReflushDescBySelf(tensor); auto base_format = InferFormat::GuessBaseFormat(tensor.sizes()); - tensor.npu_format_cast_(base_format); + NPUNativeFunctions::npu_format_cast_(tensor, base_format); } } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/aten/common/ResizeNpu.cpp b/torch_npu/csrc/aten/common/ResizeNpu.cpp index e05736bf2e480927258b30ab7d6e4bc0ccc9a66c..35faadb6f2c7ec353be756587b10fe756f5d0e87 100644 --- a/torch_npu/csrc/aten/common/ResizeNpu.cpp +++ b/torch_npu/csrc/aten/common/ResizeNpu.cpp @@ -31,7 +31,7 @@ at::Tensor& NPUNativeFunctions::resize_( // because of resize _impl_npu_ only support at base format, so // no need to reflush NpuStorageDesc here. if (!FormatHelper::IsBaseFormatType(self)) { - self.npu_format_cast_(FormatHelper::GetBaseFormat(self)); + NPUNativeFunctions::npu_format_cast_(self, FormatHelper::GetBaseFormat(self)); } auto* self_ = self.unsafeGetTensorImpl(); resize_impl_npu_(self_, size, /*strides=*/c10::nullopt); diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index 3d95b195a17a1aabf1cb5ce721dac7bc66f9367d..c2a9ae6eabaad42410c47e81877c2ea3e7cd2891 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -30,11 +30,12 @@ #include #include #include +#include + #include "torch_npu/csrc/aten/common/ResizeNpu.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/framework/InferFormat.h" #include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h" -#include #include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/tensor_impl.h" @@ -230,7 +231,7 @@ namespace at_npu { auto npu_format = self.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_; - result = at::empty_with_format(self.sizes(), self.options(), npu_format); + result = OpPreparation::ApplyTensorWithFormat(self.sizes(), self.options(), npu_format); } } @@ -347,7 +348,7 @@ namespace at_npu options.layout(layout_opt); options.pinned_memory(pin_memory_opt); at::Tensor result = - at::empty_with_format(size, options, dst_format); + OpPreparation::ApplyTensorWithFormat(size, options, dst_format); if (names.has_value()) { internal_set_names_inplace(result, names); @@ -361,7 +362,7 @@ namespace at_npu int64_t dst_format) { at::Tensor result = - at::empty_with_format(size, options, dst_format); + OpPreparation::ApplyTensorWithFormat(size, options, dst_format); if (names.has_value()) { internal_set_names_inplace(result, names); @@ -376,7 +377,7 @@ namespace at_npu int64_t dst_format) { at::Tensor result = - at::empty_with_format(size, options, dst_format); + OpPreparation::ApplyTensorWithFormat(size, options, dst_format); if (names.has_value()) { internal_set_names_inplace(result, names); diff --git a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp index 19db800366429c9bedaad43374429a142de6d743..0e4ed1eb1a79c82f1d94c1d52093ea48043f20f5 100644 --- a/torch_npu/csrc/aten/ops/AddKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AddKernelNpu.cpp @@ -170,7 +170,7 @@ namespace at_npu else { c10::NPUStorageDesc src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - at::Tensor src_new = at::empty_with_format( + at::Tensor src_new = OpPreparation::ApplyTensorWithFormat( src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); src_new.set_( src.storage(), @@ -206,7 +206,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -223,7 +223,7 @@ namespace at_npu // calculate the output size auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp index 7cdac57d78e7325e08861421c939f59f21fdfb80..c7a287bdab0597e452c03d96aaa8626608ab1995 100644 --- a/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AnyKernelNpu.cpp @@ -70,7 +70,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self, int64_t dim, bool kee auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU @@ -88,7 +88,7 @@ at::Tensor NPUNativeFunctions::any(const at::Tensor& self) { // when self's dim = 0, convert [1] tensor and reduce it if (self.dim() == 0) { at::Tensor self_tmp = self; - self_tmp = at::empty_with_format( + self_tmp = OpPreparation::ApplyTensorWithFormat( {1}, self.options().dtype(at::ScalarType::Float), CalcuOpUtil::get_tensor_npu_format(self)) diff --git a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp index 2e5df37da07969d61ad8dec892d3d94b8b868af1..9e42f4f0ef89c51f954a15dc4c76b6681ddec1d8 100644 --- a/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BitwiseAndKernelNpu.cpp @@ -112,7 +112,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, const at::Ten auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output at::Tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, ref_tensor.options(), CalcuOpUtil::get_tensor_npu_format(ref_tensor)); @@ -128,7 +128,7 @@ at::Tensor NPUNativeFunctions::bitwise_and(const at::Tensor& self, at::Scalar ot auto outputSize = input_same_output_size(self); // construct the output at::Tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp index 4042b6f678cfd6ed37bb6da08fb845a4e793015c..14f62aa86baf7ed9e735dc8c58908c3d48993057 100644 --- a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp @@ -73,9 +73,9 @@ at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去 if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) && !torch_npu::option::OptionsManager::CheckSwitchMMOutputEnable()) { - result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); + result = OpPreparation::ApplyTensorWithFormat(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND); + result = OpPreparation::ApplyTensorWithFormat(outputSize, self.options(), ACL_FORMAT_ND); } // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp index 2f696eb0ad35d92d8de4bdcbd55ebc7f70658948..f928e9773889e7d145792e04cf6368d31655eec0 100644 --- a/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BmmV2KernelNpu.cpp @@ -83,9 +83,9 @@ at::Tensor pure_bmm_v2_npu(const at::Tensor& self, const at::Tensor& mat2, const at::Tensor result; if ((tensor1.scalar_type() == at::ScalarType::Half)) { - result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ); + result = OpPreparation::ApplyTensorWithFormat(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND); + result = OpPreparation::ApplyTensorWithFormat(output_size, tensor1.options(), ACL_FORMAT_ND); } at::Tensor contiguous_self = tensor1; diff --git a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp index 79c6fdae296474ee93c073587111e80dfe678ef9..f63a18da30580cb73ca066bbda63dbe531ed0743 100644 --- a/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BroadcastKernelNpu.cpp @@ -53,7 +53,7 @@ namespace at_npu input = input.to(at::kInt); } - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( size, input.options(), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp index a033c8c511ec970d4f86479c82c8449f664c9794..b7ca932113f802962720ea14fd9ccd792d0fd90a 100644 --- a/torch_npu/csrc/aten/ops/DivKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/DivKernelNpu.cpp @@ -85,7 +85,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -102,7 +102,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp index 3dff36422833c16153fb761e32cb03e4d3485979..fa27bc376a46b8306ca6d9b5974dea18bfb83658 100644 --- a/torch_npu/csrc/aten/ops/EqKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/EqKernelNpu.cpp @@ -95,7 +95,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -113,7 +113,7 @@ namespace at_npu auto outputSize = input_same_output_size(formatCastOfSelf); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -131,7 +131,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(c10::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); @@ -159,7 +159,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(c10::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp index 50d4c3fbcda623feaac7431e08e81fe1d65eecb4..4b453091b6f5c74ee035654cd7fad05f34f39b40 100644 --- a/torch_npu/csrc/aten/ops/GtKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/GtKernelNpu.cpp @@ -104,7 +104,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(formatCastOfSelf, formatCastOfOther); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -121,7 +121,7 @@ namespace at_npu auto outputSize = input_same_output_size(formatCastOfSelf); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, formatCastOfSelf.options().dtype(at::kBool), ACL_FORMAT_ND); @@ -139,7 +139,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); @@ -167,7 +167,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp index 40bf84de52da26c8a7da34c5c59c611d46816750..1aa6ffb9dcc28898a44d8f71cd36fcd449de746e 100644 --- a/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/IndexKernelNpu.cpp @@ -42,7 +42,7 @@ at::Tensor NPUNativeFunctions::index(const at::Tensor& self, const torch::List outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); @@ -161,7 +161,7 @@ namespace at_npu c10::SmallVector outputs = {self}; CalcuOpUtil::check_memory_over_laps(inputs, outputs); - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options().dtype(at::ScalarType::Byte), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp index 0f6e2b6875ee4064b3a992cc064e0dfda2769ca1..8ca851a36d906b60afa54388b61d5f3f23b12361 100644 --- a/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MeanKernelNpu.cpp @@ -142,7 +142,7 @@ namespace at_npu } // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options().dtype(dstType), npu_format); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp index ceba5c8a22cb635ddaf98f7495126365b1053941..05bd74b0e0a6dad3866d2e6722ef97a670a98c22 100644 --- a/torch_npu/csrc/aten/ops/MmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MmKernelNpu.cpp @@ -183,12 +183,12 @@ Return: if ((self.scalar_type() == at::ScalarType::Half) && !torch_npu::option::OptionsManager::CheckSwitchMMOutputEnable()) { - result = at::empty_with_format( + result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = at::empty_with_format(outputSize, self.options()); + result = OpPreparation::ApplyTensorWithSizes(outputSize, self.options()); } // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp index 1fa1b7afc88814b1ab3962ab716b90c39286f11b..d9982ffc566a6f2268e9e0e66e227de818f418cc 100644 --- a/torch_npu/csrc/aten/ops/MulKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/MulKernelNpu.cpp @@ -113,7 +113,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(selfCast, otherCast); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -135,7 +135,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp index a04ed1ff90608d7c0ae23bacfb604c2c75dc8832..a3ae15a640ec2c1c35718fb430a30856f0e73c97 100644 --- a/torch_npu/csrc/aten/ops/NegKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NegKernelNpu.cpp @@ -50,7 +50,7 @@ namespace at_npu at::Tensor NPUNativeFunctions::neg(const at::Tensor &self) { // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp index 5135cd4a2763ce37a4afea750afd524b3a5d13e7..e8bfa33690ab8d2d577f13fe3d666744900431ea 100644 --- a/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/NormalKernelNpu.cpp @@ -118,7 +118,7 @@ namespace at_npu TORCH_CHECK(std > 0.0, "normal_ expects std > 0.0, but found std=", std); // the op of PTNormalFloatFloat only support format of ND - at::Tensor formatCastOfResult = result.npu_format_cast(ACL_FORMAT_ND); + at::Tensor formatCastOfResult = NPUNativeFunctions::npu_format_cast(result, ACL_FORMAT_ND); if (formatCastOfResult.scalar_type() == at::ScalarType::Half) { formatCastOfResult = formatCastOfResult.to(at::ScalarType::Float); @@ -182,7 +182,7 @@ namespace at_npu c10::optional pin_memory_opt) { // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = NPUNativeFunctions::empty_with_format( size, dtype_opt, layout_opt, device_opt, pin_memory_opt, ACL_FORMAT_ND); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp index 12dd1e754252ada59d1571cca520ab0b9df90c0d..4e65798b226b6925be7fe900c2ee3a1e49684f97 100644 --- a/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/OnesLikeKernelNpu.cpp @@ -45,12 +45,9 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format(outputSize, - dtype_opt, - layout_opt, - device_opt, - pin_memory_opt, - CalcuOpUtil::get_tensor_npu_format(self)); + at::Tensor result = NPUNativeFunctions::empty_with_format( + outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt, + CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPUc return NPUNativeFunctions::one_(result); } diff --git a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp index d8ab5517335c7aac344a6d9ca4cd4a71b1de63c5..1f4331abc0efe4773755e5f861a85e635e37bed0 100644 --- a/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ReluKernelNpu.cpp @@ -17,7 +17,7 @@ #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" - +#include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace at_npu @@ -65,7 +65,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp index cdbd32be1de5ee336f678a64a1b6daa3f6e2bac5..bb0050fe1889f881f826a3db80e3c867b22d019e 100644 --- a/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SoftmaxBackwardKernelNpu.cpp @@ -52,17 +52,18 @@ namespace at_npu // calculate the output size auto outputSize = input_same_output_size(grad_output); + at::Tensor tmp_output = output; // output'format must be same with grad_output - if (CalcuOpUtil::get_tensor_npu_format(output) != CalcuOpUtil::get_tensor_npu_format(grad_output)) + if (CalcuOpUtil::get_tensor_npu_format(tmp_output) != CalcuOpUtil::get_tensor_npu_format(grad_output)) { - output.npu_format_cast_(CalcuOpUtil::get_tensor_npu_format(grad_output)); + NPUNativeFunctions::npu_format_cast_(tmp_output, CalcuOpUtil::get_tensor_npu_format(grad_output)); } // construct the output tensor of the NPU at::Tensor grad_input = OpPreparation::ApplyTensor(grad_output, outputSize); // calculate the output result of the NPU - softmax_backward_out_npu(grad_input, grad_output, output, dim, self); + softmax_backward_out_npu(grad_input, grad_output, tmp_output, dim, self); return grad_input; } diff --git a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp index 79783b2f4c40f7ab1738b3072dbdf08ad7c80c2d..eb85dfefab763a445d32b5e4012eb1132e09a056 100644 --- a/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/StrideAddKernelNpu.cpp @@ -17,6 +17,7 @@ #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" +#include "torch_npu/csrc/framework/utils/OpAdapter.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace at_npu @@ -119,7 +120,7 @@ namespace at_npu outputSize[1] = c1_len.toInt() * 16; // construct the output at::Tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp index fe90315466cfe4bf1a0fe6050284792c398428b1..33ac4018eee507020e5312ed0c371dc560abcd54 100644 --- a/torch_npu/csrc/aten/ops/SubKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SubKernelNpu.cpp @@ -103,7 +103,7 @@ namespace at_npu auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, outputTensor.options(), CalcuOpUtil::get_tensor_npu_format(outputTensor)); @@ -120,7 +120,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp index 48129f2a6bfac787148be744e6f2ce122996c487..23819bc687ef79e30b65d23f2a2768fc16c79ef2 100644 --- a/torch_npu/csrc/aten/ops/SumKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/SumKernelNpu.cpp @@ -197,7 +197,7 @@ namespace at_npu } // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options().dtype(dstType), npu_format); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp index 537bcc2444c8b21eefcecaa7d24ab37585913db4..0bc0118a9cddbc4e1d656c361a11f6b39f61d26a 100644 --- a/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ThresholdBackwardKernelNpu.cpp @@ -62,7 +62,7 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // use 5HD in Relu diff --git a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp index 0bddf27eae72b027412ec0c6ddc55ca0ba331fc5..9a4bd8fb5a805ebef6c7fb4ab5a6f8ac7c38c73a 100644 --- a/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/TopKKernelNpu.cpp @@ -219,11 +219,11 @@ namespace at_npu // construct the output tensor of the NPU at::Tensor transposeSelf = NPUNativeFunctions::npu_transpose(self, perm); auto outputSize = transpose_npu_output_size(values, perm); - at::Tensor transposeValue = at::empty_with_format( + at::Tensor transposeValue = OpPreparation::ApplyTensorWithFormat( outputSize, values.options(), CalcuOpUtil::get_tensor_npu_format(values)); - at::Tensor transposeIndices = at::empty_with_format( + at::Tensor transposeIndices = OpPreparation::ApplyTensorWithFormat( outputSize, indices.options(), CalcuOpUtil::get_tensor_npu_format(indices)); @@ -290,9 +290,9 @@ namespace at_npu // calculate the output size auto outputSize = topk_npu_output_size(selfCp, k, dim, largest, sorted); // construct the output tensor of the NPU - at::Tensor values = at::empty_with_format( + at::Tensor values = OpPreparation::ApplyTensorWithFormat( outputSize, selfCp.options(), CalcuOpUtil::get_tensor_npu_format(selfCp)); - at::Tensor indices = at::empty_with_format( + at::Tensor indices = OpPreparation::ApplyTensorWithFormat( outputSize, selfCp.options().dtype(at::kInt), ACL_FORMAT_ND); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp index fcda11d5bcadb84f9162c30806080bb9ccdcf6e0..40631abc0b1c999cecc9d9966e68e4f7e2d03e7f 100644 --- a/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/TransposeKernelNpu.cpp @@ -76,7 +76,7 @@ namespace at_npu { RECORD_FUNCTION("transpose_to_contiguous", vector({self})); int64_t self_format = CalcuOpUtil::get_tensor_npu_format(self); - at::Tensor result = at::empty_with_format(self.sizes(), self.options(), self_format); + at::Tensor result = OpPreparation::ApplyTensorWithFormat(self.sizes(), self.options(), self_format); // obtain the transpose axises at::IntArrayRef dim; diff --git a/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp b/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp index edb450c9cf678115cb48c900dd012aca93eab0c8..8976e4f1b1fd2ded5347957255f95473a90b0f85 100644 --- a/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/WhereKernelNpu.cpp @@ -92,7 +92,7 @@ vector NPUNativeFunctions::where(const at::Tensor& condition) { at::Tensor formatCastOfCondition = condition; if (condition.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ != ACL_FORMAT_ND) { - formatCastOfCondition = formatCastOfCondition.npu_format_cast(ACL_FORMAT_ND); + formatCastOfCondition = NPUNativeFunctions::npu_format_cast(formatCastOfCondition, ACL_FORMAT_ND); } if (condition.scalar_type() == at::ScalarType::Half) { formatCastOfCondition = NPUNativeFunctions::npu_dtype_cast(formatCastOfCondition, at::ScalarType::Float); diff --git a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp index 18ec3038d102a4f379fb87876c0fe3efb276eb34..8147622a0130525911f788b0f8b54693eff8bd4b 100644 --- a/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/ZerosLikeKernelNpu.cpp @@ -56,12 +56,9 @@ namespace at_npu auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format(outputSize, - dtype_opt, - layout_opt, - device_opt, - pin_memory_opt, - CalcuOpUtil::get_tensor_npu_format(self)); + at::Tensor result = NPUNativeFunctions::empty_with_format( + outputSize, dtype_opt, layout_opt, device_opt, pin_memory_opt, + CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU return result.zero_(); diff --git a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp index dcddfa5d6563c703d906e7f2ab266e8d149514a1..b93bf1710d966dc1d75b85212a1d014c16453d09 100644 --- a/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/convolution/Conv2dBackwardKernelNpu.cpp @@ -213,7 +213,7 @@ tuple NPUNativeFunctions::npu_conv2d_backwar at::Tensor gradBias; // construct the output tensor of the NPU if (grad_input_mask[0]) { - gradInput = at::empty_with_format( + gradInput = OpPreparation::ApplyTensorWithFormat( std::get<0>(outputSizes), input.options(), ACL_FORMAT_NC1HWC0); } @@ -221,12 +221,12 @@ tuple NPUNativeFunctions::npu_conv2d_backwar // For group conv2d: keep consistent with weight to avoid allreduce accuracy problem. // For more info: https://gitee.com/ascend/pytorch-develop/pulls/2255 if (groups > 1) { - gradWeight = at::empty_with_format( + gradWeight = OpPreparation::ApplyTensorWithFormat( std::get<1>(outputSizes), weight.options().dtype(at::kFloat), ACL_FORMAT_NCHW); } else { - gradWeight = at::empty_with_format( + gradWeight = OpPreparation::ApplyTensorWithFormat( std::get<1>(outputSizes), weight.options().dtype(at::kFloat), ACL_FORMAT_FRACTAL_Z); @@ -234,7 +234,7 @@ tuple NPUNativeFunctions::npu_conv2d_backwar } if (grad_input_mask[2]) { - gradBias = at::empty_with_format( + gradBias = OpPreparation::ApplyTensorWithFormat( std::get<2>(outputSizes), grad.options(), ACL_FORMAT_NCHW); } diff --git a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp index 48be4e0d619f87479fc2cd5db928eecc28d318e8..b38ef864a05aa48868d63da4c0b493f60a390bcb 100644 --- a/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/convolution/Conv3dBackwardKernelNpu.cpp @@ -107,7 +107,7 @@ tuple NPUNativeFunctions::npu_conv3d_backwar if (grad_input_mask[0]) { // format should be NDC1HWC0 - gradInput = at::empty_with_format( + gradInput = OpPreparation::ApplyTensorWithFormat( input.sizes(), input.options(), ACL_FORMAT_NDC1HWC0); conv3d_backward_inputmask( @@ -116,7 +116,7 @@ tuple NPUNativeFunctions::npu_conv3d_backwar if (grad_input_mask[1]) { // format should be FRACTAL_Z_3D - gradWeight = at::empty_with_format( + gradWeight = OpPreparation::ApplyTensorWithFormat( weight.sizes(), weight.options().dtype(at::kFloat), ACL_FRACTAL_Z_3D); conv3d_backward_weightmask( @@ -125,7 +125,7 @@ tuple NPUNativeFunctions::npu_conv3d_backwar if (grad_input_mask[2]) { // format should be NCHW, gradias.size = grad.size(1) - gradBias = at::empty_with_format( + gradBias = OpPreparation::ApplyTensorWithFormat( {grad.size(1)}, grad.options(), ACL_FORMAT_NCHW); conv3d_backward_biasmask( diff --git a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp index c6e2f7837824c6f25e3ff5c9372afdc45a341c45..a0a32368fbba8a9a2229796107397f571337738c 100644 --- a/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/convolution/ConvTranspose2dKernelNpu.cpp @@ -81,7 +81,7 @@ at::Tensor NPUNativeFunctions::npu_conv_transpose2d( // construct the output tensor of the NPU at::Tensor result = - at::empty_with_format(outputSize, input.options(), ACL_FORMAT_NC1HWC0); + OpPreparation::ApplyTensorWithFormat(outputSize, input.options(), ACL_FORMAT_NC1HWC0); // calculate the output result of the NPU conv_transpose2d_out_npu( diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp index 5e31d05824c3434d08d08f941c9f0abca4580b0f..bf6da61f93fab158c5d09f567bc3522924ee740d 100644 --- a/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/loss/NLLLossBackwardKernelNpu.cpp @@ -91,7 +91,7 @@ at::Tensor NPUNativeFunctions::nll_loss_backward( auto outputSize = input_same_output_size(self); // construct the output tensor of the NPU - at::Tensor grad_input = at::empty_with_format( + at::Tensor grad_input = OpPreparation::ApplyTensorWithFormat( outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); // calculate the output result of the NPU diff --git a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp index cfda03265f6e418a944ee01a093096009ec1c637..f274745813203aa15166bcfcaca0519c9170ea57 100644 --- a/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/loss/NLLLossKernelNpu.cpp @@ -96,11 +96,11 @@ tuple NPUNativeFunctions::nll_loss_forward( outputSize, totalWeightSize); // construct the output tensor of the NPU - at::Tensor result = at::empty_with_format( + at::Tensor result = OpPreparation::ApplyTensorWithFormat( std::get<0>(outputSizes), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - at::Tensor total_weight = at::empty_with_format( + at::Tensor total_weight = OpPreparation::ApplyTensorWithFormat( std::get<1>(outputSizes), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 951ba5960741a71fe6c276abb7ac28d62e3273a3..e8717648c5ce728b8125839a61240e31588e3ac2 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -33,6 +33,7 @@ #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include "torch_npu/csrc/distributed/Init.h" #include "torch_npu/csrc/distributed/reducer.hpp" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" namespace torch_npu { @@ -48,7 +49,7 @@ class BroadcastWork { public: inline std::vector cast_tensors(at::TensorList tensors) { static auto cast_back_to_ori_format = [](const at::Tensor &t) { - return t.npu_format_cast(t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); + return at_npu::native::NPUNativeFunctions::npu_format_cast(t, t.storage().unsafeGetStorageImpl()->npu_desc_.origin_format_); }; return c10::fmap(tensors, cast_back_to_ori_format); } diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index b31bd008d0deb0b45af5c866b1d7ccc279a85e16..81f7f049682ad718d3c2902348c4246e573925f8 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -31,6 +31,8 @@ #include #include "torch_npu/csrc/distributed/reducer.hpp" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" namespace c10d_npu { namespace { @@ -406,9 +408,9 @@ void Reducer::copy_grad_to_bucket( if (comm_hook_ == nullptr) { // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp // Divides while copying into the bucket view. - bucket_view.copy_memory_(grad.mul(float(1.) / divFactor_), true); + at_npu::native::NPUNativeFunctions::copy_memory_(bucket_view, grad.mul(float(1.) / divFactor_), true); } else { - bucket_view.copy_memory_(grad, true); + at_npu::native::NPUNativeFunctions::copy_memory_(bucket_view, grad, true); } } @@ -441,7 +443,7 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) { // make sure grad has the same format as variable if (grad.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_ != variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_) { - grad = grad.npu_format_cast( + grad = at_npu::native::NPUNativeFunctions::npu_format_cast(grad, variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); } this->copy_grad_to_bucket(grad, bucket_view); @@ -1072,12 +1074,12 @@ void Reducer::copy_bucket_to_grad( if (!grad.defined()) { // Creates grad according to the "Gradient Layout Contract" // (see torch/csrc/grad/AccumulateGrad.h) - grad = at::empty_with_format(variable.sizes(), - bucket_view.options(), - variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); - grad.copy_memory_(bucket_view, true); + grad = at_npu::native::OpPreparation::ApplyTensorWithFormat( + variable.sizes(), bucket_view.options(), + variable.storage().unsafeGetStorageImpl()->npu_desc_.npu_format_); + at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true); } else { - grad.copy_memory_(bucket_view, true); + at_npu::native::NPUNativeFunctions::copy_memory_(grad, bucket_view, true); } // The grad is modified and needs to be written back. return true; diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp index 246adc7ec10c2bd5c7dd00fe8b8f307483988780..08a0e1a6cad6205ac307eebfbb9f073c67370d61 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp +++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp @@ -14,6 +14,7 @@ // limitations under the License. #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" namespace at_npu { @@ -109,7 +110,7 @@ namespace at_npu const at::Tensor &src, const std::vector &optimizations) { - auto self = at::empty_with_format( + auto self = OpPreparation::ApplyTensorWithFormat( src.sizes(), src.options(), src.storage().get_npu_desc().npu_format_); diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp b/torch_npu/csrc/framework/contiguous/combined_opt.cpp index a0fa15f3cb95fbc3efc8e710dd42a8208371c2da..1ce04a827f2e1b4d2830b779b89bfe2ea251490e 100644 --- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp @@ -16,8 +16,10 @@ #include #include #include + #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" namespace at_npu { @@ -498,7 +500,7 @@ namespace at_npu { // case 2: The first tensor is discontiguous-type, // conduct the standard optimization procedure. - auto contiguous_src = at::empty_with_format( + auto contiguous_src = OpPreparation::ApplyTensorWithFormat( src.sizes(), src.options(), src.storage().get_npu_desc().npu_format_); diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index a0efe852f78d13a31e6efcdce65a6014b6015c1a..07bda7c6a64e1f05af970139288a49047f68f9da 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -16,7 +16,6 @@ #include #include -#include #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" @@ -25,6 +24,8 @@ #include "torch_npu/csrc/framework/utils/KernelNpuOutputSize.h" #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" #include "torch_npu/csrc/framework/interface/EnvVariables.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" namespace at_npu { @@ -165,7 +166,7 @@ namespace at_npu // 3. get output size auto outputSize = index_select_npu_output_size(src_tmp, dim, index); int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(src_tmp); - at::Tensor result = at::empty_with_format(outputSize, src_tmp.options(), npu_format); + at::Tensor result = OpPreparation::ApplyTensorWithFormat(outputSize, src_tmp.options(), npu_format); // std::cout << "npu_format: " << npu_format << std::endl; // 4. get input and output @@ -208,7 +209,7 @@ namespace at_npu at::Tensor deal_with_5d_5d_match(const at::Tensor &src) { auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_; - at::Tensor src_new = at::empty_with_format(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); + at::Tensor src_new = OpPreparation::ApplyTensorWithFormat(src_desc.base_sizes_, src.options(), ACL_FORMAT_NC1HWC0); c10::npu::NPUStream copy_stream = c10::npu::getCurrentNPUStream(); int64_t numel = src_new.numel(); aclError error = aclrtMemcpyAsync( @@ -252,7 +253,7 @@ namespace at_npu // a temporary tensor, which always monopolizes its own storage. if (numelEq && (!FormatHelper::IsBaseFormatType(src))) { - at::Tensor tempTensor = at::npu_format_cast(src, FormatHelper::GetBaseFormat(src)); + at::Tensor tempTensor = NPUNativeFunctions::npu_format_cast(src, FormatHelper::GetBaseFormat(src)); auto &temp_desc = tempTensor.storage().unsafeGetStorageImpl()->npu_desc_; temp_desc.base_sizes_ = tempTensor.sizes(); diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index bf70fb7277814d6e5f2f29a89e4f2383aeb9033b..3726a9765a675c7e0b5765ba1ae043302a7e4d6b 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -17,6 +17,8 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/InferFormat.h" #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + namespace at_npu { @@ -176,13 +178,13 @@ namespace at_npu if (output.scalar_type() == at::ScalarType::Float || output.scalar_type() == at::ScalarType::Half) { TORCH_CHECK(!is_read_write, "can not cast format when output is input"); - output.npu_format_cast_(format); + NPUNativeFunctions::npu_format_cast_(output, format); } else { TORCH_CHECK(FormatHelper::IsBaseFormatType(output) && FormatHelper::IsBaseFormatType(static_cast(format)), "can not cast format to un-base format when output has bool dtype"); - output.npu_format_cast_(format); + NPUNativeFunctions::npu_format_cast_(output, format); } } } @@ -197,7 +199,7 @@ namespace at_npu at::Tensor &OpPreparation::CastBackToOriFormat(at::Tensor &tensor) { auto &tensor_desc = tensor.storage().unsafeGetStorageImpl()->npu_desc_; - tensor.npu_format_cast_(tensor_desc.origin_format_); + NPUNativeFunctions::npu_format_cast_(tensor, tensor_desc.origin_format_); return tensor; } @@ -234,13 +236,17 @@ namespace at_npu at::Tensor OpPreparation::ApplyTensorWithFormat(c10::IntArrayRef sizes, const c10::TensorOptions &options, int64_t format) { auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format); - return at::empty_with_format(sizes, options, fixFormat); + return NPUNativeFunctions::empty_with_format( + sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), + options.device_opt(), options.pinned_memory_opt(), fixFormat); } at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10::TensorOptions &options) { auto format = InferFormat::GuessBaseFormat(sizes); - return at::empty_with_format(sizes, options, format); + return NPUNativeFunctions::empty_with_format( + sizes, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), + options.device_opt(), options.pinned_memory_opt(), format); } void OpPreparation::CheckMemory(const std::initializer_list &inputs, const std::initializer_list &outputs) diff --git a/torch_npu/testing/util_test.py b/torch_npu/testing/util_test.py index a460af4ae332f2fe2a229946dfc58da3c3e0483e..835814c30d45ef5933293b04c3ac18da6f353d59 100644 --- a/torch_npu/testing/util_test.py +++ b/torch_npu/testing/util_test.py @@ -15,6 +15,7 @@ # limitations under the License. import torch +import torch_npu import numpy as np import os @@ -42,7 +43,7 @@ def create_common_tensor(item, minValue, maxValue): cpu_input = torch.from_numpy(input1) npu_input = torch.from_numpy(input1).to(npu_device) if npu_format != -1: - npu_input = npu_input.npu_format_cast(npu_format) + npu_input = torch_npu.npu_format_cast(npu_input, npu_format) return cpu_input, npu_input @@ -125,5 +126,5 @@ def create_dtype_tensor(shape, dtype, npu_format=-1, min_value=-5, max_value=5, cpu_input = torch.from_numpy(x) npu_input = torch.from_numpy(x).to(npu_device) if npu_format != -1 and (dtype in [torch.float, torch.half]): - npu_input = npu_input.npu_format_cast(npu_format) + npu_input = torch_npu.npu_format_cast(npu_input, npu_format) return cpu_input, npu_input \ No newline at end of file diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..092f9cebfba63364dd18297f48981e47e747999a --- /dev/null +++ b/torch_npu/utils/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .module import LayerNorm, apply_module_patch + + +def _get_monkey_patches(): + _monkey_patches = [] + _monkey_patches.append(["nn.modules.normalization.LayerNorm", LayerNorm]) + _monkey_patches.append(["nn.modules.LayerNorm", LayerNorm]) + _monkey_patches.append(["nn.LayerNorm", LayerNorm]) + return _monkey_patches + + +nn_monkey_patches = _get_monkey_patches() diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py new file mode 100644 index 0000000000000000000000000000000000000000..0d275ef923ecd8f8a108a888964304c72abf1aa3 --- /dev/null +++ b/torch_npu/utils/module.py @@ -0,0 +1,132 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +import torch +import torch_npu + + +def npu(self, device=None): + r"""Moves all model parameters and buffers to the npu. + + This also makes associated parameters and buffers different objects. So + it should be called before constructing optimizer if the module will + live on npu while being optimized. + + Arguments: + device (int, optional): if specified, all parameters will be + copied to that device + + Returns: + Module: self + """ + if device is None: + device = torch.device("npu") + if torch_npu.npu.is_available(): + with torch.no_grad(): + self.cast_weight(device) + return self._apply(lambda t: t.npu(device)) + + +def to(self, *args, **kwargs): + device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs) + + if dtype is not None: + if not (dtype.is_floating_point or dtype.is_complex): + raise TypeError('nn.Module.to only accepts floating point or complex ' + 'dtypes, but got desired dtype={}'.format(dtype)) + if dtype.is_complex: + warnings.warn( + "Complex modules are a new feature under active development whose design may change, " + "and some modules might not work as expected when using complex tensors as parameters or buffers. " + "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.md " + "if a complex module does not work as expected.") + if torch_npu.npu.is_available(): + with torch.no_grad(): + self.cast_weight(device) + + def convert(t): + if convert_to_format is not None and t.dim() == 4: + return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, + non_blocking, memory_format=convert_to_format) + return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking) + + return self._apply(convert) + + +def cast_weight(self, device): + + def _format_cast(module, class_name): + if issubclass(class_name, torch.nn.Linear): + module.weight.data = module.weight.data.to(device) + module.weight.data = torch_npu.npu_format_cast(module.weight.data, 29) # ACL_FORMAT_FRACTAL_NZ + if issubclass(class_name, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)): + if module.affine: + module.weight.data = module.weight.data.to(device) + module.weight.data = torch_npu.npu_format_cast(module.weight.data, 3) # ACL_FORMAT_NC1HWC0 + module.bias.data = module.bias.data.to(device) + module.bias.data = torch_npu.npu_format_cast(module.bias.data, 3) + module.running_mean.data = module.running_mean.data.to(device) + module.running_mean.data = torch_npu.npu_format_cast(module.running_mean.data, 3) + module.running_var.data = module.running_var.data.to(device) + module.running_var.data = torch_npu.npu_format_cast(module.running_var.data, 3) + if issubclass(class_name, torch.nn.Conv2d): + if (module.in_channels == module.groups and module.groups > 1 + and module.weight.size(0) % module.in_channels == 0): + return + module.weight.data = module.weight.data.to(device) + module.weight.data = torch_npu.npu_format_cast(module.weight.data, 4) # ACL_FORMAT_FRACTAL_Z + if issubclass(class_name, torch.nn.Conv3d): + module.weight.data = module.weight.data.to(device) + module.weight.data = torch_npu.npu_format_cast(module.weight.data.half(), 33).float() # ACL_FRACTAL_Z_3D + if "MultiheadAttention" in str(class_name) and \ + hasattr(module,"q_proj_weight") and module.q_proj_weight and \ + hasattr(module,"k_proj_weight") and module.k_proj_weight and \ + hasattr(module,"v_proj_weight") and module.v_proj_weight: + module.q_proj_weight.data = module.q_proj_weight.data.to(device) + module.q_proj_weight.data = torch_npu.npu_format_cast(module.q_proj_weight.data, 29) + module.k_proj_weight.data = module.k_proj_weight.data.to(device) + module.k_proj_weight.data = torch_npu.npu_format_cast(module.k_proj_weight.data, 29) + module.v_proj_weight.data = module.v_proj_weight.data.to(device) + module.v_proj_weight.data = torch_npu.npu_format_cast(module.v_proj_weight.data, 29) + + if device is None or "npu" not in str(device): + return + + current_class = self.__class__ + _format_cast(self, current_class) + + if not self.children: + return + + for sub_module in self.children(): + if isinstance(sub_module, torch.nn.Module): + sub_module.cast_weight(device) + + +def apply_module_patch(): + torch.nn.Module.npu = npu + torch.nn.Module.to = to + torch.nn.Module.cast_weight = cast_weight + + +class LayerNorm(torch.nn.LayerNorm): + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if self.training: + return torch.nn.functional.layer_norm( + input, self.normalized_shape, self.weight, self.bias, self.eps) + else: + return torch_npu.npu_layer_norm_eval(input, self.normalized_shape, self.weight, self.bias, self.eps) diff --git a/torch_npu/version.py b/torch_npu/version.py index 32fb6cf7bf30cc67a5bf6bef7b2f2d33d6901856..8790208db40d2c1d0007a4131bba6e688595fd6e 100644 --- a/torch_npu/version.py +++ b/torch_npu/version.py @@ -1 +1,16 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + __version__ = "1.8.1rc1"