diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index c34eac8c074ca08ae4c8ec710abad70923e9e6ea..b57fa93b780cad2a8642da4eff93b93dcd206eab 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -18,6 +18,7 @@ #include #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/framework/graph/execute/GraphExecutor.h" #include #include @@ -52,6 +53,7 @@ PyObject * THPModule_npu_shutdown(PyObject * /* unused */) // all of op tasks completed before device memory free. if (c10::npu::NpuSysCtrl::GetInstance().GetInitFlag()) { c10::npu::npuSynchronizeDevice(); + at_npu::native::GraphExecutor::GetInstance().Finalize(); THNPUCachingHostAllocator_emptyCache(); c10_npu::NPUCachingAllocator::emptyCache(); c10::npu::NpuSysCtrl::SysStatus status = c10::npu::NpuSysCtrl::GetInstance().Finalize(); diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index 4d6f9fe35ddbdb0ec30d8417055fae30de6432ee..576dfd5358ac9427e7be127a9a6f117e7115370f 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -23,6 +23,7 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/framework/utils/OpTemplate.h" +#include "torch_npu/csrc/framework/graph/util/GraphModeGuard.h" #include "torch_npu/csrc/aten/common/FormatCastHelper.h" #include "torch_npu/csrc/aten/common/InnerNpuNativeFunction.h" #include "torch_npu/csrc/framework/allocator/THNPUCachingHostAllocator.h" @@ -326,6 +327,7 @@ at::Tensor& NPUNativeFunctions::copy_(at::Tensor& self, const at::Tensor& src, b } } else { if (src.is_npu()) { + GraphModeGuard mode_guard(c10_npu::ModeKind::SINGLE_OP_MODE); copy_d2h(self, src, non_blocking); } } diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp index 13d82c3f5f83c0720db21367ce6e089248f36b87..3934fc6cd597437ccd2d7944fc3085a5cbad50cd 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp +++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp @@ -16,6 +16,7 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/aten/common/FormatCastHelper.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/core/npu/NPURunMode.h" namespace at_npu { namespace native { @@ -56,7 +57,26 @@ bool FormatCastHelper::format_cast_between_group(at::Tensor& dst, const at::Tens auto src_base_format = FormatHelper::GetBaseFormat(src); format_cast_as_base_format(src, FormatHelper::GetBaseFormat(dst)); // prepare: convert src to dst base format format_cast_inside_group(dst, src); // src base format (src format) -> dst base format - format_cast_as_base_format(src, src_base_format); // recover: dst base format -> dst format + + // NB + // In graph mode: + // a = torch.empty([2,3]).npu() + // a.npu_format_cast(nc1hwc0); + // a.npu_format_cast(nz); + // torch.npu.launch_graph() + // the base format of a changes ND -> NCHW -> ND. + // When the graph is run, FE gets the task ND/ND -> NCHW/NC1HWC0, which will fail, + // so the check below makes the base format change ND -> NCHW -> NCHW instead; + // FE then gets the tasks NCHW/NCHW -> NCHW/NC1HWC0 and NCHW/NCHW -> NCHW/NZ. + if (c10_npu::NpuRunMode::IsGraphMode() && src_base_format == ACL_FORMAT_ND) { + return true; + } + // recover: dst base format -> dst format + format_cast_as_base_format(src, src_base_format); return true; } } else { diff 
--git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp index 8d006e1055fe526c3dca2e807e8458fca4b68c68..a547c4a3033c4c50877c1f16181a4d1f61f5a5f1 100644 --- a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp +++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp @@ -17,7 +17,7 @@ #include #include #include - +#include "torch_npu/csrc/framework/graph/util/GraphModeGuard.h" #include "third_party/acl/inc/acl/acl_base.h" #include "third_party/acl/inc/acl/acl_rt.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" @@ -33,6 +33,7 @@ c10::Scalar NPUNativeFunctions::_local_scalar_dense(const at::Tensor& self) { self.scalar_type(), "_local_scalar_dense_npu", [&] { + GraphModeGuard mode_guard(c10_npu::ModeKind::SINGLE_OP_MODE); scalar_t value = 0; c10::npu::NPUStream copy_stream = c10::npu::getCurrentNPUStream(); aclError error = aclrtMemcpyAsync( diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index 26821c4f374d02a475e74338eba6009876f678bf..98d9adbfc13135c284737c65374773158981a610 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" @@ -97,6 +98,15 @@ namespace at_npu true); auto tensor = at::detail::make_tensor(storage_impl, dtype); + + // NB + // Store a weak intrusive ptr of the storage impl in both graph mode and single op mode, + // because we need to reach all live tensors in the context when the run mode changes. + // We want to track every storage without affecting its life cycle, + // so that in graph mode we can get the storage of all live tensors. + c10::npu::graph::NpuGraphContextManager::GetInstance().AddOutputStorage( + storage_impl); + // Default at::TensorImpl has size [0] if (size.size() != 1 || size[0] != 0) { @@ -278,7 +288,12 @@ namespace at_npu aclFormat format = InferFormat::GuessStorageFormat(size, (aclFormat)dst_format); int64_t nelements = StorageDescHelper::GetMemorySize(size, format); auto dtype = c10::scalarTypeToTypeMeta(dtype_or_default(dtype_opt)); - int64_t size_bytes = nelements * dtype.itemsize(); + + // In graph mode, empty with format is used to create inner tensors, + // and ASCEND-GE takes charge of their memory + int64_t size_bytes = + c10_npu::NpuRunMode::IsGraphMode() ? 0 : nelements * dtype.itemsize(); + + auto storage_impl = c10::make_intrusive( c10::StorageImpl::use_byte_size_t(), size_bytes, @@ -287,6 +302,12 @@ namespace at_npu true); auto tensor = at::detail::make_tensor(storage_impl, dtype); + + // NB Store a weak intrusive ptr of the storage impl in graph mode + // see note above + c10::npu::graph::NpuGraphContextManager::GetInstance().AddOutputStorage( + storage_impl); + // Default NPUTensorImpl has size [0] if (size.size() != 1 || size[0] != 0) { @@ -312,7 +333,11 @@ namespace at_npu aclFormat format = InferFormat::GuessStorageFormat(size, (aclFormat)dst_format); int64_t nelements = StorageDescHelper::GetMemorySize(size, format); auto dtype = options.dtype(); - int64_t size_bytes = nelements * dtype.itemsize(); + // In graph mode, empty with format is used to create inner tensors, + // and ASCEND-GE takes charge of their memory + auto size_bytes = + c10_npu::NpuRunMode::IsGraphMode() ? 
0 : nelements * dtype.itemsize(); + auto storage_impl = c10::make_intrusive( c10::StorageImpl::use_byte_size_t(), size_bytes, @@ -321,6 +346,12 @@ namespace at_npu true); auto tensor = at::detail::make_tensor(storage_impl, dtype); + + // NB Store weak intrusive ptr of storage impl in graph mode + // see note above + c10::npu::graph::NpuGraphContextManager::GetInstance().AddOutputStorage( + storage_impl); + // Default at::TensorImpl has size [0] if (size.size() != 1 || size[0] != 0) { diff --git a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp index ac699de78356a8674da5a4595070295cdc233827..152e69c1386afc9f15e0eab47e6d7624b42584a3 100644 --- a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp +++ b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp @@ -31,6 +31,10 @@ MemOverlap has_internal_overlap(at::TensorImpl* t) { return MemOverlap::NO; } + if (t->storage().data() == nullptr) { + return MemOverlap::IS_NULL; + } + auto strides = t->strides(); auto sizes = t->sizes(); for (size_t i = 0; i < strides.size(); ++i) { @@ -59,6 +63,9 @@ MemOverlapStatus get_overlap_status(const at::Tensor& a, const at::Tensor& b) { MemOverlapStatus get_overlap_status(at::TensorImpl* a, at::TensorImpl* b) { if (a == b) return MemOverlapStatus::FULL; + if (a->storage().data() == nullptr || b->storage().data() == nullptr) { + return MemOverlapStatus::IS_NULL; + } if (a->numel() == 0 || b->numel() == 0) { return MemOverlapStatus::NO; } diff --git a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h index 93c57359f2b86e1c82fa3358e97e0138df321749..dd655d49f9c97fb2b744640288b662683a527f95 100644 --- a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h +++ b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h @@ -25,11 +25,12 @@ namespace at_npu { namespace native { // // NO: Absolutely no memory overlap // YES: Absolutely yes memory overlap -// TOO_HARD: There might be memory overlap, but it was too expensive to compute. +// TOO_HARD: There might be memory overlap, but it was too expensive to compute +// IS_NULL: In npu graph mode, some tensors have no device ptr. // // NB: Please update the python test for these if you renumber them. 
-enum class MemOverlap { NO, YES, TOO_HARD }; -enum class MemOverlapStatus { FULL, PARTIAL, NO, TOO_HARD }; +enum class MemOverlap { NO, YES, TOO_HARD, IS_NULL }; +enum class MemOverlapStatus { FULL, PARTIAL, NO, TOO_HARD, IS_NULL }; MemOverlap has_internal_overlap(const at::Tensor& t); MemOverlap has_internal_overlap(at::TensorImpl* t); diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dBackwardKernelNpu.cpp index 75775ef0b4a04d41327fa5cba5441efb3c5903de..5ba51d951e6de1fc47a184752b8ab28bc5adf1e5 100644 --- a/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dBackwardKernelNpu.cpp @@ -59,7 +59,7 @@ at::Tensor& NPUNativeFunctions::adaptive_max_pool2d_backward_out( cmd.Name("MaxPoolGradWithArgmaxV1") .Input(self) .Input(grad_output) - .Input(indices, "", "uint16") + .Input(indices, "", c10::nullopt, "uint16") .Output(grad_input) .Attr("ksize", kernelSize) .Attr("strides", stridesSize) diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dKernelNpu.cpp index 3c5b9ba028811b36e82ef7fa9dd43ff9e6102be7..64bd6cd9e3e6cdf6bed6091605c461d2ad4d6300 100644 --- a/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveMaxPool2dKernelNpu.cpp @@ -57,7 +57,7 @@ tuple NPUNativeFunctions::adaptive_max_pool2d_out( cmd.Name("MaxPoolWithArgmaxV1") .Input(self) .Output(output) - .Output(indices,"uint16") + .Output(indices, "", c10::nullopt, "uint16") .Attr("ksize", kernelSize) .Attr("strides", stridesSize) .Attr("pads", paddings) diff --git a/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesBackwardKernelNpu.cpp index 6916db1ba8a8c363943e82581f13ca8b406c83d1..1e3b7dbc1ed7d34a3feef085f50cc628890fd29a 100644 --- a/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesBackwardKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesBackwardKernelNpu.cpp @@ -52,7 +52,7 @@ at::Tensor& NPUNativeFunctions::max_pool2d_with_indices_backward_out( cmd.Name("MaxPoolGradWithArgmaxV1") .Input(self) .Input(grad_output) - .Input(indices, "", "uint16") + .Input(indices, "", c10::nullopt, "uint16") .Output(grad_input) .Attr("ksize", kernelSize) .Attr("strides", stridesSize) diff --git a/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesKernelNpu.cpp index 8aa842bc37e2050ff986320a85c920db8bc2196e..6be2e061bd7107bf954eeb65b9f1beec38552ee9 100644 --- a/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/MaxPool2dWithIndicesKernelNpu.cpp @@ -51,7 +51,7 @@ tuple NPUNativeFunctions::max_pool2d_with_indices_out( cmd.Name("MaxPoolWithArgmaxV1") .Input(self) .Output(output) - .Output(indices, "uint16") + .Output(indices, "", c10::nullopt, "uint16") .Attr("ksize", kernelSize) .Attr("strides", stridesSize) .Attr("pads", paddings) diff --git a/torch_npu/csrc/core/npu/NPURunMode.cpp b/torch_npu/csrc/core/npu/NPURunMode.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b1d6154981a030c63f6021eaf9ad1410259a5cd --- /dev/null +++ b/torch_npu/csrc/core/npu/NPURunMode.cpp @@ -0,0 +1,33 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "NPURunMode.h" + +namespace c10_npu { +ModeKind NpuRunMode::cur_mode_ = ModeKind::DEFAULT_MODE; + +void NpuRunMode::SetNpuRunMode(const ModeKind &mode) { + cur_mode_ = mode; + return; +} + +ModeKind NpuRunMode::CurRunMode() { + return cur_mode_; +} + +bool NpuRunMode::IsGraphMode() { + return cur_mode_ == ModeKind::GRAPH_MODE; +} +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPURunMode.h b/torch_npu/csrc/core/npu/NPURunMode.h new file mode 100644 index 0000000000000000000000000000000000000000..dc21a1dcb8276600f0a1e2ced237e8a0bf7530da --- /dev/null +++ b/torch_npu/csrc/core/npu/NPURunMode.h @@ -0,0 +1,38 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include + +namespace c10_npu { +enum class ModeKind : uint8_t { + DEFAULT_MODE = 0, + SINGLE_OP_MODE = DEFAULT_MODE, + GRAPH_MODE, +}; + +class TORCH_API NpuRunMode { +public: + static void SetNpuRunMode(const ModeKind& mode); + static ModeKind CurRunMode(); + static bool IsGraphMode(); + +private: + static ModeKind cur_mode_; +}; +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/OpCommandBase.h b/torch_npu/csrc/framework/OpCommandBase.h index bc7f4368f4c49be793183d209494aa9bb4752b2a..f0a873f9c20142d9e569c52f2f3dd9a6644d032c 100644 --- a/torch_npu/csrc/framework/OpCommandBase.h +++ b/torch_npu/csrc/framework/OpCommandBase.h @@ -22,289 +22,337 @@ #include "torch_npu/csrc/framework/OpCmdHelper.h" #include "torch_npu/csrc/framework/OpParamMaker.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" +#include "torch_npu/csrc/framework/FormatHelper.h" +#include "torch_npu/csrc/core/npu/NPURunMode.h" #include "torch_npu/csrc/framework/allocator/THNPUCachingHostAllocator.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/framework/graph/construct/GraphConstructor.h" -namespace at_npu -{ - namespace native - { +#define IF_GRAPH_MODE_THEN_RUN(...) \ + do { \ + if (c10_npu::NpuRunMode::IsGraphMode()) { \ + __VA_ARGS__; \ + } \ + } while (false); - // get common dtype and shape from op adapter layer - struct UnifiedResult - { - c10::optional common_type = c10::nullopt; - c10::optional common_shape = c10::nullopt; - // judge result tensor's dtype is defined or not. 
- // if result's dtype is defined, result_type_defined is true and result's dtype remains unchanged. - bool result_type_defined = false; - }; +#define IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS(...) \ + do { \ + if (c10_npu::NpuRunMode::IsGraphMode()) { \ + __VA_ARGS__; \ + return static_cast(*this); \ + } \ + } while (false); - template - class OpCommandBase - { - public: - explicit OpCommandBase() - { - aclCmds = OpCommandImpls::GetInstance(); - aclCmds->Push(aclCmd); - } - virtual ~OpCommandBase() {} +namespace at_npu { +namespace native { - Derived &Name(string name) - { - aclCmd->SetName(name); - return static_cast(*this); - } +// get common dtype and shape from op adapter layer +struct UnifiedResult { + c10::optional common_type = c10::nullopt; + c10::optional common_shape = c10::nullopt; + // judge result tensor's dtype is defined or not. + // if result's dtype is defined, result_type_defined is true and result's dtype remains unchanged. + bool result_type_defined = false; +}; - Derived &Expect(UnifiedResult unified_result) - { - commonType = unified_result.common_type; - resultTypeDefined = unified_result.result_type_defined; - commonShape = unified_result.common_shape; - return static_cast(*this); - } +template +class OpCommandBase { +public: + OpCommandBase() { + IF_GRAPH_MODE_THEN_RUN(return;) + aclCmds = OpCommandImpls::GetInstance(); + aclCmds->Push(aclCmd); + } + virtual ~OpCommandBase() {} - template - Derived &Attr(string name, dataType value) - { - aclCmd->AddAttr(name, value); - return static_cast(*this); - } + OpCommandBase(const OpCommandBase &other) = delete; + OpCommandBase(OpCommandBase &&other) = delete; + OpCommandBase &operator=(const OpCommandBase &) = delete; + OpCommandBase &operator=(OpCommandBase &&) = delete; - Derived &InputWithoutContiguous( - const at::Tensor &input, - const string &descName = "", - const string &realData = "") - { - return AddTensorInput(const_cast(input), at::ScalarType::Undefined, descName, realData); - } + Derived &Name(const string &name) { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS(graphCmd.SetName(name);) + aclCmd->SetName(name); + return static_cast(*this); + } - Derived &Input() - { - return AddNoneTensor(); - } + Derived &DynamicInputReg( + c10::npu::graph::DynamicInputRegFunc func, + c10::npu::graph::DyNumAndIndex num_and_index) { + IF_GRAPH_MODE_THEN_RUN( + graphCmd.AddDynamicInputRegFunc(func, num_and_index);) + return static_cast(*this); + } - Derived &Input( - const at::Tensor &input, - const string &descName = "", - const string &realData = "") - { - return AddTensorInput(Contiguous(input), at::ScalarType::Undefined, descName, realData); - } + Derived &Expect(UnifiedResult unified_result) { + commonType = unified_result.common_type; + resultTypeDefined = unified_result.result_type_defined; + commonShape = unified_result.common_shape; + return static_cast(*this); + } - Derived &Input( - const at::Tensor &cpuTensor, - c10::SmallVector dimList, - const string &descName = "") - { - at::Tensor npuTensor = CopyHostToDevice(cpuTensor); - aclCmd->AddConst(dimList); - return AddTensorInput(npuTensor, at::ScalarType::Undefined, descName, "", cpuTensor); - } + template + Derived &Attr(const string &name, dataType value) { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddAttr(name, value); + ) + aclCmd->AddAttr(name, value); + return static_cast(*this); + } - Derived &Input(c10::SmallVector &dimList, - at::ScalarType toType = at::kLong) - { + Derived &Input() { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(); + ) + return 
AddNoneTensor(); + } - at::Tensor &cpuTensor = CreateHostTensor((void *)dimList.data(), - dimList.size(), - c10::TensorOptions(at::kCPU).dtype(at::kLong), - toType); - return AddHostTensorInput(cpuTensor); - } + Derived &Input( + const at::Tensor &input, + const string &descName = "", + const c10::optional &sensitive_format = c10::nullopt, + const string &realData = "") { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + auto contiguous_input = Contiguous(input); + if (commonType.has_value() && + commonType.value() != contiguous_input.scalar_type()) { + contiguous_input = NPUNativeFunctions::npu_dtype_cast(contiguous_input, commonType.value()); + } + graphCmd.AddInput(contiguous_input, descName, realData, sensitive_format); + ) + return AddTensorInput( + Contiguous(input), c10::ScalarType::Undefined, descName, realData); + } - Derived &Input(c10::IntArrayRef &dimListRef, - at::ScalarType toType = at::kLong) - { + Derived &InputWithoutContiguousGeneral( + const at::Tensor &input, + const string &descName = "", + const c10::optional &sensitive_format = c10::nullopt, + const string &realData = "") { + return AddTensorInput(const_cast(input), c10::ScalarType::Undefined, descName, realData); + } - at::Tensor &cpuTensor = CreateHostTensor((void *)dimListRef.data(), - dimListRef.size(), - c10::TensorOptions(at::kCPU).dtype(at::kLong), - toType); - return AddHostTensorInput(cpuTensor); - } + Derived &InputWithoutContiguous(const at::Tensor &input, + const string &descName = "", + const string &realData = "") { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(input, descName, realData); + ) + if (input.storage_offset() != 0) { + NPU_LOGW( + "[Check][offset] Check input storage_offset[%ld] = 0 failed, result is untrustworthy", + input.storage_offset()); + } + return AddTensorInput(const_cast(input)); + } - Derived &Input(const c10::Scalar &input, const at::ScalarType type, - CompileType compileType = CompileType::MEMORY_DEVICE_COMPILE) - { - if ((compileType == MEMORY_DEVICE_COMPILE) && - (torch_npu::option::OptionsManager::CheckScalarToHostMemEnable())) { - compileType = MEMORY_HOST_COMPILE_INDEPENDENT; - } - if (compileType == CompileType::MEMORY_DEVICE_COMPILE) - { - return AddScalarInput(input, type); - } - else - { - auto scalarTensor = CreateScalarTensor(input, type); - return AddHostTensorInput(scalarTensor, compileType); - } - } + Derived &Input( + const at::Tensor &cpuTensor, + c10::SmallVector dimList, + const string &descName = "") { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(dimList, cpuTensor.scalar_type()); + ) + at::Tensor npuTensor = CopyHostToDevice(cpuTensor); + aclCmd->AddConst(dimList); + return AddTensorInput(npuTensor, at::ScalarType::Undefined, descName, "", cpuTensor); + } - Derived &Output(at::Tensor &output, const string &realType = "") - { - return AddOutput(output, realType); - } + Derived &Input(c10::SmallVector &dimList, + at::ScalarType toType = at::kLong) { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(dimList, toType); + ) + at::Tensor &cpuTensor = CreateHostTensor((void *) dimList.data(), + dimList.size(), + c10::TensorOptions(at::kCPU).dtype(at::kLong), + toType); + return AddHostTensorInput(cpuTensor); + } - void Run() - { - if (torch_npu::option::OptionsManager::CheckQueueEnable()) - { - ExecuteParas params; - aclCmd->ExportParams(params); - c10::npu::enCurrentNPUStream(¶ms); - aclCmd->releaseSource(false); - } - else - { - aclCmd->Run(); - aclCmd->releaseSource(); - } - aclCmds->Pop(); - } + Derived &Input(c10::IntArrayRef 
&dimListRef, + at::ScalarType toType = at::kLong) { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + graphCmd.AddInput(dimListRef, toType); + ) + at::Tensor &cpuTensor = CreateHostTensor((void *) dimListRef.data(), + dimListRef.size(), + c10::TensorOptions(at::kCPU).dtype(at::kLong), + toType); + return AddHostTensorInput(cpuTensor); + } - protected: - Derived &AddTensorInput(at::Tensor &tensor, - at::ScalarType forceScaleType = at::ScalarType::Undefined, - const string &descName = "", const string &realData = "", - c10::optional cpu_tensor = c10::nullopt) - { - std::tuple res; - if (commonType.has_value() && commonType.value() != tensor.scalar_type()) - { - tensor = NPUNativeFunctions::npu_dtype_cast(tensor, commonType.value()); - } - // 针对dim=0的场景,绝对不会有输入为uint16的情况,因为这个是TBE引入的,TBE没有dim=0的情况 - if (tensor.dim() == 0) - { - if (tensor.is_npu()) - { - res = OpCmdHelper::CovertNPUTensorWithZeroDimToAclInput(tensor, descName); - } - else - { - res = OpCmdHelper::CovertTensorWithZeroDimToAclInput(tensor, forceScaleType); - } - } - else - { - res = OpCmdHelper::CovertTensorToAclInput(tensor, cpu_tensor, descName, realData); - } - aclCmd->AddInput( - std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); - return static_cast(*this); - } - Derived &AddHostTensorInput(const at::Tensor &tensor, - CompileType compileType = CompileType::MEMORY_HOST_COMPILE_DEPENDENT) - { - std::tuple res; - res = OpCmdHelper::CovertHostTensorToAclInput(tensor, tensor.scalar_type(), compileType); - aclCmd->AddInput( - std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res), tensor); - return static_cast(*this); - } - Derived &AddNoneTensor() - { - AclTensorDescMaker desc; - auto aclDesc = desc.Create(ACL_DT_UNDEFINED, ACL_FORMAT_UNDEFINED).Get(); - AclTensorBufferMaker buffer(nullptr, 0); - aclCmd->AddInput(aclDesc, buffer.Get(), 0, ACL_FORMAT_UNDEFINED); - return static_cast(*this); - } - Derived &AddScalarInput(const c10::Scalar &input, - at::ScalarType type) - { - at::ScalarType type_bk = type; - if (commonType.has_value()) - { - type_bk = commonType.value(); + Derived &Input(const c10::Scalar &input, const at::ScalarType type, + CompileType compileType = CompileType::MEMORY_DEVICE_COMPILE) { + if ((compileType == MEMORY_DEVICE_COMPILE) && + (torch_npu::option::OptionsManager::CheckScalarToHostMemEnable())) { + compileType = MEMORY_HOST_COMPILE_INDEPENDENT; + } + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + auto true_type = commonType.has_value() ? 
commonType.value() : type; + graphCmd.AddInput(input, true_type, compileType); + ) + if (compileType == CompileType::MEMORY_DEVICE_COMPILE) { + return AddScalarInput(input, type); + } else { + auto scalarTensor = CreateScalarTensor(input, type); + return AddHostTensorInput(scalarTensor, compileType); + } + } + + Derived &Output( + at::Tensor &output, + const string &descName = "", + const c10::optional &sensitive_format = c10::nullopt, + const string &realType = "") { + IF_GRAPH_MODE_THEN_RUN_WITH_RET_THIS( + if (sensitive_format.has_value() && + FormatHelper::GetBaseFormat(output) != sensitive_format.value()) { + output = NPUNativeFunctions::npu_format_cast(output, sensitive_format.value()); } - at::Tensor aclInput = CopyHostToDevice(input, type_bk); - auto res = OpCmdHelper::CovertScalarToAclInput(aclInput, type_bk); - aclCmd->AddInput( - std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); - return static_cast(*this); - } - Derived &AddOutput(at::Tensor &output, const string &realType = "") - { - if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) - { + graphCmd.AddOutput(output, descName, realType, sensitive_format); + if (!resultTypeDefined && commonType.has_value() && + output.scalar_type() != commonType.value()) { output = NPUNativeFunctions::npu_dtype_cast(output, commonType.value()); } + ) + return AddOutput(output, realType); + } - const at::Tensor *tensor = &output; - auto res = OpCmdHelper::CovertToAclOutput(tensor, realType); - aclCmd->AddOutput( - std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); - return static_cast(*this); - } - protected: - // format_contiguous may create a new Tensor; it is kept in this object so it stays alive for the whole life cycle - // Likewise below, CopyScalarToDevice has the same problem - at::Tensor &Contiguous(const at::Tensor &input) - { - storage.emplace_back(NpuUtils::format_contiguous_add_copy_optimize(input)); - return storage.back(); - } - at::Tensor CopyHostToDevice(const c10::Scalar &scalar, at::ScalarType type) - { - auto tensor = scalar_to_tensor(scalar).to(type); - return CopyHostToDevice(tensor); - } - at::Tensor CopyHostToDevice(const at::Tensor &cpuTensor) - { - at::Tensor cpuPinMemTensor = cpuTensor.pin_memory(); - int deviceIndex = 0; - AT_NPU_CHECK(aclrtGetDevice(&deviceIndex)); - auto tensor = cpuPinMemTensor.to( - c10::Device(c10::DeviceType::NPU, deviceIndex), - cpuPinMemTensor.scalar_type(), - true, - true); - storage.emplace_back(tensor); - return storage.back(); + void Run() { + IF_GRAPH_MODE_THEN_RUN(return;) + if (torch_npu::option::OptionsManager::CheckQueueEnable()) { + ExecuteParas params; + aclCmd->ExportParams(params); + c10::npu::enCurrentNPUStream(&params); + aclCmd->releaseSource(false); + } else { + aclCmd->Run(); + aclCmd->releaseSource(); + } + aclCmds->Pop(); + } + +protected: + Derived &AddTensorInput(at::Tensor &tensor, + at::ScalarType forceScaleType = at::ScalarType::Undefined, + const string &descName = "", const string &realData = "", + c10::optional cpu_tensor = c10::nullopt) { + std::tuple < aclTensorDesc * , aclDataBuffer *, int64_t, aclFormat > res; + if (commonType.has_value() && commonType.value() != tensor.scalar_type()) { + tensor = NPUNativeFunctions::npu_dtype_cast(tensor, commonType.value()); + } + // For the dim=0 case the input can never be uint16, because uint16 is introduced by TBE and TBE has no dim=0 case + if (tensor.dim() == 0) { + if (tensor.is_npu()) { + res = OpCmdHelper::CovertNPUTensorWithZeroDimToAclInput(tensor, descName); + } else { + res = OpCmdHelper::CovertTensorWithZeroDimToAclInput(tensor, forceScaleType); + } + } else { 
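+ // tensors with dim > 0 take the general path: build the ACL tensor desc and buffer from the tensor (plus the optional host copy), its desc name and real dtype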
res = OpCmdHelper::CovertTensorToAclInput(tensor, cpu_tensor, descName, realData); + } + aclCmd->AddInput( + std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); + return static_cast(*this); + } + Derived &AddHostTensorInput(const at::Tensor &tensor, + CompileType compileType = CompileType::MEMORY_HOST_COMPILE_DEPENDENT) { + std::tuple < aclTensorDesc * , aclDataBuffer *, int64_t, aclFormat > res; + res = OpCmdHelper::CovertHostTensorToAclInput(tensor, tensor.scalar_type(), compileType); + aclCmd->AddInput( + std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res), tensor); + return static_cast(*this); + } + Derived &AddNoneTensor() { + AclTensorDescMaker desc; + auto aclDesc = desc.Create(ACL_DT_UNDEFINED, ACL_FORMAT_UNDEFINED).Get(); + AclTensorBufferMaker buffer(nullptr, 0); + aclCmd->AddInput(aclDesc, buffer.Get(), 0, ACL_FORMAT_UNDEFINED); + return static_cast(*this); + } + Derived &AddScalarInput(const c10::Scalar &input, + at::ScalarType type) { + at::ScalarType type_bk = type; + if (commonType.has_value()) { + type_bk = commonType.value(); + } + at::Tensor aclInput = CopyHostToDevice(input, type_bk); + auto res = OpCmdHelper::CovertScalarToAclInput(aclInput, type_bk); + aclCmd->AddInput( + std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); + return static_cast(*this); + } + Derived &AddOutput(at::Tensor &output, const string &realType = "") { + if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) { + output = NPUNativeFunctions::npu_dtype_cast(output, commonType.value()); + } + const at::Tensor *tensor = &output; + auto res = OpCmdHelper::CovertToAclOutput(tensor, realType); + aclCmd->AddOutput( + std::get<0>(res), std::get<1>(res), std::get<2>(res), std::get<3>(res)); + return static_cast(*this); + } -protected: + // format_contiguous may create a new Tensor; it is kept in this object so it stays alive for the whole life cycle + // Likewise below, CopyScalarToDevice has the same problem + at::Tensor &Contiguous(const at::Tensor &input) { + storage.emplace_back(NpuUtils::format_contiguous_add_copy_optimize(input)); + return storage.back(); + } + at::Tensor CopyHostToDevice(const c10::Scalar &scalar, at::ScalarType type) { + auto tensor = scalar_to_tensor(scalar).to(type); + return CopyHostToDevice(tensor); + } + at::Tensor CopyHostToDevice(const at::Tensor &cpuTensor) { + at::Tensor cpuPinMemTensor = cpuTensor.pin_memory(); + int deviceIndex = 0; + AT_NPU_CHECK(aclrtGetDevice(&deviceIndex)); + auto tensor = cpuPinMemTensor.to( + c10::Device(c10::DeviceType::NPU, deviceIndex), + cpuPinMemTensor.scalar_type(), + true, + true); + storage.emplace_back(tensor); + return storage.back(); + } - at::Tensor &CreateHostTensor(void *data, size_t size, - const c10::TensorOptions &options, at::ScalarType toType) - {
maintain when Run() is called + AT_ASSERT(options.dtype() == at::kLong); + auto cpuTensor = at::empty(size, options); + AT_ASSERT(cpuTensor.is_contiguous()); + std::memcpy(cpuTensor.data_ptr(), data, sizeof(int64_t) * cpuTensor.numel()); + if (toType != at::kLong) { + cpuTensor = cpuTensor.to(toType); + } - protected: - OpCommandImpls *aclCmds = nullptr; // owned - OpCommandImpl *aclCmd = nullptr; + storage.emplace_back(std::move(cpuTensor)); + return storage.back(); + } + at::Tensor CreateScalarTensor(const c10::Scalar &scalar, at::ScalarType type) { + if (commonType.has_value()) { + type = commonType.value(); + } + storage.emplace_back(scalar_to_tensor(scalar).to(type)); + return storage.back(); + } + c10::SmallVector storage; // tensor's life cycle should maintain when Run() is called - private: - c10::optional commonType = c10::nullopt; - c10::optional commonShape = c10::nullopt; - bool resultTypeDefined = false; +protected: + OpCommandImpls *aclCmds = nullptr; // owned + OpCommandImpl *aclCmd = nullptr; + GraphCommandImpl graphCmd; - }; // class OpCommandBase +private: + c10::optional commonType = c10::nullopt; + c10::optional commonShape = c10::nullopt; + bool resultTypeDefined = false; - } // namespace native +}; // class OpCommandBase +} // namespace native } // namespace at_npu #endif \ No newline at end of file diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index 92633ab259150c3264b0988b5819cf0d720de926..4711a9d5f6a1739ef44fb828430b19920e3fd386 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -216,7 +216,7 @@ namespace at_npu // queue-enable } - void SetName(string &name) + void SetName(const string &name) { opName = name; } diff --git a/torch_npu/csrc/framework/graph/cache/GraphCacher.cpp b/torch_npu/csrc/framework/graph/cache/GraphCacher.cpp new file mode 100644 index 0000000000000000000000000000000000000000..16cadd4b59817f10355a22ac503dcb2cc54b0f18 --- /dev/null +++ b/torch_npu/csrc/framework/graph/cache/GraphCacher.cpp @@ -0,0 +1,78 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "GraphCacher.h" + +namespace at_npu { +namespace native { +hash_t GraphCache::GetGraphTopoHash( + const std::vector& inputs_topo_hash, + const std::vector& outputs_topo_hash) { + hash_t graph_topo_hash = multi_hash(inputs_topo_hash); + graph_topo_hash = multi_hash(graph_topo_hash, outputs_topo_hash); + return graph_topo_hash; +} + +hash_t GraphCache::GetGraphShapeHash( + const std::vector& inputs_shape_hash, + const std::vector& outputs_shape_hash) { + hash_t graph_shape_hash = multi_hash(inputs_shape_hash); + graph_shape_hash = multi_hash(graph_shape_hash, outputs_shape_hash); + return graph_shape_hash; +} + +hash_t GraphCache::GetTensorShapeHash( + const hash_t& topo_hash, + const ge::TensorDesc& tensor_desc) { + return multi_hash( + topo_hash, + tensor_desc.GetOriginShape().GetDimNum(), + tensor_desc.GetOriginShape().GetDims()); +} + +hash_t GraphCache::GetTensorTopoHash( + const Value& graph_value, + const ge::TensorDesc& tensor_desc) { + return multi_hash( + graph_value.GetValueHash(), + tensor_desc.GetDataType(), + tensor_desc.GetOriginFormat(), + tensor_desc.GetFormat()); +} + +c10::optional GraphCache::GetCacheGraphId( + const std::vector& inputs_topo_hash, + const std::vector& inputs_shape_hash, + const std::vector& outputs_topo_hash, + const std::vector& outputs_shape_hash, + uint32_t cur_graph_id) { + hash_t topo_hash = GetGraphTopoHash(inputs_topo_hash, outputs_topo_hash); + hash_t shape_hash = GetGraphShapeHash(inputs_shape_hash, outputs_shape_hash); + auto iter = graph_cache_.find(topo_hash); + if (iter != graph_cache_.end()) { + auto& shape_map = iter->second; + auto shape_iter = shape_map.find(shape_hash); + if (shape_iter != shape_map.end()) { + return shape_iter->second; + } else { + shape_map[shape_hash] = cur_graph_id; + } + } else { + graph_cache_[topo_hash] = {{shape_hash, cur_graph_id}}; + } + return c10::nullopt; +} +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/cache/GraphCacher.h b/torch_npu/csrc/framework/graph/cache/GraphCacher.h new file mode 100644 index 0000000000000000000000000000000000000000..2e030beac772699fd65d6f719d4b0621d5af4d15 --- /dev/null +++ b/torch_npu/csrc/framework/graph/cache/GraphCacher.h @@ -0,0 +1,61 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +#include + +namespace at_npu { +namespace native { + +using c10::npu::graph::Value; +using c10::npu::hash_utils::hash_t; +using c10::npu::hash_utils::multi_hash; + +class GraphCache { +public: + c10::optional GetCacheGraphId( + const std::vector& inputs_topo_hash, + const std::vector& inputs_shape_hash, + const std::vector& outputs_topo_hash, + const std::vector& outputs_shape_hash, + uint32_t cur_graph_id); + + static hash_t GetTensorTopoHash( + const Value& graph_value, + const ge::TensorDesc& tensor_desc); + + static hash_t GetTensorShapeHash( + const hash_t& topo_hash, + const ge::TensorDesc& tensor_desc); + +private: + static hash_t GetGraphTopoHash( + const std::vector& inputs_topo_hash, + const std::vector& outputs_topo_hash); + + static hash_t GetGraphShapeHash( + const std::vector& inputs_shape_hash, + const std::vector& outputs_shape_hash); + + std::unordered_map> graph_cache_; +}; +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/framework/graph/construct/GraphConstructor.cpp b/torch_npu/csrc/framework/graph/construct/GraphConstructor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..24da178417b40ec9325eab59a271f07683d45860 --- /dev/null +++ b/torch_npu/csrc/framework/graph/construct/GraphConstructor.cpp @@ -0,0 +1,190 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "GraphConstructor.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/framework/graph/util/GraphUtils.h" +#include "torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h" + +namespace at_npu { +namespace native { +using c10::npu::graph::NodeExtInfoType; +void GraphCommandImpl::SetName(const std::string& name) { + ir_node_ = std::make_shared(name); +} + +void GraphCommandImpl::AddInput() { + ++input_index_; +} + +void GraphCommandImpl::AddInput( + const at::Tensor& input, + const string& desc_name, + const string& real_dtype, + const c10::optional& sensitive_format) { + if (input.dim() == 0 && !input.is_npu()) { + return AddZeroDimInput(input, desc_name); + } + if (GraphUtils::IsTensorWithoutNode(input)) { + if (!input.storage().data()) { + auto storage_impl = input.storage().unsafeGetStorageImpl(); + size_t n_bytes = storage_impl->nbytes(); + auto data_ptr = c10_npu::NPUCachingAllocator::get()->allocate(n_bytes); + storage_impl->set_data_ptr(std::move(data_ptr)); + } + GraphUtils::SetDataOp(input.storage().unsafeGetStorageImpl()); + } + if (GraphUtils::IsDataTensor(input)) { + GraphUtils::RetainGraphDataTensor(input); + } + if (sensitive_format.has_value()) { + ir_node_->AddExtInfo( + NodeExtInfoType::SENSITIVE_FORMAT_OF_INPUT, + std::make_pair(desc_name, sensitive_format.value())); + } + + auto& cur_ir_value = GraphUtils::GetTensorIrValue(input); + if (!real_dtype.empty()) { + cur_ir_value.SetRealType(real_dtype); + } + ir_node_->AddInput( + input_index_++, cur_ir_value.GetCurNode(), cur_ir_value.GetValueIndex()); + ir_node_->UpdateNodeHash(GraphUtils::GetTensorIrValueHash(input), real_dtype); +} + +void GraphCommandImpl::AddInput( + const at::Scalar& input, + const at::ScalarType type, + CompileType compile_type) { + if (compile_type == CompileType::MEMORY_HOST_COMPILE_INDEPENDENT) { + uint32_t offset; + ReduceScalarValue(input, type, offset); + int deviceIndex = 0; + AT_NPU_CHECK(aclrtGetDevice(&deviceIndex)); + auto npu_scalar_tensor = at::empty({}, at::TensorOptions(at::kNPU, deviceIndex).dtype(type)); + GraphUtils::SetDataOp(npu_scalar_tensor.storage().unsafeGetStorageImpl()); + GraphUtils::RetainGraphDataTensor(npu_scalar_tensor); + auto& cur_ir_value = GraphUtils::GetTensorIrValue(npu_scalar_tensor); + cur_ir_value.SetScalarMemOffset(offset); + ir_node_->AddInput( + input_index_++, cur_ir_value.GetCurNode(), cur_ir_value.GetValueIndex()); + ir_node_->UpdateNodeHash(GraphUtils::GetTensorIrValueHash(npu_scalar_tensor)); + } else { + ir_node_->AddExtInfo( + NodeExtInfoType::INPUT_TYPE_SCALAR, + std::make_tuple(input_index_++, input, type)); + ir_node_->UpdateNodeHash(CalcuOpUtil::get_scalar_float_value(input), type); + } +} + +void GraphCommandImpl::AddInput( + const c10::IntArrayRef& dim_list, + const at::ScalarType to_type) { + vector val(dim_list.begin(), dim_list.end()); + ir_node_->AddExtInfo( + NodeExtInfoType::INPUT_TYPE_LIST_LONG, + std::make_tuple(input_index_++, std::move(val), to_type)); + ir_node_->UpdateNodeHash(dim_list, to_type); +} + +void GraphCommandImpl::AddOutput( + const at::Tensor& output, + const string& desc_name, + const string& real_type, + const c10::optional& sensitive_format) { + if (sensitive_format.has_value()) { + ir_node_->AddExtInfo( + NodeExtInfoType::SENSITIVE_FORMAT_OF_OUTPUT, + std::make_pair(desc_name, sensitive_format.value())); + } + if (!ir_node_->GetInputs().empty() || output_index_ != 0) { + Value value{ir_node_, output_index_++}; + if (!real_type.empty()) { + value.SetRealType(real_type); + } 
+ GraphUtils::SetTensorIrValue(output, value); + } else { + // op without input and has outputs should be treated as graph input + GraphUtils::SetTensorIrValue( + output, Value(ir_node_, ir_node_, output_index_++)); + GraphUtils::RetainGraphDataTensor(output); + } +} + +void GraphCommandImpl::AddDynamicInputRegFunc( + DynamicInputRegFunc func, + DyNumAndIndex num_and_index) { + ir_node_->AddExtInfo( + NodeExtInfoType::DYNAMIC_INPUT_FUNC, std::make_pair(func, num_and_index)); +} + +void GraphCommandImpl::ReduceScalarValue( + const at::Scalar& input, + const at::ScalarType type, + uint32_t& host_ptr_offset) { + if (at::ScalarType::Float == type) { + float value = input.toFloat(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(float), + host_ptr_offset); + } else if (at::ScalarType::Int == type) { + int value = input.toInt(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(int), + host_ptr_offset); + } else if (at::ScalarType::Long == type) { + int64_t value = input.toLong(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(int64_t), + host_ptr_offset); + } else if (at::ScalarType::Double == type) { + double value = input.toDouble(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(double), + host_ptr_offset); + } else if (at::ScalarType::Half == type) { + auto value = input.toHalf(); + ScalarMemContext::GetContext().AppendToHostMem( + reinterpret_cast(&value), + sizeof(at::ScalarType::Half), + host_ptr_offset); + } else { + AT_ERROR("scalar not support '", at::toString(type), "' type currently."); + } +} + +void GraphCommandImpl::AddZeroDimInput( + const at::Tensor& input, + const string& desc_name) { + at::ScalarType dtype = at::ScalarType::Undefined; + if (!input.unsafeGetTensorImpl()->is_wrapped_number()) { + dtype = input.scalar_type(); + } + TORCH_CHECK( + dtype != at::ScalarType::Undefined, "Cpu tensor scalar type is undefined"); + at::Scalar expect_scalar = CalcuOpUtil::ConvertTensorToScalar(input); + ir_node_->AddExtInfo( + NodeExtInfoType::INPUT_TYPE_SCALAR, + std::make_tuple(input_index_++, expect_scalar, dtype)); + ir_node_->UpdateNodeHash( + CalcuOpUtil::get_scalar_float_value(expect_scalar), dtype); +} + +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/framework/graph/construct/GraphConstructor.h b/torch_npu/csrc/framework/graph/construct/GraphConstructor.h new file mode 100644 index 0000000000000000000000000000000000000000..369aebe045e68215b366d73538f21f9cc632991c --- /dev/null +++ b/torch_npu/csrc/framework/graph/construct/GraphConstructor.h @@ -0,0 +1,142 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/framework/utils/NpuUtils.h" + +#include +#include +namespace at_npu { +namespace native { +using c10::npu::graph::DynamicInputRegFunc; +using c10::npu::graph::DyNumAndIndex; +using c10::npu::graph::NodeExtInfoType; +using c10::npu::graph::NodePtr; + +class OperatorAttrMaker { +public: + static void SetAttr(const string& attr_name, bool value, NodePtr node) { + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_BOOL, std::make_pair(attr_name, value)); + node->UpdateNodeHash(value); + } + + static void SetAttr(const string& attr_name, float value, NodePtr node) { + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_FLOAT, std::make_pair(attr_name, value)); + node->UpdateNodeHash(value); + } + + static void SetAttr(const string& attr_name, int64_t value, NodePtr node) { + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_LONG, std::make_pair(attr_name, value)); + node->UpdateNodeHash(value); + } + + static void SetAttr( + const string& attr_name, + const string& value, + NodePtr node) { + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_STRING, std::make_pair(attr_name, value)); + node->UpdateNodeHash(value); + } + + static void SetAttr( + const string& attr_name, + const c10::ArrayRef& value, + NodePtr node) { + vector val(value.begin(), value.end()); + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_LIST_LONG, + std::make_pair(attr_name, std::move(val))); + node->UpdateNodeHash(val); + } + + static void SetAttr( + const string& attr_name, + const c10::ArrayRef& value, + NodePtr node) { + vector val(value.begin(), value.end()); + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_LIST_FLOAT, + std::make_pair(attr_name, std::move(val))); + node->UpdateNodeHash(val); + } + + static void SetAttr( + const string& attr_name, + const c10::Scalar& value, + NodePtr node) { + float val = CalcuOpUtil::get_scalar_float_value(value); + node->AddExtInfo( + NodeExtInfoType::ATTR_TYPE_FLOAT, std::make_pair(attr_name, val)); + node->UpdateNodeHash(val); + } +}; + +class GraphCommandImpl { +public: + GraphCommandImpl() = default; + ~GraphCommandImpl() = default; + + void SetName(const std::string& name); + + void AddInput(); + + void AddInput( + const at::Tensor& input, + const string& desc_name, + const string& real_dtype, + const c10::optional& sensitive_format = c10::nullopt); + + void AddInput( + const c10::Scalar& input, + const at::ScalarType type, + CompileType compile_type); + + void AddInput(const c10::IntArrayRef& dim_list, const at::ScalarType to_type); + + void AddOutput( + const at::Tensor& output, + const string& desc_name = "", + const string& real_type = "", + const c10::optional& sensitive_format = c10::nullopt); + + void AddDynamicInputRegFunc( + DynamicInputRegFunc func, + DyNumAndIndex num_and_index); + + void ReduceScalarValue( + const at::Scalar& input, + const at::ScalarType type, + uint32_t& host_ptr_offset); + + template + void AddAttr(const string& attr_name, T value) { + OperatorAttrMaker::SetAttr(attr_name, value, ir_node_); + } + +private: + void AddZeroDimInput(const at::Tensor& input, const string& desc_name); + + uint32_t output_index_ = 0; + uint32_t input_index_ = 0; + NodePtr ir_node_ = nullptr; +}; +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/execute/GraphExecutor.cpp b/torch_npu/csrc/framework/graph/execute/GraphExecutor.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..496457527ae7f43e56174505a0f965b2a5944cbc --- /dev/null +++ b/torch_npu/csrc/framework/graph/execute/GraphExecutor.cpp @@ -0,0 +1,411 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "GraphExecutor.h" + +#include +#include +#include "torch_npu/csrc/framework/graph/util/ATenGeBridge.h" +#include "torch_npu/csrc/framework/graph/util/GraphUtils.h" +#include "torch_npu/csrc/framework/interface/AclInterface.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include +#include +#include "torch_npu/csrc/core/npu/register/OptionRegister.h" +#include "torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h" +#include + +#include + +// wait RECORD_HOST_FUNCTION to be added into plugin +#define RECORD_HOST_FUNCTION(a, b) ; +namespace at_npu { +namespace native { +namespace { +const char* kPytorchGraphName = "PytorchGraph"; +const std::string kDataNodeType = "Data"; +const char* kDataAttrIndex = "index"; + +static ge::Tensor MakeGeTensor( + const ge::TensorDesc& tensor_desc, + void* device_ptr, + const size_t nbytes) { + ge::Tensor ge_tensor{tensor_desc}; + ge_tensor.SetData( + reinterpret_cast(device_ptr), nbytes, [](uint8_t* device_ptr) { + return; + }); + return ge_tensor; +} +} // namespace + +uint32_t GraphExecutor::graph_id = 0; + +void GraphExecutor::RunGraph( + uint32_t graph_id, + CombinedInfo& inputs, + CombinedInfo& outputs) { + RECORD_HOST_FUNCTION("RunGraph", std::vector({})); + aclrtStream cal_stream = + const_cast(c10::npu::getCurrentNPUStream().stream()); + + auto ret = session_->RunGraphWithStreamAsync(graph_id, + cal_stream, + inputs.tensors, + outputs.tensors); + TORCH_CHECK(ret == 0, "Run Graph Failed!"); +} + +void GraphExecutor::ConstructAndExecuteGraph() { + RECORD_HOST_FUNCTION("ConstructAndExecuteGraph", std::vector({})); + auto ret = CheckDeviceIdAndInit(); + if (!ret) { + return; + } + TORCH_CHECK(session_ != nullptr, "Undefined session before run graph."); + // before construct graph and tensor, do H2D copy for scalar. 
+ ScalarMemContext::GetContext().ExecuteH2D(c10::npu::getCurrentNPUStream()); + CombinedInfo inputs = GetInputCombinedInfo(); + CombinedInfo outputs = GetOutputCombinedInfo(); + if (outputs.nodes.empty()) { + return; + } + + uint32_t cur_graph_id = graph_id + 1; + auto cached_graph_id = cacher_.GetCacheGraphId( + inputs.hash_of_topo_and_attr, + inputs.hash_of_shape, + outputs.hash_of_topo_and_attr, + outputs.hash_of_shape, + cur_graph_id); + + if (!cached_graph_id.has_value()) { + RECORD_HOST_FUNCTION("ConstructGraph", std::vector({})); + ConstructOps(outputs); + ge::Graph graph(kPytorchGraphName); + graph.SetInputs(GetInputOps()).SetOutputs(GetOutputOps()); + + TORCH_CHECK( + session_->AddGraph(cur_graph_id, graph) == 0, "AddGraph failed!"); + graph_id = cur_graph_id; + } else { + cur_graph_id = cached_graph_id.value(); + } + + RunGraph(cur_graph_id, inputs, outputs); + ScalarMemContext::GetContext().Reset(); + ResetGraphOutputs(); + if (!cached_graph_id.has_value()) { + // Data of new graph maybe inputs of old graphs, + // GE will change its attr + // so we need to refresh it + RefreshGraphInputs(); + } + ClearDataStore(); + return; +} + +void GraphExecutor::Init() { + auto device_id = std::to_string(init_device_id_); + std::map config = { + {ge::AscendString(ge::OPTION_EXEC_DEVICE_ID), + ge::AscendString(device_id.data())}, + {ge::AscendString(ge::OPTION_GRAPH_RUN_MODE), "0"}, + {ge::AscendString(ge::PRECISION_MODE.data()), "allow_fp32_to_fp16"}, + {ge::AscendString(ge::VARIABLE_MEMORY_MAX_SIZE), "1048576"} + }; + + static std::map + STRING_TO_COMPILE_OPT_MAP = { + {"ACL_OP_DEBUG_LEVEL", ge::OP_DEBUG_LEVEL}, + {"ACL_DEBUG_DIR", ge::DEBUG_DIR}, + {"ACL_OP_COMPILER_CACHE_MODE", ge::OP_COMPILER_CACHE_MODE}, + {"ACL_OP_COMPILER_CACHE_DIR", ge::OP_COMPILER_CACHE_DIR}, + {"ACL_OP_SELECT_IMPL_MODE", ge::OP_SELECT_IMPL_MODE}, + {"ACL_OPTYPELIST_FOR_IMPLMODE", ge::OPTYPELIST_FOR_IMPLMODE} + }; + + for (const auto& iter : STRING_TO_COMPILE_OPT_MAP) { + auto val = torch_npu::option::GetOption(iter.first); + if (val.has_value() && (!val.value().empty())) { + config.emplace(iter.second.data(), val.value().data()); + } + } + + // to be uncommented +// auto soc_name = c10::npu::acl::AclGetSocName(); +// if (soc_name != nullptr) { +// config.emplace(ge::AscendString(ge::SOC_VERSION.data()), soc_name); +// } + +// if (c10::npu::acl::IsExistQueryEventRecordedStatus()) { +// static const std::string HCOM_OPTIONS = "ge.exec.isUseHcom"; +// config.emplace(HCOM_OPTIONS.data(), "1"); +// } + + config["ge.session_device_id"] = ge::AscendString(device_id.data()); + config["ge.exec.reuseZeroCopyMemory"] = ge::AscendString("1"); + session_ = std::make_unique(config); + C10_NPU_CHECK(aclrtSetDevice(init_device_id_)); + if (session_ == nullptr) { + AT_ERROR("Create session failed!"); + } +} + +void GraphExecutor::Finalize() { + if (GraphExecutor::GetInstance().session_ != nullptr) { + session_.reset(); + session_ = nullptr; + } +} + +void GraphExecutor::ConstructOps(CombinedInfo& output) { + RECORD_HOST_FUNCTION("ConstructOps", std::vector({})); + std::set searched_nodes; + for (const auto& output_node : output.nodes) { + if (searched_nodes.find(output_node) != searched_nodes.end()) { + continue; + } + searched_nodes.insert(output_node); + std::stack stack_node; + stack_node.push(output_node); + while (!stack_node.empty()) { + auto top_node = stack_node.top(); + ATenGeBridge::CheckAndBuildGeOpForNode(top_node); + stack_node.pop(); + const auto& inputs = top_node->GetInputs(); + for (const auto& input : inputs) { + 
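+ // depth-first over producers: build the producer's GE op if it does not exist yet, wire it into this node's input slot, and push producers that have not been visited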
ATenGeBridge::CheckAndBuildGeOpForNode(input.peer_output_node); + top_node->GetGeOp()->SetInput( + input.input_index, + *(input.peer_output_node->GetGeOp()), + input.peer_output_index); + if (searched_nodes.find(input.peer_output_node) != + searched_nodes.end()) { + continue; + } + stack_node.push(input.peer_output_node); + searched_nodes.insert(input.peer_output_node); + } + } + } +} + +std::vector GraphExecutor::GetInputOps() { + std::vector ops; + auto input_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllInputStorages(init_device_id_); + for (size_t index = 0; index < input_storages.size(); ++index) { + auto &graph_desc = input_storages[index]->get_mutable_npu_graph_desc(); + auto data_node = graph_desc.graph_value.GetDataNode(); + auto op_ptr = data_node.value()->GetGeOp(); + if (data_node.value()->GetOpType() == kDataNodeType) { + if (op_ptr == nullptr) { + data_node.value()->SetGeOp(std::make_shared()); + op_ptr = data_node.value()->GetGeOp(); + } + // storageImpl has no dtype since 1.8, need a solution +// auto op_desc = ATenGeBridge::InferGeTenosrDesc( +// input_storages[index]->get_npu_desc(), +// input_storages[index]->dtype(), +// graph_desc.graph_value.GetRealDtype(), +// true); +// // x and y are the input and output names of Data IR +// op_ptr->UpdateInputDesc("x", op_desc); +// op_ptr->UpdateOutputDesc("y", op_desc); +// op_ptr->SetAttr(kDataAttrIndex, static_cast(index)); + } + ops.push_back(*op_ptr); + } + return ops; +} + +GeOutPutOpType GraphExecutor::GetOutputOps() { + GeOutPutOpType ops_and_idx; + auto output_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllStorageOfLiveTensors(init_device_id_); + for (auto& output_storage : output_storages) { + if (GraphUtils::IsTensorWithoutNode(output_storage) || + GraphUtils::IsDataTensor(output_storage)) { + continue; + } + const auto& graph_value = + output_storage->get_mutable_npu_graph_desc().graph_value; + auto op_ptr = graph_value.GetCurNode()->GetGeOp(); + ops_and_idx.emplace_back( + *op_ptr, std::vector{graph_value.GetValueIndex()}); + } + return ops_and_idx; +} + +CombinedInfo GraphExecutor::GetInputCombinedInfo() { + RECORD_HOST_FUNCTION("GetInputCombinedInfo", std::vector({})); + CombinedInfo input_infos; + auto input_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllInputStorages(init_device_id_); + for (size_t index = 0; index < input_storages.size(); ++index) { + c10::NpuGraphDesc& graph_desc = + input_storages[index]->get_mutable_npu_graph_desc(); + auto data_node = graph_desc.graph_value.GetDataNode(); + TORCH_CHECK(data_node.has_value(), "Inputs Tensor must have data node"); + // storageImpl has no dtype since 1.8, need a solution +// ge::TensorDesc tensor_desc = ATenGeBridge::InferGeTenosrDesc( +// input_storages[index]->get_npu_desc(), +// input_storages[index]->dtype(), +// graph_desc.graph_value.GetRealDtype()); +// +// if (data_node.value()->GetOpType() == kDataNodeType) { +// ge::Tensor ge_tensor = +// PrepareInputTensor(input_storages[index], tensor_desc); +// input_infos.tensors.push_back(std::move(ge_tensor)); +// } +// hash_t topo_hash = +// GraphCache::GetTensorTopoHash(graph_desc.graph_value, tensor_desc); +// input_infos.hash_of_topo_and_attr.push_back(topo_hash); +// hash_t shape_hash = GraphCache::GetTensorShapeHash(topo_hash, tensor_desc); +// input_infos.hash_of_shape.push_back(shape_hash); + } + return input_infos; +} + +CombinedInfo GraphExecutor::GetOutputCombinedInfo() { + 
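ConstructOps walks the recorded IR from every graph output with an explicit stack: build the GE op for the node on top, then for each incoming edge build the producer's op, wire the consumer input to the producer output, and push producers that have not been visited yet. The same traversal reduced to a standalone sketch with a toy node type (ToyNode, BuildIfNeeded and the wired_from vector stand in for Node, CheckAndBuildGeOpForNode and SetInput):

#include <memory>
#include <set>
#include <stack>
#include <vector>

struct ToyNode {
  struct Edge { ToyNode* peer; int input_index; int peer_output_index; };
  std::vector<Edge> inputs;            // edges to producer nodes
  bool op_built = false;
  std::vector<const ToyNode*> wired_from;
};

void BuildIfNeeded(ToyNode* n) { n->op_built = true; }   // idempotent, like the real builder

// Same shape as GraphExecutor::ConstructOps: start from every graph output,
// walk producers depth-first, build each op, and wire consumer inputs to
// producer outputs as edges are visited.
void ConstructOps(const std::vector<ToyNode*>& outputs) {
  std::set<ToyNode*> searched;
  for (ToyNode* out : outputs) {
    if (!searched.insert(out).second) continue;
    std::stack<ToyNode*> stk;
    stk.push(out);
    while (!stk.empty()) {
      ToyNode* top = stk.top();
      BuildIfNeeded(top);
      stk.pop();
      for (auto& e : top->inputs) {
        BuildIfNeeded(e.peer);                 // producer op must exist before wiring
        top->wired_from.push_back(e.peer);     // stands in for SetInput(index, peer, idx)
        if (searched.insert(e.peer).second) {
          stk.push(e.peer);
        }
      }
    }
  }
}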
RECORD_HOST_FUNCTION("GetOutputCombinedInfo", std::vector({})); + CombinedInfo output_infos; + auto output_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllStorageOfLiveTensors(init_device_id_); + for (auto& output_storage : output_storages) { + if (GraphUtils::IsTensorWithoutNode(output_storage) || + GraphUtils::IsDataTensor(output_storage)) { + c10::NpuGraphDesc graph_desc = output_storage->get_npu_graph_desc(); + // the tensor of scalar_merge_copy will enter here because is has't node, + // only the length of the out queue is increased, nothing else. + if ((output_storage->data() == nullptr) && + (!graph_desc.graph_value.GetScalarMemOffset().has_value())) { + size_t nbytes = output_storage->nbytes(); + auto data_ptr = c10_npu::NPUCachingAllocator::get()->allocate(nbytes); + output_storage->set_data_ptr(std::move(data_ptr)); + } + continue; + } + auto& graph_value = + output_storage->get_mutable_npu_graph_desc().graph_value; + TORCH_CHECK(graph_value.HashNode(), "output must have node!"); + output_infos.nodes.push_back(graph_value.GetCurNode()); + // storageImpl has no dtype since 1.8, need a solution +// ge::TensorDesc tensor_desc = ATenGeBridge::InferGeTenosrDesc( +// output_storage->get_npu_desc(), +// output_storage->dtype(), +// graph_value.GetRealDtype()); +// auto ge_tensor = PrepareOutputTenosr(output_storage, tensor_desc); +// output_infos.tensors.push_back(std::move(ge_tensor)); +// hash_t topo_hash = GraphCache::GetTensorTopoHash(graph_value, tensor_desc); +// output_infos.hash_of_topo_and_attr.emplace_back(topo_hash); +// +// hash_t shape_hash = GraphCache::GetTensorShapeHash(topo_hash, tensor_desc); +// output_infos.hash_of_shape.push_back(shape_hash); + } + return output_infos; +} + +ge::Tensor GraphExecutor::PrepareInputTensor( + const c10::StorageImpl* const storage, + const ge::TensorDesc& desc) { + c10::NpuGraphDesc& graph_desc = storage->get_mutable_npu_graph_desc(); + auto device_ptr = storage->data(); + size_t nbytes = storage->nbytes(); + auto addr_offset = graph_desc.graph_value.GetScalarMemOffset(); + if (addr_offset.has_value()) { + device_ptr = ScalarMemContext::GetContext().GetDeviceMemBuffer() + addr_offset.value(); + } + return MakeGeTensor(desc, device_ptr, nbytes); +} + +ge::Tensor GraphExecutor::PrepareOutputTenosr( + c10::StorageImpl* storage, + const ge::TensorDesc& desc) { + c10::NpuGraphDesc& graph_desc = storage->get_mutable_npu_graph_desc(); + TORCH_CHECK( + graph_desc.graph_value.HashNode(), + "graph desc in storage must have node"); + size_t nbytes = storage->nbytes(); + c10::DataPtr data_ptr; + + // In the case of in-place operator + // we can not call set_data_ptr + // for this will cause the old data ptr to be released + // and if one value have data node which has no device memory + // we should malloc for it + if (!(graph_desc.graph_value.GetDataNode().has_value() && + storage->data() != nullptr)) { + data_ptr = c10_npu::NPUCachingAllocator::get()->allocate(nbytes); + storage->set_data_ptr(std::move(data_ptr)); + } + return MakeGeTensor(desc, storage->data(), nbytes); +} + +void GraphExecutor::ResetGraphOutputs() { + RECORD_HOST_FUNCTION("ResetGraphOutputs", std::vector({})); + auto output_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllStorageOfLiveTensors(init_device_id_); + std::for_each( + output_storages.begin(), output_storages.end(), [](c10::StorageImpl* x) { + if (!GraphUtils::IsTensorWithoutNode(x) && + !GraphUtils::IsDataTensor(x)) { + GraphUtils::ResetOp(x); + } + }); +} + +void 
GraphExecutor::RefreshGraphInputs() { + RECORD_HOST_FUNCTION("RefreshGraphInputs", std::vector({})); + auto input_storages = c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetAllInputStorages(init_device_id_); + std::for_each( + input_storages.begin(), input_storages.end(), [&](c10::StorageImpl* x) { + GraphUtils::SetDataOp(x); + }); +} + +void GraphExecutor::ClearDataStore() { + RECORD_HOST_FUNCTION("ClearDataStore", std::vector({})); + c10::npu::graph::NpuGraphContextManager::GetInstance().EraseInputStorage( + init_device_id_); +} + +bool GraphExecutor::CheckDeviceIdAndInit() { + RECORD_HOST_FUNCTION("CheckDeviceIdAndInit", std::vector({})); + auto devices_has_input = + c10::npu::graph::NpuGraphContextManager::GetInstance() + .GetDevicesHasLiveTensor(); + if (devices_has_input.empty()) { + return false; + } else if (devices_has_input.size() > 1) { + AT_ERROR("In graph mode, you can not construct graph in different device"); + } + + init_device_id_ = devices_has_input.front(); + if (session_ == nullptr) { + Init(); + } + + if (init_device_id_ != devices_has_input.front()) { + AT_ERROR( + "In graph mode, you can not change " + "device id after first graph launch"); + } + return true; +} +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/framework/graph/execute/GraphExecutor.h b/torch_npu/csrc/framework/graph/execute/GraphExecutor.h new file mode 100644 index 0000000000000000000000000000000000000000..41e01e5e5be6125c92b5d0b274d68fe8ea6a336a --- /dev/null +++ b/torch_npu/csrc/framework/graph/execute/GraphExecutor.h @@ -0,0 +1,119 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "torch_npu/csrc/framework/graph/cache/GraphCacher.h" +#include +#include +#include + +#ifdef SUCCESS +#undef SUCCESS +#endif +#ifdef FAILED +#undef FAILED +#endif +#include + +#include + +namespace at_npu { +namespace native { +using c10::npu::graph::NodePtr; +using c10::npu::hash_utils::hash_t; + +using GeOutPutOpType = + std::vector>>; + +struct CombinedInfo { + std::vector nodes; + std::vector tensors; + std::vector hash_of_topo_and_attr; + std::vector hash_of_shape; +}; + +class TORCH_API GraphExecutor { +public: + GraphExecutor(const GraphExecutor&) = delete; + GraphExecutor(GraphExecutor&&) = delete; + GraphExecutor& operator=(const GraphExecutor&) = delete; + GraphExecutor& operator=(GraphExecutor&&) = delete; + ~GraphExecutor() = default; + + void ConstructAndExecuteGraph(); + + static GraphExecutor& GetInstance() { + static GraphExecutor instance; + return instance; + } + + void Finalize(); + + private: + GraphExecutor() = default; + + void Init(); + + /** + * NB + * Currently, in graph mode, there are two limitations + * 1, after your first graph launching, you can not change device, + * the init_device_id_ will be the id + * of first device which has input tensor. 
+ * + * 2, you can not construct graph in two different device. + */ + bool CheckDeviceIdAndInit(); + + void RunGraph( + uint32_t graph_id, + CombinedInfo& inputs, + CombinedInfo& outputs); + + static void ConstructOps(CombinedInfo& output); + + std::vector GetInputOps(); + + GeOutPutOpType GetOutputOps(); + + CombinedInfo GetInputCombinedInfo(); + + CombinedInfo GetOutputCombinedInfo(); + + static ge::Tensor PrepareInputTensor( + const c10::StorageImpl* const storage, + const ge::TensorDesc& desc); + + static ge::Tensor PrepareOutputTenosr( + c10::StorageImpl* storage, + const ge::TensorDesc& desc); + + void ResetGraphOutputs(); + + void RefreshGraphInputs(); + + void ClearDataStore(); + + static uint32_t graph_id; + + c10::DeviceIndex init_device_id_ = -1; + + std::unique_ptr session_ = nullptr; + + GraphCache cacher_; +}; +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.cpp b/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d32e54954c51ed605cb5df06bffe55ef12a87af8 --- /dev/null +++ b/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.cpp @@ -0,0 +1,86 @@ +#include "ScalarMemoryOps.h" + +namespace at_npu { +namespace native { + +void ScalarMemContext::Init() { + cpu_tensor_ = at::empty( + {HOST_MEM_INIT_SIZE}, + at::TensorOptions().pinned_memory(true).device(at::kCPU).dtype(at::kByte)); + host_mem_valid_len_ = 0; + inited_ = true; +} + +void ScalarMemContext::ExecuteH2D(c10::npu::NPUStream stream) { + if (!inited_) { + return; + } + + if (CHECK_MEM_MAX_SIZE <= host_mem_valid_len_) { + AT_ERROR("Checked the device memory size >= 64M."); + return; + } + int deviceIndex = 0; + AT_NPU_CHECK(aclrtGetDevice(&deviceIndex)); + npu_tensor_ = at::empty( + {host_mem_valid_len_}, + at::TensorOptions().device(at::kNPU, deviceIndex).dtype(at::kByte)); + + AT_NPU_CHECK( + aclrtMemcpyAsync( + npu_tensor_.data_ptr(), + host_mem_valid_len_, + cpu_tensor_.data_ptr(), + host_mem_valid_len_, + ACL_MEMCPY_HOST_TO_DEVICE, + stream)); + AT_NPU_CHECK(THNPUCachingHostAllocator_recordEvent(cpu_tensor_.data_ptr(), stream)); + + // reset pin memory + cpu_tensor_.reset(); +} + +void ScalarMemContext::CheckForExpand(uint32_t input_valid_len) { + if (input_valid_len <= (cpu_tensor_.nbytes() - host_mem_valid_len_)) { + return; + } + + auto tmp_tensor = cpu_tensor_; + uint32_t expand_tensor_size = tmp_tensor.nbytes() + HOST_MEM_INIT_SIZE; + cpu_tensor_ = at::empty( + {expand_tensor_size}, + at::TensorOptions().pinned_memory(true).device(at::kCPU).dtype(at::kByte)); + + AT_NPU_CHECK( + aclrtMemcpy( + cpu_tensor_.data_ptr(), + host_mem_valid_len_, + tmp_tensor.data_ptr(), + host_mem_valid_len_, + ACL_MEMCPY_HOST_TO_HOST)); +} + +void ScalarMemContext::AppendToHostMem( + uint8_t* host_ptr, + uint32_t data_len, + uint32_t& data_offset) { + if (!inited_) { + Init(); + } + + uint32_t valid_len = DEVICE_VALID_LEN(data_len); + CheckForExpand(valid_len); + data_offset = host_mem_valid_len_; + std::memcpy( + reinterpret_cast(cpu_tensor_.data_ptr()) + data_offset, + host_ptr, data_len); + host_mem_valid_len_ += valid_len; +} + +void ScalarMemContext::Reset() { + npu_tensor_.reset(); + inited_ = false; +} + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h b/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h new file mode 100644 index 
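ScalarMemContext above batches host scalars: AppendToHostMem copies each value into a growable pinned host tensor at an aligned offset and hands that offset back (PrepareInputTensor later turns the offset into a device address), and ExecuteH2D ships the whole buffer with a single aclrtMemcpyAsync per graph launch instead of one copy per scalar. A rough standalone sketch of the staging pattern (ToyScalarStager and RoundUpToSlot are illustrative; the real code uses a pinned CPU tensor, ACL copies and a recorded host-allocator event to keep the pinned buffer alive until the async copy finishes):

#include <cstdint>
#include <cstring>
#include <vector>

class ToyScalarStager {
 public:
  // Copy `len` bytes in and return the slot offset the caller should remember;
  // graph nodes store this offset rather than a raw pointer.
  uint32_t Append(const void* src, uint32_t len) {
    const uint32_t slot = RoundUpToSlot(len);     // DEVICE_VALID_LEN, see the header below
    const uint32_t offset = used_;
    while (used_ + slot > buf_.size()) {
      buf_.resize(buf_.size() + kChunk);          // grow, preserving already-staged data
    }
    std::memcpy(buf_.data() + offset, src, len);
    used_ += slot;
    return offset;
  }

  // Stand-in for ExecuteH2D: one bulk device copy for the whole batch.
  const std::vector<uint8_t>& Flush() const { return buf_; }

 private:
  static uint32_t RoundUpToSlot(uint32_t len) { return ((len + 32 + 511) / 512) * 512; }
  static constexpr uint32_t kChunk = 512 * 10240; // HOST_MEM_INIT_SIZE (5 MB)
  std::vector<uint8_t> buf_ = std::vector<uint8_t>(kChunk);
  uint32_t used_ = 0;
};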
0000000000000000000000000000000000000000..700d74e37741c93efdb44ca74d7bd111c666b377 --- /dev/null +++ b/torch_npu/csrc/framework/graph/scalar/ScalarMemoryOps.h @@ -0,0 +1,68 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "torch_npu/csrc/framework/allocator/THNPUCachingHostAllocator.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" + +namespace at_npu { +namespace native { + +#define HOST_MEM_INIT_SIZE (512 * 10240) // 5M +#define CHECK_MEM_MAX_SIZE (65536 * 1024) // 64M +#define DEVICE_VALID_LEN(a) ((((a) + 32 + 511) / 512) * 512) + +class C10_API ScalarMemContext { +public: + static ScalarMemContext &GetContext() { + static ScalarMemContext ctx; + return ctx; + } + + ScalarMemContext(const ScalarMemContext&) = delete; + ScalarMemContext(ScalarMemContext&&) = delete; + ScalarMemContext& operator=(const ScalarMemContext&) = delete; + ScalarMemContext& operator=(ScalarMemContext&&) = delete; + + uint8_t* GetDeviceMemBuffer() { + return reinterpret_cast(npu_tensor_.data_ptr()); + } + + void AppendToHostMem( + uint8_t* host_ptr, + uint32_t data_len, + uint32_t& data_offset); + + void ExecuteH2D(c10::npu::NPUStream stream); + + void Reset(); + +private: + void Init(); + + void CheckForExpand(uint32_t input_valid_len); + + ScalarMemContext() = default; + + bool inited_ = false; + at::Tensor cpu_tensor_; + at::Tensor npu_tensor_; + uint32_t host_mem_valid_len_ = 0; +}; + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/util/ATenGeBridge.cpp b/torch_npu/csrc/framework/graph/util/ATenGeBridge.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c4dc1e20bdc76daa0e1d96b49334941832165a93 --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/ATenGeBridge.cpp @@ -0,0 +1,280 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
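The DEVICE_VALID_LEN macro in the header above reserves 32 bytes of headroom per entry and then rounds up to the next multiple of 512, so every staged scalar occupies at least one 512-byte slot (the patch does not say what the 32-byte headroom is for, so that part is left as-is here). The arithmetic restated as a constexpr with a few sanity checks:

#include <cstdint>

// Same arithmetic as DEVICE_VALID_LEN(a): ((a + 32 + 511) / 512) * 512.
constexpr uint32_t DeviceValidLen(uint32_t a) {
  return ((a + 32 + 511) / 512) * 512;
}

static_assert(DeviceValidLen(1)   == 512,  "a 1-byte scalar still takes one 512B slot");
static_assert(DeviceValidLen(480) == 512,  "480 + 32 bytes fit exactly in one slot");
static_assert(DeviceValidLen(481) == 1024, "481 + 32 bytes spill into a second slot");
static_assert(DeviceValidLen(512) == 1024, "a 512-byte payload needs two slots once headroom is added");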
+ +#include "ATenGeBridge.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include + +#include + +namespace at_npu { +namespace native { +namespace { +std::map kScalarTypeToGeDType{ + {at::ScalarType::Byte, ge::DataType::DT_UINT8}, + {at::ScalarType::Char, ge::DataType::DT_INT8}, + {at::ScalarType::Bool, ge::DataType::DT_BOOL}, + {at::ScalarType::Double, ge::DataType::DT_DOUBLE}, + {at::ScalarType::Float, ge::DataType::DT_FLOAT}, + {at::ScalarType::Half, ge::DataType::DT_FLOAT16}, + {at::ScalarType::Short, ge::DataType::DT_INT16}, + {at::ScalarType::Int, ge::DataType::DT_INT32}, + {at::ScalarType::Long, ge::DataType::DT_INT64}, +}; + +std::map kRealDtypeToGeType { + {"uint16", ge::DataType::DT_UINT16}, +}; + +at::Tensor ConstructCpuTenosr(const c10::Scalar& scalar_input, c10::ScalarType type) { + return scalar_to_tensor(scalar_input).to(type); +} + +at::Tensor ConstructCpuTenosr( + const std::vector& list_input, + c10::ScalarType dtype) { + auto cpu_tensor = at::from_blob( + const_cast(reinterpret_cast(list_input.data())), + {list_input.size()}, + c10::TensorOptions(at::kCPU).dtype(at::kLong)); + if (dtype != at::kLong) { + return cpu_tensor.to(dtype); + } + return cpu_tensor; +} +} // namespace + +template <> +void ATenGeBridge::SetGeOpAttr> + (const c10::any& attr_val, ge::OperatorPtr ge_op) { + auto attr = TryToGetAnyValue>(attr_val); + ge_op->SetAttr(attr.first.c_str(), ge::AscendString(attr.second.c_str())); +} + +ge::DataType ATenGeBridge::GetGeDType(c10::ScalarType type) { + auto iter = kScalarTypeToGeDType.find(type); + if (iter == kScalarTypeToGeDType.end()) { + AT_ERROR("Unsupported convert this ATen DType: %s to Ge DType", type); + } + return iter->second; +} + +ge::DataType ATenGeBridge::GetGeDType(caffe2::TypeMeta type_meta) { + auto aten_dtype = c10::typeMetaToScalarType(type_meta); + return GetGeDType(aten_dtype); +} + +ge::DataType ATenGeBridge::GetGeDType(const std::string& real_dtype) { + auto iter = kRealDtypeToGeType.find(real_dtype); + if (iter == kRealDtypeToGeType.end()) { + AT_ERROR("Unsupported convert this ATen DType: %s to Ge DType", real_dtype); + } + return iter->second; +} + +ge::Shape ATenGeBridge::GetGeShape(c10::ArrayRef vec) { + return ge::Shape(std::vector(vec.begin(), vec.end())); +} + +ge::TensorDesc ATenGeBridge::InferGeTenosrDesc( + const c10::NPUStorageDesc& storage_desc, + const caffe2::TypeMeta& type_meta, + const c10::optional& real_dtype, + bool is_op_desc) { + ge::TensorDesc desc; + + if (real_dtype.has_value()) { + desc.SetDataType(ATenGeBridge::GetGeDType(real_dtype.value())); + } else { + desc.SetDataType(ATenGeBridge::GetGeDType(type_meta)); + } + + desc.SetPlacement(ge::kPlacementDevice); + desc.SetOriginShape( + ATenGeBridge::GetGeShape(storage_desc.base_sizes_)); + desc.SetOriginFormat(ge::Format(storage_desc.origin_format_)); + + /* + * NB + * AOE does not support inner format + * So we set Operator description as origin format and shape + * Then we can dump ge graph to begin offline auto tune + * + * data1 data2 data1 data2 + * (nchw/nchw) (nchw/nchw) (nchw/5hd) (nchw/fz) + * \ / \ / + * \ / Param:input_tensors{tensor1(nchw/5hd), tensor2(nchw/fz)} \ / + * \ / -----------------RunGraphWithStreamAsync-----------------> \ / + * conv2d conv2d + * | | + * | | + * netoutput netoutput + * + * In graph, we set data node as data1:nchw(origin format) / nchw (format) + * and data2: nchw(origin format) / nchw (format) + * when we run graph, we give input tensors as tensor1:nchw(origin format) / 5hd(format) + * and tensor2:nchw(origin 
format) / fz(format) + * In interface RunGraphWithStreamAsync, ge will refresh data description with input tensor description + * to support inner format + * In aoe scene, we dump raw graph without inner format + */ + if (is_op_desc) { + desc.SetShape(ATenGeBridge::GetGeShape(storage_desc.base_sizes_)); + desc.SetFormat(ge::Format(storage_desc.origin_format_)); + } else { + desc.SetShape(ATenGeBridge::GetGeShape(storage_desc.storage_sizes_)); + desc.SetFormat(ge::Format(storage_desc.npu_format_)); + } + + return desc; +} + +template +void ATenGeBridge::SetGeOpConstInput( + const c10::any& const_input, + ge::OperatorPtr ge_op) { + auto const_input_tuple = + ATenGeBridge::TryToGetAnyValue(const_input); + at::Tensor cpu_tensor = ConstructCpuTenosr( + std::get<1>(const_input_tuple), std::get<2>(const_input_tuple)); + auto ge_data_type = GetGeDType(std::get<2>(const_input_tuple)); + ge::TensorDesc ge_tensor_desc{ + ge::Shape(cpu_tensor.sizes().vec()), ge::Format::FORMAT_ND, ge_data_type}; + ge::Tensor ge_tenosr{ + ge_tensor_desc, + reinterpret_cast(cpu_tensor.data_ptr()), + cpu_tensor.nbytes()}; + + auto const_op = std::make_shared(); + const_op->set_attr_value(ge_tenosr); + ge_op->SetInput(std::get<0>(const_input_tuple), *const_op, 0); +} + +void ATenGeBridge::SetSensitiveFormat( + const c10::any& sensitive_format, + ge::OperatorPtr ge_op, + NodeExtInfoType ext_type) { + auto sensitive_format_pair = + TryToGetAnyValue>(sensitive_format); + if (ext_type == NodeExtInfoType::SENSITIVE_FORMAT_OF_INPUT) { + auto tmp_desc = + ge_op->GetInputDescByName(sensitive_format_pair.first.c_str()); + tmp_desc.SetFormat(ge::Format(sensitive_format_pair.second)); + tmp_desc.SetOriginFormat(ge::Format(sensitive_format_pair.second)); + ge_op->UpdateInputDesc(sensitive_format_pair.first.c_str(), tmp_desc); + } else { + auto tmp_desc = + ge_op->GetOutputDescByName(sensitive_format_pair.first.c_str()); + tmp_desc.SetFormat(ge::Format(sensitive_format_pair.second)); + tmp_desc.SetOriginFormat(ge::Format(sensitive_format_pair.second)); + ge_op->UpdateOutputDesc(sensitive_format_pair.first.c_str(), tmp_desc); + } +} + +void ATenGeBridge::AddNodeExtInfoIntoGeOp( + c10::ArrayRef> ext_info, + ge::OperatorPtr ge_op) { + for (const auto& info : ext_info) { + switch (info.first) { + case NodeExtInfoType::ATTR_TYPE_BOOL: + SetGeOpAttr>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_LONG: + SetGeOpAttr>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_FLOAT: + SetGeOpAttr>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_STRING: + SetGeOpAttr>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_LIST_LONG: + SetGeOpAttr>>(info.second, ge_op); + break; + case NodeExtInfoType::ATTR_TYPE_LIST_FLOAT: + SetGeOpAttr>>(info.second, ge_op); + break; + case NodeExtInfoType::INPUT_TYPE_SCALAR: + SetGeOpConstInput>( + info.second, ge_op); + break; + case NodeExtInfoType::INPUT_TYPE_LIST_LONG: + SetGeOpConstInput, c10::ScalarType>>( + info.second, ge_op); + break; + case NodeExtInfoType::SENSITIVE_FORMAT_OF_INPUT: + SetSensitiveFormat( + info.second, ge_op, NodeExtInfoType::SENSITIVE_FORMAT_OF_INPUT); + break; + case NodeExtInfoType::SENSITIVE_FORMAT_OF_OUTPUT: + SetSensitiveFormat( + info.second, ge_op, NodeExtInfoType::SENSITIVE_FORMAT_OF_OUTPUT); + break; + default: + AT_ERROR( + "Has no method to process node ext info type: %d", + static_cast::type>( + info.first)); + } + } +} + +void ATenGeBridge::PorcessDynamicInputReg( + NodePtr node, + ge::OperatorPtr& ge_op, + std::string 
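AddNodeExtInfoIntoGeOp above replays attributes and const inputs that were captured as (NodeExtInfoType, c10::any) pairs while the op was being recorded; the enum tag tells it which concrete pair type to any_cast back out before calling SetAttr or SetInput. The pattern in miniature with std::any and a toy operator type (ExtTag, ToyOp and ReplayAttr are illustrative names, not the patch's API):

#include <any>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

enum class ExtTag { AttrBool, AttrLong, AttrString };
using ExtInfo = std::pair<ExtTag, std::any>;

struct ToyOp {  // stands in for ge::OperatorPtr->SetAttr(...)
  template <typename T>
  void SetAttr(const std::string& name, const T&) { std::cout << "SetAttr(" << name << ")\n"; }
};

template <typename Pair>
void ReplayAttr(const std::any& boxed, ToyOp& op) {
  const auto& attr = std::any_cast<const Pair&>(boxed);   // throws on tag/type mismatch
  op.SetAttr(attr.first, attr.second);
}

// The tag is the single source of truth for the boxed type, exactly like the
// switch over NodeExtInfoType in AddNodeExtInfoIntoGeOp.
void Replay(const std::vector<ExtInfo>& infos, ToyOp& op) {
  for (const auto& [tag, boxed] : infos) {
    switch (tag) {
      case ExtTag::AttrBool:   ReplayAttr<std::pair<std::string, bool>>(boxed, op); break;
      case ExtTag::AttrLong:   ReplayAttr<std::pair<std::string, int64_t>>(boxed, op); break;
      case ExtTag::AttrString: ReplayAttr<std::pair<std::string, std::string>>(boxed, op); break;
    }
  }
}

int main() {
  ToyOp op;
  Replay({{ExtTag::AttrLong, std::make_pair(std::string("axis"), int64_t{1})},
          {ExtTag::AttrBool, std::make_pair(std::string("keep_dims"), true)}},
         op);
}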
op_name) { + auto& ext_info = node->GetExtInfo(); + auto it = std::find_if( + ext_info.begin(), + ext_info.end(), + [](const std::pair& item) { + return item.first == NodeExtInfoType::DYNAMIC_INPUT_FUNC; + }); + if (it != ext_info.end()) { + auto func_and_para = + TryToGetAnyValue>( + it->second); + ge_op = func_and_para.first(func_and_para.second, op_name); + + // no need to process it anymore + ext_info.erase(it); + } + return; +} + +void ATenGeBridge::CheckAndBuildGeOpForNode(NodePtr node) { + if (node->GetGeOp() != nullptr) { + return; + } + static uint64_t op_index = 0; + const std::string op_type = node->GetOpType(); + TORCH_CHECK( + ge::OperatorFactory::IsExistOp(op_type.c_str()), + "Cur op type: %s is not exit", + op_type); + std::string op_name = op_type + std::to_string(op_index++); + ge::OperatorPtr ge_op = nullptr; + PorcessDynamicInputReg(node, ge_op, op_name); + if (ge_op == nullptr) { + ge_op = std::make_shared( + ge::OperatorFactory::CreateOperator(op_name.c_str(), op_type.c_str())); + } + AddNodeExtInfoIntoGeOp(node->GetExtInfo(), ge_op); + node->SetGeOp(ge_op); + return; +} + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/util/ATenGeBridge.h b/torch_npu/csrc/framework/graph/util/ATenGeBridge.h new file mode 100644 index 0000000000000000000000000000000000000000..f2faf60507c75a1406cc212fd3f0d3a1d36b70f1 --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/ATenGeBridge.h @@ -0,0 +1,89 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
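CheckAndBuildGeOpForNode above builds a node's ge::Operator at most once (the node caches it) and names every built operator "<op type><global counter>" so no two operators in a session collide, while PorcessDynamicInputReg only pre-creates the operator when a dynamic-input registration function was recorded. The build-once-and-name part as a small sketch (ToyGeOp and ToyNode are stand-ins for the real node and operator types):

#include <cstdint>
#include <memory>
#include <string>

struct ToyGeOp { std::string name; };

struct ToyNode {
  std::string op_type;
  std::shared_ptr<ToyGeOp> ge_op;    // null until the first build
};

void CheckAndBuildOp(ToyNode& node) {
  if (node.ge_op != nullptr) {
    return;                          // already built on an earlier visit
  }
  static uint64_t op_index = 0;      // process-wide counter, never reused
  node.ge_op = std::make_shared<ToyGeOp>(ToyGeOp{node.op_type + std::to_string(op_index++)});
}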
+ +#pragma once + +#include +#include +#include +#include +#include + +namespace at_npu { +namespace native { + +using c10::npu::graph::NodeExtInfoType; +using c10::npu::graph::DyNumAndIndex; +using c10::npu::graph::DynamicInputRegFunc; +using c10::npu::graph::NodePtr; + +class ATenGeBridge { +public: + static ge::DataType GetGeDType(c10::ScalarType type); + + static ge::DataType GetGeDType(caffe2::TypeMeta type_meta); + + static ge::DataType GetGeDType(const std::string& real_dtype); + + static ge::Shape GetGeShape(c10::ArrayRef vec); + + static ge::TensorDesc InferGeTenosrDesc( + const c10::NPUStorageDesc& storage_desc, + const caffe2::TypeMeta& type_meta, + const c10::optional& real_dtype, + bool is_op_desc = false); + + static void CheckAndBuildGeOpForNode(NodePtr node); + +private: + template + static T TryToGetAnyValue(const c10::any& any_val) { + T val; + try { + val = c10::any_cast(any_val); + } catch (c10::bad_any_cast &bd) { + AT_ERROR(bd.what(), typeid(T).name()); + } + return val; + } + + template + static void SetGeOpConstInput( + const c10::any& const_input, + ge::OperatorPtr ge_op); + + static void SetSensitiveFormat( + const c10::any& sensitive_format, + ge::OperatorPtr ge_op, + NodeExtInfoType ext_type); + + static void PorcessDynamicInputReg( + NodePtr node, + ge::OperatorPtr& ge_op, + std::string op_name); + + template + static void SetGeOpAttr(const c10::any& attr_val, ge::OperatorPtr ge_op) { + AttrType attr = TryToGetAnyValue(attr_val); + ge_op->SetAttr(attr.first.c_str(), attr.second); + } + + static void AddNodeExtInfoIntoGeOp( + c10::ArrayRef> ext_info, + ge::OperatorPtr ge_op); +}; +} // namespace native +} // namespace at_npu diff --git a/torch_npu/csrc/framework/graph/util/GraphModeGuard.h b/torch_npu/csrc/framework/graph/util/GraphModeGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..c8eadd73b34c2059bf4e0a55b7953544a5df13c8 --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/GraphModeGuard.h @@ -0,0 +1,52 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "torch_npu/csrc/framework/graph/execute/GraphExecutor.h" +#include "torch_npu/csrc/core/npu/NPURunMode.h" + +namespace at_npu { +namespace native { +class GraphModeGuard { +public: + GraphModeGuard() = delete; + GraphModeGuard(const GraphModeGuard& other) = delete; + GraphModeGuard(GraphModeGuard&& other) = delete; + GraphModeGuard& operator=(const GraphModeGuard& other) = delete; + GraphModeGuard& operator=(GraphModeGuard&& other) = delete; + + explicit GraphModeGuard(c10_npu::ModeKind mode) : mode_(mode) { + ori_mode_ = c10_npu::NpuRunMode::IsGraphMode() + ? 
c10_npu::ModeKind::GRAPH_MODE + : c10_npu::ModeKind::SINGLE_OP_MODE; + if ((ori_mode_ == c10_npu::ModeKind::GRAPH_MODE) && + (mode_ == c10_npu::ModeKind::SINGLE_OP_MODE)) { + GraphExecutor::GetInstance().ConstructAndExecuteGraph(); + } + c10_npu::NpuRunMode::SetNpuRunMode(mode_); + } + + ~GraphModeGuard() { + c10_npu::NpuRunMode::SetNpuRunMode(ori_mode_); + } + +private: + c10_npu::ModeKind ori_mode_; + c10_npu::ModeKind mode_; +}; +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/util/GraphUtils.cpp b/torch_npu/csrc/framework/graph/util/GraphUtils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81a3430d5d2640f433842cb88326f3163364eb71 --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/GraphUtils.cpp @@ -0,0 +1,94 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "GraphUtils.h" + +#include + +namespace at_npu { +namespace native { +Value& GraphUtils::GetTensorIrValue(const at::Tensor& tensor) { + auto storage = tensor.storage().unsafeGetStorageImpl(); + TORCH_CHECK(storage != nullptr, "Storage is null"); + return storage->get_mutable_npu_graph_desc().graph_value; +} + +hash_t GraphUtils::GetTensorIrValueHash(const at::Tensor& tensor) { + return GetTensorIrValue(tensor).GetValueHash(); +} + +void GraphUtils::SetTensorIrValue(c10::StorageImpl* storage, const Value& value) { + TORCH_CHECK(storage != nullptr, "Storage is null"); + auto& npu_graph_desc = storage->get_mutable_npu_graph_desc(); + npu_graph_desc.graph_value.UpdateFromOther(value); + return; +} + +void GraphUtils::SetTensorIrValue( + const at::Tensor& tensor, + const Value& value) { + SetTensorIrValue(tensor.storage().unsafeGetStorageImpl(), value); + return; +} + +void GraphUtils::SetDataOp(c10::StorageImpl* storage) { + TORCH_CHECK(storage != nullptr, "Storage is null"); + auto data_node = std::make_shared("Data"); + auto data_value = Value(data_node, data_node, 0); + SetTensorIrValue(storage, data_value); +} + +void GraphUtils::SetDataOp(const at::Tensor& tensor) { + SetDataOp(tensor.storage().unsafeGetStorageImpl()); +} + +void GraphUtils::ResetOp(c10::StorageImpl* storage) { + TORCH_CHECK(storage != nullptr, "Storage is null"); + storage->get_mutable_npu_graph_desc().graph_value.ResetValue(); +} +void GraphUtils::ResetOp(at::Tensor& tensor) { + ResetOp(tensor.storage().unsafeGetStorageImpl()); +} + +bool GraphUtils::IsDataTensor(const c10::StorageImpl* storage) { + TORCH_CHECK(storage != nullptr, "Storage is null"); + auto& value = storage->get_mutable_npu_graph_desc().graph_value; + auto cur_node = value.GetCurNode(); + TORCH_CHECK(cur_node != nullptr, "Cur storage does not have node"); + return (cur_node->GetOpType() == "Data"); +} + +bool GraphUtils::IsDataTensor(const at::Tensor& tensor) { + return IsDataTensor(tensor.storage().unsafeGetStorageImpl()); +} + +bool GraphUtils::IsTensorWithoutNode(const c10::StorageImpl* storage) { + 
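GraphModeGuard above gives call sites such as copy_d2h and _local_scalar_dense a scoped way to drop to single-op execution: entering the guard from graph mode first flushes the pending graph, and the destructor restores whatever mode was active before. The same contract as a self-contained sketch (ToyModeGuard, Mode and the flush callback are illustrative):

#include <functional>

enum class Mode { SingleOp, Graph };

class ToyModeGuard {
 public:
  ToyModeGuard(Mode& current, Mode wanted, std::function<void()> flush_graph)
      : current_(current), saved_(current) {
    if (saved_ == Mode::Graph && wanted == Mode::SingleOp) {
      flush_graph();               // the recorded graph must run before any eager op
    }
    current_ = wanted;
  }
  ~ToyModeGuard() { current_ = saved_; }

  ToyModeGuard(const ToyModeGuard&) = delete;
  ToyModeGuard& operator=(const ToyModeGuard&) = delete;

 private:
  Mode& current_;
  Mode saved_;
};

// Typical call site, analogous to the d2h copy path: force single-op execution
// for the duration of a host read, then fall back to the previous mode.
void ReadBackToHost(Mode& run_mode) {
  ToyModeGuard guard(run_mode, Mode::SingleOp, [] { /* ConstructAndExecuteGraph() */ });
  // ... issue the synchronous device-to-host copy here ...
}

Because the flush happens in the constructor, the guard is also what makes reads like tensor.cpu() observe up-to-date values while graph mode is on.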
TORCH_CHECK(storage != nullptr, "Storage is null"); + return !storage->get_npu_graph_desc().graph_value.HashNode(); +} + +bool GraphUtils::IsTensorWithoutNode(const at::Tensor& tensor) { + return IsTensorWithoutNode(tensor.storage().unsafeGetStorageImpl()); +} + +void GraphUtils::RetainGraphDataTensor(const at::Tensor& data_tensor) { + auto storage = data_tensor.storage().unsafeGetStorageImpl(); + auto storage_ptr = c10::intrusive_ptr::reclaim(storage); + c10::npu::graph::NpuGraphContextManager::GetInstance().AddInputStorage( + storage_ptr); + storage_ptr.release(); +} +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/graph/util/GraphUtils.h b/torch_npu/csrc/framework/graph/util/GraphUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..682d5ac018ef9e6576dd91f975263c7f147a18ba --- /dev/null +++ b/torch_npu/csrc/framework/graph/util/GraphUtils.h @@ -0,0 +1,53 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace at_npu { +namespace native { + +using c10::npu::graph::Value; +using c10::npu::hash_utils::hash_t; +class GraphUtils { +public: + static Value& GetTensorIrValue(const at::Tensor& tensor); + + static hash_t GetTensorIrValueHash(const at::Tensor& tensor); + + static void SetTensorIrValue(c10::StorageImpl* storage, const Value& value); + static void SetTensorIrValue(const at::Tensor& tensor, const Value& value); + + static void SetDataOp(c10::StorageImpl* storage); + + static void SetDataOp(const at::Tensor& tensor); + + static void ResetOp(c10::StorageImpl* storage); + static void ResetOp(at::Tensor& tensor); + + static bool IsDataTensor(const c10::StorageImpl* storage); + static bool IsDataTensor(const at::Tensor& tensor); + + static bool IsTensorWithoutNode(const c10::StorageImpl* storage); + static bool IsTensorWithoutNode(const at::Tensor& tensor); + + static void RetainGraphDataTensor(const at::Tensor& data_tensor); +}; +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index aeac53150dbc31ac8dca0edff0c826a9528d9ee9..cf6b6da127821cc997f01517e3ed85c5cef6b37a 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -40,8 +40,10 @@ #include "third_party/acl/inc/acl/acl.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" +#include "torch_npu/csrc/framework/graph/execute/GraphExecutor.h" #include "torch_npu/csrc/profiler/cann_profiling.h" #include "torch_npu/csrc/profiler/e2e_profiler.h" +#include "torch_npu/csrc/core/npu/NPURunMode.h" static PyObject* THNPModule_initExtension(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS @@ -175,6 +177,44 @@ PyObject * THNPModule_setStream_wrap(PyObject *self, PyObject *obj) END_HANDLE_TH_ERRORS } +PyObject* 
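RetainGraphDataTensor above borrows the raw StorageImpl* as an intrusive_ptr just long enough to register it (reclaim adopts one reference, release hands it back, so the refcount is unchanged), and the context manager keeps only weak references; that is what lets GetAllStorageOfLiveTensors enumerate whatever is still alive at launch time without the registry extending any tensor's lifetime. The ownership shape, sketched with std::weak_ptr standing in for c10::weak_intrusive_ptr (ToyStorage and ToyGraphContext are illustrative):

#include <cstddef>
#include <memory>
#include <vector>

struct ToyStorage { std::size_t nbytes = 0; };

class ToyGraphContext {
 public:
  void AddStorage(const std::shared_ptr<ToyStorage>& s) {
    storages_.push_back(s);                     // weak: does not keep the storage alive
  }

  // Equivalent of GetAllStorageOfLiveTensors: drop expired entries and hand
  // back strong pointers to whatever is still alive at graph-launch time.
  std::vector<std::shared_ptr<ToyStorage>> LiveStorages() {
    std::vector<std::shared_ptr<ToyStorage>> live;
    for (auto it = storages_.begin(); it != storages_.end();) {
      if (auto s = it->lock()) { live.push_back(std::move(s)); ++it; }
      else { it = storages_.erase(it); }
    }
    return live;
  }

 private:
  std::vector<std::weak_ptr<ToyStorage>> storages_;
};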
THNPModule_enable_graph_mode_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + pybind11::gil_scoped_release no_gil; + c10_npu::NpuRunMode::SetNpuRunMode(c10_npu::ModeKind::GRAPH_MODE); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_disable_graph_mode_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + pybind11::gil_scoped_release no_gil; + at_npu::native::GraphExecutor::GetInstance().ConstructAndExecuteGraph(); + c10_npu::NpuRunMode::SetNpuRunMode(c10_npu::ModeKind::SINGLE_OP_MODE); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_launch_graph_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + pybind11::gil_scoped_release no_gil; + at_npu::native::GraphExecutor::GetInstance().ConstructAndExecuteGraph(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_is_graph_mode_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + pybind11::gil_scoped_release no_gil; + auto is_graph_mode = c10_npu::NpuRunMode::IsGraphMode(); + if (is_graph_mode) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + END_HANDLE_TH_ERRORS +} + + PyObject * THNPModule_emptyCache(PyObject *_unused, PyObject *noargs) { HANDLE_TH_ERRORS @@ -476,6 +516,11 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_getCurrentStream", (PyCFunction)THNPModule_getCurrentStream_wrap, METH_O, nullptr}, {"_npu_getDefaultStream", (PyCFunction)THNPModule_getDefaultStream_wrap, METH_O, nullptr}, {"_npu_setStream", (PyCFunction)THNPModule_setStream_wrap, METH_O, nullptr}, + {"_npu_setStream", (PyCFunction)THNPModule_setStream_wrap, METH_O, nullptr}, + {"_npu_enable_graph_mode", (PyCFunction)THNPModule_enable_graph_mode_wrap, METH_NOARGS, nullptr}, + {"_npu_disable_graph_mode", (PyCFunction)THNPModule_disable_graph_mode_wrap, METH_NOARGS, nullptr}, + {"_npu_launch_graph", (PyCFunction)THNPModule_launch_graph_wrap, METH_NOARGS, nullptr}, + {"_npu_is_graph_mode", (PyCFunction)THNPModule_is_graph_mode_wrap, METH_NOARGS, nullptr}, {"_npu_emptyCache", (PyCFunction) THNPModule_emptyCache, METH_NOARGS, nullptr}, {"_npu_memoryStats", (PyCFunction) THNPModule_memoryStats, METH_O, nullptr}, {"_npu_resetAccumulatedMemoryStats", (PyCFunction) THNPModule_resetAccumulatedMemoryStats, METH_O, nullptr}, diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index c370fd20ea8bd8df2d507c6400d4fb9d99d0cc74..5b2686b359ab187fa4d522dc5d5c2d5728282122 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -44,6 +44,7 @@ from .memory import (_free_mutex, caching_allocator_alloc, caching_allocator_del max_memory_allocated, memory_reserved, max_memory_reserved, memory_cached, max_memory_cached, memory_snapshot, memory_summary) from .streams import Stream, Event +from .graph import is_graph_mode, disable_graph_mode, enable_graph_mode, launch_graph from . 
import profiler from .npu_frontend_enhance import (set_option, set_aoe, profile, prof_init, prof_start, prof_stop, prof_finalize, profileConfig) \ No newline at end of file diff --git a/torch_npu/npu/graph.py b/torch_npu/npu/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..439ec26fc255d213260a6bfc66ec6eadf382e57b --- /dev/null +++ b/torch_npu/npu/graph.py @@ -0,0 +1,22 @@ +import torch_npu +from .utils import _lazy_init + + +def enable_graph_mode(): + torch_npu._C._npu_enable_graph_mode() + + +def disable_graph_mode(): + _lazy_init() + torch_npu._C._npu_disable_graph_mode() + + +def is_graph_mode() -> bool: + return torch_npu._C._npu_is_graph_mode() + + +def launch_graph(): + _lazy_init() + if not is_graph_mode(): + raise RuntimeError("Npu run mode must be graph mode when launch graph") + torch_npu._C._npu_launch_graph() \ No newline at end of file diff --git a/torch_npu/utils/module.py b/torch_npu/utils/module.py index 332995804fd56d4517d4ebb128740d828caaf6cf..c9a295ef213d1122625b6f2ff14d694e75031061 100644 --- a/torch_npu/utils/module.py +++ b/torch_npu/utils/module.py @@ -35,8 +35,14 @@ def npu(self, device=None): if device is None: device = torch.device("npu") if torch_npu.npu.is_available(): + # Ref [cast weight in single op mode] + is_graph_mode = torch_npu.npu.is_graph_mode() + if is_graph_mode: + torch_npu.npu.disable_graph_mode() with torch.no_grad(): self.cast_weight(device) + if is_graph_mode: + torch_npu.npu.enable_graph_mode() return self._apply(lambda t: t.npu(device)) @@ -55,7 +61,14 @@ def to(self, *args, **kwargs): "if a complex module does not work as expected.") if torch_npu.npu.is_available(): with torch.no_grad(): - self.cast_weight(device) + # Ref [cast weight in single op mode] + is_graph_mode = torch_npu.npu.is_graph_mode() + if is_graph_mode: + torch_npu.npu.disable_graph_mode() + with torch.no_grad(): + self.cast_weight(device) + if is_graph_mode: + torch_npu.npu.enable_graph_mode(); def convert(t): if convert_to_format is not None and t.dim() == 4:
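Put together, the Python surface added in graph.py and Module.cpp is meant to be driven roughly as follows. This is a usage sketch rather than code from the patch: which ops actually lower to graph nodes depends on the operator plumbing elsewhere in the plugin, and host reads such as .cpu() force a flush of their own through GraphModeGuard.

import torch
import torch_npu

torch_npu.npu.enable_graph_mode()      # subsequent NPU ops are recorded as IR nodes
assert torch_npu.npu.is_graph_mode()

x = torch.randn(2, 3).npu()
y = x * 2 + 1                          # queued into the graph, not yet executed

torch_npu.npu.launch_graph()           # build (or reuse a cached) GE graph and run it
print(y.cpu())                         # d2h copy drops to single-op mode via GraphModeGuard

torch_npu.npu.disable_graph_mode()     # flush anything pending, back to eager single-op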