diff --git a/torch_npu/contrib/function/roll.py b/torch_npu/contrib/function/roll.py index 97037c8d0754427942742c28de5ba0f6568ee3ac..550064e693c5ce5fc1110ef7f2b666ceb739a531 100644 --- a/torch_npu/contrib/function/roll.py +++ b/torch_npu/contrib/function/roll.py @@ -30,7 +30,7 @@ _roll_with_index_select = _RollWithIndexSelect.apply def _get_roll_index(H, W, shifts, device='cpu'): index = torch.arange(0, H * W).reshape(H, W) index_fp = torch.roll(index, shifts=shifts, dims=(0, 1)).reshape(-1).long() - index_bp_dict = {i:idx for idx, i in enumerate(index_fp.numpy().tolist())} + index_bp_dict = {i: idx for idx, i in enumerate(index_fp.numpy().tolist())} index_bp_list = [index_bp_dict[i] for i in range(H * W)] index_bp = torch.LongTensor(index_bp_list) return [index_fp.to(device), index_bp.to(device)] diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp index bcb89e6c88354a1a290b320c0d9003417b0339f7..c5f6f913d44632c09ebea3978b0f1bdbccbaf82a 100644 --- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp +++ b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp @@ -93,8 +93,10 @@ void GetExclusiveAffinityCPU() offset = find_offset->second; } c10_npu::CoreIdRange cpu_range = parseAffinityCPU(affinity_cpu); - int length = (cpu_range.end - cpu_range.start + 1) / same_num; - c10_npu::CoreIdRange exclusiveAffinityCpu = {cpu_range.start + offset * length, (cpu_range.start + length - 1) + offset * length}; + unsigned int length = (cpu_range.end - cpu_range.start + 1) / static_cast<unsigned int>(same_num); + c10_npu::CoreIdRange exclusiveAffinityCpu = { + cpu_range.start + static_cast<unsigned int>(offset) * length, + (cpu_range.start + length - 1) + static_cast<unsigned int>(offset) * length}; offsetMap[affinity_cpu] = offset + 1; CardIdAffinityCPU[card_id] = exclusiveAffinityCpu; } diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index 8916a70fc9998b2562a6471fd2cae992086fbe7a..cc817f5506ae2e654caec02cd1a7bc1923382152 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -123,7 +123,7 @@ int64_t DriverVersionToNum(std::string versionStr) ((release + 1) * 10000) + ((RCVersion + 1) * 100 + 5000) + ((TVersion + 1) * 100) - - (alphaVersion ? 1 : 0) * (100 - alphaVersion) + + (alphaVersion != 0 ? 1 : 0) * (100 - alphaVersion) + (bVersion - 1) + patch; return num; } diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index d3425f6f44b63a8467a0e56174058836796bce9c..5517812a0108c265a5171213f09631b335732351 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -2502,7 +2502,7 @@ private: // Repeat GC until we reach reclaim > target size. bool block_freed = true; - while (gc_reclaimed < target_size && block_freed == true && freeable_block_count > 0) { + while (gc_reclaimed < target_size && block_freed && freeable_block_count > 0) { // Free blocks exceeding this age threshold first. double age_threshold = total_age / freeable_block_count; // Stop iteration if we can no longer free a block. diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index c33f51fbc895f989609cd6ef0953678d7b0e1cdf..a4e14d2232ab30f7a3cd4e991c904f404b18f6a5 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -23,8 +23,8 @@ C10_NPU_API std::mutex* getFreeMutex(); // block inside of already allocated area.
class FreeMemoryCallback { public: - virtual ~FreeMemoryCallback(){}; - virtual bool Execute() = 0; + virtual ~FreeMemoryCallback(){}; + virtual bool Execute() = 0; }; C10_DECLARE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); diff --git a/torch_npu/csrc/core/npu/NPUEventManager.h b/torch_npu/csrc/core/npu/NPUEventManager.h index c01491aa033752413dd880329445a9eb2d8556e2..ac7f0176e0f52daf9f88fdd39bdb2f5b0d546f5b 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.h +++ b/torch_npu/csrc/core/npu/NPUEventManager.h @@ -22,7 +22,7 @@ public: void DecreaseUnrecordedCount(aclrtEvent event); bool IsEventRecorded(aclrtEvent event); void ClearUnrecordedCount(); - ~NPUEventManager() {} + ~NPUEventManager() {} private: void run(aclrtEvent event); diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 9c667f1fdb120c30696c49c1a911702332ffb54b..5d130ec760dbee43d428b3e4b5a94c5c54d534a5 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -132,10 +132,12 @@ const std::string c10_npu_check_error_message(std::string& errmsg) std::regex ws_regex("[\\s\\t\\n\\r]+"); content = std::regex_replace(content, ws_regex, " "); - if (!content.empty() && content.front() == ' ') + if (!content.empty() && content.front() == ' ') { content.erase(0, 1); - if (!content.empty() && content.back() == ' ') + } + if (!content.empty() && content.back() == ' ') { content.pop_back(); + } return content; } diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 73e2bb7ca1d19b06af7bd6b1c34861ce7a5467c7..bd29315e057b8e14ee9189bde7c802f3e73558b9 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -249,7 +249,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) // occur. 
#ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { gilState = PyEval_SaveThread(); } #endif @@ -523,7 +523,7 @@ void Repository::Enqueue(void *cur_paras) uint64_t u = 1; SetWriteWorking(true); - while (ret == false && (GetStatus() == RUN || GetStatus() == INIT)) { + while (!ret && (GetStatus() == RUN || GetStatus() == INIT)) { ret = WriteQueue(cur_paras); if (ret == false) { SetWriteWorking(false); @@ -531,7 +531,7 @@ void Repository::Enqueue(void *cur_paras) if (IsFullQueue()) { #ifndef BUILD_LIBTORCH // double check the current thread hold a Gil lock - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { Py_BEGIN_ALLOW_THREADS s = eventfd_read(efd_write, &u); Py_END_ALLOW_THREADS } else { diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 35e6e526b17c08fa40bbf9259ae2d7e666e0d5cc..4411760ab4497d45c9237fc67c34498196b2b603 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -259,7 +259,7 @@ static uint32_t get_idx(std::atomic& counter) { auto raw_idx = counter++; static int StreamsPerPool = GetStreamsPerPool(); - return raw_idx % StreamsPerPool; + return raw_idx % static_cast(StreamsPerPool); } static uint32_t get_sync_launch_stream_idx(std::atomic& counter) diff --git a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp index 084f1df577a58ba352c76361211f5741df8ab4ef..39d19b0b628e1a191aab9ff4182a7e5bd1f6c657 100644 --- a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp @@ -47,7 +47,7 @@ void* registerSvmMem(void* ptr, size_t size) void* mallocHostSwapMemory(size_t size) { if (!initialized) { - kAlignSize = sysconf(_SC_PAGESIZE); + kAlignSize = static_cast(sysconf(_SC_PAGESIZE)); initialized = true; } size = (size + kAlignSize - 1) & ~(kAlignSize - 1); diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp index 4e0fce02fb311d239c95a913939e41b554333b02..3fedb9d387ef61702a7414912b5572a8e187e7cd 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.cpp +++ b/torch_npu/csrc/core/npu/NpuVariables.cpp @@ -39,7 +39,8 @@ static std::map socVersionMap = { {"Ascend910_9372", SocVersion::Ascend910_9372}, {"Ascend910_9362", SocVersion::Ascend910_9362}}; -void SetSocVersion(const char* const socVersion) { +void SetSocVersion(const char* const socVersion) +{ if (socVersion == nullptr || g_curSocVersion != SocVersion::UnsupportedSocVersion) { return; diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index d85dd2211dae3d89134fff556cd55b95e36b963b..0ac77517aa9d965731f73ae41e934ab9b66db525 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -410,7 +410,7 @@ aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) { } TORCH_CHECK(func_backup, "Failed to find function", "aclrtSynchronizeStreamWithTimeout and aclrtSynchronizeStream", PROF_ERROR(ErrCode::NOT_FOUND)); return func_backup(stream); - } + } } aclError AclrtDestroyStreamForce(aclrtStream stream) { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 3564e5196cc3ce6bce26f9c67850d0e2cc425ef5..4b6707b8495b9a89c500e06f160b42950e9ae6fb 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ 
b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -277,8 +277,8 @@ void NpuSysCtrl::RegisterLazyFn(const option::OptionCallBack& call_, const std:: lazy_fn_.emplace_back(std::make_pair(call_, in)); } -void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, - ReleasePriority priority) { +void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, ReleasePriority priority) +{ const auto& iter = this->release_fn_.find(priority); if (iter != release_fn_.end()) { release_fn_[priority].emplace_back(release_fn); diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 99c6dc6f22aee259c44014bdddc383c092a0fa87..5c3a7729907ab060521f24fb9d941e5d1fe966c8 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -576,7 +576,7 @@ Example:: Default settings return everything - i.e. contains HCCL comm dumps and collective traces. )"); - Py_RETURN_TRUE; + Py_RETURN_TRUE; } // c10d methods on torch._C diff --git a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp index c6b0e64641a65ee303b95e8dd3a8c87883bbcd4a..1626499a80b2870fb4cad1c8c726595e003212ff 100644 --- a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp +++ b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp @@ -18,35 +18,36 @@ namespace at_npu { namespace native { +void ForceAclnn::RegisterOp(const std::string &list) +{ + if (list.empty()) { + return; + } -void ForceAclnn::RegisterOp(const std::string &list) { - if (list.empty()) { - return; - } - - auto value = list; - std::string delimiter = ","; - auto start = 0U; - auto end = value.find(delimiter); - std::string token; - while (end != std::string::npos) { + auto value = list; + std::string delimiter = ","; + auto start = 0U; + auto end = value.find(delimiter); + std::string token; + while (end != std::string::npos) { + token = value.substr(start, end - start); + if (!token.empty()) { + force_aclnn_op_list_.insert(token); + } + start = end + delimiter.size(); + end = value.find(delimiter, start); + } token = value.substr(start, end - start); if (!token.empty()) { - force_aclnn_op_list_.insert(token); + force_aclnn_op_list_.insert(token); } - start = end + delimiter.size(); - end = value.find(delimiter, start); - } - token = value.substr(start, end - start); - if (!token.empty()) { - force_aclnn_op_list_.insert(token); - } - return; + return; } -bool ForceAclnn::IsForceAclnnOp(const std::string &op_name) const { - bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); - return ret; +bool ForceAclnn::IsForceAclnnOp(const std::string &op_name) const +{ + bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); + return ret; } } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.h b/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.h index 3c839aeb68d73cd347db637892518c32534842ac..57c8e71e7fbb7c9e06c57ece3f54f2c254476dee 100644 --- a/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.h +++ b/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.h @@ -8,32 +8,25 @@ namespace torch::inductor { // NOTICE: Following APIs are subject to change due to active development // We provide NO BC guarantee for these APIs class TORCH_API AOTIModelContainerRunnerNpu : public AOTIModelContainerRunner { - public: - // @param device_str: cuda device string, e.g. 
"cuda", "cuda:0" - AOTIModelContainerRunnerNpu( - const std::string& model_so_path, - size_t num_models = 1, - const std::string& device_str = "npu", - const std::string& cubin_dir = ""); +public: + // @param device_str: cuda device string, e.g. "cuda", "cuda:0" + AOTIModelContainerRunnerNpu(const std::string &model_so_path, size_t num_models = 1, + const std::string &device_str = "npu", const std::string &cubin_dir = ""); - ~AOTIModelContainerRunnerNpu(); + ~AOTIModelContainerRunnerNpu(); - std::vector run( - const std::vector& inputs, - void* stream_handle = nullptr) override; + std::vector run(const std::vector &inputs, void *stream_handle = nullptr) override; - std::vector run_with_npu_stream( - std::vector& inputs, - c10_npu::NPUStream npu_stream); + std::vector run_with_npu_stream(std::vector &inputs, c10_npu::NPUStream npu_stream); - void init_proxy_executor(); + void init_proxy_executor(); - void set_proxy_executor(AOTIProxyExecutorHandle handle); + void set_proxy_executor(AOTIProxyExecutorHandle handle); - private: - std::string model_so_path_; - bool init_flag_; - std::unique_ptr proxy_executor_npu_; +private: + std::string model_so_path_; + bool init_flag_; + std::unique_ptr proxy_executor_npu_; }; } // namespace torch::inductor \ No newline at end of file diff --git a/torch_npu/csrc/inductor/aoti_runtime/arrayref_tensor.h b/torch_npu/csrc/inductor/aoti_runtime/arrayref_tensor.h index c5a989b5ca9397eb6226e85aa6b9d9b6517fe3cb..7659b3d2217a0a76980383f9244afff3b315af82 100644 --- a/torch_npu/csrc/inductor/aoti_runtime/arrayref_tensor.h +++ b/torch_npu/csrc/inductor/aoti_runtime/arrayref_tensor.h @@ -14,371 +14,341 @@ namespace torch::aot_inductor { // adapted. template class MiniArrayRef final { - public: - using iterator = T*; - using const_iterator = const T*; - using size_type = size_t; - using value_type = T; - - using reverse_iterator = std::reverse_iterator; - - private: - /// The start of the array, in an external buffer. - T* Data; - - /// The number of elements. - size_type Length; - - public: - /// @name Constructors - /// @{ - - /// Construct an empty MiniArrayRef. - /* implicit */ constexpr MiniArrayRef() : Data(nullptr), Length(0) {} - - /// Construct an MiniArrayRef from a single element. - // TODO Make this explicit - constexpr MiniArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} - - /// Construct an MiniArrayRef from a pointer and length. - constexpr MiniArrayRef(T* data, size_t length) : Data(data), Length(length) {} - - /// Construct an MiniArrayRef from a range. - constexpr MiniArrayRef(T* begin, T* end) : Data(begin), Length(end - begin) {} - - template < - typename Container, - typename = std::enable_if_t().data())>, - T*>>> - /* implicit */ MiniArrayRef(Container& container) - : Data(container.data()), Length(container.size()) {} - - /// Construct an MiniArrayRef from a std::vector. - // The enable_if stuff here makes sure that this isn't used for - // std::vector, because MiniArrayRef can't work on a std::vector - // bitfield. - template - /* implicit */ MiniArrayRef(const std::vector& Vec) - : Data(Vec.data()), Length(Vec.size()) { - static_assert( - !std::is_same_v, - "MiniArrayRef cannot be constructed from a std::vector bitfield."); - } - - /// Construct an MiniArrayRef from a std::array - template - /* implicit */ constexpr MiniArrayRef(std::array& Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an MiniArrayRef from a C array. 
- template - // NOLINTNEXTLINE(*c-array*) - /* implicit */ constexpr MiniArrayRef(T (&Arr)[N]) : Data(Arr), Length(N) {} - - // /// Construct an MiniArrayRef from an empty C array. - /* implicit */ constexpr MiniArrayRef(const volatile void* Arr) - : Data(nullptr), Length(0) {} - - /// Construct an MiniArrayRef from a std::initializer_list. - /* implicit */ constexpr MiniArrayRef(const std::initializer_list& Vec) - : Data( - std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) - : std::begin(Vec)), - Length(Vec.size()) {} - - /// @} - /// @name Simple Operations - /// @{ - - constexpr iterator begin() const { - return Data; - } - constexpr iterator end() const { - return Data + Length; - } - - // These are actually the same as iterator, since MiniArrayRef only - // gives you const iterators. - constexpr const_iterator cbegin() const { - return Data; - } - constexpr const_iterator cend() const { - return Data + Length; - } - - constexpr reverse_iterator rbegin() const { - return reverse_iterator(end()); - } - constexpr reverse_iterator rend() const { - return reverse_iterator(begin()); - } - - /// empty - Check if the array is empty. - constexpr bool empty() const { - return Length == 0; - } - - constexpr T* data() const { - return Data; - } - - /// size - Get the array size. - constexpr size_t size() const { - return Length; - } - - /// equals - Check for element-wise equality. - constexpr bool equals(MiniArrayRef RHS) const { - return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); - } - - /// @} - /// @name Operator Overloads - /// @{ - constexpr const T& operator[](size_t Index) const { - return Data[Index]; - } - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - std::enable_if_t, MiniArrayRef>& operator=( - // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) - U&& Temporary) = delete; - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - std::enable_if_t, MiniArrayRef>& operator=( - std::initializer_list) = delete; +public: + using iterator = T *; + using const_iterator = const T *; + using size_type = size_t; + using value_type = T; + + using reverse_iterator = std::reverse_iterator; + +private: + // / The start of the array, in an external buffer. + T *Data; + + // / The number of elements. + size_type Length; + +public: + // / @name Constructors + // / @{ + + // / Construct an empty MiniArrayRef. + /* implicit */ constexpr MiniArrayRef() : Data(nullptr), Length(0) {} + + // / Construct an MiniArrayRef from a single element. + // TODO Make this explicit + constexpr MiniArrayRef(const T &OneElt) : Data(&OneElt), Length(1) {} + + // / Construct an MiniArrayRef from a pointer and length. + constexpr MiniArrayRef(T *data, size_t length) : Data(data), Length(length) {} + + // / Construct an MiniArrayRef from a range. + constexpr MiniArrayRef(T *begin, T *end) : Data(begin), Length(end - begin) {} + + template ().data())>, T *>>> + /* implicit */ MiniArrayRef(Container &container) : Data(container.data()), Length(container.size()) + {} + + // / Construct an MiniArrayRef from a std::vector. + // The enable_if stuff here makes sure that this isn't used for + // std::vector, because MiniArrayRef can't work on a std::vector + // bitfield. 
+ template + /* implicit */ MiniArrayRef(const std::vector &Vec) : Data(Vec.data()), Length(Vec.size()) + { + static_assert(!std::is_same_v, + "MiniArrayRef cannot be constructed from a std::vector bitfield."); + } + + // / Construct an MiniArrayRef from a std::array + template + /* implicit */ constexpr MiniArrayRef(std::array &Arr) : Data(Arr.data()), Length(N) + {} + + // / Construct an MiniArrayRef from a C array. + template + // NOLINTNEXTLINE(*c-array*) + /* implicit */ constexpr MiniArrayRef(T (&Arr)[N]) : Data(Arr), Length(N) + {} + + // /// Construct an MiniArrayRef from an empty C array. + /* implicit */ constexpr MiniArrayRef(const volatile void *Arr) : Data(nullptr), Length(0) {} + + // / Construct an MiniArrayRef from a std::initializer_list. + /* implicit */ constexpr MiniArrayRef(const std::initializer_list &Vec) + : Data(std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) : std::begin(Vec)), Length(Vec.size()) + {} + + // / @} + // / @name Simple Operations + // / @{ + + constexpr iterator begin() const + { + return Data; + } + constexpr iterator end() const + { + return Data + Length; + } + + // These are actually the same as iterator, since MiniArrayRef only + // gives you const iterators. + constexpr const_iterator cbegin() const + { + return Data; + } + constexpr const_iterator cend() const + { + return Data + Length; + } + + constexpr reverse_iterator rbegin() const + { + return reverse_iterator(end()); + } + constexpr reverse_iterator rend() const + { + return reverse_iterator(begin()); + } + + // / empty - Check if the array is empty. + constexpr bool empty() const + { + return Length == 0; + } + + constexpr T *data() const + { + return Data; + } + + // / size - Get the array size. + constexpr size_t size() const + { + return Length; + } + + // / equals - Check for element-wise equality. + constexpr bool equals(MiniArrayRef RHS) const + { + return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); + } + + // / @} + // / @name Operator Overloads + // / @{ + constexpr const T &operator[](size_t Index) const + { + return Data[Index]; + } + + // / Disallow accidental assignment from a temporary. + // / + // / The declaration here is extra complicated so that "arrayRef = {}" + // / continues to select the move assignment operator. + template + std::enable_if_t, MiniArrayRef> &operator = ( + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + U &&Temporary) = delete; + + // / Disallow accidental assignment from a temporary. + // / + // / The declaration here is extra complicated so that "arrayRef = {}" + // / continues to select the move assignment operator. 
+ template + std::enable_if_t, MiniArrayRef> &operator = (std::initializer_list) = delete; }; using MiniIntArrayRef = MiniArrayRef; -static_assert( - sizeof(MiniIntArrayRef) == sizeof(void*) + sizeof(size_t), +static_assert(sizeof(MiniIntArrayRef) == sizeof(void *) + sizeof(size_t), "changing the size of MiniArrayRef breaks ABI compatibility!"); -inline bool is_contiguous_strides_for_shape( - int64_t ndim, - const int64_t* strides_ptr, - const int64_t* sizes_ptr) { - int64_t z = 1; - for (int64_t d = ndim - 1; d >= 0; d--) { - const auto& size_d = sizes_ptr[d]; - if (size_d != 1) { - if (strides_ptr[d] == z) { - z *= size_d; - } else { - return false; - } +inline bool is_contiguous_strides_for_shape(int64_t ndim, const int64_t *strides_ptr, const int64_t *sizes_ptr) +{ + int64_t z = 1; + for (int64_t d = ndim - 1; d >= 0; d--) { + const auto &size_d = sizes_ptr[d]; + if (size_d != 1) { + if (strides_ptr[d] == z) { + z *= size_d; + } else { + return false; + } + } } - } - return true; + return true; } // Shim for AOTI generated code to pretend a raw array works like an // AtenTensorHandle. -template -class ArrayRefTensor { - public: - ArrayRefTensor() = default; - - explicit ArrayRefTensor( - MiniArrayRef arr, - MiniArrayRef sizes, - MiniArrayRef strides, - int32_t device_type, - int32_t device_idx) - : arrayRef_(arr), - sizes_(sizes), - strides_(strides), - device_type_(device_type), - device_idx_(device_idx) { - assert(sizes.size() == strides.size()); - assert(is_contiguous_strides_for_shape( - sizes.size(), strides.data(), sizes.data())); - } - - AtenTensorHandle expensiveCopyToTensor() const { - AtenTensorHandle result = nullptr; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided( - sizes_.size(), - sizes_.data(), - strides_.data(), - aoti_torch_dtype>(), - device_type_, - device_idx_, - &result)); - void* dataPtr = nullptr; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr(result, &dataPtr)); - std::memcpy(dataPtr, data(), numel() * sizeof(T)); - return result; - } - - // We need to look the same as RAIIAtenTensorHandle, which returns - // an owning AtenTensorHandle from release(). So, we allocate one! - AtenTensorHandle release() { - return expensiveCopyToTensor(); - } - - AtenTensorHandle borrowAsTensor() const { - AtenTensorHandle result = nullptr; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( - data(), - sizes_.size(), - sizes_.data(), - strides_.data(), - 0, - aoti_torch_dtype>(), - device_type_, - device_idx_, - &result, - aoti_torch_layout_strided(), - nullptr, - 0)); - return result; - } - - // We don't need to free any memory. - void reset() {} - - auto sizes() const { - return sizes_; - } - - auto strides() const { - return strides_; - } - - auto device_type() const { - return device_type_; - } - - auto device_idx() const { - return device_idx_; - } - - T* data() const { - return arrayRef_.data(); - } - - auto numel() const { - return arrayRef_.size(); - } - - void set_arrayref(MiniArrayRef new_arrayref) { - arrayRef_ = new_arrayref; - } - - private: - MiniArrayRef arrayRef_; - // We expect generated code to have statically available sizes & - // strides for us. - MiniArrayRef sizes_; - MiniArrayRef strides_; - int32_t device_type_ = 0; - int32_t device_idx_ = 0; - // We continue to zero-initialize this field in case we repurpose - // the space later; having predictable contents can only help. 
- int32_t unusedDoNotRemoveForABICompatibility_ = 0; +template class ArrayRefTensor { +public: + ArrayRefTensor() = default; + + explicit ArrayRefTensor(MiniArrayRef arr, MiniArrayRef sizes, MiniArrayRef strides, + int32_t device_type, int32_t device_idx) + : arrayRef_(arr), sizes_(sizes), strides_(strides), device_type_(device_type), device_idx_(device_idx) + { + assert(sizes.size() == strides.size()); + assert(is_contiguous_strides_for_shape(sizes.size(), strides.data(), sizes.data())); + } + + AtenTensorHandle expensiveCopyToTensor() const + { + AtenTensorHandle result = nullptr; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(sizes_.size(), sizes_.data(), strides_.data(), + aoti_torch_dtype>(), device_type_, device_idx_, &result)); + void *dataPtr = nullptr; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr(result, &dataPtr)); + std::memcpy(dataPtr, data(), numel() * sizeof(T)); + return result; + } + + // We need to look the same as RAIIAtenTensorHandle, which returns + // an owning AtenTensorHandle from release(). So, we allocate one! + AtenTensorHandle release() + { + return expensiveCopyToTensor(); + } + + AtenTensorHandle borrowAsTensor() const + { + AtenTensorHandle result = nullptr; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2(data(), sizes_.size(), sizes_.data(), + strides_.data(), 0, aoti_torch_dtype>(), device_type_, device_idx_, &result, + aoti_torch_layout_strided(), nullptr, 0)); + return result; + } + + // We don't need to free any memory. + void reset() {} + + auto sizes() const + { + return sizes_; + } + + auto strides() const + { + return strides_; + } + + auto device_type() const + { + return device_type_; + } + + auto device_idx() const + { + return device_idx_; + } + + T *data() const + { + return arrayRef_.data(); + } + + auto numel() const + { + return arrayRef_.size(); + } + + void set_arrayref(MiniArrayRef new_arrayref) + { + arrayRef_ = new_arrayref; + } + +private: + MiniArrayRef arrayRef_; + // We expect generated code to have statically available sizes & + // strides for us. + MiniArrayRef sizes_; + MiniArrayRef strides_; + int32_t device_type_ = 0; + int32_t device_idx_ = 0; + // We continue to zero-initialize this field in case we repurpose + // the space later; having predictable contents can only help. + int32_t unusedDoNotRemoveForABICompatibility_ = 0; }; -static_assert( - sizeof(ArrayRefTensor) == - 3 * sizeof(MiniIntArrayRef) + 3 * sizeof(int32_t) + - (alignof(ArrayRefTensor) > 4 ? sizeof(int32_t) : 0), +static_assert(sizeof(ArrayRefTensor) == + 3 * sizeof(MiniIntArrayRef) + 3 * sizeof(int32_t) + (alignof(ArrayRefTensor) > 4 ? sizeof(int32_t) : 0), "changing the size of ArrayRefTensor breaks ABI compatibility!"); template -inline ArrayRefTensor reinterpret_tensor_wrapper( - const ArrayRefTensor& self, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset) { - // REVIEW: we should add a way to build the DSO in debug mode during - // tests so we can have checks like this! 
- assert(is_contiguous_strides_for_shape(ndim, strides_ptr, sizes_ptr)); - return ArrayRefTensor( - MiniArrayRef( - self.data() + storage_offset, self.numel() - storage_offset), - MiniArrayRef(sizes_ptr, ndim), - MiniArrayRef(strides_ptr, ndim), - self.device_type(), - self.device_idx()); +inline ArrayRefTensor reinterpret_tensor_wrapper(const ArrayRefTensor &self, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset) +{ + // REVIEW: we should add a way to build the DSO in debug mode during + // tests so we can have checks like this! + assert(is_contiguous_strides_for_shape(ndim, strides_ptr, sizes_ptr)); + return ArrayRefTensor(MiniArrayRef(self.data() + storage_offset, self.numel() - storage_offset), + MiniArrayRef(sizes_ptr, ndim), MiniArrayRef(strides_ptr, ndim), + self.device_type(), self.device_idx()); } -template -inline T* get_data_ptr_wrapper(ArrayRefTensor& tensor) { - return tensor.data(); +template inline T *get_data_ptr_wrapper(ArrayRefTensor &tensor) +{ + return tensor.data(); } -template -inline T* get_data_ptr_wrapper(const MiniArrayRef& arr) { - return arr.data(); +template inline T *get_data_ptr_wrapper(const MiniArrayRef &arr) +{ + return arr.data(); } -template -inline const ArrayRefTensor& unwrap_raii_handle_if_needed( - const ArrayRefTensor& tensor) { - return tensor; +template inline const ArrayRefTensor &unwrap_raii_handle_if_needed(const ArrayRefTensor &tensor) +{ + return tensor; } -template -inline ArrayRefTensor& unwrap_raii_handle_if_needed( - ArrayRefTensor& tensor) { - return tensor; +template inline ArrayRefTensor &unwrap_raii_handle_if_needed(ArrayRefTensor &tensor) +{ + return tensor; } -template -inline const ArrayRefTensor& wrap_with_raii_handle_if_needed( - const ArrayRefTensor& tensor) { - return tensor; +template inline const ArrayRefTensor &wrap_with_raii_handle_if_needed(const ArrayRefTensor &tensor) +{ + return tensor; } -template -inline ArrayRefTensor& wrap_with_raii_handle_if_needed( - ArrayRefTensor& tensor) { - return tensor; +template inline ArrayRefTensor &wrap_with_raii_handle_if_needed(ArrayRefTensor &tensor) +{ + return tensor; } -template -inline RAIIAtenTensorHandle expensive_copy_to_tensor_if_needed( - const ArrayRefTensor& tensor) { - return tensor.expensiveCopyToTensor(); +template inline RAIIAtenTensorHandle expensive_copy_to_tensor_if_needed(const ArrayRefTensor &tensor) +{ + return tensor.expensiveCopyToTensor(); } -inline AtenTensorHandle expensive_copy_to_tensor_if_needed( - AtenTensorHandle handle) { - return handle; +inline AtenTensorHandle expensive_copy_to_tensor_if_needed(AtenTensorHandle handle) +{ + return handle; } -template -const T& copy_arrayref_tensor_to_tensor(const T& t) { - return t; +template const T ©_arrayref_tensor_to_tensor(const T &t) +{ + return t; } -template -RAIIAtenTensorHandle copy_arrayref_tensor_to_tensor( - const ArrayRefTensor& art) { - return art.expensiveCopyToTensor(); +template RAIIAtenTensorHandle copy_arrayref_tensor_to_tensor(const ArrayRefTensor &art) +{ + return art.expensiveCopyToTensor(); } -template -const T& borrow_arrayref_tensor_as_tensor(const T& t) { - return t; +template const T &borrow_arrayref_tensor_as_tensor(const T &t) +{ + return t; } -template -RAIIAtenTensorHandle borrow_arrayref_tensor_as_tensor( - const ArrayRefTensor& art) { - return art.borrowAsTensor(); +template RAIIAtenTensorHandle borrow_arrayref_tensor_as_tensor(const ArrayRefTensor &art) +{ + return art.borrowAsTensor(); } } // namespace torch::aot_inductor diff --git 
a/torch_npu/csrc/inductor/aoti_runtime/device_utils.h b/torch_npu/csrc/inductor/aoti_runtime/device_utils.h index fd640b78ecae6a88bf78130319e8b0fcf645448c..7d32261911f5343d6e5fe63c44847bda234df38c 100644 --- a/torch_npu/csrc/inductor/aoti_runtime/device_utils.h +++ b/torch_npu/csrc/inductor/aoti_runtime/device_utils.h @@ -14,15 +14,14 @@ #include #include -#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ - do { \ - const cudaError_t code = EXPR; \ - const char* msg = cudaGetErrorString(code); \ - if (code != cudaSuccess) { \ - throw std::runtime_error( \ - std::string("CUDA error: ") + std::string(msg)); \ - } \ - } while (0) +#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ + do { \ + const cudaError_t code = EXPR; \ + const char *msg = cudaGetErrorString(code); \ + if (code != cudaSuccess) { \ + throw std::runtime_error(std::string("CUDA error: ") + std::string(msg)); \ + } \ + } while (0) namespace torch::aot_inductor { @@ -34,19 +33,19 @@ using DeviceStreamType = cudaStream_t; #include #include #include -#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ - do { \ - const ze_result_t status = EXPR; \ - if (status != ZE_RESULT_SUCCESS) { \ - std::stringstream ss; \ - ss << "L0 runtime error: " << std::hex << std::uppercase << status; \ - throw std::runtime_error(ss.str()); \ - } \ - } while (0) +#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ + do { \ + const ze_result_t status = EXPR; \ + if (status != ZE_RESULT_SUCCESS) { \ + std::stringstream ss; \ + ss << "L0 runtime error: " << std::hex << std::uppercase << status; \ + throw std::runtime_error(ss.str()); \ + } \ + } while (0) namespace torch::aot_inductor { -using DeviceStreamType = sycl::queue*; +using DeviceStreamType = sycl::queue *; } // namespace torch::aot_inductor @@ -55,19 +54,18 @@ using DeviceStreamType = sycl::queue*; #include "third_party/acl/inc/acl/acl_base.h" #include "third_party/acl/inc/acl/acl_rt.h" -typedef void* NPUdeviceptr; +typedef void *NPUdeviceptr; -typedef void* NPUfunction; +typedef void *NPUfunction; -#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ - do { \ - const aclError code = EXPR; \ - if (code != ACL_SUCCESS) { \ - throw std::runtime_error( \ - std::string("NPU error core: ") + std::to_string(code) \ - + std::string(" ") + std::string(__FILE__) + std::string(":") + std::to_string(__LINE__)); \ - } \ - } while (0) +#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ + do { \ + const aclError code = EXPR; \ + if (code != ACL_SUCCESS) { \ + throw std::runtime_error(std::string("NPU error core: ") + std::to_string(code) + std::string(" ") + \ + std::string(__FILE__) + std::string(":") + std::to_string(__LINE__)); \ + } \ + } while (0) namespace torch::aot_inductor { @@ -77,16 +75,17 @@ using DeviceStreamType = aclrtStream; #else -#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ - bool ok = EXPR; \ - if (!ok) { \ - throw std::runtime_error("CPU runtime error"); \ - } +#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ + bool ok = EXPR; \ + if (!ok) { \ + throw std::runtime_error("CPU runtime error"); \ + } namespace torch::aot_inductor { -using DeviceStreamType = void*; +using DeviceStreamType = void *; } // namespace torch::aot_inductor -#endif // USE_CUDA +#endif \ + // USE_CUDA diff --git a/torch_npu/csrc/inductor/aoti_torch/c/shim.h b/torch_npu/csrc/inductor/aoti_torch/c/shim.h index efc536fc3205031e2719ea49a34ba88e82542d24..6cafd55f8743f7c80b3163e0e1618737d6b60e27 100644 --- a/torch_npu/csrc/inductor/aoti_torch/c/shim.h +++ b/torch_npu/csrc/inductor/aoti_torch/c/shim.h @@ -75,13 +75,13 @@ extern "C" { // (note that RAIIAtenTensorHandle is private to 
model.so, and never crosses // the ABI boundary.) struct AtenTensorOpaque; -using AtenTensorHandle = AtenTensorOpaque*; +using AtenTensorHandle = AtenTensorOpaque *; struct AtenGeneratorOpaque; -using AtenGeneratorHandle = AtenGeneratorOpaque*; +using AtenGeneratorHandle = AtenGeneratorOpaque *; struct AOTIProxyExecutorOpaque; -using AOTIProxyExecutorHandle = AOTIProxyExecutorOpaque*; +using AOTIProxyExecutorHandle = AOTIProxyExecutorOpaque *; using AOTITorchError = int32_t; #define AOTI_TORCH_SUCCESS 0 @@ -135,564 +135,325 @@ AOTI_TORCH_EXPORT int32_t aoti_torch_memory_format_channels_last_3d(); AOTI_TORCH_EXPORT int32_t aoti_torch_memory_format_preserve_format(); // Functions for converting a single-element tensor to a scalar value -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_float16(AtenTensorHandle tensor, c10::Half* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_float32(AtenTensorHandle tensor, float* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_float64(AtenTensorHandle tensor, double* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_uint8(AtenTensorHandle tensor, uint8_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_uint16(AtenTensorHandle tensor, uint16_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_uint32(AtenTensorHandle tensor, uint32_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_uint64(AtenTensorHandle tensor, uint64_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_int8(AtenTensorHandle tensor, int8_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_int16(AtenTensorHandle tensor, int16_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_int32(AtenTensorHandle tensor, int32_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_int64(AtenTensorHandle tensor, int64_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_bool(AtenTensorHandle tensor, bool* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_bfloat16(AtenTensorHandle tensor, c10::BFloat16* ret_value); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_complex64( - AtenTensorHandle tensor, - c10::complex* ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_float16(AtenTensorHandle tensor, c10::Half *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_float32(AtenTensorHandle tensor, float *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_float64(AtenTensorHandle tensor, double *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_uint8(AtenTensorHandle tensor, uint8_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_uint16(AtenTensorHandle tensor, uint16_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_uint32(AtenTensorHandle tensor, uint32_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_uint64(AtenTensorHandle tensor, uint64_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_int8(AtenTensorHandle tensor, int8_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_int16(AtenTensorHandle tensor, int16_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_int32(AtenTensorHandle tensor, int32_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_int64(AtenTensorHandle tensor, int64_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_bool(AtenTensorHandle tensor, bool *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_bfloat16(AtenTensorHandle tensor, c10::BFloat16 *ret_value); 
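// Reviewer note (not part of the patch): every aoti_torch_item_* getter reformatted above follows the
// same convention -- the AOTITorchError return value signals success or failure, and the scalar is
// written through the out-parameter. A minimal caller-side sketch under that assumption, using only
// declarations visible in this header; the include path is taken from this diff, and whether it
// resolves depends on the build setup:
#include <optional>
#include "torch_npu/csrc/inductor/aoti_torch/c/shim.h"

// Reads a float out of a single-element float32 tensor handle, mapping any status other than
// AOTI_TORCH_SUCCESS (0) to std::nullopt instead of throwing.
static std::optional<float> read_float_item(AtenTensorHandle tensor)
{
    float value = 0.0f;
    if (aoti_torch_item_float32(tensor, &value) != AOTI_TORCH_SUCCESS) {
        return std::nullopt;
    }
    return value;
}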
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_complex64(AtenTensorHandle tensor, c10::complex *ret_value); // Functions for wrapping a scalar value to a single-element tensor -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_float32( - float value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_float64( - double value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint8( - uint8_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint16( - uint16_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint32( - uint32_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint64( - uint64_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int8( - int8_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int16( - int16_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int32( - int32_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int64( - int64_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_scalar_to_tensor_bool(bool value, AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_complex64( - c10::complex value, - AtenTensorHandle* ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_float32(float value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_float64(double value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint8(uint8_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint16(uint16_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint32(uint32_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint64(uint64_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int8(int8_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int16(int16_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int32(int32_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int64(int64_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_bool(bool value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_complex64(c10::complex value, + AtenTensorHandle *ret_new_tensor); AOTI_TORCH_EXPORT bool aoti_torch_grad_mode_is_enabled(); AOTI_TORCH_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); // Free the tensor object -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_delete_tensor_object(AtenTensorHandle tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_delete_tensor_object(AtenTensorHandle tensor); // Get a pointer to the underlying storage data -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_data_ptr( - AtenTensorHandle 
tensor, - void** ret_data_ptr // returns borrowed reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_data_ptr(AtenTensorHandle tensor, + void **ret_data_ptr // returns borrowed reference ); // Get the nbytes of the underlying storage -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_storage_size(AtenTensorHandle tensor, int64_t* ret_size); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_storage_size(AtenTensorHandle tensor, int64_t *ret_size); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_dim(AtenTensorHandle tensor, int64_t* ret_dim); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_dim(AtenTensorHandle tensor, int64_t *ret_dim); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_numel(AtenTensorHandle tensor, int64_t* ret_numel); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_numel(AtenTensorHandle tensor, int64_t *ret_numel); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_storage_numel(AtenTensorHandle tensor, int64_t* ret_numel); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_storage_numel(AtenTensorHandle tensor, int64_t *ret_numel); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_sizes( - AtenTensorHandle tensor, - int64_t** ret_sizes // returns borrowed reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_sizes(AtenTensorHandle tensor, + int64_t **ret_sizes // returns borrowed reference ); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_size(AtenTensorHandle tensor, int64_t d, int64_t* ret_size); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_size(AtenTensorHandle tensor, int64_t d, int64_t *ret_size); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_strides( - AtenTensorHandle tensor, - int64_t** ret_strides // returns borrowed reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_strides(AtenTensorHandle tensor, + int64_t **ret_strides // returns borrowed reference ); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_stride(AtenTensorHandle tensor, int64_t d, int64_t* ret_stride); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_stride(AtenTensorHandle tensor, int64_t d, int64_t *ret_stride); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_dtype(AtenTensorHandle tensor, int32_t* ret_dtype); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_dtype(AtenTensorHandle tensor, int32_t *ret_dtype); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_device_type(AtenTensorHandle tensor, int32_t* ret_device_type); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_device_type(AtenTensorHandle tensor, int32_t *ret_device_type); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_device_index(AtenTensorHandle tensor, int32_t* ret_device_index); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_device_index(AtenTensorHandle tensor, int32_t *ret_device_index); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_storage_offset( - AtenTensorHandle tensor, - int64_t* ret_storage_offset); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_storage_offset(AtenTensorHandle tensor, int64_t *ret_storage_offset); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__alloc_from_pool( - AtenTensorHandle self, - int64_t offset_bytes, - int32_t dtype, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - AtenTensorHandle* ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__alloc_from_pool(AtenTensorHandle self, int64_t offset_bytes, int32_t dtype, + int64_t ndim, const int64_t *sizes_ptr, const int64_t *strides_ptr, AtenTensorHandle *ret_new_tensor); // This function will create a new tensor object and its pointer is returned // through *out. 
The caller is responsible for wrapping the tensor pointer // with RAIIAtenTensorHandle which will call aoti_torch_delete_tensor_object // when going out of scope. -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( - AtenTensorHandle self, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - AtenTensorHandle* ret_new_tensor // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__reinterpret_tensor(AtenTensorHandle self, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset, + AtenTensorHandle *ret_new_tensor // returns new reference ); // This function will create a new tensor object and its pointer is returned // through *out. The caller is responsible for wrapping the tensor pointer // with RAIIAtenTensorHandle which will call aoti_torch_delete_tensor_object // when going out of scope. -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_empty_strided( - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AtenTensorHandle* ret_new_tensor // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_empty_strided(int64_t ndim, const int64_t *sizes_ptr, + const int64_t *strides_ptr, int32_t dtype, int32_t device_type, int32_t device_index, + AtenTensorHandle *ret_new_tensor // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AtenTensorHandle* ret // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(void *data, int64_t ndim, const int64_t *sizes_ptr, + const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype, int32_t device_type, int32_t device_index, + AtenTensorHandle *ret // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2(void *data, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype, int32_t device_type, int32_t device_index, - AtenTensorHandle* ret, // returns new reference - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_npu( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, + AtenTensorHandle *ret, // returns new reference + int32_t layout, const uint8_t *opaque_metadata, int64_t opaque_metadata_size); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_npu(void *data, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype, int32_t device_type, int32_t device_index, - AtenTensorHandle* ret // returns new reference + AtenTensorHandle *ret // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, +AOTI_TORCH_EXPORT 
AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2(void *data, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype, int32_t device_type, int32_t device_index, - AtenTensorHandle* ret, // returns new reference - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__embedding_bag( - AtenTensorHandle weight, - AtenTensorHandle indices, - AtenTensorHandle offsets, - int32_t scale_grad_by_freq, - int32_t mode, - int32_t sparse, + AtenTensorHandle *ret, // returns new reference + int32_t layout, const uint8_t *opaque_metadata, int64_t opaque_metadata_size); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__embedding_bag(AtenTensorHandle weight, AtenTensorHandle indices, + AtenTensorHandle offsets, int32_t scale_grad_by_freq, int32_t mode, int32_t sparse, AtenTensorHandle per_sample_weights, // optional argument - int32_t include_last_offset, - int32_t padding_idx, - AtenTensorHandle* ret0, // returns new reference - AtenTensorHandle* ret1, // returns new reference - AtenTensorHandle* ret2, // returns new reference - AtenTensorHandle* ret3 // returns new reference + int32_t include_last_offset, int32_t padding_idx, + AtenTensorHandle *ret0, // returns new reference + AtenTensorHandle *ret1, // returns new reference + AtenTensorHandle *ret2, // returns new reference + AtenTensorHandle *ret3 // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__fft_c2c( - AtenTensorHandle self, - const int64_t* dim_ptr, - int64_t dim_size, - int64_t normalization, - int32_t forward, - AtenTensorHandle* ret // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__fft_c2c(AtenTensorHandle self, const int64_t *dim_ptr, int64_t dim_size, + int64_t normalization, int32_t forward, + AtenTensorHandle *ret // returns new reference ); // This version is deprecated. 
We will remove it later -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_dot_product_flash_attention( - AtenTensorHandle query, - AtenTensorHandle key, - AtenTensorHandle value, - double dropout_p, - bool is_causal, - bool return_debug_mask, +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_dot_product_flash_attention(AtenTensorHandle query, + AtenTensorHandle key, AtenTensorHandle value, double dropout_p, bool is_causal, bool return_debug_mask, double scale, - AtenTensorHandle* ret0, // returns new reference - AtenTensorHandle* ret1, // returns new reference - AtenTensorHandle* ret2, // returns new reference - AtenTensorHandle* ret3, // returns new reference - int64_t* ret4, - int64_t* ret5, - AtenTensorHandle* ret6, // returns new reference - AtenTensorHandle* ret7, // returns new reference - AtenTensorHandle* ret8 // returns new reference + AtenTensorHandle *ret0, // returns new reference + AtenTensorHandle *ret1, // returns new reference + AtenTensorHandle *ret2, // returns new reference + AtenTensorHandle *ret3, // returns new reference + int64_t *ret4, int64_t *ret5, + AtenTensorHandle *ret6, // returns new reference + AtenTensorHandle *ret7, // returns new reference + AtenTensorHandle *ret8 // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch__scaled_dot_product_flash_attention_v2( - AtenTensorHandle query, - AtenTensorHandle key, - AtenTensorHandle value, - double dropout_p, - int is_causal, - int return_debug_mask, - double* scale, // optional argument - AtenTensorHandle* ret0, // returns new reference - AtenTensorHandle* ret1, // returns new reference - AtenTensorHandle* ret2, // returns new reference - AtenTensorHandle* ret3, // returns new reference - int64_t* ret4, - int64_t* ret5, - AtenTensorHandle* ret6, // returns new reference - AtenTensorHandle* ret7, // returns new reference - AtenTensorHandle* ret8 // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_dot_product_flash_attention_v2(AtenTensorHandle query, + AtenTensorHandle key, AtenTensorHandle value, double dropout_p, int is_causal, int return_debug_mask, + double *scale, // optional argument + AtenTensorHandle *ret0, // returns new reference + AtenTensorHandle *ret1, // returns new reference + AtenTensorHandle *ret2, // returns new reference + AtenTensorHandle *ret3, // returns new reference + int64_t *ret4, int64_t *ret5, + AtenTensorHandle *ret6, // returns new reference + AtenTensorHandle *ret7, // returns new reference + AtenTensorHandle *ret8 // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch__scaled_dot_product_efficient_attention( - AtenTensorHandle query, - AtenTensorHandle key, - AtenTensorHandle value, +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_dot_product_efficient_attention(AtenTensorHandle query, + AtenTensorHandle key, AtenTensorHandle value, AtenTensorHandle attn_bias, // optional argument - int compute_log_sumexp, - double dropout_p, - int is_causal, - double* scale, // optional argument - AtenTensorHandle* ret0, // returns new reference - AtenTensorHandle* ret1, // returns new reference - AtenTensorHandle* ret2, // returns new reference - AtenTensorHandle* ret3 // returns new reference + int compute_log_sumexp, double dropout_p, int is_causal, + double *scale, // optional argument + AtenTensorHandle *ret0, // returns new reference + AtenTensorHandle *ret1, // returns new reference + AtenTensorHandle *ret2, // returns new reference + AtenTensorHandle *ret3 // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError 
aoti_torch__scaled_mm( - AtenTensorHandle self, - AtenTensorHandle mat2, - AtenTensorHandle bias, - int32_t* out_dtype, - AtenTensorHandle scale_a, - AtenTensorHandle scale_b, - AtenTensorHandle scale_result, - int8_t use_fast_accum, - AtenTensorHandle* ret0, - AtenTensorHandle* ret1); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_mm_v2( - AtenTensorHandle self, - AtenTensorHandle mat2, - AtenTensorHandle scale_a, - AtenTensorHandle scale_b, - AtenTensorHandle bias, - AtenTensorHandle scale_result, - int32_t* out_dtype, - int8_t use_fast_accum, - AtenTensorHandle* ret0); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_convolution( - AtenTensorHandle input, - AtenTensorHandle weight, +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_mm(AtenTensorHandle self, AtenTensorHandle mat2, + AtenTensorHandle bias, int32_t *out_dtype, AtenTensorHandle scale_a, AtenTensorHandle scale_b, + AtenTensorHandle scale_result, int8_t use_fast_accum, AtenTensorHandle *ret0, AtenTensorHandle *ret1); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_mm_v2(AtenTensorHandle self, AtenTensorHandle mat2, + AtenTensorHandle scale_a, AtenTensorHandle scale_b, AtenTensorHandle bias, AtenTensorHandle scale_result, + int32_t *out_dtype, int8_t use_fast_accum, AtenTensorHandle *ret0); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_convolution(AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle bias, // optional argument - const int64_t* stride_ptr, - int64_t stride_size, - const int64_t* padding_ptr, - int64_t padding_size, - const int64_t* dilation_ptr, - int64_t dilation_size, - int transposed, - const int64_t* output_padding_ptr, - int64_t output_padding_size, - int64_t groups, - AtenTensorHandle* ret // returns new reference + const int64_t *stride_ptr, int64_t stride_size, const int64_t *padding_ptr, int64_t padding_size, + const int64_t *dilation_ptr, int64_t dilation_size, int transposed, const int64_t *output_padding_ptr, + int64_t output_padding_size, int64_t groups, + AtenTensorHandle *ret // returns new reference ); // This function will create a new uninitialized tensor object // and its pointer is returned through *ret. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_new_uninitialized_tensor(AtenTensorHandle* ret); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_new_uninitialized_tensor(AtenTensorHandle *ret); // WARNING: This will be deprecated. Use aoti_torch_copy_ instead. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_tensor_copy_(AtenTensorHandle src, AtenTensorHandle dst); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_tensor_copy_(AtenTensorHandle src, AtenTensorHandle dst); // Make the tensor referred to by dst an alias for the tensor referred // to by src. The two tensors must still be deleted with // aoti_torch_delete_tensor separately (or not) as before the call. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_assign_tensors(AtenTensorHandle src, AtenTensorHandle dst); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_assign_tensors(AtenTensorHandle src, AtenTensorHandle dst); // Make a shallow copy of the tensor referred to by src and assign // it to the handle in the ret_dst. This is similar to the above // aoti_torch_assign_tensors function, but creates and sets the // ret_dst from within. 
-AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_assign_tensors_out(AtenTensorHandle src, AtenTensorHandle* ret_dst); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_assign_tensors_out(AtenTensorHandle src, AtenTensorHandle *ret_dst); // This function will create a new tensor object and its pointer is returned // through *ret. The caller is responsible for wrapping the tensor pointer // with RAIIAtenTensorHandle which will call aoti_torch_delete_tensor_object // when going out of scope. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_clone(AtenTensorHandle self, AtenTensorHandle* ret); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_addmm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat1, - AtenTensorHandle mat2, - float beta, - float alpha); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_bmm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat2); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_copy_( - AtenTensorHandle self, - AtenTensorHandle src, - int32_t non_blocking); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_mm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat2); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__mm_plus_mm_out( - AtenTensorHandle out, - AtenTensorHandle a, - AtenTensorHandle b, - AtenTensorHandle c, - AtenTensorHandle d); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_clone(AtenTensorHandle self, AtenTensorHandle *ret); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_addmm_out(AtenTensorHandle out, AtenTensorHandle self, + AtenTensorHandle mat1, AtenTensorHandle mat2, float beta, float alpha); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_bmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_copy_(AtenTensorHandle self, AtenTensorHandle src, int32_t non_blocking); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__mm_plus_mm_out(AtenTensorHandle out, AtenTensorHandle a, + AtenTensorHandle b, AtenTensorHandle c, AtenTensorHandle d); // This will soon be deprecated after ao_quantization is complete. // Please refrain from using this or increasing callsites. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_cpu_wrapped_fbgemm_pack_gemm_matrix_fp16( - AtenTensorHandle weight, - AtenTensorHandle* out); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_wrapped_fbgemm_pack_gemm_matrix_fp16(AtenTensorHandle weight, + AtenTensorHandle *out); // This will soon be deprecated after ao_quantization is complete. // Please refrain from using this or increasing callsites. -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__wrapped_linear_prepack( - AtenTensorHandle weight, - AtenTensorHandle weight_scale, - AtenTensorHandle weight_zero_point, - AtenTensorHandle bias, - AtenTensorHandle* out); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__wrapped_linear_prepack(AtenTensorHandle weight, + AtenTensorHandle weight_scale, AtenTensorHandle weight_zero_point, AtenTensorHandle bias, AtenTensorHandle *out); // This will soon be deprecated after ao_quantization is complete. // Please refrain from using this or increasing callsites. 
-AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_cpu_wrapped_fbgemm_linear_fp16_weight( - AtenTensorHandle input, - AtenTensorHandle weight, - AtenTensorHandle bias, - int64_t out_channel, - AtenTensorHandle* out); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_wrapped_fbgemm_linear_fp16_weight(AtenTensorHandle input, + AtenTensorHandle weight, AtenTensorHandle bias, int64_t out_channel, AtenTensorHandle *out); // This will soon be deprecated after ao_quantization is complete. // Please refrain from using this or increasing callsites. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_cpu__wrapped_quantized_linear_prepacked( - AtenTensorHandle input, - AtenTensorHandle input_scale, - AtenTensorHandle input_zero_point, - AtenTensorHandle weight, - AtenTensorHandle out_scale, - AtenTensorHandle out_zeropoint, - int64_t out_channel, - AtenTensorHandle* out); - -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_nonzero(AtenTensorHandle self, AtenTensorHandle* out); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__wrapped_quantized_linear_prepacked(AtenTensorHandle input, + AtenTensorHandle input_scale, AtenTensorHandle input_zero_point, AtenTensorHandle weight, + AtenTensorHandle out_scale, AtenTensorHandle out_zeropoint, int64_t out_channel, AtenTensorHandle *out); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_nonzero(AtenTensorHandle self, AtenTensorHandle *out); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_zero_(AtenTensorHandle self); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_repeat_interleave_Tensor( - AtenTensorHandle repeats, - int64_t* output_size, - AtenTensorHandle* out); - -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_check_inf_and_nan(const char* tensor_name, AtenTensorHandle tensor); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scatter_out( - AtenTensorHandle out, - AtenTensorHandle self, - int64_t dim, - AtenTensorHandle index, - AtenTensorHandle src); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scatter_reduce_out( - AtenTensorHandle out, - AtenTensorHandle self, - int64_t dim, - AtenTensorHandle index, - AtenTensorHandle src, - const char* reduce, - int32_t include_self); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_index_put_out( - AtenTensorHandle out, - AtenTensorHandle self, - const AtenTensorHandle* indices, - const uint32_t num_indices, - const AtenTensorHandle values, - bool accumulate); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_view_as_real( - AtenTensorHandle self, - AtenTensorHandle* ret // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t *output_size, + AtenTensorHandle *out); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_check_inf_and_nan(const char *tensor_name, AtenTensorHandle tensor); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scatter_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, + AtenTensorHandle index, AtenTensorHandle src); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scatter_reduce_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, + AtenTensorHandle index, AtenTensorHandle src, const char *reduce, int32_t include_self); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_index_put_out(AtenTensorHandle out, AtenTensorHandle self, + const AtenTensorHandle *indices, const uint32_t num_indices, const AtenTensorHandle values, bool accumulate); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_view_as_real(AtenTensorHandle self, + AtenTensorHandle *ret // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_view_dtype( - AtenTensorHandle 
self,
-    int32_t dtype,
-    AtenTensorHandle* ret // returns new reference
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_view_dtype(AtenTensorHandle self, int32_t dtype,
+    AtenTensorHandle *ret // returns new reference
 );

-AOTI_TORCH_EXPORT void aoti_torch_print_tensor_handle(
-    AtenTensorHandle self,
-    const char* msg);
+AOTI_TORCH_EXPORT void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char *msg);

 // When AOTI debug printer option is enabled, this function will be invoked to
 // torch pickle save the intermediate tensor for debugging purpose.
-AOTI_TORCH_EXPORT void aoti_torch_save_tensor_handle(
-    AtenTensorHandle self,
-    const char* tensor_name,
-    const char* launch_prefix,
-    const char* kernel_name);
+AOTI_TORCH_EXPORT void aoti_torch_save_tensor_handle(AtenTensorHandle self, const char *tensor_name,
+    const char *launch_prefix, const char *kernel_name);

 #ifdef USE_CUDA

 struct CUDAGuardOpaque;
-using CUDAGuardHandle = CUDAGuardOpaque*;
+using CUDAGuardHandle = CUDAGuardOpaque *;

-AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_cuda_guard(
-    int32_t device_index,
-    CUDAGuardHandle* ret_guard // returns new reference
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_cuda_guard(int32_t device_index,
+    CUDAGuardHandle *ret_guard // returns new reference
 );

-AOTI_TORCH_EXPORT AOTITorchError
-aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);

-AOTI_TORCH_EXPORT AOTITorchError
-aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index);

 struct CUDAStreamGuardOpaque;
-using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*;
+using CUDAStreamGuardHandle = CUDAStreamGuardOpaque *;

-AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard(
-    void* stream,
-    int32_t device_index,
-    CUDAStreamGuardHandle* ret_guard // returns new reference
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard(void *stream, int32_t device_index,
+    CUDAStreamGuardHandle *ret_guard // returns new reference
 );

-AOTI_TORCH_EXPORT AOTITorchError
-aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);

-AOTI_TORCH_EXPORT AOTITorchError
-aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream(int32_t device_index, void **ret_stream);

 #endif // USE_CUDA

 // See `ProxyExecutor Design Note` in ir.py for more details
-AOTI_TORCH_EXPORT AOTITorchError aoti_torch_proxy_executor_call_function(
-    AOTIProxyExecutorHandle proxy_executor,
-    int extern_node_index,
-    int num_ints,
-    int64_t* flatten_int_args,
-    int num_tensors,
-    AtenTensorHandle* flatten_tensor_args);
-
-AOTI_TORCH_EXPORT void aoti_torch_check(
-    bool cond,
-    const char* func,
-    const char* file,
-    uint32_t line,
-    const char* msg);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_proxy_executor_call_function(AOTIProxyExecutorHandle proxy_executor,
+    int extern_node_index, int num_ints, int64_t *flatten_int_args, int num_tensors,
+    AtenTensorHandle *flatten_tensor_args);
+
+AOTI_TORCH_EXPORT void aoti_torch_check(bool cond, const char *func, const char *file, uint32_t line, const char *msg);

 #ifdef STRIP_ERROR_MESSAGES
-#define AOTI_TORCH_CHECK(cond, ...)                 \
-  if (!(cond)) {                                    \
-    aoti_torch_check(                               \
-        false,                                      \
-        __func__,                                   \
-        __FILE__,                                   \
-        static_cast<uint32_t>(__LINE__),            \
-        TORCH_CHECK_MSG(cond, "", __VA_ARGS__));    \
-  }
+#define AOTI_TORCH_CHECK(cond, ...)                                                  \
+    if (!(cond)) {                                                                   \
+        aoti_torch_check(false, __func__, __FILE__, static_cast<uint32_t>(__LINE__), \
+                         TORCH_CHECK_MSG(cond, "", __VA_ARGS__));                    \
+    }
#else
-#define AOTI_TORCH_CHECK(cond, ...)                 \
-  if (!(cond)) {                                    \
-    aoti_torch_check(                               \
-        false,                                      \
-        __func__,                                   \
-        __FILE__,                                   \
-        static_cast<uint32_t>(__LINE__),            \
-        TORCH_CHECK_MSG(cond, "", ##__VA_ARGS__));  \
-  }
+#define AOTI_TORCH_CHECK(cond, ...)                                                  \
+    if (!(cond)) {                                                                   \
+        aoti_torch_check(false, __func__, __FILE__, static_cast<uint32_t>(__LINE__), \
+                         TORCH_CHECK_MSG(cond, "", ##__VA_ARGS__));                  \
+    }
 #endif

 #ifdef __cplusplus
@@ -701,11 +462,11 @@ AOTI_TORCH_EXPORT void aoti_torch_check(

 template <typename T>
 int32_t aoti_torch_dtype() = delete;

-#define DEFINE_DTYPE_SPECIALIZATION(ctype, typename) \
-  template <> \
-  inline int32_t aoti_torch_dtype<ctype>() { \
-    return aoti_torch_dtype_##typename(); \
-  }
+#define DEFINE_DTYPE_SPECIALIZATION(ctype, typename)       \
+    template <> inline int32_t aoti_torch_dtype<ctype>()   \
+    {                                                      \
+        return aoti_torch_dtype_##typename();              \
+    }

 namespace c10 {
 struct BFloat16;
@@ -726,4 +487,5 @@ DEFINE_DTYPE_SPECIALIZATION(bool, bool)

 #endif

 #endif // AOTI_TORCH_SHIM
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index 60ef55e1d409cfc1ef38bafa57e88d14bef8c900..50b306959da090ec5aa84c73e1701b30ad061929 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -1239,7 +1239,8 @@ PyObject* THNPModule_npuCachingAllocator_raw_alloc(PyObject *_unused, PyObject *
     END_HANDLE_TH_ERRORS
 }

-PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj) {
+PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj)
+{
     HANDLE_TH_ERRORS
     void* mem_ptr = PyLong_AsVoidPtr(obj);
     c10_npu::NPUCachingAllocator::raw_delete(mem_ptr);
@@ -1291,7 +1292,8 @@ PyObject* THNPModule_npuUnlockMutex(PyObject *module, PyObject *noargs)
     Py_RETURN_NONE;
 }

-PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs) {
+PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs)
+{
     HANDLE_TH_ERRORS
     pybind11::gil_scoped_release no_gil;
     NPU_CHECK_ERROR_WITHOUT_UCE(aclmdlInitDump());
diff --git a/torch_npu/csrc/utils/TensorType.cpp b/torch_npu/csrc/utils/TensorType.cpp
index aeb6fd8b832e96d39fc8b8cd0724ddbdcf9125b9..e6998f57eaad6ac9368366847593d66c8d56658e 100644
--- a/torch_npu/csrc/utils/TensorType.cpp
+++ b/torch_npu/csrc/utils/TensorType.cpp
@@ -6,7 +6,6 @@ namespace torch_npu {
 namespace utils {
-
 using namespace at;
 using namespace torch::autograd;

@@ -15,14 +14,13 @@ std::vector<std::pair<Backend, ScalarType>> all_declared_types_npu()
     std::vector<std::pair<Backend, ScalarType>> ret;
     // can't easily iterate over enum classes, does not support BFloat16 now
     std::vector<Backend> backends = { c10::Backend::PrivateUse1 };
-    std::vector<ScalarType> scalar_types = {
-        ScalarType::Byte, ScalarType::Char, ScalarType::Double, ScalarType::Float,
-        ScalarType::Int, ScalarType::Long, ScalarType::Short, ScalarType::Half,
-        ScalarType::Bool, ScalarType::BFloat16
-    };
-
-    for (auto& backend : backends) {
-        for (auto& scalar_type : scalar_types) {
+    std::vector<ScalarType> scalar_types = { ScalarType::Byte,  ScalarType::Char, ScalarType::Double,
+                                             ScalarType::Float, ScalarType::Int,  ScalarType::Long,
+                                             ScalarType::Short, ScalarType::Half, ScalarType::Bool,
+                                             ScalarType::BFloat16 };
+
+    for (auto &backend : backends) {
+        for (auto &scalar_type : scalar_types) {
             ret.emplace_back(std::make_pair(backend, scalar_type));
         }
     }
@@ -32,8 +30,8 @@ std::vector<std::pair<Backend, ScalarType>> all_declared_types_npu()

 struct PyTensorType {
     PyTypeObject py_type;
-    THPDtype* dtype;
-    THPLayout* layout;
+    THPDtype *dtype;
+    THPLayout *layout;
     bool is_npu;
     char name[64];
     int backend;
@@ -57,73 +55,67 @@ struct PyTensorType {
 static_assert(std::is_standard_layout<PyTensorType>::value, "PyTensorType must be standard layout");

-static void py_bind_tensor_types(const std::vector<PyTensorType>& tensor_types);
+static void py_bind_tensor_types(const std::vector<PyTensorType> &tensor_types);

-static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+static PyObject *Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
 {
     HANDLE_TH_ERRORS
-    auto& tensor_type = *((PyTensorType*)type);
+    auto &tensor_type = *((PyTensorType *)type);
     if (tensor_type.is_npu) {
-        TORCH_NPU_WARN_ONCE(
-            "Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. "
+        TORCH_NPU_WARN_ONCE("Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. "
             "It's best to use methods such as torch.tensor(data, dtype=*, device='npu') "
             "to create tensors.");
     }
-    TORCH_CHECK_TYPE(
-        !tensor_type.is_npu || c10_npu::device_count() != 0,
-        "type ",
-        tensor_type.name,
+    TORCH_CHECK_TYPE(!tensor_type.is_npu || c10_npu::device_count() != 0, "type ", tensor_type.name,
         " not available. Torch not compiled with npu enabled.", PTA_ERROR(ErrCode::TYPE))
     torch_npu::utils::npu_lazy_init();
-    return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(),
-                                                             tensor_type.get_scalar_type(),
-                                                             args,
-                                                             kwargs));
+    return THPVariable_Wrap(
+        torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), tensor_type.get_scalar_type(), args, kwargs));
     END_HANDLE_TH_ERRORS
 }

-static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg)
+static PyObject *Tensor_instancecheck(PyObject *_self, PyObject *arg)
 {
-  HANDLE_TH_ERRORS
-  auto self = (PyTensorType*)_self;
-  if (THPVariable_Check(arg)) {
-    const auto& var = THPVariable_Unpack(arg);
-
-    if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() &&
-        var.scalar_type() == static_cast<ScalarType>(self->scalar_type)) {
-      Py_RETURN_TRUE;
+    HANDLE_TH_ERRORS
+    auto self = (PyTensorType *)_self;
+    if (THPVariable_Check(arg)) {
+        const auto &var = THPVariable_Unpack(arg);
+
+        if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() &&
+            var.scalar_type() == static_cast<ScalarType>(self->scalar_type)) {
+            Py_RETURN_TRUE;
+        }
     }
-  }
-  Py_RETURN_FALSE;
-  END_HANDLE_TH_ERRORS
+    Py_RETURN_FALSE;
+    END_HANDLE_TH_ERRORS
 }

-PyObject* Tensor_dtype(PyTensorType* self, void *unused)
+PyObject *Tensor_dtype(PyTensorType *self, void *unused)
 {
-  return torch::autograd::utils::wrap(self->dtype);
+    return torch::autograd::utils::wrap(self->dtype);
 }

-PyObject* Tensor_layout(PyTensorType* self, void *unused)
+PyObject *Tensor_layout(PyTensorType *self, void *unused)
 {
-  return torch::autograd::utils::wrap(self->layout);
+    return torch::autograd::utils::wrap(self->layout);
 }

-PyObject* Tensor_is_npu(PyTensorType* self, void *unused)
+PyObject *Tensor_is_npu(PyTensorType *self, void *unused)
 {
-  if (self->is_npu) {
-    Py_RETURN_TRUE;
-  } else {
-    Py_RETURN_FALSE;
-  }
+    if (self->is_npu) {
+        Py_RETURN_TRUE;
+    } else {
+        Py_RETURN_FALSE;
+    }
 }

-PyObject* Tensor_is_sparse(PyTensorType *self, void *unused)
+PyObject *Tensor_is_sparse(PyTensorType *self, void *unused)
 {
-  if (self->layout->layout == at::Layout::Strided) {
-    Py_RETURN_FALSE;
-  } else {
-    Py_RETURN_TRUE;
-  }
+ if (self->layout->layout == at::Layout::Strided) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } } static struct PyMethodDef metaclass_methods[] = { @@ -131,7 +123,7 @@ static struct PyMethodDef metaclass_methods[] = { {nullptr} }; -using getter = PyObject* (*)(PyObject *, void *); +using getter = PyObject *(*)(PyObject *, void *); static struct PyGetSetDef metaclass_properties[] = { {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, @@ -142,46 +134,44 @@ static struct PyGetSetDef metaclass_properties[] = { }; static PyTypeObject metaclass = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch.tensortype", /* tp_name */ - sizeof(PyTypeObject) /* tp_basicsize */ + PyVarObject_HEAD_INIT(nullptr, 0) "torch.tensortype", /* tp_name */ + sizeof(PyTypeObject) /* tp_basicsize */ }; -static void py_initialize_metaclass(PyTypeObject& metaclass) +static void py_initialize_metaclass(PyTypeObject &metaclass) { - metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; - metaclass.tp_methods = metaclass_methods; - metaclass.tp_getset = metaclass_properties; - metaclass.tp_base = &PyType_Type; - if (PyType_Ready(&metaclass) < 0) { - throw python_error(); - } + metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + metaclass.tp_methods = metaclass_methods; + metaclass.tp_getset = metaclass_properties; + metaclass.tp_base = &PyType_Type; + if (PyType_Ready(&metaclass) < 0) { + throw python_error(); + } } static PyTypeObject tensor_type_prototype = { - PyVarObject_HEAD_INIT(&metaclass, 0) - nullptr, /* tp_name */ - sizeof(PyTensorType) /* tp_basicsize */ + PyVarObject_HEAD_INIT(&metaclass, 0) nullptr, /* tp_name */ + sizeof(PyTensorType) /* tp_basicsize */ }; -static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) +static void py_initialize_tensor_type(PyTypeObject &type, const char *name, PyObject *tp_dict) { - // NOTE: we don't use the typical static declaration of PyTypeObject because - // we need to initialize as many types as there are VariableType instances. - // We copy the basic object fields from a prototype definition and initialize - // the remaining fields below. - memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); - // Subclassing from torch.Tensor isn't supported. - // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. - type.tp_flags = Py_TPFLAGS_DEFAULT; - type.tp_name = name; - type.tp_new = Tensor_new; - if (PyType_Ready(&type) < 0) { - throw python_error(); - } - if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { - throw python_error(); - } + // NOTE: we don't use the typical static declaration of PyTypeObject because + // we need to initialize as many types as there are VariableType instances. + // We copy the basic object fields from a prototype definition and initialize + // the remaining fields below. + memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); + // Subclassing from torch.Tensor isn't supported. + // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. + type.tp_flags = Py_TPFLAGS_DEFAULT; + type.tp_name = name; + type.tp_new = Tensor_new; + if (PyType_Ready(&type) < 0) { + throw python_error(); + } + if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { + throw python_error(); + } } static std::string get_module(Backend backend) @@ -204,103 +194,103 @@ static std::string get_module(Backend backend) static std::string get_name(Backend backend, ScalarType scalarType) { - std::ostringstream ss; - ss << get_module(backend) << "." 
<< toString(scalarType) << "Tensor"; - return ss.str(); + std::ostringstream ss; + ss << get_module(backend) << "." << toString(scalarType) << "Tensor"; + return ss.str(); } -static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) +static void set_type(PyTensorType &type_obj, Backend backend, ScalarType scalarType) { - // This field is lazily initialized from backend and scalar_type - type_obj.backend = static_cast(backend); - type_obj.scalar_type = static_cast(scalarType); - type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); - type_obj.dtype = torch::getTHPDtype(scalarType); - type_obj.is_npu = (backend == c10::Backend::PrivateUse1); + // This field is lazily initialized from backend and scalar_type + type_obj.backend = static_cast(backend); + type_obj.scalar_type = static_cast(scalarType); + type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); + type_obj.dtype = torch::getTHPDtype(scalarType); + type_obj.is_npu = (backend == c10::Backend::PrivateUse1); } -static void set_name(PyTensorType& type_obj, const std::string& name) +static void set_name(PyTensorType &type_obj, const std::string &name) { - size_t n = sizeof(type_obj.name); - strncpy(type_obj.name, name.c_str(), n); - type_obj.name[n - 1] = '\0'; + size_t n = sizeof(type_obj.name); + strncpy(type_obj.name, name.c_str(), n); + type_obj.name[n - 1] = '\0'; } static THPObjectPtr get_tensor_dict() { - auto torch = THPObjectPtr(PyImport_ImportModule("torch")); - if (!torch) { - throw python_error(); - } - - auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); - if (!tensor_class) { - throw python_error(); - } - - auto tensor_type = (PyTypeObject*)tensor_class.get(); - TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); - - auto res = THPObjectPtr(PyDict_New()); - if (!res) { - throw python_error(); - } - - if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { - throw python_error(); - } - if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { - throw python_error(); - } - - return res; + auto torch = THPObjectPtr(PyImport_ImportModule("torch")); + if (!torch) { + throw python_error(); + } + + auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); + if (!tensor_class) { + throw python_error(); + } + + auto tensor_type = (PyTypeObject *)tensor_class.get(); + TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); + + auto res = THPObjectPtr(PyDict_New()); + if (!res) { + throw python_error(); + } + + if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { + throw python_error(); + } + if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { + throw python_error(); + } + + return res; } static std::vector tensor_types; -static void initialize_npu_aten_types(std::vector& tensor_types) +static void initialize_npu_aten_types(std::vector &tensor_types) { - // only initialize npu types - auto declared_types = all_declared_types_npu(); - tensor_types.resize(declared_types.size()); - - for (size_t i = 0, end = declared_types.size(); i != end; i++) { - auto& tensor_type = tensor_types[i]; - Backend backend = declared_types[i].first; - ScalarType scalar_type = declared_types[i].second; - set_type(tensor_type, backend, scalar_type); - set_name(tensor_type, get_name(backend, scalar_type)); - } + // only initialize npu types + auto declared_types = all_declared_types_npu(); + tensor_types.resize(declared_types.size()); + + 
for (size_t i = 0, end = declared_types.size(); i != end; i++) { + auto &tensor_type = tensor_types[i]; + Backend backend = declared_types[i].first; + ScalarType scalar_type = declared_types[i].second; + set_type(tensor_type, backend, scalar_type); + set_name(tensor_type, get_name(backend, scalar_type)); + } } void _initialize_python_bindings() { - // Initialize the at::Type* pointers, name, and properties of the PyTensorType - // vector. After this call, the vector must not be resized. - initialize_npu_aten_types(tensor_types); - - // Initialize the Python metaclass for the torch.FloatTensor, etc. types. - // The metaclass handles __instancecheck__ checks and binds the dtype property - // on the type objects. - py_initialize_metaclass(metaclass); - - // Get the tp_dict of the Variable class. We copy function definitions - // onto each Tensor type object so that they can be accessed via e.g. - // `torch.npu.FloatTensor.add`. - auto tensor_dict = get_tensor_dict(); - - // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. - for (auto& tensor_type : tensor_types) { - py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); - } - - // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor - // is added to the `torch_npu` module as `FloatTensor`. Also add all the type - // objects to the set torch_npu._tensor_classes. - py_bind_tensor_types(tensor_types); + // Initialize the at::Type* pointers, name, and properties of the PyTensorType + // vector. After this call, the vector must not be resized. + initialize_npu_aten_types(tensor_types); + + // Initialize the Python metaclass for the torch.FloatTensor, etc. types. + // The metaclass handles __instancecheck__ checks and binds the dtype property + // on the type objects. + py_initialize_metaclass(metaclass); + + // Get the tp_dict of the Variable class. We copy function definitions + // onto each Tensor type object so that they can be accessed via e.g. + // `torch.npu.FloatTensor.add`. + auto tensor_dict = get_tensor_dict(); + + // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. + for (auto &tensor_type : tensor_types) { + py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); + } + + // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor + // is added to the `torch_npu` module as `FloatTensor`. Also add all the type + // objects to the set torch_npu._tensor_classes. + py_bind_tensor_types(tensor_types); } -static void py_bind_tensor_types(const std::vector& tensor_types) +static void py_bind_tensor_types(const std::vector &tensor_types) { auto torch_module = THPObjectPtr(PyImport_ImportModule("torch")); if (!torch_module) { @@ -312,7 +302,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - for (auto& tensor_type : tensor_types) { + for (auto &tensor_type : tensor_types) { auto name = std::string(tensor_type.name); auto idx = name.rfind('.'); auto type_name = name.substr(idx + 1); @@ -323,7 +313,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - PyObject* type_obj = (PyObject*)&tensor_type; + PyObject *type_obj = (PyObject *)&tensor_type; Py_INCREF(type_obj); if (PyModule_AddObject(module_obj.get(), type_name.c_str(), type_obj) < 0) { throw python_error(); @@ -335,12 +325,12 @@ static void py_bind_tensor_types(const std::vector& tensor_types) } // Callback for python part. 
Used for additional initialization of python classes -static PyObject* THPModule_initExtension(PyObject *_unused, PyObject *noargs) +static PyObject *THPModule_initExtension(PyObject *_unused, PyObject *noargs) { - HANDLE_TH_ERRORS - _initialize_python_bindings(); - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS + HANDLE_TH_ERRORS + _initialize_python_bindings(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } // autograd methods on torch._C @@ -349,9 +339,9 @@ static PyMethodDef TorchNpuExtensionMethods[] = { {nullptr, nullptr, 0, nullptr} }; -PyMethodDef* npu_extension_functions() +PyMethodDef *npu_extension_functions() { - return TorchNpuExtensionMethods; + return TorchNpuExtensionMethods; } } } diff --git a/torch_npu/distributed/fsdp/_add_fsdp_patch.py b/torch_npu/distributed/fsdp/_add_fsdp_patch.py index 7405620686897a2199333f9eb8580c113443ee0b..62ed66cac6fa1e1e83a20eb35c67ba68aec7bb5f 100644 --- a/torch_npu/distributed/fsdp/_add_fsdp_patch.py +++ b/torch_npu/distributed/fsdp/_add_fsdp_patch.py @@ -122,7 +122,8 @@ def foreach_all_gather_copy_out_npu( # Copy to a temporary and then chunk-cat into the final all-gather # output tensors param_all_gather_outputs = [ - torch.empty_like(t) for t in param_all_gather_outputs + torch.empty_like(t) + for t in param_all_gather_outputs ] shard_i_copy_infos.append((fsdp_param, param_all_gather_outputs)) split_with_sizes_out.extend(param_all_gather_outputs)
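
The "returns new reference" annotations kept throughout the shim header imply the usual caller-side ownership pattern: every handle written through an AtenTensorHandle* out-parameter must eventually be released with aoti_torch_delete_tensor_object, which the header comments suggest doing through an RAII wrapper such as RAIIAtenTensorHandle. A minimal sketch of that pattern, assuming the shim header above is included, that AOTITorchError follows the zero-on-success convention, and with ScopedAtenTensor as a hypothetical stand-in for the real wrapper:

#include <utility>

// Hypothetical owning wrapper: releases the new reference on scope exit.
class ScopedAtenTensor {
public:
    ScopedAtenTensor() = default;
    ScopedAtenTensor(const ScopedAtenTensor &) = delete;
    ScopedAtenTensor &operator=(const ScopedAtenTensor &) = delete;
    ScopedAtenTensor(ScopedAtenTensor &&other) noexcept : handle_(std::exchange(other.handle_, nullptr)) {}
    ~ScopedAtenTensor()
    {
        if (handle_ != nullptr) {
            aoti_torch_delete_tensor_object(handle_);
        }
    }
    AtenTensorHandle get() const { return handle_; }
    AtenTensorHandle *out() { return &handle_; }  // pass to functions documented as "returns new reference"

private:
    AtenTensorHandle handle_ = nullptr;
};

// Usage sketch: deep-copy a handle and let the wrapper release it when it goes out of scope.
// Assumes AOTITorchError is zero on success.
inline ScopedAtenTensor clone_tensor(AtenTensorHandle self)
{
    ScopedAtenTensor cloned;
    AOTI_TORCH_CHECK(aoti_torch_clone(self, cloned.out()) == 0, "aoti_torch_clone failed");
    return cloned;
}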
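
For reference, the reformatted DEFINE_DTYPE_SPECIALIZATION macro maps a C++ type onto the matching aoti_torch_dtype_* query. The DEFINE_DTYPE_SPECIALIZATION(bool, bool) instantiation visible in the hunk context would expand to roughly the following sketch (assuming the aoti_torch_dtype_bool() declaration that lives elsewhere in this header):

template <> inline int32_t aoti_torch_dtype<bool>()
{
    // Resolves the dtype code for bool via the shim's per-dtype query function.
    return aoti_torch_dtype_bool();
}

Generated wrapper code can then ask for a dtype code generically, e.g. int32_t code = aoti_torch_dtype<bool>();, instead of hard-coding integer dtype values.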