From ebde3a4e94ee0f4d15fe4862923d9506134783f8 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 02:46:38 +0000 Subject: [PATCH 01/11] model.h Signed-off-by: zjx.com --- torch_npu/csrc/inductor/aoti_runtime/model.h | 660 ++++++++++--------- 1 file changed, 332 insertions(+), 328 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_runtime/model.h b/torch_npu/csrc/inductor/aoti_runtime/model.h index bf04ddc2613..fe017bc59c6 100644 --- a/torch_npu/csrc/inductor/aoti_runtime/model.h +++ b/torch_npu/csrc/inductor/aoti_runtime/model.h @@ -22,12 +22,12 @@ #endif #define AOTI_RUNTIME_CHECK(EXPR, MSG) \ - do { \ +do { \ bool ok = EXPR; \ if (!ok) { \ - throw std::runtime_error(MSG); \ + throw std::runtime_error(MSG); \ } \ - } while (0) +} while (0) // At codegen time, we write out a binary file called constants.bin. // We then turn the raw binary to an object file that exposes this @@ -49,10 +49,10 @@ namespace { using GPUPtr = std::unique_ptr>; GPUPtr RAII_gpuMalloc(size_t num_bytes) { - void* data_ptr; - AOTI_RUNTIME_DEVICE_CHECK(cudaMalloc((void**)&data_ptr, num_bytes)); - auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(cudaFree(ptr)); }; - return GPUPtr(data_ptr, deleter); + void* data_ptr; + AOTI_RUNTIME_DEVICE_CHECK(cudaMalloc((void**)&data_ptr, num_bytes)); + auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(cudaFree(ptr)); }; + return GPUPtr(data_ptr, deleter); } #endif // USE_CUDA @@ -62,14 +62,14 @@ GPUPtr RAII_gpuMalloc(size_t num_bytes) { using NPUPtr = std::unique_ptr>; NPUPtr RAII_npuMalloc(size_t num_bytes) { - void* data_ptr; - // aclrtMalloc doesn't support allocate 0-bytes. In this case, - // e.g, model has no weight, we should do padding. - size_t padding_bytes = 32; - if (num_bytes == 0) num_bytes = padding_bytes; - AOTI_RUNTIME_DEVICE_CHECK(aclrtMalloc((void**)&data_ptr, num_bytes, ACL_MEM_MALLOC_HUGE_FIRST)); - auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(aclrtFree(ptr)); }; - return NPUPtr(data_ptr, deleter); + void* data_ptr; + // aclrtMalloc doesn't support allocate 0-bytes. In this case, + // e.g, model has no weight, we should do padding. 
+ size_t padding_bytes = 32; + if (num_bytes == 0) num_bytes = padding_bytes; + AOTI_RUNTIME_DEVICE_CHECK(aclrtMalloc((void**)&data_ptr, num_bytes, ACL_MEM_MALLOC_HUGE_FIRST)); + auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(aclrtFree(ptr)); }; + return NPUPtr(data_ptr, deleter); } #endif // USE_NPU @@ -79,11 +79,11 @@ NPUPtr RAII_npuMalloc(size_t num_bytes) { using GPUPtr = std::unique_ptr>; GPUPtr RAII_gpuMalloc(size_t num_bytes) { - sycl::queue* queue_ptr = nullptr; - aoti_torch_get_current_sycl_queue((void**)&queue_ptr); - void* data_ptr = sycl::malloc_device(num_bytes, *queue_ptr); - auto deleter = [queue_ptr](void* ptr) { sycl::free(ptr, *queue_ptr); }; - return GPUPtr(data_ptr, deleter); + sycl::queue* queue_ptr = nullptr; + aoti_torch_get_current_sycl_queue((void**)&queue_ptr); + void* data_ptr = sycl::malloc_device(num_bytes, *queue_ptr); + auto deleter = [queue_ptr](void* ptr) { sycl::free(ptr, *queue_ptr); }; + return GPUPtr(data_ptr, deleter); } #endif // USE_CUDA @@ -92,11 +92,11 @@ GPUPtr RAII_gpuMalloc(size_t num_bytes) { namespace torch::aot_inductor { enum ConstantType : uint8_t { - Unknown = 0, - Parameter = 1, - Buffer = 2, - TensorConstant = 3, - FoldedConstant = 4, + Unknown = 0, + Parameter = 1, + Buffer = 2, + ensorConstant = 3, + FoldedConstant = 4, }; using ConstantMap = std::unordered_map; @@ -107,32 +107,36 @@ inline void parse_device_str( const std::string& device_str, int32_t& device_type, int32_t& device_idx) { - std::regex re("(cpu|cuda|xpu|npu)(:([0-9]+))?"); - std::smatch sm; - bool matched = std::regex_match(device_str, sm, re); - AOTI_RUNTIME_CHECK(matched, "Invalid device: " + device_str); - - if (sm[1].str() == "cpu") { - device_type = aoti_torch_device_type_cpu(); - } else if (sm[1].str() == "cuda") { - device_type = aoti_torch_device_type_cuda(); + std::regex re("(cpu|cuda|xpu|npu)(:([0-9]+))?"); + std::smatch sm; + bool matched = std::regex_match(device_str, sm, re); + AOTI_RUNTIME_CHECK(matched, "Invalid device: " + device_str); + + if (sm[1].str() == "cpu") { + device_type = aoti_torch_device_type_cpu(); + } + else if (sm[1].str() == "cuda") { + device_type = aoti_torch_device_type_cuda(); #ifdef USE_XPU - } else if (sm[1].str() == "xpu") { - device_type = aoti_torch_device_type_xpu(); + } + else if (sm[1].str() == "xpu") { + device_type = aoti_torch_device_type_xpu(); #endif #ifdef USE_NPU - } else if (sm[1].str() == "npu") { - device_type = aoti_torch_device_type_npu(); + } + else if (sm[1].str() == "npu") { + device_type = aoti_torch_device_type_npu(); #endif - } else { - AOTI_RUNTIME_CHECK(false, "Invalid device: " + device_str); - } - int64_t device_sub = 3; - if (sm[device_sub].matched) { - device_idx = stoi(sm[device_sub].str()); - } else { - device_idx = -1; - } + } + else { + AOTI_RUNTIME_CHECK(false, "Invalid device: " + device_str); + } + int64_t device_sub = 3; + if (sm[device_sub].matched) { + device_idx = stoi(sm[device_sub].str()); + } else { + device_idx = -1; + } } // Defines the base class for AOTInductorModel, which is generated by the @@ -159,83 +163,83 @@ class AOTInductorModelBase { #ifdef USE_CUDA if (device_idx_ == -1) { - AOTI_RUNTIME_DEVICE_CHECK(cudaGetDevice(&device_idx_)); + AOTI_RUNTIME_DEVICE_CHECK(cudaGetDevice(&device_idx_)); } else { // If device_idx_ is passed in, we need to set the current device to it - AOTI_RUNTIME_DEVICE_CHECK(cudaSetDevice(device_idx_)); + AOTI_RUNTIME_DEVICE_CHECK(cudaSetDevice(device_idx_)); } #endif // USE_CUDA #ifdef USE_XPU if (device_idx_ == -1) { - 
aoti_torch_get_current_xpu_device(&device_idx_); + aoti_torch_get_current_xpu_device(&device_idx_); } else { - aoti_torch_set_current_xpu_device(device_idx_); + aoti_torch_set_current_xpu_device(device_idx_); } #endif // USE_XPU #ifdef USE_NPU if (device_idx_ == -1) { - AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(0)); - AOTI_RUNTIME_DEVICE_CHECK(aclrtGetDevice(&device_idx_)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(0)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtGetDevice(&device_idx_)); } else { - AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(device_idx_)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(device_idx_)); } #endif // USE_NPU } // NOLINTNEXTLINE(modernize-use-equals-default) - ~AOTInductorModelBase() { +~AOTInductorModelBase() { #ifdef USE_CUDA if (run_finished_) { - auto code = cudaEventDestroy(*run_finished_); - if (code != cudaSuccess) { - std::cerr << "Failed to destroy CUDA event in AOTInductor model: " - << cudaGetErrorString(code) << std::endl; - } + auto code = cudaEventDestroy(*run_finished_); + if (code != cudaSuccess) { + std::cerr << "Failed to destroy CUDA event in AOTInductor model: " + << cudaGetErrorString(code) << std::endl; + } } #endif // USE_CUDA #ifdef USE_XPU if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; + (*run_finished_)->wait_and_throw(); + delete *run_finished_; } #endif // USE_XPU #ifdef USE_NPU if (run_finished_) { - auto code = aclrtDestroyEvent(*run_finished_); - if (code != ACL_SUCCESS) { - std::cerr << "Failed to destroy NPU event in AOTInductor model erorr code: " - << code << std::endl; - } + auto code = aclrtDestroyEvent(*run_finished_); + if (code != ACL_SUCCESS) { + std::cerr << "Failed to destroy NPU event in AOTInductor model erorr code: " + << code << std::endl; + } } #endif // USE_NPU - } +} - AOTInductorModelBase(AOTInductorModelBase&&) = delete; - AOTInductorModelBase& operator=(AOTInductorModelBase&&) = delete; - AOTInductorModelBase(const AOTInductorModelBase&) = delete; - AOTInductorModelBase& operator=(const AOTInductorModelBase&) = delete; +AOTInductorModelBase(AOTInductorModelBase&&) = delete; +AOTInductorModelBase& operator=(AOTInductorModelBase&&) = delete; +AOTInductorModelBase(const AOTInductorModelBase&) = delete; +AOTInductorModelBase& operator=(const AOTInductorModelBase&) = delete; - void run( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles +void run( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles // will be stolen by the caller; the array itself is // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor) { + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor) { #ifdef USE_CUDA if (!run_finished_) { - cudaEvent_t run_finished; - AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); - run_finished_.emplace(run_finished); + cudaEvent_t run_finished; + AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); + run_finished_.emplace(run_finished); } #elif defined(USE_XPU) if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - run_finished_.reset(); + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + run_finished_.reset(); } #elif defined(USE_NPU) if (!run_finished_) { @@ -262,27 +266,27 @@ class AOTInductorModelBase { #endif // 
USE_CUDA } - std::unordered_map run_const_fold( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization = false) { +std::unordered_map run_const_fold( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization = false) { #ifdef USE_CUDA if (!run_finished_) { - cudaEvent_t run_finished; - AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); - run_finished_.emplace(run_finished); + cudaEvent_t run_finished; + AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); + run_finished_.emplace(run_finished); } #elif defined(USE_XPU) if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - run_finished_.reset(); + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + run_finished_.reset(); } #elif defined(USE_NPU) if (!run_finished_) { - aclrtEvent run_finished; - AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); - run_finished_.emplace(run_finished); + aclrtEvent run_finished; + AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); + run_finished_.emplace(run_finished); } #else // !USE_CUDA && !USE_XPU run_finished_ = false; @@ -305,119 +309,119 @@ class AOTInductorModelBase { #endif // USE_CUDA return folded_constants; - } +} - void load_constants() { +void load_constants() { size_t num_constants = this->num_constants(); constants_map_->reserve(num_constants); std::vector constants_internal_offset(num_constants); if (device_type_ != aoti_torch_device_type_cpu()) { - size_t blob_size = 0; - compute_gpu_constant_blob(blob_size, constants_internal_offset); + size_t blob_size = 0; + compute_gpu_constant_blob(blob_size, constants_internal_offset); #if defined(USE_CUDA) || defined(USE_XPU) - constant_blob_ = RAII_gpuMalloc(blob_size); + constant_blob_ = RAII_gpuMalloc(blob_size); #elif defined(USE_NPU) constant_blob_ = RAII_npuMalloc(blob_size); #endif } if (!include_weights) { - return; + return; } size_t bytes_read = 0; for (size_t i = 0; i < num_constants; i++) { - bool from_folded = this->constant_from_folded(i); + bool from_folded = this->constant_from_folded(i); #if not defined(USE_XPU) && not defined(USE_CUDA) && not defined(USE_NPU) - if (from_folded) { + if (from_folded) { // We do not reallocate and copy for CPU. continue; - } + } #endif // USE_CUDA - std::string name = this->constant_name(i); - size_t data_size = this->constant_data_size(i); - uint8_t* internal_ptr = (data_size != 0) - ? constant_ptr( - constants_internal_offset[i], - bytes_read, - data_size, - from_folded) + std::string name = this->constant_name(i); + size_t data_size = this->constant_data_size(i); + uint8_t* internal_ptr = (data_size != 0) + ? constant_ptr( + constants_internal_offset[i], + bytes_read, + data_size, + from_folded) : nullptr; - bytes_read += data_size; - - // Create at::Tensor from copied memory. - auto dtype = this->constant_dtype(i); - auto ndim = this->constant_ndim(i); - auto size = this->constant_shape(i); - auto stride = this->constant_stride(i); - auto offset = this->constant_offset(i); - auto layout = this->constant_layout(i); - auto opaque_metadata_ptr = this->opaque_metadata(i); - auto opaque_metadata_size = this->opaque_metadata_size(i); - - AtenTensorHandle tensor_handle = nullptr; + bytes_read += data_size; + + // Create at::Tensor from copied memory. 
+ auto dtype = this->constant_dtype(i); + auto ndim = this->constant_ndim(i); + auto size = this->constant_shape(i); + auto stride = this->constant_stride(i); + auto offset = this->constant_offset(i); + auto layout = this->constant_layout(i); + auto opaque_metadata_ptr = this->opaque_metadata(i); + auto opaque_metadata_size = this->opaque_metadata_size(i); + + AtenTensorHandle tensor_handle = nullptr; #ifdef AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 // When opaque_metadata_size is not 0, we need to have the // aoti_torch_create_tensor_from_blob_npu_v2 available - AOTI_RUNTIME_CHECK( - opaque_metadata_size == 0, - "Expect opaque_metadata_size to be 0 when AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 is defined"); - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( - internal_ptr, - ndim, - size, - stride, - offset, - dtype, - device_type_, - device_idx_, - &tensor_handle)); + AOTI_RUNTIME_CHECK( + opaque_metadata_size == 0, + "Expect opaque_metadata_size to be 0 when AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 is defined"); + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( + internal_ptr, + ndim, + size, + stride, + offset, + dtype, + device_type_, + device_idx_, + &tensor_handle)); #else - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( - internal_ptr, - ndim, - size, - stride, - offset, - dtype, - device_type_, - device_idx_, - &tensor_handle, - layout, - opaque_metadata_ptr, - opaque_metadata_size)); + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( + internal_ptr, + ndim, + size, + stride, + offset, + dtype, + device_type_, + device_idx_, + &tensor_handle, + layout, + opaque_metadata_ptr, + opaque_metadata_size)); #endif // AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 - constants_map_->emplace(std::move(name), tensor_handle); + constants_map_->emplace(std::move(name), tensor_handle); } if (constants_map_) { - this->update_constants_array_from_map(); + this->update_constants_array_from_map(); } - } +} #if defined(USE_CUDA) || defined(USE_XPU) - GPUPtr&& release_constant_blob() { - return std::move(constant_blob_); - } + GPUPtr&& release_constant_blob() { + return std::move(constant_blob_); + } #endif #ifdef USE_NPU - NPUPtr&& release_constant_blob() { - return std::move(constant_blob_); - } + NPUPtr&& release_constant_blob() { + return std::move(constant_blob_); + } #endif - std::shared_ptr> get_constants_array() { +std::shared_ptr> get_constants_array() { return constants_; - } +} - int32_t get_device_idx() const { +int32_t get_device_idx() const { return device_idx_; - } +} - uint8_t* constant_ptr( - size_t constant_offset, - size_t bytes_read, - size_t data_size, - bool skip_copy) { +uint8_t* constant_ptr( + size_t constant_offset, + size_t bytes_read, + size_t data_size, + bool skip_copy) { #if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) auto* constants_ptr = static_cast(constant_blob_.get()); uint8_t* internal_ptr = constants_ptr + constant_offset; @@ -454,154 +458,154 @@ class AOTInductorModelBase { #endif // USE_CUDA } - void compute_gpu_constant_blob( - size_t& blob_size, - std::vector& constants_internal_offset) { +void compute_gpu_constant_blob( + size_t& blob_size, + std::vector& constants_internal_offset) { #if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) size_t num_constants = this->num_constants(); // Compute required blob size with 64-alignment if on GPU. 
blob_size = 0; for (size_t i = 0; i < num_constants; i++) { - size_t data_size = this->constant_data_size(i); - if (data_size % AOTI_CONST_GPU_ALIGNMENT) { - data_size = AOTI_CONST_GPU_ALIGNMENT + + size_t data_size = this->constant_data_size(i); + if (data_size % AOTI_CONST_GPU_ALIGNMENT) { + data_size = AOTI_CONST_GPU_ALIGNMENT + (data_size / AOTI_CONST_GPU_ALIGNMENT) * AOTI_CONST_GPU_ALIGNMENT; - } - constants_internal_offset[i] = blob_size; - blob_size += data_size; + } + constants_internal_offset[i] = blob_size; + blob_size += data_size; } #endif // USE_CUDA - } +} - size_t num_inputs() const { +size_t num_inputs() const { return inputs_info_.size(); - } +} - size_t num_outputs() const { +size_t num_outputs() const { return outputs_info_.size(); - } +} - size_t num_constants() const { +size_t num_constants() const { return constants_info_.size(); - } +} - const char* input_name(int64_t idx) const { +const char* input_name(int64_t idx) const { return inputs_info_.at(idx).name; - } +} - const char* output_name(int64_t idx) const { +const char* output_name(int64_t idx) const { return outputs_info_.at(idx).name; - } +} - const char* constant_name(int64_t idx) const { +const char* constant_name(int64_t idx) const { return constants_info_.at(idx).name; - } +} - size_t constant_ndim(int64_t idx) { +size_t constant_ndim(int64_t idx) { return constants_info_.at(idx).shape.size(); - } +} - const int64_t* constant_shape(int64_t idx) const { +const int64_t* constant_shape(int64_t idx) const { return constants_info_.at(idx).shape.data(); - } +} - const int64_t* constant_stride(int64_t idx) const { +const int64_t* constant_stride(int64_t idx) const { return constants_info_.at(idx).stride.data(); - } +} - int32_t constant_dtype(int64_t idx) const { +int32_t constant_dtype(int64_t idx) const { return constants_info_.at(idx).dtype; - } +} - int32_t constant_layout(int64_t idx) const { +int32_t constant_layout(int64_t idx) const { return constants_info_.at(idx).layout; - } +} - size_t constant_offset(int64_t idx) const { +size_t constant_offset(int64_t idx) const { return constants_info_.at(idx).offset; - } +} - size_t constant_data_size(int64_t idx) const { +size_t constant_data_size(int64_t idx) const { return constants_info_.at(idx).data_size; - } +} - const char* constant_original_fqn(int64_t idx) const { +const char* constant_original_fqn(int64_t idx) const { return constants_info_.at(idx).original_fqn; - } +} - const uint8_t* opaque_metadata(int64_t idx) const { +const uint8_t* opaque_metadata(int64_t idx) const { return constants_info_.at(idx).opaque_metadata.data(); - } +} - size_t opaque_metadata_size(int64_t idx) { +size_t opaque_metadata_size(int64_t idx) { return constants_info_.at(idx).opaque_metadata.size(); - } +} - bool constant_from_folded(int64_t idx) const { +bool constant_from_folded(int64_t idx) const { return constants_info_.at(idx).from_folded; - } +} - int32_t constant_type(int64_t idx) const { +int32_t constant_type(int64_t idx) const { return constants_info_.at(idx).type; - } +} - const char* get_in_spec() const { +const char* get_in_spec() const { return in_spec_.c_str(); - } +} - const char* get_out_spec() const { +const char* get_out_spec() const { return out_spec_.c_str(); - } +} - void update_constants_array_from_map() { +void update_constants_array_from_map() { if (!constants_map_) { - throw std::runtime_error{ - "constants_map_ was not ready when constants_ is trying to be constructed from it!"}; + throw std::runtime_error{ + "constants_map_ was not ready when 
constants_ is trying to be constructed from it!"}; } if (!constants_) { - constants_ = - std::make_shared>(constants_info_.size()); + constants_ = + std::make_shared>(constants_info_.size()); } else { - constants_->resize(constants_info_.size()); + constants_->resize(constants_info_.size()); } int idx = 0; for (const auto& info : constants_info_) { - const auto it = constants_map_->find(info.name); - if (it != constants_map_->end()) { - constants_->at(idx) = ConstantHandle(it->second); - } - idx++; + const auto it = constants_map_->find(info.name); + if (it != constants_map_->end()) { + constants_->at(idx) = ConstantHandle(it->second); + } + idx++; } - } +} - void update_constants_map( - std::shared_ptr constants_map, - bool remap_constants_array = true) { - constants_map_ = std::move(constants_map); - if (remap_constants_array) { - update_constants_array_from_map(); - } - } +void update_constants_map( + std::shared_ptr constants_map, + bool remap_constants_array = true) { + constants_map_ = std::move(constants_map); + if (remap_constants_array) { + update_constants_array_from_map(); + } +} // This function allows us to update the constants_ that is used to look up // the corresponding constant tensor during runtime. - void update_constants_array( - std::shared_ptr> constants_array) { - constants_ = std::move(constants_array); - } +void update_constants_array( + std::shared_ptr> constants_array) { + constants_ = std::move(constants_array); +} /// Returns true if the model is complete. - bool is_finished() { +bool is_finished() { #ifdef USE_CUDA if (!run_finished_) { - throw std::runtime_error{"Model CUDA event was not initialized"}; + throw std::runtime_error{"Model CUDA event was not initialized"}; } auto event_status = cudaEventQuery(*run_finished_); if (event_status == cudaSuccess) { - return true; + return true; } else if (event_status == cudaErrorNotReady) { - return false; + return false; } throw std::runtime_error( @@ -610,19 +614,19 @@ class AOTInductorModelBase { #elif defined(USE_NPU) if (!run_finished_) { - throw std::runtime_error{"Model NPU event was not initialized"}; + throw std::runtime_error{"Model NPU event was not initialized"}; } aclrtEventRecordedStatus recordStatus = ACL_EVENT_RECORDED_STATUS_NOT_READY; AOTI_RUNTIME_DEVICE_CHECK(aclrtQueryEventStatus(*run_finished_, &recordStatus)); if (recordStatus == ACL_EVENT_RECORDED_STATUS_COMPLETE) { - return true; + return true; } else { - return false; + return false; } #elif defined(USE_XPU) if (!run_finished_) { - throw std::runtime_error{"Model XPU event was not initialized"}; + throw std::runtime_error{"Model XPU event was not initialized"}; } using namespace sycl::info; return (*run_finished_)->get_info() == @@ -631,24 +635,24 @@ class AOTInductorModelBase { #else // !USE_CUDA && !USE_XPU return run_finished_; #endif // USE_CUDA - } +} /// Synchronizes completion event. 
- void wait_for_completion() { +void wait_for_completion() { #ifdef USE_CUDA if (!run_finished_) { - throw std::runtime_error{"Model event was not initialized"}; + throw std::runtime_error{"Model event was not initialized"}; } AOTI_RUNTIME_DEVICE_CHECK(cudaEventSynchronize(*run_finished_)); #endif // USE_CUDA #ifdef USE_XPU if (!run_finished_) { - throw std::runtime_error{"Model event was not initialized"}; + throw std::runtime_error{"Model event was not initialized"}; } (*run_finished_)->wait_and_throw(); #endif - } +} protected: uint8_t* _get_constants_start() { @@ -657,7 +661,7 @@ class AOTInductorModelBase { return const_cast(_binary_constants_bin_start); #else if (self_mmap) { - return self_mmap; + return self_mmap; } Dl_info dl_info; // get pointer to constant which are appended to the binary @@ -691,11 +695,11 @@ class AOTInductorModelBase { return self_mmap; #endif } - struct ParamInfo { +struct ParamInfo { const char* name = nullptr; - }; +}; - struct ConstInfo { +struct ConstInfo { const char* name = nullptr; std::vector shape; std::vector stride; @@ -708,29 +712,29 @@ class AOTInductorModelBase { const char* original_fqn = nullptr; bool from_folded{}; int32_t type{}; - }; +}; - std::vector inputs_info_; - std::vector outputs_info_; - std::vector constants_info_; - std::string in_spec_; - std::string out_spec_; +std::vector inputs_info_; +std::vector outputs_info_; +std::vector constants_info_; +std::string in_spec_; +std::string out_spec_; - std::shared_ptr constants_map_; - std::shared_ptr> constants_; +std::shared_ptr constants_map_; +std::shared_ptr> constants_; #if defined(USE_CUDA) || defined(USE_XPU) // Holds the blob storage for constants' at::Tensor for CUDA. - GPUPtr constant_blob_; + GPUPtr constant_blob_; #endif // USE_CUDA #ifdef USE_NPU // Holds the blob storage for constants' at::Tensor for CUDA. - NPUPtr constant_blob_; + NPUPtr constant_blob_; #endif // USE_NPU #ifdef USE_MMAP_SELF - uint8_t* self_mmap = NULL; + uint8_t* self_mmap = NULL; #endif // A directory with CUDA binary files, e.g. compiled kernels, etc. @@ -744,18 +748,18 @@ class AOTInductorModelBase { // Record if the model finishes an inference run so that its owning // AOTModelContainer can re-use this instance. #ifdef USE_CUDA - std::optional run_finished_; + std::optional run_finished_; #elif defined(USE_XPU) - std::optional run_finished_; + std::optional run_finished_; #elif defined(USE_NPU) - std::optional run_finished_; + std::optional run_finished_; #else // !USE_CUDA - bool run_finished_{}; + bool run_finished_{}; #endif // Generated model uses this device index to create CUDA guards. - int32_t device_type_{}; - int32_t device_idx_{}; + int32_t device_type_{}; + int32_t device_idx_{}; }; // Codegen-ed classes can derive from this to keep pointers to loaded kernels. 
@@ -765,55 +769,55 @@ class AOTInductorModelKernelsBase { }; class AOTInductorModel : public AOTInductorModelBase { - public: - AOTInductorModel( - std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir, - bool include_weights = true); - - std::unordered_map const_run_impl( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization = false); - - void _const_run_impl( - std::vector& output_handles, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - void run_impl( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - template - Outputs run_impl_minimal_arrayref_interface( - const Inputs& inputs, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - static std::unique_ptr Create( - std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) { - return std::make_unique( - std::move(constants_map), - std::move(constants_array), - device_str, - std::move(cubin_dir)); - } + public: + AOTInductorModel( + std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir, + bool include_weights = true); + + std::unordered_map const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization = false); + + void _const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + void run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; + //handles are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; + //handles will be stolen by the caller; + //the array itself is borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + template + Outputs run_impl_minimal_arrayref_interface( + const Inputs& inputs, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + static std::unique_ptr Create( + std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir) { + return std::make_unique( + std::move(constants_map), + std::move(constants_array), + device_str, + std::move(cubin_dir)); + } - private: - std::unique_ptr kernels_; + private: + std::unique_ptr kernels_; }; } // namespace torch::aot_inductor -- Gitee From 83c445cc52ba1904ee0eaef891a6d315c7d29d82 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 05:07:50 +0000 Subject: [PATCH 02/11] model Signed-off-by: zjx.com --- torch_npu/csrc/inductor/aoti_runtime/model.h | 1013 +++++++++--------- 1 file changed, 497 insertions(+), 516 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_runtime/model.h b/torch_npu/csrc/inductor/aoti_runtime/model.h index fe017bc59c6..644b61abcc8 100644 --- a/torch_npu/csrc/inductor/aoti_runtime/model.h +++ b/torch_npu/csrc/inductor/aoti_runtime/model.h @@ -22,12 +22,12 @@ #endif #define AOTI_RUNTIME_CHECK(EXPR, MSG) \ -do { \ - bool ok = EXPR; \ - if (!ok) { \ - throw std::runtime_error(MSG); \ - } \ -} while (0) + do { \ + bool ok = EXPR; \ + if 
(!ok) { \ + throw std::runtime_error(MSG); \ + } \ + } while (0) // At codegen time, we write out a binary file called constants.bin. // We then turn the raw binary to an object file that exposes this @@ -48,7 +48,8 @@ namespace { using GPUPtr = std::unique_ptr>; -GPUPtr RAII_gpuMalloc(size_t num_bytes) { +GPUPtr RAII_gpuMalloc(size_t num_bytes) +{ void* data_ptr; AOTI_RUNTIME_DEVICE_CHECK(cudaMalloc((void**)&data_ptr, num_bytes)); auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(cudaFree(ptr)); }; @@ -61,12 +62,15 @@ GPUPtr RAII_gpuMalloc(size_t num_bytes) { using NPUPtr = std::unique_ptr>; -NPUPtr RAII_npuMalloc(size_t num_bytes) { +NPUPtr RAII_npuMalloc(size_t num_bytes) +{ void* data_ptr; // aclrtMalloc doesn't support allocate 0-bytes. In this case, // e.g, model has no weight, we should do padding. size_t padding_bytes = 32; - if (num_bytes == 0) num_bytes = padding_bytes; + if (num_bytes == 0) { + num_bytes = padding_bytes; + } AOTI_RUNTIME_DEVICE_CHECK(aclrtMalloc((void**)&data_ptr, num_bytes, ACL_MEM_MALLOC_HUGE_FIRST)); auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(aclrtFree(ptr)); }; return NPUPtr(data_ptr, deleter); @@ -78,7 +82,8 @@ NPUPtr RAII_npuMalloc(size_t num_bytes) { using GPUPtr = std::unique_ptr>; -GPUPtr RAII_gpuMalloc(size_t num_bytes) { +GPUPtr RAII_gpuMalloc(size_t num_bytes) +{ sycl::queue* queue_ptr = nullptr; aoti_torch_get_current_sycl_queue((void**)&queue_ptr); void* data_ptr = sycl::malloc_device(num_bytes, *queue_ptr); @@ -95,7 +100,7 @@ enum ConstantType : uint8_t { Unknown = 0, Parameter = 1, Buffer = 2, - ensorConstant = 3, + TensorConstant = 3, FoldedConstant = 4, }; @@ -106,7 +111,8 @@ using ConstantMap = std::unordered_map; inline void parse_device_str( const std::string& device_str, int32_t& device_type, - int32_t& device_idx) { + int32_t& device_idx) +{ std::regex re("(cpu|cuda|xpu|npu)(:([0-9]+))?"); std::smatch sm; bool matched = std::regex_match(device_str, sm, re); @@ -114,21 +120,17 @@ inline void parse_device_str( if (sm[1].str() == "cpu") { device_type = aoti_torch_device_type_cpu(); - } - else if (sm[1].str() == "cuda") { + } else if (sm[1].str() == "cuda") { device_type = aoti_torch_device_type_cuda(); #ifdef USE_XPU - } - else if (sm[1].str() == "xpu") { + } else if (sm[1].str() == "xpu") { device_type = aoti_torch_device_type_xpu(); #endif #ifdef USE_NPU - } - else if (sm[1].str() == "npu") { + } else if (sm[1].str() == "npu") { device_type = aoti_torch_device_type_npu(); #endif - } - else { + } else { AOTI_RUNTIME_CHECK(false, "Invalid device: " + device_str); } int64_t device_sub = 3; @@ -146,471 +148,446 @@ inline void parse_device_str( // methods such as run_impl. 
template class AOTInductorModelBase { - public: - AOTInductorModelBase( - size_t num_inputs, - size_t num_outputs, - size_t num_constants, - const std::string& device_str, - std::optional cubin_dir, - bool include_weights = true) - : inputs_info_(num_inputs), - outputs_info_(num_outputs), - constants_info_(num_constants), - cubin_dir_(std::move(cubin_dir)), - include_weights(include_weights) { - parse_device_str(device_str, device_type_, device_idx_); +public: + AOTInductorModelBase( + size_t num_inputs, + size_t num_outputs, + size_t num_constants, + const std::string& device_str, + std::optional cubin_dir, + bool include_weights = true) + : inputs_info_(num_inputs), + outputs_info_(num_outputs), + constants_info_(num_constants), + cubin_dir_(std::move(cubin_dir)), + include_weights(include_weights) + { + parse_device_str(device_str, device_type_, device_idx_); #ifdef USE_CUDA - if (device_idx_ == -1) { - AOTI_RUNTIME_DEVICE_CHECK(cudaGetDevice(&device_idx_)); - } else { - // If device_idx_ is passed in, we need to set the current device to it - AOTI_RUNTIME_DEVICE_CHECK(cudaSetDevice(device_idx_)); - } + if (device_idx_ == -1) { + AOTI_RUNTIME_DEVICE_CHECK(cudaGetDevice(&device_idx_)); + } else { + // If device_idx_ is passed in, we need to set the current device to it + AOTI_RUNTIME_DEVICE_CHECK(cudaSetDevice(device_idx_)); + } #endif // USE_CUDA #ifdef USE_XPU - if (device_idx_ == -1) { - aoti_torch_get_current_xpu_device(&device_idx_); - } else { - aoti_torch_set_current_xpu_device(device_idx_); - } + if (device_idx_ == -1) { + aoti_torch_get_current_xpu_device(&device_idx_); + } else { + aoti_torch_set_current_xpu_device(device_idx_); + } #endif // USE_XPU #ifdef USE_NPU - if (device_idx_ == -1) { - AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(0)); - AOTI_RUNTIME_DEVICE_CHECK(aclrtGetDevice(&device_idx_)); - } else { - AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(device_idx_)); - } + if (device_idx_ == -1) { + AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(0)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtGetDevice(&device_idx_)); + } else { + AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(device_idx_)); + } #endif // USE_NPU - } + } - // NOLINTNEXTLINE(modernize-use-equals-default) -~AOTInductorModelBase() { + // NOLINTNEXTLINE(modernize-use-equals-default) + ~AOTInductorModelBase() + { #ifdef USE_CUDA - if (run_finished_) { - auto code = cudaEventDestroy(*run_finished_); - if (code != cudaSuccess) { - std::cerr << "Failed to destroy CUDA event in AOTInductor model: " - << cudaGetErrorString(code) << std::endl; + if (run_finished_) { + auto code = cudaEventDestroy(*run_finished_); + if (code != cudaSuccess) { + std::cerr << "Failed to destroy CUDA event in AOTInductor model: " + << cudaGetErrorString(code) << std::endl; + } } - } #endif // USE_CUDA #ifdef USE_XPU - if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - } + if (run_finished_) { + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + } #endif // USE_XPU #ifdef USE_NPU - if (run_finished_) { - auto code = aclrtDestroyEvent(*run_finished_); - if (code != ACL_SUCCESS) { - std::cerr << "Failed to destroy NPU event in AOTInductor model erorr code: " - << code << std::endl; + if (run_finished_) { + auto code = aclrtDestroyEvent(*run_finished_); + if (code != ACL_SUCCESS) { + std::cerr << "Failed to destroy NPU event in AOTInductor model erorr code: " + << code << std::endl; + } } - } #endif // USE_NPU -} + } -AOTInductorModelBase(AOTInductorModelBase&&) = delete; -AOTInductorModelBase& 
operator=(AOTInductorModelBase&&) = delete; -AOTInductorModelBase(const AOTInductorModelBase&) = delete; -AOTInductorModelBase& operator=(const AOTInductorModelBase&) = delete; - -void run( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor) { + AOTInductorModelBase(AOTInductorModelBase&&) = delete; + AOTInductorModelBase& operator=(AOTInductorModelBase&&) = delete; + AOTInductorModelBase(const AOTInductorModelBase&) = delete; + AOTInductorModelBase& operator=(const AOTInductorModelBase&) = delete; + + void run( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor) + { #ifdef USE_CUDA - if (!run_finished_) { - cudaEvent_t run_finished; - AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); - run_finished_.emplace(run_finished); - } + if (!run_finished_) { + cudaEvent_t run_finished; + AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); + run_finished_.emplace(run_finished); + } #elif defined(USE_XPU) - if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - run_finished_.reset(); - } + if (run_finished_) { + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + run_finished_.reset(); + } #elif defined(USE_NPU) - if (!run_finished_) { - aclrtEvent run_finished; - AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); - run_finished_.emplace(run_finished); - } + if (!run_finished_) { + aclrtEvent run_finished; + AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); + run_finished_.emplace(run_finished); + } #else // !USE_CUDA && !USE_XPU - run_finished_ = false; + run_finished_ = false; #endif - auto* model = static_cast(this); - model->run_impl(input_handles, output_handles, stream, proxy_executor); + auto* model = static_cast(this); + model->run_impl(input_handles, output_handles, stream, proxy_executor); #ifdef USE_CUDA - AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream)); + AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream)); #elif defined(USE_XPU) - run_finished_ = std::make_optional(new sycl::event( - static_cast(stream)->ext_oneapi_submit_barrier())); + run_finished_ = std::make_optional(new sycl::event( + static_cast(stream)->ext_oneapi_submit_barrier())); #elif defined(USE_NPU) - AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); #else // !USE_CUDA && !USE_XPU - run_finished_ = true; + run_finished_ = true; #endif // USE_CUDA - } + } -std::unordered_map run_const_fold( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization = false) { + std::unordered_map run_const_fold( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization = false) + { #ifdef USE_CUDA - if (!run_finished_) { - cudaEvent_t run_finished; - AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); - run_finished_.emplace(run_finished); - } + if (!run_finished_) { + 
cudaEvent_t run_finished; + AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); + run_finished_.emplace(run_finished); + } #elif defined(USE_XPU) - if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - run_finished_.reset(); - } + if (run_finished_) { + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + run_finished_.reset(); + } #elif defined(USE_NPU) - if (!run_finished_) { - aclrtEvent run_finished; - AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); - run_finished_.emplace(run_finished); - } + if (!run_finished_) { + aclrtEvent run_finished; + AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); + run_finished_.emplace(run_finished); + } #else // !USE_CUDA && !USE_XPU - run_finished_ = false; + run_finished_ = false; #endif - auto* model = static_cast(this); - auto folded_constants = - model->const_run_impl(stream, proxy_executor, initialization); + auto* model = static_cast(this); + auto folded_constants = + model->const_run_impl(stream, proxy_executor, initialization); #ifdef USE_CUDA - AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream)); + AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream)); #elif defined(USE_XPU) - run_finished_ = std::make_optional(new sycl::event( - static_cast(stream)->ext_oneapi_submit_barrier())); + run_finished_ = std::make_optional(new sycl::event( + static_cast(stream)->ext_oneapi_submit_barrier())); #elif defined(USE_NPU) - AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); #else // !USE_CUDA && !USE_XPU - run_finished_ = true; + run_finished_ = true; #endif // USE_CUDA - return folded_constants; -} + return folded_constants; + } -void load_constants() { - size_t num_constants = this->num_constants(); - constants_map_->reserve(num_constants); + void load_constants() + { + size_t num_constants = this->num_constants(); + constants_map_->reserve(num_constants); - std::vector constants_internal_offset(num_constants); - if (device_type_ != aoti_torch_device_type_cpu()) { - size_t blob_size = 0; - compute_gpu_constant_blob(blob_size, constants_internal_offset); + std::vector constants_internal_offset(num_constants); + if (device_type_ != aoti_torch_device_type_cpu()) { + size_t blob_size = 0; + compute_gpu_constant_blob(blob_size, constants_internal_offset); #if defined(USE_CUDA) || defined(USE_XPU) - constant_blob_ = RAII_gpuMalloc(blob_size); + constant_blob_ = RAII_gpuMalloc(blob_size); #elif defined(USE_NPU) - constant_blob_ = RAII_npuMalloc(blob_size); + constant_blob_ = RAII_npuMalloc(blob_size); #endif - } - if (!include_weights) { - return; - } + } + if (!include_weights) { + return; + } - size_t bytes_read = 0; - for (size_t i = 0; i < num_constants; i++) { - bool from_folded = this->constant_from_folded(i); + size_t bytes_read = 0; + for (size_t i = 0; i < num_constants; i++) { + bool from_folded = this->constant_from_folded(i); #if not defined(USE_XPU) && not defined(USE_CUDA) && not defined(USE_NPU) - if (from_folded) { - // We do not reallocate and copy for CPU. - continue; - } + if (from_folded) { + // We do not reallocate and copy for CPU. + continue; + } #endif // USE_CUDA - std::string name = this->constant_name(i); - size_t data_size = this->constant_data_size(i); - uint8_t* internal_ptr = (data_size != 0) - ? 
constant_ptr( - constants_internal_offset[i], - bytes_read, - data_size, - from_folded) - : nullptr; - bytes_read += data_size; - - // Create at::Tensor from copied memory. - auto dtype = this->constant_dtype(i); - auto ndim = this->constant_ndim(i); - auto size = this->constant_shape(i); - auto stride = this->constant_stride(i); - auto offset = this->constant_offset(i); - auto layout = this->constant_layout(i); - auto opaque_metadata_ptr = this->opaque_metadata(i); - auto opaque_metadata_size = this->opaque_metadata_size(i); - - AtenTensorHandle tensor_handle = nullptr; + std::string name = this->constant_name(i); + size_t data_size = this->constant_data_size(i); + uint8_t* internal_ptr = (data_size != 0) + ? constant_ptr( + constants_internal_offset[i], + bytes_read, + data_size, + from_folded) + : nullptr; + bytes_read += data_size; + + // Create at::Tensor from copied memory. + auto dtype = this->constant_dtype(i); + auto ndim = this->constant_ndim(i); + auto size = this->constant_shape(i); + auto stride = this->constant_stride(i); + auto offset = this->constant_offset(i); + auto layout = this->constant_layout(i); + auto opaque_metadata_ptr = this->opaque_metadata(i); + auto opaque_metadata_size = this->opaque_metadata_size(i); + + AtenTensorHandle tensor_handle = nullptr; #ifdef AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 - // When opaque_metadata_size is not 0, we need to have the - // aoti_torch_create_tensor_from_blob_npu_v2 available - AOTI_RUNTIME_CHECK( - opaque_metadata_size == 0, - "Expect opaque_metadata_size to be 0 when AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 is defined"); - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( - internal_ptr, - ndim, - size, - stride, - offset, - dtype, - device_type_, - device_idx_, - &tensor_handle)); + // When opaque_metadata_size is not 0, we need to have the + // aoti_torch_create_tensor_from_blob_npu_v2 available + AOTI_RUNTIME_CHECK( + opaque_metadata_size == 0, + "Expect opaque_metadata_size to be 0 when AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 is defined"); + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( + internal_ptr, + ndim, + size, + stride, + offset, + dtype, + device_type_, + device_idx_, + &tensor_handle)); #else - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( - internal_ptr, - ndim, - size, - stride, - offset, - dtype, - device_type_, - device_idx_, - &tensor_handle, - layout, - opaque_metadata_ptr, - opaque_metadata_size)); + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( + internal_ptr, + ndim, + size, + stride, + offset, + dtype, + device_type_, + device_idx_, + &tensor_handle, + layout, + opaque_metadata_ptr, + opaque_metadata_size)); #endif // AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 - constants_map_->emplace(std::move(name), tensor_handle); - } - if (constants_map_) { - this->update_constants_array_from_map(); + constants_map_->emplace(std::move(name), tensor_handle); + } + if (constants_map_) { + this->update_constants_array_from_map(); + } } -} #if defined(USE_CUDA) || defined(USE_XPU) - GPUPtr&& release_constant_blob() { + GPUPtr&& release_constant_blob() + { return std::move(constant_blob_); } #endif #ifdef USE_NPU - NPUPtr&& release_constant_blob() { + NPUPtr&& release_constant_blob() + { return std::move(constant_blob_); } #endif -std::shared_ptr> get_constants_array() { - return constants_; -} + std::shared_ptr> get_constants_array() + { + return constants_; + } -int32_t get_device_idx() const { - return device_idx_; -} + int32_t get_device_idx() 
const + { + return device_idx_; + } -uint8_t* constant_ptr( - size_t constant_offset, - size_t bytes_read, - size_t data_size, - bool skip_copy) { + uint8_t* constant_ptr( + size_t constant_offset, + size_t bytes_read, + size_t data_size, + bool skip_copy) + { #if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) - auto* constants_ptr = static_cast(constant_blob_.get()); - uint8_t* internal_ptr = constants_ptr + constant_offset; - // Copy data to GPU memory - if (!skip_copy) { + auto* constants_ptr = static_cast(constant_blob_.get()); + uint8_t* internal_ptr = constants_ptr + constant_offset; + // Copy data to GPU memory + if (!skip_copy) { #ifdef USE_XPU - sycl::queue* queue_ptr = nullptr; - aoti_torch_get_current_sycl_queue((void**)&queue_ptr); - queue_ptr - ->memcpy(internal_ptr, _get_constants_start() + bytes_read, data_size) - .wait(); + sycl::queue* queue_ptr = nullptr; + aoti_torch_get_current_sycl_queue((void**)&queue_ptr); + queue_ptr + ->memcpy(internal_ptr, _get_constants_start() + bytes_read, data_size) + .wait(); #elif defined(USE_NPU) - AOTI_RUNTIME_DEVICE_CHECK(aclrtMemcpy( - internal_ptr, - data_size, - _get_constants_start() + bytes_read, - data_size, - ACL_MEMCPY_HOST_TO_DEVICE)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtMemcpy( + internal_ptr, + data_size, + _get_constants_start() + bytes_read, + data_size, + ACL_MEMCPY_HOST_TO_DEVICE)); #else - AOTI_RUNTIME_DEVICE_CHECK(cudaMemcpy( - internal_ptr, - _get_constants_start() + bytes_read, - data_size, - cudaMemcpyHostToDevice)); + AOTI_RUNTIME_DEVICE_CHECK(cudaMemcpy( + internal_ptr, + _get_constants_start() + bytes_read, + data_size, + cudaMemcpyHostToDevice)); #endif - } - return internal_ptr; + } + return internal_ptr; #else - // get pointer to constant which is packed in model during compile time. - AOTI_RUNTIME_CHECK(!skip_copy, "pure cpu mode doesn't support skip copy"); - return _get_constants_start() + bytes_read; + // get pointer to constant which is packed in model during compile time. + AOTI_RUNTIME_CHECK(!skip_copy, "pure cpu mode doesn't support skip copy"); + return _get_constants_start() + bytes_read; #endif // USE_CUDA - } + } -void compute_gpu_constant_blob( - size_t& blob_size, - std::vector& constants_internal_offset) { + void compute_gpu_constant_blob( + size_t& blob_size, + std::vector& constants_internal_offset) + { #if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) - size_t num_constants = this->num_constants(); - // Compute required blob size with 64-alignment if on GPU. - blob_size = 0; - for (size_t i = 0; i < num_constants; i++) { - size_t data_size = this->constant_data_size(i); - if (data_size % AOTI_CONST_GPU_ALIGNMENT) { - data_size = AOTI_CONST_GPU_ALIGNMENT + - (data_size / AOTI_CONST_GPU_ALIGNMENT) * AOTI_CONST_GPU_ALIGNMENT; + size_t num_constants = this->num_constants(); + // Compute required blob size with 64-alignment if on GPU. 
+ blob_size = 0; + for (size_t i = 0; i < num_constants; i++) { + size_t data_size = this->constant_data_size(i); + if (data_size % AOTI_CONST_GPU_ALIGNMENT) { + data_size = AOTI_CONST_GPU_ALIGNMENT + + (data_size / AOTI_CONST_GPU_ALIGNMENT) * AOTI_CONST_GPU_ALIGNMENT; + } + constants_internal_offset[i] = blob_size; + blob_size += data_size; } - constants_internal_offset[i] = blob_size; - blob_size += data_size; - } #endif // USE_CUDA -} + } -size_t num_inputs() const { - return inputs_info_.size(); -} + size_t num_inputs() const { return inputs_info_.size(); } -size_t num_outputs() const { - return outputs_info_.size(); -} + size_t num_outputs() const { return outputs_info_.size(); } -size_t num_constants() const { - return constants_info_.size(); -} + size_t num_constants() const { return constants_info_.size(); } -const char* input_name(int64_t idx) const { - return inputs_info_.at(idx).name; -} + const char* input_name(int64_t idx) const { return inputs_info_.at(idx).name; } -const char* output_name(int64_t idx) const { - return outputs_info_.at(idx).name; -} + const char* output_name(int64_t idx) const { return outputs_info_.at(idx).name; } -const char* constant_name(int64_t idx) const { - return constants_info_.at(idx).name; -} + const char* constant_name(int64_t idx) const { return constants_info_.at(idx).name; } -size_t constant_ndim(int64_t idx) { - return constants_info_.at(idx).shape.size(); -} + size_t constant_ndim(int64_t idx) { return constants_info_.at(idx).shape.size(); } -const int64_t* constant_shape(int64_t idx) const { - return constants_info_.at(idx).shape.data(); -} + const int64_t* constant_shape(int64_t idx) const { return constants_info_.at(idx).shape.data(); } -const int64_t* constant_stride(int64_t idx) const { - return constants_info_.at(idx).stride.data(); -} + const int64_t* constant_stride(int64_t idx) const { return constants_info_.at(idx).stride.data(); } -int32_t constant_dtype(int64_t idx) const { - return constants_info_.at(idx).dtype; -} + int32_t constant_dtype(int64_t idx) const { return constants_info_.at(idx).dtype; } -int32_t constant_layout(int64_t idx) const { - return constants_info_.at(idx).layout; -} + int32_t constant_layout(int64_t idx) const { return constants_info_.at(idx).layout; } -size_t constant_offset(int64_t idx) const { - return constants_info_.at(idx).offset; -} + size_t constant_offset(int64_t idx) const { return constants_info_.at(idx).offset; } -size_t constant_data_size(int64_t idx) const { - return constants_info_.at(idx).data_size; -} + size_t constant_data_size(int64_t idx) const { return constants_info_.at(idx).data_size; } -const char* constant_original_fqn(int64_t idx) const { - return constants_info_.at(idx).original_fqn; -} + const char* constant_original_fqn(int64_t idx) const { return constants_info_.at(idx).original_fqn; } -const uint8_t* opaque_metadata(int64_t idx) const { - return constants_info_.at(idx).opaque_metadata.data(); -} + const uint8_t* opaque_metadata(int64_t idx) const { return constants_info_.at(idx).opaque_metadata.data(); } -size_t opaque_metadata_size(int64_t idx) { - return constants_info_.at(idx).opaque_metadata.size(); -} + size_t opaque_metadata_size(int64_t idx) { return constants_info_.at(idx).opaque_metadata.size(); } -bool constant_from_folded(int64_t idx) const { - return constants_info_.at(idx).from_folded; -} + bool constant_from_folded(int64_t idx) const { return constants_info_.at(idx).from_folded; } -int32_t constant_type(int64_t idx) const { - return constants_info_.at(idx).type; 
-} + int32_t constant_type(int64_t idx) const { return constants_info_.at(idx).type; } -const char* get_in_spec() const { - return in_spec_.c_str(); -} + const char* get_in_spec() const { return in_spec_.c_str(); } -const char* get_out_spec() const { - return out_spec_.c_str(); -} + const char* get_out_spec() const { return out_spec_.c_str(); } -void update_constants_array_from_map() { - if (!constants_map_) { - throw std::runtime_error{ - "constants_map_ was not ready when constants_ is trying to be constructed from it!"}; - } - if (!constants_) { - constants_ = - std::make_shared>(constants_info_.size()); - } else { - constants_->resize(constants_info_.size()); - } - int idx = 0; - for (const auto& info : constants_info_) { - const auto it = constants_map_->find(info.name); - if (it != constants_map_->end()) { - constants_->at(idx) = ConstantHandle(it->second); + void update_constants_array_from_map() + { + if (!constants_map_) { + throw std::runtime_error{ + "constants_map_ was not ready when constants_ is trying to be constructed from it!"}; + } + if (!constants_) { + constants_ = + std::make_shared>(constants_info_.size()); + } else { + constants_->resize(constants_info_.size()); + } + int idx = 0; + for (const auto& info : constants_info_) { + const auto it = constants_map_->find(info.name); + if (it != constants_map_->end()) { + constants_->at(idx) = ConstantHandle(it->second); + } + idx++; } - idx++; } -} -void update_constants_map( - std::shared_ptr constants_map, - bool remap_constants_array = true) { + void update_constants_map( + std::shared_ptr constants_map, + bool remap_constants_array = true) + { constants_map_ = std::move(constants_map); if (remap_constants_array) { update_constants_array_from_map(); } -} + } - // This function allows us to update the constants_ that is used to look up - // the corresponding constant tensor during runtime. -void update_constants_array( - std::shared_ptr> constants_array) { + // This function allows us to update the constants_ that is used to look up + // the corresponding constant tensor during runtime. + void update_constants_array( + std::shared_ptr> constants_array) + { constants_ = std::move(constants_array); -} + } - /// Returns true if the model is complete. -bool is_finished() { + /// Returns true if the model is complete. + bool is_finished() + { #ifdef USE_CUDA - if (!run_finished_) { - throw std::runtime_error{"Model CUDA event was not initialized"}; - } + if (!run_finished_) { + throw std::runtime_error{"Model CUDA event was not initialized"}; + } - auto event_status = cudaEventQuery(*run_finished_); - if (event_status == cudaSuccess) { - return true; - } else if (event_status == cudaErrorNotReady) { - return false; - } + auto event_status = cudaEventQuery(*run_finished_); + if (event_status == cudaSuccess) { + return true; + } else if (event_status == cudaErrorNotReady) { + return false; + } - throw std::runtime_error( - std::string("The model did not finish successfully. Error: ") + - cudaGetErrorString(cudaGetLastError())); + throw std::runtime_error( + std::string("The model did not finish successfully. 
Error: ") + + cudaGetErrorString(cudaGetLastError())); #elif defined(USE_NPU) if (!run_finished_) { @@ -618,118 +595,120 @@ bool is_finished() { } aclrtEventRecordedStatus recordStatus = ACL_EVENT_RECORDED_STATUS_NOT_READY; AOTI_RUNTIME_DEVICE_CHECK(aclrtQueryEventStatus(*run_finished_, &recordStatus)); - + if (recordStatus == ACL_EVENT_RECORDED_STATUS_COMPLETE) { return true; } else { return false; } #elif defined(USE_XPU) - if (!run_finished_) { - throw std::runtime_error{"Model XPU event was not initialized"}; - } - using namespace sycl::info; - return (*run_finished_)->get_info() == - event_command_status::complete; + if (!run_finished_) { + throw std::runtime_error{"Model XPU event was not initialized"}; + } + using namespace sycl::info; + return (*run_finished_)->get_info() == + event_command_status::complete; #else // !USE_CUDA && !USE_XPU - return run_finished_; + return run_finished_; #endif // USE_CUDA -} + } - /// Synchronizes completion event. -void wait_for_completion() { + /// Synchronizes completion event. + void wait_for_completion() + { #ifdef USE_CUDA - if (!run_finished_) { - throw std::runtime_error{"Model event was not initialized"}; - } + if (!run_finished_) { + throw std::runtime_error{"Model event was not initialized"}; + } - AOTI_RUNTIME_DEVICE_CHECK(cudaEventSynchronize(*run_finished_)); + AOTI_RUNTIME_DEVICE_CHECK(cudaEventSynchronize(*run_finished_)); #endif // USE_CUDA #ifdef USE_XPU - if (!run_finished_) { - throw std::runtime_error{"Model event was not initialized"}; - } - (*run_finished_)->wait_and_throw(); + if (!run_finished_) { + throw std::runtime_error{"Model event was not initialized"}; + } + (*run_finished_)->wait_and_throw(); #endif -} + } - protected: - uint8_t* _get_constants_start() { +protected: + uint8_t* _get_constants_start() + { #ifndef USE_MMAP_SELF - // NOLINTNEXTLINE(*const-cast*) - return const_cast(_binary_constants_bin_start); + // NOLINTNEXTLINE(*const-cast*) + return const_cast(_binary_constants_bin_start); #else - if (self_mmap) { + if (self_mmap) { + return self_mmap; + } + Dl_info dl_info; + // get pointer to constant which are appended to the binary + AOTI_RUNTIME_CHECK( + dladdr(__func__, &dl_info), "Can't find shared library name"); + int fd = open(dl_info.dli_fname, O_RDONLY); + AOTI_RUNTIME_CHECK(fd >= 0, "Shared library file cannot be opened"); + auto fsize = lseek(fd, 0, SEEK_END); + auto weights_size = + reinterpret_cast(_binary_constants_bin_start)[0]; + auto magic_number = + reinterpret_cast(_binary_constants_bin_start)[1]; + auto weights_offset = fsize - weights_size; + AOTI_RUNTIME_CHECK( + (weights_offset & 0x3fff) == 0, + "weights_offset must be aligned to 16K boundary"); + auto ptr = mmap( + NULL, + weights_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + fd, + weights_offset); + close(fd); + AOTI_RUNTIME_CHECK(ptr != MAP_FAILED, "mmap() failed"); + self_mmap = static_cast(ptr); + AOTI_RUNTIME_CHECK( + reinterpret_cast( + self_mmap + weights_size - sizeof(uint64_t))[0] == magic_number, + "Weigths data seems corrupt"); return self_mmap; - } - Dl_info dl_info; - // get pointer to constant which are appended to the binary - AOTI_RUNTIME_CHECK( - dladdr(__func__, &dl_info), "Can't find shared library name"); - int fd = open(dl_info.dli_fname, O_RDONLY); - AOTI_RUNTIME_CHECK(fd >= 0, "Shared library file cannot be opened"); - auto fsize = lseek(fd, 0, SEEK_END); - auto weights_size = - reinterpret_cast(_binary_constants_bin_start)[0]; - auto magic_number = - reinterpret_cast(_binary_constants_bin_start)[1]; - auto 
weights_offset = fsize - weights_size; - AOTI_RUNTIME_CHECK( - (weights_offset & 0x3fff) == 0, - "weights_offset must be aligned to 16K boundary"); - auto ptr = mmap( - NULL, - weights_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, - fd, - weights_offset); - close(fd); - AOTI_RUNTIME_CHECK(ptr != MAP_FAILED, "mmap() failed"); - self_mmap = static_cast(ptr); - AOTI_RUNTIME_CHECK( - reinterpret_cast( - self_mmap + weights_size - sizeof(uint64_t))[0] == magic_number, - "Weigths data seems corrupt"); - return self_mmap; #endif - } -struct ParamInfo { - const char* name = nullptr; -}; - -struct ConstInfo { - const char* name = nullptr; - std::vector shape; - std::vector stride; - int32_t dtype{}; - int64_t offset{}; - size_t data_size{}; - int32_t layout{}; - std::vector opaque_metadata; - int64_t opaque_metadata_size{}; - const char* original_fqn = nullptr; - bool from_folded{}; - int32_t type{}; -}; - -std::vector inputs_info_; -std::vector outputs_info_; -std::vector constants_info_; -std::string in_spec_; -std::string out_spec_; - -std::shared_ptr constants_map_; -std::shared_ptr> constants_; + } + struct ParamInfo { + const char* name = nullptr; + }; + + struct ConstInfo { + const char* name = nullptr; + std::vector shape; + std::vector stride; + int32_t dtype{}; + int64_t offset{}; + size_t data_size{}; + int32_t layout{}; + std::vector opaque_metadata; + int64_t opaque_metadata_size{}; + const char* original_fqn = nullptr; + bool from_folded{}; + int32_t type{}; + }; + + std::vector inputs_info_; + std::vector outputs_info_; + std::vector constants_info_; + std::string in_spec_; + std::string out_spec_; + + std::shared_ptr constants_map_; + std::shared_ptr> constants_; #if defined(USE_CUDA) || defined(USE_XPU) - // Holds the blob storage for constants' at::Tensor for CUDA. + // Holds the blob storage for constants' at::Tensor for CUDA. GPUPtr constant_blob_; #endif // USE_CUDA #ifdef USE_NPU - // Holds the blob storage for constants' at::Tensor for CUDA. + // Holds the blob storage for constants' at::Tensor for CUDA. NPUPtr constant_blob_; #endif // USE_NPU @@ -737,16 +716,16 @@ std::shared_ptr> constants_; uint8_t* self_mmap = NULL; #endif - // A directory with CUDA binary files, e.g. compiled kernels, etc. - const std::optional cubin_dir_; + // A directory with CUDA binary files, e.g. compiled kernels, etc. + const std::optional cubin_dir_; - // This is the flag that implies whether the weight is included in the model. - // If True, we would prepare the weight when loading the model, otherwise the - // model will be loaded without weights, and need to be provided by the user. - bool include_weights; + // This is the flag that implies whether the weight is included in the model. + // If True, we would prepare the weight when loading the model, otherwise the + // model will be loaded without weights, and need to be provided by the user. + bool include_weights; - // Record if the model finishes an inference run so that its owning - // AOTModelContainer can re-use this instance. + // Record if the model finishes an inference run so that its owning + // AOTModelContainer can re-use this instance. #ifdef USE_CUDA std::optional run_finished_; #elif defined(USE_XPU) @@ -757,67 +736,69 @@ std::shared_ptr> constants_; bool run_finished_{}; #endif - // Generated model uses this device index to create CUDA guards. + // Generated model uses this device index to create CUDA guards. 
int32_t device_type_{}; int32_t device_idx_{}; }; // Codegen-ed classes can derive from this to keep pointers to loaded kernels. class AOTInductorModelKernelsBase { - public: - virtual ~AOTInductorModelKernelsBase() = default; +public: + virtual ~AOTInductorModelKernelsBase() = default; }; class AOTInductorModel : public AOTInductorModelBase { - public: - AOTInductorModel( - std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir, - bool include_weights = true); - - std::unordered_map const_run_impl( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization = false); - - void _const_run_impl( - std::vector& output_handles, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - void run_impl( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; - //handles are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; - //handles will be stolen by the caller; - //the array itself is borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - template - Outputs run_impl_minimal_arrayref_interface( - const Inputs& inputs, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - static std::unique_ptr Create( - std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) { - return std::make_unique( - std::move(constants_map), - std::move(constants_array), - device_str, - std::move(cubin_dir)); - } - - private: - std::unique_ptr kernels_; +public: + AOTInductorModel( + std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir, + bool include_weights = true); + + std::unordered_map const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization = false); + + void _const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + void run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + template + Outputs run_impl_minimal_arrayref_interface( + const Inputs& inputs, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + static std::unique_ptr Create( + std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir) + { + return std::make_unique( + std::move(constants_map), + std::move(constants_array), + device_str, + std::move(cubin_dir)); + } + +private: + std::unique_ptr kernels_; }; } // namespace torch::aot_inductor +``` \ No newline at end of file -- Gitee From 3bc27d7394c33785648a48fc2a08d458949cd447 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 07:21:29 +0000 Subject: [PATCH 03/11] oss_proxy Signed-off-by: zjx.com --- .../aoti_torch/oss_proxy_executor_npu.cpp | 1134 +++++++++-------- 1 file changed, 570 insertions(+), 564 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp b/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp index db916d1d251..c11ef93bfda 100644 --- 
a/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp +++ b/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp @@ -5,8 +5,9 @@ #include namespace { -at::Tensor* tensor_handle_to_tensor_pointer(AtenTensorHandle handle) { - return reinterpret_cast(handle); +at::Tensor* tensor_handle_to_tensor_pointer(AtenTensorHandle handle) +{ + return reinterpret_cast(handle); } } // namespace @@ -16,477 +17,481 @@ void OSSProxyExecutorNpu::prefill_stack_with_static_arguments( size_t index, const at::TypePtr& schema_arg_type, const nlohmann::json& serialized_arg, - OSSOpKernel& op_kernel) { - auto& stack = op_kernel.stack_; - auto& dynamic_args = op_kernel.dynamic_args_; - - TORCH_CHECK(serialized_arg.size() == 1); - std::string serialized_arg_type = serialized_arg.begin().key(); - auto& serialized_arg_val = serialized_arg.begin().value(); - - switch (schema_arg_type->kind()) { - case c10::TypeKind::TensorType: { - TORCH_CHECK( - serialized_arg_type == "as_tensor", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_tensor for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::TensorType, 1); - break; - } - case c10::TypeKind::IntType: { - TORCH_CHECK( - serialized_arg_type == "as_int", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_int for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - break; - } - case c10::TypeKind::SymIntType: { - TORCH_CHECK( - serialized_arg_type == "as_int" || - serialized_arg_type == "as_sym_int", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_int or as_sym_int for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - break; - } - case c10::TypeKind::FloatType: { - TORCH_CHECK( - serialized_arg_type == "as_float", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_float for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; - } - case c10::TypeKind::BoolType: { - TORCH_CHECK( - serialized_arg_type == "as_bool", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_bool for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; - } - case c10::TypeKind::NumberType: { - if (serialized_arg_type == "as_int") { - // Only int Scalar is treated as dynamic arg for now - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - } else if (serialized_arg_type == "as_float") { - stack.at(index) = serialized_arg_val.get(); - } else if (serialized_arg_type == "as_bool") { - stack.at(index) = serialized_arg_val.get(); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a scalar input for argument ", - index, - " but got ", - serialized_arg_type); - } - break; - } - case c10::TypeKind::StringType: { - TORCH_CHECK( - serialized_arg_type == "as_string", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_string for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; - } - case c10::TypeKind::DeviceObjType: { - TORCH_CHECK( - serialized_arg_type == "as_device", - "Expected extern kernel ", - 
op_kernel.target_, - " to have serialized argument type as_device for argument ", - index, - " but got ", - serialized_arg_type); - - std::string device_string = serialized_arg_val["type"].get(); - if (serialized_arg_val.contains("index") && - serialized_arg_val["index"].is_number()) { - device_string += ":" + serialized_arg_val["index"].get(); - } - - c10::Device device(device_string); - - if (device != *device_) { - VLOG(1) << "ProxyExecutor is using " << *device_ << " for " - << op_kernel.target_ << " argument #" << index - << ", which is different from the one serialized in thrift: " - << device << ". Please ensure this is intentional."; - } - - stack.at(index) = *device_; - break; - } - case c10::TypeKind::ListType: { - if (schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { - TORCH_CHECK( - serialized_arg_type == "as_tensors", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_tensors for argument ", - index, - " but got ", - serialized_arg_type); - TORCH_CHECK(serialized_arg_type == "as_tensors"); - dynamic_args.emplace_back( - index, DynamicArgType::ListTensorType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofInts())) { - TORCH_CHECK( - serialized_arg_type == "as_ints", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_ints for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { - TORCH_CHECK( - serialized_arg_type == "as_ints" || - serialized_arg_type == "as_sym_ints", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_ints or as_sym_ints for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofFloats())) { - TORCH_CHECK( - serialized_arg_type == "as_floats", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_floats for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); + OSSOpKernel& op_kernel) +{ + auto& stack = op_kernel.stack_; + auto& dynamic_args = op_kernel.dynamic_args_; + + TORCH_CHECK(serialized_arg.size() == 1); + std::string serialized_arg_type = serialized_arg.begin().key(); + auto& serialized_arg_val = serialized_arg.begin().value(); + + switch (schema_arg_type->kind()) { + case c10::TypeKind::TensorType: { + TORCH_CHECK( + serialized_arg_type == "as_tensor", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_tensor for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::TensorType, 1); + break; } - stack.at(index) = std::move(ret); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofBools())) { - TORCH_CHECK( - serialized_arg_type == "as_bools", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_bools for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); + case c10::TypeKind::IntType: { + TORCH_CHECK( + serialized_arg_type == "as_int", + "Expected extern kernel ", + op_kernel.target_, + " to 
have serialized argument type as_int for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + break; } - stack.at(index) = std::move(ret); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofNumbers())) { - if (serialized_arg_type == "as_ints") { - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (serialized_arg_type == "as_floats") { - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg); - } - stack.at(index) = std::move(ret); - } else if (serialized_arg_type == "as_bools") { - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg); - } - stack.at(index) = std::move(ret); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a List[Scalar] input for argument ", - index, - " but got ", - serialized_arg_type); + case c10::TypeKind::SymIntType: { + TORCH_CHECK( + serialized_arg_type == "as_int" || + serialized_arg_type == "as_sym_int", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_int or as_sym_int for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + break; } - } else if (schema_arg_type->isSubtypeOf( - at::ListType::ofOptionalTensors())) { - if (serialized_arg_type == "as_optional_tensors") { - std::vector list_item_types; - for (const auto& arg : serialized_arg_val) { - list_item_types.push_back(arg.begin().key()); - } - dynamic_args.emplace_back( - index, - DynamicArgType::ListOptionalTensorType, - serialized_arg_val.size(), - list_item_types); - } else if (serialized_arg_type == "as_tensors") { - dynamic_args.emplace_back( - index, DynamicArgType::ListTensorType, serialized_arg_val.size()); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a Tensor?[] input for argument ", - index, - " but got ", - serialized_arg_type); + case c10::TypeKind::FloatType: { + TORCH_CHECK( + serialized_arg_type == "as_float", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_float for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; } - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofStrings())) { - TORCH_CHECK( - serialized_arg_type == "as_strings", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_strings for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); + case c10::TypeKind::BoolType: { + TORCH_CHECK( + serialized_arg_type == "as_bool", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_bool for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; } - stack.at(index) = std::move(ret); - } else { - TORCH_CHECK( - false, - "NYI: Unsupported list type ", - serialized_arg_type, - " for extern kernel ", - op_kernel.target_, - " argument ", - index); - } - break; - } - case c10::TypeKind::OptionalType: { - auto inner_type = - schema_arg_type->castRaw()->getElementType(); - - if (serialized_arg_type == "as_none") { - stack.at(index) = c10::IValue{}; - if (inner_type->kind() == c10::TypeKind::TensorType) { - // Tensor is None - 
dynamic_args.emplace_back(index, DynamicArgType::TensorType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::IntType || - inner_type->kind() == c10::TypeKind::SymIntType) { - // Int or SymInt is None - dynamic_args.emplace_back(index, DynamicArgType::IntType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::ListType && - schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { - // List[Tensor] is None - dynamic_args.emplace_back(index, DynamicArgType::ListTensorType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::ListType && - schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { - // List[SymInt] is None - dynamic_args.emplace_back(index, DynamicArgType::ListIntType, 0); + case c10::TypeKind::NumberType: { + if (serialized_arg_type == "as_int") { + // Only int Scalar is treated as dynamic arg for now + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + } else if (serialized_arg_type == "as_float") { + stack.at(index) = serialized_arg_val.get(); + } else if (serialized_arg_type == "as_bool") { + stack.at(index) = serialized_arg_val.get(); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a scalar input for argument ", + index, + " but got ", + serialized_arg_type); + } + break; + } + case c10::TypeKind::StringType: { + TORCH_CHECK( + serialized_arg_type == "as_string", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_string for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; + } + case c10::TypeKind::DeviceObjType: { + TORCH_CHECK( + serialized_arg_type == "as_device", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_device for argument ", + index, + " but got ", + serialized_arg_type); + + std::string device_string = serialized_arg_val["type"].get(); + if (serialized_arg_val.contains("index") && + serialized_arg_val["index"].is_number()) { + device_string += ":" + serialized_arg_val["index"].get(); + } + + c10::Device device(device_string); + + if (device != *device_) { + VLOG(1) << "ProxyExecutor is using " << *device_ << " for " + << op_kernel.target_ << " argument #" << index + << ", which is different from the one serialized in thrift: " + << device << ". 
Please ensure this is intentional."; + } + + stack.at(index) = *device_; + break; + } + case c10::TypeKind::ListType: { + if (schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { + TORCH_CHECK( + serialized_arg_type == "as_tensors", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_tensors for argument ", + index, + " but got ", + serialized_arg_type); + TORCH_CHECK(serialized_arg_type == "as_tensors"); + dynamic_args.emplace_back( + index, DynamicArgType::ListTensorType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofInts())) { + TORCH_CHECK( + serialized_arg_type == "as_ints", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_ints for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { + TORCH_CHECK( + serialized_arg_type == "as_ints" || + serialized_arg_type == "as_sym_ints", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_ints or as_sym_ints for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofFloats())) { + TORCH_CHECK( + serialized_arg_type == "as_floats", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_floats for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); + } + stack.at(index) = std::move(ret); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofBools())) { + TORCH_CHECK( + serialized_arg_type == "as_bools", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_bools for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); + } + stack.at(index) = std::move(ret); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofNumbers())) { + if (serialized_arg_type == "as_ints") { + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (serialized_arg_type == "as_floats") { + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg); + } + stack.at(index) = std::move(ret); + } else if (serialized_arg_type == "as_bools") { + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg); + } + stack.at(index) = std::move(ret); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a List[Scalar] input for argument ", + index, + " but got ", + serialized_arg_type); + } + } else if (schema_arg_type->isSubtypeOf( + at::ListType::ofOptionalTensors())) { + if (serialized_arg_type == "as_optional_tensors") { + std::vector list_item_types; + for (const auto& arg : serialized_arg_val) { + list_item_types.push_back(arg.begin().key()); + } + dynamic_args.emplace_back( + index, + DynamicArgType::ListOptionalTensorType, + serialized_arg_val.size(), + list_item_types); + } else if (serialized_arg_type == "as_tensors") { + dynamic_args.emplace_back( + index, DynamicArgType::ListTensorType, serialized_arg_val.size()); + } else { + TORCH_CHECK( + false, + "Expected extern kernel 
", + op_kernel.target_, + " to have a Tensor?[] input for argument ", + index, + " but got ", + serialized_arg_type); + } + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofStrings())) { + TORCH_CHECK( + serialized_arg_type == "as_strings", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_strings for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); + } + stack.at(index) = std::move(ret); + } else { + TORCH_CHECK( + false, + "NYI: Unsupported list type ", + serialized_arg_type, + " for extern kernel ", + op_kernel.target_, + " argument ", + index); + } + break; + } + case c10::TypeKind::OptionalType: { + auto inner_type = + schema_arg_type->castRaw()->getElementType(); + + if (serialized_arg_type == "as_none") { + stack.at(index) = c10::IValue{}; + if (inner_type->kind() == c10::TypeKind::TensorType) { + // Tensor is None + dynamic_args.emplace_back(index, DynamicArgType::TensorType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::IntType || + inner_type->kind() == c10::TypeKind::SymIntType) { + // Int or SymInt is None + dynamic_args.emplace_back(index, DynamicArgType::IntType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::ListType && + schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { + // List[Tensor] is None + dynamic_args.emplace_back(index, DynamicArgType::ListTensorType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::ListType && + schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { + // List[SymInt] is None + dynamic_args.emplace_back(index, DynamicArgType::ListIntType, 0); + } + } else { + prefill_stack_with_static_arguments( + index, inner_type, serialized_arg, op_kernel); + } + break; } - } else { - prefill_stack_with_static_arguments( - index, inner_type, serialized_arg, op_kernel); - } - break; + default: + TORCH_CHECK( + false, + "Unsupported input type ", + serialized_arg_type, + " for extern kernel ", + op_kernel.target_, + " argument ", + index); } - default: - TORCH_CHECK( - false, - "Unsupported input type ", - serialized_arg_type, - " for extern kernel ", - op_kernel.target_, - " argument ", - index); - } } // Populates op_kernel.stack_, op_kernel.dynamic_args_ void OSSProxyExecutorNpu::get_input_info_from_serialized( const std::vector& schema_args, const nlohmann::json& serialized_node, - OSSOpKernel& op_kernel) { - std::vector filled(schema_args.size(), false); - TORCH_CHECK(op_kernel.stack_.size() == 0); - op_kernel.stack_.resize(schema_args.size()); - for (const auto& named_argument : serialized_node["inputs"]) { - const auto& arg = named_argument["arg"]; - const auto& name = named_argument["name"].get(); - - // Doing a linear lookup in the schema to find the index - // of a static argument. Should be fine performance wise - // because we usually only have small amount of arguments. - for (size_t index = 0; index < schema_args.size(); index++) { - auto& schema_arg = schema_args[index]; - if (schema_arg.name() == name) { - prefill_stack_with_static_arguments( - index, schema_arg.real_type(), arg, op_kernel); - filled[index] = true; - break; - } - } - } - - // If an argument is not filled and has a default value, we should - // also prefill the default value. 
- for (size_t index = 0; index < schema_args.size(); index++) { - if (!filled[index] && schema_args[index].default_value()) { - auto default_value = *schema_args[index].default_value(); - op_kernel.stack_.at(index) = default_value; - } - } + OSSOpKernel& op_kernel) +{ + std::vector filled(schema_args.size(), false); + TORCH_CHECK(op_kernel.stack_.size() == 0); + op_kernel.stack_.resize(schema_args.size()); + for (const auto& named_argument : serialized_node["inputs"]) { + const auto& arg = named_argument["arg"]; + const auto& name = named_argument["name"].get(); + + // Doing a linear lookup in the schema to find the index + // of a static argument. Should be fine performance wise + // because we usually only have small amount of arguments. + for (size_t index = 0; index < schema_args.size(); index++) { + auto& schema_arg = schema_args[index]; + if (schema_arg.name() == name) { + prefill_stack_with_static_arguments( + index, schema_arg.real_type(), arg, op_kernel); + filled[index] = true; + break; + } + } + } + + // If an argument is not filled and has a default value, we should + // also prefill the default value. + for (size_t index = 0; index < schema_args.size(); index++) { + if (!filled[index] && schema_args[index].default_value()) { + auto default_value = *schema_args[index].default_value(); + op_kernel.stack_.at(index) = default_value; + } + } } // Populates op_kernel.outputs_ void OSSProxyExecutorNpu::get_output_info_from_serialized( const std::vector& schema_returns, const nlohmann::json& serialized_node, - OSSOpKernel& op_kernel) { - std::vector& outputs = op_kernel.outputs_; - - TORCH_CHECK( - schema_returns.size() == serialized_node["outputs"].size(), - "Serialized node doesn't match operator ", - serialized_node["target"], - "'s schema outputs."); - - size_t output_index = 0; - for (const auto& serialized_output : serialized_node["outputs"]) { - TORCH_CHECK(serialized_output.size() == 1); - std::string serialized_output_type = serialized_output.begin().key(); - auto& serialized_output_val = serialized_output.begin().value(); - - auto& schema_return = schema_returns[output_index]; - const at::TypePtr& schema_return_type = schema_return.real_type(); - - switch (schema_return_type->kind()) { - case c10::TypeKind::TensorType: { - TORCH_CHECK( - serialized_output_type == "as_tensor", - "Expected extern kernel ", - serialized_node["target"], - " to have serialized output type as_tensor, ", - " but got ", - serialized_output_type); - outputs.emplace_back(output_index, DynamicArgType::TensorType, 1); - break; - } - case c10::TypeKind::ListType: { - if (schema_return_type->isSubtypeOf(at::ListType::ofTensors())) { - TORCH_CHECK( - serialized_output_type == "as_tensors", - "Expected extern kernel ", - serialized_node["target"], - " to have serialized output type as_tensors, ", - " but got ", - serialized_output_type); - outputs.emplace_back( - output_index, - DynamicArgType::ListTensorType, - serialized_output_val.size()); - } else { - TORCH_CHECK( - false, - "Unsupported return list type ", - schema_return_type->repr_str()); + OSSOpKernel& op_kernel) +{ + std::vector& outputs = op_kernel.outputs_; + + TORCH_CHECK( + schema_returns.size() == serialized_node["outputs"].size(), + "Serialized node doesn't match operator ", + serialized_node["target"], + "'s schema outputs."); + + size_t output_index = 0; + for (const auto& serialized_output : serialized_node["outputs"]) { + TORCH_CHECK(serialized_output.size() == 1); + std::string serialized_output_type = serialized_output.begin().key(); + 
auto& serialized_output_val = serialized_output.begin().value(); + + auto& schema_return = schema_returns[output_index]; + const at::TypePtr& schema_return_type = schema_return.real_type(); + + switch (schema_return_type->kind()) { + case c10::TypeKind::TensorType: { + TORCH_CHECK( + serialized_output_type == "as_tensor", + "Expected extern kernel ", + serialized_node["target"], + " to have serialized output type as_tensor, ", + " but got ", + serialized_output_type); + outputs.emplace_back(output_index, DynamicArgType::TensorType, 1); + break; + } + case c10::TypeKind::ListType: { + if (schema_return_type->isSubtypeOf(at::ListType::ofTensors())) { + TORCH_CHECK( + serialized_output_type == "as_tensors", + "Expected extern kernel ", + serialized_node["target"], + " to have serialized output type as_tensors, ", + " but got ", + serialized_output_type); + outputs.emplace_back( + output_index, + DynamicArgType::ListTensorType, + serialized_output_val.size()); + } else { + TORCH_CHECK( + false, + "Unsupported return list type ", + schema_return_type->repr_str()); + } + break; + } + default: { + TORCH_CHECK( + false, + "Unsupported return type ", + schema_return_type->repr_str(), + " for extern kernel ", + op_kernel.target_); + } } - break; - } - default: { - TORCH_CHECK( - false, - "Unsupported return type ", - schema_return_type->repr_str(), - " for extern kernel ", - op_kernel.target_); - } - } - output_index++; - } + output_index++; + } } -OSSProxyExecutorNpu::OSSProxyExecutorNpu(const std::string& json_path, bool is_cpu) { - if (is_cpu) { - device_ = std::make_unique(c10::DeviceType::CPU); - } else { - int device_idx = -1; - device_ = std::make_unique(c10::DeviceType::CUDA, device_idx); - } +OSSProxyExecutorNpu::OSSProxyExecutorNpu(const std::string& json_path, bool is_cpu) +{ + if (is_cpu) { + device_ = std::make_unique(c10::DeviceType::CPU); + } else { + int device_idx = -1; + device_ = std::make_unique(c10::DeviceType::CUDA, device_idx); + } - std::string extern_kernel_nodes_serialized; + std::string extern_kernel_nodes_serialized; - std::ifstream json_file(json_path); - TORCH_CHECK(json_file.is_open(), "Unable to open file ", json_path); + std::ifstream json_file(json_path); + TORCH_CHECK(json_file.is_open(), "Unable to open file ", json_path); - // Parse file into a json object - nlohmann::json json_obj; - json_file >> json_obj; + // Parse file into a json object + nlohmann::json json_obj; + json_file >> json_obj; - // Access data - for (auto const& serialized_extern_node : json_obj["nodes"]) { - auto const& serialized_node = serialized_extern_node["node"]; + // Access data + for (auto const& serialized_extern_node : json_obj["nodes"]) { + auto const& serialized_node = serialized_extern_node["node"]; - const std::string& target = serialized_node["target"]; + const std::string& target = serialized_node["target"]; - std::string opName; - std::string overloadName; - size_t pos = target.find('.'); - if (pos == std::string::npos) { - opName = target; - overloadName = ""; - } else { - // There should be no more periods - size_t pos2 = target.find('.', pos + 1); - TORCH_CHECK(pos2 == std::string::npos); + std::string opName; + std::string overloadName; + size_t pos = target.find('.'); + if (pos == std::string::npos) { + opName = target; + overloadName = ""; + } else { + // There should be no more periods + size_t pos2 = target.find('.', pos + 1); + TORCH_CHECK(pos2 == std::string::npos); - opName = target.substr(0, pos); - overloadName = target.substr(pos + 1, target.length() - pos); - } + 
opName = target.substr(0, pos); + overloadName = target.substr(pos + 1, target.length() - pos); + } - c10::OperatorHandle op_handle = - c10::Dispatcher::singleton().findSchemaOrThrow( - opName.c_str(), overloadName.c_str()); - const c10::FunctionSchema& schema = op_handle.schema(); + c10::OperatorHandle op_handle = + c10::Dispatcher::singleton().findSchemaOrThrow( + opName.c_str(), overloadName.c_str()); + const c10::FunctionSchema& schema = op_handle.schema(); - const auto& schema_args = schema.arguments(); - const auto& schema_returns = schema.returns(); + const auto& schema_args = schema.arguments(); + const auto& schema_returns = schema.returns(); - OSSOpKernel op_kernel(target, op_handle); - get_input_info_from_serialized(schema_args, serialized_node, op_kernel); - get_output_info_from_serialized(schema_returns, serialized_node, op_kernel); + OSSOpKernel op_kernel(target, op_handle); + get_input_info_from_serialized(schema_args, serialized_node, op_kernel); + get_output_info_from_serialized(schema_returns, serialized_node, op_kernel); - op_kernels_.emplace_back(std::move(op_kernel)); - } + op_kernels_.emplace_back(std::move(op_kernel)); + } } void OSSProxyExecutorNpu::call_function( @@ -494,134 +499,135 @@ void OSSProxyExecutorNpu::call_function( int num_ints, int64_t* flatten_int_args, int num_tensors, - AtenTensorHandle* flatten_tensor_args) { - TORCH_CHECK( - extern_node_index < static_cast(op_kernels_.size()), - "Invalid extern node index"); - OSSOpKernel& op_kernel = op_kernels_[extern_node_index]; - - std::vector stack = op_kernel.stack_; - auto& dynamic_args = op_kernel.dynamic_args_; - - int tensor_id = 0; - int int_id = 0; - for (auto& dynamic_arg : dynamic_args) { - int arg_index = dynamic_arg.arg_index; - DynamicArgType dynamic_arg_type = dynamic_arg.arg_type; - int length = dynamic_arg.length; - - if (length == 0) { - continue; - } - - switch (dynamic_arg_type) { - case DynamicArgType::TensorType: { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - stack[arg_index] = *tensor; - break; - } - case DynamicArgType::IntType: { - int64_t val = flatten_int_args[int_id++]; - stack[arg_index] = val; - break; - } - case DynamicArgType::ListTensorType: { - std::vector tensor_list; - for (int j = 0; j < length; j++) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - tensor_list.push_back(*tensor); + AtenTensorHandle* flatten_tensor_args) +{ + TORCH_CHECK( + extern_node_index < static_cast(op_kernels_.size()), + "Invalid extern node index"); + OSSOpKernel& op_kernel = op_kernels_[extern_node_index]; + + std::vector stack = op_kernel.stack_; + auto& dynamic_args = op_kernel.dynamic_args_; + + int tensor_id = 0; + int int_id = 0; + for (auto& dynamic_arg : dynamic_args) { + int arg_index = dynamic_arg.arg_index; + DynamicArgType dynamic_arg_type = dynamic_arg.arg_type; + int length = dynamic_arg.length; + + if (length == 0) { + continue; } - stack[arg_index] = tensor_list; - break; - } - case DynamicArgType::ListOptionalTensorType: { - std::vector> optional_tensor_list; - auto& list_item_types = dynamic_arg.list_item_types; - TORCH_CHECK( - list_item_types.has_value(), - "Could not find list of item types for optional tensor list input"); - - for (const std::string& item_type : list_item_types.value()) { - if (item_type == "as_tensor") { - at::Tensor* tensor = tensor_handle_to_tensor_pointer( - flatten_tensor_args[tensor_id++]); - optional_tensor_list.emplace_back(*tensor); - } else if (item_type == 
"as_none") { - optional_tensor_list.emplace_back(std::nullopt); - } - } - stack[arg_index] = optional_tensor_list; - break; - } - case DynamicArgType::ListIntType: { - std::vector vals; - vals.reserve(length); - for (int j = 0; j < length; j++) { - vals.push_back(flatten_int_args[int_id++]); + + switch (dynamic_arg_type) { + case DynamicArgType::TensorType: { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + stack[arg_index] = *tensor; + break; + } + case DynamicArgType::IntType: { + int64_t val = flatten_int_args[int_id++]; + stack[arg_index] = val; + break; + } + case DynamicArgType::ListTensorType: { + std::vector tensor_list; + for (int j = 0; j < length; j++) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + tensor_list.push_back(*tensor); + } + stack[arg_index] = tensor_list; + break; + } + case DynamicArgType::ListOptionalTensorType: { + std::vector> optional_tensor_list; + auto& list_item_types = dynamic_arg.list_item_types; + TORCH_CHECK( + list_item_types.has_value(), + "Could not find list of item types for optional tensor list input"); + + for (const std::string& item_type : list_item_types.value()) { + if (item_type == "as_tensor") { + at::Tensor* tensor = tensor_handle_to_tensor_pointer( + flatten_tensor_args[tensor_id++]); + optional_tensor_list.emplace_back(*tensor); + } else if (item_type == "as_none") { + optional_tensor_list.emplace_back(std::nullopt); + } + } + stack[arg_index] = optional_tensor_list; + break; + } + case DynamicArgType::ListIntType: { + std::vector vals; + vals.reserve(length); + for (int j = 0; j < length; j++) { + vals.push_back(flatten_int_args[int_id++]); + } + stack[arg_index] = vals; + break; + } + default: + TORCH_CHECK(false, "Unsupported dynamic arg type: ", dynamic_arg_type); } - stack[arg_index] = vals; - break; - } - default: - TORCH_CHECK(false, "Unsupported dynamic arg type: ", dynamic_arg_type); } - } - - int num_output_tensors = op_kernel.num_output_tensors(); - TORCH_CHECK( - tensor_id == num_tensors - num_output_tensors, - "Mismatch between tensors consumed and num of input tensor, got tensor_id = .", - tensor_id, - ", expected num = ", - num_tensors - num_output_tensors); - TORCH_CHECK( - int_id == num_ints, - "Mismatch between ints consumed and num_ints, got int_id = ", - int_id, - ", num_ints = ", - num_ints); - - // Call the op with the prepared stack. 
- const c10::OperatorHandle& op = op_kernel.op_handle_; - op.callBoxed(stack); - - const c10::FunctionSchema& schema = op.schema(); - const auto& schema_returns = schema.returns(); - - TORCH_CHECK(op_kernel.outputs_.size() == stack.size()); - TORCH_CHECK(stack.size() == schema_returns.size()); - - int index = 0; - for (const auto& schema_return : schema_returns) { - if (schema_return.type()->kind() == c10::TypeKind::TensorType) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - *tensor = stack[index++].toTensor(); - } else if ( - schema_return.type()->kind() == c10::TypeKind::ListType && - schema_return.type()->isSubtypeOf(at::ListType::ofTensors())) { - auto tensors = stack[index++].toTensorList(); - for (auto&& t : tensors) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - *tensor = t; - } - } else { - TORCH_CHECK( - false, - "NYI: Unsupported return type for schema: ", - schema_return.type()->repr_str()); + + int num_output_tensors = op_kernel.num_output_tensors(); + TORCH_CHECK( + tensor_id == num_tensors - num_output_tensors, + "Mismatch between tensors consumed and num of input tensor, got tensor_id = .", + tensor_id, + ", expected num = ", + num_tensors - num_output_tensors); + TORCH_CHECK( + int_id == num_ints, + "Mismatch between ints consumed and num_ints, got int_id = ", + int_id, + ", num_ints = ", + num_ints); + + // Call the op with the prepared stack. + const c10::OperatorHandle& op = op_kernel.op_handle_; + op.callBoxed(stack); + + const c10::FunctionSchema& schema = op.schema(); + const auto& schema_returns = schema.returns(); + + TORCH_CHECK(op_kernel.outputs_.size() == stack.size()); + TORCH_CHECK(stack.size() == schema_returns.size()); + + int index = 0; + for (const auto& schema_return : schema_returns) { + if (schema_return.type()->kind() == c10::TypeKind::TensorType) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + *tensor = stack[index++].toTensor(); + } else if ( + schema_return.type()->kind() == c10::TypeKind::ListType && + schema_return.type()->isSubtypeOf(at::ListType::ofTensors())) { + auto tensors = stack[index++].toTensorList(); + for (auto&& t : tensors) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + *tensor = t; + } + } else { + TORCH_CHECK( + false, + "NYI: Unsupported return type for schema: ", + schema_return.type()->repr_str()); + } } - } - - TORCH_CHECK( - tensor_id == num_tensors, - "Mismatch between tensors consumed and num_tensors, got tensor_id = ", - tensor_id, - ", expected num = ", - num_tensors); + + TORCH_CHECK( + tensor_id == num_tensors, + "Mismatch between tensors consumed and num_tensors, got tensor_id = ", + tensor_id, + ", expected num = ", + num_tensors); } -} // namespace torch::aot_inductor +} // namespace torch::aot_inductor \ No newline at end of file -- Gitee From 98cf06bf670668bba5a724c630b64952c4aa7418 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:40:41 +0000 Subject: [PATCH 04/11] TensorType Signed-off-by: zjx.com --- torch_npu/csrc/utils/TensorType.cpp | 307 ++++++++++++++-------------- 1 file changed, 149 insertions(+), 158 deletions(-) diff --git a/torch_npu/csrc/utils/TensorType.cpp b/torch_npu/csrc/utils/TensorType.cpp index aeb6fd8b832..8a05d271aab 100644 --- a/torch_npu/csrc/utils/TensorType.cpp +++ b/torch_npu/csrc/utils/TensorType.cpp @@ -14,12 +14,11 @@ std::vector> all_declared_types_npu() { std::vector> ret; // 
can't easily iterate over enum classes, does not support BFloat16 now - std::vector backends = { c10::Backend::PrivateUse1 }; + std::vector backends = {c10::Backend::PrivateUse1}; std::vector scalar_types = { ScalarType::Byte, ScalarType::Char, ScalarType::Double, ScalarType::Float, ScalarType::Int, ScalarType::Long, ScalarType::Short, ScalarType::Half, - ScalarType::Bool, ScalarType::BFloat16 - }; + ScalarType::Bool, ScalarType::BFloat16}; for (auto& backend : backends) { for (auto& scalar_type : scalar_types) { @@ -39,10 +38,7 @@ struct PyTensorType { int backend; int scalar_type; - Backend get_backend() const - { - return static_cast(backend); - } + Backend get_backend() const { return static_cast(backend); } DispatchKey get_dispatch_key() const { @@ -59,7 +55,7 @@ static_assert(std::is_standard_layout::value, "PyTensorType must b static void py_bind_tensor_types(const std::vector& tensor_types); -static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) +static PyObject* Tensor_new(PyTypeObject* type, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS auto& tensor_type = *((PyTensorType*)type); @@ -76,112 +72,108 @@ static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs " not available. Torch not compiled with npu enabled.", PTA_ERROR(ErrCode::TYPE)) torch_npu::utils::npu_lazy_init(); return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), - tensor_type.get_scalar_type(), - args, - kwargs)); + tensor_type.get_scalar_type(), + args, + kwargs)); END_HANDLE_TH_ERRORS } static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg) { - HANDLE_TH_ERRORS - auto self = (PyTensorType*)_self; - if (THPVariable_Check(arg)) { - const auto& var = THPVariable_Unpack(arg); - - if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && - var.scalar_type() == static_cast(self->scalar_type)) { - Py_RETURN_TRUE; + HANDLE_TH_ERRORS + auto self = (PyTensorType*)_self; + if (THPVariable_Check(arg)) { + const auto& var = THPVariable_Unpack(arg); + + if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && + var.scalar_type() == static_cast(self->scalar_type)) { + Py_RETURN_TRUE; + } } - } - Py_RETURN_FALSE; - END_HANDLE_TH_ERRORS + Py_RETURN_FALSE; + END_HANDLE_TH_ERRORS } -PyObject* Tensor_dtype(PyTensorType* self, void *unused) +PyObject* Tensor_dtype(PyTensorType* self, void* unused) { - return torch::autograd::utils::wrap(self->dtype); + return torch::autograd::utils::wrap(self->dtype); } -PyObject* Tensor_layout(PyTensorType* self, void *unused) +PyObject* Tensor_layout(PyTensorType* self, void* unused) { - return torch::autograd::utils::wrap(self->layout); + return torch::autograd::utils::wrap(self->layout); } -PyObject* Tensor_is_npu(PyTensorType* self, void *unused) +PyObject* Tensor_is_npu(PyTensorType* self, void* unused) { - if (self->is_npu) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } + if (self->is_npu) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } } -PyObject* Tensor_is_sparse(PyTensorType *self, void *unused) +PyObject* Tensor_is_sparse(PyTensorType* self, void* unused) { - if (self->layout->layout == at::Layout::Strided) { - Py_RETURN_FALSE; - } else { - Py_RETURN_TRUE; - } + if (self->layout->layout == at::Layout::Strided) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } } static struct PyMethodDef metaclass_methods[] = { {"__instancecheck__", Tensor_instancecheck, METH_O, nullptr}, - {nullptr} -}; + {nullptr}}; -using getter = PyObject* 
(*)(PyObject *, void *); +using getter = PyObject* (*)(PyObject*, void*); static struct PyGetSetDef metaclass_properties[] = { - {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, - {"layout", (getter)Tensor_layout, nullptr, nullptr, nullptr}, - {"is_npu", (getter)Tensor_is_npu, nullptr, nullptr, nullptr}, - {"is_sparse", (getter)Tensor_is_sparse, nullptr, nullptr, nullptr}, - {nullptr} -}; + {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, + {"layout", (getter)Tensor_layout, nullptr, nullptr, nullptr}, + {"is_npu", (getter)Tensor_is_npu, nullptr, nullptr, nullptr}, + {"is_sparse", (getter)Tensor_is_sparse, nullptr, nullptr, nullptr}, + {nullptr}}; static PyTypeObject metaclass = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch.tensortype", /* tp_name */ - sizeof(PyTypeObject) /* tp_basicsize */ + PyVarObject_HEAD_INIT(nullptr, 0) "torch.tensortype", /* tp_name */ + sizeof(PyTypeObject) /* tp_basicsize */ }; static void py_initialize_metaclass(PyTypeObject& metaclass) { - metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; - metaclass.tp_methods = metaclass_methods; - metaclass.tp_getset = metaclass_properties; - metaclass.tp_base = &PyType_Type; - if (PyType_Ready(&metaclass) < 0) { - throw python_error(); - } + metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + metaclass.tp_methods = metaclass_methods; + metaclass.tp_getset = metaclass_properties; + metaclass.tp_base = &PyType_Type; + if (PyType_Ready(&metaclass) < 0) { + throw python_error(); + } } static PyTypeObject tensor_type_prototype = { - PyVarObject_HEAD_INIT(&metaclass, 0) - nullptr, /* tp_name */ - sizeof(PyTensorType) /* tp_basicsize */ + PyVarObject_HEAD_INIT(&metaclass, 0) nullptr, /* tp_name */ + sizeof(PyTensorType) /* tp_basicsize */ }; static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) { - // NOTE: we don't use the typical static declaration of PyTypeObject because - // we need to initialize as many types as there are VariableType instances. - // We copy the basic object fields from a prototype definition and initialize - // the remaining fields below. - memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); - // Subclassing from torch.Tensor isn't supported. - // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. - type.tp_flags = Py_TPFLAGS_DEFAULT; - type.tp_name = name; - type.tp_new = Tensor_new; - if (PyType_Ready(&type) < 0) { - throw python_error(); - } - if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { - throw python_error(); - } + // NOTE: we don't use the typical static declaration of PyTypeObject because + // we need to initialize as many types as there are VariableType instances. + // We copy the basic object fields from a prototype definition and initialize + // the remaining fields below. + memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); + // Subclassing from torch.Tensor isn't supported. + // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. + type.tp_flags = Py_TPFLAGS_DEFAULT; + type.tp_name = name; + type.tp_new = Tensor_new; + if (PyType_Ready(&type) < 0) { + throw python_error(); + } + if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { + throw python_error(); + } } static std::string get_module(Backend backend) @@ -204,100 +196,100 @@ static std::string get_module(Backend backend) static std::string get_name(Backend backend, ScalarType scalarType) { - std::ostringstream ss; - ss << get_module(backend) << "." 
<< toString(scalarType) << "Tensor"; - return ss.str(); + std::ostringstream ss; + ss << get_module(backend) << "." << toString(scalarType) << "Tensor"; + return ss.str(); } static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) { - // This field is lazily initialized from backend and scalar_type - type_obj.backend = static_cast(backend); - type_obj.scalar_type = static_cast(scalarType); - type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); - type_obj.dtype = torch::getTHPDtype(scalarType); - type_obj.is_npu = (backend == c10::Backend::PrivateUse1); + // This field is lazily initialized from backend and scalar_type + type_obj.backend = static_cast(backend); + type_obj.scalar_type = static_cast(scalarType); + type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); + type_obj.dtype = torch::getTHPDtype(scalarType); + type_obj.is_npu = (backend == c10::Backend::PrivateUse1); } static void set_name(PyTensorType& type_obj, const std::string& name) { - size_t n = sizeof(type_obj.name); - strncpy(type_obj.name, name.c_str(), n); - type_obj.name[n - 1] = '\0'; + size_t n = sizeof(type_obj.name); + strncpy(type_obj.name, name.c_str(), n); + type_obj.name[n - 1] = '\0'; } static THPObjectPtr get_tensor_dict() { - auto torch = THPObjectPtr(PyImport_ImportModule("torch")); - if (!torch) { - throw python_error(); - } - - auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); - if (!tensor_class) { - throw python_error(); - } - - auto tensor_type = (PyTypeObject*)tensor_class.get(); - TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); - - auto res = THPObjectPtr(PyDict_New()); - if (!res) { - throw python_error(); - } - - if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { - throw python_error(); - } - if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { - throw python_error(); - } - - return res; + auto torch = THPObjectPtr(PyImport_ImportModule("torch")); + if (!torch) { + throw python_error(); + } + + auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); + if (!tensor_class) { + throw python_error(); + } + + auto tensor_type = (PyTypeObject*)tensor_class.get(); + TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); + + auto res = THPObjectPtr(PyDict_New()); + if (!res) { + throw python_error(); + } + + if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { + throw python_error(); + } + if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { + throw python_error(); + } + + return res; } static std::vector tensor_types; static void initialize_npu_aten_types(std::vector& tensor_types) { - // only initialize npu types - auto declared_types = all_declared_types_npu(); - tensor_types.resize(declared_types.size()); - - for (size_t i = 0, end = declared_types.size(); i != end; i++) { - auto& tensor_type = tensor_types[i]; - Backend backend = declared_types[i].first; - ScalarType scalar_type = declared_types[i].second; - set_type(tensor_type, backend, scalar_type); - set_name(tensor_type, get_name(backend, scalar_type)); - } + // only initialize npu types + auto declared_types = all_declared_types_npu(); + tensor_types.resize(declared_types.size()); + + for (size_t i = 0, end = declared_types.size(); i != end; i++) { + auto& tensor_type = tensor_types[i]; + Backend backend = declared_types[i].first; + ScalarType scalar_type = declared_types[i].second; + set_type(tensor_type, 
backend, scalar_type); + set_name(tensor_type, get_name(backend, scalar_type)); + } } void _initialize_python_bindings() { - // Initialize the at::Type* pointers, name, and properties of the PyTensorType - // vector. After this call, the vector must not be resized. - initialize_npu_aten_types(tensor_types); - - // Initialize the Python metaclass for the torch.FloatTensor, etc. types. - // The metaclass handles __instancecheck__ checks and binds the dtype property - // on the type objects. - py_initialize_metaclass(metaclass); - - // Get the tp_dict of the Variable class. We copy function definitions - // onto each Tensor type object so that they can be accessed via e.g. - // `torch.npu.FloatTensor.add`. - auto tensor_dict = get_tensor_dict(); - - // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. - for (auto& tensor_type : tensor_types) { - py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); - } - - // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor - // is added to the `torch_npu` module as `FloatTensor`. Also add all the type - // objects to the set torch_npu._tensor_classes. - py_bind_tensor_types(tensor_types); + // Initialize the at::Type* pointers, name, and properties of the PyTensorType + // vector. After this call, the vector must not be resized. + initialize_npu_aten_types(tensor_types); + + // Initialize the Python metaclass for the torch.FloatTensor, etc. types. + // The metaclass handles __instancecheck__ checks and binds the dtype property + // on the type objects. + py_initialize_metaclass(metaclass); + + // Get the tp_dict of the Variable class. We copy function definitions + // onto each Tensor type object so that they can be accessed via e.g. + // `torch.npu.FloatTensor.add`. + auto tensor_dict = get_tensor_dict(); + + // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. + for (auto& tensor_type : tensor_types) { + py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); + } + + // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor + // is added to the `torch_npu` module as `FloatTensor`. Also add all the type + // objects to the set torch_npu._tensor_classes. + py_bind_tensor_types(tensor_types); } static void py_bind_tensor_types(const std::vector& tensor_types) @@ -335,23 +327,22 @@ static void py_bind_tensor_types(const std::vector& tensor_types) } // Callback for python part. 
Used for additional initialization of python classes -static PyObject* THPModule_initExtension(PyObject *_unused, PyObject *noargs) +static PyObject* THPModule_initExtension(PyObject* _unused, PyObject* noargs) { - HANDLE_TH_ERRORS - _initialize_python_bindings(); - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS + HANDLE_TH_ERRORS + _initialize_python_bindings(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } // autograd methods on torch._C static PyMethodDef TorchNpuExtensionMethods[] = { {"_initExtension", (PyCFunction)THPModule_initExtension, METH_NOARGS, nullptr}, - {nullptr, nullptr, 0, nullptr} -}; + {nullptr, nullptr, 0, nullptr}}; PyMethodDef* npu_extension_functions() { - return TorchNpuExtensionMethods; -} -} + return TorchNpuExtensionMethods; } +} // namespace utils +} // namespace torch_npu \ No newline at end of file -- Gitee From 41380bcef4f8254d5b7f56847156b070cc671a16 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:44:43 +0000 Subject: [PATCH 05/11] runner_npu Signed-off-by: zjx.com --- .../model_container_runner_npu.cpp | 89 ++++++++++--------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.cpp b/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.cpp index b995cd0ffe3..d333dacadf5 100644 --- a/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.cpp +++ b/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.cpp @@ -9,12 +9,13 @@ namespace fs = std::filesystem; #endif namespace { -bool file_exists(std::string& path) { +bool file_exists(std::string& path) +{ #ifdef _WIN32 - return fs::exists(path); + return fs::exists(path); #else - struct stat rc {}; - return lstat(path.c_str(), &rc) == 0; + struct stat rc {}; + return lstat(path.c_str(), &rc) == 0; #endif } } // namespace @@ -27,51 +28,59 @@ AOTIModelContainerRunnerNpu::AOTIModelContainerRunnerNpu( const std::string& device_str, const std::string& cubin_dir) : AOTIModelContainerRunner( - model_so_path, - num_models, - device_str, - cubin_dir) { - model_so_path_ = model_so_path; - init_flag_ = false; + model_so_path, + num_models, + device_str, + cubin_dir) +{ + model_so_path_ = model_so_path; + init_flag_ = false; } AOTIModelContainerRunnerNpu::~AOTIModelContainerRunnerNpu() = default; -void AOTIModelContainerRunnerNpu::init_proxy_executor() { - if (init_flag_) return; +void AOTIModelContainerRunnerNpu::init_proxy_executor() +{ + if (init_flag_) { + return; + } - init_flag_ = true; - size_t lastindex = model_so_path_.find_last_of('.'); - std::string json_filename = model_so_path_.substr(0, lastindex) + "_npu.json"; + init_flag_ = true; + size_t lastindex = model_so_path_.find_last_of('.'); + std::string json_filename = model_so_path_.substr(0, lastindex) + "_npu.json"; - if (file_exists(json_filename)) { - proxy_executor_npu_ = std::make_unique( - json_filename, false); - proxy_executor_handle_ = - reinterpret_cast(proxy_executor_npu_.get()); - } else { - proxy_executor_handle_ = nullptr; - } + if (file_exists(json_filename)) { + proxy_executor_npu_ = std::make_unique( + json_filename, false); + proxy_executor_handle_ = + reinterpret_cast(proxy_executor_npu_.get()); + } else { + proxy_executor_handle_ = nullptr; + } } std::vector AOTIModelContainerRunnerNpu::run( - const std::vector& inputs, void* stream_handle) { - init_proxy_executor(); - c10_npu::NPUStream npu_stream = c10_npu::getCurrentNPUStream(); - return AOTIModelContainerRunner::run( - inputs, reinterpret_cast(npu_stream.stream())); + const 
std::vector& inputs, + void* stream_handle) +{ + init_proxy_executor(); + c10_npu::NPUStream npu_stream = c10_npu::getCurrentNPUStream(); + return AOTIModelContainerRunner::run( + inputs, reinterpret_cast(npu_stream.stream())); } std::vector AOTIModelContainerRunnerNpu::run_with_npu_stream( std::vector& inputs, - c10_npu::NPUStream npu_stream) { - return AOTIModelContainerRunner::run( - inputs, reinterpret_cast(npu_stream.stream())); + c10_npu::NPUStream npu_stream) +{ + return AOTIModelContainerRunner::run( + inputs, reinterpret_cast(npu_stream.stream())); } -void AOTIModelContainerRunnerNpu::set_proxy_executor(AOTIProxyExecutorHandle handle) { - proxy_executor_handle_ = handle; - init_flag_ = true; +void AOTIModelContainerRunnerNpu::set_proxy_executor(AOTIProxyExecutorHandle handle) +{ + proxy_executor_handle_ = handle; + init_flag_ = true; } namespace { @@ -79,13 +88,13 @@ std::unique_ptr create_aoti_runner_npu( const std::string& model_so_path, size_t num_models, const std::string& device_str, - const std::string& cubin_dir) { - return std::make_unique( - model_so_path, num_models, device_str, cubin_dir); + const std::string& cubin_dir) +{ + return std::make_unique( + model_so_path, num_models, device_str, cubin_dir); } RegisterAOTIModelRunner register_npu_runner("npu", &create_aoti_runner_npu); -} - +} // namespace -} // namespace torch::inductor +} // namespace torch::inductor \ No newline at end of file -- Gitee From d3b8b7a6723a9744cecaf6ab978020afcc29939f Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:46:35 +0000 Subject: [PATCH 06/11] ForceAclnnList Signed-off-by: zjx.com --- .../csrc/framework/utils/ForceAclnnList.cpp | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp index c6b0e64641a..0bdf4393e0c 100644 --- a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp +++ b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp @@ -19,34 +19,36 @@ namespace at_npu { namespace native { -void ForceAclnn::RegisterOp(const std::string &list) { - if (list.empty()) { - return; - } +void ForceAclnn::RegisterOp(const std::string& list) +{ + if (list.empty()) { + return; + } - auto value = list; - std::string delimiter = ","; - auto start = 0U; - auto end = value.find(delimiter); - std::string token; - while (end != std::string::npos) { + auto value = list; + std::string delimiter = ","; + auto start = 0U; + auto end = value.find(delimiter); + std::string token; + while (end != std::string::npos) { + token = value.substr(start, end - start); + if (!token.empty()) { + force_aclnn_op_list_.insert(token); + } + start = end + delimiter.size(); + end = value.find(delimiter, start); + } token = value.substr(start, end - start); if (!token.empty()) { - force_aclnn_op_list_.insert(token); + force_aclnn_op_list_.insert(token); } - start = end + delimiter.size(); - end = value.find(delimiter, start); - } - token = value.substr(start, end - start); - if (!token.empty()) { - force_aclnn_op_list_.insert(token); - } - return; + return; } -bool ForceAclnn::IsForceAclnnOp(const std::string &op_name) const { - bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); - return ret; +bool ForceAclnn::IsForceAclnnOp(const std::string& op_name) const +{ + bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); + return ret; } } // namespace native } // namespace at_npu \ No newline at end of file -- Gitee From 
8710a02447c41f99a6a75e9e7aeea5b15aa14972 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:51:07 +0000 Subject: [PATCH 07/11] shim_npu Signed-off-by: zjx.com --- .../csrc/inductor/aoti_torch/shim_npu.cpp | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_torch/shim_npu.cpp b/torch_npu/csrc/inductor/aoti_torch/shim_npu.cpp index 95f5a807ae4..3139135c028 100644 --- a/torch_npu/csrc/inductor/aoti_torch/shim_npu.cpp +++ b/torch_npu/csrc/inductor/aoti_torch/shim_npu.cpp @@ -12,7 +12,8 @@ #ifdef __cplusplus extern "C" { #endif -int32_t aoti_torch_device_type_npu() { +int32_t aoti_torch_device_type_npu() +{ return (int32_t)c10::DeviceType::PrivateUse1; } @@ -21,15 +22,16 @@ int32_t aoti_torch_device_type_npu() { #endif namespace { - static c10::Device c10_device(int32_t device_type, int32_t device_index) { - if (device_type == aoti_torch_device_type_cpu()) { - return c10::Device(static_cast(device_type)); - } else { - return c10::Device( - static_cast(device_type), - static_cast(device_index)); - } +static c10::Device c10_device(int32_t device_type, int32_t device_index) +{ + if (device_type == aoti_torch_device_type_cpu()) { + return c10::Device(static_cast(device_type)); + } else { + return c10::Device( + static_cast(device_type), + static_cast(device_index)); } +} } // namespace AOTITorchError aoti_torch_create_tensor_from_blob_npu( @@ -41,18 +43,19 @@ AOTITorchError aoti_torch_create_tensor_from_blob_npu( int32_t dtype, int32_t device_type, int32_t device_index, - AtenTensorHandle* ret_new_tensor) { - AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ - c10::IntArrayRef sizes(sizes_ptr, ndim); - c10::IntArrayRef strides(strides_ptr, ndim); - c10::Device device = c10_device(device_type, device_index); - c10::TensorOptions options = c10::TensorOptions().device(device).dtype( - static_cast(dtype)); - *ret_new_tensor = torch::aot_inductor::new_tensor_handle( - // data == nullptr can happen for a 0-size tensor - (data != nullptr) ? at_npu::native::from_blob(data, sizes, strides, storage_offset, options, device) - : at::empty_strided(sizes, strides, options)); - }); + AtenTensorHandle* ret_new_tensor) +{ + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + c10::IntArrayRef sizes(sizes_ptr, ndim); + c10::IntArrayRef strides(strides_ptr, ndim); + c10::Device device = c10_device(device_type, device_index); + c10::TensorOptions options = c10::TensorOptions().device(device).dtype( + static_cast(dtype)); + *ret_new_tensor = torch::aot_inductor::new_tensor_handle( + // data == nullptr can happen for a 0-size tensor + (data != nullptr) ? 
at_npu::native::from_blob(data, sizes, strides, storage_offset, options, device) + : at::empty_strided(sizes, strides, options)); + }); } AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2( @@ -67,7 +70,8 @@ AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2( AtenTensorHandle* ret_new_tensor, int32_t layout, const uint8_t* opaque_metadata, - int64_t opaque_metadata_size) { + int64_t opaque_metadata_size) +{ AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ if (layout == static_cast(at::kMkldnn)) { throw std::runtime_error("do not support mkldnn on npu."); -- Gitee From 8120e6ad6b15dcc63c4d19571180dbc5d61cf6a6 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:52:26 +0000 Subject: [PATCH 08/11] Init Signed-off-by: zjx.com --- torch_npu/csrc/distributed/Init.cpp | 282 ++++++++++++++-------------- 1 file changed, 144 insertions(+), 138 deletions(-) diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 99c6dc6f22a..19ed1289a40 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -47,7 +47,8 @@ public: : impl_(std::move(impl)) {} explicit IntrusivePtrNoGilDestructor(T* impl) : impl_(c10::intrusive_ptr::unsafe_steal_from_new(impl)) {} - ~IntrusivePtrNoGilDestructor() { + ~IntrusivePtrNoGilDestructor() + { if (impl_) { if (PyGILState_Check() != 0) { pybind11::gil_scoped_release release; @@ -57,19 +58,24 @@ public: } } } - T& operator*() const noexcept { + T& operator*() const noexcept + { return *impl_; } - T* operator->() const noexcept { + T* operator->() const noexcept + { return impl_.get(); } - C10_NODISCARD T* get() const noexcept { + C10_NODISCARD T* get() const noexcept + { return impl_.get(); } - void reset() noexcept { + void reset() noexcept + { impl_.reset(); } - operator bool() const noexcept { + operator bool() const noexcept + { return impl_; } }; @@ -97,7 +103,7 @@ class BroadcastWork { public: inline std::vector cast_tensors(at::TensorList tensors) const { - static auto cast_back_to_ori_format = [](const at::Tensor &t) { + static auto cast_back_to_ori_format = [](const at::Tensor& t) { return at_npu::native::custom_ops::npu_format_cast(t, torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); }; return c10::fmap(tensors, cast_back_to_ori_format); @@ -109,7 +115,8 @@ public: int root_rank = 0) : bucket_tensors_(std::move(bucket_tensors)), cast_tensors_(cast_tensors(bucket_tensors_)), - flat_tensor_({torch::utils::flatten_dense_tensors(cast_tensors_)}) { + flat_tensor_({torch::utils::flatten_dense_tensors(cast_tensors_)}) + { c10d::BroadcastOptions broadcastOptions; broadcastOptions.rootRank = root_rank; work_ = process_group->broadcast(flat_tensor_, broadcastOptions); @@ -141,7 +148,6 @@ protected: std::vector flat_tensor_; private: - // The broadcast work that is kicked off upon construction. 
c10::intrusive_ptr work_; }; @@ -210,17 +216,17 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) throw python_error(); } auto torch_npu_C_m = py::handle(torch_npu_C_module).cast(); - + auto m = torch_npu_C_m.def_submodule("_distributed_c10d", "distributed c10d bindings"); auto module = py::handle(m).cast(); module.def("_compute_bucket_assignment_by_size", [](const std::vector& tensors, - const std::vector& bucket_size_limits, - const std::vector& expect_sparse_gradient, - const std::vector& tensor_indices, - const c10::optional>& logger) { + const std::vector& bucket_size_limits, + const std::vector& expect_sparse_gradient, + const std::vector& tensor_indices, + const c10::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); return ::c10d_npu::compute_bucket_assignment_by_size(tensors, bucket_size_limits, expect_sparse_gradient, tensor_indices, {logger_weakref}); @@ -237,8 +243,8 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) module.def("_verify_params_across_processes", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& process_group, - const std::vector& params, - const c10::optional>& logger) { + const std::vector& params, + const c10::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); c10d_npu::verify_params_across_processes(process_group, params, {logger_weakref}); @@ -254,23 +260,23 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) module .def("_register_comm_hook", &_register_comm_hook, - py::arg("reducer"), - py::arg("state"), - py::arg("comm_hook"), - py::call_guard()) + py::arg("reducer"), + py::arg("state"), + py::arg("comm_hook"), + py::call_guard()) .def("_register_builtin_comm_hook", &_register_builtin_comm_hook, - py::arg("reducer"), - py::arg("comm_hook_type")); + py::arg("reducer"), + py::arg("comm_hook_type")); module.def("_broadcast_coalesced", // Define a lambda such that the pybind11 prototype can take a std::vector // for the tensor list argument, but still pass it to the underlying // function as a c10::ArrayRef. 
[](c10::intrusive_ptr<::c10d::ProcessGroup> process_group, - std::vector tensors, // NOLINT - size_t buffer_size, - int rank) { + std::vector tensors, // NOLINT + size_t buffer_size, + int rank) { torch_npu::distributed::broadcast_coalesced( std::move(process_group), tensors, buffer_size, rank); }, @@ -285,76 +291,76 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) shared_ptr_class_(module, "Reducer") .def(py::init< - std::vector, - std::vector>, - std::vector, - c10::intrusive_ptr<::c10d::ProcessGroup>, - std::vector, - int64_t, - bool, - bool, - std::unordered_map, - int64_t>(), - py::arg("params"), - py::arg("bucket_indices"), - py::arg("per_bucket_size_limits"), - py::arg("process_group"), - py::arg("expect_sparse_gradients") = std::vector(), - py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap, - py::arg("find_unused_parameters") = false, - py::arg("gradient_as_bucket_view") = false, - py::arg("param_to_name_mapping") = + std::vector, + std::vector>, + std::vector, + c10::intrusive_ptr<::c10d::ProcessGroup>, + std::vector, + int64_t, + bool, + bool, + std::unordered_map, + int64_t>(), + py::arg("params"), + py::arg("bucket_indices"), + py::arg("per_bucket_size_limits"), + py::arg("process_group"), + py::arg("expect_sparse_gradients") = std::vector(), + py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap, + py::arg("find_unused_parameters") = false, + py::arg("gradient_as_bucket_view") = false, + py::arg("param_to_name_mapping") = std::unordered_map(), - py::arg("first_bucket_bytes_cap") = ::c10d::kDefaultFirstBucketBytes, - py::call_guard()) + py::arg("first_bucket_bytes_cap") = ::c10d::kDefaultFirstBucketBytes, + py::call_guard()) .def("prepare_for_forward", &c10d_npu::Reducer::prepare_for_forward, - py::call_guard()) + py::call_guard()) .def("prepare_for_backward", &c10d_npu::Reducer::prepare_for_backward, - py::call_guard()) + py::call_guard()) .def("prepare_for_backward", [](c10d_npu::Reducer& reducer, const at::Tensor& output) -> void { reducer.prepare_for_backward({output}); }, - py::call_guard()) + py::call_guard()) .def("get_backward_stats", &c10d_npu::Reducer::get_backward_stats) .def("_install_post_backward_futures", [](::c10d_npu::Reducer& reducer, const std::vector>& futs) { - c10::List> futures(c10::FutureType::create(c10::TensorType::get())); - for (const auto &fut : futs) { - futures.push_back(fut->fut); - } - reducer.install_futures(std::move(futures)); - }, - py::call_guard()) + c10::List> futures(c10::FutureType::create(c10::TensorType::get())); + for (const auto& fut : futs) { + futures.push_back(fut->fut); + } + reducer.install_futures(std::move(futures)); + }, + py::call_guard()) .def("_rebuild_buckets", &::c10d_npu::Reducer::rebuild_buckets, - py::call_guard()) + py::call_guard()) .def("_get_zeros_like_grad_buckets", [](::c10d_npu::Reducer& reducer) { return reducer.get_grad_buckets(true); }, - py::call_guard()) + py::call_guard()) .def("_push_all_rebuilt_params", &::c10d_npu::Reducer::push_rebuilt_params_for_all_indices, - py::call_guard()) + py::call_guard()) .def("_set_forward_pass_work_handle", &::c10d_npu::Reducer::set_forward_pass_work_handle, - py::call_guard()) + py::call_guard()) .def("_get_local_used_map", &::c10d_npu::Reducer::get_local_used_map_on_device) .def("_set_ddp_runtime_logging_sample_rate", &::c10d_npu::Reducer::set_ddp_runtime_logging_sample_rate, - py::arg("sample_rate"), - py::call_guard()) + py::arg("sample_rate"), + py::call_guard()) .def("_set_static_graph", &::c10d_npu::Reducer::set_static_graph, - 
py::call_guard()) + py::call_guard()) .def("_ddp_graph_static", &::c10d_npu::Reducer::ddp_graph_static, - py::call_guard()) + py::call_guard()) .def("_delay_all_reduce", &::c10d_npu::Reducer::delay_all_reduce, - py::call_guard()) + py::call_guard()) .def("_run_comm_hook", [](::c10d_npu::Reducer& reducer, ::c10d::GradBucket& bucket) -> std::shared_ptr { @@ -362,10 +368,10 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) reducer.run_comm_hook(bucket); return std::make_shared(fut); }, - py::call_guard()) + py::call_guard()) .def("set_logger", [](::c10d_npu::Reducer& reducer, - const std::shared_ptr<::c10d::Logger> logger) { + const std::shared_ptr<::c10d::Logger> logger) { std::weak_ptr<::c10d::Logger> logger_weakref = logger; reducer.set_logger(logger_weakref); }); @@ -374,40 +380,40 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) auto processGroupHCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupHCCL>( module, "ProcessGroupHCCL", dist.attr("Backend")) .def(py::init&, - int, - int, - c10::intrusive_ptr<::c10d_npu::ProcessGroupHCCL::Options>>(), - py::call_guard()) + int, + int, + c10::intrusive_ptr<::c10d_npu::ProcessGroupHCCL::Options>>(), + py::call_guard()) .def(py::init([](const c10::intrusive_ptr<::c10d::Store>& store, - int rank, - int size, - const std::chrono::milliseconds& timeout) { - auto options = ::c10d_npu::ProcessGroupHCCL::Options::create(); - options->is_high_priority_stream = false; - options->timeout = timeout; - return c10::make_intrusive<::c10d_npu::ProcessGroupHCCL>( - store, rank, size, options); - }), - py::arg("store"), - py::arg("rank"), - py::arg("size"), - py::arg("timeout") = kProcessGroupDefaultTimeout, - py::call_guard()) + int rank, + int size, + const std::chrono::milliseconds& timeout) { + auto options = ::c10d_npu::ProcessGroupHCCL::Options::create(); + options->is_high_priority_stream = false; + options->timeout = timeout; + return c10::make_intrusive<::c10d_npu::ProcessGroupHCCL>( + store, rank, size, options); + }), + py::arg("store"), + py::arg("rank"), + py::arg("size"), + py::arg("timeout") = kProcessGroupDefaultTimeout, + py::call_guard()) .def("get_hccl_comm", &::c10d_npu::ProcessGroupHCCL::getHcclComm) .def("_set_hccl_comm_name", &::c10d_npu::ProcessGroupHCCL::setHcclCommName) .def("resume_hccl_comm", &::c10d_npu::ProcessGroupHCCL::resumeHcclComm) .def("_set_switch_nic_comm", &::c10d_npu::ProcessGroupHCCL::setSwitchNicComm, - py::arg("rankid"), - py::arg("nRanks"), - py::arg("ranks") = std::vector{}, - py::arg("useBackup") = std::vector{}) + py::arg("rankid"), + py::arg("nRanks"), + py::arg("ranks") = std::vector{}, + py::arg("useBackup") = std::vector{}) .def("abort_hccl_comm", &::c10d_npu::ProcessGroupHCCL::abortAndClearHcclComm) .def("_delete_tcpstore_key", &::c10d_npu::ProcessGroupHCCL::deleteTCPStoreKey) .def("set_watchdog_status", &::c10d_npu::ProcessGroupHCCL::setWatchdogStatus) .def("clear_workmeta_list", &::c10d_npu::ProcessGroupHCCL::clearWorkMetaList) .def("get_hccl_comm_name", - [](::c10d_npu::ProcessGroupHCCL &pg, int rankid, py::args args, py::kwargs kwargs) + [](::c10d_npu::ProcessGroupHCCL& pg, int rankid, py::args args, py::kwargs kwargs) -> std::string { bool init_comm = true; if (kwargs.contains("init_comm")) { @@ -416,31 +422,31 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) return pg.getHcclCommName(rankid, init_comm); }) .def("_get_stream_id", &::c10d_npu::ProcessGroupHCCL::getStreamId, - py::arg("p2p") = false, - py::arg("peer") = -1) + py::arg("p2p") = 
false, + py::arg("peer") = -1) .def_property_readonly("options", &::c10d_npu::ProcessGroupHCCL::getOptions) .def("batch_isend_irecv", - [](::c10d_npu::ProcessGroupHCCL &pg, std::vector &op_type, - std::vector &tensors, - std::vector remote_rank_list) + [](::c10d_npu::ProcessGroupHCCL& pg, std::vector& op_type, + std::vector& tensors, + std::vector remote_rank_list) -> c10::intrusive_ptr { return pg.batch_isend_irecv(op_type, tensors, remote_rank_list); }, - py::call_guard()) + py::call_guard()) .def("reduce_scatter_tensor_uneven", &::c10d_npu::ProcessGroupHCCL::_reduce_scatter_base_uneven, - py::arg("output"), - py::arg("input"), - py::arg("input_split_sizes") = std::vector{}, - py::arg("opts") = ::c10d::ReduceScatterOptions(), - py::call_guard()) + py::arg("output"), + py::arg("input"), + py::arg("input_split_sizes") = std::vector{}, + py::arg("opts") = ::c10d::ReduceScatterOptions(), + py::call_guard()) .def("all_gather_into_tensor_uneven", &::c10d_npu::ProcessGroupHCCL::_allgather_base_uneven, - py::arg("output"), - py::arg("input"), - py::arg("output_split_sizes") = std::vector{}, - py::arg("opts") = ::c10d::AllgatherOptions(), - py::call_guard()); + py::arg("output"), + py::arg("input"), + py::arg("output_split_sizes") = std::vector{}, + py::arg("opts") = ::c10d::AllgatherOptions(), + py::call_guard()); intrusive_ptr_class_<::c10d_npu::ProcessGroupHCCL::Options>( processGroupHCCL, @@ -449,13 +455,13 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) .def(py::init<>()) .def_readwrite("op_timeout", &::c10d_npu::ProcessGroupHCCL::Options::opTimeout) .def_readwrite("is_high_priority_stream", - &::c10d_npu::ProcessGroupHCCL::Options::is_high_priority_stream) + &::c10d_npu::ProcessGroupHCCL::Options::is_high_priority_stream) .def_readwrite("global_ranks_in_group", - &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group) + &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group) .def_readwrite("hccl_config", &::c10d_npu::ProcessGroupHCCL::Options::hccl_config) .def_readwrite("group_id", - &::c10d_npu::ProcessGroupHCCL::Options::group_id); - + &::c10d_npu::ProcessGroupHCCL::Options::group_id); + // bind for ProcessGroupLCCL auto processGroupLCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupLCCL>( module, "ProcessGroupLCCL", dist.attr("Backend")) @@ -510,37 +516,37 @@ Example:: >>> client_store.get("first_key") )") - .def(py::init([](const std::string &host, - uint16_t port, - int worldSize, - bool agentRun, - uint32_t agentPid, - bool isServer, - bool enableTiered, - std::chrono::milliseconds timeout, - bool waitWorkers, - bool multiTenant) { + .def(py::init([](const std::string& host, + uint16_t port, + int worldSize, + bool agentRun, + uint32_t agentPid, + bool isServer, + bool enableTiered, + std::chrono::milliseconds timeout, + bool waitWorkers, + bool multiTenant) { c10::optional numWorkers = c10::nullopt; if (worldSize > -1) { numWorkers = static_cast(worldSize); } - ::c10d::TCPStoreOptions opts{ port, isServer, numWorkers, waitWorkers, timeout, multiTenant }; - return c10::make_intrusive <::c10d::ParallelTcpStore>(host, agentRun, agentPid, enableTiered, opts); - }), - py::arg("host") = "127.0.0.1", - py::arg("port") = 29500, - py::arg("world_size") = -1, - py::arg("agent_run") = false, - py::arg("agent_pid") = -1, - py::arg("is_server") = false, - py::arg("enable_tiered") = false, - py::arg("timeout") = std::chrono::milliseconds(300000), - py::arg("wait_workers") = true, - py::arg("multi_tenant") = false); + ::c10d::TCPStoreOptions 
opts{port, isServer, numWorkers, waitWorkers, timeout, multiTenant}; + return c10::make_intrusive<::c10d::ParallelTcpStore>(host, agentRun, agentPid, enableTiered, opts); + }), + py::arg("host") = "127.0.0.1", + py::arg("port") = 29500, + py::arg("world_size") = -1, + py::arg("agent_run") = false, + py::arg("agent_pid") = -1, + py::arg("is_server") = false, + py::arg("enable_tiered") = false, + py::arg("timeout") = std::chrono::milliseconds(300000), + py::arg("wait_workers") = true, + py::arg("multi_tenant") = false); module.def("_dump_hccl_trace_json", [](std::optional includeCollectives, - std::optional onlyActive) { + std::optional onlyActive) { return py::bytes(::c10d_npu::dump_hccl_trace_json( includeCollectives.value_or(true), onlyActive.value_or(false))); }, @@ -556,8 +562,8 @@ Example:: )"); module.def("_dump_hccl_trace", [](std::optional includeCollectives, - std::optional includeStackTraces, - std::optional onlyActive) { + std::optional includeStackTraces, + std::optional onlyActive) { return py::bytes(::c10d_npu::dump_hccl_trace( includeCollectives.value_or(true), includeStackTraces.value_or(true), @@ -576,7 +582,7 @@ Example:: Default settings return everything - i.e. contains HCCL comm dumps and collective traces. )"); - Py_RETURN_TRUE; + Py_RETURN_TRUE; } // c10d methods on torch._C @@ -593,4 +599,4 @@ PyMethodDef* python_functions() } } // namespace distributed -} // namespace torch_npu +} // namespace torch_npu \ No newline at end of file -- Gitee From 0bd3d4c3170244842efa91492473bf4739009ef9 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:55:16 +0000 Subject: [PATCH 09/11] AclInterface Signed-off-by: zjx.com --- .../csrc/core/npu/interface/AclInterface.cpp | 275 ++++++++++-------- 1 file changed, 152 insertions(+), 123 deletions(-) diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index d85dd2211da..a2ea830bd73 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -11,14 +11,13 @@ #endif namespace c10_npu { - namespace acl { #undef LOAD_FUNCTION #define LOAD_FUNCTION(funcName) \ - REGISTER_FUNCTION(libascendcl, funcName) + REGISTER_FUNCTION(libascendcl, funcName) #undef GET_FUNC -#define GET_FUNC(funcName) \ - GET_FUNCTION(libascendcl, funcName) +#define GET_FUNC(funcName) \ + GET_FUNCTION(libascendcl, funcName) REGISTER_LIBRARY(libascendcl) LOAD_FUNCTION(aclGetRecentErrMsg) @@ -83,8 +82,9 @@ LOAD_FUNCTION(aclmdlRICaptureTaskUpdateEnd) LOAD_FUNCTION(aclrtHostRegister) LOAD_FUNCTION(aclrtHostUnregister) -aclprofStepInfoPtr init_stepinfo() { - typedef aclprofStepInfoPtr(*npdInitFunc)(); +aclprofStepInfoPtr init_stepinfo() +{ + typedef aclprofStepInfoPtr (*npdInitFunc)(); static npdInitFunc func = nullptr; if (func == nullptr) { func = (npdInitFunc)GET_FUNC(aclprofCreateStepInfo); @@ -94,8 +94,9 @@ aclprofStepInfoPtr init_stepinfo() { return ret; } -NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) { - typedef NpdStatus(*npdDestroyFunc)(aclprofStepInfoPtr); +NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) +{ + typedef NpdStatus (*npdDestroyFunc)(aclprofStepInfoPtr); static npdDestroyFunc func = nullptr; if (func == nullptr) { func = (npdDestroyFunc)GET_FUNC(aclprofDestroyStepInfo); @@ -105,8 +106,9 @@ NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) { return ret; } -NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) { - typedef 
NpdStatus(*npdStartProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); +NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) +{ + typedef NpdStatus (*npdStartProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); static npdStartProfiling func = nullptr; if (func == nullptr) { func = (npdStartProfiling)GET_FUNC(aclprofGetStepTimestamp); @@ -116,8 +118,9 @@ NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, return ret; } -NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) { - typedef NpdStatus(*npdStopProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); +NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) +{ + typedef NpdStatus (*npdStopProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); static npdStopProfiling func = nullptr; if (func == nullptr) { func = (npdStopProfiling)GET_FUNC(aclprofGetStepTimestamp); @@ -127,9 +130,9 @@ NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, a return ret; } -const char *AclGetErrMsg() +const char* AclGetErrMsg() { - typedef const char *(*aclGetErrMsg)(); + typedef const char* (*aclGetErrMsg)(); static aclGetErrMsg func = nullptr; if (func == nullptr) { func = (aclGetErrMsg)GET_FUNC(aclGetRecentErrMsg); @@ -141,8 +144,9 @@ const char *AclGetErrMsg() return ""; } -aclError AclrtCreateStreamWithConfig(aclrtStream *stream, uint32_t priority, uint32_t flag) { - typedef aclError(*aclrtCreateStreamWithConfigFunc)(aclrtStream*, uint32_t, uint32_t); +aclError AclrtCreateStreamWithConfig(aclrtStream* stream, uint32_t priority, uint32_t flag) +{ + typedef aclError (*aclrtCreateStreamWithConfigFunc)(aclrtStream*, uint32_t, uint32_t); static aclrtCreateStreamWithConfigFunc func = nullptr; if (func == nullptr) { func = (aclrtCreateStreamWithConfigFunc)GET_FUNC(aclrtCreateStreamWithConfig); @@ -170,11 +174,13 @@ aclError AclrtCreateStreamWithConfig(aclrtStream *stream, uint32_t priority, uin } } -aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) { +aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) +{ if (stream == nullptr) { // default stream return ACL_ERROR_INVALID_PARAM; } - typedef aclError(*aclrtSetStreamFailureModeFunc)(aclrtStream, uint64_t); + + typedef aclError (*aclrtSetStreamFailureModeFunc)(aclrtStream, uint64_t); static aclrtSetStreamFailureModeFunc func = (aclrtSetStreamFailureModeFunc)GET_FUNC(aclrtSetStreamFailureMode); if (func == nullptr) { return ACL_SUCCESS; @@ -182,8 +188,9 @@ aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) { return func(stream, mode); } -aclError AclrtSetOpWaitTimeout(uint32_t timeout) { - typedef aclError(*aclrtSetOpWaitTimeoutFunc)(uint32_t); +aclError AclrtSetOpWaitTimeout(uint32_t timeout) +{ + typedef aclError (*aclrtSetOpWaitTimeoutFunc)(uint32_t); static aclrtSetOpWaitTimeoutFunc func = nullptr; if (func == nullptr) { func = (aclrtSetOpWaitTimeoutFunc)GET_FUNC(aclrtSetOpWaitTimeout); @@ -194,14 +201,14 @@ aclError AclrtSetOpWaitTimeout(uint32_t timeout) { bool IsExistCreateEventExWithFlag() { - typedef aclError(*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); + typedef aclError (*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); static AclrtCreateEventWithFlagFunc func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventExWithFlag); return func != nullptr; } -aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t 
flag) +aclError AclrtCreateEventWithFlag(aclrtEvent* event, uint32_t flag) { - typedef aclError(*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); + typedef aclError (*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); // Recommend aclrtCreateEventExWithFlag. // Differences from aclrtCreateEventWithFlag: // 1. Event can be reused naturally, aclrtResetEvent is not supported. @@ -220,9 +227,9 @@ aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) return func_ex(event, flag); } -aclError AclQueryEventWaitStatus(aclrtEvent event, aclrtEventWaitStatus *waitStatus) +aclError AclQueryEventWaitStatus(aclrtEvent event, aclrtEventWaitStatus* waitStatus) { - typedef aclError (*aclQueryEventWaitStatus)(aclrtEvent event, aclrtEventWaitStatus *waitStatus); + typedef aclError (*aclQueryEventWaitStatus)(aclrtEvent event, aclrtEventWaitStatus* waitStatus); static aclQueryEventWaitStatus func = nullptr; if (func == nullptr) { func = (aclQueryEventWaitStatus)GET_FUNC(aclrtQueryEventWaitStatus); @@ -231,8 +238,9 @@ aclError AclQueryEventWaitStatus(aclrtEvent event, aclrtEventWaitStatus *waitSta return func(event, waitStatus); } -aclError AclQueryEventRecordedStatus(aclrtEvent event, aclrtEventRecordedStatus *status) { - typedef aclError (*aclQueryEventStatus)(aclrtEvent event, aclrtEventRecordedStatus *status); +aclError AclQueryEventRecordedStatus(aclrtEvent event, aclrtEventRecordedStatus* status) +{ + typedef aclError (*aclQueryEventStatus)(aclrtEvent event, aclrtEventRecordedStatus* status); static aclQueryEventStatus func = nullptr; if (func == nullptr) { func = (aclQueryEventStatus)GET_FUNC(aclrtQueryEventStatus); @@ -243,7 +251,7 @@ aclError AclQueryEventRecordedStatus(aclrtEvent event, aclrtEventRecordedStatus bool IsExistQueryEventRecordedStatus() { - typedef aclError (*aclQueryEventStatus)(aclrtEvent event, aclrtEventRecordedStatus *status); + typedef aclError (*aclQueryEventStatus)(aclrtEvent event, aclrtEventRecordedStatus* status); static aclQueryEventStatus func = nullptr; if (func == nullptr) { func = (aclQueryEventStatus)GET_FUNC(aclrtQueryEventStatus); @@ -255,8 +263,9 @@ bool IsExistQueryEventRecordedStatus() } } -aclError AclProfilingInit(const char *profilerResultPath, size_t length) { - typedef aclError (*AclProfInitFunc) (const char *, size_t); +aclError AclProfilingInit(const char* profilerResultPath, size_t length) +{ + typedef aclError (*AclProfInitFunc)(const char*, size_t); static AclProfInitFunc func = nullptr; if (func == nullptr) { func = (AclProfInitFunc)GET_FUNC(aclprofInit); @@ -265,8 +274,9 @@ aclError AclProfilingInit(const char *profilerResultPath, size_t length) { return func(profilerResultPath, length); } -aclError AclProfilingStart(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfStartFunc) (const aclprofConfig *); +aclError AclProfilingStart(const aclprofConfig* profilerConfig) +{ + typedef aclError (*AclProfStartFunc)(const aclprofConfig*); static AclProfStartFunc func = nullptr; if (func == nullptr) { func = (AclProfStartFunc)GET_FUNC(aclprofStart); @@ -275,8 +285,9 @@ aclError AclProfilingStart(const aclprofConfig *profilerConfig) { return func(profilerConfig); } -aclError AclProfilingStop(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfStopFunc) (const aclprofConfig*); +aclError AclProfilingStop(const aclprofConfig* profilerConfig) +{ + typedef aclError (*AclProfStopFunc)(const aclprofConfig*); static AclProfStopFunc func = nullptr; if (func == nullptr) { func = (AclProfStopFunc)GET_FUNC(aclprofStop); @@ 
-285,8 +296,9 @@ aclError AclProfilingStop(const aclprofConfig *profilerConfig) { return func(profilerConfig); } -aclError AclProfilingFinalize() { - typedef aclError (*AclProfFinalizeFunc) (); +aclError AclProfilingFinalize() +{ + typedef aclError (*AclProfFinalizeFunc)(); static AclProfFinalizeFunc func = nullptr; if (func == nullptr) { func = (AclProfFinalizeFunc)GET_FUNC(aclprofFinalize); @@ -295,14 +307,15 @@ aclError AclProfilingFinalize() { return func(); } -aclprofConfig *AclProfilingCreateConfig( - uint32_t *deviceIdList, +aclprofConfig* AclProfilingCreateConfig( + uint32_t* deviceIdList, uint32_t deviceNums, aclprofAicoreMetrics aicoreMetrics, - aclprofAicoreEvents *aicoreEvents, - uint64_t dataTypeConfig) { - typedef aclprofConfig *(*AclProfCreateConfigFunc) \ - (uint32_t *, uint32_t, aclprofAicoreMetrics, const aclprofAicoreEvents *, uint64_t); + aclprofAicoreEvents* aicoreEvents, + uint64_t dataTypeConfig) +{ + typedef aclprofConfig* (*AclProfCreateConfigFunc) \ + (uint32_t*, uint32_t, aclprofAicoreMetrics, const aclprofAicoreEvents*, uint64_t); static AclProfCreateConfigFunc func = nullptr; if (func == nullptr) { func = (AclProfCreateConfigFunc)GET_FUNC(aclprofCreateConfig); @@ -311,8 +324,9 @@ aclprofConfig *AclProfilingCreateConfig( return func(deviceIdList, deviceNums, aicoreMetrics, aicoreEvents, dataTypeConfig); } -aclError AclProfilingDestroyConfig(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfDestroyConfigFunc) (const aclprofConfig *); +aclError AclProfilingDestroyConfig(const aclprofConfig* profilerConfig) +{ + typedef aclError (*AclProfDestroyConfigFunc)(const aclprofConfig*); static AclProfDestroyConfigFunc func = nullptr; if (func == nullptr) { func = (AclProfDestroyConfigFunc)GET_FUNC(aclprofDestroyConfig); @@ -321,8 +335,9 @@ aclError AclProfilingDestroyConfig(const aclprofConfig *profilerConfig) { return func(profilerConfig); } -const char *AclrtGetSocName() { - typedef const char *(*aclrtGetSocNameFunc)(); +const char* AclrtGetSocName() +{ + typedef const char* (*aclrtGetSocNameFunc)(); static aclrtGetSocNameFunc func = nullptr; if (func == nullptr) { func = (aclrtGetSocNameFunc)GET_FUNC(aclrtGetSocName); @@ -331,8 +346,9 @@ const char *AclrtGetSocName() { return func(); } -const char *AclGetSocName() { - typedef const char *(*AclGetSoc) (); +const char* AclGetSocName() +{ + typedef const char* (*AclGetSoc)(); static AclGetSoc func = nullptr; if (func == nullptr) { func = (AclGetSoc)GET_FUNC(aclrtGetSocName); @@ -343,7 +359,8 @@ const char *AclGetSocName() { return func(); } -aclError AclrtSetDeviceSatMode(aclrtFloatOverflowMode mode) { +aclError AclrtSetDeviceSatMode(aclrtFloatOverflowMode mode) +{ typedef aclError (*AclrtSetDeviceSatMode)(aclrtFloatOverflowMode mode); static AclrtSetDeviceSatMode func = nullptr; if (func == nullptr) { @@ -353,7 +370,8 @@ aclError AclrtSetDeviceSatMode(aclrtFloatOverflowMode mode) { return func(mode); } -aclError AclrtSetStreamOverflowSwitch(aclrtStream stream, uint32_t flag) { +aclError AclrtSetStreamOverflowSwitch(aclrtStream stream, uint32_t flag) +{ typedef aclError (*AclrtSetStreamOverflowSwitch)(aclrtStream, uint32_t); static AclrtSetStreamOverflowSwitch func = nullptr; if (func == nullptr) { @@ -363,7 +381,8 @@ aclError AclrtSetStreamOverflowSwitch(aclrtStream stream, uint32_t flag) { return func(stream, flag); } -aclError AclrtSetOpExecuteTimeOut(uint32_t timeout) { +aclError AclrtSetOpExecuteTimeOut(uint32_t timeout) +{ typedef aclError (*AclrtSetOpExecuteTimeOut)(uint32_t); static 
AclrtSetOpExecuteTimeOut func = nullptr; if (func == nullptr) { @@ -375,7 +394,8 @@ aclError AclrtSetOpExecuteTimeOut(uint32_t timeout) { return func(timeout); } -aclError AclrtGetStreamOverflowSwitch(aclrtStream stream, uint32_t *flag) { +aclError AclrtGetStreamOverflowSwitch(aclrtStream stream, uint32_t* flag) +{ typedef aclError (*AclrtGetStreamOverflowSwitch)(aclrtStream, uint32_t*); static AclrtGetStreamOverflowSwitch func = nullptr; if (func == nullptr) { @@ -385,7 +405,8 @@ aclError AclrtGetStreamOverflowSwitch(aclrtStream stream, uint32_t *flag) { return func(stream, flag); } -aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) { +aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) +{ if (C10_UNLIKELY( c10_npu::warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) { c10_npu::warn_or_error_on_sync(); @@ -410,10 +431,11 @@ aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) { } TORCH_CHECK(func_backup, "Failed to find function", "aclrtSynchronizeStreamWithTimeout and aclrtSynchronizeStream", PROF_ERROR(ErrCode::NOT_FOUND)); return func_backup(stream); - } + } } -aclError AclrtDestroyStreamForce(aclrtStream stream) { +aclError AclrtDestroyStreamForce(aclrtStream stream) +{ typedef aclError (*AclrtDestroyStreamForce)(aclrtStream); static AclrtDestroyStreamForce func = (AclrtDestroyStreamForce)GET_FUNC(aclrtDestroyStreamForce); if (func != nullptr) { @@ -423,7 +445,8 @@ aclError AclrtDestroyStreamForce(aclrtStream stream) { return aclrtDestroyStream(stream); } -aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *utilizationInfo) { +aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo* utilizationInfo) +{ typedef aclError (*AclrtGetDeviceUtilizationRate)(int32_t, aclrtUtilizationInfo*); static AclrtGetDeviceUtilizationRate func = nullptr; if (func == nullptr) { @@ -433,7 +456,8 @@ aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *u return func(deviceId, utilizationInfo); } -aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy) { +aclError AclrtMallocAlign32(void** devPtr, size_t size, aclrtMemMallocPolicy policy) +{ typedef aclError (*AclrtMallocAlign32)(void**, size_t, aclrtMemMallocPolicy); static AclrtMallocAlign32 func = (AclrtMallocAlign32)GET_FUNC(aclrtMallocAlign32); aclError ret; @@ -461,7 +485,8 @@ aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy pol return ret; } -aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status) { +aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus* status) +{ typedef aclError (*AclrtStreamQuery)(aclrtStream, aclrtStreamStatus*); static AclrtStreamQuery func = nullptr; if (func == nullptr) { @@ -471,7 +496,8 @@ aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status) { return func(stream, status); } -bool can_device_access_peer(c10::DeviceIndex device_id, c10::DeviceIndex peer_device_id) { +bool can_device_access_peer(c10::DeviceIndex device_id, c10::DeviceIndex peer_device_id) +{ int32_t can_access_peer = 0; c10::DeviceIndex num_npus = c10_npu::device_count(); TORCH_CHECK(device_id >= 0 && device_id < num_npus, PROF_ERROR(ErrCode::VALUE)); @@ -490,8 +516,8 @@ bool can_device_access_peer(c10::DeviceIndex device_id, c10::DeviceIndex peer_de return can_access_peer != 0; } -aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, void *expectPtr, uint64_t flags, - HcclComm hcclComm) +aclError 
AclrtReserveMemAddress(void** virPtr, size_t size, size_t alignment, void* expectPtr, uint64_t flags, + HcclComm hcclComm) { typedef aclError (*AclrtReserveMemAddress)(void**, size_t, size_t, void*, uint64_t); static AclrtReserveMemAddress func = nullptr; @@ -506,7 +532,7 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo return ret; } -aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) +aclError AclrtReleaseMemAddress(void* virPtr, HcclComm hcclComm) { typedef aclError (*AclrtReleaseMemAddress)(void*); static AclrtReleaseMemAddress func = nullptr; @@ -521,8 +547,9 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) return ret; } -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, - uint64_t flags) { +aclError AclrtMallocPhysical(aclrtDrvMemHandle* handle, size_t size, const aclrtPhysicalMemProp* prop, + uint64_t flags) +{ typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle*, size_t, const aclrtPhysicalMemProp*, uint64_t); static AclrtMallocPhysical func = nullptr; if (func == nullptr) { @@ -537,17 +564,18 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrt "is enabled, but the pre-allocated number of 1G large pages is insufficient " "or 1G large-page memory pre-allocation is not enabled."); aclrtPhysicalMemProp prop_update = {prop->handleType, - prop->allocationType, - ACL_HBM_MEM_HUGE, - {prop->location.id, - prop->location.type}, - prop->reserve}; + prop->allocationType, + ACL_HBM_MEM_HUGE, + {prop->location.id, + prop->location.type}, + prop->reserve}; ret = func(handle, size, &prop_update, flags); } return ret; } -aclError AclrtFreePhysical(aclrtDrvMemHandle handle) { +aclError AclrtFreePhysical(aclrtDrvMemHandle handle) +{ typedef aclError (*AclrtFreePhysical)(aclrtDrvMemHandle); static AclrtFreePhysical func = nullptr; if (func == nullptr) { @@ -557,8 +585,8 @@ aclError AclrtFreePhysical(aclrtDrvMemHandle handle) { return func(handle); } -aclError AclrtMapMem(void *virPtr, size_t size, size_t offset, aclrtDrvMemHandle handle, uint64_t flags, - HcclComm hcclComm) +aclError AclrtMapMem(void* virPtr, size_t size, size_t offset, aclrtDrvMemHandle handle, uint64_t flags, + HcclComm hcclComm) { typedef aclError (*AclrtMapMem)(void*, size_t, size_t, aclrtDrvMemHandle, uint64_t); static AclrtMapMem func = nullptr; @@ -573,7 +601,7 @@ aclError AclrtMapMem(void *virPtr, size_t size, size_t offset, aclrtDrvMemHandle return ret; } -aclError AclrtUnmapMem(void *virPtr, HcclComm hcclComm) +aclError AclrtUnmapMem(void* virPtr, HcclComm hcclComm) { typedef aclError (*AclrtUnmapMem)(void*); static AclrtUnmapMem func = nullptr; @@ -590,50 +618,50 @@ aclError AclrtUnmapMem(void *virPtr, HcclComm hcclComm) bool IsExistGetCannAttribute() { - typedef aclError (*AclGetCannAttribute)(aclCannAttr, int32_t *); - static AclGetCannAttribute func = (AclGetCannAttribute) GET_FUNC(aclGetCannAttribute); + typedef aclError (*AclGetCannAttribute)(aclCannAttr, int32_t*); + static AclGetCannAttribute func = (AclGetCannAttribute)GET_FUNC(aclGetCannAttribute); return func != nullptr; } -aclError AclGetCannAttributeList(const aclCannAttr **cannAttrList, size_t *num) +aclError AclGetCannAttributeList(const aclCannAttr** cannAttrList, size_t* num) { - typedef aclError (*AclGetCannAttributeList)(const aclCannAttr **, size_t *); + typedef aclError (*AclGetCannAttributeList)(const aclCannAttr**, size_t*); static AclGetCannAttributeList func = nullptr; if (func == nullptr) { - 
func = (AclGetCannAttributeList) GET_FUNC(aclGetCannAttributeList); + func = (AclGetCannAttributeList)GET_FUNC(aclGetCannAttributeList); } TORCH_CHECK(func, "Failed to find function ", "aclGetCannAttributeList", PTA_ERROR(ErrCode::NOT_FOUND)); return func(cannAttrList, num); } -aclError AclGetCannAttribute(aclCannAttr cannAttr, int32_t *value) +aclError AclGetCannAttribute(aclCannAttr cannAttr, int32_t* value) { - typedef aclError (*AclGetCannAttribute)(aclCannAttr, int32_t *); + typedef aclError (*AclGetCannAttribute)(aclCannAttr, int32_t*); static AclGetCannAttribute func = nullptr; if (func == nullptr) { - func = (AclGetCannAttribute) GET_FUNC(aclGetCannAttribute); + func = (AclGetCannAttribute)GET_FUNC(aclGetCannAttribute); } TORCH_CHECK(func, "Failed to find function ", "aclGetCannAttribute", PTA_ERROR(ErrCode::NOT_FOUND)); return func(cannAttr, value); } -aclError AclGetDeviceCapability(uint32_t deviceId, aclDeviceInfo deviceInfo, int64_t *value) +aclError AclGetDeviceCapability(uint32_t deviceId, aclDeviceInfo deviceInfo, int64_t* value) { - typedef aclError (*AclGetDeviceCapability)(uint32_t, aclDeviceInfo, int64_t *); + typedef aclError (*AclGetDeviceCapability)(uint32_t, aclDeviceInfo, int64_t*); static AclGetDeviceCapability func = nullptr; if (func == nullptr) { - func = (AclGetDeviceCapability) GET_FUNC(aclGetDeviceCapability); + func = (AclGetDeviceCapability)GET_FUNC(aclGetDeviceCapability); } TORCH_CHECK(func, "Failed to find function ", "aclGetDeviceCapability", PTA_ERROR(ErrCode::NOT_FOUND)); return func(deviceId, deviceInfo, value); } -aclError AclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize, size_t *retSize) +aclError AclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize, size_t* retSize) { - typedef aclError (*AclrtGetMemUceInfo)(int32_t, aclrtMemUceInfo*, size_t, size_t *); + typedef aclError (*AclrtGetMemUceInfo)(int32_t, aclrtMemUceInfo*, size_t, size_t*); static AclrtGetMemUceInfo func = nullptr; if (func == nullptr) { - func = (AclrtGetMemUceInfo) GET_FUNC(aclrtGetMemUceInfo); + func = (AclrtGetMemUceInfo)GET_FUNC(aclrtGetMemUceInfo); } if (func == nullptr) { TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtGetMemUceInfo"); @@ -647,7 +675,7 @@ aclError AclrtDeviceTaskAbort(int32_t deviceId) typedef aclError (*AclrtDeviceTaskAbort)(int32_t, uint32_t); static AclrtDeviceTaskAbort func = nullptr; if (func == nullptr) { - func = (AclrtDeviceTaskAbort) GET_FUNC(aclrtDeviceTaskAbort); + func = (AclrtDeviceTaskAbort)GET_FUNC(aclrtDeviceTaskAbort); } if (func == nullptr) { TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtDeviceTaskAbort"); @@ -662,7 +690,7 @@ aclError AclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, s typedef aclError (*AclrtMemUceRepair)(int32_t, aclrtMemUceInfo*, size_t); static AclrtMemUceRepair func = nullptr; if (func == nullptr) { - func = (AclrtMemUceRepair) GET_FUNC(aclrtMemUceRepair); + func = (AclrtMemUceRepair)GET_FUNC(aclrtMemUceRepair); } if (func == nullptr) { TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtMemUceRepair"); @@ -676,7 +704,7 @@ aclError AclrtCmoAsync(void* src, size_t size, aclrtCmoType cmoType, aclrtStream typedef aclError (*AclrtCmoAsync)(void*, size_t, aclrtCmoType, aclrtStream); static AclrtCmoAsync func = nullptr; if (func == nullptr) { - func = (AclrtCmoAsync) GET_FUNC(aclrtCmoAsync); + func = (AclrtCmoAsync)GET_FUNC(aclrtCmoAsync); } TORCH_CHECK(func, "Failed to find function ", 
"aclrtCmoAsync", PTA_ERROR(ErrCode::NOT_FOUND)); return func(src, size, cmoType, stream); @@ -687,7 +715,7 @@ aclError AclrtGetLastError(aclrtLastErrLevel flag) typedef aclError (*AclrtGetLastError)(aclrtLastErrLevel flag); static AclrtGetLastError func = nullptr; if (func == nullptr) { - func = (AclrtGetLastError) GET_FUNC(aclrtGetLastError); + func = (AclrtGetLastError)GET_FUNC(aclrtGetLastError); } if (func == nullptr) { return ACL_ERROR_NONE; @@ -700,7 +728,7 @@ aclError AclrtPeekAtLastError(aclrtLastErrLevel flag) typedef aclError (*AclrtPeekAtLastError)(aclrtLastErrLevel flag); static AclrtPeekAtLastError func = nullptr; if (func == nullptr) { - func = (AclrtPeekAtLastError) GET_FUNC(aclrtPeekAtLastError); + func = (AclrtPeekAtLastError)GET_FUNC(aclrtPeekAtLastError); } if (func == nullptr) { return ACL_ERROR_NONE; @@ -708,12 +736,12 @@ aclError AclrtPeekAtLastError(aclrtLastErrLevel flag) return func(flag); } -aclError AclStressDetect(int32_t deviceId, void *workspace, size_t workspaceSize) +aclError AclStressDetect(int32_t deviceId, void* workspace, size_t workspaceSize) { typedef aclError (*AclStressDetect)(int32_t, void*, size_t); static AclStressDetect func = nullptr; if (func == nullptr) { - func = (AclStressDetect) GetOpApiFuncAddr("StressDetect"); + func = (AclStressDetect)GetOpApiFuncAddr("StressDetect"); } TORCH_CHECK(func, "Failed to find function ", "StressDetect", PTA_ERROR(ErrCode::NOT_FOUND)); return func(deviceId, workspace, workspaceSize); @@ -740,7 +768,7 @@ aclError AclrtSynchronizeDeviceWithTimeout(void) } } -aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp) +aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t* timestamp) { typedef aclError (*AclrtEventGetTimestamp)(aclrtEvent, uint64_t*); static AclrtEventGetTimestamp func = nullptr; @@ -756,31 +784,31 @@ aclError AclmdlRICaptureBegin(aclrtStream stream, aclmdlRICaptureMode mode) typedef aclError (*AclmdlRICaptureBegin)(aclrtStream, aclmdlRICaptureMode); static AclmdlRICaptureBegin func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureBegin) GET_FUNC(aclmdlRICaptureBegin); + func = (AclmdlRICaptureBegin)GET_FUNC(aclmdlRICaptureBegin); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureBegin", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream, mode); } -aclError AclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus *status, aclmdlRI *modelRI) +aclError AclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus* status, aclmdlRI* modelRI) { - typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); + typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus*, aclmdlRI*); static AclmdlRICaptureGetInfo func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureGetInfo) GET_FUNC(aclmdlRICaptureGetInfo); + func = (AclmdlRICaptureGetInfo)GET_FUNC(aclmdlRICaptureGetInfo); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureGetInfo", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream, status, modelRI); } -aclError AclmdlRICaptureEnd(aclrtStream stream, aclmdlRI *modelRI) +aclError AclmdlRICaptureEnd(aclrtStream stream, aclmdlRI* modelRI) { - typedef aclError (*AclmdlRICaptureEnd)(aclrtStream, aclmdlRI *); + typedef aclError (*AclmdlRICaptureEnd)(aclrtStream, aclmdlRI*); static AclmdlRICaptureEnd func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureEnd) GET_FUNC(aclmdlRICaptureEnd); + func = (AclmdlRICaptureEnd)GET_FUNC(aclmdlRICaptureEnd); } TORCH_CHECK(func, "Failed to find 
function aclmdlRICaptureEnd", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -792,7 +820,7 @@ aclError AclmdlRIDebugPrint(aclmdlRI modelRI) typedef aclError (*AclmdlRIDebugPrint)(aclmdlRI); static AclmdlRIDebugPrint func = nullptr; if (func == nullptr) { - func = (AclmdlRIDebugPrint) GET_FUNC(aclmdlRIDebugPrint); + func = (AclmdlRIDebugPrint)GET_FUNC(aclmdlRIDebugPrint); } TORCH_CHECK(func, "Failed to find function aclmdlRIDebugPrint", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -804,7 +832,7 @@ aclError AclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream) typedef aclError (*AclmdlRIExecuteAsync)(aclmdlRI, aclrtStream); static AclmdlRIExecuteAsync func = nullptr; if (func == nullptr) { - func = (AclmdlRIExecuteAsync) GET_FUNC(aclmdlRIExecuteAsync); + func = (AclmdlRIExecuteAsync)GET_FUNC(aclmdlRIExecuteAsync); } TORCH_CHECK(func, "Failed to find function aclmdlRIExecuteAsync", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -812,14 +840,14 @@ aclError AclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream) return func(modelRI, stream); } -aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *version) +aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion* version) { - using aclsysGetCANNVersionFunc = aclError(*)(aclCANNPackageName, aclCANNPackageVersion *); + using aclsysGetCANNVersionFunc = aclError (*)(aclCANNPackageName, aclCANNPackageVersion*); static aclsysGetCANNVersionFunc func = nullptr; if (func == nullptr) { func = (aclsysGetCANNVersionFunc)GET_FUNC(aclsysGetCANNVersion); if (func == nullptr) { - return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; + return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; } } @@ -831,7 +859,7 @@ aclError AclmdlRIDestroy(aclmdlRI modelRI) typedef aclError (*AclmdlRIDestroy)(aclmdlRI); static AclmdlRIDestroy func = nullptr; if (func == nullptr) { - func = (AclmdlRIDestroy) GET_FUNC(aclmdlRIDestroy); + func = (AclmdlRIDestroy)GET_FUNC(aclmdlRIDestroy); } TORCH_CHECK(func, "Failed to find function aclmdlRIDestroy", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -844,11 +872,12 @@ bool IsCaptureSupported() static bool have_load_func = false; static bool default_support_capture = ((GetSocVersion() >= SocVersion::Ascend910B1) && (GetSocVersion() < SocVersion::Ascend310B1)) || - (GetSocVersion() >= SocVersion::Ascend910_9391); + ((GetSocVersion() >= SocVersion::Ascend910_9391) && + (GetSocVersion() < SocVersion::Ascend910_95)); if (default_support_capture && !have_load_func) { have_load_func = true; - typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); - static AclmdlRICaptureGetInfo func = (AclmdlRICaptureGetInfo) GET_FUNC(aclmdlRICaptureGetInfo); + typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus*, aclmdlRI*); + static AclmdlRICaptureGetInfo func = (AclmdlRICaptureGetInfo)GET_FUNC(aclmdlRICaptureGetInfo); is_support = (func != nullptr); } @@ -860,19 +889,19 @@ aclError AclmdlRICaptureTaskGrpBegin(aclrtStream stream) typedef aclError (*AclmdlRICaptureTaskGrpBegin)(aclrtStream); static AclmdlRICaptureTaskGrpBegin func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureTaskGrpBegin) GET_FUNC(aclmdlRICaptureTaskGrpBegin); + func = (AclmdlRICaptureTaskGrpBegin)GET_FUNC(aclmdlRICaptureTaskGrpBegin); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskGrpBegin", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream); } -aclError AclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp *handle) +aclError AclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp* handle) { typedef 
aclError (*AclmdlRICaptureTaskGrpEnd)(aclrtStream, aclrtTaskGrp*); static AclmdlRICaptureTaskGrpEnd func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureTaskGrpEnd) GET_FUNC(aclmdlRICaptureTaskGrpEnd); + func = (AclmdlRICaptureTaskGrpEnd)GET_FUNC(aclmdlRICaptureTaskGrpEnd); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskGrpEnd", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -884,7 +913,7 @@ aclError AclmdlRICaptureTaskUpdateBegin(aclrtStream stream, aclrtTaskGrp handle) typedef aclError (*AclmdlRICaptureTaskUpdateBegin)(aclrtStream, aclrtTaskGrp); static AclmdlRICaptureTaskUpdateBegin func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureTaskUpdateBegin) GET_FUNC(aclmdlRICaptureTaskUpdateBegin); + func = (AclmdlRICaptureTaskUpdateBegin)GET_FUNC(aclmdlRICaptureTaskUpdateBegin); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskUpdateBegin", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -896,31 +925,31 @@ aclError AclmdlRICaptureTaskUpdateEnd(aclrtStream stream) typedef aclError (*AclmdlRICaptureTaskUpdateEnd)(aclmdlRI); static AclmdlRICaptureTaskUpdateEnd func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureTaskUpdateEnd) GET_FUNC(aclmdlRICaptureTaskUpdateEnd); + func = (AclmdlRICaptureTaskUpdateEnd)GET_FUNC(aclmdlRICaptureTaskUpdateEnd); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskUpdateEnd", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream); } -aclError AclrtHostRegister(void *ptr, uint64_t size, aclrtHostRegisterType type, void **devPtr) +aclError AclrtHostRegister(void* ptr, uint64_t size, aclrtHostRegisterType type, void** devPtr) { - typedef aclError (*AclrtHostRegister)(void *, uint64_t, aclrtHostRegisterType, void **); + typedef aclError (*AclrtHostRegister)(void*, uint64_t, aclrtHostRegisterType, void**); static AclrtHostRegister func = nullptr; if (func == nullptr) { - func = (AclrtHostRegister) GET_FUNC(aclrtHostRegister); + func = (AclrtHostRegister)GET_FUNC(aclrtHostRegister); } TORCH_CHECK(func, "Failed to find function aclrtHostRegister", PTA_ERROR(ErrCode::NOT_FOUND)); return func(ptr, size, type, devPtr); } -aclError AclrtHostUnregister(void *ptr) +aclError AclrtHostUnregister(void* ptr) { - typedef aclError (*AclrtHostUnregister)(void *); + typedef aclError (*AclrtHostUnregister)(void*); static AclrtHostUnregister func = nullptr; if (func == nullptr) { - func = (AclrtHostUnregister) GET_FUNC(aclrtHostUnregister); + func = (AclrtHostUnregister)GET_FUNC(aclrtHostUnregister); } TORCH_CHECK(func, "Failed to find function aclrtHostUnregister", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -928,4 +957,4 @@ aclError AclrtHostUnregister(void *ptr) } } // namespace acl -} // namespace c10 +} // namespace c10_npu \ No newline at end of file -- Gitee From 86a4bb7a97074833111f006c16e1f8c17c7e4750 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 11:36:09 +0000 Subject: [PATCH 10/11] oss_proxy Signed-off-by: zjx.com --- .../aoti_torch/oss_proxy_executor_npu.cpp | 1132 ++++++++--------- 1 file changed, 563 insertions(+), 569 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp b/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp index c11ef93bfda..d9f1c04fc2c 100644 --- a/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp +++ b/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp @@ -5,9 +5,8 @@ #include namespace { -at::Tensor* tensor_handle_to_tensor_pointer(AtenTensorHandle handle) -{ - return reinterpret_cast(handle); +at::Tensor* 
tensor_handle_to_tensor_pointer(AtenTensorHandle handle) { + return reinterpret_cast(handle); } } // namespace @@ -17,481 +16,477 @@ void OSSProxyExecutorNpu::prefill_stack_with_static_arguments( size_t index, const at::TypePtr& schema_arg_type, const nlohmann::json& serialized_arg, - OSSOpKernel& op_kernel) -{ - auto& stack = op_kernel.stack_; - auto& dynamic_args = op_kernel.dynamic_args_; - - TORCH_CHECK(serialized_arg.size() == 1); - std::string serialized_arg_type = serialized_arg.begin().key(); - auto& serialized_arg_val = serialized_arg.begin().value(); - - switch (schema_arg_type->kind()) { - case c10::TypeKind::TensorType: { - TORCH_CHECK( - serialized_arg_type == "as_tensor", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_tensor for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::TensorType, 1); - break; - } - case c10::TypeKind::IntType: { - TORCH_CHECK( - serialized_arg_type == "as_int", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_int for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - break; - } - case c10::TypeKind::SymIntType: { - TORCH_CHECK( - serialized_arg_type == "as_int" || - serialized_arg_type == "as_sym_int", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_int or as_sym_int for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - break; - } - case c10::TypeKind::FloatType: { - TORCH_CHECK( - serialized_arg_type == "as_float", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_float for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; - } - case c10::TypeKind::BoolType: { - TORCH_CHECK( - serialized_arg_type == "as_bool", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_bool for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; + OSSOpKernel& op_kernel) { + auto& stack = op_kernel.stack_; + auto& dynamic_args = op_kernel.dynamic_args_; + + TORCH_CHECK(serialized_arg.size() == 1); + std::string serialized_arg_type = serialized_arg.begin().key(); + auto& serialized_arg_val = serialized_arg.begin().value(); + + switch (schema_arg_type->kind()) { + case c10::TypeKind::TensorType: { + TORCH_CHECK( + serialized_arg_type == "as_tensor", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_tensor for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::TensorType, 1); + break; + } + case c10::TypeKind::IntType: { + TORCH_CHECK( + serialized_arg_type == "as_int", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_int for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + break; + } + case c10::TypeKind::SymIntType: { + TORCH_CHECK( + serialized_arg_type == "as_int" || + serialized_arg_type == "as_sym_int", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_int or as_sym_int for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, 
DynamicArgType::IntType, 1); + break; + } + case c10::TypeKind::FloatType: { + TORCH_CHECK( + serialized_arg_type == "as_float", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_float for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; + } + case c10::TypeKind::BoolType: { + TORCH_CHECK( + serialized_arg_type == "as_bool", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_bool for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; + } + case c10::TypeKind::NumberType: { + if (serialized_arg_type == "as_int") { + // Only int Scalar is treated as dynamic arg for now + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + } else if (serialized_arg_type == "as_float") { + stack.at(index) = serialized_arg_val.get(); + } else if (serialized_arg_type == "as_bool") { + stack.at(index) = serialized_arg_val.get(); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a scalar input for argument ", + index, + " but got ", + serialized_arg_type); + } + break; + } + case c10::TypeKind::StringType: { + TORCH_CHECK( + serialized_arg_type == "as_string", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_string for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; + } + case c10::TypeKind::DeviceObjType: { + TORCH_CHECK( + serialized_arg_type == "as_device", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_device for argument ", + index, + " but got ", + serialized_arg_type); + + std::string device_string = serialized_arg_val["type"].get(); + if (serialized_arg_val.contains("index") && + serialized_arg_val["index"].is_number()) { + device_string += ":" + serialized_arg_val["index"].get(); + } + + c10::Device device(device_string); + + if (device != *device_) { + VLOG(1) << "ProxyExecutor is using " << *device_ << " for " + << op_kernel.target_ << " argument #" << index + << ", which is different from the one serialized in thrift: " + << device << ". 
Please ensure this is intentional."; + } + + stack.at(index) = *device_; + break; + } + case c10::TypeKind::ListType: { + if (schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { + TORCH_CHECK( + serialized_arg_type == "as_tensors", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_tensors for argument ", + index, + " but got ", + serialized_arg_type); + TORCH_CHECK(serialized_arg_type == "as_tensors"); + dynamic_args.emplace_back( + index, DynamicArgType::ListTensorType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofInts())) { + TORCH_CHECK( + serialized_arg_type == "as_ints", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_ints for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { + TORCH_CHECK( + serialized_arg_type == "as_ints" || + serialized_arg_type == "as_sym_ints", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_ints or as_sym_ints for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofFloats())) { + TORCH_CHECK( + serialized_arg_type == "as_floats", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_floats for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); } - case c10::TypeKind::NumberType: { - if (serialized_arg_type == "as_int") { - // Only int Scalar is treated as dynamic arg for now - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - } else if (serialized_arg_type == "as_float") { - stack.at(index) = serialized_arg_val.get(); - } else if (serialized_arg_type == "as_bool") { - stack.at(index) = serialized_arg_val.get(); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a scalar input for argument ", - index, - " but got ", - serialized_arg_type); - } - break; + stack.at(index) = std::move(ret); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofBools())) { + TORCH_CHECK( + serialized_arg_type == "as_bools", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_bools for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); } - case c10::TypeKind::StringType: { - TORCH_CHECK( - serialized_arg_type == "as_string", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_string for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; + stack.at(index) = std::move(ret); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofNumbers())) { + if (serialized_arg_type == "as_ints") { + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (serialized_arg_type == "as_floats") { + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg); + } + stack.at(index) = std::move(ret); + } else if (serialized_arg_type == "as_bools") { + std::vector ret; + for (const auto& arg : 
serialized_arg_val) { + ret.push_back(arg); + } + stack.at(index) = std::move(ret); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a List[Scalar] input for argument ", + index, + " but got ", + serialized_arg_type); } - case c10::TypeKind::DeviceObjType: { - TORCH_CHECK( - serialized_arg_type == "as_device", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_device for argument ", - index, - " but got ", - serialized_arg_type); - - std::string device_string = serialized_arg_val["type"].get(); - if (serialized_arg_val.contains("index") && - serialized_arg_val["index"].is_number()) { - device_string += ":" + serialized_arg_val["index"].get(); - } - - c10::Device device(device_string); - - if (device != *device_) { - VLOG(1) << "ProxyExecutor is using " << *device_ << " for " - << op_kernel.target_ << " argument #" << index - << ", which is different from the one serialized in thrift: " - << device << ". Please ensure this is intentional."; - } - - stack.at(index) = *device_; - break; + } else if (schema_arg_type->isSubtypeOf( + at::ListType::ofOptionalTensors())) { + if (serialized_arg_type == "as_optional_tensors") { + std::vector list_item_types; + for (const auto& arg : serialized_arg_val) { + list_item_types.push_back(arg.begin().key()); + } + dynamic_args.emplace_back( + index, + DynamicArgType::ListOptionalTensorType, + serialized_arg_val.size(), + list_item_types); + } else if (serialized_arg_type == "as_tensors") { + dynamic_args.emplace_back( + index, DynamicArgType::ListTensorType, serialized_arg_val.size()); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a Tensor?[] input for argument ", + index, + " but got ", + serialized_arg_type); } - case c10::TypeKind::ListType: { - if (schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { - TORCH_CHECK( - serialized_arg_type == "as_tensors", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_tensors for argument ", - index, - " but got ", - serialized_arg_type); - TORCH_CHECK(serialized_arg_type == "as_tensors"); - dynamic_args.emplace_back( - index, DynamicArgType::ListTensorType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofInts())) { - TORCH_CHECK( - serialized_arg_type == "as_ints", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_ints for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { - TORCH_CHECK( - serialized_arg_type == "as_ints" || - serialized_arg_type == "as_sym_ints", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_ints or as_sym_ints for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofFloats())) { - TORCH_CHECK( - serialized_arg_type == "as_floats", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_floats for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); - } - stack.at(index) = std::move(ret); - } else if 
(schema_arg_type->isSubtypeOf(at::ListType::ofBools())) { - TORCH_CHECK( - serialized_arg_type == "as_bools", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_bools for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); - } - stack.at(index) = std::move(ret); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofNumbers())) { - if (serialized_arg_type == "as_ints") { - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (serialized_arg_type == "as_floats") { - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg); - } - stack.at(index) = std::move(ret); - } else if (serialized_arg_type == "as_bools") { - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg); - } - stack.at(index) = std::move(ret); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a List[Scalar] input for argument ", - index, - " but got ", - serialized_arg_type); - } - } else if (schema_arg_type->isSubtypeOf( - at::ListType::ofOptionalTensors())) { - if (serialized_arg_type == "as_optional_tensors") { - std::vector list_item_types; - for (const auto& arg : serialized_arg_val) { - list_item_types.push_back(arg.begin().key()); - } - dynamic_args.emplace_back( - index, - DynamicArgType::ListOptionalTensorType, - serialized_arg_val.size(), - list_item_types); - } else if (serialized_arg_type == "as_tensors") { - dynamic_args.emplace_back( - index, DynamicArgType::ListTensorType, serialized_arg_val.size()); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a Tensor?[] input for argument ", - index, - " but got ", - serialized_arg_type); - } - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofStrings())) { - TORCH_CHECK( - serialized_arg_type == "as_strings", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_strings for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); - } - stack.at(index) = std::move(ret); - } else { - TORCH_CHECK( - false, - "NYI: Unsupported list type ", - serialized_arg_type, - " for extern kernel ", - op_kernel.target_, - " argument ", - index); - } - break; + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofStrings())) { + TORCH_CHECK( + serialized_arg_type == "as_strings", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_strings for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); } - case c10::TypeKind::OptionalType: { - auto inner_type = - schema_arg_type->castRaw()->getElementType(); - - if (serialized_arg_type == "as_none") { - stack.at(index) = c10::IValue{}; - if (inner_type->kind() == c10::TypeKind::TensorType) { - // Tensor is None - dynamic_args.emplace_back(index, DynamicArgType::TensorType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::IntType || - inner_type->kind() == c10::TypeKind::SymIntType) { - // Int or SymInt is None - dynamic_args.emplace_back(index, DynamicArgType::IntType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::ListType && - schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { - // List[Tensor] is 
None - dynamic_args.emplace_back(index, DynamicArgType::ListTensorType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::ListType && - schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { - // List[SymInt] is None - dynamic_args.emplace_back(index, DynamicArgType::ListIntType, 0); - } - } else { - prefill_stack_with_static_arguments( - index, inner_type, serialized_arg, op_kernel); - } - break; + stack.at(index) = std::move(ret); + } else { + TORCH_CHECK( + false, + "NYI: Unsupported list type ", + serialized_arg_type, + " for extern kernel ", + op_kernel.target_, + " argument ", + index); + } + break; + } + case c10::TypeKind::OptionalType: { + auto inner_type = + schema_arg_type->castRaw()->getElementType(); + + if (serialized_arg_type == "as_none") { + stack.at(index) = c10::IValue{}; + if (inner_type->kind() == c10::TypeKind::TensorType) { + // Tensor is None + dynamic_args.emplace_back(index, DynamicArgType::TensorType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::IntType || + inner_type->kind() == c10::TypeKind::SymIntType) { + // Int or SymInt is None + dynamic_args.emplace_back(index, DynamicArgType::IntType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::ListType && + schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { + // List[Tensor] is None + dynamic_args.emplace_back(index, DynamicArgType::ListTensorType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::ListType && + schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { + // List[SymInt] is None + dynamic_args.emplace_back(index, DynamicArgType::ListIntType, 0); } - default: - TORCH_CHECK( - false, - "Unsupported input type ", - serialized_arg_type, - " for extern kernel ", - op_kernel.target_, - " argument ", - index); + } else { + prefill_stack_with_static_arguments( + index, inner_type, serialized_arg, op_kernel); + } + break; } + default: + TORCH_CHECK( + false, + "Unsupported input type ", + serialized_arg_type, + " for extern kernel ", + op_kernel.target_, + " argument ", + index); + } } // Populates op_kernel.stack_, op_kernel.dynamic_args_ void OSSProxyExecutorNpu::get_input_info_from_serialized( const std::vector& schema_args, const nlohmann::json& serialized_node, - OSSOpKernel& op_kernel) -{ - std::vector filled(schema_args.size(), false); - TORCH_CHECK(op_kernel.stack_.size() == 0); - op_kernel.stack_.resize(schema_args.size()); - for (const auto& named_argument : serialized_node["inputs"]) { - const auto& arg = named_argument["arg"]; - const auto& name = named_argument["name"].get(); - - // Doing a linear lookup in the schema to find the index - // of a static argument. Should be fine performance wise - // because we usually only have small amount of arguments. - for (size_t index = 0; index < schema_args.size(); index++) { - auto& schema_arg = schema_args[index]; - if (schema_arg.name() == name) { - prefill_stack_with_static_arguments( - index, schema_arg.real_type(), arg, op_kernel); - filled[index] = true; - break; - } - } - } - - // If an argument is not filled and has a default value, we should - // also prefill the default value. 
- for (size_t index = 0; index < schema_args.size(); index++) { - if (!filled[index] && schema_args[index].default_value()) { - auto default_value = *schema_args[index].default_value(); - op_kernel.stack_.at(index) = default_value; - } - } + OSSOpKernel& op_kernel) { + std::vector filled(schema_args.size(), false); + TORCH_CHECK(op_kernel.stack_.size() == 0); + op_kernel.stack_.resize(schema_args.size()); + for (const auto& named_argument : serialized_node["inputs"]) { + const auto& arg = named_argument["arg"]; + const auto& name = named_argument["name"].get(); + + // Doing a linear lookup in the schema to find the index + // of a static argument. Should be fine performance wise + // because we usually only have small amount of arguments. + for (size_t index = 0; index < schema_args.size(); index++) { + auto& schema_arg = schema_args[index]; + if (schema_arg.name() == name) { + prefill_stack_with_static_arguments( + index, schema_arg.real_type(), arg, op_kernel); + filled[index] = true; + break; + } + } + } + + // If an argument is not filled and has a default value, we should + // also prefill the default value. + for (size_t index = 0; index < schema_args.size(); index++) { + if (!filled[index] && schema_args[index].default_value()) { + auto default_value = *schema_args[index].default_value(); + op_kernel.stack_.at(index) = default_value; + } + } } // Populates op_kernel.outputs_ void OSSProxyExecutorNpu::get_output_info_from_serialized( const std::vector& schema_returns, const nlohmann::json& serialized_node, - OSSOpKernel& op_kernel) -{ - std::vector& outputs = op_kernel.outputs_; - - TORCH_CHECK( - schema_returns.size() == serialized_node["outputs"].size(), - "Serialized node doesn't match operator ", - serialized_node["target"], - "'s schema outputs."); - - size_t output_index = 0; - for (const auto& serialized_output : serialized_node["outputs"]) { - TORCH_CHECK(serialized_output.size() == 1); - std::string serialized_output_type = serialized_output.begin().key(); - auto& serialized_output_val = serialized_output.begin().value(); - - auto& schema_return = schema_returns[output_index]; - const at::TypePtr& schema_return_type = schema_return.real_type(); - - switch (schema_return_type->kind()) { - case c10::TypeKind::TensorType: { - TORCH_CHECK( - serialized_output_type == "as_tensor", - "Expected extern kernel ", - serialized_node["target"], - " to have serialized output type as_tensor, ", - " but got ", - serialized_output_type); - outputs.emplace_back(output_index, DynamicArgType::TensorType, 1); - break; - } - case c10::TypeKind::ListType: { - if (schema_return_type->isSubtypeOf(at::ListType::ofTensors())) { - TORCH_CHECK( - serialized_output_type == "as_tensors", - "Expected extern kernel ", - serialized_node["target"], - " to have serialized output type as_tensors, ", - " but got ", - serialized_output_type); - outputs.emplace_back( - output_index, - DynamicArgType::ListTensorType, - serialized_output_val.size()); - } else { - TORCH_CHECK( - false, - "Unsupported return list type ", - schema_return_type->repr_str()); - } - break; - } - default: { - TORCH_CHECK( - false, - "Unsupported return type ", - schema_return_type->repr_str(), - " for extern kernel ", - op_kernel.target_); - } + OSSOpKernel& op_kernel) { + std::vector& outputs = op_kernel.outputs_; + + TORCH_CHECK( + schema_returns.size() == serialized_node["outputs"].size(), + "Serialized node doesn't match operator ", + serialized_node["target"], + "'s schema outputs."); + + size_t output_index = 0; + for (const auto& 
serialized_output : serialized_node["outputs"]) { + TORCH_CHECK(serialized_output.size() == 1); + std::string serialized_output_type = serialized_output.begin().key(); + auto& serialized_output_val = serialized_output.begin().value(); + + auto& schema_return = schema_returns[output_index]; + const at::TypePtr& schema_return_type = schema_return.real_type(); + + switch (schema_return_type->kind()) { + case c10::TypeKind::TensorType: { + TORCH_CHECK( + serialized_output_type == "as_tensor", + "Expected extern kernel ", + serialized_node["target"], + " to have serialized output type as_tensor, ", + " but got ", + serialized_output_type); + outputs.emplace_back(output_index, DynamicArgType::TensorType, 1); + break; + } + case c10::TypeKind::ListType: { + if (schema_return_type->isSubtypeOf(at::ListType::ofTensors())) { + TORCH_CHECK( + serialized_output_type == "as_tensors", + "Expected extern kernel ", + serialized_node["target"], + " to have serialized output type as_tensors, ", + " but got ", + serialized_output_type); + outputs.emplace_back( + output_index, + DynamicArgType::ListTensorType, + serialized_output_val.size()); + } else { + TORCH_CHECK( + false, + "Unsupported return list type ", + schema_return_type->repr_str()); } - - output_index++; + break; + } + default: { + TORCH_CHECK( + false, + "Unsupported return type ", + schema_return_type->repr_str(), + " for extern kernel ", + op_kernel.target_); + } } + + output_index++; + } } -OSSProxyExecutorNpu::OSSProxyExecutorNpu(const std::string& json_path, bool is_cpu) -{ - if (is_cpu) { - device_ = std::make_unique(c10::DeviceType::CPU); - } else { - int device_idx = -1; - device_ = std::make_unique(c10::DeviceType::CUDA, device_idx); - } +OSSProxyExecutorNpu::OSSProxyExecutorNpu(const std::string& json_path, bool is_cpu) { + if (is_cpu) { + device_ = std::make_unique(c10::DeviceType::CPU); + } else { + int device_idx = -1; + device_ = std::make_unique(c10::DeviceType::CUDA, device_idx); + } - std::string extern_kernel_nodes_serialized; + std::string extern_kernel_nodes_serialized; - std::ifstream json_file(json_path); - TORCH_CHECK(json_file.is_open(), "Unable to open file ", json_path); + std::ifstream json_file(json_path); + TORCH_CHECK(json_file.is_open(), "Unable to open file ", json_path); - // Parse file into a json object - nlohmann::json json_obj; - json_file >> json_obj; + // Parse file into a json object + nlohmann::json json_obj; + json_file >> json_obj; - // Access data - for (auto const& serialized_extern_node : json_obj["nodes"]) { - auto const& serialized_node = serialized_extern_node["node"]; + // Access data + for (auto const& serialized_extern_node : json_obj["nodes"]) { + auto const& serialized_node = serialized_extern_node["node"]; - const std::string& target = serialized_node["target"]; + const std::string& target = serialized_node["target"]; - std::string opName; - std::string overloadName; - size_t pos = target.find('.'); - if (pos == std::string::npos) { - opName = target; - overloadName = ""; - } else { - // There should be no more periods - size_t pos2 = target.find('.', pos + 1); - TORCH_CHECK(pos2 == std::string::npos); + std::string opName; + std::string overloadName; + size_t pos = target.find('.'); + if (pos == std::string::npos) { + opName = target; + overloadName = ""; + } else { + // There should be no more periods + size_t pos2 = target.find('.', pos + 1); + TORCH_CHECK(pos2 == std::string::npos); - opName = target.substr(0, pos); - overloadName = target.substr(pos + 1, target.length() - pos); - } + 
opName = target.substr(0, pos); + overloadName = target.substr(pos + 1, target.length() - pos); + } - c10::OperatorHandle op_handle = - c10::Dispatcher::singleton().findSchemaOrThrow( - opName.c_str(), overloadName.c_str()); - const c10::FunctionSchema& schema = op_handle.schema(); + c10::OperatorHandle op_handle = + c10::Dispatcher::singleton().findSchemaOrThrow( + opName.c_str(), overloadName.c_str()); + const c10::FunctionSchema& schema = op_handle.schema(); - const auto& schema_args = schema.arguments(); - const auto& schema_returns = schema.returns(); + const auto& schema_args = schema.arguments(); + const auto& schema_returns = schema.returns(); - OSSOpKernel op_kernel(target, op_handle); - get_input_info_from_serialized(schema_args, serialized_node, op_kernel); - get_output_info_from_serialized(schema_returns, serialized_node, op_kernel); + OSSOpKernel op_kernel(target, op_handle); + get_input_info_from_serialized(schema_args, serialized_node, op_kernel); + get_output_info_from_serialized(schema_returns, serialized_node, op_kernel); - op_kernels_.emplace_back(std::move(op_kernel)); - } + op_kernels_.emplace_back(std::move(op_kernel)); + } } void OSSProxyExecutorNpu::call_function( @@ -499,135 +494,134 @@ void OSSProxyExecutorNpu::call_function( int num_ints, int64_t* flatten_int_args, int num_tensors, - AtenTensorHandle* flatten_tensor_args) -{ - TORCH_CHECK( - extern_node_index < static_cast(op_kernels_.size()), - "Invalid extern node index"); - OSSOpKernel& op_kernel = op_kernels_[extern_node_index]; - - std::vector stack = op_kernel.stack_; - auto& dynamic_args = op_kernel.dynamic_args_; - - int tensor_id = 0; - int int_id = 0; - for (auto& dynamic_arg : dynamic_args) { - int arg_index = dynamic_arg.arg_index; - DynamicArgType dynamic_arg_type = dynamic_arg.arg_type; - int length = dynamic_arg.length; - - if (length == 0) { - continue; - } - - switch (dynamic_arg_type) { - case DynamicArgType::TensorType: { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - stack[arg_index] = *tensor; - break; - } - case DynamicArgType::IntType: { - int64_t val = flatten_int_args[int_id++]; - stack[arg_index] = val; - break; - } - case DynamicArgType::ListTensorType: { - std::vector tensor_list; - for (int j = 0; j < length; j++) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - tensor_list.push_back(*tensor); - } - stack[arg_index] = tensor_list; - break; - } - case DynamicArgType::ListOptionalTensorType: { - std::vector> optional_tensor_list; - auto& list_item_types = dynamic_arg.list_item_types; - TORCH_CHECK( - list_item_types.has_value(), - "Could not find list of item types for optional tensor list input"); - - for (const std::string& item_type : list_item_types.value()) { - if (item_type == "as_tensor") { - at::Tensor* tensor = tensor_handle_to_tensor_pointer( - flatten_tensor_args[tensor_id++]); - optional_tensor_list.emplace_back(*tensor); - } else if (item_type == "as_none") { - optional_tensor_list.emplace_back(std::nullopt); - } - } - stack[arg_index] = optional_tensor_list; - break; - } - case DynamicArgType::ListIntType: { - std::vector vals; - vals.reserve(length); - for (int j = 0; j < length; j++) { - vals.push_back(flatten_int_args[int_id++]); - } - stack[arg_index] = vals; - break; - } - default: - TORCH_CHECK(false, "Unsupported dynamic arg type: ", dynamic_arg_type); - } + AtenTensorHandle* flatten_tensor_args) { + TORCH_CHECK( + extern_node_index < static_cast(op_kernels_.size()), + 
"Invalid extern node index"); + OSSOpKernel& op_kernel = op_kernels_[extern_node_index]; + + std::vector stack = op_kernel.stack_; + auto& dynamic_args = op_kernel.dynamic_args_; + + int tensor_id = 0; + int int_id = 0; + for (auto& dynamic_arg : dynamic_args) { + int arg_index = dynamic_arg.arg_index; + DynamicArgType dynamic_arg_type = dynamic_arg.arg_type; + int length = dynamic_arg.length; + + if (length == 0) { + continue; } - int num_output_tensors = op_kernel.num_output_tensors(); - TORCH_CHECK( - tensor_id == num_tensors - num_output_tensors, - "Mismatch between tensors consumed and num of input tensor, got tensor_id = .", - tensor_id, - ", expected num = ", - num_tensors - num_output_tensors); - TORCH_CHECK( - int_id == num_ints, - "Mismatch between ints consumed and num_ints, got int_id = ", - int_id, - ", num_ints = ", - num_ints); - - // Call the op with the prepared stack. - const c10::OperatorHandle& op = op_kernel.op_handle_; - op.callBoxed(stack); - - const c10::FunctionSchema& schema = op.schema(); - const auto& schema_returns = schema.returns(); - - TORCH_CHECK(op_kernel.outputs_.size() == stack.size()); - TORCH_CHECK(stack.size() == schema_returns.size()); - - int index = 0; - for (const auto& schema_return : schema_returns) { - if (schema_return.type()->kind() == c10::TypeKind::TensorType) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - *tensor = stack[index++].toTensor(); - } else if ( - schema_return.type()->kind() == c10::TypeKind::ListType && - schema_return.type()->isSubtypeOf(at::ListType::ofTensors())) { - auto tensors = stack[index++].toTensorList(); - for (auto&& t : tensors) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - *tensor = t; - } - } else { - TORCH_CHECK( - false, - "NYI: Unsupported return type for schema: ", - schema_return.type()->repr_str()); + switch (dynamic_arg_type) { + case DynamicArgType::TensorType: { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + stack[arg_index] = *tensor; + break; + } + case DynamicArgType::IntType: { + int64_t val = flatten_int_args[int_id++]; + stack[arg_index] = val; + break; + } + case DynamicArgType::ListTensorType: { + std::vector tensor_list; + for (int j = 0; j < length; j++) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + tensor_list.push_back(*tensor); } + stack[arg_index] = tensor_list; + break; + } + case DynamicArgType::ListOptionalTensorType: { + std::vector> optional_tensor_list; + auto& list_item_types = dynamic_arg.list_item_types; + TORCH_CHECK( + list_item_types.has_value(), + "Could not find list of item types for optional tensor list input"); + + for (const std::string& item_type : list_item_types.value()) { + if (item_type == "as_tensor") { + at::Tensor* tensor = tensor_handle_to_tensor_pointer( + flatten_tensor_args[tensor_id++]); + optional_tensor_list.emplace_back(*tensor); + } else if (item_type == "as_none") { + optional_tensor_list.emplace_back(std::nullopt); + } + } + stack[arg_index] = optional_tensor_list; + break; + } + case DynamicArgType::ListIntType: { + std::vector vals; + vals.reserve(length); + for (int j = 0; j < length; j++) { + vals.push_back(flatten_int_args[int_id++]); + } + stack[arg_index] = vals; + break; + } + default: + TORCH_CHECK(false, "Unsupported dynamic arg type: ", dynamic_arg_type); } - - TORCH_CHECK( - tensor_id == num_tensors, - "Mismatch between tensors consumed and 
num_tensors, got tensor_id = ", - tensor_id, - ", expected num = ", - num_tensors); + } + + int num_output_tensors = op_kernel.num_output_tensors(); + TORCH_CHECK( + tensor_id == num_tensors - num_output_tensors, + "Mismatch between tensors consumed and num of input tensor, got tensor_id = .", + tensor_id, + ", expected num = ", + num_tensors - num_output_tensors); + TORCH_CHECK( + int_id == num_ints, + "Mismatch between ints consumed and num_ints, got int_id = ", + int_id, + ", num_ints = ", + num_ints); + + // Call the op with the prepared stack. + const c10::OperatorHandle& op = op_kernel.op_handle_; + op.callBoxed(stack); + + const c10::FunctionSchema& schema = op.schema(); + const auto& schema_returns = schema.returns(); + + TORCH_CHECK(op_kernel.outputs_.size() == stack.size()); + TORCH_CHECK(stack.size() == schema_returns.size()); + + int index = 0; + for (const auto& schema_return : schema_returns) { + if (schema_return.type()->kind() == c10::TypeKind::TensorType) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + *tensor = stack[index++].toTensor(); + } else if ( + schema_return.type()->kind() == c10::TypeKind::ListType && + schema_return.type()->isSubtypeOf(at::ListType::ofTensors())) { + auto tensors = stack[index++].toTensorList(); + for (auto&& t : tensors) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + *tensor = t; + } + } else { + TORCH_CHECK( + false, + "NYI: Unsupported return type for schema: ", + schema_return.type()->repr_str()); + } + } + + TORCH_CHECK( + tensor_id == num_tensors, + "Mismatch between tensors consumed and num_tensors, got tensor_id = ", + tensor_id, + ", expected num = ", + num_tensors); } } // namespace torch::aot_inductor \ No newline at end of file -- Gitee From 8664fc0d9b89ec12e4ea767f6fc14bca623a5e65 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 12:12:09 +0000 Subject: [PATCH 11/11] Init Signed-off-by: zjx.com --- torch_npu/csrc/distributed/Init.cpp | 79 +++++++++++++++++++---------- 1 file changed, 53 insertions(+), 26 deletions(-) diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 19ed1289a40..f7804ad15d6 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -104,7 +104,9 @@ public: inline std::vector cast_tensors(at::TensorList tensors) const { static auto cast_back_to_ori_format = [](const at::Tensor& t) { - return at_npu::native::custom_ops::npu_format_cast(t, torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); + return at_npu::native::custom_ops::npu_format_cast( + t, + torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); }; return c10::fmap(tensors, cast_back_to_ori_format); } @@ -164,7 +166,8 @@ void broadcast_coalesced( // multiple devices and can contain a mix of CPU and CUDA tensors. std::vector> buckets; std::tie(buckets, std::ignore) = - c10d_npu::compute_bucket_assignment_by_size(tensors.vec(), {buffer_size}); + c10d_npu::compute_bucket_assignment_by_size( + tensors.vec(), {buffer_size}); // Returns tensor at specified index in input tensor list. 
const auto lookup = [&tensors](size_t index) { return tensors[index]; }; @@ -195,8 +198,9 @@ void _register_comm_hook( py::object state, py::object comm_hook) { - reducer.register_comm_hook(std::make_unique<::c10d::PythonCommHook>( - std::move(state), std::move(comm_hook))); + reducer.register_comm_hook( + std::make_unique<::c10d::PythonCommHook>( + std::move(state), std::move(comm_hook))); } // Called from DDP's Python API to create a c10d C++ comm hook. @@ -229,9 +233,19 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) const c10::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); - return ::c10d_npu::compute_bucket_assignment_by_size(tensors, bucket_size_limits, expect_sparse_gradient, tensor_indices, {logger_weakref}); + return ::c10d_npu::compute_bucket_assignment_by_size( + tensors, + bucket_size_limits, + expect_sparse_gradient, + tensor_indices, + {logger_weakref}); } else { - return ::c10d_npu::compute_bucket_assignment_by_size(tensors, bucket_size_limits, expect_sparse_gradient, tensor_indices, {}); + return ::c10d_npu::compute_bucket_assignment_by_size( + tensors, + bucket_size_limits, + expect_sparse_gradient, + tensor_indices, + {}); } }, py::arg("tensors"), @@ -247,7 +261,8 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) const c10::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); - c10d_npu::verify_params_across_processes(process_group, params, {logger_weakref}); + c10d_npu::verify_params_across_processes( + process_group, params, {logger_weakref}); } else { c10d_npu::verify_params_across_processes(process_group, params, {}); } @@ -324,13 +339,16 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) -> void { reducer.prepare_for_backward({output}); }, py::call_guard()) .def("get_backward_stats", &c10d_npu::Reducer::get_backward_stats) - .def("_install_post_backward_futures", [](::c10d_npu::Reducer& reducer, const std::vector>& futs) { - c10::List> futures(c10::FutureType::create(c10::TensorType::get())); - for (const auto& fut : futs) { - futures.push_back(fut->fut); - } - reducer.install_futures(std::move(futures)); - }, + .def("_install_post_backward_futures", + [](::c10d_npu::Reducer& reducer, + const std::vector>& futs) { + c10::List> futures( + c10::FutureType::create(c10::TensorType::get())); + for (const auto& fut : futs) { + futures.push_back(fut->fut); + } + reducer.install_futures(std::move(futures)); + }, py::call_guard()) .def("_rebuild_buckets", &::c10d_npu::Reducer::rebuild_buckets, @@ -377,8 +395,9 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) }); py::module_ dist = py::module_::import("torch._C._distributed_c10d"); - auto processGroupHCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupHCCL>( - module, "ProcessGroupHCCL", dist.attr("Backend")) + auto processGroupHCCL = + intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupHCCL>( + module, "ProcessGroupHCCL", dist.attr("Backend")) .def(py::init&, int, int, @@ -413,8 +432,10 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) .def("set_watchdog_status", &::c10d_npu::ProcessGroupHCCL::setWatchdogStatus) .def("clear_workmeta_list", &::c10d_npu::ProcessGroupHCCL::clearWorkMetaList) .def("get_hccl_comm_name", - [](::c10d_npu::ProcessGroupHCCL& pg, int rankid, py::args args, py::kwargs kwargs) - -> std::string { + [](::c10d_npu::ProcessGroupHCCL& pg, + int rankid, + py::args args, + py::kwargs kwargs) 
-> std::string { bool init_comm = true; if (kwargs.contains("init_comm")) { init_comm = py::cast(kwargs["init_comm"]); @@ -426,7 +447,8 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) py::arg("peer") = -1) .def_property_readonly("options", &::c10d_npu::ProcessGroupHCCL::getOptions) .def("batch_isend_irecv", - [](::c10d_npu::ProcessGroupHCCL& pg, std::vector& op_type, + [](::c10d_npu::ProcessGroupHCCL& pg, + std::vector& op_type, std::vector& tensors, std::vector remote_rank_list) -> c10::intrusive_ptr { @@ -458,19 +480,22 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) &::c10d_npu::ProcessGroupHCCL::Options::is_high_priority_stream) .def_readwrite("global_ranks_in_group", &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group) - .def_readwrite("hccl_config", &::c10d_npu::ProcessGroupHCCL::Options::hccl_config) + .def_readwrite("hccl_config", + &::c10d_npu::ProcessGroupHCCL::Options::hccl_config) .def_readwrite("group_id", &::c10d_npu::ProcessGroupHCCL::Options::group_id); // bind for ProcessGroupLCCL - auto processGroupLCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupLCCL>( - module, "ProcessGroupLCCL", dist.attr("Backend")) + auto processGroupLCCL = + intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupLCCL>( + module, "ProcessGroupLCCL", dist.attr("Backend")) .def(py::init&, int, int>(), py::call_guard()); auto cDist = py::module_::import("torch._C._distributed_c10d"); - auto parallelStore = intrusive_ptr_no_gil_destructor_class_<::c10d::ParallelTcpStore>( - module, "ParallelStore", cDist.attr("Store"), R"( + auto parallelStore = + intrusive_ptr_no_gil_destructor_class_<::c10d::ParallelTcpStore>( + module, "ParallelStore", cDist.attr("Store"), R"( A TCP-Parallel-Epoll-based distributed key-value store implementation. The server store holds the data, while the client stores can connect to the server store over TCP and perform actions such as :meth:`~torch.distributed.store.set` to insert a key-value @@ -530,8 +555,10 @@ Example:: if (worldSize > -1) { numWorkers = static_cast(worldSize); } - ::c10d::TCPStoreOptions opts{port, isServer, numWorkers, waitWorkers, timeout, multiTenant}; - return c10::make_intrusive<::c10d::ParallelTcpStore>(host, agentRun, agentPid, enableTiered, opts); + ::c10d::TCPStoreOptions opts{ + port, isServer, numWorkers, waitWorkers, timeout, multiTenant}; + return c10::make_intrusive<::c10d::ParallelTcpStore>( + host, agentRun, agentPid, enableTiered, opts); }), py::arg("host") = "127.0.0.1", py::arg("port") = 29500, -- Gitee
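The Aclmdl*/Aclrt* wrappers touched earlier in this series all share one idiom: the ACL entry point is looked up once through GET_FUNC, cached in a function-local static, and only then invoked. The following is a minimal, self-contained sketch of that idiom, not the project's GET_FUNC macro itself; the dlopen of "libascendcl.so", the wrapper name, and the placeholder typedefs are illustrative assumptions, since the real code resolves symbols through its own macro and the CANN headers.

    // Sketch only: lazy "resolve once, cache the pointer" lookup of an ACL symbol.
    // Assumptions: library name "libascendcl.so", placeholder CANN typedefs.
    #include <dlfcn.h>
    #include <stdexcept>

    using aclError = int;       // placeholder for the real CANN type
    using aclrtStream = void*;  // placeholder for the real CANN type

    static void* LoadAclSymbol(const char* name)
    {
        // Open the runtime library once and reuse the handle for every lookup.
        static void* handle = dlopen("libascendcl.so", RTLD_LAZY);
        return handle ? dlsym(handle, name) : nullptr;
    }

    aclError CaptureTaskGrpBeginSketch(aclrtStream stream)
    {
        typedef aclError (*Fn)(aclrtStream);
        // Resolved on first call and cached; later calls only pay a null check.
        static Fn func = reinterpret_cast<Fn>(LoadAclSymbol("aclmdlRICaptureTaskGrpBegin"));
        if (func == nullptr) {
            throw std::runtime_error("Failed to find function aclmdlRICaptureTaskGrpBegin");
        }
        return func(stream);
    }

Because the static initializer runs once per wrapper, repeated invocations reduce to a cached function-pointer call, which is why the patched wrappers can afford a TORCH_CHECK on every entry.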