From ebde3a4e94ee0f4d15fe4862923d9506134783f8 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 02:46:38 +0000 Subject: [PATCH 01/11] model.h Signed-off-by: zjx.com --- torch_npu/csrc/inductor/aoti_runtime/model.h | 660 ++++++++++--------- 1 file changed, 332 insertions(+), 328 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_runtime/model.h b/torch_npu/csrc/inductor/aoti_runtime/model.h index bf04ddc2613..fe017bc59c6 100644 --- a/torch_npu/csrc/inductor/aoti_runtime/model.h +++ b/torch_npu/csrc/inductor/aoti_runtime/model.h @@ -22,12 +22,12 @@ #endif #define AOTI_RUNTIME_CHECK(EXPR, MSG) \ - do { \ +do { \ bool ok = EXPR; \ if (!ok) { \ - throw std::runtime_error(MSG); \ + throw std::runtime_error(MSG); \ } \ - } while (0) +} while (0) // At codegen time, we write out a binary file called constants.bin. // We then turn the raw binary to an object file that exposes this @@ -49,10 +49,10 @@ namespace { using GPUPtr = std::unique_ptr>; GPUPtr RAII_gpuMalloc(size_t num_bytes) { - void* data_ptr; - AOTI_RUNTIME_DEVICE_CHECK(cudaMalloc((void**)&data_ptr, num_bytes)); - auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(cudaFree(ptr)); }; - return GPUPtr(data_ptr, deleter); + void* data_ptr; + AOTI_RUNTIME_DEVICE_CHECK(cudaMalloc((void**)&data_ptr, num_bytes)); + auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(cudaFree(ptr)); }; + return GPUPtr(data_ptr, deleter); } #endif // USE_CUDA @@ -62,14 +62,14 @@ GPUPtr RAII_gpuMalloc(size_t num_bytes) { using NPUPtr = std::unique_ptr>; NPUPtr RAII_npuMalloc(size_t num_bytes) { - void* data_ptr; - // aclrtMalloc doesn't support allocate 0-bytes. In this case, - // e.g, model has no weight, we should do padding. - size_t padding_bytes = 32; - if (num_bytes == 0) num_bytes = padding_bytes; - AOTI_RUNTIME_DEVICE_CHECK(aclrtMalloc((void**)&data_ptr, num_bytes, ACL_MEM_MALLOC_HUGE_FIRST)); - auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(aclrtFree(ptr)); }; - return NPUPtr(data_ptr, deleter); + void* data_ptr; + // aclrtMalloc doesn't support allocate 0-bytes. In this case, + // e.g, model has no weight, we should do padding. 
+ size_t padding_bytes = 32; + if (num_bytes == 0) num_bytes = padding_bytes; + AOTI_RUNTIME_DEVICE_CHECK(aclrtMalloc((void**)&data_ptr, num_bytes, ACL_MEM_MALLOC_HUGE_FIRST)); + auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(aclrtFree(ptr)); }; + return NPUPtr(data_ptr, deleter); } #endif // USE_NPU @@ -79,11 +79,11 @@ NPUPtr RAII_npuMalloc(size_t num_bytes) { using GPUPtr = std::unique_ptr>; GPUPtr RAII_gpuMalloc(size_t num_bytes) { - sycl::queue* queue_ptr = nullptr; - aoti_torch_get_current_sycl_queue((void**)&queue_ptr); - void* data_ptr = sycl::malloc_device(num_bytes, *queue_ptr); - auto deleter = [queue_ptr](void* ptr) { sycl::free(ptr, *queue_ptr); }; - return GPUPtr(data_ptr, deleter); + sycl::queue* queue_ptr = nullptr; + aoti_torch_get_current_sycl_queue((void**)&queue_ptr); + void* data_ptr = sycl::malloc_device(num_bytes, *queue_ptr); + auto deleter = [queue_ptr](void* ptr) { sycl::free(ptr, *queue_ptr); }; + return GPUPtr(data_ptr, deleter); } #endif // USE_CUDA @@ -92,11 +92,11 @@ GPUPtr RAII_gpuMalloc(size_t num_bytes) { namespace torch::aot_inductor { enum ConstantType : uint8_t { - Unknown = 0, - Parameter = 1, - Buffer = 2, - TensorConstant = 3, - FoldedConstant = 4, + Unknown = 0, + Parameter = 1, + Buffer = 2, + ensorConstant = 3, + FoldedConstant = 4, }; using ConstantMap = std::unordered_map; @@ -107,32 +107,36 @@ inline void parse_device_str( const std::string& device_str, int32_t& device_type, int32_t& device_idx) { - std::regex re("(cpu|cuda|xpu|npu)(:([0-9]+))?"); - std::smatch sm; - bool matched = std::regex_match(device_str, sm, re); - AOTI_RUNTIME_CHECK(matched, "Invalid device: " + device_str); - - if (sm[1].str() == "cpu") { - device_type = aoti_torch_device_type_cpu(); - } else if (sm[1].str() == "cuda") { - device_type = aoti_torch_device_type_cuda(); + std::regex re("(cpu|cuda|xpu|npu)(:([0-9]+))?"); + std::smatch sm; + bool matched = std::regex_match(device_str, sm, re); + AOTI_RUNTIME_CHECK(matched, "Invalid device: " + device_str); + + if (sm[1].str() == "cpu") { + device_type = aoti_torch_device_type_cpu(); + } + else if (sm[1].str() == "cuda") { + device_type = aoti_torch_device_type_cuda(); #ifdef USE_XPU - } else if (sm[1].str() == "xpu") { - device_type = aoti_torch_device_type_xpu(); + } + else if (sm[1].str() == "xpu") { + device_type = aoti_torch_device_type_xpu(); #endif #ifdef USE_NPU - } else if (sm[1].str() == "npu") { - device_type = aoti_torch_device_type_npu(); + } + else if (sm[1].str() == "npu") { + device_type = aoti_torch_device_type_npu(); #endif - } else { - AOTI_RUNTIME_CHECK(false, "Invalid device: " + device_str); - } - int64_t device_sub = 3; - if (sm[device_sub].matched) { - device_idx = stoi(sm[device_sub].str()); - } else { - device_idx = -1; - } + } + else { + AOTI_RUNTIME_CHECK(false, "Invalid device: " + device_str); + } + int64_t device_sub = 3; + if (sm[device_sub].matched) { + device_idx = stoi(sm[device_sub].str()); + } else { + device_idx = -1; + } } // Defines the base class for AOTInductorModel, which is generated by the @@ -159,83 +163,83 @@ class AOTInductorModelBase { #ifdef USE_CUDA if (device_idx_ == -1) { - AOTI_RUNTIME_DEVICE_CHECK(cudaGetDevice(&device_idx_)); + AOTI_RUNTIME_DEVICE_CHECK(cudaGetDevice(&device_idx_)); } else { // If device_idx_ is passed in, we need to set the current device to it - AOTI_RUNTIME_DEVICE_CHECK(cudaSetDevice(device_idx_)); + AOTI_RUNTIME_DEVICE_CHECK(cudaSetDevice(device_idx_)); } #endif // USE_CUDA #ifdef USE_XPU if (device_idx_ == -1) { - 
aoti_torch_get_current_xpu_device(&device_idx_); + aoti_torch_get_current_xpu_device(&device_idx_); } else { - aoti_torch_set_current_xpu_device(device_idx_); + aoti_torch_set_current_xpu_device(device_idx_); } #endif // USE_XPU #ifdef USE_NPU if (device_idx_ == -1) { - AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(0)); - AOTI_RUNTIME_DEVICE_CHECK(aclrtGetDevice(&device_idx_)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(0)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtGetDevice(&device_idx_)); } else { - AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(device_idx_)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(device_idx_)); } #endif // USE_NPU } // NOLINTNEXTLINE(modernize-use-equals-default) - ~AOTInductorModelBase() { +~AOTInductorModelBase() { #ifdef USE_CUDA if (run_finished_) { - auto code = cudaEventDestroy(*run_finished_); - if (code != cudaSuccess) { - std::cerr << "Failed to destroy CUDA event in AOTInductor model: " - << cudaGetErrorString(code) << std::endl; - } + auto code = cudaEventDestroy(*run_finished_); + if (code != cudaSuccess) { + std::cerr << "Failed to destroy CUDA event in AOTInductor model: " + << cudaGetErrorString(code) << std::endl; + } } #endif // USE_CUDA #ifdef USE_XPU if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; + (*run_finished_)->wait_and_throw(); + delete *run_finished_; } #endif // USE_XPU #ifdef USE_NPU if (run_finished_) { - auto code = aclrtDestroyEvent(*run_finished_); - if (code != ACL_SUCCESS) { - std::cerr << "Failed to destroy NPU event in AOTInductor model erorr code: " - << code << std::endl; - } + auto code = aclrtDestroyEvent(*run_finished_); + if (code != ACL_SUCCESS) { + std::cerr << "Failed to destroy NPU event in AOTInductor model erorr code: " + << code << std::endl; + } } #endif // USE_NPU - } +} - AOTInductorModelBase(AOTInductorModelBase&&) = delete; - AOTInductorModelBase& operator=(AOTInductorModelBase&&) = delete; - AOTInductorModelBase(const AOTInductorModelBase&) = delete; - AOTInductorModelBase& operator=(const AOTInductorModelBase&) = delete; +AOTInductorModelBase(AOTInductorModelBase&&) = delete; +AOTInductorModelBase& operator=(AOTInductorModelBase&&) = delete; +AOTInductorModelBase(const AOTInductorModelBase&) = delete; +AOTInductorModelBase& operator=(const AOTInductorModelBase&) = delete; - void run( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles +void run( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles // will be stolen by the caller; the array itself is // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor) { + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor) { #ifdef USE_CUDA if (!run_finished_) { - cudaEvent_t run_finished; - AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); - run_finished_.emplace(run_finished); + cudaEvent_t run_finished; + AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); + run_finished_.emplace(run_finished); } #elif defined(USE_XPU) if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - run_finished_.reset(); + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + run_finished_.reset(); } #elif defined(USE_NPU) if (!run_finished_) { @@ -262,27 +266,27 @@ class AOTInductorModelBase { #endif // 
USE_CUDA } - std::unordered_map run_const_fold( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization = false) { +std::unordered_map run_const_fold( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization = false) { #ifdef USE_CUDA if (!run_finished_) { - cudaEvent_t run_finished; - AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); - run_finished_.emplace(run_finished); + cudaEvent_t run_finished; + AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); + run_finished_.emplace(run_finished); } #elif defined(USE_XPU) if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - run_finished_.reset(); + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + run_finished_.reset(); } #elif defined(USE_NPU) if (!run_finished_) { - aclrtEvent run_finished; - AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); - run_finished_.emplace(run_finished); + aclrtEvent run_finished; + AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); + run_finished_.emplace(run_finished); } #else // !USE_CUDA && !USE_XPU run_finished_ = false; @@ -305,119 +309,119 @@ class AOTInductorModelBase { #endif // USE_CUDA return folded_constants; - } +} - void load_constants() { +void load_constants() { size_t num_constants = this->num_constants(); constants_map_->reserve(num_constants); std::vector constants_internal_offset(num_constants); if (device_type_ != aoti_torch_device_type_cpu()) { - size_t blob_size = 0; - compute_gpu_constant_blob(blob_size, constants_internal_offset); + size_t blob_size = 0; + compute_gpu_constant_blob(blob_size, constants_internal_offset); #if defined(USE_CUDA) || defined(USE_XPU) - constant_blob_ = RAII_gpuMalloc(blob_size); + constant_blob_ = RAII_gpuMalloc(blob_size); #elif defined(USE_NPU) constant_blob_ = RAII_npuMalloc(blob_size); #endif } if (!include_weights) { - return; + return; } size_t bytes_read = 0; for (size_t i = 0; i < num_constants; i++) { - bool from_folded = this->constant_from_folded(i); + bool from_folded = this->constant_from_folded(i); #if not defined(USE_XPU) && not defined(USE_CUDA) && not defined(USE_NPU) - if (from_folded) { + if (from_folded) { // We do not reallocate and copy for CPU. continue; - } + } #endif // USE_CUDA - std::string name = this->constant_name(i); - size_t data_size = this->constant_data_size(i); - uint8_t* internal_ptr = (data_size != 0) - ? constant_ptr( - constants_internal_offset[i], - bytes_read, - data_size, - from_folded) + std::string name = this->constant_name(i); + size_t data_size = this->constant_data_size(i); + uint8_t* internal_ptr = (data_size != 0) + ? constant_ptr( + constants_internal_offset[i], + bytes_read, + data_size, + from_folded) : nullptr; - bytes_read += data_size; - - // Create at::Tensor from copied memory. - auto dtype = this->constant_dtype(i); - auto ndim = this->constant_ndim(i); - auto size = this->constant_shape(i); - auto stride = this->constant_stride(i); - auto offset = this->constant_offset(i); - auto layout = this->constant_layout(i); - auto opaque_metadata_ptr = this->opaque_metadata(i); - auto opaque_metadata_size = this->opaque_metadata_size(i); - - AtenTensorHandle tensor_handle = nullptr; + bytes_read += data_size; + + // Create at::Tensor from copied memory. 
+ auto dtype = this->constant_dtype(i); + auto ndim = this->constant_ndim(i); + auto size = this->constant_shape(i); + auto stride = this->constant_stride(i); + auto offset = this->constant_offset(i); + auto layout = this->constant_layout(i); + auto opaque_metadata_ptr = this->opaque_metadata(i); + auto opaque_metadata_size = this->opaque_metadata_size(i); + + AtenTensorHandle tensor_handle = nullptr; #ifdef AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 // When opaque_metadata_size is not 0, we need to have the // aoti_torch_create_tensor_from_blob_npu_v2 available - AOTI_RUNTIME_CHECK( - opaque_metadata_size == 0, - "Expect opaque_metadata_size to be 0 when AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 is defined"); - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( - internal_ptr, - ndim, - size, - stride, - offset, - dtype, - device_type_, - device_idx_, - &tensor_handle)); + AOTI_RUNTIME_CHECK( + opaque_metadata_size == 0, + "Expect opaque_metadata_size to be 0 when AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 is defined"); + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( + internal_ptr, + ndim, + size, + stride, + offset, + dtype, + device_type_, + device_idx_, + &tensor_handle)); #else - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( - internal_ptr, - ndim, - size, - stride, - offset, - dtype, - device_type_, - device_idx_, - &tensor_handle, - layout, - opaque_metadata_ptr, - opaque_metadata_size)); + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( + internal_ptr, + ndim, + size, + stride, + offset, + dtype, + device_type_, + device_idx_, + &tensor_handle, + layout, + opaque_metadata_ptr, + opaque_metadata_size)); #endif // AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 - constants_map_->emplace(std::move(name), tensor_handle); + constants_map_->emplace(std::move(name), tensor_handle); } if (constants_map_) { - this->update_constants_array_from_map(); + this->update_constants_array_from_map(); } - } +} #if defined(USE_CUDA) || defined(USE_XPU) - GPUPtr&& release_constant_blob() { - return std::move(constant_blob_); - } + GPUPtr&& release_constant_blob() { + return std::move(constant_blob_); + } #endif #ifdef USE_NPU - NPUPtr&& release_constant_blob() { - return std::move(constant_blob_); - } + NPUPtr&& release_constant_blob() { + return std::move(constant_blob_); + } #endif - std::shared_ptr> get_constants_array() { +std::shared_ptr> get_constants_array() { return constants_; - } +} - int32_t get_device_idx() const { +int32_t get_device_idx() const { return device_idx_; - } +} - uint8_t* constant_ptr( - size_t constant_offset, - size_t bytes_read, - size_t data_size, - bool skip_copy) { +uint8_t* constant_ptr( + size_t constant_offset, + size_t bytes_read, + size_t data_size, + bool skip_copy) { #if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) auto* constants_ptr = static_cast(constant_blob_.get()); uint8_t* internal_ptr = constants_ptr + constant_offset; @@ -454,154 +458,154 @@ class AOTInductorModelBase { #endif // USE_CUDA } - void compute_gpu_constant_blob( - size_t& blob_size, - std::vector& constants_internal_offset) { +void compute_gpu_constant_blob( + size_t& blob_size, + std::vector& constants_internal_offset) { #if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) size_t num_constants = this->num_constants(); // Compute required blob size with 64-alignment if on GPU. 
blob_size = 0; for (size_t i = 0; i < num_constants; i++) { - size_t data_size = this->constant_data_size(i); - if (data_size % AOTI_CONST_GPU_ALIGNMENT) { - data_size = AOTI_CONST_GPU_ALIGNMENT + + size_t data_size = this->constant_data_size(i); + if (data_size % AOTI_CONST_GPU_ALIGNMENT) { + data_size = AOTI_CONST_GPU_ALIGNMENT + (data_size / AOTI_CONST_GPU_ALIGNMENT) * AOTI_CONST_GPU_ALIGNMENT; - } - constants_internal_offset[i] = blob_size; - blob_size += data_size; + } + constants_internal_offset[i] = blob_size; + blob_size += data_size; } #endif // USE_CUDA - } +} - size_t num_inputs() const { +size_t num_inputs() const { return inputs_info_.size(); - } +} - size_t num_outputs() const { +size_t num_outputs() const { return outputs_info_.size(); - } +} - size_t num_constants() const { +size_t num_constants() const { return constants_info_.size(); - } +} - const char* input_name(int64_t idx) const { +const char* input_name(int64_t idx) const { return inputs_info_.at(idx).name; - } +} - const char* output_name(int64_t idx) const { +const char* output_name(int64_t idx) const { return outputs_info_.at(idx).name; - } +} - const char* constant_name(int64_t idx) const { +const char* constant_name(int64_t idx) const { return constants_info_.at(idx).name; - } +} - size_t constant_ndim(int64_t idx) { +size_t constant_ndim(int64_t idx) { return constants_info_.at(idx).shape.size(); - } +} - const int64_t* constant_shape(int64_t idx) const { +const int64_t* constant_shape(int64_t idx) const { return constants_info_.at(idx).shape.data(); - } +} - const int64_t* constant_stride(int64_t idx) const { +const int64_t* constant_stride(int64_t idx) const { return constants_info_.at(idx).stride.data(); - } +} - int32_t constant_dtype(int64_t idx) const { +int32_t constant_dtype(int64_t idx) const { return constants_info_.at(idx).dtype; - } +} - int32_t constant_layout(int64_t idx) const { +int32_t constant_layout(int64_t idx) const { return constants_info_.at(idx).layout; - } +} - size_t constant_offset(int64_t idx) const { +size_t constant_offset(int64_t idx) const { return constants_info_.at(idx).offset; - } +} - size_t constant_data_size(int64_t idx) const { +size_t constant_data_size(int64_t idx) const { return constants_info_.at(idx).data_size; - } +} - const char* constant_original_fqn(int64_t idx) const { +const char* constant_original_fqn(int64_t idx) const { return constants_info_.at(idx).original_fqn; - } +} - const uint8_t* opaque_metadata(int64_t idx) const { +const uint8_t* opaque_metadata(int64_t idx) const { return constants_info_.at(idx).opaque_metadata.data(); - } +} - size_t opaque_metadata_size(int64_t idx) { +size_t opaque_metadata_size(int64_t idx) { return constants_info_.at(idx).opaque_metadata.size(); - } +} - bool constant_from_folded(int64_t idx) const { +bool constant_from_folded(int64_t idx) const { return constants_info_.at(idx).from_folded; - } +} - int32_t constant_type(int64_t idx) const { +int32_t constant_type(int64_t idx) const { return constants_info_.at(idx).type; - } +} - const char* get_in_spec() const { +const char* get_in_spec() const { return in_spec_.c_str(); - } +} - const char* get_out_spec() const { +const char* get_out_spec() const { return out_spec_.c_str(); - } +} - void update_constants_array_from_map() { +void update_constants_array_from_map() { if (!constants_map_) { - throw std::runtime_error{ - "constants_map_ was not ready when constants_ is trying to be constructed from it!"}; + throw std::runtime_error{ + "constants_map_ was not ready when 
constants_ is trying to be constructed from it!"}; } if (!constants_) { - constants_ = - std::make_shared>(constants_info_.size()); + constants_ = + std::make_shared>(constants_info_.size()); } else { - constants_->resize(constants_info_.size()); + constants_->resize(constants_info_.size()); } int idx = 0; for (const auto& info : constants_info_) { - const auto it = constants_map_->find(info.name); - if (it != constants_map_->end()) { - constants_->at(idx) = ConstantHandle(it->second); - } - idx++; + const auto it = constants_map_->find(info.name); + if (it != constants_map_->end()) { + constants_->at(idx) = ConstantHandle(it->second); + } + idx++; } - } +} - void update_constants_map( - std::shared_ptr constants_map, - bool remap_constants_array = true) { - constants_map_ = std::move(constants_map); - if (remap_constants_array) { - update_constants_array_from_map(); - } - } +void update_constants_map( + std::shared_ptr constants_map, + bool remap_constants_array = true) { + constants_map_ = std::move(constants_map); + if (remap_constants_array) { + update_constants_array_from_map(); + } +} // This function allows us to update the constants_ that is used to look up // the corresponding constant tensor during runtime. - void update_constants_array( - std::shared_ptr> constants_array) { - constants_ = std::move(constants_array); - } +void update_constants_array( + std::shared_ptr> constants_array) { + constants_ = std::move(constants_array); +} /// Returns true if the model is complete. - bool is_finished() { +bool is_finished() { #ifdef USE_CUDA if (!run_finished_) { - throw std::runtime_error{"Model CUDA event was not initialized"}; + throw std::runtime_error{"Model CUDA event was not initialized"}; } auto event_status = cudaEventQuery(*run_finished_); if (event_status == cudaSuccess) { - return true; + return true; } else if (event_status == cudaErrorNotReady) { - return false; + return false; } throw std::runtime_error( @@ -610,19 +614,19 @@ class AOTInductorModelBase { #elif defined(USE_NPU) if (!run_finished_) { - throw std::runtime_error{"Model NPU event was not initialized"}; + throw std::runtime_error{"Model NPU event was not initialized"}; } aclrtEventRecordedStatus recordStatus = ACL_EVENT_RECORDED_STATUS_NOT_READY; AOTI_RUNTIME_DEVICE_CHECK(aclrtQueryEventStatus(*run_finished_, &recordStatus)); if (recordStatus == ACL_EVENT_RECORDED_STATUS_COMPLETE) { - return true; + return true; } else { - return false; + return false; } #elif defined(USE_XPU) if (!run_finished_) { - throw std::runtime_error{"Model XPU event was not initialized"}; + throw std::runtime_error{"Model XPU event was not initialized"}; } using namespace sycl::info; return (*run_finished_)->get_info() == @@ -631,24 +635,24 @@ class AOTInductorModelBase { #else // !USE_CUDA && !USE_XPU return run_finished_; #endif // USE_CUDA - } +} /// Synchronizes completion event. 
- void wait_for_completion() { +void wait_for_completion() { #ifdef USE_CUDA if (!run_finished_) { - throw std::runtime_error{"Model event was not initialized"}; + throw std::runtime_error{"Model event was not initialized"}; } AOTI_RUNTIME_DEVICE_CHECK(cudaEventSynchronize(*run_finished_)); #endif // USE_CUDA #ifdef USE_XPU if (!run_finished_) { - throw std::runtime_error{"Model event was not initialized"}; + throw std::runtime_error{"Model event was not initialized"}; } (*run_finished_)->wait_and_throw(); #endif - } +} protected: uint8_t* _get_constants_start() { @@ -657,7 +661,7 @@ class AOTInductorModelBase { return const_cast(_binary_constants_bin_start); #else if (self_mmap) { - return self_mmap; + return self_mmap; } Dl_info dl_info; // get pointer to constant which are appended to the binary @@ -691,11 +695,11 @@ class AOTInductorModelBase { return self_mmap; #endif } - struct ParamInfo { +struct ParamInfo { const char* name = nullptr; - }; +}; - struct ConstInfo { +struct ConstInfo { const char* name = nullptr; std::vector shape; std::vector stride; @@ -708,29 +712,29 @@ class AOTInductorModelBase { const char* original_fqn = nullptr; bool from_folded{}; int32_t type{}; - }; +}; - std::vector inputs_info_; - std::vector outputs_info_; - std::vector constants_info_; - std::string in_spec_; - std::string out_spec_; +std::vector inputs_info_; +std::vector outputs_info_; +std::vector constants_info_; +std::string in_spec_; +std::string out_spec_; - std::shared_ptr constants_map_; - std::shared_ptr> constants_; +std::shared_ptr constants_map_; +std::shared_ptr> constants_; #if defined(USE_CUDA) || defined(USE_XPU) // Holds the blob storage for constants' at::Tensor for CUDA. - GPUPtr constant_blob_; + GPUPtr constant_blob_; #endif // USE_CUDA #ifdef USE_NPU // Holds the blob storage for constants' at::Tensor for CUDA. - NPUPtr constant_blob_; + NPUPtr constant_blob_; #endif // USE_NPU #ifdef USE_MMAP_SELF - uint8_t* self_mmap = NULL; + uint8_t* self_mmap = NULL; #endif // A directory with CUDA binary files, e.g. compiled kernels, etc. @@ -744,18 +748,18 @@ class AOTInductorModelBase { // Record if the model finishes an inference run so that its owning // AOTModelContainer can re-use this instance. #ifdef USE_CUDA - std::optional run_finished_; + std::optional run_finished_; #elif defined(USE_XPU) - std::optional run_finished_; + std::optional run_finished_; #elif defined(USE_NPU) - std::optional run_finished_; + std::optional run_finished_; #else // !USE_CUDA - bool run_finished_{}; + bool run_finished_{}; #endif // Generated model uses this device index to create CUDA guards. - int32_t device_type_{}; - int32_t device_idx_{}; + int32_t device_type_{}; + int32_t device_idx_{}; }; // Codegen-ed classes can derive from this to keep pointers to loaded kernels. 
@@ -765,55 +769,55 @@ class AOTInductorModelKernelsBase { }; class AOTInductorModel : public AOTInductorModelBase { - public: - AOTInductorModel( - std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir, - bool include_weights = true); - - std::unordered_map const_run_impl( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization = false); - - void _const_run_impl( - std::vector& output_handles, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - void run_impl( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - template - Outputs run_impl_minimal_arrayref_interface( - const Inputs& inputs, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - static std::unique_ptr Create( - std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) { - return std::make_unique( - std::move(constants_map), - std::move(constants_array), - device_str, - std::move(cubin_dir)); - } + public: + AOTInductorModel( + std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir, + bool include_weights = true); + + std::unordered_map const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization = false); + + void _const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + void run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; + //handles are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; + //handles will be stolen by the caller; + //the array itself is borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + template + Outputs run_impl_minimal_arrayref_interface( + const Inputs& inputs, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + static std::unique_ptr Create( + std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir) { + return std::make_unique( + std::move(constants_map), + std::move(constants_array), + device_str, + std::move(cubin_dir)); + } - private: - std::unique_ptr kernels_; + private: + std::unique_ptr kernels_; }; } // namespace torch::aot_inductor -- Gitee From 83c445cc52ba1904ee0eaef891a6d315c7d29d82 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 05:07:50 +0000 Subject: [PATCH 02/11] model Signed-off-by: zjx.com --- torch_npu/csrc/inductor/aoti_runtime/model.h | 1013 +++++++++--------- 1 file changed, 497 insertions(+), 516 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_runtime/model.h b/torch_npu/csrc/inductor/aoti_runtime/model.h index fe017bc59c6..644b61abcc8 100644 --- a/torch_npu/csrc/inductor/aoti_runtime/model.h +++ b/torch_npu/csrc/inductor/aoti_runtime/model.h @@ -22,12 +22,12 @@ #endif #define AOTI_RUNTIME_CHECK(EXPR, MSG) \ -do { \ - bool ok = EXPR; \ - if (!ok) { \ - throw std::runtime_error(MSG); \ - } \ -} while (0) + do { \ + bool ok = EXPR; \ + if 
(!ok) { \ + throw std::runtime_error(MSG); \ + } \ + } while (0) // At codegen time, we write out a binary file called constants.bin. // We then turn the raw binary to an object file that exposes this @@ -48,7 +48,8 @@ namespace { using GPUPtr = std::unique_ptr>; -GPUPtr RAII_gpuMalloc(size_t num_bytes) { +GPUPtr RAII_gpuMalloc(size_t num_bytes) +{ void* data_ptr; AOTI_RUNTIME_DEVICE_CHECK(cudaMalloc((void**)&data_ptr, num_bytes)); auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(cudaFree(ptr)); }; @@ -61,12 +62,15 @@ GPUPtr RAII_gpuMalloc(size_t num_bytes) { using NPUPtr = std::unique_ptr>; -NPUPtr RAII_npuMalloc(size_t num_bytes) { +NPUPtr RAII_npuMalloc(size_t num_bytes) +{ void* data_ptr; // aclrtMalloc doesn't support allocate 0-bytes. In this case, // e.g, model has no weight, we should do padding. size_t padding_bytes = 32; - if (num_bytes == 0) num_bytes = padding_bytes; + if (num_bytes == 0) { + num_bytes = padding_bytes; + } AOTI_RUNTIME_DEVICE_CHECK(aclrtMalloc((void**)&data_ptr, num_bytes, ACL_MEM_MALLOC_HUGE_FIRST)); auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(aclrtFree(ptr)); }; return NPUPtr(data_ptr, deleter); @@ -78,7 +82,8 @@ NPUPtr RAII_npuMalloc(size_t num_bytes) { using GPUPtr = std::unique_ptr>; -GPUPtr RAII_gpuMalloc(size_t num_bytes) { +GPUPtr RAII_gpuMalloc(size_t num_bytes) +{ sycl::queue* queue_ptr = nullptr; aoti_torch_get_current_sycl_queue((void**)&queue_ptr); void* data_ptr = sycl::malloc_device(num_bytes, *queue_ptr); @@ -95,7 +100,7 @@ enum ConstantType : uint8_t { Unknown = 0, Parameter = 1, Buffer = 2, - ensorConstant = 3, + TensorConstant = 3, FoldedConstant = 4, }; @@ -106,7 +111,8 @@ using ConstantMap = std::unordered_map; inline void parse_device_str( const std::string& device_str, int32_t& device_type, - int32_t& device_idx) { + int32_t& device_idx) +{ std::regex re("(cpu|cuda|xpu|npu)(:([0-9]+))?"); std::smatch sm; bool matched = std::regex_match(device_str, sm, re); @@ -114,21 +120,17 @@ inline void parse_device_str( if (sm[1].str() == "cpu") { device_type = aoti_torch_device_type_cpu(); - } - else if (sm[1].str() == "cuda") { + } else if (sm[1].str() == "cuda") { device_type = aoti_torch_device_type_cuda(); #ifdef USE_XPU - } - else if (sm[1].str() == "xpu") { + } else if (sm[1].str() == "xpu") { device_type = aoti_torch_device_type_xpu(); #endif #ifdef USE_NPU - } - else if (sm[1].str() == "npu") { + } else if (sm[1].str() == "npu") { device_type = aoti_torch_device_type_npu(); #endif - } - else { + } else { AOTI_RUNTIME_CHECK(false, "Invalid device: " + device_str); } int64_t device_sub = 3; @@ -146,471 +148,446 @@ inline void parse_device_str( // methods such as run_impl. 
template class AOTInductorModelBase { - public: - AOTInductorModelBase( - size_t num_inputs, - size_t num_outputs, - size_t num_constants, - const std::string& device_str, - std::optional cubin_dir, - bool include_weights = true) - : inputs_info_(num_inputs), - outputs_info_(num_outputs), - constants_info_(num_constants), - cubin_dir_(std::move(cubin_dir)), - include_weights(include_weights) { - parse_device_str(device_str, device_type_, device_idx_); +public: + AOTInductorModelBase( + size_t num_inputs, + size_t num_outputs, + size_t num_constants, + const std::string& device_str, + std::optional cubin_dir, + bool include_weights = true) + : inputs_info_(num_inputs), + outputs_info_(num_outputs), + constants_info_(num_constants), + cubin_dir_(std::move(cubin_dir)), + include_weights(include_weights) + { + parse_device_str(device_str, device_type_, device_idx_); #ifdef USE_CUDA - if (device_idx_ == -1) { - AOTI_RUNTIME_DEVICE_CHECK(cudaGetDevice(&device_idx_)); - } else { - // If device_idx_ is passed in, we need to set the current device to it - AOTI_RUNTIME_DEVICE_CHECK(cudaSetDevice(device_idx_)); - } + if (device_idx_ == -1) { + AOTI_RUNTIME_DEVICE_CHECK(cudaGetDevice(&device_idx_)); + } else { + // If device_idx_ is passed in, we need to set the current device to it + AOTI_RUNTIME_DEVICE_CHECK(cudaSetDevice(device_idx_)); + } #endif // USE_CUDA #ifdef USE_XPU - if (device_idx_ == -1) { - aoti_torch_get_current_xpu_device(&device_idx_); - } else { - aoti_torch_set_current_xpu_device(device_idx_); - } + if (device_idx_ == -1) { + aoti_torch_get_current_xpu_device(&device_idx_); + } else { + aoti_torch_set_current_xpu_device(device_idx_); + } #endif // USE_XPU #ifdef USE_NPU - if (device_idx_ == -1) { - AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(0)); - AOTI_RUNTIME_DEVICE_CHECK(aclrtGetDevice(&device_idx_)); - } else { - AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(device_idx_)); - } + if (device_idx_ == -1) { + AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(0)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtGetDevice(&device_idx_)); + } else { + AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(device_idx_)); + } #endif // USE_NPU - } + } - // NOLINTNEXTLINE(modernize-use-equals-default) -~AOTInductorModelBase() { + // NOLINTNEXTLINE(modernize-use-equals-default) + ~AOTInductorModelBase() + { #ifdef USE_CUDA - if (run_finished_) { - auto code = cudaEventDestroy(*run_finished_); - if (code != cudaSuccess) { - std::cerr << "Failed to destroy CUDA event in AOTInductor model: " - << cudaGetErrorString(code) << std::endl; + if (run_finished_) { + auto code = cudaEventDestroy(*run_finished_); + if (code != cudaSuccess) { + std::cerr << "Failed to destroy CUDA event in AOTInductor model: " + << cudaGetErrorString(code) << std::endl; + } } - } #endif // USE_CUDA #ifdef USE_XPU - if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - } + if (run_finished_) { + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + } #endif // USE_XPU #ifdef USE_NPU - if (run_finished_) { - auto code = aclrtDestroyEvent(*run_finished_); - if (code != ACL_SUCCESS) { - std::cerr << "Failed to destroy NPU event in AOTInductor model erorr code: " - << code << std::endl; + if (run_finished_) { + auto code = aclrtDestroyEvent(*run_finished_); + if (code != ACL_SUCCESS) { + std::cerr << "Failed to destroy NPU event in AOTInductor model erorr code: " + << code << std::endl; + } } - } #endif // USE_NPU -} + } -AOTInductorModelBase(AOTInductorModelBase&&) = delete; -AOTInductorModelBase& 
operator=(AOTInductorModelBase&&) = delete; -AOTInductorModelBase(const AOTInductorModelBase&) = delete; -AOTInductorModelBase& operator=(const AOTInductorModelBase&) = delete; - -void run( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor) { + AOTInductorModelBase(AOTInductorModelBase&&) = delete; + AOTInductorModelBase& operator=(AOTInductorModelBase&&) = delete; + AOTInductorModelBase(const AOTInductorModelBase&) = delete; + AOTInductorModelBase& operator=(const AOTInductorModelBase&) = delete; + + void run( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor) + { #ifdef USE_CUDA - if (!run_finished_) { - cudaEvent_t run_finished; - AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); - run_finished_.emplace(run_finished); - } + if (!run_finished_) { + cudaEvent_t run_finished; + AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); + run_finished_.emplace(run_finished); + } #elif defined(USE_XPU) - if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - run_finished_.reset(); - } + if (run_finished_) { + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + run_finished_.reset(); + } #elif defined(USE_NPU) - if (!run_finished_) { - aclrtEvent run_finished; - AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); - run_finished_.emplace(run_finished); - } + if (!run_finished_) { + aclrtEvent run_finished; + AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); + run_finished_.emplace(run_finished); + } #else // !USE_CUDA && !USE_XPU - run_finished_ = false; + run_finished_ = false; #endif - auto* model = static_cast(this); - model->run_impl(input_handles, output_handles, stream, proxy_executor); + auto* model = static_cast(this); + model->run_impl(input_handles, output_handles, stream, proxy_executor); #ifdef USE_CUDA - AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream)); + AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream)); #elif defined(USE_XPU) - run_finished_ = std::make_optional(new sycl::event( - static_cast(stream)->ext_oneapi_submit_barrier())); + run_finished_ = std::make_optional(new sycl::event( + static_cast(stream)->ext_oneapi_submit_barrier())); #elif defined(USE_NPU) - AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); #else // !USE_CUDA && !USE_XPU - run_finished_ = true; + run_finished_ = true; #endif // USE_CUDA - } + } -std::unordered_map run_const_fold( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization = false) { + std::unordered_map run_const_fold( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization = false) + { #ifdef USE_CUDA - if (!run_finished_) { - cudaEvent_t run_finished; - AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); - run_finished_.emplace(run_finished); - } + if (!run_finished_) { + 
cudaEvent_t run_finished; + AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished)); + run_finished_.emplace(run_finished); + } #elif defined(USE_XPU) - if (run_finished_) { - (*run_finished_)->wait_and_throw(); - delete *run_finished_; - run_finished_.reset(); - } + if (run_finished_) { + (*run_finished_)->wait_and_throw(); + delete *run_finished_; + run_finished_.reset(); + } #elif defined(USE_NPU) - if (!run_finished_) { - aclrtEvent run_finished; - AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); - run_finished_.emplace(run_finished); - } + if (!run_finished_) { + aclrtEvent run_finished; + AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); + run_finished_.emplace(run_finished); + } #else // !USE_CUDA && !USE_XPU - run_finished_ = false; + run_finished_ = false; #endif - auto* model = static_cast(this); - auto folded_constants = - model->const_run_impl(stream, proxy_executor, initialization); + auto* model = static_cast(this); + auto folded_constants = + model->const_run_impl(stream, proxy_executor, initialization); #ifdef USE_CUDA - AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream)); + AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream)); #elif defined(USE_XPU) - run_finished_ = std::make_optional(new sycl::event( - static_cast(stream)->ext_oneapi_submit_barrier())); + run_finished_ = std::make_optional(new sycl::event( + static_cast(stream)->ext_oneapi_submit_barrier())); #elif defined(USE_NPU) - AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); #else // !USE_CUDA && !USE_XPU - run_finished_ = true; + run_finished_ = true; #endif // USE_CUDA - return folded_constants; -} + return folded_constants; + } -void load_constants() { - size_t num_constants = this->num_constants(); - constants_map_->reserve(num_constants); + void load_constants() + { + size_t num_constants = this->num_constants(); + constants_map_->reserve(num_constants); - std::vector constants_internal_offset(num_constants); - if (device_type_ != aoti_torch_device_type_cpu()) { - size_t blob_size = 0; - compute_gpu_constant_blob(blob_size, constants_internal_offset); + std::vector constants_internal_offset(num_constants); + if (device_type_ != aoti_torch_device_type_cpu()) { + size_t blob_size = 0; + compute_gpu_constant_blob(blob_size, constants_internal_offset); #if defined(USE_CUDA) || defined(USE_XPU) - constant_blob_ = RAII_gpuMalloc(blob_size); + constant_blob_ = RAII_gpuMalloc(blob_size); #elif defined(USE_NPU) - constant_blob_ = RAII_npuMalloc(blob_size); + constant_blob_ = RAII_npuMalloc(blob_size); #endif - } - if (!include_weights) { - return; - } + } + if (!include_weights) { + return; + } - size_t bytes_read = 0; - for (size_t i = 0; i < num_constants; i++) { - bool from_folded = this->constant_from_folded(i); + size_t bytes_read = 0; + for (size_t i = 0; i < num_constants; i++) { + bool from_folded = this->constant_from_folded(i); #if not defined(USE_XPU) && not defined(USE_CUDA) && not defined(USE_NPU) - if (from_folded) { - // We do not reallocate and copy for CPU. - continue; - } + if (from_folded) { + // We do not reallocate and copy for CPU. + continue; + } #endif // USE_CUDA - std::string name = this->constant_name(i); - size_t data_size = this->constant_data_size(i); - uint8_t* internal_ptr = (data_size != 0) - ? 
constant_ptr( - constants_internal_offset[i], - bytes_read, - data_size, - from_folded) - : nullptr; - bytes_read += data_size; - - // Create at::Tensor from copied memory. - auto dtype = this->constant_dtype(i); - auto ndim = this->constant_ndim(i); - auto size = this->constant_shape(i); - auto stride = this->constant_stride(i); - auto offset = this->constant_offset(i); - auto layout = this->constant_layout(i); - auto opaque_metadata_ptr = this->opaque_metadata(i); - auto opaque_metadata_size = this->opaque_metadata_size(i); - - AtenTensorHandle tensor_handle = nullptr; + std::string name = this->constant_name(i); + size_t data_size = this->constant_data_size(i); + uint8_t* internal_ptr = (data_size != 0) + ? constant_ptr( + constants_internal_offset[i], + bytes_read, + data_size, + from_folded) + : nullptr; + bytes_read += data_size; + + // Create at::Tensor from copied memory. + auto dtype = this->constant_dtype(i); + auto ndim = this->constant_ndim(i); + auto size = this->constant_shape(i); + auto stride = this->constant_stride(i); + auto offset = this->constant_offset(i); + auto layout = this->constant_layout(i); + auto opaque_metadata_ptr = this->opaque_metadata(i); + auto opaque_metadata_size = this->opaque_metadata_size(i); + + AtenTensorHandle tensor_handle = nullptr; #ifdef AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 - // When opaque_metadata_size is not 0, we need to have the - // aoti_torch_create_tensor_from_blob_npu_v2 available - AOTI_RUNTIME_CHECK( - opaque_metadata_size == 0, - "Expect opaque_metadata_size to be 0 when AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 is defined"); - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( - internal_ptr, - ndim, - size, - stride, - offset, - dtype, - device_type_, - device_idx_, - &tensor_handle)); + // When opaque_metadata_size is not 0, we need to have the + // aoti_torch_create_tensor_from_blob_npu_v2 available + AOTI_RUNTIME_CHECK( + opaque_metadata_size == 0, + "Expect opaque_metadata_size to be 0 when AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 is defined"); + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( + internal_ptr, + ndim, + size, + stride, + offset, + dtype, + device_type_, + device_idx_, + &tensor_handle)); #else - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( - internal_ptr, - ndim, - size, - stride, - offset, - dtype, - device_type_, - device_idx_, - &tensor_handle, - layout, - opaque_metadata_ptr, - opaque_metadata_size)); + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( + internal_ptr, + ndim, + size, + stride, + offset, + dtype, + device_type_, + device_idx_, + &tensor_handle, + layout, + opaque_metadata_ptr, + opaque_metadata_size)); #endif // AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 - constants_map_->emplace(std::move(name), tensor_handle); - } - if (constants_map_) { - this->update_constants_array_from_map(); + constants_map_->emplace(std::move(name), tensor_handle); + } + if (constants_map_) { + this->update_constants_array_from_map(); + } } -} #if defined(USE_CUDA) || defined(USE_XPU) - GPUPtr&& release_constant_blob() { + GPUPtr&& release_constant_blob() + { return std::move(constant_blob_); } #endif #ifdef USE_NPU - NPUPtr&& release_constant_blob() { + NPUPtr&& release_constant_blob() + { return std::move(constant_blob_); } #endif -std::shared_ptr> get_constants_array() { - return constants_; -} + std::shared_ptr> get_constants_array() + { + return constants_; + } -int32_t get_device_idx() const { - return device_idx_; -} + int32_t get_device_idx() 
const + { + return device_idx_; + } -uint8_t* constant_ptr( - size_t constant_offset, - size_t bytes_read, - size_t data_size, - bool skip_copy) { + uint8_t* constant_ptr( + size_t constant_offset, + size_t bytes_read, + size_t data_size, + bool skip_copy) + { #if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) - auto* constants_ptr = static_cast(constant_blob_.get()); - uint8_t* internal_ptr = constants_ptr + constant_offset; - // Copy data to GPU memory - if (!skip_copy) { + auto* constants_ptr = static_cast(constant_blob_.get()); + uint8_t* internal_ptr = constants_ptr + constant_offset; + // Copy data to GPU memory + if (!skip_copy) { #ifdef USE_XPU - sycl::queue* queue_ptr = nullptr; - aoti_torch_get_current_sycl_queue((void**)&queue_ptr); - queue_ptr - ->memcpy(internal_ptr, _get_constants_start() + bytes_read, data_size) - .wait(); + sycl::queue* queue_ptr = nullptr; + aoti_torch_get_current_sycl_queue((void**)&queue_ptr); + queue_ptr + ->memcpy(internal_ptr, _get_constants_start() + bytes_read, data_size) + .wait(); #elif defined(USE_NPU) - AOTI_RUNTIME_DEVICE_CHECK(aclrtMemcpy( - internal_ptr, - data_size, - _get_constants_start() + bytes_read, - data_size, - ACL_MEMCPY_HOST_TO_DEVICE)); + AOTI_RUNTIME_DEVICE_CHECK(aclrtMemcpy( + internal_ptr, + data_size, + _get_constants_start() + bytes_read, + data_size, + ACL_MEMCPY_HOST_TO_DEVICE)); #else - AOTI_RUNTIME_DEVICE_CHECK(cudaMemcpy( - internal_ptr, - _get_constants_start() + bytes_read, - data_size, - cudaMemcpyHostToDevice)); + AOTI_RUNTIME_DEVICE_CHECK(cudaMemcpy( + internal_ptr, + _get_constants_start() + bytes_read, + data_size, + cudaMemcpyHostToDevice)); #endif - } - return internal_ptr; + } + return internal_ptr; #else - // get pointer to constant which is packed in model during compile time. - AOTI_RUNTIME_CHECK(!skip_copy, "pure cpu mode doesn't support skip copy"); - return _get_constants_start() + bytes_read; + // get pointer to constant which is packed in model during compile time. + AOTI_RUNTIME_CHECK(!skip_copy, "pure cpu mode doesn't support skip copy"); + return _get_constants_start() + bytes_read; #endif // USE_CUDA - } + } -void compute_gpu_constant_blob( - size_t& blob_size, - std::vector& constants_internal_offset) { + void compute_gpu_constant_blob( + size_t& blob_size, + std::vector& constants_internal_offset) + { #if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) - size_t num_constants = this->num_constants(); - // Compute required blob size with 64-alignment if on GPU. - blob_size = 0; - for (size_t i = 0; i < num_constants; i++) { - size_t data_size = this->constant_data_size(i); - if (data_size % AOTI_CONST_GPU_ALIGNMENT) { - data_size = AOTI_CONST_GPU_ALIGNMENT + - (data_size / AOTI_CONST_GPU_ALIGNMENT) * AOTI_CONST_GPU_ALIGNMENT; + size_t num_constants = this->num_constants(); + // Compute required blob size with 64-alignment if on GPU. 
+ blob_size = 0; + for (size_t i = 0; i < num_constants; i++) { + size_t data_size = this->constant_data_size(i); + if (data_size % AOTI_CONST_GPU_ALIGNMENT) { + data_size = AOTI_CONST_GPU_ALIGNMENT + + (data_size / AOTI_CONST_GPU_ALIGNMENT) * AOTI_CONST_GPU_ALIGNMENT; + } + constants_internal_offset[i] = blob_size; + blob_size += data_size; } - constants_internal_offset[i] = blob_size; - blob_size += data_size; - } #endif // USE_CUDA -} + } -size_t num_inputs() const { - return inputs_info_.size(); -} + size_t num_inputs() const { return inputs_info_.size(); } -size_t num_outputs() const { - return outputs_info_.size(); -} + size_t num_outputs() const { return outputs_info_.size(); } -size_t num_constants() const { - return constants_info_.size(); -} + size_t num_constants() const { return constants_info_.size(); } -const char* input_name(int64_t idx) const { - return inputs_info_.at(idx).name; -} + const char* input_name(int64_t idx) const { return inputs_info_.at(idx).name; } -const char* output_name(int64_t idx) const { - return outputs_info_.at(idx).name; -} + const char* output_name(int64_t idx) const { return outputs_info_.at(idx).name; } -const char* constant_name(int64_t idx) const { - return constants_info_.at(idx).name; -} + const char* constant_name(int64_t idx) const { return constants_info_.at(idx).name; } -size_t constant_ndim(int64_t idx) { - return constants_info_.at(idx).shape.size(); -} + size_t constant_ndim(int64_t idx) { return constants_info_.at(idx).shape.size(); } -const int64_t* constant_shape(int64_t idx) const { - return constants_info_.at(idx).shape.data(); -} + const int64_t* constant_shape(int64_t idx) const { return constants_info_.at(idx).shape.data(); } -const int64_t* constant_stride(int64_t idx) const { - return constants_info_.at(idx).stride.data(); -} + const int64_t* constant_stride(int64_t idx) const { return constants_info_.at(idx).stride.data(); } -int32_t constant_dtype(int64_t idx) const { - return constants_info_.at(idx).dtype; -} + int32_t constant_dtype(int64_t idx) const { return constants_info_.at(idx).dtype; } -int32_t constant_layout(int64_t idx) const { - return constants_info_.at(idx).layout; -} + int32_t constant_layout(int64_t idx) const { return constants_info_.at(idx).layout; } -size_t constant_offset(int64_t idx) const { - return constants_info_.at(idx).offset; -} + size_t constant_offset(int64_t idx) const { return constants_info_.at(idx).offset; } -size_t constant_data_size(int64_t idx) const { - return constants_info_.at(idx).data_size; -} + size_t constant_data_size(int64_t idx) const { return constants_info_.at(idx).data_size; } -const char* constant_original_fqn(int64_t idx) const { - return constants_info_.at(idx).original_fqn; -} + const char* constant_original_fqn(int64_t idx) const { return constants_info_.at(idx).original_fqn; } -const uint8_t* opaque_metadata(int64_t idx) const { - return constants_info_.at(idx).opaque_metadata.data(); -} + const uint8_t* opaque_metadata(int64_t idx) const { return constants_info_.at(idx).opaque_metadata.data(); } -size_t opaque_metadata_size(int64_t idx) { - return constants_info_.at(idx).opaque_metadata.size(); -} + size_t opaque_metadata_size(int64_t idx) { return constants_info_.at(idx).opaque_metadata.size(); } -bool constant_from_folded(int64_t idx) const { - return constants_info_.at(idx).from_folded; -} + bool constant_from_folded(int64_t idx) const { return constants_info_.at(idx).from_folded; } -int32_t constant_type(int64_t idx) const { - return constants_info_.at(idx).type; 
-} + int32_t constant_type(int64_t idx) const { return constants_info_.at(idx).type; } -const char* get_in_spec() const { - return in_spec_.c_str(); -} + const char* get_in_spec() const { return in_spec_.c_str(); } -const char* get_out_spec() const { - return out_spec_.c_str(); -} + const char* get_out_spec() const { return out_spec_.c_str(); } -void update_constants_array_from_map() { - if (!constants_map_) { - throw std::runtime_error{ - "constants_map_ was not ready when constants_ is trying to be constructed from it!"}; - } - if (!constants_) { - constants_ = - std::make_shared>(constants_info_.size()); - } else { - constants_->resize(constants_info_.size()); - } - int idx = 0; - for (const auto& info : constants_info_) { - const auto it = constants_map_->find(info.name); - if (it != constants_map_->end()) { - constants_->at(idx) = ConstantHandle(it->second); + void update_constants_array_from_map() + { + if (!constants_map_) { + throw std::runtime_error{ + "constants_map_ was not ready when constants_ is trying to be constructed from it!"}; + } + if (!constants_) { + constants_ = + std::make_shared>(constants_info_.size()); + } else { + constants_->resize(constants_info_.size()); + } + int idx = 0; + for (const auto& info : constants_info_) { + const auto it = constants_map_->find(info.name); + if (it != constants_map_->end()) { + constants_->at(idx) = ConstantHandle(it->second); + } + idx++; } - idx++; } -} -void update_constants_map( - std::shared_ptr constants_map, - bool remap_constants_array = true) { + void update_constants_map( + std::shared_ptr constants_map, + bool remap_constants_array = true) + { constants_map_ = std::move(constants_map); if (remap_constants_array) { update_constants_array_from_map(); } -} + } - // This function allows us to update the constants_ that is used to look up - // the corresponding constant tensor during runtime. -void update_constants_array( - std::shared_ptr> constants_array) { + // This function allows us to update the constants_ that is used to look up + // the corresponding constant tensor during runtime. + void update_constants_array( + std::shared_ptr> constants_array) + { constants_ = std::move(constants_array); -} + } - /// Returns true if the model is complete. -bool is_finished() { + /// Returns true if the model is complete. + bool is_finished() + { #ifdef USE_CUDA - if (!run_finished_) { - throw std::runtime_error{"Model CUDA event was not initialized"}; - } + if (!run_finished_) { + throw std::runtime_error{"Model CUDA event was not initialized"}; + } - auto event_status = cudaEventQuery(*run_finished_); - if (event_status == cudaSuccess) { - return true; - } else if (event_status == cudaErrorNotReady) { - return false; - } + auto event_status = cudaEventQuery(*run_finished_); + if (event_status == cudaSuccess) { + return true; + } else if (event_status == cudaErrorNotReady) { + return false; + } - throw std::runtime_error( - std::string("The model did not finish successfully. Error: ") + - cudaGetErrorString(cudaGetLastError())); + throw std::runtime_error( + std::string("The model did not finish successfully. 
Error: ") + + cudaGetErrorString(cudaGetLastError())); #elif defined(USE_NPU) if (!run_finished_) { @@ -618,118 +595,120 @@ bool is_finished() { } aclrtEventRecordedStatus recordStatus = ACL_EVENT_RECORDED_STATUS_NOT_READY; AOTI_RUNTIME_DEVICE_CHECK(aclrtQueryEventStatus(*run_finished_, &recordStatus)); - + if (recordStatus == ACL_EVENT_RECORDED_STATUS_COMPLETE) { return true; } else { return false; } #elif defined(USE_XPU) - if (!run_finished_) { - throw std::runtime_error{"Model XPU event was not initialized"}; - } - using namespace sycl::info; - return (*run_finished_)->get_info() == - event_command_status::complete; + if (!run_finished_) { + throw std::runtime_error{"Model XPU event was not initialized"}; + } + using namespace sycl::info; + return (*run_finished_)->get_info() == + event_command_status::complete; #else // !USE_CUDA && !USE_XPU - return run_finished_; + return run_finished_; #endif // USE_CUDA -} + } - /// Synchronizes completion event. -void wait_for_completion() { + /// Synchronizes completion event. + void wait_for_completion() + { #ifdef USE_CUDA - if (!run_finished_) { - throw std::runtime_error{"Model event was not initialized"}; - } + if (!run_finished_) { + throw std::runtime_error{"Model event was not initialized"}; + } - AOTI_RUNTIME_DEVICE_CHECK(cudaEventSynchronize(*run_finished_)); + AOTI_RUNTIME_DEVICE_CHECK(cudaEventSynchronize(*run_finished_)); #endif // USE_CUDA #ifdef USE_XPU - if (!run_finished_) { - throw std::runtime_error{"Model event was not initialized"}; - } - (*run_finished_)->wait_and_throw(); + if (!run_finished_) { + throw std::runtime_error{"Model event was not initialized"}; + } + (*run_finished_)->wait_and_throw(); #endif -} + } - protected: - uint8_t* _get_constants_start() { +protected: + uint8_t* _get_constants_start() + { #ifndef USE_MMAP_SELF - // NOLINTNEXTLINE(*const-cast*) - return const_cast(_binary_constants_bin_start); + // NOLINTNEXTLINE(*const-cast*) + return const_cast(_binary_constants_bin_start); #else - if (self_mmap) { + if (self_mmap) { + return self_mmap; + } + Dl_info dl_info; + // get pointer to constant which are appended to the binary + AOTI_RUNTIME_CHECK( + dladdr(__func__, &dl_info), "Can't find shared library name"); + int fd = open(dl_info.dli_fname, O_RDONLY); + AOTI_RUNTIME_CHECK(fd >= 0, "Shared library file cannot be opened"); + auto fsize = lseek(fd, 0, SEEK_END); + auto weights_size = + reinterpret_cast(_binary_constants_bin_start)[0]; + auto magic_number = + reinterpret_cast(_binary_constants_bin_start)[1]; + auto weights_offset = fsize - weights_size; + AOTI_RUNTIME_CHECK( + (weights_offset & 0x3fff) == 0, + "weights_offset must be aligned to 16K boundary"); + auto ptr = mmap( + NULL, + weights_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + fd, + weights_offset); + close(fd); + AOTI_RUNTIME_CHECK(ptr != MAP_FAILED, "mmap() failed"); + self_mmap = static_cast(ptr); + AOTI_RUNTIME_CHECK( + reinterpret_cast( + self_mmap + weights_size - sizeof(uint64_t))[0] == magic_number, + "Weigths data seems corrupt"); return self_mmap; - } - Dl_info dl_info; - // get pointer to constant which are appended to the binary - AOTI_RUNTIME_CHECK( - dladdr(__func__, &dl_info), "Can't find shared library name"); - int fd = open(dl_info.dli_fname, O_RDONLY); - AOTI_RUNTIME_CHECK(fd >= 0, "Shared library file cannot be opened"); - auto fsize = lseek(fd, 0, SEEK_END); - auto weights_size = - reinterpret_cast(_binary_constants_bin_start)[0]; - auto magic_number = - reinterpret_cast(_binary_constants_bin_start)[1]; - auto 
weights_offset = fsize - weights_size; - AOTI_RUNTIME_CHECK( - (weights_offset & 0x3fff) == 0, - "weights_offset must be aligned to 16K boundary"); - auto ptr = mmap( - NULL, - weights_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, - fd, - weights_offset); - close(fd); - AOTI_RUNTIME_CHECK(ptr != MAP_FAILED, "mmap() failed"); - self_mmap = static_cast(ptr); - AOTI_RUNTIME_CHECK( - reinterpret_cast( - self_mmap + weights_size - sizeof(uint64_t))[0] == magic_number, - "Weigths data seems corrupt"); - return self_mmap; #endif - } -struct ParamInfo { - const char* name = nullptr; -}; - -struct ConstInfo { - const char* name = nullptr; - std::vector shape; - std::vector stride; - int32_t dtype{}; - int64_t offset{}; - size_t data_size{}; - int32_t layout{}; - std::vector opaque_metadata; - int64_t opaque_metadata_size{}; - const char* original_fqn = nullptr; - bool from_folded{}; - int32_t type{}; -}; - -std::vector inputs_info_; -std::vector outputs_info_; -std::vector constants_info_; -std::string in_spec_; -std::string out_spec_; - -std::shared_ptr constants_map_; -std::shared_ptr> constants_; + } + struct ParamInfo { + const char* name = nullptr; + }; + + struct ConstInfo { + const char* name = nullptr; + std::vector shape; + std::vector stride; + int32_t dtype{}; + int64_t offset{}; + size_t data_size{}; + int32_t layout{}; + std::vector opaque_metadata; + int64_t opaque_metadata_size{}; + const char* original_fqn = nullptr; + bool from_folded{}; + int32_t type{}; + }; + + std::vector inputs_info_; + std::vector outputs_info_; + std::vector constants_info_; + std::string in_spec_; + std::string out_spec_; + + std::shared_ptr constants_map_; + std::shared_ptr> constants_; #if defined(USE_CUDA) || defined(USE_XPU) - // Holds the blob storage for constants' at::Tensor for CUDA. + // Holds the blob storage for constants' at::Tensor for CUDA. GPUPtr constant_blob_; #endif // USE_CUDA #ifdef USE_NPU - // Holds the blob storage for constants' at::Tensor for CUDA. + // Holds the blob storage for constants' at::Tensor for CUDA. NPUPtr constant_blob_; #endif // USE_NPU @@ -737,16 +716,16 @@ std::shared_ptr> constants_; uint8_t* self_mmap = NULL; #endif - // A directory with CUDA binary files, e.g. compiled kernels, etc. - const std::optional cubin_dir_; + // A directory with CUDA binary files, e.g. compiled kernels, etc. + const std::optional cubin_dir_; - // This is the flag that implies whether the weight is included in the model. - // If True, we would prepare the weight when loading the model, otherwise the - // model will be loaded without weights, and need to be provided by the user. - bool include_weights; + // This is the flag that implies whether the weight is included in the model. + // If True, we would prepare the weight when loading the model, otherwise the + // model will be loaded without weights, and need to be provided by the user. + bool include_weights; - // Record if the model finishes an inference run so that its owning - // AOTModelContainer can re-use this instance. + // Record if the model finishes an inference run so that its owning + // AOTModelContainer can re-use this instance. #ifdef USE_CUDA std::optional run_finished_; #elif defined(USE_XPU) @@ -757,67 +736,69 @@ std::shared_ptr> constants_; bool run_finished_{}; #endif - // Generated model uses this device index to create CUDA guards. + // Generated model uses this device index to create CUDA guards. 
int32_t device_type_{}; int32_t device_idx_{}; }; // Codegen-ed classes can derive from this to keep pointers to loaded kernels. class AOTInductorModelKernelsBase { - public: - virtual ~AOTInductorModelKernelsBase() = default; +public: + virtual ~AOTInductorModelKernelsBase() = default; }; class AOTInductorModel : public AOTInductorModelBase { - public: - AOTInductorModel( - std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir, - bool include_weights = true); - - std::unordered_map const_run_impl( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization = false); - - void _const_run_impl( - std::vector& output_handles, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - void run_impl( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; - //handles are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; - //handles will be stolen by the caller; - //the array itself is borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - template - Outputs run_impl_minimal_arrayref_interface( - const Inputs& inputs, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor); - - static std::unique_ptr Create( - std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) { - return std::make_unique( - std::move(constants_map), - std::move(constants_array), - device_str, - std::move(cubin_dir)); - } - - private: - std::unique_ptr kernels_; +public: + AOTInductorModel( + std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir, + bool include_weights = true); + + std::unordered_map const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization = false); + + void _const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + void run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + template + Outputs run_impl_minimal_arrayref_interface( + const Inputs& inputs, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor); + + static std::unique_ptr Create( + std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir) + { + return std::make_unique( + std::move(constants_map), + std::move(constants_array), + device_str, + std::move(cubin_dir)); + } + +private: + std::unique_ptr kernels_; }; } // namespace torch::aot_inductor +``` \ No newline at end of file -- Gitee From 3bc27d7394c33785648a48fc2a08d458949cd447 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 07:21:29 +0000 Subject: [PATCH 03/11] oss_proxy Signed-off-by: zjx.com --- .../aoti_torch/oss_proxy_executor_npu.cpp | 1134 +++++++++-------- 1 file changed, 570 insertions(+), 564 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp b/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp index db916d1d251..c11ef93bfda 100644 --- 
a/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp +++ b/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp @@ -5,8 +5,9 @@ #include namespace { -at::Tensor* tensor_handle_to_tensor_pointer(AtenTensorHandle handle) { - return reinterpret_cast(handle); +at::Tensor* tensor_handle_to_tensor_pointer(AtenTensorHandle handle) +{ + return reinterpret_cast(handle); } } // namespace @@ -16,477 +17,481 @@ void OSSProxyExecutorNpu::prefill_stack_with_static_arguments( size_t index, const at::TypePtr& schema_arg_type, const nlohmann::json& serialized_arg, - OSSOpKernel& op_kernel) { - auto& stack = op_kernel.stack_; - auto& dynamic_args = op_kernel.dynamic_args_; - - TORCH_CHECK(serialized_arg.size() == 1); - std::string serialized_arg_type = serialized_arg.begin().key(); - auto& serialized_arg_val = serialized_arg.begin().value(); - - switch (schema_arg_type->kind()) { - case c10::TypeKind::TensorType: { - TORCH_CHECK( - serialized_arg_type == "as_tensor", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_tensor for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::TensorType, 1); - break; - } - case c10::TypeKind::IntType: { - TORCH_CHECK( - serialized_arg_type == "as_int", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_int for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - break; - } - case c10::TypeKind::SymIntType: { - TORCH_CHECK( - serialized_arg_type == "as_int" || - serialized_arg_type == "as_sym_int", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_int or as_sym_int for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - break; - } - case c10::TypeKind::FloatType: { - TORCH_CHECK( - serialized_arg_type == "as_float", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_float for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; - } - case c10::TypeKind::BoolType: { - TORCH_CHECK( - serialized_arg_type == "as_bool", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_bool for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; - } - case c10::TypeKind::NumberType: { - if (serialized_arg_type == "as_int") { - // Only int Scalar is treated as dynamic arg for now - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - } else if (serialized_arg_type == "as_float") { - stack.at(index) = serialized_arg_val.get(); - } else if (serialized_arg_type == "as_bool") { - stack.at(index) = serialized_arg_val.get(); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a scalar input for argument ", - index, - " but got ", - serialized_arg_type); - } - break; - } - case c10::TypeKind::StringType: { - TORCH_CHECK( - serialized_arg_type == "as_string", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_string for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; - } - case c10::TypeKind::DeviceObjType: { - TORCH_CHECK( - serialized_arg_type == "as_device", - "Expected extern kernel ", - 
op_kernel.target_, - " to have serialized argument type as_device for argument ", - index, - " but got ", - serialized_arg_type); - - std::string device_string = serialized_arg_val["type"].get(); - if (serialized_arg_val.contains("index") && - serialized_arg_val["index"].is_number()) { - device_string += ":" + serialized_arg_val["index"].get(); - } - - c10::Device device(device_string); - - if (device != *device_) { - VLOG(1) << "ProxyExecutor is using " << *device_ << " for " - << op_kernel.target_ << " argument #" << index - << ", which is different from the one serialized in thrift: " - << device << ". Please ensure this is intentional."; - } - - stack.at(index) = *device_; - break; - } - case c10::TypeKind::ListType: { - if (schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { - TORCH_CHECK( - serialized_arg_type == "as_tensors", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_tensors for argument ", - index, - " but got ", - serialized_arg_type); - TORCH_CHECK(serialized_arg_type == "as_tensors"); - dynamic_args.emplace_back( - index, DynamicArgType::ListTensorType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofInts())) { - TORCH_CHECK( - serialized_arg_type == "as_ints", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_ints for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { - TORCH_CHECK( - serialized_arg_type == "as_ints" || - serialized_arg_type == "as_sym_ints", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_ints or as_sym_ints for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofFloats())) { - TORCH_CHECK( - serialized_arg_type == "as_floats", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_floats for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); + OSSOpKernel& op_kernel) +{ + auto& stack = op_kernel.stack_; + auto& dynamic_args = op_kernel.dynamic_args_; + + TORCH_CHECK(serialized_arg.size() == 1); + std::string serialized_arg_type = serialized_arg.begin().key(); + auto& serialized_arg_val = serialized_arg.begin().value(); + + switch (schema_arg_type->kind()) { + case c10::TypeKind::TensorType: { + TORCH_CHECK( + serialized_arg_type == "as_tensor", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_tensor for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::TensorType, 1); + break; } - stack.at(index) = std::move(ret); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofBools())) { - TORCH_CHECK( - serialized_arg_type == "as_bools", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_bools for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); + case c10::TypeKind::IntType: { + TORCH_CHECK( + serialized_arg_type == "as_int", + "Expected extern kernel ", + op_kernel.target_, + " to 
have serialized argument type as_int for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + break; } - stack.at(index) = std::move(ret); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofNumbers())) { - if (serialized_arg_type == "as_ints") { - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (serialized_arg_type == "as_floats") { - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg); - } - stack.at(index) = std::move(ret); - } else if (serialized_arg_type == "as_bools") { - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg); - } - stack.at(index) = std::move(ret); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a List[Scalar] input for argument ", - index, - " but got ", - serialized_arg_type); + case c10::TypeKind::SymIntType: { + TORCH_CHECK( + serialized_arg_type == "as_int" || + serialized_arg_type == "as_sym_int", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_int or as_sym_int for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + break; } - } else if (schema_arg_type->isSubtypeOf( - at::ListType::ofOptionalTensors())) { - if (serialized_arg_type == "as_optional_tensors") { - std::vector list_item_types; - for (const auto& arg : serialized_arg_val) { - list_item_types.push_back(arg.begin().key()); - } - dynamic_args.emplace_back( - index, - DynamicArgType::ListOptionalTensorType, - serialized_arg_val.size(), - list_item_types); - } else if (serialized_arg_type == "as_tensors") { - dynamic_args.emplace_back( - index, DynamicArgType::ListTensorType, serialized_arg_val.size()); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a Tensor?[] input for argument ", - index, - " but got ", - serialized_arg_type); + case c10::TypeKind::FloatType: { + TORCH_CHECK( + serialized_arg_type == "as_float", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_float for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; } - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofStrings())) { - TORCH_CHECK( - serialized_arg_type == "as_strings", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_strings for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); + case c10::TypeKind::BoolType: { + TORCH_CHECK( + serialized_arg_type == "as_bool", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_bool for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; } - stack.at(index) = std::move(ret); - } else { - TORCH_CHECK( - false, - "NYI: Unsupported list type ", - serialized_arg_type, - " for extern kernel ", - op_kernel.target_, - " argument ", - index); - } - break; - } - case c10::TypeKind::OptionalType: { - auto inner_type = - schema_arg_type->castRaw()->getElementType(); - - if (serialized_arg_type == "as_none") { - stack.at(index) = c10::IValue{}; - if (inner_type->kind() == c10::TypeKind::TensorType) { - // Tensor is None - 
dynamic_args.emplace_back(index, DynamicArgType::TensorType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::IntType || - inner_type->kind() == c10::TypeKind::SymIntType) { - // Int or SymInt is None - dynamic_args.emplace_back(index, DynamicArgType::IntType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::ListType && - schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { - // List[Tensor] is None - dynamic_args.emplace_back(index, DynamicArgType::ListTensorType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::ListType && - schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { - // List[SymInt] is None - dynamic_args.emplace_back(index, DynamicArgType::ListIntType, 0); + case c10::TypeKind::NumberType: { + if (serialized_arg_type == "as_int") { + // Only int Scalar is treated as dynamic arg for now + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + } else if (serialized_arg_type == "as_float") { + stack.at(index) = serialized_arg_val.get(); + } else if (serialized_arg_type == "as_bool") { + stack.at(index) = serialized_arg_val.get(); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a scalar input for argument ", + index, + " but got ", + serialized_arg_type); + } + break; + } + case c10::TypeKind::StringType: { + TORCH_CHECK( + serialized_arg_type == "as_string", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_string for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; + } + case c10::TypeKind::DeviceObjType: { + TORCH_CHECK( + serialized_arg_type == "as_device", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_device for argument ", + index, + " but got ", + serialized_arg_type); + + std::string device_string = serialized_arg_val["type"].get(); + if (serialized_arg_val.contains("index") && + serialized_arg_val["index"].is_number()) { + device_string += ":" + serialized_arg_val["index"].get(); + } + + c10::Device device(device_string); + + if (device != *device_) { + VLOG(1) << "ProxyExecutor is using " << *device_ << " for " + << op_kernel.target_ << " argument #" << index + << ", which is different from the one serialized in thrift: " + << device << ". 
Please ensure this is intentional."; + } + + stack.at(index) = *device_; + break; + } + case c10::TypeKind::ListType: { + if (schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { + TORCH_CHECK( + serialized_arg_type == "as_tensors", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_tensors for argument ", + index, + " but got ", + serialized_arg_type); + TORCH_CHECK(serialized_arg_type == "as_tensors"); + dynamic_args.emplace_back( + index, DynamicArgType::ListTensorType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofInts())) { + TORCH_CHECK( + serialized_arg_type == "as_ints", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_ints for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { + TORCH_CHECK( + serialized_arg_type == "as_ints" || + serialized_arg_type == "as_sym_ints", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_ints or as_sym_ints for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofFloats())) { + TORCH_CHECK( + serialized_arg_type == "as_floats", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_floats for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); + } + stack.at(index) = std::move(ret); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofBools())) { + TORCH_CHECK( + serialized_arg_type == "as_bools", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_bools for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); + } + stack.at(index) = std::move(ret); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofNumbers())) { + if (serialized_arg_type == "as_ints") { + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (serialized_arg_type == "as_floats") { + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg); + } + stack.at(index) = std::move(ret); + } else if (serialized_arg_type == "as_bools") { + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg); + } + stack.at(index) = std::move(ret); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a List[Scalar] input for argument ", + index, + " but got ", + serialized_arg_type); + } + } else if (schema_arg_type->isSubtypeOf( + at::ListType::ofOptionalTensors())) { + if (serialized_arg_type == "as_optional_tensors") { + std::vector list_item_types; + for (const auto& arg : serialized_arg_val) { + list_item_types.push_back(arg.begin().key()); + } + dynamic_args.emplace_back( + index, + DynamicArgType::ListOptionalTensorType, + serialized_arg_val.size(), + list_item_types); + } else if (serialized_arg_type == "as_tensors") { + dynamic_args.emplace_back( + index, DynamicArgType::ListTensorType, serialized_arg_val.size()); + } else { + TORCH_CHECK( + false, + "Expected extern kernel 
", + op_kernel.target_, + " to have a Tensor?[] input for argument ", + index, + " but got ", + serialized_arg_type); + } + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofStrings())) { + TORCH_CHECK( + serialized_arg_type == "as_strings", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_strings for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); + } + stack.at(index) = std::move(ret); + } else { + TORCH_CHECK( + false, + "NYI: Unsupported list type ", + serialized_arg_type, + " for extern kernel ", + op_kernel.target_, + " argument ", + index); + } + break; + } + case c10::TypeKind::OptionalType: { + auto inner_type = + schema_arg_type->castRaw()->getElementType(); + + if (serialized_arg_type == "as_none") { + stack.at(index) = c10::IValue{}; + if (inner_type->kind() == c10::TypeKind::TensorType) { + // Tensor is None + dynamic_args.emplace_back(index, DynamicArgType::TensorType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::IntType || + inner_type->kind() == c10::TypeKind::SymIntType) { + // Int or SymInt is None + dynamic_args.emplace_back(index, DynamicArgType::IntType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::ListType && + schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { + // List[Tensor] is None + dynamic_args.emplace_back(index, DynamicArgType::ListTensorType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::ListType && + schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { + // List[SymInt] is None + dynamic_args.emplace_back(index, DynamicArgType::ListIntType, 0); + } + } else { + prefill_stack_with_static_arguments( + index, inner_type, serialized_arg, op_kernel); + } + break; } - } else { - prefill_stack_with_static_arguments( - index, inner_type, serialized_arg, op_kernel); - } - break; + default: + TORCH_CHECK( + false, + "Unsupported input type ", + serialized_arg_type, + " for extern kernel ", + op_kernel.target_, + " argument ", + index); } - default: - TORCH_CHECK( - false, - "Unsupported input type ", - serialized_arg_type, - " for extern kernel ", - op_kernel.target_, - " argument ", - index); - } } // Populates op_kernel.stack_, op_kernel.dynamic_args_ void OSSProxyExecutorNpu::get_input_info_from_serialized( const std::vector& schema_args, const nlohmann::json& serialized_node, - OSSOpKernel& op_kernel) { - std::vector filled(schema_args.size(), false); - TORCH_CHECK(op_kernel.stack_.size() == 0); - op_kernel.stack_.resize(schema_args.size()); - for (const auto& named_argument : serialized_node["inputs"]) { - const auto& arg = named_argument["arg"]; - const auto& name = named_argument["name"].get(); - - // Doing a linear lookup in the schema to find the index - // of a static argument. Should be fine performance wise - // because we usually only have small amount of arguments. - for (size_t index = 0; index < schema_args.size(); index++) { - auto& schema_arg = schema_args[index]; - if (schema_arg.name() == name) { - prefill_stack_with_static_arguments( - index, schema_arg.real_type(), arg, op_kernel); - filled[index] = true; - break; - } - } - } - - // If an argument is not filled and has a default value, we should - // also prefill the default value. 
- for (size_t index = 0; index < schema_args.size(); index++) { - if (!filled[index] && schema_args[index].default_value()) { - auto default_value = *schema_args[index].default_value(); - op_kernel.stack_.at(index) = default_value; - } - } + OSSOpKernel& op_kernel) +{ + std::vector filled(schema_args.size(), false); + TORCH_CHECK(op_kernel.stack_.size() == 0); + op_kernel.stack_.resize(schema_args.size()); + for (const auto& named_argument : serialized_node["inputs"]) { + const auto& arg = named_argument["arg"]; + const auto& name = named_argument["name"].get(); + + // Doing a linear lookup in the schema to find the index + // of a static argument. Should be fine performance wise + // because we usually only have small amount of arguments. + for (size_t index = 0; index < schema_args.size(); index++) { + auto& schema_arg = schema_args[index]; + if (schema_arg.name() == name) { + prefill_stack_with_static_arguments( + index, schema_arg.real_type(), arg, op_kernel); + filled[index] = true; + break; + } + } + } + + // If an argument is not filled and has a default value, we should + // also prefill the default value. + for (size_t index = 0; index < schema_args.size(); index++) { + if (!filled[index] && schema_args[index].default_value()) { + auto default_value = *schema_args[index].default_value(); + op_kernel.stack_.at(index) = default_value; + } + } } // Populates op_kernel.outputs_ void OSSProxyExecutorNpu::get_output_info_from_serialized( const std::vector& schema_returns, const nlohmann::json& serialized_node, - OSSOpKernel& op_kernel) { - std::vector& outputs = op_kernel.outputs_; - - TORCH_CHECK( - schema_returns.size() == serialized_node["outputs"].size(), - "Serialized node doesn't match operator ", - serialized_node["target"], - "'s schema outputs."); - - size_t output_index = 0; - for (const auto& serialized_output : serialized_node["outputs"]) { - TORCH_CHECK(serialized_output.size() == 1); - std::string serialized_output_type = serialized_output.begin().key(); - auto& serialized_output_val = serialized_output.begin().value(); - - auto& schema_return = schema_returns[output_index]; - const at::TypePtr& schema_return_type = schema_return.real_type(); - - switch (schema_return_type->kind()) { - case c10::TypeKind::TensorType: { - TORCH_CHECK( - serialized_output_type == "as_tensor", - "Expected extern kernel ", - serialized_node["target"], - " to have serialized output type as_tensor, ", - " but got ", - serialized_output_type); - outputs.emplace_back(output_index, DynamicArgType::TensorType, 1); - break; - } - case c10::TypeKind::ListType: { - if (schema_return_type->isSubtypeOf(at::ListType::ofTensors())) { - TORCH_CHECK( - serialized_output_type == "as_tensors", - "Expected extern kernel ", - serialized_node["target"], - " to have serialized output type as_tensors, ", - " but got ", - serialized_output_type); - outputs.emplace_back( - output_index, - DynamicArgType::ListTensorType, - serialized_output_val.size()); - } else { - TORCH_CHECK( - false, - "Unsupported return list type ", - schema_return_type->repr_str()); + OSSOpKernel& op_kernel) +{ + std::vector& outputs = op_kernel.outputs_; + + TORCH_CHECK( + schema_returns.size() == serialized_node["outputs"].size(), + "Serialized node doesn't match operator ", + serialized_node["target"], + "'s schema outputs."); + + size_t output_index = 0; + for (const auto& serialized_output : serialized_node["outputs"]) { + TORCH_CHECK(serialized_output.size() == 1); + std::string serialized_output_type = serialized_output.begin().key(); + 
auto& serialized_output_val = serialized_output.begin().value(); + + auto& schema_return = schema_returns[output_index]; + const at::TypePtr& schema_return_type = schema_return.real_type(); + + switch (schema_return_type->kind()) { + case c10::TypeKind::TensorType: { + TORCH_CHECK( + serialized_output_type == "as_tensor", + "Expected extern kernel ", + serialized_node["target"], + " to have serialized output type as_tensor, ", + " but got ", + serialized_output_type); + outputs.emplace_back(output_index, DynamicArgType::TensorType, 1); + break; + } + case c10::TypeKind::ListType: { + if (schema_return_type->isSubtypeOf(at::ListType::ofTensors())) { + TORCH_CHECK( + serialized_output_type == "as_tensors", + "Expected extern kernel ", + serialized_node["target"], + " to have serialized output type as_tensors, ", + " but got ", + serialized_output_type); + outputs.emplace_back( + output_index, + DynamicArgType::ListTensorType, + serialized_output_val.size()); + } else { + TORCH_CHECK( + false, + "Unsupported return list type ", + schema_return_type->repr_str()); + } + break; + } + default: { + TORCH_CHECK( + false, + "Unsupported return type ", + schema_return_type->repr_str(), + " for extern kernel ", + op_kernel.target_); + } } - break; - } - default: { - TORCH_CHECK( - false, - "Unsupported return type ", - schema_return_type->repr_str(), - " for extern kernel ", - op_kernel.target_); - } - } - output_index++; - } + output_index++; + } } -OSSProxyExecutorNpu::OSSProxyExecutorNpu(const std::string& json_path, bool is_cpu) { - if (is_cpu) { - device_ = std::make_unique(c10::DeviceType::CPU); - } else { - int device_idx = -1; - device_ = std::make_unique(c10::DeviceType::CUDA, device_idx); - } +OSSProxyExecutorNpu::OSSProxyExecutorNpu(const std::string& json_path, bool is_cpu) +{ + if (is_cpu) { + device_ = std::make_unique(c10::DeviceType::CPU); + } else { + int device_idx = -1; + device_ = std::make_unique(c10::DeviceType::CUDA, device_idx); + } - std::string extern_kernel_nodes_serialized; + std::string extern_kernel_nodes_serialized; - std::ifstream json_file(json_path); - TORCH_CHECK(json_file.is_open(), "Unable to open file ", json_path); + std::ifstream json_file(json_path); + TORCH_CHECK(json_file.is_open(), "Unable to open file ", json_path); - // Parse file into a json object - nlohmann::json json_obj; - json_file >> json_obj; + // Parse file into a json object + nlohmann::json json_obj; + json_file >> json_obj; - // Access data - for (auto const& serialized_extern_node : json_obj["nodes"]) { - auto const& serialized_node = serialized_extern_node["node"]; + // Access data + for (auto const& serialized_extern_node : json_obj["nodes"]) { + auto const& serialized_node = serialized_extern_node["node"]; - const std::string& target = serialized_node["target"]; + const std::string& target = serialized_node["target"]; - std::string opName; - std::string overloadName; - size_t pos = target.find('.'); - if (pos == std::string::npos) { - opName = target; - overloadName = ""; - } else { - // There should be no more periods - size_t pos2 = target.find('.', pos + 1); - TORCH_CHECK(pos2 == std::string::npos); + std::string opName; + std::string overloadName; + size_t pos = target.find('.'); + if (pos == std::string::npos) { + opName = target; + overloadName = ""; + } else { + // There should be no more periods + size_t pos2 = target.find('.', pos + 1); + TORCH_CHECK(pos2 == std::string::npos); - opName = target.substr(0, pos); - overloadName = target.substr(pos + 1, target.length() - pos); - } + 
opName = target.substr(0, pos); + overloadName = target.substr(pos + 1, target.length() - pos); + } - c10::OperatorHandle op_handle = - c10::Dispatcher::singleton().findSchemaOrThrow( - opName.c_str(), overloadName.c_str()); - const c10::FunctionSchema& schema = op_handle.schema(); + c10::OperatorHandle op_handle = + c10::Dispatcher::singleton().findSchemaOrThrow( + opName.c_str(), overloadName.c_str()); + const c10::FunctionSchema& schema = op_handle.schema(); - const auto& schema_args = schema.arguments(); - const auto& schema_returns = schema.returns(); + const auto& schema_args = schema.arguments(); + const auto& schema_returns = schema.returns(); - OSSOpKernel op_kernel(target, op_handle); - get_input_info_from_serialized(schema_args, serialized_node, op_kernel); - get_output_info_from_serialized(schema_returns, serialized_node, op_kernel); + OSSOpKernel op_kernel(target, op_handle); + get_input_info_from_serialized(schema_args, serialized_node, op_kernel); + get_output_info_from_serialized(schema_returns, serialized_node, op_kernel); - op_kernels_.emplace_back(std::move(op_kernel)); - } + op_kernels_.emplace_back(std::move(op_kernel)); + } } void OSSProxyExecutorNpu::call_function( @@ -494,134 +499,135 @@ void OSSProxyExecutorNpu::call_function( int num_ints, int64_t* flatten_int_args, int num_tensors, - AtenTensorHandle* flatten_tensor_args) { - TORCH_CHECK( - extern_node_index < static_cast(op_kernels_.size()), - "Invalid extern node index"); - OSSOpKernel& op_kernel = op_kernels_[extern_node_index]; - - std::vector stack = op_kernel.stack_; - auto& dynamic_args = op_kernel.dynamic_args_; - - int tensor_id = 0; - int int_id = 0; - for (auto& dynamic_arg : dynamic_args) { - int arg_index = dynamic_arg.arg_index; - DynamicArgType dynamic_arg_type = dynamic_arg.arg_type; - int length = dynamic_arg.length; - - if (length == 0) { - continue; - } - - switch (dynamic_arg_type) { - case DynamicArgType::TensorType: { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - stack[arg_index] = *tensor; - break; - } - case DynamicArgType::IntType: { - int64_t val = flatten_int_args[int_id++]; - stack[arg_index] = val; - break; - } - case DynamicArgType::ListTensorType: { - std::vector tensor_list; - for (int j = 0; j < length; j++) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - tensor_list.push_back(*tensor); + AtenTensorHandle* flatten_tensor_args) +{ + TORCH_CHECK( + extern_node_index < static_cast(op_kernels_.size()), + "Invalid extern node index"); + OSSOpKernel& op_kernel = op_kernels_[extern_node_index]; + + std::vector stack = op_kernel.stack_; + auto& dynamic_args = op_kernel.dynamic_args_; + + int tensor_id = 0; + int int_id = 0; + for (auto& dynamic_arg : dynamic_args) { + int arg_index = dynamic_arg.arg_index; + DynamicArgType dynamic_arg_type = dynamic_arg.arg_type; + int length = dynamic_arg.length; + + if (length == 0) { + continue; } - stack[arg_index] = tensor_list; - break; - } - case DynamicArgType::ListOptionalTensorType: { - std::vector> optional_tensor_list; - auto& list_item_types = dynamic_arg.list_item_types; - TORCH_CHECK( - list_item_types.has_value(), - "Could not find list of item types for optional tensor list input"); - - for (const std::string& item_type : list_item_types.value()) { - if (item_type == "as_tensor") { - at::Tensor* tensor = tensor_handle_to_tensor_pointer( - flatten_tensor_args[tensor_id++]); - optional_tensor_list.emplace_back(*tensor); - } else if (item_type == 
"as_none") { - optional_tensor_list.emplace_back(std::nullopt); - } - } - stack[arg_index] = optional_tensor_list; - break; - } - case DynamicArgType::ListIntType: { - std::vector vals; - vals.reserve(length); - for (int j = 0; j < length; j++) { - vals.push_back(flatten_int_args[int_id++]); + + switch (dynamic_arg_type) { + case DynamicArgType::TensorType: { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + stack[arg_index] = *tensor; + break; + } + case DynamicArgType::IntType: { + int64_t val = flatten_int_args[int_id++]; + stack[arg_index] = val; + break; + } + case DynamicArgType::ListTensorType: { + std::vector tensor_list; + for (int j = 0; j < length; j++) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + tensor_list.push_back(*tensor); + } + stack[arg_index] = tensor_list; + break; + } + case DynamicArgType::ListOptionalTensorType: { + std::vector> optional_tensor_list; + auto& list_item_types = dynamic_arg.list_item_types; + TORCH_CHECK( + list_item_types.has_value(), + "Could not find list of item types for optional tensor list input"); + + for (const std::string& item_type : list_item_types.value()) { + if (item_type == "as_tensor") { + at::Tensor* tensor = tensor_handle_to_tensor_pointer( + flatten_tensor_args[tensor_id++]); + optional_tensor_list.emplace_back(*tensor); + } else if (item_type == "as_none") { + optional_tensor_list.emplace_back(std::nullopt); + } + } + stack[arg_index] = optional_tensor_list; + break; + } + case DynamicArgType::ListIntType: { + std::vector vals; + vals.reserve(length); + for (int j = 0; j < length; j++) { + vals.push_back(flatten_int_args[int_id++]); + } + stack[arg_index] = vals; + break; + } + default: + TORCH_CHECK(false, "Unsupported dynamic arg type: ", dynamic_arg_type); } - stack[arg_index] = vals; - break; - } - default: - TORCH_CHECK(false, "Unsupported dynamic arg type: ", dynamic_arg_type); } - } - - int num_output_tensors = op_kernel.num_output_tensors(); - TORCH_CHECK( - tensor_id == num_tensors - num_output_tensors, - "Mismatch between tensors consumed and num of input tensor, got tensor_id = .", - tensor_id, - ", expected num = ", - num_tensors - num_output_tensors); - TORCH_CHECK( - int_id == num_ints, - "Mismatch between ints consumed and num_ints, got int_id = ", - int_id, - ", num_ints = ", - num_ints); - - // Call the op with the prepared stack. 
- const c10::OperatorHandle& op = op_kernel.op_handle_; - op.callBoxed(stack); - - const c10::FunctionSchema& schema = op.schema(); - const auto& schema_returns = schema.returns(); - - TORCH_CHECK(op_kernel.outputs_.size() == stack.size()); - TORCH_CHECK(stack.size() == schema_returns.size()); - - int index = 0; - for (const auto& schema_return : schema_returns) { - if (schema_return.type()->kind() == c10::TypeKind::TensorType) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - *tensor = stack[index++].toTensor(); - } else if ( - schema_return.type()->kind() == c10::TypeKind::ListType && - schema_return.type()->isSubtypeOf(at::ListType::ofTensors())) { - auto tensors = stack[index++].toTensorList(); - for (auto&& t : tensors) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - *tensor = t; - } - } else { - TORCH_CHECK( - false, - "NYI: Unsupported return type for schema: ", - schema_return.type()->repr_str()); + + int num_output_tensors = op_kernel.num_output_tensors(); + TORCH_CHECK( + tensor_id == num_tensors - num_output_tensors, + "Mismatch between tensors consumed and num of input tensor, got tensor_id = .", + tensor_id, + ", expected num = ", + num_tensors - num_output_tensors); + TORCH_CHECK( + int_id == num_ints, + "Mismatch between ints consumed and num_ints, got int_id = ", + int_id, + ", num_ints = ", + num_ints); + + // Call the op with the prepared stack. + const c10::OperatorHandle& op = op_kernel.op_handle_; + op.callBoxed(stack); + + const c10::FunctionSchema& schema = op.schema(); + const auto& schema_returns = schema.returns(); + + TORCH_CHECK(op_kernel.outputs_.size() == stack.size()); + TORCH_CHECK(stack.size() == schema_returns.size()); + + int index = 0; + for (const auto& schema_return : schema_returns) { + if (schema_return.type()->kind() == c10::TypeKind::TensorType) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + *tensor = stack[index++].toTensor(); + } else if ( + schema_return.type()->kind() == c10::TypeKind::ListType && + schema_return.type()->isSubtypeOf(at::ListType::ofTensors())) { + auto tensors = stack[index++].toTensorList(); + for (auto&& t : tensors) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + *tensor = t; + } + } else { + TORCH_CHECK( + false, + "NYI: Unsupported return type for schema: ", + schema_return.type()->repr_str()); + } } - } - - TORCH_CHECK( - tensor_id == num_tensors, - "Mismatch between tensors consumed and num_tensors, got tensor_id = ", - tensor_id, - ", expected num = ", - num_tensors); + + TORCH_CHECK( + tensor_id == num_tensors, + "Mismatch between tensors consumed and num_tensors, got tensor_id = ", + tensor_id, + ", expected num = ", + num_tensors); } -} // namespace torch::aot_inductor +} // namespace torch::aot_inductor \ No newline at end of file -- Gitee From 98cf06bf670668bba5a724c630b64952c4aa7418 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:40:41 +0000 Subject: [PATCH 04/11] TensorType Signed-off-by: zjx.com --- torch_npu/csrc/utils/TensorType.cpp | 307 ++++++++++++++-------------- 1 file changed, 149 insertions(+), 158 deletions(-) diff --git a/torch_npu/csrc/utils/TensorType.cpp b/torch_npu/csrc/utils/TensorType.cpp index aeb6fd8b832..8a05d271aab 100644 --- a/torch_npu/csrc/utils/TensorType.cpp +++ b/torch_npu/csrc/utils/TensorType.cpp @@ -14,12 +14,11 @@ std::vector> all_declared_types_npu() { std::vector> ret; // 
can't easily iterate over enum classes, does not support BFloat16 now - std::vector backends = { c10::Backend::PrivateUse1 }; + std::vector backends = {c10::Backend::PrivateUse1}; std::vector scalar_types = { ScalarType::Byte, ScalarType::Char, ScalarType::Double, ScalarType::Float, ScalarType::Int, ScalarType::Long, ScalarType::Short, ScalarType::Half, - ScalarType::Bool, ScalarType::BFloat16 - }; + ScalarType::Bool, ScalarType::BFloat16}; for (auto& backend : backends) { for (auto& scalar_type : scalar_types) { @@ -39,10 +38,7 @@ struct PyTensorType { int backend; int scalar_type; - Backend get_backend() const - { - return static_cast(backend); - } + Backend get_backend() const { return static_cast(backend); } DispatchKey get_dispatch_key() const { @@ -59,7 +55,7 @@ static_assert(std::is_standard_layout::value, "PyTensorType must b static void py_bind_tensor_types(const std::vector& tensor_types); -static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) +static PyObject* Tensor_new(PyTypeObject* type, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS auto& tensor_type = *((PyTensorType*)type); @@ -76,112 +72,108 @@ static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs " not available. Torch not compiled with npu enabled.", PTA_ERROR(ErrCode::TYPE)) torch_npu::utils::npu_lazy_init(); return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), - tensor_type.get_scalar_type(), - args, - kwargs)); + tensor_type.get_scalar_type(), + args, + kwargs)); END_HANDLE_TH_ERRORS } static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg) { - HANDLE_TH_ERRORS - auto self = (PyTensorType*)_self; - if (THPVariable_Check(arg)) { - const auto& var = THPVariable_Unpack(arg); - - if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && - var.scalar_type() == static_cast(self->scalar_type)) { - Py_RETURN_TRUE; + HANDLE_TH_ERRORS + auto self = (PyTensorType*)_self; + if (THPVariable_Check(arg)) { + const auto& var = THPVariable_Unpack(arg); + + if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() && + var.scalar_type() == static_cast(self->scalar_type)) { + Py_RETURN_TRUE; + } } - } - Py_RETURN_FALSE; - END_HANDLE_TH_ERRORS + Py_RETURN_FALSE; + END_HANDLE_TH_ERRORS } -PyObject* Tensor_dtype(PyTensorType* self, void *unused) +PyObject* Tensor_dtype(PyTensorType* self, void* unused) { - return torch::autograd::utils::wrap(self->dtype); + return torch::autograd::utils::wrap(self->dtype); } -PyObject* Tensor_layout(PyTensorType* self, void *unused) +PyObject* Tensor_layout(PyTensorType* self, void* unused) { - return torch::autograd::utils::wrap(self->layout); + return torch::autograd::utils::wrap(self->layout); } -PyObject* Tensor_is_npu(PyTensorType* self, void *unused) +PyObject* Tensor_is_npu(PyTensorType* self, void* unused) { - if (self->is_npu) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } + if (self->is_npu) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } } -PyObject* Tensor_is_sparse(PyTensorType *self, void *unused) +PyObject* Tensor_is_sparse(PyTensorType* self, void* unused) { - if (self->layout->layout == at::Layout::Strided) { - Py_RETURN_FALSE; - } else { - Py_RETURN_TRUE; - } + if (self->layout->layout == at::Layout::Strided) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } } static struct PyMethodDef metaclass_methods[] = { {"__instancecheck__", Tensor_instancecheck, METH_O, nullptr}, - {nullptr} -}; + {nullptr}}; -using getter = PyObject* 
(*)(PyObject *, void *); +using getter = PyObject* (*)(PyObject*, void*); static struct PyGetSetDef metaclass_properties[] = { - {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, - {"layout", (getter)Tensor_layout, nullptr, nullptr, nullptr}, - {"is_npu", (getter)Tensor_is_npu, nullptr, nullptr, nullptr}, - {"is_sparse", (getter)Tensor_is_sparse, nullptr, nullptr, nullptr}, - {nullptr} -}; + {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, + {"layout", (getter)Tensor_layout, nullptr, nullptr, nullptr}, + {"is_npu", (getter)Tensor_is_npu, nullptr, nullptr, nullptr}, + {"is_sparse", (getter)Tensor_is_sparse, nullptr, nullptr, nullptr}, + {nullptr}}; static PyTypeObject metaclass = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch.tensortype", /* tp_name */ - sizeof(PyTypeObject) /* tp_basicsize */ + PyVarObject_HEAD_INIT(nullptr, 0) "torch.tensortype", /* tp_name */ + sizeof(PyTypeObject) /* tp_basicsize */ }; static void py_initialize_metaclass(PyTypeObject& metaclass) { - metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; - metaclass.tp_methods = metaclass_methods; - metaclass.tp_getset = metaclass_properties; - metaclass.tp_base = &PyType_Type; - if (PyType_Ready(&metaclass) < 0) { - throw python_error(); - } + metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + metaclass.tp_methods = metaclass_methods; + metaclass.tp_getset = metaclass_properties; + metaclass.tp_base = &PyType_Type; + if (PyType_Ready(&metaclass) < 0) { + throw python_error(); + } } static PyTypeObject tensor_type_prototype = { - PyVarObject_HEAD_INIT(&metaclass, 0) - nullptr, /* tp_name */ - sizeof(PyTensorType) /* tp_basicsize */ + PyVarObject_HEAD_INIT(&metaclass, 0) nullptr, /* tp_name */ + sizeof(PyTensorType) /* tp_basicsize */ }; static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) { - // NOTE: we don't use the typical static declaration of PyTypeObject because - // we need to initialize as many types as there are VariableType instances. - // We copy the basic object fields from a prototype definition and initialize - // the remaining fields below. - memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); - // Subclassing from torch.Tensor isn't supported. - // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. - type.tp_flags = Py_TPFLAGS_DEFAULT; - type.tp_name = name; - type.tp_new = Tensor_new; - if (PyType_Ready(&type) < 0) { - throw python_error(); - } - if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { - throw python_error(); - } + // NOTE: we don't use the typical static declaration of PyTypeObject because + // we need to initialize as many types as there are VariableType instances. + // We copy the basic object fields from a prototype definition and initialize + // the remaining fields below. + memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); + // Subclassing from torch.Tensor isn't supported. + // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. + type.tp_flags = Py_TPFLAGS_DEFAULT; + type.tp_name = name; + type.tp_new = Tensor_new; + if (PyType_Ready(&type) < 0) { + throw python_error(); + } + if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { + throw python_error(); + } } static std::string get_module(Backend backend) @@ -204,100 +196,100 @@ static std::string get_module(Backend backend) static std::string get_name(Backend backend, ScalarType scalarType) { - std::ostringstream ss; - ss << get_module(backend) << "." 
<< toString(scalarType) << "Tensor"; - return ss.str(); + std::ostringstream ss; + ss << get_module(backend) << "." << toString(scalarType) << "Tensor"; + return ss.str(); } static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) { - // This field is lazily initialized from backend and scalar_type - type_obj.backend = static_cast(backend); - type_obj.scalar_type = static_cast(scalarType); - type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); - type_obj.dtype = torch::getTHPDtype(scalarType); - type_obj.is_npu = (backend == c10::Backend::PrivateUse1); + // This field is lazily initialized from backend and scalar_type + type_obj.backend = static_cast(backend); + type_obj.scalar_type = static_cast(scalarType); + type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); + type_obj.dtype = torch::getTHPDtype(scalarType); + type_obj.is_npu = (backend == c10::Backend::PrivateUse1); } static void set_name(PyTensorType& type_obj, const std::string& name) { - size_t n = sizeof(type_obj.name); - strncpy(type_obj.name, name.c_str(), n); - type_obj.name[n - 1] = '\0'; + size_t n = sizeof(type_obj.name); + strncpy(type_obj.name, name.c_str(), n); + type_obj.name[n - 1] = '\0'; } static THPObjectPtr get_tensor_dict() { - auto torch = THPObjectPtr(PyImport_ImportModule("torch")); - if (!torch) { - throw python_error(); - } - - auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); - if (!tensor_class) { - throw python_error(); - } - - auto tensor_type = (PyTypeObject*)tensor_class.get(); - TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); - - auto res = THPObjectPtr(PyDict_New()); - if (!res) { - throw python_error(); - } - - if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { - throw python_error(); - } - if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { - throw python_error(); - } - - return res; + auto torch = THPObjectPtr(PyImport_ImportModule("torch")); + if (!torch) { + throw python_error(); + } + + auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); + if (!tensor_class) { + throw python_error(); + } + + auto tensor_type = (PyTypeObject*)tensor_class.get(); + TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); + + auto res = THPObjectPtr(PyDict_New()); + if (!res) { + throw python_error(); + } + + if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { + throw python_error(); + } + if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { + throw python_error(); + } + + return res; } static std::vector tensor_types; static void initialize_npu_aten_types(std::vector& tensor_types) { - // only initialize npu types - auto declared_types = all_declared_types_npu(); - tensor_types.resize(declared_types.size()); - - for (size_t i = 0, end = declared_types.size(); i != end; i++) { - auto& tensor_type = tensor_types[i]; - Backend backend = declared_types[i].first; - ScalarType scalar_type = declared_types[i].second; - set_type(tensor_type, backend, scalar_type); - set_name(tensor_type, get_name(backend, scalar_type)); - } + // only initialize npu types + auto declared_types = all_declared_types_npu(); + tensor_types.resize(declared_types.size()); + + for (size_t i = 0, end = declared_types.size(); i != end; i++) { + auto& tensor_type = tensor_types[i]; + Backend backend = declared_types[i].first; + ScalarType scalar_type = declared_types[i].second; + set_type(tensor_type, 
backend, scalar_type); + set_name(tensor_type, get_name(backend, scalar_type)); + } } void _initialize_python_bindings() { - // Initialize the at::Type* pointers, name, and properties of the PyTensorType - // vector. After this call, the vector must not be resized. - initialize_npu_aten_types(tensor_types); - - // Initialize the Python metaclass for the torch.FloatTensor, etc. types. - // The metaclass handles __instancecheck__ checks and binds the dtype property - // on the type objects. - py_initialize_metaclass(metaclass); - - // Get the tp_dict of the Variable class. We copy function definitions - // onto each Tensor type object so that they can be accessed via e.g. - // `torch.npu.FloatTensor.add`. - auto tensor_dict = get_tensor_dict(); - - // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. - for (auto& tensor_type : tensor_types) { - py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); - } - - // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor - // is added to the `torch_npu` module as `FloatTensor`. Also add all the type - // objects to the set torch_npu._tensor_classes. - py_bind_tensor_types(tensor_types); + // Initialize the at::Type* pointers, name, and properties of the PyTensorType + // vector. After this call, the vector must not be resized. + initialize_npu_aten_types(tensor_types); + + // Initialize the Python metaclass for the torch.FloatTensor, etc. types. + // The metaclass handles __instancecheck__ checks and binds the dtype property + // on the type objects. + py_initialize_metaclass(metaclass); + + // Get the tp_dict of the Variable class. We copy function definitions + // onto each Tensor type object so that they can be accessed via e.g. + // `torch.npu.FloatTensor.add`. + auto tensor_dict = get_tensor_dict(); + + // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. + for (auto& tensor_type : tensor_types) { + py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); + } + + // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor + // is added to the `torch_npu` module as `FloatTensor`. Also add all the type + // objects to the set torch_npu._tensor_classes. + py_bind_tensor_types(tensor_types); } static void py_bind_tensor_types(const std::vector& tensor_types) @@ -335,23 +327,22 @@ static void py_bind_tensor_types(const std::vector& tensor_types) } // Callback for python part. 
Used for additional initialization of python classes -static PyObject* THPModule_initExtension(PyObject *_unused, PyObject *noargs) +static PyObject* THPModule_initExtension(PyObject* _unused, PyObject* noargs) { - HANDLE_TH_ERRORS - _initialize_python_bindings(); - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS + HANDLE_TH_ERRORS + _initialize_python_bindings(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } // autograd methods on torch._C static PyMethodDef TorchNpuExtensionMethods[] = { {"_initExtension", (PyCFunction)THPModule_initExtension, METH_NOARGS, nullptr}, - {nullptr, nullptr, 0, nullptr} -}; + {nullptr, nullptr, 0, nullptr}}; PyMethodDef* npu_extension_functions() { - return TorchNpuExtensionMethods; -} -} + return TorchNpuExtensionMethods; } +} // namespace utils +} // namespace torch_npu \ No newline at end of file -- Gitee From 41380bcef4f8254d5b7f56847156b070cc671a16 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:44:43 +0000 Subject: [PATCH 05/11] runner_npu Signed-off-by: zjx.com --- .../model_container_runner_npu.cpp | 89 ++++++++++--------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.cpp b/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.cpp index b995cd0ffe3..d333dacadf5 100644 --- a/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.cpp +++ b/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.cpp @@ -9,12 +9,13 @@ namespace fs = std::filesystem; #endif namespace { -bool file_exists(std::string& path) { +bool file_exists(std::string& path) +{ #ifdef _WIN32 - return fs::exists(path); + return fs::exists(path); #else - struct stat rc {}; - return lstat(path.c_str(), &rc) == 0; + struct stat rc {}; + return lstat(path.c_str(), &rc) == 0; #endif } } // namespace @@ -27,51 +28,59 @@ AOTIModelContainerRunnerNpu::AOTIModelContainerRunnerNpu( const std::string& device_str, const std::string& cubin_dir) : AOTIModelContainerRunner( - model_so_path, - num_models, - device_str, - cubin_dir) { - model_so_path_ = model_so_path; - init_flag_ = false; + model_so_path, + num_models, + device_str, + cubin_dir) +{ + model_so_path_ = model_so_path; + init_flag_ = false; } AOTIModelContainerRunnerNpu::~AOTIModelContainerRunnerNpu() = default; -void AOTIModelContainerRunnerNpu::init_proxy_executor() { - if (init_flag_) return; +void AOTIModelContainerRunnerNpu::init_proxy_executor() +{ + if (init_flag_) { + return; + } - init_flag_ = true; - size_t lastindex = model_so_path_.find_last_of('.'); - std::string json_filename = model_so_path_.substr(0, lastindex) + "_npu.json"; + init_flag_ = true; + size_t lastindex = model_so_path_.find_last_of('.'); + std::string json_filename = model_so_path_.substr(0, lastindex) + "_npu.json"; - if (file_exists(json_filename)) { - proxy_executor_npu_ = std::make_unique( - json_filename, false); - proxy_executor_handle_ = - reinterpret_cast(proxy_executor_npu_.get()); - } else { - proxy_executor_handle_ = nullptr; - } + if (file_exists(json_filename)) { + proxy_executor_npu_ = std::make_unique( + json_filename, false); + proxy_executor_handle_ = + reinterpret_cast(proxy_executor_npu_.get()); + } else { + proxy_executor_handle_ = nullptr; + } } std::vector AOTIModelContainerRunnerNpu::run( - const std::vector& inputs, void* stream_handle) { - init_proxy_executor(); - c10_npu::NPUStream npu_stream = c10_npu::getCurrentNPUStream(); - return AOTIModelContainerRunner::run( - inputs, reinterpret_cast(npu_stream.stream())); + const 
std::vector& inputs, + void* stream_handle) +{ + init_proxy_executor(); + c10_npu::NPUStream npu_stream = c10_npu::getCurrentNPUStream(); + return AOTIModelContainerRunner::run( + inputs, reinterpret_cast(npu_stream.stream())); } std::vector AOTIModelContainerRunnerNpu::run_with_npu_stream( std::vector& inputs, - c10_npu::NPUStream npu_stream) { - return AOTIModelContainerRunner::run( - inputs, reinterpret_cast(npu_stream.stream())); + c10_npu::NPUStream npu_stream) +{ + return AOTIModelContainerRunner::run( + inputs, reinterpret_cast(npu_stream.stream())); } -void AOTIModelContainerRunnerNpu::set_proxy_executor(AOTIProxyExecutorHandle handle) { - proxy_executor_handle_ = handle; - init_flag_ = true; +void AOTIModelContainerRunnerNpu::set_proxy_executor(AOTIProxyExecutorHandle handle) +{ + proxy_executor_handle_ = handle; + init_flag_ = true; } namespace { @@ -79,13 +88,13 @@ std::unique_ptr create_aoti_runner_npu( const std::string& model_so_path, size_t num_models, const std::string& device_str, - const std::string& cubin_dir) { - return std::make_unique( - model_so_path, num_models, device_str, cubin_dir); + const std::string& cubin_dir) +{ + return std::make_unique( + model_so_path, num_models, device_str, cubin_dir); } RegisterAOTIModelRunner register_npu_runner("npu", &create_aoti_runner_npu); -} - +} // namespace -} // namespace torch::inductor +} // namespace torch::inductor \ No newline at end of file -- Gitee From d3b8b7a6723a9744cecaf6ab978020afcc29939f Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:46:35 +0000 Subject: [PATCH 06/11] ForceAclnnList Signed-off-by: zjx.com --- .../csrc/framework/utils/ForceAclnnList.cpp | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp index c6b0e64641a..0bdf4393e0c 100644 --- a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp +++ b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp @@ -19,34 +19,36 @@ namespace at_npu { namespace native { -void ForceAclnn::RegisterOp(const std::string &list) { - if (list.empty()) { - return; - } +void ForceAclnn::RegisterOp(const std::string& list) +{ + if (list.empty()) { + return; + } - auto value = list; - std::string delimiter = ","; - auto start = 0U; - auto end = value.find(delimiter); - std::string token; - while (end != std::string::npos) { + auto value = list; + std::string delimiter = ","; + auto start = 0U; + auto end = value.find(delimiter); + std::string token; + while (end != std::string::npos) { + token = value.substr(start, end - start); + if (!token.empty()) { + force_aclnn_op_list_.insert(token); + } + start = end + delimiter.size(); + end = value.find(delimiter, start); + } token = value.substr(start, end - start); if (!token.empty()) { - force_aclnn_op_list_.insert(token); + force_aclnn_op_list_.insert(token); } - start = end + delimiter.size(); - end = value.find(delimiter, start); - } - token = value.substr(start, end - start); - if (!token.empty()) { - force_aclnn_op_list_.insert(token); - } - return; + return; } -bool ForceAclnn::IsForceAclnnOp(const std::string &op_name) const { - bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); - return ret; +bool ForceAclnn::IsForceAclnnOp(const std::string& op_name) const +{ + bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); + return ret; } } // namespace native } // namespace at_npu \ No newline at end of file -- Gitee From 
8710a02447c41f99a6a75e9e7aeea5b15aa14972 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:51:07 +0000 Subject: [PATCH 07/11] shim_npu Signed-off-by: zjx.com --- .../csrc/inductor/aoti_torch/shim_npu.cpp | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_torch/shim_npu.cpp b/torch_npu/csrc/inductor/aoti_torch/shim_npu.cpp index 95f5a807ae4..3139135c028 100644 --- a/torch_npu/csrc/inductor/aoti_torch/shim_npu.cpp +++ b/torch_npu/csrc/inductor/aoti_torch/shim_npu.cpp @@ -12,7 +12,8 @@ #ifdef __cplusplus extern "C" { #endif -int32_t aoti_torch_device_type_npu() { +int32_t aoti_torch_device_type_npu() +{ return (int32_t)c10::DeviceType::PrivateUse1; } @@ -21,15 +22,16 @@ int32_t aoti_torch_device_type_npu() { #endif namespace { - static c10::Device c10_device(int32_t device_type, int32_t device_index) { - if (device_type == aoti_torch_device_type_cpu()) { - return c10::Device(static_cast(device_type)); - } else { - return c10::Device( - static_cast(device_type), - static_cast(device_index)); - } +static c10::Device c10_device(int32_t device_type, int32_t device_index) +{ + if (device_type == aoti_torch_device_type_cpu()) { + return c10::Device(static_cast(device_type)); + } else { + return c10::Device( + static_cast(device_type), + static_cast(device_index)); } +} } // namespace AOTITorchError aoti_torch_create_tensor_from_blob_npu( @@ -41,18 +43,19 @@ AOTITorchError aoti_torch_create_tensor_from_blob_npu( int32_t dtype, int32_t device_type, int32_t device_index, - AtenTensorHandle* ret_new_tensor) { - AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ - c10::IntArrayRef sizes(sizes_ptr, ndim); - c10::IntArrayRef strides(strides_ptr, ndim); - c10::Device device = c10_device(device_type, device_index); - c10::TensorOptions options = c10::TensorOptions().device(device).dtype( - static_cast(dtype)); - *ret_new_tensor = torch::aot_inductor::new_tensor_handle( - // data == nullptr can happen for a 0-size tensor - (data != nullptr) ? at_npu::native::from_blob(data, sizes, strides, storage_offset, options, device) - : at::empty_strided(sizes, strides, options)); - }); + AtenTensorHandle* ret_new_tensor) +{ + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + c10::IntArrayRef sizes(sizes_ptr, ndim); + c10::IntArrayRef strides(strides_ptr, ndim); + c10::Device device = c10_device(device_type, device_index); + c10::TensorOptions options = c10::TensorOptions().device(device).dtype( + static_cast(dtype)); + *ret_new_tensor = torch::aot_inductor::new_tensor_handle( + // data == nullptr can happen for a 0-size tensor + (data != nullptr) ? 
at_npu::native::from_blob(data, sizes, strides, storage_offset, options, device) + : at::empty_strided(sizes, strides, options)); + }); } AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2( @@ -67,7 +70,8 @@ AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2( AtenTensorHandle* ret_new_tensor, int32_t layout, const uint8_t* opaque_metadata, - int64_t opaque_metadata_size) { + int64_t opaque_metadata_size) +{ AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ if (layout == static_cast(at::kMkldnn)) { throw std::runtime_error("do not support mkldnn on npu."); -- Gitee From 8120e6ad6b15dcc63c4d19571180dbc5d61cf6a6 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:52:26 +0000 Subject: [PATCH 08/11] Init Signed-off-by: zjx.com --- torch_npu/csrc/distributed/Init.cpp | 282 ++++++++++++++-------------- 1 file changed, 144 insertions(+), 138 deletions(-) diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 99c6dc6f22a..19ed1289a40 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -47,7 +47,8 @@ public: : impl_(std::move(impl)) {} explicit IntrusivePtrNoGilDestructor(T* impl) : impl_(c10::intrusive_ptr::unsafe_steal_from_new(impl)) {} - ~IntrusivePtrNoGilDestructor() { + ~IntrusivePtrNoGilDestructor() + { if (impl_) { if (PyGILState_Check() != 0) { pybind11::gil_scoped_release release; @@ -57,19 +58,24 @@ public: } } } - T& operator*() const noexcept { + T& operator*() const noexcept + { return *impl_; } - T* operator->() const noexcept { + T* operator->() const noexcept + { return impl_.get(); } - C10_NODISCARD T* get() const noexcept { + C10_NODISCARD T* get() const noexcept + { return impl_.get(); } - void reset() noexcept { + void reset() noexcept + { impl_.reset(); } - operator bool() const noexcept { + operator bool() const noexcept + { return impl_; } }; @@ -97,7 +103,7 @@ class BroadcastWork { public: inline std::vector cast_tensors(at::TensorList tensors) const { - static auto cast_back_to_ori_format = [](const at::Tensor &t) { + static auto cast_back_to_ori_format = [](const at::Tensor& t) { return at_npu::native::custom_ops::npu_format_cast(t, torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); }; return c10::fmap(tensors, cast_back_to_ori_format); @@ -109,7 +115,8 @@ public: int root_rank = 0) : bucket_tensors_(std::move(bucket_tensors)), cast_tensors_(cast_tensors(bucket_tensors_)), - flat_tensor_({torch::utils::flatten_dense_tensors(cast_tensors_)}) { + flat_tensor_({torch::utils::flatten_dense_tensors(cast_tensors_)}) + { c10d::BroadcastOptions broadcastOptions; broadcastOptions.rootRank = root_rank; work_ = process_group->broadcast(flat_tensor_, broadcastOptions); @@ -141,7 +148,6 @@ protected: std::vector flat_tensor_; private: - // The broadcast work that is kicked off upon construction. 
c10::intrusive_ptr work_; }; @@ -210,17 +216,17 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) throw python_error(); } auto torch_npu_C_m = py::handle(torch_npu_C_module).cast(); - + auto m = torch_npu_C_m.def_submodule("_distributed_c10d", "distributed c10d bindings"); auto module = py::handle(m).cast(); module.def("_compute_bucket_assignment_by_size", [](const std::vector& tensors, - const std::vector& bucket_size_limits, - const std::vector& expect_sparse_gradient, - const std::vector& tensor_indices, - const c10::optional>& logger) { + const std::vector& bucket_size_limits, + const std::vector& expect_sparse_gradient, + const std::vector& tensor_indices, + const c10::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); return ::c10d_npu::compute_bucket_assignment_by_size(tensors, bucket_size_limits, expect_sparse_gradient, tensor_indices, {logger_weakref}); @@ -237,8 +243,8 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) module.def("_verify_params_across_processes", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& process_group, - const std::vector& params, - const c10::optional>& logger) { + const std::vector& params, + const c10::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); c10d_npu::verify_params_across_processes(process_group, params, {logger_weakref}); @@ -254,23 +260,23 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) module .def("_register_comm_hook", &_register_comm_hook, - py::arg("reducer"), - py::arg("state"), - py::arg("comm_hook"), - py::call_guard()) + py::arg("reducer"), + py::arg("state"), + py::arg("comm_hook"), + py::call_guard()) .def("_register_builtin_comm_hook", &_register_builtin_comm_hook, - py::arg("reducer"), - py::arg("comm_hook_type")); + py::arg("reducer"), + py::arg("comm_hook_type")); module.def("_broadcast_coalesced", // Define a lambda such that the pybind11 prototype can take a std::vector // for the tensor list argument, but still pass it to the underlying // function as a c10::ArrayRef. 
[](c10::intrusive_ptr<::c10d::ProcessGroup> process_group, - std::vector tensors, // NOLINT - size_t buffer_size, - int rank) { + std::vector tensors, // NOLINT + size_t buffer_size, + int rank) { torch_npu::distributed::broadcast_coalesced( std::move(process_group), tensors, buffer_size, rank); }, @@ -285,76 +291,76 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) shared_ptr_class_(module, "Reducer") .def(py::init< - std::vector, - std::vector>, - std::vector, - c10::intrusive_ptr<::c10d::ProcessGroup>, - std::vector, - int64_t, - bool, - bool, - std::unordered_map, - int64_t>(), - py::arg("params"), - py::arg("bucket_indices"), - py::arg("per_bucket_size_limits"), - py::arg("process_group"), - py::arg("expect_sparse_gradients") = std::vector(), - py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap, - py::arg("find_unused_parameters") = false, - py::arg("gradient_as_bucket_view") = false, - py::arg("param_to_name_mapping") = + std::vector, + std::vector>, + std::vector, + c10::intrusive_ptr<::c10d::ProcessGroup>, + std::vector, + int64_t, + bool, + bool, + std::unordered_map, + int64_t>(), + py::arg("params"), + py::arg("bucket_indices"), + py::arg("per_bucket_size_limits"), + py::arg("process_group"), + py::arg("expect_sparse_gradients") = std::vector(), + py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap, + py::arg("find_unused_parameters") = false, + py::arg("gradient_as_bucket_view") = false, + py::arg("param_to_name_mapping") = std::unordered_map(), - py::arg("first_bucket_bytes_cap") = ::c10d::kDefaultFirstBucketBytes, - py::call_guard()) + py::arg("first_bucket_bytes_cap") = ::c10d::kDefaultFirstBucketBytes, + py::call_guard()) .def("prepare_for_forward", &c10d_npu::Reducer::prepare_for_forward, - py::call_guard()) + py::call_guard()) .def("prepare_for_backward", &c10d_npu::Reducer::prepare_for_backward, - py::call_guard()) + py::call_guard()) .def("prepare_for_backward", [](c10d_npu::Reducer& reducer, const at::Tensor& output) -> void { reducer.prepare_for_backward({output}); }, - py::call_guard()) + py::call_guard()) .def("get_backward_stats", &c10d_npu::Reducer::get_backward_stats) .def("_install_post_backward_futures", [](::c10d_npu::Reducer& reducer, const std::vector>& futs) { - c10::List> futures(c10::FutureType::create(c10::TensorType::get())); - for (const auto &fut : futs) { - futures.push_back(fut->fut); - } - reducer.install_futures(std::move(futures)); - }, - py::call_guard()) + c10::List> futures(c10::FutureType::create(c10::TensorType::get())); + for (const auto& fut : futs) { + futures.push_back(fut->fut); + } + reducer.install_futures(std::move(futures)); + }, + py::call_guard()) .def("_rebuild_buckets", &::c10d_npu::Reducer::rebuild_buckets, - py::call_guard()) + py::call_guard()) .def("_get_zeros_like_grad_buckets", [](::c10d_npu::Reducer& reducer) { return reducer.get_grad_buckets(true); }, - py::call_guard()) + py::call_guard()) .def("_push_all_rebuilt_params", &::c10d_npu::Reducer::push_rebuilt_params_for_all_indices, - py::call_guard()) + py::call_guard()) .def("_set_forward_pass_work_handle", &::c10d_npu::Reducer::set_forward_pass_work_handle, - py::call_guard()) + py::call_guard()) .def("_get_local_used_map", &::c10d_npu::Reducer::get_local_used_map_on_device) .def("_set_ddp_runtime_logging_sample_rate", &::c10d_npu::Reducer::set_ddp_runtime_logging_sample_rate, - py::arg("sample_rate"), - py::call_guard()) + py::arg("sample_rate"), + py::call_guard()) .def("_set_static_graph", &::c10d_npu::Reducer::set_static_graph, - 
py::call_guard()) + py::call_guard()) .def("_ddp_graph_static", &::c10d_npu::Reducer::ddp_graph_static, - py::call_guard()) + py::call_guard()) .def("_delay_all_reduce", &::c10d_npu::Reducer::delay_all_reduce, - py::call_guard()) + py::call_guard()) .def("_run_comm_hook", [](::c10d_npu::Reducer& reducer, ::c10d::GradBucket& bucket) -> std::shared_ptr { @@ -362,10 +368,10 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) reducer.run_comm_hook(bucket); return std::make_shared(fut); }, - py::call_guard()) + py::call_guard()) .def("set_logger", [](::c10d_npu::Reducer& reducer, - const std::shared_ptr<::c10d::Logger> logger) { + const std::shared_ptr<::c10d::Logger> logger) { std::weak_ptr<::c10d::Logger> logger_weakref = logger; reducer.set_logger(logger_weakref); }); @@ -374,40 +380,40 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) auto processGroupHCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupHCCL>( module, "ProcessGroupHCCL", dist.attr("Backend")) .def(py::init&, - int, - int, - c10::intrusive_ptr<::c10d_npu::ProcessGroupHCCL::Options>>(), - py::call_guard()) + int, + int, + c10::intrusive_ptr<::c10d_npu::ProcessGroupHCCL::Options>>(), + py::call_guard()) .def(py::init([](const c10::intrusive_ptr<::c10d::Store>& store, - int rank, - int size, - const std::chrono::milliseconds& timeout) { - auto options = ::c10d_npu::ProcessGroupHCCL::Options::create(); - options->is_high_priority_stream = false; - options->timeout = timeout; - return c10::make_intrusive<::c10d_npu::ProcessGroupHCCL>( - store, rank, size, options); - }), - py::arg("store"), - py::arg("rank"), - py::arg("size"), - py::arg("timeout") = kProcessGroupDefaultTimeout, - py::call_guard()) + int rank, + int size, + const std::chrono::milliseconds& timeout) { + auto options = ::c10d_npu::ProcessGroupHCCL::Options::create(); + options->is_high_priority_stream = false; + options->timeout = timeout; + return c10::make_intrusive<::c10d_npu::ProcessGroupHCCL>( + store, rank, size, options); + }), + py::arg("store"), + py::arg("rank"), + py::arg("size"), + py::arg("timeout") = kProcessGroupDefaultTimeout, + py::call_guard()) .def("get_hccl_comm", &::c10d_npu::ProcessGroupHCCL::getHcclComm) .def("_set_hccl_comm_name", &::c10d_npu::ProcessGroupHCCL::setHcclCommName) .def("resume_hccl_comm", &::c10d_npu::ProcessGroupHCCL::resumeHcclComm) .def("_set_switch_nic_comm", &::c10d_npu::ProcessGroupHCCL::setSwitchNicComm, - py::arg("rankid"), - py::arg("nRanks"), - py::arg("ranks") = std::vector{}, - py::arg("useBackup") = std::vector{}) + py::arg("rankid"), + py::arg("nRanks"), + py::arg("ranks") = std::vector{}, + py::arg("useBackup") = std::vector{}) .def("abort_hccl_comm", &::c10d_npu::ProcessGroupHCCL::abortAndClearHcclComm) .def("_delete_tcpstore_key", &::c10d_npu::ProcessGroupHCCL::deleteTCPStoreKey) .def("set_watchdog_status", &::c10d_npu::ProcessGroupHCCL::setWatchdogStatus) .def("clear_workmeta_list", &::c10d_npu::ProcessGroupHCCL::clearWorkMetaList) .def("get_hccl_comm_name", - [](::c10d_npu::ProcessGroupHCCL &pg, int rankid, py::args args, py::kwargs kwargs) + [](::c10d_npu::ProcessGroupHCCL& pg, int rankid, py::args args, py::kwargs kwargs) -> std::string { bool init_comm = true; if (kwargs.contains("init_comm")) { @@ -416,31 +422,31 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) return pg.getHcclCommName(rankid, init_comm); }) .def("_get_stream_id", &::c10d_npu::ProcessGroupHCCL::getStreamId, - py::arg("p2p") = false, - py::arg("peer") = -1) + py::arg("p2p") = 
false, + py::arg("peer") = -1) .def_property_readonly("options", &::c10d_npu::ProcessGroupHCCL::getOptions) .def("batch_isend_irecv", - [](::c10d_npu::ProcessGroupHCCL &pg, std::vector &op_type, - std::vector &tensors, - std::vector remote_rank_list) + [](::c10d_npu::ProcessGroupHCCL& pg, std::vector& op_type, + std::vector& tensors, + std::vector remote_rank_list) -> c10::intrusive_ptr { return pg.batch_isend_irecv(op_type, tensors, remote_rank_list); }, - py::call_guard()) + py::call_guard()) .def("reduce_scatter_tensor_uneven", &::c10d_npu::ProcessGroupHCCL::_reduce_scatter_base_uneven, - py::arg("output"), - py::arg("input"), - py::arg("input_split_sizes") = std::vector{}, - py::arg("opts") = ::c10d::ReduceScatterOptions(), - py::call_guard()) + py::arg("output"), + py::arg("input"), + py::arg("input_split_sizes") = std::vector{}, + py::arg("opts") = ::c10d::ReduceScatterOptions(), + py::call_guard()) .def("all_gather_into_tensor_uneven", &::c10d_npu::ProcessGroupHCCL::_allgather_base_uneven, - py::arg("output"), - py::arg("input"), - py::arg("output_split_sizes") = std::vector{}, - py::arg("opts") = ::c10d::AllgatherOptions(), - py::call_guard()); + py::arg("output"), + py::arg("input"), + py::arg("output_split_sizes") = std::vector{}, + py::arg("opts") = ::c10d::AllgatherOptions(), + py::call_guard()); intrusive_ptr_class_<::c10d_npu::ProcessGroupHCCL::Options>( processGroupHCCL, @@ -449,13 +455,13 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) .def(py::init<>()) .def_readwrite("op_timeout", &::c10d_npu::ProcessGroupHCCL::Options::opTimeout) .def_readwrite("is_high_priority_stream", - &::c10d_npu::ProcessGroupHCCL::Options::is_high_priority_stream) + &::c10d_npu::ProcessGroupHCCL::Options::is_high_priority_stream) .def_readwrite("global_ranks_in_group", - &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group) + &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group) .def_readwrite("hccl_config", &::c10d_npu::ProcessGroupHCCL::Options::hccl_config) .def_readwrite("group_id", - &::c10d_npu::ProcessGroupHCCL::Options::group_id); - + &::c10d_npu::ProcessGroupHCCL::Options::group_id); + // bind for ProcessGroupLCCL auto processGroupLCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupLCCL>( module, "ProcessGroupLCCL", dist.attr("Backend")) @@ -510,37 +516,37 @@ Example:: >>> client_store.get("first_key") )") - .def(py::init([](const std::string &host, - uint16_t port, - int worldSize, - bool agentRun, - uint32_t agentPid, - bool isServer, - bool enableTiered, - std::chrono::milliseconds timeout, - bool waitWorkers, - bool multiTenant) { + .def(py::init([](const std::string& host, + uint16_t port, + int worldSize, + bool agentRun, + uint32_t agentPid, + bool isServer, + bool enableTiered, + std::chrono::milliseconds timeout, + bool waitWorkers, + bool multiTenant) { c10::optional numWorkers = c10::nullopt; if (worldSize > -1) { numWorkers = static_cast(worldSize); } - ::c10d::TCPStoreOptions opts{ port, isServer, numWorkers, waitWorkers, timeout, multiTenant }; - return c10::make_intrusive <::c10d::ParallelTcpStore>(host, agentRun, agentPid, enableTiered, opts); - }), - py::arg("host") = "127.0.0.1", - py::arg("port") = 29500, - py::arg("world_size") = -1, - py::arg("agent_run") = false, - py::arg("agent_pid") = -1, - py::arg("is_server") = false, - py::arg("enable_tiered") = false, - py::arg("timeout") = std::chrono::milliseconds(300000), - py::arg("wait_workers") = true, - py::arg("multi_tenant") = false); + ::c10d::TCPStoreOptions 
opts{port, isServer, numWorkers, waitWorkers, timeout, multiTenant}; + return c10::make_intrusive<::c10d::ParallelTcpStore>(host, agentRun, agentPid, enableTiered, opts); + }), + py::arg("host") = "127.0.0.1", + py::arg("port") = 29500, + py::arg("world_size") = -1, + py::arg("agent_run") = false, + py::arg("agent_pid") = -1, + py::arg("is_server") = false, + py::arg("enable_tiered") = false, + py::arg("timeout") = std::chrono::milliseconds(300000), + py::arg("wait_workers") = true, + py::arg("multi_tenant") = false); module.def("_dump_hccl_trace_json", [](std::optional includeCollectives, - std::optional onlyActive) { + std::optional onlyActive) { return py::bytes(::c10d_npu::dump_hccl_trace_json( includeCollectives.value_or(true), onlyActive.value_or(false))); }, @@ -556,8 +562,8 @@ Example:: )"); module.def("_dump_hccl_trace", [](std::optional includeCollectives, - std::optional includeStackTraces, - std::optional onlyActive) { + std::optional includeStackTraces, + std::optional onlyActive) { return py::bytes(::c10d_npu::dump_hccl_trace( includeCollectives.value_or(true), includeStackTraces.value_or(true), @@ -576,7 +582,7 @@ Example:: Default settings return everything - i.e. contains HCCL comm dumps and collective traces. )"); - Py_RETURN_TRUE; + Py_RETURN_TRUE; } // c10d methods on torch._C @@ -593,4 +599,4 @@ PyMethodDef* python_functions() } } // namespace distributed -} // namespace torch_npu +} // namespace torch_npu \ No newline at end of file -- Gitee From 0bd3d4c3170244842efa91492473bf4739009ef9 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 08:55:16 +0000 Subject: [PATCH 09/11] AclInterface Signed-off-by: zjx.com --- .../csrc/core/npu/interface/AclInterface.cpp | 275 ++++++++++-------- 1 file changed, 152 insertions(+), 123 deletions(-) diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index d85dd2211da..a2ea830bd73 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -11,14 +11,13 @@ #endif namespace c10_npu { - namespace acl { #undef LOAD_FUNCTION #define LOAD_FUNCTION(funcName) \ - REGISTER_FUNCTION(libascendcl, funcName) + REGISTER_FUNCTION(libascendcl, funcName) #undef GET_FUNC -#define GET_FUNC(funcName) \ - GET_FUNCTION(libascendcl, funcName) +#define GET_FUNC(funcName) \ + GET_FUNCTION(libascendcl, funcName) REGISTER_LIBRARY(libascendcl) LOAD_FUNCTION(aclGetRecentErrMsg) @@ -83,8 +82,9 @@ LOAD_FUNCTION(aclmdlRICaptureTaskUpdateEnd) LOAD_FUNCTION(aclrtHostRegister) LOAD_FUNCTION(aclrtHostUnregister) -aclprofStepInfoPtr init_stepinfo() { - typedef aclprofStepInfoPtr(*npdInitFunc)(); +aclprofStepInfoPtr init_stepinfo() +{ + typedef aclprofStepInfoPtr (*npdInitFunc)(); static npdInitFunc func = nullptr; if (func == nullptr) { func = (npdInitFunc)GET_FUNC(aclprofCreateStepInfo); @@ -94,8 +94,9 @@ aclprofStepInfoPtr init_stepinfo() { return ret; } -NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) { - typedef NpdStatus(*npdDestroyFunc)(aclprofStepInfoPtr); +NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) +{ + typedef NpdStatus (*npdDestroyFunc)(aclprofStepInfoPtr); static npdDestroyFunc func = nullptr; if (func == nullptr) { func = (npdDestroyFunc)GET_FUNC(aclprofDestroyStepInfo); @@ -105,8 +106,9 @@ NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) { return ret; } -NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) { - typedef 
NpdStatus(*npdStartProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); +NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) +{ + typedef NpdStatus (*npdStartProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); static npdStartProfiling func = nullptr; if (func == nullptr) { func = (npdStartProfiling)GET_FUNC(aclprofGetStepTimestamp); @@ -116,8 +118,9 @@ NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, return ret; } -NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) { - typedef NpdStatus(*npdStopProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); +NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) +{ + typedef NpdStatus (*npdStopProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); static npdStopProfiling func = nullptr; if (func == nullptr) { func = (npdStopProfiling)GET_FUNC(aclprofGetStepTimestamp); @@ -127,9 +130,9 @@ NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, a return ret; } -const char *AclGetErrMsg() +const char* AclGetErrMsg() { - typedef const char *(*aclGetErrMsg)(); + typedef const char* (*aclGetErrMsg)(); static aclGetErrMsg func = nullptr; if (func == nullptr) { func = (aclGetErrMsg)GET_FUNC(aclGetRecentErrMsg); @@ -141,8 +144,9 @@ const char *AclGetErrMsg() return ""; } -aclError AclrtCreateStreamWithConfig(aclrtStream *stream, uint32_t priority, uint32_t flag) { - typedef aclError(*aclrtCreateStreamWithConfigFunc)(aclrtStream*, uint32_t, uint32_t); +aclError AclrtCreateStreamWithConfig(aclrtStream* stream, uint32_t priority, uint32_t flag) +{ + typedef aclError (*aclrtCreateStreamWithConfigFunc)(aclrtStream*, uint32_t, uint32_t); static aclrtCreateStreamWithConfigFunc func = nullptr; if (func == nullptr) { func = (aclrtCreateStreamWithConfigFunc)GET_FUNC(aclrtCreateStreamWithConfig); @@ -170,11 +174,13 @@ aclError AclrtCreateStreamWithConfig(aclrtStream *stream, uint32_t priority, uin } } -aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) { +aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) +{ if (stream == nullptr) { // default stream return ACL_ERROR_INVALID_PARAM; } - typedef aclError(*aclrtSetStreamFailureModeFunc)(aclrtStream, uint64_t); + + typedef aclError (*aclrtSetStreamFailureModeFunc)(aclrtStream, uint64_t); static aclrtSetStreamFailureModeFunc func = (aclrtSetStreamFailureModeFunc)GET_FUNC(aclrtSetStreamFailureMode); if (func == nullptr) { return ACL_SUCCESS; @@ -182,8 +188,9 @@ aclError AclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) { return func(stream, mode); } -aclError AclrtSetOpWaitTimeout(uint32_t timeout) { - typedef aclError(*aclrtSetOpWaitTimeoutFunc)(uint32_t); +aclError AclrtSetOpWaitTimeout(uint32_t timeout) +{ + typedef aclError (*aclrtSetOpWaitTimeoutFunc)(uint32_t); static aclrtSetOpWaitTimeoutFunc func = nullptr; if (func == nullptr) { func = (aclrtSetOpWaitTimeoutFunc)GET_FUNC(aclrtSetOpWaitTimeout); @@ -194,14 +201,14 @@ aclError AclrtSetOpWaitTimeout(uint32_t timeout) { bool IsExistCreateEventExWithFlag() { - typedef aclError(*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); + typedef aclError (*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); static AclrtCreateEventWithFlagFunc func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventExWithFlag); return func != nullptr; } -aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t 
flag) +aclError AclrtCreateEventWithFlag(aclrtEvent* event, uint32_t flag) { - typedef aclError(*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); + typedef aclError (*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); // Recommend aclrtCreateEventExWithFlag. // Differences from aclrtCreateEventWithFlag: // 1. Event can be reused naturally, aclrtResetEvent is not supported. @@ -220,9 +227,9 @@ aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) return func_ex(event, flag); } -aclError AclQueryEventWaitStatus(aclrtEvent event, aclrtEventWaitStatus *waitStatus) +aclError AclQueryEventWaitStatus(aclrtEvent event, aclrtEventWaitStatus* waitStatus) { - typedef aclError (*aclQueryEventWaitStatus)(aclrtEvent event, aclrtEventWaitStatus *waitStatus); + typedef aclError (*aclQueryEventWaitStatus)(aclrtEvent event, aclrtEventWaitStatus* waitStatus); static aclQueryEventWaitStatus func = nullptr; if (func == nullptr) { func = (aclQueryEventWaitStatus)GET_FUNC(aclrtQueryEventWaitStatus); @@ -231,8 +238,9 @@ aclError AclQueryEventWaitStatus(aclrtEvent event, aclrtEventWaitStatus *waitSta return func(event, waitStatus); } -aclError AclQueryEventRecordedStatus(aclrtEvent event, aclrtEventRecordedStatus *status) { - typedef aclError (*aclQueryEventStatus)(aclrtEvent event, aclrtEventRecordedStatus *status); +aclError AclQueryEventRecordedStatus(aclrtEvent event, aclrtEventRecordedStatus* status) +{ + typedef aclError (*aclQueryEventStatus)(aclrtEvent event, aclrtEventRecordedStatus* status); static aclQueryEventStatus func = nullptr; if (func == nullptr) { func = (aclQueryEventStatus)GET_FUNC(aclrtQueryEventStatus); @@ -243,7 +251,7 @@ aclError AclQueryEventRecordedStatus(aclrtEvent event, aclrtEventRecordedStatus bool IsExistQueryEventRecordedStatus() { - typedef aclError (*aclQueryEventStatus)(aclrtEvent event, aclrtEventRecordedStatus *status); + typedef aclError (*aclQueryEventStatus)(aclrtEvent event, aclrtEventRecordedStatus* status); static aclQueryEventStatus func = nullptr; if (func == nullptr) { func = (aclQueryEventStatus)GET_FUNC(aclrtQueryEventStatus); @@ -255,8 +263,9 @@ bool IsExistQueryEventRecordedStatus() } } -aclError AclProfilingInit(const char *profilerResultPath, size_t length) { - typedef aclError (*AclProfInitFunc) (const char *, size_t); +aclError AclProfilingInit(const char* profilerResultPath, size_t length) +{ + typedef aclError (*AclProfInitFunc)(const char*, size_t); static AclProfInitFunc func = nullptr; if (func == nullptr) { func = (AclProfInitFunc)GET_FUNC(aclprofInit); @@ -265,8 +274,9 @@ aclError AclProfilingInit(const char *profilerResultPath, size_t length) { return func(profilerResultPath, length); } -aclError AclProfilingStart(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfStartFunc) (const aclprofConfig *); +aclError AclProfilingStart(const aclprofConfig* profilerConfig) +{ + typedef aclError (*AclProfStartFunc)(const aclprofConfig*); static AclProfStartFunc func = nullptr; if (func == nullptr) { func = (AclProfStartFunc)GET_FUNC(aclprofStart); @@ -275,8 +285,9 @@ aclError AclProfilingStart(const aclprofConfig *profilerConfig) { return func(profilerConfig); } -aclError AclProfilingStop(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfStopFunc) (const aclprofConfig*); +aclError AclProfilingStop(const aclprofConfig* profilerConfig) +{ + typedef aclError (*AclProfStopFunc)(const aclprofConfig*); static AclProfStopFunc func = nullptr; if (func == nullptr) { func = (AclProfStopFunc)GET_FUNC(aclprofStop); @@ 
-285,8 +296,9 @@ aclError AclProfilingStop(const aclprofConfig *profilerConfig) { return func(profilerConfig); } -aclError AclProfilingFinalize() { - typedef aclError (*AclProfFinalizeFunc) (); +aclError AclProfilingFinalize() +{ + typedef aclError (*AclProfFinalizeFunc)(); static AclProfFinalizeFunc func = nullptr; if (func == nullptr) { func = (AclProfFinalizeFunc)GET_FUNC(aclprofFinalize); @@ -295,14 +307,15 @@ aclError AclProfilingFinalize() { return func(); } -aclprofConfig *AclProfilingCreateConfig( - uint32_t *deviceIdList, +aclprofConfig* AclProfilingCreateConfig( + uint32_t* deviceIdList, uint32_t deviceNums, aclprofAicoreMetrics aicoreMetrics, - aclprofAicoreEvents *aicoreEvents, - uint64_t dataTypeConfig) { - typedef aclprofConfig *(*AclProfCreateConfigFunc) \ - (uint32_t *, uint32_t, aclprofAicoreMetrics, const aclprofAicoreEvents *, uint64_t); + aclprofAicoreEvents* aicoreEvents, + uint64_t dataTypeConfig) +{ + typedef aclprofConfig* (*AclProfCreateConfigFunc) \ + (uint32_t*, uint32_t, aclprofAicoreMetrics, const aclprofAicoreEvents*, uint64_t); static AclProfCreateConfigFunc func = nullptr; if (func == nullptr) { func = (AclProfCreateConfigFunc)GET_FUNC(aclprofCreateConfig); @@ -311,8 +324,9 @@ aclprofConfig *AclProfilingCreateConfig( return func(deviceIdList, deviceNums, aicoreMetrics, aicoreEvents, dataTypeConfig); } -aclError AclProfilingDestroyConfig(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfDestroyConfigFunc) (const aclprofConfig *); +aclError AclProfilingDestroyConfig(const aclprofConfig* profilerConfig) +{ + typedef aclError (*AclProfDestroyConfigFunc)(const aclprofConfig*); static AclProfDestroyConfigFunc func = nullptr; if (func == nullptr) { func = (AclProfDestroyConfigFunc)GET_FUNC(aclprofDestroyConfig); @@ -321,8 +335,9 @@ aclError AclProfilingDestroyConfig(const aclprofConfig *profilerConfig) { return func(profilerConfig); } -const char *AclrtGetSocName() { - typedef const char *(*aclrtGetSocNameFunc)(); +const char* AclrtGetSocName() +{ + typedef const char* (*aclrtGetSocNameFunc)(); static aclrtGetSocNameFunc func = nullptr; if (func == nullptr) { func = (aclrtGetSocNameFunc)GET_FUNC(aclrtGetSocName); @@ -331,8 +346,9 @@ const char *AclrtGetSocName() { return func(); } -const char *AclGetSocName() { - typedef const char *(*AclGetSoc) (); +const char* AclGetSocName() +{ + typedef const char* (*AclGetSoc)(); static AclGetSoc func = nullptr; if (func == nullptr) { func = (AclGetSoc)GET_FUNC(aclrtGetSocName); @@ -343,7 +359,8 @@ const char *AclGetSocName() { return func(); } -aclError AclrtSetDeviceSatMode(aclrtFloatOverflowMode mode) { +aclError AclrtSetDeviceSatMode(aclrtFloatOverflowMode mode) +{ typedef aclError (*AclrtSetDeviceSatMode)(aclrtFloatOverflowMode mode); static AclrtSetDeviceSatMode func = nullptr; if (func == nullptr) { @@ -353,7 +370,8 @@ aclError AclrtSetDeviceSatMode(aclrtFloatOverflowMode mode) { return func(mode); } -aclError AclrtSetStreamOverflowSwitch(aclrtStream stream, uint32_t flag) { +aclError AclrtSetStreamOverflowSwitch(aclrtStream stream, uint32_t flag) +{ typedef aclError (*AclrtSetStreamOverflowSwitch)(aclrtStream, uint32_t); static AclrtSetStreamOverflowSwitch func = nullptr; if (func == nullptr) { @@ -363,7 +381,8 @@ aclError AclrtSetStreamOverflowSwitch(aclrtStream stream, uint32_t flag) { return func(stream, flag); } -aclError AclrtSetOpExecuteTimeOut(uint32_t timeout) { +aclError AclrtSetOpExecuteTimeOut(uint32_t timeout) +{ typedef aclError (*AclrtSetOpExecuteTimeOut)(uint32_t); static 
AclrtSetOpExecuteTimeOut func = nullptr; if (func == nullptr) { @@ -375,7 +394,8 @@ aclError AclrtSetOpExecuteTimeOut(uint32_t timeout) { return func(timeout); } -aclError AclrtGetStreamOverflowSwitch(aclrtStream stream, uint32_t *flag) { +aclError AclrtGetStreamOverflowSwitch(aclrtStream stream, uint32_t* flag) +{ typedef aclError (*AclrtGetStreamOverflowSwitch)(aclrtStream, uint32_t*); static AclrtGetStreamOverflowSwitch func = nullptr; if (func == nullptr) { @@ -385,7 +405,8 @@ aclError AclrtGetStreamOverflowSwitch(aclrtStream stream, uint32_t *flag) { return func(stream, flag); } -aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) { +aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) +{ if (C10_UNLIKELY( c10_npu::warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) { c10_npu::warn_or_error_on_sync(); @@ -410,10 +431,11 @@ aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) { } TORCH_CHECK(func_backup, "Failed to find function", "aclrtSynchronizeStreamWithTimeout and aclrtSynchronizeStream", PROF_ERROR(ErrCode::NOT_FOUND)); return func_backup(stream); - } + } } -aclError AclrtDestroyStreamForce(aclrtStream stream) { +aclError AclrtDestroyStreamForce(aclrtStream stream) +{ typedef aclError (*AclrtDestroyStreamForce)(aclrtStream); static AclrtDestroyStreamForce func = (AclrtDestroyStreamForce)GET_FUNC(aclrtDestroyStreamForce); if (func != nullptr) { @@ -423,7 +445,8 @@ aclError AclrtDestroyStreamForce(aclrtStream stream) { return aclrtDestroyStream(stream); } -aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *utilizationInfo) { +aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo* utilizationInfo) +{ typedef aclError (*AclrtGetDeviceUtilizationRate)(int32_t, aclrtUtilizationInfo*); static AclrtGetDeviceUtilizationRate func = nullptr; if (func == nullptr) { @@ -433,7 +456,8 @@ aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *u return func(deviceId, utilizationInfo); } -aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy) { +aclError AclrtMallocAlign32(void** devPtr, size_t size, aclrtMemMallocPolicy policy) +{ typedef aclError (*AclrtMallocAlign32)(void**, size_t, aclrtMemMallocPolicy); static AclrtMallocAlign32 func = (AclrtMallocAlign32)GET_FUNC(aclrtMallocAlign32); aclError ret; @@ -461,7 +485,8 @@ aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy pol return ret; } -aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status) { +aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus* status) +{ typedef aclError (*AclrtStreamQuery)(aclrtStream, aclrtStreamStatus*); static AclrtStreamQuery func = nullptr; if (func == nullptr) { @@ -471,7 +496,8 @@ aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status) { return func(stream, status); } -bool can_device_access_peer(c10::DeviceIndex device_id, c10::DeviceIndex peer_device_id) { +bool can_device_access_peer(c10::DeviceIndex device_id, c10::DeviceIndex peer_device_id) +{ int32_t can_access_peer = 0; c10::DeviceIndex num_npus = c10_npu::device_count(); TORCH_CHECK(device_id >= 0 && device_id < num_npus, PROF_ERROR(ErrCode::VALUE)); @@ -490,8 +516,8 @@ bool can_device_access_peer(c10::DeviceIndex device_id, c10::DeviceIndex peer_de return can_access_peer != 0; } -aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, void *expectPtr, uint64_t flags, - HcclComm hcclComm) +aclError 
AclrtReserveMemAddress(void** virPtr, size_t size, size_t alignment, void* expectPtr, uint64_t flags, + HcclComm hcclComm) { typedef aclError (*AclrtReserveMemAddress)(void**, size_t, size_t, void*, uint64_t); static AclrtReserveMemAddress func = nullptr; @@ -506,7 +532,7 @@ aclError AclrtReserveMemAddress(void **virPtr, size_t size, size_t alignment, vo return ret; } -aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) +aclError AclrtReleaseMemAddress(void* virPtr, HcclComm hcclComm) { typedef aclError (*AclrtReleaseMemAddress)(void*); static AclrtReleaseMemAddress func = nullptr; @@ -521,8 +547,9 @@ aclError AclrtReleaseMemAddress(void *virPtr, HcclComm hcclComm) return ret; } -aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrtPhysicalMemProp *prop, - uint64_t flags) { +aclError AclrtMallocPhysical(aclrtDrvMemHandle* handle, size_t size, const aclrtPhysicalMemProp* prop, + uint64_t flags) +{ typedef aclError (*AclrtMallocPhysical)(aclrtDrvMemHandle*, size_t, const aclrtPhysicalMemProp*, uint64_t); static AclrtMallocPhysical func = nullptr; if (func == nullptr) { @@ -537,17 +564,18 @@ aclError AclrtMallocPhysical(aclrtDrvMemHandle *handle, size_t size, const aclrt "is enabled, but the pre-allocated number of 1G large pages is insufficient " "or 1G large-page memory pre-allocation is not enabled."); aclrtPhysicalMemProp prop_update = {prop->handleType, - prop->allocationType, - ACL_HBM_MEM_HUGE, - {prop->location.id, - prop->location.type}, - prop->reserve}; + prop->allocationType, + ACL_HBM_MEM_HUGE, + {prop->location.id, + prop->location.type}, + prop->reserve}; ret = func(handle, size, &prop_update, flags); } return ret; } -aclError AclrtFreePhysical(aclrtDrvMemHandle handle) { +aclError AclrtFreePhysical(aclrtDrvMemHandle handle) +{ typedef aclError (*AclrtFreePhysical)(aclrtDrvMemHandle); static AclrtFreePhysical func = nullptr; if (func == nullptr) { @@ -557,8 +585,8 @@ aclError AclrtFreePhysical(aclrtDrvMemHandle handle) { return func(handle); } -aclError AclrtMapMem(void *virPtr, size_t size, size_t offset, aclrtDrvMemHandle handle, uint64_t flags, - HcclComm hcclComm) +aclError AclrtMapMem(void* virPtr, size_t size, size_t offset, aclrtDrvMemHandle handle, uint64_t flags, + HcclComm hcclComm) { typedef aclError (*AclrtMapMem)(void*, size_t, size_t, aclrtDrvMemHandle, uint64_t); static AclrtMapMem func = nullptr; @@ -573,7 +601,7 @@ aclError AclrtMapMem(void *virPtr, size_t size, size_t offset, aclrtDrvMemHandle return ret; } -aclError AclrtUnmapMem(void *virPtr, HcclComm hcclComm) +aclError AclrtUnmapMem(void* virPtr, HcclComm hcclComm) { typedef aclError (*AclrtUnmapMem)(void*); static AclrtUnmapMem func = nullptr; @@ -590,50 +618,50 @@ aclError AclrtUnmapMem(void *virPtr, HcclComm hcclComm) bool IsExistGetCannAttribute() { - typedef aclError (*AclGetCannAttribute)(aclCannAttr, int32_t *); - static AclGetCannAttribute func = (AclGetCannAttribute) GET_FUNC(aclGetCannAttribute); + typedef aclError (*AclGetCannAttribute)(aclCannAttr, int32_t*); + static AclGetCannAttribute func = (AclGetCannAttribute)GET_FUNC(aclGetCannAttribute); return func != nullptr; } -aclError AclGetCannAttributeList(const aclCannAttr **cannAttrList, size_t *num) +aclError AclGetCannAttributeList(const aclCannAttr** cannAttrList, size_t* num) { - typedef aclError (*AclGetCannAttributeList)(const aclCannAttr **, size_t *); + typedef aclError (*AclGetCannAttributeList)(const aclCannAttr**, size_t*); static AclGetCannAttributeList func = nullptr; if (func == nullptr) { - 
func = (AclGetCannAttributeList) GET_FUNC(aclGetCannAttributeList); + func = (AclGetCannAttributeList)GET_FUNC(aclGetCannAttributeList); } TORCH_CHECK(func, "Failed to find function ", "aclGetCannAttributeList", PTA_ERROR(ErrCode::NOT_FOUND)); return func(cannAttrList, num); } -aclError AclGetCannAttribute(aclCannAttr cannAttr, int32_t *value) +aclError AclGetCannAttribute(aclCannAttr cannAttr, int32_t* value) { - typedef aclError (*AclGetCannAttribute)(aclCannAttr, int32_t *); + typedef aclError (*AclGetCannAttribute)(aclCannAttr, int32_t*); static AclGetCannAttribute func = nullptr; if (func == nullptr) { - func = (AclGetCannAttribute) GET_FUNC(aclGetCannAttribute); + func = (AclGetCannAttribute)GET_FUNC(aclGetCannAttribute); } TORCH_CHECK(func, "Failed to find function ", "aclGetCannAttribute", PTA_ERROR(ErrCode::NOT_FOUND)); return func(cannAttr, value); } -aclError AclGetDeviceCapability(uint32_t deviceId, aclDeviceInfo deviceInfo, int64_t *value) +aclError AclGetDeviceCapability(uint32_t deviceId, aclDeviceInfo deviceInfo, int64_t* value) { - typedef aclError (*AclGetDeviceCapability)(uint32_t, aclDeviceInfo, int64_t *); + typedef aclError (*AclGetDeviceCapability)(uint32_t, aclDeviceInfo, int64_t*); static AclGetDeviceCapability func = nullptr; if (func == nullptr) { - func = (AclGetDeviceCapability) GET_FUNC(aclGetDeviceCapability); + func = (AclGetDeviceCapability)GET_FUNC(aclGetDeviceCapability); } TORCH_CHECK(func, "Failed to find function ", "aclGetDeviceCapability", PTA_ERROR(ErrCode::NOT_FOUND)); return func(deviceId, deviceInfo, value); } -aclError AclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize, size_t *retSize) +aclError AclrtGetMemUceInfo(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, size_t arraySize, size_t* retSize) { - typedef aclError (*AclrtGetMemUceInfo)(int32_t, aclrtMemUceInfo*, size_t, size_t *); + typedef aclError (*AclrtGetMemUceInfo)(int32_t, aclrtMemUceInfo*, size_t, size_t*); static AclrtGetMemUceInfo func = nullptr; if (func == nullptr) { - func = (AclrtGetMemUceInfo) GET_FUNC(aclrtGetMemUceInfo); + func = (AclrtGetMemUceInfo)GET_FUNC(aclrtGetMemUceInfo); } if (func == nullptr) { TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtGetMemUceInfo"); @@ -647,7 +675,7 @@ aclError AclrtDeviceTaskAbort(int32_t deviceId) typedef aclError (*AclrtDeviceTaskAbort)(int32_t, uint32_t); static AclrtDeviceTaskAbort func = nullptr; if (func == nullptr) { - func = (AclrtDeviceTaskAbort) GET_FUNC(aclrtDeviceTaskAbort); + func = (AclrtDeviceTaskAbort)GET_FUNC(aclrtDeviceTaskAbort); } if (func == nullptr) { TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtDeviceTaskAbort"); @@ -662,7 +690,7 @@ aclError AclrtMemUceRepair(int32_t deviceId, aclrtMemUceInfo* memUceInfoArray, s typedef aclError (*AclrtMemUceRepair)(int32_t, aclrtMemUceInfo*, size_t); static AclrtMemUceRepair func = nullptr; if (func == nullptr) { - func = (AclrtMemUceRepair) GET_FUNC(aclrtMemUceRepair); + func = (AclrtMemUceRepair)GET_FUNC(aclrtMemUceRepair); } if (func == nullptr) { TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtMemUceRepair"); @@ -676,7 +704,7 @@ aclError AclrtCmoAsync(void* src, size_t size, aclrtCmoType cmoType, aclrtStream typedef aclError (*AclrtCmoAsync)(void*, size_t, aclrtCmoType, aclrtStream); static AclrtCmoAsync func = nullptr; if (func == nullptr) { - func = (AclrtCmoAsync) GET_FUNC(aclrtCmoAsync); + func = (AclrtCmoAsync)GET_FUNC(aclrtCmoAsync); } TORCH_CHECK(func, "Failed to find function ", 
"aclrtCmoAsync", PTA_ERROR(ErrCode::NOT_FOUND)); return func(src, size, cmoType, stream); @@ -687,7 +715,7 @@ aclError AclrtGetLastError(aclrtLastErrLevel flag) typedef aclError (*AclrtGetLastError)(aclrtLastErrLevel flag); static AclrtGetLastError func = nullptr; if (func == nullptr) { - func = (AclrtGetLastError) GET_FUNC(aclrtGetLastError); + func = (AclrtGetLastError)GET_FUNC(aclrtGetLastError); } if (func == nullptr) { return ACL_ERROR_NONE; @@ -700,7 +728,7 @@ aclError AclrtPeekAtLastError(aclrtLastErrLevel flag) typedef aclError (*AclrtPeekAtLastError)(aclrtLastErrLevel flag); static AclrtPeekAtLastError func = nullptr; if (func == nullptr) { - func = (AclrtPeekAtLastError) GET_FUNC(aclrtPeekAtLastError); + func = (AclrtPeekAtLastError)GET_FUNC(aclrtPeekAtLastError); } if (func == nullptr) { return ACL_ERROR_NONE; @@ -708,12 +736,12 @@ aclError AclrtPeekAtLastError(aclrtLastErrLevel flag) return func(flag); } -aclError AclStressDetect(int32_t deviceId, void *workspace, size_t workspaceSize) +aclError AclStressDetect(int32_t deviceId, void* workspace, size_t workspaceSize) { typedef aclError (*AclStressDetect)(int32_t, void*, size_t); static AclStressDetect func = nullptr; if (func == nullptr) { - func = (AclStressDetect) GetOpApiFuncAddr("StressDetect"); + func = (AclStressDetect)GetOpApiFuncAddr("StressDetect"); } TORCH_CHECK(func, "Failed to find function ", "StressDetect", PTA_ERROR(ErrCode::NOT_FOUND)); return func(deviceId, workspace, workspaceSize); @@ -740,7 +768,7 @@ aclError AclrtSynchronizeDeviceWithTimeout(void) } } -aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp) +aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t* timestamp) { typedef aclError (*AclrtEventGetTimestamp)(aclrtEvent, uint64_t*); static AclrtEventGetTimestamp func = nullptr; @@ -756,31 +784,31 @@ aclError AclmdlRICaptureBegin(aclrtStream stream, aclmdlRICaptureMode mode) typedef aclError (*AclmdlRICaptureBegin)(aclrtStream, aclmdlRICaptureMode); static AclmdlRICaptureBegin func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureBegin) GET_FUNC(aclmdlRICaptureBegin); + func = (AclmdlRICaptureBegin)GET_FUNC(aclmdlRICaptureBegin); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureBegin", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream, mode); } -aclError AclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus *status, aclmdlRI *modelRI) +aclError AclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus* status, aclmdlRI* modelRI) { - typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); + typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus*, aclmdlRI*); static AclmdlRICaptureGetInfo func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureGetInfo) GET_FUNC(aclmdlRICaptureGetInfo); + func = (AclmdlRICaptureGetInfo)GET_FUNC(aclmdlRICaptureGetInfo); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureGetInfo", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream, status, modelRI); } -aclError AclmdlRICaptureEnd(aclrtStream stream, aclmdlRI *modelRI) +aclError AclmdlRICaptureEnd(aclrtStream stream, aclmdlRI* modelRI) { - typedef aclError (*AclmdlRICaptureEnd)(aclrtStream, aclmdlRI *); + typedef aclError (*AclmdlRICaptureEnd)(aclrtStream, aclmdlRI*); static AclmdlRICaptureEnd func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureEnd) GET_FUNC(aclmdlRICaptureEnd); + func = (AclmdlRICaptureEnd)GET_FUNC(aclmdlRICaptureEnd); } TORCH_CHECK(func, "Failed to find 
function aclmdlRICaptureEnd", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -792,7 +820,7 @@ aclError AclmdlRIDebugPrint(aclmdlRI modelRI) typedef aclError (*AclmdlRIDebugPrint)(aclmdlRI); static AclmdlRIDebugPrint func = nullptr; if (func == nullptr) { - func = (AclmdlRIDebugPrint) GET_FUNC(aclmdlRIDebugPrint); + func = (AclmdlRIDebugPrint)GET_FUNC(aclmdlRIDebugPrint); } TORCH_CHECK(func, "Failed to find function aclmdlRIDebugPrint", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -804,7 +832,7 @@ aclError AclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream) typedef aclError (*AclmdlRIExecuteAsync)(aclmdlRI, aclrtStream); static AclmdlRIExecuteAsync func = nullptr; if (func == nullptr) { - func = (AclmdlRIExecuteAsync) GET_FUNC(aclmdlRIExecuteAsync); + func = (AclmdlRIExecuteAsync)GET_FUNC(aclmdlRIExecuteAsync); } TORCH_CHECK(func, "Failed to find function aclmdlRIExecuteAsync", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -812,14 +840,14 @@ aclError AclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream) return func(modelRI, stream); } -aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *version) +aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion* version) { - using aclsysGetCANNVersionFunc = aclError(*)(aclCANNPackageName, aclCANNPackageVersion *); + using aclsysGetCANNVersionFunc = aclError (*)(aclCANNPackageName, aclCANNPackageVersion*); static aclsysGetCANNVersionFunc func = nullptr; if (func == nullptr) { func = (aclsysGetCANNVersionFunc)GET_FUNC(aclsysGetCANNVersion); if (func == nullptr) { - return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; + return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; } } @@ -831,7 +859,7 @@ aclError AclmdlRIDestroy(aclmdlRI modelRI) typedef aclError (*AclmdlRIDestroy)(aclmdlRI); static AclmdlRIDestroy func = nullptr; if (func == nullptr) { - func = (AclmdlRIDestroy) GET_FUNC(aclmdlRIDestroy); + func = (AclmdlRIDestroy)GET_FUNC(aclmdlRIDestroy); } TORCH_CHECK(func, "Failed to find function aclmdlRIDestroy", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -844,11 +872,12 @@ bool IsCaptureSupported() static bool have_load_func = false; static bool default_support_capture = ((GetSocVersion() >= SocVersion::Ascend910B1) && (GetSocVersion() < SocVersion::Ascend310B1)) || - (GetSocVersion() >= SocVersion::Ascend910_9391); + ((GetSocVersion() >= SocVersion::Ascend910_9391) && + (GetSocVersion() < SocVersion::Ascend910_95)); if (default_support_capture && !have_load_func) { have_load_func = true; - typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); - static AclmdlRICaptureGetInfo func = (AclmdlRICaptureGetInfo) GET_FUNC(aclmdlRICaptureGetInfo); + typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus*, aclmdlRI*); + static AclmdlRICaptureGetInfo func = (AclmdlRICaptureGetInfo)GET_FUNC(aclmdlRICaptureGetInfo); is_support = (func != nullptr); } @@ -860,19 +889,19 @@ aclError AclmdlRICaptureTaskGrpBegin(aclrtStream stream) typedef aclError (*AclmdlRICaptureTaskGrpBegin)(aclrtStream); static AclmdlRICaptureTaskGrpBegin func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureTaskGrpBegin) GET_FUNC(aclmdlRICaptureTaskGrpBegin); + func = (AclmdlRICaptureTaskGrpBegin)GET_FUNC(aclmdlRICaptureTaskGrpBegin); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskGrpBegin", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream); } -aclError AclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp *handle) +aclError AclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp* handle) { typedef 
aclError (*AclmdlRICaptureTaskGrpEnd)(aclrtStream, aclrtTaskGrp*); static AclmdlRICaptureTaskGrpEnd func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureTaskGrpEnd) GET_FUNC(aclmdlRICaptureTaskGrpEnd); + func = (AclmdlRICaptureTaskGrpEnd)GET_FUNC(aclmdlRICaptureTaskGrpEnd); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskGrpEnd", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -884,7 +913,7 @@ aclError AclmdlRICaptureTaskUpdateBegin(aclrtStream stream, aclrtTaskGrp handle) typedef aclError (*AclmdlRICaptureTaskUpdateBegin)(aclrtStream, aclrtTaskGrp); static AclmdlRICaptureTaskUpdateBegin func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureTaskUpdateBegin) GET_FUNC(aclmdlRICaptureTaskUpdateBegin); + func = (AclmdlRICaptureTaskUpdateBegin)GET_FUNC(aclmdlRICaptureTaskUpdateBegin); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskUpdateBegin", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -896,31 +925,31 @@ aclError AclmdlRICaptureTaskUpdateEnd(aclrtStream stream) typedef aclError (*AclmdlRICaptureTaskUpdateEnd)(aclmdlRI); static AclmdlRICaptureTaskUpdateEnd func = nullptr; if (func == nullptr) { - func = (AclmdlRICaptureTaskUpdateEnd) GET_FUNC(aclmdlRICaptureTaskUpdateEnd); + func = (AclmdlRICaptureTaskUpdateEnd)GET_FUNC(aclmdlRICaptureTaskUpdateEnd); } TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskUpdateEnd", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream); } -aclError AclrtHostRegister(void *ptr, uint64_t size, aclrtHostRegisterType type, void **devPtr) +aclError AclrtHostRegister(void* ptr, uint64_t size, aclrtHostRegisterType type, void** devPtr) { - typedef aclError (*AclrtHostRegister)(void *, uint64_t, aclrtHostRegisterType, void **); + typedef aclError (*AclrtHostRegister)(void*, uint64_t, aclrtHostRegisterType, void**); static AclrtHostRegister func = nullptr; if (func == nullptr) { - func = (AclrtHostRegister) GET_FUNC(aclrtHostRegister); + func = (AclrtHostRegister)GET_FUNC(aclrtHostRegister); } TORCH_CHECK(func, "Failed to find function aclrtHostRegister", PTA_ERROR(ErrCode::NOT_FOUND)); return func(ptr, size, type, devPtr); } -aclError AclrtHostUnregister(void *ptr) +aclError AclrtHostUnregister(void* ptr) { - typedef aclError (*AclrtHostUnregister)(void *); + typedef aclError (*AclrtHostUnregister)(void*); static AclrtHostUnregister func = nullptr; if (func == nullptr) { - func = (AclrtHostUnregister) GET_FUNC(aclrtHostUnregister); + func = (AclrtHostUnregister)GET_FUNC(aclrtHostUnregister); } TORCH_CHECK(func, "Failed to find function aclrtHostUnregister", PTA_ERROR(ErrCode::NOT_FOUND)); @@ -928,4 +957,4 @@ aclError AclrtHostUnregister(void *ptr) } } // namespace acl -} // namespace c10 +} // namespace c10_npu \ No newline at end of file -- Gitee From 86a4bb7a97074833111f006c16e1f8c17c7e4750 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 11:36:09 +0000 Subject: [PATCH 10/11] oss_proxy Signed-off-by: zjx.com --- .../aoti_torch/oss_proxy_executor_npu.cpp | 1132 ++++++++--------- 1 file changed, 563 insertions(+), 569 deletions(-) diff --git a/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp b/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp index c11ef93bfda..d9f1c04fc2c 100644 --- a/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp +++ b/torch_npu/csrc/inductor/aoti_torch/oss_proxy_executor_npu.cpp @@ -5,9 +5,8 @@ #include namespace { -at::Tensor* tensor_handle_to_tensor_pointer(AtenTensorHandle handle) -{ - return reinterpret_cast(handle); +at::Tensor* 
tensor_handle_to_tensor_pointer(AtenTensorHandle handle) { + return reinterpret_cast(handle); } } // namespace @@ -17,481 +16,477 @@ void OSSProxyExecutorNpu::prefill_stack_with_static_arguments( size_t index, const at::TypePtr& schema_arg_type, const nlohmann::json& serialized_arg, - OSSOpKernel& op_kernel) -{ - auto& stack = op_kernel.stack_; - auto& dynamic_args = op_kernel.dynamic_args_; - - TORCH_CHECK(serialized_arg.size() == 1); - std::string serialized_arg_type = serialized_arg.begin().key(); - auto& serialized_arg_val = serialized_arg.begin().value(); - - switch (schema_arg_type->kind()) { - case c10::TypeKind::TensorType: { - TORCH_CHECK( - serialized_arg_type == "as_tensor", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_tensor for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::TensorType, 1); - break; - } - case c10::TypeKind::IntType: { - TORCH_CHECK( - serialized_arg_type == "as_int", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_int for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - break; - } - case c10::TypeKind::SymIntType: { - TORCH_CHECK( - serialized_arg_type == "as_int" || - serialized_arg_type == "as_sym_int", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_int or as_sym_int for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - break; - } - case c10::TypeKind::FloatType: { - TORCH_CHECK( - serialized_arg_type == "as_float", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_float for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; - } - case c10::TypeKind::BoolType: { - TORCH_CHECK( - serialized_arg_type == "as_bool", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_bool for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; + OSSOpKernel& op_kernel) { + auto& stack = op_kernel.stack_; + auto& dynamic_args = op_kernel.dynamic_args_; + + TORCH_CHECK(serialized_arg.size() == 1); + std::string serialized_arg_type = serialized_arg.begin().key(); + auto& serialized_arg_val = serialized_arg.begin().value(); + + switch (schema_arg_type->kind()) { + case c10::TypeKind::TensorType: { + TORCH_CHECK( + serialized_arg_type == "as_tensor", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_tensor for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::TensorType, 1); + break; + } + case c10::TypeKind::IntType: { + TORCH_CHECK( + serialized_arg_type == "as_int", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_int for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + break; + } + case c10::TypeKind::SymIntType: { + TORCH_CHECK( + serialized_arg_type == "as_int" || + serialized_arg_type == "as_sym_int", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_int or as_sym_int for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back(index, 
DynamicArgType::IntType, 1); + break; + } + case c10::TypeKind::FloatType: { + TORCH_CHECK( + serialized_arg_type == "as_float", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_float for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; + } + case c10::TypeKind::BoolType: { + TORCH_CHECK( + serialized_arg_type == "as_bool", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_bool for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; + } + case c10::TypeKind::NumberType: { + if (serialized_arg_type == "as_int") { + // Only int Scalar is treated as dynamic arg for now + dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); + } else if (serialized_arg_type == "as_float") { + stack.at(index) = serialized_arg_val.get(); + } else if (serialized_arg_type == "as_bool") { + stack.at(index) = serialized_arg_val.get(); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a scalar input for argument ", + index, + " but got ", + serialized_arg_type); + } + break; + } + case c10::TypeKind::StringType: { + TORCH_CHECK( + serialized_arg_type == "as_string", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_string for argument ", + index, + " but got ", + serialized_arg_type); + stack.at(index) = serialized_arg_val.get(); + break; + } + case c10::TypeKind::DeviceObjType: { + TORCH_CHECK( + serialized_arg_type == "as_device", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_device for argument ", + index, + " but got ", + serialized_arg_type); + + std::string device_string = serialized_arg_val["type"].get(); + if (serialized_arg_val.contains("index") && + serialized_arg_val["index"].is_number()) { + device_string += ":" + serialized_arg_val["index"].get(); + } + + c10::Device device(device_string); + + if (device != *device_) { + VLOG(1) << "ProxyExecutor is using " << *device_ << " for " + << op_kernel.target_ << " argument #" << index + << ", which is different from the one serialized in thrift: " + << device << ". 
Please ensure this is intentional."; + } + + stack.at(index) = *device_; + break; + } + case c10::TypeKind::ListType: { + if (schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { + TORCH_CHECK( + serialized_arg_type == "as_tensors", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_tensors for argument ", + index, + " but got ", + serialized_arg_type); + TORCH_CHECK(serialized_arg_type == "as_tensors"); + dynamic_args.emplace_back( + index, DynamicArgType::ListTensorType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofInts())) { + TORCH_CHECK( + serialized_arg_type == "as_ints", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_ints for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { + TORCH_CHECK( + serialized_arg_type == "as_ints" || + serialized_arg_type == "as_sym_ints", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_ints or as_sym_ints for argument ", + index, + " but got ", + serialized_arg_type); + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofFloats())) { + TORCH_CHECK( + serialized_arg_type == "as_floats", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_floats for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); } - case c10::TypeKind::NumberType: { - if (serialized_arg_type == "as_int") { - // Only int Scalar is treated as dynamic arg for now - dynamic_args.emplace_back(index, DynamicArgType::IntType, 1); - } else if (serialized_arg_type == "as_float") { - stack.at(index) = serialized_arg_val.get(); - } else if (serialized_arg_type == "as_bool") { - stack.at(index) = serialized_arg_val.get(); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a scalar input for argument ", - index, - " but got ", - serialized_arg_type); - } - break; + stack.at(index) = std::move(ret); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofBools())) { + TORCH_CHECK( + serialized_arg_type == "as_bools", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_bools for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); } - case c10::TypeKind::StringType: { - TORCH_CHECK( - serialized_arg_type == "as_string", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_string for argument ", - index, - " but got ", - serialized_arg_type); - stack.at(index) = serialized_arg_val.get(); - break; + stack.at(index) = std::move(ret); + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofNumbers())) { + if (serialized_arg_type == "as_ints") { + dynamic_args.emplace_back( + index, DynamicArgType::ListIntType, serialized_arg_val.size()); + } else if (serialized_arg_type == "as_floats") { + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg); + } + stack.at(index) = std::move(ret); + } else if (serialized_arg_type == "as_bools") { + std::vector ret; + for (const auto& arg : 
serialized_arg_val) { + ret.push_back(arg); + } + stack.at(index) = std::move(ret); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a List[Scalar] input for argument ", + index, + " but got ", + serialized_arg_type); } - case c10::TypeKind::DeviceObjType: { - TORCH_CHECK( - serialized_arg_type == "as_device", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_device for argument ", - index, - " but got ", - serialized_arg_type); - - std::string device_string = serialized_arg_val["type"].get(); - if (serialized_arg_val.contains("index") && - serialized_arg_val["index"].is_number()) { - device_string += ":" + serialized_arg_val["index"].get(); - } - - c10::Device device(device_string); - - if (device != *device_) { - VLOG(1) << "ProxyExecutor is using " << *device_ << " for " - << op_kernel.target_ << " argument #" << index - << ", which is different from the one serialized in thrift: " - << device << ". Please ensure this is intentional."; - } - - stack.at(index) = *device_; - break; + } else if (schema_arg_type->isSubtypeOf( + at::ListType::ofOptionalTensors())) { + if (serialized_arg_type == "as_optional_tensors") { + std::vector list_item_types; + for (const auto& arg : serialized_arg_val) { + list_item_types.push_back(arg.begin().key()); + } + dynamic_args.emplace_back( + index, + DynamicArgType::ListOptionalTensorType, + serialized_arg_val.size(), + list_item_types); + } else if (serialized_arg_type == "as_tensors") { + dynamic_args.emplace_back( + index, DynamicArgType::ListTensorType, serialized_arg_val.size()); + } else { + TORCH_CHECK( + false, + "Expected extern kernel ", + op_kernel.target_, + " to have a Tensor?[] input for argument ", + index, + " but got ", + serialized_arg_type); } - case c10::TypeKind::ListType: { - if (schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { - TORCH_CHECK( - serialized_arg_type == "as_tensors", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_tensors for argument ", - index, - " but got ", - serialized_arg_type); - TORCH_CHECK(serialized_arg_type == "as_tensors"); - dynamic_args.emplace_back( - index, DynamicArgType::ListTensorType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofInts())) { - TORCH_CHECK( - serialized_arg_type == "as_ints", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_ints for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { - TORCH_CHECK( - serialized_arg_type == "as_ints" || - serialized_arg_type == "as_sym_ints", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_ints or as_sym_ints for argument ", - index, - " but got ", - serialized_arg_type); - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofFloats())) { - TORCH_CHECK( - serialized_arg_type == "as_floats", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_floats for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); - } - stack.at(index) = std::move(ret); - } else if 
(schema_arg_type->isSubtypeOf(at::ListType::ofBools())) { - TORCH_CHECK( - serialized_arg_type == "as_bools", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_bools for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); - } - stack.at(index) = std::move(ret); - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofNumbers())) { - if (serialized_arg_type == "as_ints") { - dynamic_args.emplace_back( - index, DynamicArgType::ListIntType, serialized_arg_val.size()); - } else if (serialized_arg_type == "as_floats") { - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg); - } - stack.at(index) = std::move(ret); - } else if (serialized_arg_type == "as_bools") { - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg); - } - stack.at(index) = std::move(ret); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a List[Scalar] input for argument ", - index, - " but got ", - serialized_arg_type); - } - } else if (schema_arg_type->isSubtypeOf( - at::ListType::ofOptionalTensors())) { - if (serialized_arg_type == "as_optional_tensors") { - std::vector list_item_types; - for (const auto& arg : serialized_arg_val) { - list_item_types.push_back(arg.begin().key()); - } - dynamic_args.emplace_back( - index, - DynamicArgType::ListOptionalTensorType, - serialized_arg_val.size(), - list_item_types); - } else if (serialized_arg_type == "as_tensors") { - dynamic_args.emplace_back( - index, DynamicArgType::ListTensorType, serialized_arg_val.size()); - } else { - TORCH_CHECK( - false, - "Expected extern kernel ", - op_kernel.target_, - " to have a Tensor?[] input for argument ", - index, - " but got ", - serialized_arg_type); - } - } else if (schema_arg_type->isSubtypeOf(at::ListType::ofStrings())) { - TORCH_CHECK( - serialized_arg_type == "as_strings", - "Expected extern kernel ", - op_kernel.target_, - " to have serialized argument type as_strings for argument ", - index, - " but got ", - serialized_arg_type); - std::vector ret; - for (const auto& arg : serialized_arg_val) { - ret.push_back(arg.get()); - } - stack.at(index) = std::move(ret); - } else { - TORCH_CHECK( - false, - "NYI: Unsupported list type ", - serialized_arg_type, - " for extern kernel ", - op_kernel.target_, - " argument ", - index); - } - break; + } else if (schema_arg_type->isSubtypeOf(at::ListType::ofStrings())) { + TORCH_CHECK( + serialized_arg_type == "as_strings", + "Expected extern kernel ", + op_kernel.target_, + " to have serialized argument type as_strings for argument ", + index, + " but got ", + serialized_arg_type); + std::vector ret; + for (const auto& arg : serialized_arg_val) { + ret.push_back(arg.get()); } - case c10::TypeKind::OptionalType: { - auto inner_type = - schema_arg_type->castRaw()->getElementType(); - - if (serialized_arg_type == "as_none") { - stack.at(index) = c10::IValue{}; - if (inner_type->kind() == c10::TypeKind::TensorType) { - // Tensor is None - dynamic_args.emplace_back(index, DynamicArgType::TensorType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::IntType || - inner_type->kind() == c10::TypeKind::SymIntType) { - // Int or SymInt is None - dynamic_args.emplace_back(index, DynamicArgType::IntType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::ListType && - schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { - // List[Tensor] is 
None - dynamic_args.emplace_back(index, DynamicArgType::ListTensorType, 0); - } else if ( - inner_type->kind() == c10::TypeKind::ListType && - schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { - // List[SymInt] is None - dynamic_args.emplace_back(index, DynamicArgType::ListIntType, 0); - } - } else { - prefill_stack_with_static_arguments( - index, inner_type, serialized_arg, op_kernel); - } - break; + stack.at(index) = std::move(ret); + } else { + TORCH_CHECK( + false, + "NYI: Unsupported list type ", + serialized_arg_type, + " for extern kernel ", + op_kernel.target_, + " argument ", + index); + } + break; + } + case c10::TypeKind::OptionalType: { + auto inner_type = + schema_arg_type->castRaw()->getElementType(); + + if (serialized_arg_type == "as_none") { + stack.at(index) = c10::IValue{}; + if (inner_type->kind() == c10::TypeKind::TensorType) { + // Tensor is None + dynamic_args.emplace_back(index, DynamicArgType::TensorType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::IntType || + inner_type->kind() == c10::TypeKind::SymIntType) { + // Int or SymInt is None + dynamic_args.emplace_back(index, DynamicArgType::IntType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::ListType && + schema_arg_type->isSubtypeOf(at::ListType::ofTensors())) { + // List[Tensor] is None + dynamic_args.emplace_back(index, DynamicArgType::ListTensorType, 0); + } else if ( + inner_type->kind() == c10::TypeKind::ListType && + schema_arg_type->isSubtypeOf(at::ListType::ofSymInts())) { + // List[SymInt] is None + dynamic_args.emplace_back(index, DynamicArgType::ListIntType, 0); } - default: - TORCH_CHECK( - false, - "Unsupported input type ", - serialized_arg_type, - " for extern kernel ", - op_kernel.target_, - " argument ", - index); + } else { + prefill_stack_with_static_arguments( + index, inner_type, serialized_arg, op_kernel); + } + break; } + default: + TORCH_CHECK( + false, + "Unsupported input type ", + serialized_arg_type, + " for extern kernel ", + op_kernel.target_, + " argument ", + index); + } } // Populates op_kernel.stack_, op_kernel.dynamic_args_ void OSSProxyExecutorNpu::get_input_info_from_serialized( const std::vector& schema_args, const nlohmann::json& serialized_node, - OSSOpKernel& op_kernel) -{ - std::vector filled(schema_args.size(), false); - TORCH_CHECK(op_kernel.stack_.size() == 0); - op_kernel.stack_.resize(schema_args.size()); - for (const auto& named_argument : serialized_node["inputs"]) { - const auto& arg = named_argument["arg"]; - const auto& name = named_argument["name"].get(); - - // Doing a linear lookup in the schema to find the index - // of a static argument. Should be fine performance wise - // because we usually only have small amount of arguments. - for (size_t index = 0; index < schema_args.size(); index++) { - auto& schema_arg = schema_args[index]; - if (schema_arg.name() == name) { - prefill_stack_with_static_arguments( - index, schema_arg.real_type(), arg, op_kernel); - filled[index] = true; - break; - } - } - } - - // If an argument is not filled and has a default value, we should - // also prefill the default value. 
- for (size_t index = 0; index < schema_args.size(); index++) { - if (!filled[index] && schema_args[index].default_value()) { - auto default_value = *schema_args[index].default_value(); - op_kernel.stack_.at(index) = default_value; - } - } + OSSOpKernel& op_kernel) { + std::vector filled(schema_args.size(), false); + TORCH_CHECK(op_kernel.stack_.size() == 0); + op_kernel.stack_.resize(schema_args.size()); + for (const auto& named_argument : serialized_node["inputs"]) { + const auto& arg = named_argument["arg"]; + const auto& name = named_argument["name"].get(); + + // Doing a linear lookup in the schema to find the index + // of a static argument. Should be fine performance wise + // because we usually only have small amount of arguments. + for (size_t index = 0; index < schema_args.size(); index++) { + auto& schema_arg = schema_args[index]; + if (schema_arg.name() == name) { + prefill_stack_with_static_arguments( + index, schema_arg.real_type(), arg, op_kernel); + filled[index] = true; + break; + } + } + } + + // If an argument is not filled and has a default value, we should + // also prefill the default value. + for (size_t index = 0; index < schema_args.size(); index++) { + if (!filled[index] && schema_args[index].default_value()) { + auto default_value = *schema_args[index].default_value(); + op_kernel.stack_.at(index) = default_value; + } + } } // Populates op_kernel.outputs_ void OSSProxyExecutorNpu::get_output_info_from_serialized( const std::vector& schema_returns, const nlohmann::json& serialized_node, - OSSOpKernel& op_kernel) -{ - std::vector& outputs = op_kernel.outputs_; - - TORCH_CHECK( - schema_returns.size() == serialized_node["outputs"].size(), - "Serialized node doesn't match operator ", - serialized_node["target"], - "'s schema outputs."); - - size_t output_index = 0; - for (const auto& serialized_output : serialized_node["outputs"]) { - TORCH_CHECK(serialized_output.size() == 1); - std::string serialized_output_type = serialized_output.begin().key(); - auto& serialized_output_val = serialized_output.begin().value(); - - auto& schema_return = schema_returns[output_index]; - const at::TypePtr& schema_return_type = schema_return.real_type(); - - switch (schema_return_type->kind()) { - case c10::TypeKind::TensorType: { - TORCH_CHECK( - serialized_output_type == "as_tensor", - "Expected extern kernel ", - serialized_node["target"], - " to have serialized output type as_tensor, ", - " but got ", - serialized_output_type); - outputs.emplace_back(output_index, DynamicArgType::TensorType, 1); - break; - } - case c10::TypeKind::ListType: { - if (schema_return_type->isSubtypeOf(at::ListType::ofTensors())) { - TORCH_CHECK( - serialized_output_type == "as_tensors", - "Expected extern kernel ", - serialized_node["target"], - " to have serialized output type as_tensors, ", - " but got ", - serialized_output_type); - outputs.emplace_back( - output_index, - DynamicArgType::ListTensorType, - serialized_output_val.size()); - } else { - TORCH_CHECK( - false, - "Unsupported return list type ", - schema_return_type->repr_str()); - } - break; - } - default: { - TORCH_CHECK( - false, - "Unsupported return type ", - schema_return_type->repr_str(), - " for extern kernel ", - op_kernel.target_); - } + OSSOpKernel& op_kernel) { + std::vector& outputs = op_kernel.outputs_; + + TORCH_CHECK( + schema_returns.size() == serialized_node["outputs"].size(), + "Serialized node doesn't match operator ", + serialized_node["target"], + "'s schema outputs."); + + size_t output_index = 0; + for (const auto& 
serialized_output : serialized_node["outputs"]) { + TORCH_CHECK(serialized_output.size() == 1); + std::string serialized_output_type = serialized_output.begin().key(); + auto& serialized_output_val = serialized_output.begin().value(); + + auto& schema_return = schema_returns[output_index]; + const at::TypePtr& schema_return_type = schema_return.real_type(); + + switch (schema_return_type->kind()) { + case c10::TypeKind::TensorType: { + TORCH_CHECK( + serialized_output_type == "as_tensor", + "Expected extern kernel ", + serialized_node["target"], + " to have serialized output type as_tensor, ", + " but got ", + serialized_output_type); + outputs.emplace_back(output_index, DynamicArgType::TensorType, 1); + break; + } + case c10::TypeKind::ListType: { + if (schema_return_type->isSubtypeOf(at::ListType::ofTensors())) { + TORCH_CHECK( + serialized_output_type == "as_tensors", + "Expected extern kernel ", + serialized_node["target"], + " to have serialized output type as_tensors, ", + " but got ", + serialized_output_type); + outputs.emplace_back( + output_index, + DynamicArgType::ListTensorType, + serialized_output_val.size()); + } else { + TORCH_CHECK( + false, + "Unsupported return list type ", + schema_return_type->repr_str()); } - - output_index++; + break; + } + default: { + TORCH_CHECK( + false, + "Unsupported return type ", + schema_return_type->repr_str(), + " for extern kernel ", + op_kernel.target_); + } } + + output_index++; + } } -OSSProxyExecutorNpu::OSSProxyExecutorNpu(const std::string& json_path, bool is_cpu) -{ - if (is_cpu) { - device_ = std::make_unique(c10::DeviceType::CPU); - } else { - int device_idx = -1; - device_ = std::make_unique(c10::DeviceType::CUDA, device_idx); - } +OSSProxyExecutorNpu::OSSProxyExecutorNpu(const std::string& json_path, bool is_cpu) { + if (is_cpu) { + device_ = std::make_unique(c10::DeviceType::CPU); + } else { + int device_idx = -1; + device_ = std::make_unique(c10::DeviceType::CUDA, device_idx); + } - std::string extern_kernel_nodes_serialized; + std::string extern_kernel_nodes_serialized; - std::ifstream json_file(json_path); - TORCH_CHECK(json_file.is_open(), "Unable to open file ", json_path); + std::ifstream json_file(json_path); + TORCH_CHECK(json_file.is_open(), "Unable to open file ", json_path); - // Parse file into a json object - nlohmann::json json_obj; - json_file >> json_obj; + // Parse file into a json object + nlohmann::json json_obj; + json_file >> json_obj; - // Access data - for (auto const& serialized_extern_node : json_obj["nodes"]) { - auto const& serialized_node = serialized_extern_node["node"]; + // Access data + for (auto const& serialized_extern_node : json_obj["nodes"]) { + auto const& serialized_node = serialized_extern_node["node"]; - const std::string& target = serialized_node["target"]; + const std::string& target = serialized_node["target"]; - std::string opName; - std::string overloadName; - size_t pos = target.find('.'); - if (pos == std::string::npos) { - opName = target; - overloadName = ""; - } else { - // There should be no more periods - size_t pos2 = target.find('.', pos + 1); - TORCH_CHECK(pos2 == std::string::npos); + std::string opName; + std::string overloadName; + size_t pos = target.find('.'); + if (pos == std::string::npos) { + opName = target; + overloadName = ""; + } else { + // There should be no more periods + size_t pos2 = target.find('.', pos + 1); + TORCH_CHECK(pos2 == std::string::npos); - opName = target.substr(0, pos); - overloadName = target.substr(pos + 1, target.length() - pos); - } + 
opName = target.substr(0, pos); + overloadName = target.substr(pos + 1, target.length() - pos); + } - c10::OperatorHandle op_handle = - c10::Dispatcher::singleton().findSchemaOrThrow( - opName.c_str(), overloadName.c_str()); - const c10::FunctionSchema& schema = op_handle.schema(); + c10::OperatorHandle op_handle = + c10::Dispatcher::singleton().findSchemaOrThrow( + opName.c_str(), overloadName.c_str()); + const c10::FunctionSchema& schema = op_handle.schema(); - const auto& schema_args = schema.arguments(); - const auto& schema_returns = schema.returns(); + const auto& schema_args = schema.arguments(); + const auto& schema_returns = schema.returns(); - OSSOpKernel op_kernel(target, op_handle); - get_input_info_from_serialized(schema_args, serialized_node, op_kernel); - get_output_info_from_serialized(schema_returns, serialized_node, op_kernel); + OSSOpKernel op_kernel(target, op_handle); + get_input_info_from_serialized(schema_args, serialized_node, op_kernel); + get_output_info_from_serialized(schema_returns, serialized_node, op_kernel); - op_kernels_.emplace_back(std::move(op_kernel)); - } + op_kernels_.emplace_back(std::move(op_kernel)); + } } void OSSProxyExecutorNpu::call_function( @@ -499,135 +494,134 @@ void OSSProxyExecutorNpu::call_function( int num_ints, int64_t* flatten_int_args, int num_tensors, - AtenTensorHandle* flatten_tensor_args) -{ - TORCH_CHECK( - extern_node_index < static_cast(op_kernels_.size()), - "Invalid extern node index"); - OSSOpKernel& op_kernel = op_kernels_[extern_node_index]; - - std::vector stack = op_kernel.stack_; - auto& dynamic_args = op_kernel.dynamic_args_; - - int tensor_id = 0; - int int_id = 0; - for (auto& dynamic_arg : dynamic_args) { - int arg_index = dynamic_arg.arg_index; - DynamicArgType dynamic_arg_type = dynamic_arg.arg_type; - int length = dynamic_arg.length; - - if (length == 0) { - continue; - } - - switch (dynamic_arg_type) { - case DynamicArgType::TensorType: { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - stack[arg_index] = *tensor; - break; - } - case DynamicArgType::IntType: { - int64_t val = flatten_int_args[int_id++]; - stack[arg_index] = val; - break; - } - case DynamicArgType::ListTensorType: { - std::vector tensor_list; - for (int j = 0; j < length; j++) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - tensor_list.push_back(*tensor); - } - stack[arg_index] = tensor_list; - break; - } - case DynamicArgType::ListOptionalTensorType: { - std::vector> optional_tensor_list; - auto& list_item_types = dynamic_arg.list_item_types; - TORCH_CHECK( - list_item_types.has_value(), - "Could not find list of item types for optional tensor list input"); - - for (const std::string& item_type : list_item_types.value()) { - if (item_type == "as_tensor") { - at::Tensor* tensor = tensor_handle_to_tensor_pointer( - flatten_tensor_args[tensor_id++]); - optional_tensor_list.emplace_back(*tensor); - } else if (item_type == "as_none") { - optional_tensor_list.emplace_back(std::nullopt); - } - } - stack[arg_index] = optional_tensor_list; - break; - } - case DynamicArgType::ListIntType: { - std::vector vals; - vals.reserve(length); - for (int j = 0; j < length; j++) { - vals.push_back(flatten_int_args[int_id++]); - } - stack[arg_index] = vals; - break; - } - default: - TORCH_CHECK(false, "Unsupported dynamic arg type: ", dynamic_arg_type); - } + AtenTensorHandle* flatten_tensor_args) { + TORCH_CHECK( + extern_node_index < static_cast(op_kernels_.size()), + 
"Invalid extern node index"); + OSSOpKernel& op_kernel = op_kernels_[extern_node_index]; + + std::vector stack = op_kernel.stack_; + auto& dynamic_args = op_kernel.dynamic_args_; + + int tensor_id = 0; + int int_id = 0; + for (auto& dynamic_arg : dynamic_args) { + int arg_index = dynamic_arg.arg_index; + DynamicArgType dynamic_arg_type = dynamic_arg.arg_type; + int length = dynamic_arg.length; + + if (length == 0) { + continue; } - int num_output_tensors = op_kernel.num_output_tensors(); - TORCH_CHECK( - tensor_id == num_tensors - num_output_tensors, - "Mismatch between tensors consumed and num of input tensor, got tensor_id = .", - tensor_id, - ", expected num = ", - num_tensors - num_output_tensors); - TORCH_CHECK( - int_id == num_ints, - "Mismatch between ints consumed and num_ints, got int_id = ", - int_id, - ", num_ints = ", - num_ints); - - // Call the op with the prepared stack. - const c10::OperatorHandle& op = op_kernel.op_handle_; - op.callBoxed(stack); - - const c10::FunctionSchema& schema = op.schema(); - const auto& schema_returns = schema.returns(); - - TORCH_CHECK(op_kernel.outputs_.size() == stack.size()); - TORCH_CHECK(stack.size() == schema_returns.size()); - - int index = 0; - for (const auto& schema_return : schema_returns) { - if (schema_return.type()->kind() == c10::TypeKind::TensorType) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - *tensor = stack[index++].toTensor(); - } else if ( - schema_return.type()->kind() == c10::TypeKind::ListType && - schema_return.type()->isSubtypeOf(at::ListType::ofTensors())) { - auto tensors = stack[index++].toTensorList(); - for (auto&& t : tensors) { - at::Tensor* tensor = - tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); - *tensor = t; - } - } else { - TORCH_CHECK( - false, - "NYI: Unsupported return type for schema: ", - schema_return.type()->repr_str()); + switch (dynamic_arg_type) { + case DynamicArgType::TensorType: { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + stack[arg_index] = *tensor; + break; + } + case DynamicArgType::IntType: { + int64_t val = flatten_int_args[int_id++]; + stack[arg_index] = val; + break; + } + case DynamicArgType::ListTensorType: { + std::vector tensor_list; + for (int j = 0; j < length; j++) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + tensor_list.push_back(*tensor); } + stack[arg_index] = tensor_list; + break; + } + case DynamicArgType::ListOptionalTensorType: { + std::vector> optional_tensor_list; + auto& list_item_types = dynamic_arg.list_item_types; + TORCH_CHECK( + list_item_types.has_value(), + "Could not find list of item types for optional tensor list input"); + + for (const std::string& item_type : list_item_types.value()) { + if (item_type == "as_tensor") { + at::Tensor* tensor = tensor_handle_to_tensor_pointer( + flatten_tensor_args[tensor_id++]); + optional_tensor_list.emplace_back(*tensor); + } else if (item_type == "as_none") { + optional_tensor_list.emplace_back(std::nullopt); + } + } + stack[arg_index] = optional_tensor_list; + break; + } + case DynamicArgType::ListIntType: { + std::vector vals; + vals.reserve(length); + for (int j = 0; j < length; j++) { + vals.push_back(flatten_int_args[int_id++]); + } + stack[arg_index] = vals; + break; + } + default: + TORCH_CHECK(false, "Unsupported dynamic arg type: ", dynamic_arg_type); } - - TORCH_CHECK( - tensor_id == num_tensors, - "Mismatch between tensors consumed and 
num_tensors, got tensor_id = ", - tensor_id, - ", expected num = ", - num_tensors); + } + + int num_output_tensors = op_kernel.num_output_tensors(); + TORCH_CHECK( + tensor_id == num_tensors - num_output_tensors, + "Mismatch between tensors consumed and num of input tensor, got tensor_id = .", + tensor_id, + ", expected num = ", + num_tensors - num_output_tensors); + TORCH_CHECK( + int_id == num_ints, + "Mismatch between ints consumed and num_ints, got int_id = ", + int_id, + ", num_ints = ", + num_ints); + + // Call the op with the prepared stack. + const c10::OperatorHandle& op = op_kernel.op_handle_; + op.callBoxed(stack); + + const c10::FunctionSchema& schema = op.schema(); + const auto& schema_returns = schema.returns(); + + TORCH_CHECK(op_kernel.outputs_.size() == stack.size()); + TORCH_CHECK(stack.size() == schema_returns.size()); + + int index = 0; + for (const auto& schema_return : schema_returns) { + if (schema_return.type()->kind() == c10::TypeKind::TensorType) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + *tensor = stack[index++].toTensor(); + } else if ( + schema_return.type()->kind() == c10::TypeKind::ListType && + schema_return.type()->isSubtypeOf(at::ListType::ofTensors())) { + auto tensors = stack[index++].toTensorList(); + for (auto&& t : tensors) { + at::Tensor* tensor = + tensor_handle_to_tensor_pointer(flatten_tensor_args[tensor_id++]); + *tensor = t; + } + } else { + TORCH_CHECK( + false, + "NYI: Unsupported return type for schema: ", + schema_return.type()->repr_str()); + } + } + + TORCH_CHECK( + tensor_id == num_tensors, + "Mismatch between tensors consumed and num_tensors, got tensor_id = ", + tensor_id, + ", expected num = ", + num_tensors); } } // namespace torch::aot_inductor \ No newline at end of file -- Gitee From 8664fc0d9b89ec12e4ea767f6fc14bca623a5e65 Mon Sep 17 00:00:00 2001 From: "zjx.com" Date: Tue, 1 Jul 2025 12:12:09 +0000 Subject: [PATCH 11/11] Init Signed-off-by: zjx.com --- torch_npu/csrc/distributed/Init.cpp | 79 +++++++++++++++++++---------- 1 file changed, 53 insertions(+), 26 deletions(-) diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 19ed1289a40..f7804ad15d6 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -104,7 +104,9 @@ public: inline std::vector cast_tensors(at::TensorList tensors) const { static auto cast_back_to_ori_format = [](const at::Tensor& t) { - return at_npu::native::custom_ops::npu_format_cast(t, torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); + return at_npu::native::custom_ops::npu_format_cast( + t, + torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); }; return c10::fmap(tensors, cast_back_to_ori_format); } @@ -164,7 +166,8 @@ void broadcast_coalesced( // multiple devices and can contain a mix of CPU and CUDA tensors. std::vector> buckets; std::tie(buckets, std::ignore) = - c10d_npu::compute_bucket_assignment_by_size(tensors.vec(), {buffer_size}); + c10d_npu::compute_bucket_assignment_by_size( + tensors.vec(), {buffer_size}); // Returns tensor at specified index in input tensor list. 
const auto lookup = [&tensors](size_t index) { return tensors[index]; }; @@ -195,8 +198,9 @@ void _register_comm_hook( py::object state, py::object comm_hook) { - reducer.register_comm_hook(std::make_unique<::c10d::PythonCommHook>( - std::move(state), std::move(comm_hook))); + reducer.register_comm_hook( + std::make_unique<::c10d::PythonCommHook>( + std::move(state), std::move(comm_hook))); } // Called from DDP's Python API to create a c10d C++ comm hook. @@ -229,9 +233,19 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) const c10::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); - return ::c10d_npu::compute_bucket_assignment_by_size(tensors, bucket_size_limits, expect_sparse_gradient, tensor_indices, {logger_weakref}); + return ::c10d_npu::compute_bucket_assignment_by_size( + tensors, + bucket_size_limits, + expect_sparse_gradient, + tensor_indices, + {logger_weakref}); } else { - return ::c10d_npu::compute_bucket_assignment_by_size(tensors, bucket_size_limits, expect_sparse_gradient, tensor_indices, {}); + return ::c10d_npu::compute_bucket_assignment_by_size( + tensors, + bucket_size_limits, + expect_sparse_gradient, + tensor_indices, + {}); } }, py::arg("tensors"), @@ -247,7 +261,8 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) const c10::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); - c10d_npu::verify_params_across_processes(process_group, params, {logger_weakref}); + c10d_npu::verify_params_across_processes( + process_group, params, {logger_weakref}); } else { c10d_npu::verify_params_across_processes(process_group, params, {}); } @@ -324,13 +339,16 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) -> void { reducer.prepare_for_backward({output}); }, py::call_guard()) .def("get_backward_stats", &c10d_npu::Reducer::get_backward_stats) - .def("_install_post_backward_futures", [](::c10d_npu::Reducer& reducer, const std::vector>& futs) { - c10::List> futures(c10::FutureType::create(c10::TensorType::get())); - for (const auto& fut : futs) { - futures.push_back(fut->fut); - } - reducer.install_futures(std::move(futures)); - }, + .def("_install_post_backward_futures", + [](::c10d_npu::Reducer& reducer, + const std::vector>& futs) { + c10::List> futures( + c10::FutureType::create(c10::TensorType::get())); + for (const auto& fut : futs) { + futures.push_back(fut->fut); + } + reducer.install_futures(std::move(futures)); + }, py::call_guard()) .def("_rebuild_buckets", &::c10d_npu::Reducer::rebuild_buckets, @@ -377,8 +395,9 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) }); py::module_ dist = py::module_::import("torch._C._distributed_c10d"); - auto processGroupHCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupHCCL>( - module, "ProcessGroupHCCL", dist.attr("Backend")) + auto processGroupHCCL = + intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupHCCL>( + module, "ProcessGroupHCCL", dist.attr("Backend")) .def(py::init&, int, int, @@ -413,8 +432,10 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) .def("set_watchdog_status", &::c10d_npu::ProcessGroupHCCL::setWatchdogStatus) .def("clear_workmeta_list", &::c10d_npu::ProcessGroupHCCL::clearWorkMetaList) .def("get_hccl_comm_name", - [](::c10d_npu::ProcessGroupHCCL& pg, int rankid, py::args args, py::kwargs kwargs) - -> std::string { + [](::c10d_npu::ProcessGroupHCCL& pg, + int rankid, + py::args args, + py::kwargs kwargs) 
-> std::string { bool init_comm = true; if (kwargs.contains("init_comm")) { init_comm = py::cast(kwargs["init_comm"]); @@ -426,7 +447,8 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) py::arg("peer") = -1) .def_property_readonly("options", &::c10d_npu::ProcessGroupHCCL::getOptions) .def("batch_isend_irecv", - [](::c10d_npu::ProcessGroupHCCL& pg, std::vector& op_type, + [](::c10d_npu::ProcessGroupHCCL& pg, + std::vector& op_type, std::vector& tensors, std::vector remote_rank_list) -> c10::intrusive_ptr { @@ -458,19 +480,22 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) &::c10d_npu::ProcessGroupHCCL::Options::is_high_priority_stream) .def_readwrite("global_ranks_in_group", &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group) - .def_readwrite("hccl_config", &::c10d_npu::ProcessGroupHCCL::Options::hccl_config) + .def_readwrite("hccl_config", + &::c10d_npu::ProcessGroupHCCL::Options::hccl_config) .def_readwrite("group_id", &::c10d_npu::ProcessGroupHCCL::Options::group_id); // bind for ProcessGroupLCCL - auto processGroupLCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupLCCL>( - module, "ProcessGroupLCCL", dist.attr("Backend")) + auto processGroupLCCL = + intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupLCCL>( + module, "ProcessGroupLCCL", dist.attr("Backend")) .def(py::init&, int, int>(), py::call_guard()); auto cDist = py::module_::import("torch._C._distributed_c10d"); - auto parallelStore = intrusive_ptr_no_gil_destructor_class_<::c10d::ParallelTcpStore>( - module, "ParallelStore", cDist.attr("Store"), R"( + auto parallelStore = + intrusive_ptr_no_gil_destructor_class_<::c10d::ParallelTcpStore>( + module, "ParallelStore", cDist.attr("Store"), R"( A TCP-Parallel-Epoll-based distributed key-value store implementation. The server store holds the data, while the client stores can connect to the server store over TCP and perform actions such as :meth:`~torch.distributed.store.set` to insert a key-value @@ -530,8 +555,10 @@ Example:: if (worldSize > -1) { numWorkers = static_cast(worldSize); } - ::c10d::TCPStoreOptions opts{port, isServer, numWorkers, waitWorkers, timeout, multiTenant}; - return c10::make_intrusive<::c10d::ParallelTcpStore>(host, agentRun, agentPid, enableTiered, opts); + ::c10d::TCPStoreOptions opts{ + port, isServer, numWorkers, waitWorkers, timeout, multiTenant}; + return c10::make_intrusive<::c10d::ParallelTcpStore>( + host, agentRun, agentPid, enableTiered, opts); }), py::arg("host") = "127.0.0.1", py::arg("port") = 29500, -- Gitee
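The Aclmdl*/Aclrt* wrappers touched earlier in this series all share one idiom: the ACL entry point is looked up once through GET_FUNC, cached in a function-local static, and only then invoked. The following is a minimal, self-contained sketch of that idiom, not the project's GET_FUNC macro itself; the dlopen of "libascendcl.so", the wrapper name, and the placeholder typedefs are illustrative assumptions, since the real code resolves symbols through its own macro and the CANN headers.

    // Sketch only: lazy "resolve once, cache the pointer" lookup of an ACL symbol.
    // Assumptions: library name "libascendcl.so", placeholder CANN typedefs.
    #include <dlfcn.h>
    #include <stdexcept>

    using aclError = int;       // placeholder for the real CANN type
    using aclrtStream = void*;  // placeholder for the real CANN type

    static void* LoadAclSymbol(const char* name)
    {
        // Open the runtime library once and reuse the handle for every lookup.
        static void* handle = dlopen("libascendcl.so", RTLD_LAZY);
        return handle ? dlsym(handle, name) : nullptr;
    }

    aclError CaptureTaskGrpBeginSketch(aclrtStream stream)
    {
        typedef aclError (*Fn)(aclrtStream);
        // Resolved on first call and cached; later calls only pay a null check.
        static Fn func = reinterpret_cast<Fn>(LoadAclSymbol("aclmdlRICaptureTaskGrpBegin"));
        if (func == nullptr) {
            throw std::runtime_error("Failed to find function aclmdlRICaptureTaskGrpBegin");
        }
        return func(stream);
    }

Because the static initializer runs once per wrapper, repeated invocations reduce to a cached function-pointer call, which is why the patched wrappers can afford a TORCH_CHECK on every entry.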