diff --git a/torch_npu/contrib/function/roll.py b/torch_npu/contrib/function/roll.py index 97037c8d0754427942742c28de5ba0f6568ee3ac..550064e693c5ce5fc1110ef7f2b666ceb739a531 100644 --- a/torch_npu/contrib/function/roll.py +++ b/torch_npu/contrib/function/roll.py @@ -30,7 +30,7 @@ _roll_with_index_select = _RollWithIndexSelect.apply def _get_roll_index(H, W, shifts, device='cpu'): index = torch.arange(0, H * W).reshape(H, W) index_fp = torch.roll(index, shifts=shifts, dims=(0, 1)).reshape(-1).long() - index_bp_dict = {i:idx for idx, i in enumerate(index_fp.numpy().tolist())} + index_bp_dict = {i: idx for idx, i in enumerate(index_fp.numpy().tolist())} index_bp_list = [index_bp_dict[i] for i in range(H * W)] index_bp = torch.LongTensor(index_bp_list) return [index_fp.to(device), index_bp.to(device)] diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp index bcb89e6c88354a1a290b320c0d9003417b0339f7..c5f6f913d44632c09ebea3978b0f1bdbccbaf82a 100644 --- a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp +++ b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp @@ -93,8 +93,10 @@ void GetExclusiveAffinityCPU() offset = find_offset->second; } c10_npu::CoreIdRange cpu_range = parseAffinityCPU(affinity_cpu); - int length = (cpu_range.end - cpu_range.start + 1) / same_num; - c10_npu::CoreIdRange exclusiveAffinityCpu = {cpu_range.start + offset * length, (cpu_range.start + length - 1) + offset * length}; + unsigned int length = (cpu_range.end - cpu_range.start + 1) / static_cast<unsigned int>(same_num); + c10_npu::CoreIdRange exclusiveAffinityCpu = { + cpu_range.start + static_cast<unsigned int>(offset) * length, + (cpu_range.start + length - 1) + static_cast<unsigned int>(offset) * length}; offsetMap[affinity_cpu] = offset + 1; CardIdAffinityCPU[card_id] = exclusiveAffinityCpu; } diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index 8916a70fc9998b2562a6471fd2cae992086fbe7a..cc817f5506ae2e654caec02cd1a7bc1923382152 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -123,7 +123,7 @@ int64_t DriverVersionToNum(std::string versionStr) ((release + 1) * 10000) + ((RCVersion + 1) * 100 + 5000) + ((TVersion + 1) * 100) - - (alphaVersion ? 1 : 0) * (100 - alphaVersion) + + (alphaVersion != 0 ? 1 : 0) * (100 - alphaVersion) + (bVersion - 1) + patch; return num; } diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index d3425f6f44b63a8467a0e56174058836796bce9c..5517812a0108c265a5171213f09631b335732351 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -2502,7 +2502,7 @@ private: // Repeat GC until we reach reclaim > target size. bool block_freed = true; - while (gc_reclaimed < target_size && block_freed == true && freeable_block_count > 0) { + while (gc_reclaimed < target_size && block_freed && freeable_block_count > 0) { // Free blocks exceeding this age threshold first. double age_threshold = total_age / freeable_block_count; // Stop iteration if we can no longer free a block. diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index c33f51fbc895f989609cd6ef0953678d7b0e1cdf..a4e14d2232ab30f7a3cd4e991c904f404b18f6a5 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -23,8 +23,8 @@ C10_NPU_API std::mutex* getFreeMutex(); // block inside of already allocated area.
class FreeMemoryCallback { public: - virtual ~FreeMemoryCallback(){}; - virtual bool Execute() = 0; + virtual ~FreeMemoryCallback(){}; + virtual bool Execute() = 0; }; C10_DECLARE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); diff --git a/torch_npu/csrc/core/npu/NPUEventManager.h b/torch_npu/csrc/core/npu/NPUEventManager.h index c01491aa033752413dd880329445a9eb2d8556e2..ac7f0176e0f52daf9f88fdd39bdb2f5b0d546f5b 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.h +++ b/torch_npu/csrc/core/npu/NPUEventManager.h @@ -22,7 +22,7 @@ public: void DecreaseUnrecordedCount(aclrtEvent event); bool IsEventRecorded(aclrtEvent event); void ClearUnrecordedCount(); - ~NPUEventManager() {} + ~NPUEventManager() {} private: void run(aclrtEvent event); diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 9c667f1fdb120c30696c49c1a911702332ffb54b..5d130ec760dbee43d428b3e4b5a94c5c54d534a5 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -132,10 +132,12 @@ const std::string c10_npu_check_error_message(std::string& errmsg) std::regex ws_regex("[\\s\\t\\n\\r]+"); content = std::regex_replace(content, ws_regex, " "); - if (!content.empty() && content.front() == ' ') + if (!content.empty() && content.front() == ' ') { content.erase(0, 1); - if (!content.empty() && content.back() == ' ') + } + if (!content.empty() && content.back() == ' ') { content.pop_back(); + } return content; } diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 73e2bb7ca1d19b06af7bd6b1c34861ce7a5467c7..bd29315e057b8e14ee9189bde7c802f3e73558b9 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -249,7 +249,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) // occur. 
#ifndef BUILD_LIBTORCH PyThreadState *gilState = nullptr; - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { gilState = PyEval_SaveThread(); } #endif @@ -523,7 +523,7 @@ void Repository::Enqueue(void *cur_paras) uint64_t u = 1; SetWriteWorking(true); - while (ret == false && (GetStatus() == RUN || GetStatus() == INIT)) { + while (!ret && (GetStatus() == RUN || GetStatus() == INIT)) { ret = WriteQueue(cur_paras); if (ret == false) { SetWriteWorking(false); @@ -531,7 +531,7 @@ void Repository::Enqueue(void *cur_paras) if (IsFullQueue()) { #ifndef BUILD_LIBTORCH // double check the current thread hold a Gil lock - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { Py_BEGIN_ALLOW_THREADS s = eventfd_read(efd_write, &u); Py_END_ALLOW_THREADS } else { diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 35e6e526b17c08fa40bbf9259ae2d7e666e0d5cc..4411760ab4497d45c9237fc67c34498196b2b603 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -259,7 +259,7 @@ static uint32_t get_idx(std::atomic& counter) { auto raw_idx = counter++; static int StreamsPerPool = GetStreamsPerPool(); - return raw_idx % StreamsPerPool; + return raw_idx % static_cast(StreamsPerPool); } static uint32_t get_sync_launch_stream_idx(std::atomic& counter) diff --git a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp index 084f1df577a58ba352c76361211f5741df8ab4ef..39d19b0b628e1a191aab9ff4182a7e5bd1f6c657 100644 --- a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp @@ -47,7 +47,7 @@ void* registerSvmMem(void* ptr, size_t size) void* mallocHostSwapMemory(size_t size) { if (!initialized) { - kAlignSize = sysconf(_SC_PAGESIZE); + kAlignSize = static_cast(sysconf(_SC_PAGESIZE)); initialized = true; } size = (size + kAlignSize - 1) & ~(kAlignSize - 1); diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp index 4e0fce02fb311d239c95a913939e41b554333b02..3fedb9d387ef61702a7414912b5572a8e187e7cd 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.cpp +++ b/torch_npu/csrc/core/npu/NpuVariables.cpp @@ -39,7 +39,8 @@ static std::map socVersionMap = { {"Ascend910_9372", SocVersion::Ascend910_9372}, {"Ascend910_9362", SocVersion::Ascend910_9362}}; -void SetSocVersion(const char* const socVersion) { +void SetSocVersion(const char* const socVersion) +{ if (socVersion == nullptr || g_curSocVersion != SocVersion::UnsupportedSocVersion) { return; diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index d85dd2211dae3d89134fff556cd55b95e36b963b..0ac77517aa9d965731f73ae41e934ab9b66db525 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -410,7 +410,7 @@ aclError AclrtSynchronizeStreamWithTimeout(aclrtStream stream) { } TORCH_CHECK(func_backup, "Failed to find function", "aclrtSynchronizeStreamWithTimeout and aclrtSynchronizeStream", PROF_ERROR(ErrCode::NOT_FOUND)); return func_backup(stream); - } + } } aclError AclrtDestroyStreamForce(aclrtStream stream) { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 3564e5196cc3ce6bce26f9c67850d0e2cc425ef5..4b6707b8495b9a89c500e06f160b42950e9ae6fb 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ 
b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -277,8 +277,8 @@ void NpuSysCtrl::RegisterLazyFn(const option::OptionCallBack& call_, const std:: lazy_fn_.emplace_back(std::make_pair(call_, in)); } -void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, - ReleasePriority priority) { +void NpuSysCtrl::RegisterReleaseFn(ReleaseFn release_fn, ReleasePriority priority) +{ const auto& iter = this->release_fn_.find(priority); if (iter != release_fn_.end()) { release_fn_[priority].emplace_back(release_fn); diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 99c6dc6f22aee259c44014bdddc383c092a0fa87..5c3a7729907ab060521f24fb9d941e5d1fe966c8 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -576,7 +576,7 @@ Example:: Default settings return everything - i.e. contains HCCL comm dumps and collective traces. )"); - Py_RETURN_TRUE; + Py_RETURN_TRUE; } // c10d methods on torch._C diff --git a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp index c6b0e64641a65ee303b95e8dd3a8c87883bbcd4a..1626499a80b2870fb4cad1c8c726595e003212ff 100644 --- a/torch_npu/csrc/framework/utils/ForceAclnnList.cpp +++ b/torch_npu/csrc/framework/utils/ForceAclnnList.cpp @@ -18,35 +18,36 @@ namespace at_npu { namespace native { +void ForceAclnn::RegisterOp(const std::string &list) +{ + if (list.empty()) { + return; + } -void ForceAclnn::RegisterOp(const std::string &list) { - if (list.empty()) { - return; - } - - auto value = list; - std::string delimiter = ","; - auto start = 0U; - auto end = value.find(delimiter); - std::string token; - while (end != std::string::npos) { + auto value = list; + std::string delimiter = ","; + auto start = 0U; + auto end = value.find(delimiter); + std::string token; + while (end != std::string::npos) { + token = value.substr(start, end - start); + if (!token.empty()) { + force_aclnn_op_list_.insert(token); + } + start = end + delimiter.size(); + end = value.find(delimiter, start); + } token = value.substr(start, end - start); if (!token.empty()) { - force_aclnn_op_list_.insert(token); + force_aclnn_op_list_.insert(token); } - start = end + delimiter.size(); - end = value.find(delimiter, start); - } - token = value.substr(start, end - start); - if (!token.empty()) { - force_aclnn_op_list_.insert(token); - } - return; + return; } -bool ForceAclnn::IsForceAclnnOp(const std::string &op_name) const { - bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); - return ret; +bool ForceAclnn::IsForceAclnnOp(const std::string &op_name) const +{ + bool ret = (force_aclnn_op_list_.find(op_name) != force_aclnn_op_list_.end()); + return ret; } } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.h b/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.h index 3c839aeb68d73cd347db637892518c32534842ac..57c8e71e7fbb7c9e06c57ece3f54f2c254476dee 100644 --- a/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.h +++ b/torch_npu/csrc/inductor/aoti_runner/model_container_runner_npu.h @@ -8,32 +8,25 @@ namespace torch::inductor { // NOTICE: Following APIs are subject to change due to active development // We provide NO BC guarantee for these APIs class TORCH_API AOTIModelContainerRunnerNpu : public AOTIModelContainerRunner { - public: - // @param device_str: cuda device string, e.g. 
"cuda", "cuda:0" - AOTIModelContainerRunnerNpu( - const std::string& model_so_path, - size_t num_models = 1, - const std::string& device_str = "npu", - const std::string& cubin_dir = ""); +public: + // @param device_str: cuda device string, e.g. "cuda", "cuda:0" + AOTIModelContainerRunnerNpu(const std::string &model_so_path, size_t num_models = 1, + const std::string &device_str = "npu", const std::string &cubin_dir = ""); - ~AOTIModelContainerRunnerNpu(); + ~AOTIModelContainerRunnerNpu(); - std::vector run( - const std::vector& inputs, - void* stream_handle = nullptr) override; + std::vector run(const std::vector &inputs, void *stream_handle = nullptr) override; - std::vector run_with_npu_stream( - std::vector& inputs, - c10_npu::NPUStream npu_stream); + std::vector run_with_npu_stream(std::vector &inputs, c10_npu::NPUStream npu_stream); - void init_proxy_executor(); + void init_proxy_executor(); - void set_proxy_executor(AOTIProxyExecutorHandle handle); + void set_proxy_executor(AOTIProxyExecutorHandle handle); - private: - std::string model_so_path_; - bool init_flag_; - std::unique_ptr proxy_executor_npu_; +private: + std::string model_so_path_; + bool init_flag_; + std::unique_ptr proxy_executor_npu_; }; } // namespace torch::inductor \ No newline at end of file diff --git a/torch_npu/csrc/inductor/aoti_runtime/arrayref_tensor.h b/torch_npu/csrc/inductor/aoti_runtime/arrayref_tensor.h index c5a989b5ca9397eb6226e85aa6b9d9b6517fe3cb..7659b3d2217a0a76980383f9244afff3b315af82 100644 --- a/torch_npu/csrc/inductor/aoti_runtime/arrayref_tensor.h +++ b/torch_npu/csrc/inductor/aoti_runtime/arrayref_tensor.h @@ -14,371 +14,341 @@ namespace torch::aot_inductor { // adapted. template class MiniArrayRef final { - public: - using iterator = T*; - using const_iterator = const T*; - using size_type = size_t; - using value_type = T; - - using reverse_iterator = std::reverse_iterator; - - private: - /// The start of the array, in an external buffer. - T* Data; - - /// The number of elements. - size_type Length; - - public: - /// @name Constructors - /// @{ - - /// Construct an empty MiniArrayRef. - /* implicit */ constexpr MiniArrayRef() : Data(nullptr), Length(0) {} - - /// Construct an MiniArrayRef from a single element. - // TODO Make this explicit - constexpr MiniArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} - - /// Construct an MiniArrayRef from a pointer and length. - constexpr MiniArrayRef(T* data, size_t length) : Data(data), Length(length) {} - - /// Construct an MiniArrayRef from a range. - constexpr MiniArrayRef(T* begin, T* end) : Data(begin), Length(end - begin) {} - - template < - typename Container, - typename = std::enable_if_t().data())>, - T*>>> - /* implicit */ MiniArrayRef(Container& container) - : Data(container.data()), Length(container.size()) {} - - /// Construct an MiniArrayRef from a std::vector. - // The enable_if stuff here makes sure that this isn't used for - // std::vector, because MiniArrayRef can't work on a std::vector - // bitfield. - template - /* implicit */ MiniArrayRef(const std::vector& Vec) - : Data(Vec.data()), Length(Vec.size()) { - static_assert( - !std::is_same_v, - "MiniArrayRef cannot be constructed from a std::vector bitfield."); - } - - /// Construct an MiniArrayRef from a std::array - template - /* implicit */ constexpr MiniArrayRef(std::array& Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an MiniArrayRef from a C array. 
- template - // NOLINTNEXTLINE(*c-array*) - /* implicit */ constexpr MiniArrayRef(T (&Arr)[N]) : Data(Arr), Length(N) {} - - // /// Construct an MiniArrayRef from an empty C array. - /* implicit */ constexpr MiniArrayRef(const volatile void* Arr) - : Data(nullptr), Length(0) {} - - /// Construct an MiniArrayRef from a std::initializer_list. - /* implicit */ constexpr MiniArrayRef(const std::initializer_list& Vec) - : Data( - std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) - : std::begin(Vec)), - Length(Vec.size()) {} - - /// @} - /// @name Simple Operations - /// @{ - - constexpr iterator begin() const { - return Data; - } - constexpr iterator end() const { - return Data + Length; - } - - // These are actually the same as iterator, since MiniArrayRef only - // gives you const iterators. - constexpr const_iterator cbegin() const { - return Data; - } - constexpr const_iterator cend() const { - return Data + Length; - } - - constexpr reverse_iterator rbegin() const { - return reverse_iterator(end()); - } - constexpr reverse_iterator rend() const { - return reverse_iterator(begin()); - } - - /// empty - Check if the array is empty. - constexpr bool empty() const { - return Length == 0; - } - - constexpr T* data() const { - return Data; - } - - /// size - Get the array size. - constexpr size_t size() const { - return Length; - } - - /// equals - Check for element-wise equality. - constexpr bool equals(MiniArrayRef RHS) const { - return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); - } - - /// @} - /// @name Operator Overloads - /// @{ - constexpr const T& operator[](size_t Index) const { - return Data[Index]; - } - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - std::enable_if_t, MiniArrayRef>& operator=( - // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) - U&& Temporary) = delete; - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - std::enable_if_t, MiniArrayRef>& operator=( - std::initializer_list) = delete; +public: + using iterator = T *; + using const_iterator = const T *; + using size_type = size_t; + using value_type = T; + + using reverse_iterator = std::reverse_iterator; + +private: + // / The start of the array, in an external buffer. + T *Data; + + // / The number of elements. + size_type Length; + +public: + // / @name Constructors + // / @{ + + // / Construct an empty MiniArrayRef. + /* implicit */ constexpr MiniArrayRef() : Data(nullptr), Length(0) {} + + // / Construct an MiniArrayRef from a single element. + // TODO Make this explicit + constexpr MiniArrayRef(const T &OneElt) : Data(&OneElt), Length(1) {} + + // / Construct an MiniArrayRef from a pointer and length. + constexpr MiniArrayRef(T *data, size_t length) : Data(data), Length(length) {} + + // / Construct an MiniArrayRef from a range. + constexpr MiniArrayRef(T *begin, T *end) : Data(begin), Length(end - begin) {} + + template ().data())>, T *>>> + /* implicit */ MiniArrayRef(Container &container) : Data(container.data()), Length(container.size()) + {} + + // / Construct an MiniArrayRef from a std::vector. + // The enable_if stuff here makes sure that this isn't used for + // std::vector, because MiniArrayRef can't work on a std::vector + // bitfield. 
+ template + /* implicit */ MiniArrayRef(const std::vector &Vec) : Data(Vec.data()), Length(Vec.size()) + { + static_assert(!std::is_same_v, + "MiniArrayRef cannot be constructed from a std::vector bitfield."); + } + + // / Construct an MiniArrayRef from a std::array + template + /* implicit */ constexpr MiniArrayRef(std::array &Arr) : Data(Arr.data()), Length(N) + {} + + // / Construct an MiniArrayRef from a C array. + template + // NOLINTNEXTLINE(*c-array*) + /* implicit */ constexpr MiniArrayRef(T (&Arr)[N]) : Data(Arr), Length(N) + {} + + // /// Construct an MiniArrayRef from an empty C array. + /* implicit */ constexpr MiniArrayRef(const volatile void *Arr) : Data(nullptr), Length(0) {} + + // / Construct an MiniArrayRef from a std::initializer_list. + /* implicit */ constexpr MiniArrayRef(const std::initializer_list &Vec) + : Data(std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) : std::begin(Vec)), Length(Vec.size()) + {} + + // / @} + // / @name Simple Operations + // / @{ + + constexpr iterator begin() const + { + return Data; + } + constexpr iterator end() const + { + return Data + Length; + } + + // These are actually the same as iterator, since MiniArrayRef only + // gives you const iterators. + constexpr const_iterator cbegin() const + { + return Data; + } + constexpr const_iterator cend() const + { + return Data + Length; + } + + constexpr reverse_iterator rbegin() const + { + return reverse_iterator(end()); + } + constexpr reverse_iterator rend() const + { + return reverse_iterator(begin()); + } + + // / empty - Check if the array is empty. + constexpr bool empty() const + { + return Length == 0; + } + + constexpr T *data() const + { + return Data; + } + + // / size - Get the array size. + constexpr size_t size() const + { + return Length; + } + + // / equals - Check for element-wise equality. + constexpr bool equals(MiniArrayRef RHS) const + { + return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); + } + + // / @} + // / @name Operator Overloads + // / @{ + constexpr const T &operator[](size_t Index) const + { + return Data[Index]; + } + + // / Disallow accidental assignment from a temporary. + // / + // / The declaration here is extra complicated so that "arrayRef = {}" + // / continues to select the move assignment operator. + template + std::enable_if_t, MiniArrayRef> &operator = ( + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + U &&Temporary) = delete; + + // / Disallow accidental assignment from a temporary. + // / + // / The declaration here is extra complicated so that "arrayRef = {}" + // / continues to select the move assignment operator. 
+ template + std::enable_if_t, MiniArrayRef> &operator = (std::initializer_list) = delete; }; using MiniIntArrayRef = MiniArrayRef; -static_assert( - sizeof(MiniIntArrayRef) == sizeof(void*) + sizeof(size_t), +static_assert(sizeof(MiniIntArrayRef) == sizeof(void *) + sizeof(size_t), "changing the size of MiniArrayRef breaks ABI compatibility!"); -inline bool is_contiguous_strides_for_shape( - int64_t ndim, - const int64_t* strides_ptr, - const int64_t* sizes_ptr) { - int64_t z = 1; - for (int64_t d = ndim - 1; d >= 0; d--) { - const auto& size_d = sizes_ptr[d]; - if (size_d != 1) { - if (strides_ptr[d] == z) { - z *= size_d; - } else { - return false; - } +inline bool is_contiguous_strides_for_shape(int64_t ndim, const int64_t *strides_ptr, const int64_t *sizes_ptr) +{ + int64_t z = 1; + for (int64_t d = ndim - 1; d >= 0; d--) { + const auto &size_d = sizes_ptr[d]; + if (size_d != 1) { + if (strides_ptr[d] == z) { + z *= size_d; + } else { + return false; + } + } } - } - return true; + return true; } // Shim for AOTI generated code to pretend a raw array works like an // AtenTensorHandle. -template -class ArrayRefTensor { - public: - ArrayRefTensor() = default; - - explicit ArrayRefTensor( - MiniArrayRef arr, - MiniArrayRef sizes, - MiniArrayRef strides, - int32_t device_type, - int32_t device_idx) - : arrayRef_(arr), - sizes_(sizes), - strides_(strides), - device_type_(device_type), - device_idx_(device_idx) { - assert(sizes.size() == strides.size()); - assert(is_contiguous_strides_for_shape( - sizes.size(), strides.data(), sizes.data())); - } - - AtenTensorHandle expensiveCopyToTensor() const { - AtenTensorHandle result = nullptr; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided( - sizes_.size(), - sizes_.data(), - strides_.data(), - aoti_torch_dtype>(), - device_type_, - device_idx_, - &result)); - void* dataPtr = nullptr; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr(result, &dataPtr)); - std::memcpy(dataPtr, data(), numel() * sizeof(T)); - return result; - } - - // We need to look the same as RAIIAtenTensorHandle, which returns - // an owning AtenTensorHandle from release(). So, we allocate one! - AtenTensorHandle release() { - return expensiveCopyToTensor(); - } - - AtenTensorHandle borrowAsTensor() const { - AtenTensorHandle result = nullptr; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( - data(), - sizes_.size(), - sizes_.data(), - strides_.data(), - 0, - aoti_torch_dtype>(), - device_type_, - device_idx_, - &result, - aoti_torch_layout_strided(), - nullptr, - 0)); - return result; - } - - // We don't need to free any memory. - void reset() {} - - auto sizes() const { - return sizes_; - } - - auto strides() const { - return strides_; - } - - auto device_type() const { - return device_type_; - } - - auto device_idx() const { - return device_idx_; - } - - T* data() const { - return arrayRef_.data(); - } - - auto numel() const { - return arrayRef_.size(); - } - - void set_arrayref(MiniArrayRef new_arrayref) { - arrayRef_ = new_arrayref; - } - - private: - MiniArrayRef arrayRef_; - // We expect generated code to have statically available sizes & - // strides for us. - MiniArrayRef sizes_; - MiniArrayRef strides_; - int32_t device_type_ = 0; - int32_t device_idx_ = 0; - // We continue to zero-initialize this field in case we repurpose - // the space later; having predictable contents can only help. 
- int32_t unusedDoNotRemoveForABICompatibility_ = 0; +template class ArrayRefTensor { +public: + ArrayRefTensor() = default; + + explicit ArrayRefTensor(MiniArrayRef arr, MiniArrayRef sizes, MiniArrayRef strides, + int32_t device_type, int32_t device_idx) + : arrayRef_(arr), sizes_(sizes), strides_(strides), device_type_(device_type), device_idx_(device_idx) + { + assert(sizes.size() == strides.size()); + assert(is_contiguous_strides_for_shape(sizes.size(), strides.data(), sizes.data())); + } + + AtenTensorHandle expensiveCopyToTensor() const + { + AtenTensorHandle result = nullptr; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(sizes_.size(), sizes_.data(), strides_.data(), + aoti_torch_dtype>(), device_type_, device_idx_, &result)); + void *dataPtr = nullptr; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr(result, &dataPtr)); + std::memcpy(dataPtr, data(), numel() * sizeof(T)); + return result; + } + + // We need to look the same as RAIIAtenTensorHandle, which returns + // an owning AtenTensorHandle from release(). So, we allocate one! + AtenTensorHandle release() + { + return expensiveCopyToTensor(); + } + + AtenTensorHandle borrowAsTensor() const + { + AtenTensorHandle result = nullptr; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2(data(), sizes_.size(), sizes_.data(), + strides_.data(), 0, aoti_torch_dtype>(), device_type_, device_idx_, &result, + aoti_torch_layout_strided(), nullptr, 0)); + return result; + } + + // We don't need to free any memory. + void reset() {} + + auto sizes() const + { + return sizes_; + } + + auto strides() const + { + return strides_; + } + + auto device_type() const + { + return device_type_; + } + + auto device_idx() const + { + return device_idx_; + } + + T *data() const + { + return arrayRef_.data(); + } + + auto numel() const + { + return arrayRef_.size(); + } + + void set_arrayref(MiniArrayRef new_arrayref) + { + arrayRef_ = new_arrayref; + } + +private: + MiniArrayRef arrayRef_; + // We expect generated code to have statically available sizes & + // strides for us. + MiniArrayRef sizes_; + MiniArrayRef strides_; + int32_t device_type_ = 0; + int32_t device_idx_ = 0; + // We continue to zero-initialize this field in case we repurpose + // the space later; having predictable contents can only help. + int32_t unusedDoNotRemoveForABICompatibility_ = 0; }; -static_assert( - sizeof(ArrayRefTensor) == - 3 * sizeof(MiniIntArrayRef) + 3 * sizeof(int32_t) + - (alignof(ArrayRefTensor) > 4 ? sizeof(int32_t) : 0), +static_assert(sizeof(ArrayRefTensor) == + 3 * sizeof(MiniIntArrayRef) + 3 * sizeof(int32_t) + (alignof(ArrayRefTensor) > 4 ? sizeof(int32_t) : 0), "changing the size of ArrayRefTensor breaks ABI compatibility!"); template -inline ArrayRefTensor reinterpret_tensor_wrapper( - const ArrayRefTensor& self, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset) { - // REVIEW: we should add a way to build the DSO in debug mode during - // tests so we can have checks like this! 
- assert(is_contiguous_strides_for_shape(ndim, strides_ptr, sizes_ptr)); - return ArrayRefTensor( - MiniArrayRef( - self.data() + storage_offset, self.numel() - storage_offset), - MiniArrayRef(sizes_ptr, ndim), - MiniArrayRef(strides_ptr, ndim), - self.device_type(), - self.device_idx()); +inline ArrayRefTensor reinterpret_tensor_wrapper(const ArrayRefTensor &self, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset) +{ + // REVIEW: we should add a way to build the DSO in debug mode during + // tests so we can have checks like this! + assert(is_contiguous_strides_for_shape(ndim, strides_ptr, sizes_ptr)); + return ArrayRefTensor(MiniArrayRef(self.data() + storage_offset, self.numel() - storage_offset), + MiniArrayRef(sizes_ptr, ndim), MiniArrayRef(strides_ptr, ndim), + self.device_type(), self.device_idx()); } -template -inline T* get_data_ptr_wrapper(ArrayRefTensor& tensor) { - return tensor.data(); +template inline T *get_data_ptr_wrapper(ArrayRefTensor &tensor) +{ + return tensor.data(); } -template -inline T* get_data_ptr_wrapper(const MiniArrayRef& arr) { - return arr.data(); +template inline T *get_data_ptr_wrapper(const MiniArrayRef &arr) +{ + return arr.data(); } -template -inline const ArrayRefTensor& unwrap_raii_handle_if_needed( - const ArrayRefTensor& tensor) { - return tensor; +template inline const ArrayRefTensor &unwrap_raii_handle_if_needed(const ArrayRefTensor &tensor) +{ + return tensor; } -template -inline ArrayRefTensor& unwrap_raii_handle_if_needed( - ArrayRefTensor& tensor) { - return tensor; +template inline ArrayRefTensor &unwrap_raii_handle_if_needed(ArrayRefTensor &tensor) +{ + return tensor; } -template -inline const ArrayRefTensor& wrap_with_raii_handle_if_needed( - const ArrayRefTensor& tensor) { - return tensor; +template inline const ArrayRefTensor &wrap_with_raii_handle_if_needed(const ArrayRefTensor &tensor) +{ + return tensor; } -template -inline ArrayRefTensor& wrap_with_raii_handle_if_needed( - ArrayRefTensor& tensor) { - return tensor; +template inline ArrayRefTensor &wrap_with_raii_handle_if_needed(ArrayRefTensor &tensor) +{ + return tensor; } -template -inline RAIIAtenTensorHandle expensive_copy_to_tensor_if_needed( - const ArrayRefTensor& tensor) { - return tensor.expensiveCopyToTensor(); +template inline RAIIAtenTensorHandle expensive_copy_to_tensor_if_needed(const ArrayRefTensor &tensor) +{ + return tensor.expensiveCopyToTensor(); } -inline AtenTensorHandle expensive_copy_to_tensor_if_needed( - AtenTensorHandle handle) { - return handle; +inline AtenTensorHandle expensive_copy_to_tensor_if_needed(AtenTensorHandle handle) +{ + return handle; } -template -const T& copy_arrayref_tensor_to_tensor(const T& t) { - return t; +template const T ©_arrayref_tensor_to_tensor(const T &t) +{ + return t; } -template -RAIIAtenTensorHandle copy_arrayref_tensor_to_tensor( - const ArrayRefTensor& art) { - return art.expensiveCopyToTensor(); +template RAIIAtenTensorHandle copy_arrayref_tensor_to_tensor(const ArrayRefTensor &art) +{ + return art.expensiveCopyToTensor(); } -template -const T& borrow_arrayref_tensor_as_tensor(const T& t) { - return t; +template const T &borrow_arrayref_tensor_as_tensor(const T &t) +{ + return t; } -template -RAIIAtenTensorHandle borrow_arrayref_tensor_as_tensor( - const ArrayRefTensor& art) { - return art.borrowAsTensor(); +template RAIIAtenTensorHandle borrow_arrayref_tensor_as_tensor(const ArrayRefTensor &art) +{ + return art.borrowAsTensor(); } } // namespace torch::aot_inductor diff --git 
a/torch_npu/csrc/inductor/aoti_runtime/device_utils.h b/torch_npu/csrc/inductor/aoti_runtime/device_utils.h index fd640b78ecae6a88bf78130319e8b0fcf645448c..7d32261911f5343d6e5fe63c44847bda234df38c 100644 --- a/torch_npu/csrc/inductor/aoti_runtime/device_utils.h +++ b/torch_npu/csrc/inductor/aoti_runtime/device_utils.h @@ -14,15 +14,14 @@ #include #include -#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ - do { \ - const cudaError_t code = EXPR; \ - const char* msg = cudaGetErrorString(code); \ - if (code != cudaSuccess) { \ - throw std::runtime_error( \ - std::string("CUDA error: ") + std::string(msg)); \ - } \ - } while (0) +#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ + do { \ + const cudaError_t code = EXPR; \ + const char *msg = cudaGetErrorString(code); \ + if (code != cudaSuccess) { \ + throw std::runtime_error(std::string("CUDA error: ") + std::string(msg)); \ + } \ + } while (0) namespace torch::aot_inductor { @@ -34,19 +33,19 @@ using DeviceStreamType = cudaStream_t; #include #include #include -#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ - do { \ - const ze_result_t status = EXPR; \ - if (status != ZE_RESULT_SUCCESS) { \ - std::stringstream ss; \ - ss << "L0 runtime error: " << std::hex << std::uppercase << status; \ - throw std::runtime_error(ss.str()); \ - } \ - } while (0) +#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ + do { \ + const ze_result_t status = EXPR; \ + if (status != ZE_RESULT_SUCCESS) { \ + std::stringstream ss; \ + ss << "L0 runtime error: " << std::hex << std::uppercase << status; \ + throw std::runtime_error(ss.str()); \ + } \ + } while (0) namespace torch::aot_inductor { -using DeviceStreamType = sycl::queue*; +using DeviceStreamType = sycl::queue *; } // namespace torch::aot_inductor @@ -55,19 +54,18 @@ using DeviceStreamType = sycl::queue*; #include "third_party/acl/inc/acl/acl_base.h" #include "third_party/acl/inc/acl/acl_rt.h" -typedef void* NPUdeviceptr; +typedef void *NPUdeviceptr; -typedef void* NPUfunction; +typedef void *NPUfunction; -#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ - do { \ - const aclError code = EXPR; \ - if (code != ACL_SUCCESS) { \ - throw std::runtime_error( \ - std::string("NPU error core: ") + std::to_string(code) \ - + std::string(" ") + std::string(__FILE__) + std::string(":") + std::to_string(__LINE__)); \ - } \ - } while (0) +#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ + do { \ + const aclError code = EXPR; \ + if (code != ACL_SUCCESS) { \ + throw std::runtime_error(std::string("NPU error core: ") + std::to_string(code) + std::string(" ") + \ + std::string(__FILE__) + std::string(":") + std::to_string(__LINE__)); \ + } \ + } while (0) namespace torch::aot_inductor { @@ -77,16 +75,17 @@ using DeviceStreamType = aclrtStream; #else -#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ - bool ok = EXPR; \ - if (!ok) { \ - throw std::runtime_error("CPU runtime error"); \ - } +#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ + bool ok = EXPR; \ + if (!ok) { \ + throw std::runtime_error("CPU runtime error"); \ + } namespace torch::aot_inductor { -using DeviceStreamType = void*; +using DeviceStreamType = void *; } // namespace torch::aot_inductor -#endif // USE_CUDA +#endif \ + // USE_CUDA diff --git a/torch_npu/csrc/inductor/aoti_torch/c/shim.h b/torch_npu/csrc/inductor/aoti_torch/c/shim.h index efc536fc3205031e2719ea49a34ba88e82542d24..6cafd55f8743f7c80b3163e0e1618737d6b60e27 100644 --- a/torch_npu/csrc/inductor/aoti_torch/c/shim.h +++ b/torch_npu/csrc/inductor/aoti_torch/c/shim.h @@ -75,13 +75,13 @@ extern "C" { // (note that RAIIAtenTensorHandle is private to 
model.so, and never crosses // the ABI boundary.) struct AtenTensorOpaque; -using AtenTensorHandle = AtenTensorOpaque*; +using AtenTensorHandle = AtenTensorOpaque *; struct AtenGeneratorOpaque; -using AtenGeneratorHandle = AtenGeneratorOpaque*; +using AtenGeneratorHandle = AtenGeneratorOpaque *; struct AOTIProxyExecutorOpaque; -using AOTIProxyExecutorHandle = AOTIProxyExecutorOpaque*; +using AOTIProxyExecutorHandle = AOTIProxyExecutorOpaque *; using AOTITorchError = int32_t; #define AOTI_TORCH_SUCCESS 0 @@ -135,564 +135,325 @@ AOTI_TORCH_EXPORT int32_t aoti_torch_memory_format_channels_last_3d(); AOTI_TORCH_EXPORT int32_t aoti_torch_memory_format_preserve_format(); // Functions for converting a single-element tensor to a scalar value -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_float16(AtenTensorHandle tensor, c10::Half* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_float32(AtenTensorHandle tensor, float* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_float64(AtenTensorHandle tensor, double* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_uint8(AtenTensorHandle tensor, uint8_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_uint16(AtenTensorHandle tensor, uint16_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_uint32(AtenTensorHandle tensor, uint32_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_uint64(AtenTensorHandle tensor, uint64_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_int8(AtenTensorHandle tensor, int8_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_int16(AtenTensorHandle tensor, int16_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_int32(AtenTensorHandle tensor, int32_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_int64(AtenTensorHandle tensor, int64_t* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_bool(AtenTensorHandle tensor, bool* ret_value); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_item_bfloat16(AtenTensorHandle tensor, c10::BFloat16* ret_value); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_complex64( - AtenTensorHandle tensor, - c10::complex* ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_float16(AtenTensorHandle tensor, c10::Half *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_float32(AtenTensorHandle tensor, float *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_float64(AtenTensorHandle tensor, double *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_uint8(AtenTensorHandle tensor, uint8_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_uint16(AtenTensorHandle tensor, uint16_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_uint32(AtenTensorHandle tensor, uint32_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_uint64(AtenTensorHandle tensor, uint64_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_int8(AtenTensorHandle tensor, int8_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_int16(AtenTensorHandle tensor, int16_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_int32(AtenTensorHandle tensor, int32_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_int64(AtenTensorHandle tensor, int64_t *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_bool(AtenTensorHandle tensor, bool *ret_value); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_bfloat16(AtenTensorHandle tensor, c10::BFloat16 *ret_value); 
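// Reviewer note (not part of the patch): every aoti_torch_item_* getter reformatted above follows the
// same convention -- the AOTITorchError return value signals success or failure, and the scalar is
// written through the out-parameter. A minimal caller-side sketch under that assumption, using only
// declarations visible in this header; the include path is taken from this diff, and whether it
// resolves depends on the build setup:
#include <optional>
#include "torch_npu/csrc/inductor/aoti_torch/c/shim.h"

// Reads a float out of a single-element float32 tensor handle, mapping any status other than
// AOTI_TORCH_SUCCESS (0) to std::nullopt instead of throwing.
static std::optional<float> read_float_item(AtenTensorHandle tensor)
{
    float value = 0.0f;
    if (aoti_torch_item_float32(tensor, &value) != AOTI_TORCH_SUCCESS) {
        return std::nullopt;
    }
    return value;
}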
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_item_complex64(AtenTensorHandle tensor, c10::complex *ret_value); // Functions for wrapping a scalar value to a single-element tensor -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_float32( - float value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_float64( - double value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint8( - uint8_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint16( - uint16_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint32( - uint32_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint64( - uint64_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int8( - int8_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int16( - int16_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int32( - int32_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int64( - int64_t value, - AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_scalar_to_tensor_bool(bool value, AtenTensorHandle* ret_new_tensor); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_complex64( - c10::complex value, - AtenTensorHandle* ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_float32(float value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_float64(double value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint8(uint8_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint16(uint16_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint32(uint32_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_uint64(uint64_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int8(int8_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int16(int16_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int32(int32_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_int64(int64_t value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_bool(bool value, AtenTensorHandle *ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scalar_to_tensor_complex64(c10::complex value, + AtenTensorHandle *ret_new_tensor); AOTI_TORCH_EXPORT bool aoti_torch_grad_mode_is_enabled(); AOTI_TORCH_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); // Free the tensor object -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_delete_tensor_object(AtenTensorHandle tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_delete_tensor_object(AtenTensorHandle tensor); // Get a pointer to the underlying storage data -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_data_ptr( - AtenTensorHandle 
tensor, - void** ret_data_ptr // returns borrowed reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_data_ptr(AtenTensorHandle tensor, + void **ret_data_ptr // returns borrowed reference ); // Get the nbytes of the underlying storage -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_storage_size(AtenTensorHandle tensor, int64_t* ret_size); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_storage_size(AtenTensorHandle tensor, int64_t *ret_size); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_dim(AtenTensorHandle tensor, int64_t* ret_dim); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_dim(AtenTensorHandle tensor, int64_t *ret_dim); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_numel(AtenTensorHandle tensor, int64_t* ret_numel); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_numel(AtenTensorHandle tensor, int64_t *ret_numel); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_storage_numel(AtenTensorHandle tensor, int64_t* ret_numel); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_storage_numel(AtenTensorHandle tensor, int64_t *ret_numel); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_sizes( - AtenTensorHandle tensor, - int64_t** ret_sizes // returns borrowed reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_sizes(AtenTensorHandle tensor, + int64_t **ret_sizes // returns borrowed reference ); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_size(AtenTensorHandle tensor, int64_t d, int64_t* ret_size); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_size(AtenTensorHandle tensor, int64_t d, int64_t *ret_size); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_strides( - AtenTensorHandle tensor, - int64_t** ret_strides // returns borrowed reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_strides(AtenTensorHandle tensor, + int64_t **ret_strides // returns borrowed reference ); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_stride(AtenTensorHandle tensor, int64_t d, int64_t* ret_stride); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_stride(AtenTensorHandle tensor, int64_t d, int64_t *ret_stride); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_dtype(AtenTensorHandle tensor, int32_t* ret_dtype); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_dtype(AtenTensorHandle tensor, int32_t *ret_dtype); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_device_type(AtenTensorHandle tensor, int32_t* ret_device_type); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_device_type(AtenTensorHandle tensor, int32_t *ret_device_type); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_get_device_index(AtenTensorHandle tensor, int32_t* ret_device_index); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_device_index(AtenTensorHandle tensor, int32_t *ret_device_index); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_storage_offset( - AtenTensorHandle tensor, - int64_t* ret_storage_offset); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_storage_offset(AtenTensorHandle tensor, int64_t *ret_storage_offset); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__alloc_from_pool( - AtenTensorHandle self, - int64_t offset_bytes, - int32_t dtype, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - AtenTensorHandle* ret_new_tensor); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__alloc_from_pool(AtenTensorHandle self, int64_t offset_bytes, int32_t dtype, + int64_t ndim, const int64_t *sizes_ptr, const int64_t *strides_ptr, AtenTensorHandle *ret_new_tensor); // This function will create a new tensor object and its pointer is returned // through *out. 
The caller is responsible for wrapping the tensor pointer // with RAIIAtenTensorHandle which will call aoti_torch_delete_tensor_object // when going out of scope. -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( - AtenTensorHandle self, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - AtenTensorHandle* ret_new_tensor // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__reinterpret_tensor(AtenTensorHandle self, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset, + AtenTensorHandle *ret_new_tensor // returns new reference ); // This function will create a new tensor object and its pointer is returned // through *out. The caller is responsible for wrapping the tensor pointer // with RAIIAtenTensorHandle which will call aoti_torch_delete_tensor_object // when going out of scope. -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_empty_strided( - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AtenTensorHandle* ret_new_tensor // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_empty_strided(int64_t ndim, const int64_t *sizes_ptr, + const int64_t *strides_ptr, int32_t dtype, int32_t device_type, int32_t device_index, + AtenTensorHandle *ret_new_tensor // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AtenTensorHandle* ret // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(void *data, int64_t ndim, const int64_t *sizes_ptr, + const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype, int32_t device_type, int32_t device_index, + AtenTensorHandle *ret // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2(void *data, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype, int32_t device_type, int32_t device_index, - AtenTensorHandle* ret, // returns new reference - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_npu( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, + AtenTensorHandle *ret, // returns new reference + int32_t layout, const uint8_t *opaque_metadata, int64_t opaque_metadata_size); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_npu(void *data, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype, int32_t device_type, int32_t device_index, - AtenTensorHandle* ret // returns new reference + AtenTensorHandle *ret // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, +AOTI_TORCH_EXPORT 
AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2(void *data, int64_t ndim, + const int64_t *sizes_ptr, const int64_t *strides_ptr, int64_t storage_offset, int32_t dtype, int32_t device_type, int32_t device_index, - AtenTensorHandle* ret, // returns new reference - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__embedding_bag( - AtenTensorHandle weight, - AtenTensorHandle indices, - AtenTensorHandle offsets, - int32_t scale_grad_by_freq, - int32_t mode, - int32_t sparse, + AtenTensorHandle *ret, // returns new reference + int32_t layout, const uint8_t *opaque_metadata, int64_t opaque_metadata_size); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__embedding_bag(AtenTensorHandle weight, AtenTensorHandle indices, + AtenTensorHandle offsets, int32_t scale_grad_by_freq, int32_t mode, int32_t sparse, AtenTensorHandle per_sample_weights, // optional argument - int32_t include_last_offset, - int32_t padding_idx, - AtenTensorHandle* ret0, // returns new reference - AtenTensorHandle* ret1, // returns new reference - AtenTensorHandle* ret2, // returns new reference - AtenTensorHandle* ret3 // returns new reference + int32_t include_last_offset, int32_t padding_idx, + AtenTensorHandle *ret0, // returns new reference + AtenTensorHandle *ret1, // returns new reference + AtenTensorHandle *ret2, // returns new reference + AtenTensorHandle *ret3 // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__fft_c2c( - AtenTensorHandle self, - const int64_t* dim_ptr, - int64_t dim_size, - int64_t normalization, - int32_t forward, - AtenTensorHandle* ret // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__fft_c2c(AtenTensorHandle self, const int64_t *dim_ptr, int64_t dim_size, + int64_t normalization, int32_t forward, + AtenTensorHandle *ret // returns new reference ); // This version is deprecated. 
We will remove it later -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_dot_product_flash_attention( - AtenTensorHandle query, - AtenTensorHandle key, - AtenTensorHandle value, - double dropout_p, - bool is_causal, - bool return_debug_mask, +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_dot_product_flash_attention(AtenTensorHandle query, + AtenTensorHandle key, AtenTensorHandle value, double dropout_p, bool is_causal, bool return_debug_mask, double scale, - AtenTensorHandle* ret0, // returns new reference - AtenTensorHandle* ret1, // returns new reference - AtenTensorHandle* ret2, // returns new reference - AtenTensorHandle* ret3, // returns new reference - int64_t* ret4, - int64_t* ret5, - AtenTensorHandle* ret6, // returns new reference - AtenTensorHandle* ret7, // returns new reference - AtenTensorHandle* ret8 // returns new reference + AtenTensorHandle *ret0, // returns new reference + AtenTensorHandle *ret1, // returns new reference + AtenTensorHandle *ret2, // returns new reference + AtenTensorHandle *ret3, // returns new reference + int64_t *ret4, int64_t *ret5, + AtenTensorHandle *ret6, // returns new reference + AtenTensorHandle *ret7, // returns new reference + AtenTensorHandle *ret8 // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch__scaled_dot_product_flash_attention_v2( - AtenTensorHandle query, - AtenTensorHandle key, - AtenTensorHandle value, - double dropout_p, - int is_causal, - int return_debug_mask, - double* scale, // optional argument - AtenTensorHandle* ret0, // returns new reference - AtenTensorHandle* ret1, // returns new reference - AtenTensorHandle* ret2, // returns new reference - AtenTensorHandle* ret3, // returns new reference - int64_t* ret4, - int64_t* ret5, - AtenTensorHandle* ret6, // returns new reference - AtenTensorHandle* ret7, // returns new reference - AtenTensorHandle* ret8 // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_dot_product_flash_attention_v2(AtenTensorHandle query, + AtenTensorHandle key, AtenTensorHandle value, double dropout_p, int is_causal, int return_debug_mask, + double *scale, // optional argument + AtenTensorHandle *ret0, // returns new reference + AtenTensorHandle *ret1, // returns new reference + AtenTensorHandle *ret2, // returns new reference + AtenTensorHandle *ret3, // returns new reference + int64_t *ret4, int64_t *ret5, + AtenTensorHandle *ret6, // returns new reference + AtenTensorHandle *ret7, // returns new reference + AtenTensorHandle *ret8 // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch__scaled_dot_product_efficient_attention( - AtenTensorHandle query, - AtenTensorHandle key, - AtenTensorHandle value, +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_dot_product_efficient_attention(AtenTensorHandle query, + AtenTensorHandle key, AtenTensorHandle value, AtenTensorHandle attn_bias, // optional argument - int compute_log_sumexp, - double dropout_p, - int is_causal, - double* scale, // optional argument - AtenTensorHandle* ret0, // returns new reference - AtenTensorHandle* ret1, // returns new reference - AtenTensorHandle* ret2, // returns new reference - AtenTensorHandle* ret3 // returns new reference + int compute_log_sumexp, double dropout_p, int is_causal, + double *scale, // optional argument + AtenTensorHandle *ret0, // returns new reference + AtenTensorHandle *ret1, // returns new reference + AtenTensorHandle *ret2, // returns new reference + AtenTensorHandle *ret3 // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError 
aoti_torch__scaled_mm( - AtenTensorHandle self, - AtenTensorHandle mat2, - AtenTensorHandle bias, - int32_t* out_dtype, - AtenTensorHandle scale_a, - AtenTensorHandle scale_b, - AtenTensorHandle scale_result, - int8_t use_fast_accum, - AtenTensorHandle* ret0, - AtenTensorHandle* ret1); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_mm_v2( - AtenTensorHandle self, - AtenTensorHandle mat2, - AtenTensorHandle scale_a, - AtenTensorHandle scale_b, - AtenTensorHandle bias, - AtenTensorHandle scale_result, - int32_t* out_dtype, - int8_t use_fast_accum, - AtenTensorHandle* ret0); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_convolution( - AtenTensorHandle input, - AtenTensorHandle weight, +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_mm(AtenTensorHandle self, AtenTensorHandle mat2, + AtenTensorHandle bias, int32_t *out_dtype, AtenTensorHandle scale_a, AtenTensorHandle scale_b, + AtenTensorHandle scale_result, int8_t use_fast_accum, AtenTensorHandle *ret0, AtenTensorHandle *ret1); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_mm_v2(AtenTensorHandle self, AtenTensorHandle mat2, + AtenTensorHandle scale_a, AtenTensorHandle scale_b, AtenTensorHandle bias, AtenTensorHandle scale_result, + int32_t *out_dtype, int8_t use_fast_accum, AtenTensorHandle *ret0); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_convolution(AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle bias, // optional argument - const int64_t* stride_ptr, - int64_t stride_size, - const int64_t* padding_ptr, - int64_t padding_size, - const int64_t* dilation_ptr, - int64_t dilation_size, - int transposed, - const int64_t* output_padding_ptr, - int64_t output_padding_size, - int64_t groups, - AtenTensorHandle* ret // returns new reference + const int64_t *stride_ptr, int64_t stride_size, const int64_t *padding_ptr, int64_t padding_size, + const int64_t *dilation_ptr, int64_t dilation_size, int transposed, const int64_t *output_padding_ptr, + int64_t output_padding_size, int64_t groups, + AtenTensorHandle *ret // returns new reference ); // This function will create a new uninitialized tensor object // and its pointer is returned through *ret. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_new_uninitialized_tensor(AtenTensorHandle* ret); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_new_uninitialized_tensor(AtenTensorHandle *ret); // WARNING: This will be deprecated. Use aoti_torch_copy_ instead. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_tensor_copy_(AtenTensorHandle src, AtenTensorHandle dst); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_tensor_copy_(AtenTensorHandle src, AtenTensorHandle dst); // Make the tensor referred to by dst an alias for the tensor referred // to by src. The two tensors must still be deleted with // aoti_torch_delete_tensor separately (or not) as before the call. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_assign_tensors(AtenTensorHandle src, AtenTensorHandle dst); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_assign_tensors(AtenTensorHandle src, AtenTensorHandle dst); // Make a shallow copy of the tensor referred to by src and assign // it to the handle in the ret_dst. This is similar to the above // aoti_torch_assign_tensors function, but creates and sets the // ret_dst from within. 
-AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_assign_tensors_out(AtenTensorHandle src, AtenTensorHandle* ret_dst); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_assign_tensors_out(AtenTensorHandle src, AtenTensorHandle *ret_dst); // This function will create a new tensor object and its pointer is returned // through *ret. The caller is responsible for wrapping the tensor pointer // with RAIIAtenTensorHandle which will call aoti_torch_delete_tensor_object // when going out of scope. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_clone(AtenTensorHandle self, AtenTensorHandle* ret); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_addmm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat1, - AtenTensorHandle mat2, - float beta, - float alpha); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_bmm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat2); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_copy_( - AtenTensorHandle self, - AtenTensorHandle src, - int32_t non_blocking); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_mm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat2); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch__mm_plus_mm_out( - AtenTensorHandle out, - AtenTensorHandle a, - AtenTensorHandle b, - AtenTensorHandle c, - AtenTensorHandle d); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_clone(AtenTensorHandle self, AtenTensorHandle *ret); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_addmm_out(AtenTensorHandle out, AtenTensorHandle self, + AtenTensorHandle mat1, AtenTensorHandle mat2, float beta, float alpha); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_bmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_copy_(AtenTensorHandle self, AtenTensorHandle src, int32_t non_blocking); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch__mm_plus_mm_out(AtenTensorHandle out, AtenTensorHandle a, + AtenTensorHandle b, AtenTensorHandle c, AtenTensorHandle d); // This will soon be deprecated after ao_quantization is complete. // Please refrain from using this or increasing callsites. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_cpu_wrapped_fbgemm_pack_gemm_matrix_fp16( - AtenTensorHandle weight, - AtenTensorHandle* out); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_wrapped_fbgemm_pack_gemm_matrix_fp16(AtenTensorHandle weight, + AtenTensorHandle *out); // This will soon be deprecated after ao_quantization is complete. // Please refrain from using this or increasing callsites. -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__wrapped_linear_prepack( - AtenTensorHandle weight, - AtenTensorHandle weight_scale, - AtenTensorHandle weight_zero_point, - AtenTensorHandle bias, - AtenTensorHandle* out); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__wrapped_linear_prepack(AtenTensorHandle weight, + AtenTensorHandle weight_scale, AtenTensorHandle weight_zero_point, AtenTensorHandle bias, AtenTensorHandle *out); // This will soon be deprecated after ao_quantization is complete. // Please refrain from using this or increasing callsites. 
-AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_cpu_wrapped_fbgemm_linear_fp16_weight( - AtenTensorHandle input, - AtenTensorHandle weight, - AtenTensorHandle bias, - int64_t out_channel, - AtenTensorHandle* out); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_wrapped_fbgemm_linear_fp16_weight(AtenTensorHandle input, + AtenTensorHandle weight, AtenTensorHandle bias, int64_t out_channel, AtenTensorHandle *out); // This will soon be deprecated after ao_quantization is complete. // Please refrain from using this or increasing callsites. -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_cpu__wrapped_quantized_linear_prepacked( - AtenTensorHandle input, - AtenTensorHandle input_scale, - AtenTensorHandle input_zero_point, - AtenTensorHandle weight, - AtenTensorHandle out_scale, - AtenTensorHandle out_zeropoint, - int64_t out_channel, - AtenTensorHandle* out); - -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_nonzero(AtenTensorHandle self, AtenTensorHandle* out); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__wrapped_quantized_linear_prepacked(AtenTensorHandle input, + AtenTensorHandle input_scale, AtenTensorHandle input_zero_point, AtenTensorHandle weight, + AtenTensorHandle out_scale, AtenTensorHandle out_zeropoint, int64_t out_channel, AtenTensorHandle *out); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_nonzero(AtenTensorHandle self, AtenTensorHandle *out); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_zero_(AtenTensorHandle self); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_repeat_interleave_Tensor( - AtenTensorHandle repeats, - int64_t* output_size, - AtenTensorHandle* out); - -AOTI_TORCH_EXPORT AOTITorchError -aoti_torch_check_inf_and_nan(const char* tensor_name, AtenTensorHandle tensor); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scatter_out( - AtenTensorHandle out, - AtenTensorHandle self, - int64_t dim, - AtenTensorHandle index, - AtenTensorHandle src); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scatter_reduce_out( - AtenTensorHandle out, - AtenTensorHandle self, - int64_t dim, - AtenTensorHandle index, - AtenTensorHandle src, - const char* reduce, - int32_t include_self); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_index_put_out( - AtenTensorHandle out, - AtenTensorHandle self, - const AtenTensorHandle* indices, - const uint32_t num_indices, - const AtenTensorHandle values, - bool accumulate); - -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_view_as_real( - AtenTensorHandle self, - AtenTensorHandle* ret // returns new reference +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t *output_size, + AtenTensorHandle *out); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_check_inf_and_nan(const char *tensor_name, AtenTensorHandle tensor); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scatter_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, + AtenTensorHandle index, AtenTensorHandle src); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_scatter_reduce_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, + AtenTensorHandle index, AtenTensorHandle src, const char *reduce, int32_t include_self); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_index_put_out(AtenTensorHandle out, AtenTensorHandle self, + const AtenTensorHandle *indices, const uint32_t num_indices, const AtenTensorHandle values, bool accumulate); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_view_as_real(AtenTensorHandle self, + AtenTensorHandle *ret // returns new reference ); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_view_dtype( - AtenTensorHandle 
self,
-    int32_t dtype,
-    AtenTensorHandle* ret // returns new reference
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_view_dtype(AtenTensorHandle self, int32_t dtype,
+    AtenTensorHandle *ret // returns new reference
 );

-AOTI_TORCH_EXPORT void aoti_torch_print_tensor_handle(
-    AtenTensorHandle self,
-    const char* msg);
+AOTI_TORCH_EXPORT void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char *msg);

 // When AOTI debug printer option is enabled, this function will be invoked to
 // torch pickle save the intermediate tensor for debugging purpose.
-AOTI_TORCH_EXPORT void aoti_torch_save_tensor_handle(
-    AtenTensorHandle self,
-    const char* tensor_name,
-    const char* launch_prefix,
-    const char* kernel_name);
+AOTI_TORCH_EXPORT void aoti_torch_save_tensor_handle(AtenTensorHandle self, const char *tensor_name,
+    const char *launch_prefix, const char *kernel_name);

 #ifdef USE_CUDA

 struct CUDAGuardOpaque;
-using CUDAGuardHandle = CUDAGuardOpaque*;
+using CUDAGuardHandle = CUDAGuardOpaque *;

-AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_cuda_guard(
-    int32_t device_index,
-    CUDAGuardHandle* ret_guard // returns new reference
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_cuda_guard(int32_t device_index,
+    CUDAGuardHandle *ret_guard // returns new reference
 );

-AOTI_TORCH_EXPORT AOTITorchError
-aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);

-AOTI_TORCH_EXPORT AOTITorchError
-aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index);

 struct CUDAStreamGuardOpaque;
-using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*;
+using CUDAStreamGuardHandle = CUDAStreamGuardOpaque *;

-AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard(
-    void* stream,
-    int32_t device_index,
-    CUDAStreamGuardHandle* ret_guard // returns new reference
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard(void *stream, int32_t device_index,
+    CUDAStreamGuardHandle *ret_guard // returns new reference
 );

-AOTI_TORCH_EXPORT AOTITorchError
-aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);

-AOTI_TORCH_EXPORT AOTITorchError
-aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream(int32_t device_index, void **ret_stream);

 #endif // USE_CUDA

 // See `ProxyExecutor Design Note` in ir.py for more details
-AOTI_TORCH_EXPORT AOTITorchError aoti_torch_proxy_executor_call_function(
-    AOTIProxyExecutorHandle proxy_executor,
-    int extern_node_index,
-    int num_ints,
-    int64_t* flatten_int_args,
-    int num_tensors,
-    AtenTensorHandle* flatten_tensor_args);
-
-AOTI_TORCH_EXPORT void aoti_torch_check(
-    bool cond,
-    const char* func,
-    const char* file,
-    uint32_t line,
-    const char* msg);
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_proxy_executor_call_function(AOTIProxyExecutorHandle proxy_executor,
+    int extern_node_index, int num_ints, int64_t *flatten_int_args, int num_tensors,
+    AtenTensorHandle *flatten_tensor_args);
+
+AOTI_TORCH_EXPORT void aoti_torch_check(bool cond, const char *func, const char *file, uint32_t line, const char *msg);

 #ifdef STRIP_ERROR_MESSAGES
-#define AOTI_TORCH_CHECK(cond, ...)                 \
-  if (!(cond)) {                                    \
-    aoti_torch_check(                               \
-        false,                                      \
-        __func__,                                   \
-        __FILE__,                                   \
-        static_cast<uint32_t>(__LINE__),            \
-        TORCH_CHECK_MSG(cond, "", __VA_ARGS__));    \
-  }
+#define AOTI_TORCH_CHECK(cond, ...)                                                  \
+    if (!(cond)) {                                                                   \
+        aoti_torch_check(false, __func__, __FILE__, static_cast<uint32_t>(__LINE__), \
+                         TORCH_CHECK_MSG(cond, "", __VA_ARGS__));                    \
+    }
#else
-#define AOTI_TORCH_CHECK(cond, ...)                 \
-  if (!(cond)) {                                    \
-    aoti_torch_check(                               \
-        false,                                      \
-        __func__,                                   \
-        __FILE__,                                   \
-        static_cast<uint32_t>(__LINE__),            \
-        TORCH_CHECK_MSG(cond, "", ##__VA_ARGS__));  \
-  }
+#define AOTI_TORCH_CHECK(cond, ...)                                                  \
+    if (!(cond)) {                                                                   \
+        aoti_torch_check(false, __func__, __FILE__, static_cast<uint32_t>(__LINE__), \
+                         TORCH_CHECK_MSG(cond, "", ##__VA_ARGS__));                  \
+    }
 #endif

 #ifdef __cplusplus
@@ -701,11 +462,11 @@ AOTI_TORCH_EXPORT void aoti_torch_check(

 template <typename T>
 int32_t aoti_torch_dtype() = delete;

-#define DEFINE_DTYPE_SPECIALIZATION(ctype, typename) \
-  template <> \
-  inline int32_t aoti_torch_dtype<ctype>() { \
-    return aoti_torch_dtype_##typename(); \
-  }
+#define DEFINE_DTYPE_SPECIALIZATION(ctype, typename)       \
+    template <> inline int32_t aoti_torch_dtype<ctype>()   \
+    {                                                      \
+        return aoti_torch_dtype_##typename();              \
+    }

 namespace c10 {
 struct BFloat16;
@@ -726,4 +487,5 @@ DEFINE_DTYPE_SPECIALIZATION(bool, bool)

 #endif

 #endif // AOTI_TORCH_SHIM
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index 60ef55e1d409cfc1ef38bafa57e88d14bef8c900..50b306959da090ec5aa84c73e1701b30ad061929 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -1239,7 +1239,8 @@ PyObject* THNPModule_npuCachingAllocator_raw_alloc(PyObject *_unused, PyObject *
     END_HANDLE_TH_ERRORS
 }

-PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj) {
+PyObject* THNPModule_npuCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj)
+{
     HANDLE_TH_ERRORS
     void* mem_ptr = PyLong_AsVoidPtr(obj);
     c10_npu::NPUCachingAllocator::raw_delete(mem_ptr);
@@ -1291,7 +1292,8 @@ PyObject* THNPModule_npuUnlockMutex(PyObject *module, PyObject *noargs)
     Py_RETURN_NONE;
 }

-PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs) {
+PyObject* THNPModule_initDump(PyObject* _unused, PyObject* noargs)
+{
     HANDLE_TH_ERRORS
     pybind11::gil_scoped_release no_gil;
     NPU_CHECK_ERROR_WITHOUT_UCE(aclmdlInitDump());
diff --git a/torch_npu/csrc/utils/TensorType.cpp b/torch_npu/csrc/utils/TensorType.cpp
index aeb6fd8b832e96d39fc8b8cd0724ddbdcf9125b9..e6998f57eaad6ac9368366847593d66c8d56658e 100644
--- a/torch_npu/csrc/utils/TensorType.cpp
+++ b/torch_npu/csrc/utils/TensorType.cpp
@@ -6,7 +6,6 @@ namespace torch_npu {
 namespace utils {
-
 using namespace at;
 using namespace torch::autograd;

@@ -15,14 +14,13 @@ std::vector<std::pair<Backend, ScalarType>> all_declared_types_npu()
     std::vector<std::pair<Backend, ScalarType>> ret;
     // can't easily iterate over enum classes, does not support BFloat16 now
     std::vector<Backend> backends = { c10::Backend::PrivateUse1 };
-    std::vector<ScalarType> scalar_types = {
-        ScalarType::Byte, ScalarType::Char, ScalarType::Double, ScalarType::Float,
-        ScalarType::Int, ScalarType::Long, ScalarType::Short, ScalarType::Half,
-        ScalarType::Bool, ScalarType::BFloat16
-    };
-
-    for (auto& backend : backends) {
-        for (auto& scalar_type : scalar_types) {
+    std::vector<ScalarType> scalar_types = { ScalarType::Byte,  ScalarType::Char, ScalarType::Double,
+                                             ScalarType::Float, ScalarType::Int,  ScalarType::Long,
+                                             ScalarType::Short, ScalarType::Half, ScalarType::Bool,
+                                             ScalarType::BFloat16 };
+
+    for (auto &backend : backends) {
+        for (auto &scalar_type : scalar_types) {
             ret.emplace_back(std::make_pair(backend, scalar_type));
         }
     }
@@ -32,8 +30,8 @@ std::vector<std::pair<Backend, ScalarType>> all_declared_types_npu()

 struct PyTensorType {
     PyTypeObject py_type;
-    THPDtype* dtype;
-    THPLayout* layout;
+    THPDtype *dtype;
+    THPLayout *layout;
     bool is_npu;
     char name[64];
     int backend;
@@ -57,73 +55,67 @@ struct PyTensorType {
 static_assert(std::is_standard_layout<PyTensorType>::value, "PyTensorType must be standard layout");

-static void py_bind_tensor_types(const std::vector<PyTensorType>& tensor_types);
+static void py_bind_tensor_types(const std::vector<PyTensorType> &tensor_types);

-static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+static PyObject *Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
 {
     HANDLE_TH_ERRORS
-    auto& tensor_type = *((PyTensorType*)type);
+    auto &tensor_type = *((PyTensorType *)type);
     if (tensor_type.is_npu) {
-        TORCH_NPU_WARN_ONCE(
-            "Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. "
+        TORCH_NPU_WARN_ONCE("Warning: The torch.npu.*DtypeTensor constructors are no longer recommended. "
             "It's best to use methods such as torch.tensor(data, dtype=*, device='npu') "
             "to create tensors.");
     }
-    TORCH_CHECK_TYPE(
-        !tensor_type.is_npu || c10_npu::device_count() != 0,
-        "type ",
-        tensor_type.name,
+    TORCH_CHECK_TYPE(!tensor_type.is_npu || c10_npu::device_count() != 0, "type ", tensor_type.name,
         " not available. Torch not compiled with npu enabled.", PTA_ERROR(ErrCode::TYPE))
     torch_npu::utils::npu_lazy_init();
-    return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(),
-                                                             tensor_type.get_scalar_type(),
-                                                             args,
-                                                             kwargs));
+    return THPVariable_Wrap(
+        torch::utils::legacy_tensor_ctor(tensor_type.get_dispatch_key(), tensor_type.get_scalar_type(), args, kwargs));
     END_HANDLE_TH_ERRORS
 }

-static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg)
+static PyObject *Tensor_instancecheck(PyObject *_self, PyObject *arg)
 {
-  HANDLE_TH_ERRORS
-  auto self = (PyTensorType*)_self;
-  if (THPVariable_Check(arg)) {
-    const auto& var = THPVariable_Unpack(arg);
-
-    if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() &&
-        var.scalar_type() == static_cast<ScalarType>(self->scalar_type)) {
-      Py_RETURN_TRUE;
+    HANDLE_TH_ERRORS
+    auto self = (PyTensorType *)_self;
+    if (THPVariable_Check(arg)) {
+        const auto &var = THPVariable_Unpack(arg);
+
+        if (legacyExtractDispatchKey(var.key_set()) == self->get_dispatch_key() &&
+            var.scalar_type() == static_cast<ScalarType>(self->scalar_type)) {
+            Py_RETURN_TRUE;
+        }
     }
-  }
-  Py_RETURN_FALSE;
-  END_HANDLE_TH_ERRORS
+    Py_RETURN_FALSE;
+    END_HANDLE_TH_ERRORS
 }

-PyObject* Tensor_dtype(PyTensorType* self, void *unused)
+PyObject *Tensor_dtype(PyTensorType *self, void *unused)
 {
-  return torch::autograd::utils::wrap(self->dtype);
+    return torch::autograd::utils::wrap(self->dtype);
 }

-PyObject* Tensor_layout(PyTensorType* self, void *unused)
+PyObject *Tensor_layout(PyTensorType *self, void *unused)
 {
-  return torch::autograd::utils::wrap(self->layout);
+    return torch::autograd::utils::wrap(self->layout);
 }

-PyObject* Tensor_is_npu(PyTensorType* self, void *unused)
+PyObject *Tensor_is_npu(PyTensorType *self, void *unused)
 {
-  if (self->is_npu) {
-    Py_RETURN_TRUE;
-  } else {
-    Py_RETURN_FALSE;
-  }
+    if (self->is_npu) {
+        Py_RETURN_TRUE;
+    } else {
+        Py_RETURN_FALSE;
+    }
 }

-PyObject* Tensor_is_sparse(PyTensorType *self, void *unused)
+PyObject *Tensor_is_sparse(PyTensorType *self, void *unused)
 {
-  if (self->layout->layout == at::Layout::Strided) {
-    Py_RETURN_FALSE;
-  } else {
-    Py_RETURN_TRUE;
-  }
+ if (self->layout->layout == at::Layout::Strided) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } } static struct PyMethodDef metaclass_methods[] = { @@ -131,7 +123,7 @@ static struct PyMethodDef metaclass_methods[] = { {nullptr} }; -using getter = PyObject* (*)(PyObject *, void *); +using getter = PyObject *(*)(PyObject *, void *); static struct PyGetSetDef metaclass_properties[] = { {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, @@ -142,46 +134,44 @@ static struct PyGetSetDef metaclass_properties[] = { }; static PyTypeObject metaclass = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch.tensortype", /* tp_name */ - sizeof(PyTypeObject) /* tp_basicsize */ + PyVarObject_HEAD_INIT(nullptr, 0) "torch.tensortype", /* tp_name */ + sizeof(PyTypeObject) /* tp_basicsize */ }; -static void py_initialize_metaclass(PyTypeObject& metaclass) +static void py_initialize_metaclass(PyTypeObject &metaclass) { - metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; - metaclass.tp_methods = metaclass_methods; - metaclass.tp_getset = metaclass_properties; - metaclass.tp_base = &PyType_Type; - if (PyType_Ready(&metaclass) < 0) { - throw python_error(); - } + metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + metaclass.tp_methods = metaclass_methods; + metaclass.tp_getset = metaclass_properties; + metaclass.tp_base = &PyType_Type; + if (PyType_Ready(&metaclass) < 0) { + throw python_error(); + } } static PyTypeObject tensor_type_prototype = { - PyVarObject_HEAD_INIT(&metaclass, 0) - nullptr, /* tp_name */ - sizeof(PyTensorType) /* tp_basicsize */ + PyVarObject_HEAD_INIT(&metaclass, 0) nullptr, /* tp_name */ + sizeof(PyTensorType) /* tp_basicsize */ }; -static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) +static void py_initialize_tensor_type(PyTypeObject &type, const char *name, PyObject *tp_dict) { - // NOTE: we don't use the typical static declaration of PyTypeObject because - // we need to initialize as many types as there are VariableType instances. - // We copy the basic object fields from a prototype definition and initialize - // the remaining fields below. - memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); - // Subclassing from torch.Tensor isn't supported. - // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. - type.tp_flags = Py_TPFLAGS_DEFAULT; - type.tp_name = name; - type.tp_new = Tensor_new; - if (PyType_Ready(&type) < 0) { - throw python_error(); - } - if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { - throw python_error(); - } + // NOTE: we don't use the typical static declaration of PyTypeObject because + // we need to initialize as many types as there are VariableType instances. + // We copy the basic object fields from a prototype definition and initialize + // the remaining fields below. + memcpy(&type, &tensor_type_prototype, sizeof(PyTypeObject)); + // Subclassing from torch.Tensor isn't supported. + // (Py_TPFLAGS_BASETYPE omitted). Subclassing torch.Tensor still allowed. + type.tp_flags = Py_TPFLAGS_DEFAULT; + type.tp_name = name; + type.tp_new = Tensor_new; + if (PyType_Ready(&type) < 0) { + throw python_error(); + } + if (PyDict_Merge(type.tp_dict, tp_dict, 0) < 0) { + throw python_error(); + } } static std::string get_module(Backend backend) @@ -204,103 +194,103 @@ static std::string get_module(Backend backend) static std::string get_name(Backend backend, ScalarType scalarType) { - std::ostringstream ss; - ss << get_module(backend) << "." 
<< toString(scalarType) << "Tensor"; - return ss.str(); + std::ostringstream ss; + ss << get_module(backend) << "." << toString(scalarType) << "Tensor"; + return ss.str(); } -static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) +static void set_type(PyTensorType &type_obj, Backend backend, ScalarType scalarType) { - // This field is lazily initialized from backend and scalar_type - type_obj.backend = static_cast(backend); - type_obj.scalar_type = static_cast(scalarType); - type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); - type_obj.dtype = torch::getTHPDtype(scalarType); - type_obj.is_npu = (backend == c10::Backend::PrivateUse1); + // This field is lazily initialized from backend and scalar_type + type_obj.backend = static_cast(backend); + type_obj.scalar_type = static_cast(scalarType); + type_obj.layout = torch::getTHPLayout(c10::layout_from_backend(backend)); + type_obj.dtype = torch::getTHPDtype(scalarType); + type_obj.is_npu = (backend == c10::Backend::PrivateUse1); } -static void set_name(PyTensorType& type_obj, const std::string& name) +static void set_name(PyTensorType &type_obj, const std::string &name) { - size_t n = sizeof(type_obj.name); - strncpy(type_obj.name, name.c_str(), n); - type_obj.name[n - 1] = '\0'; + size_t n = sizeof(type_obj.name); + strncpy(type_obj.name, name.c_str(), n); + type_obj.name[n - 1] = '\0'; } static THPObjectPtr get_tensor_dict() { - auto torch = THPObjectPtr(PyImport_ImportModule("torch")); - if (!torch) { - throw python_error(); - } - - auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); - if (!tensor_class) { - throw python_error(); - } - - auto tensor_type = (PyTypeObject*)tensor_class.get(); - TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); - - auto res = THPObjectPtr(PyDict_New()); - if (!res) { - throw python_error(); - } - - if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { - throw python_error(); - } - if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { - throw python_error(); - } - - return res; + auto torch = THPObjectPtr(PyImport_ImportModule("torch")); + if (!torch) { + throw python_error(); + } + + auto tensor_class = THPObjectPtr(PyObject_GetAttrString(torch, "Tensor")); + if (!tensor_class) { + throw python_error(); + } + + auto tensor_type = (PyTypeObject *)tensor_class.get(); + TORCH_CHECK(tensor_type->tp_base, "missing base type for Tensor", PTA_ERROR(ErrCode::TYPE)); + + auto res = THPObjectPtr(PyDict_New()); + if (!res) { + throw python_error(); + } + + if (PyDict_Merge(res.get(), tensor_type->tp_dict, 0) < 0) { + throw python_error(); + } + if (PyDict_Merge(res.get(), tensor_type->tp_base->tp_dict, 0) < 0) { + throw python_error(); + } + + return res; } static std::vector tensor_types; -static void initialize_npu_aten_types(std::vector& tensor_types) +static void initialize_npu_aten_types(std::vector &tensor_types) { - // only initialize npu types - auto declared_types = all_declared_types_npu(); - tensor_types.resize(declared_types.size()); - - for (size_t i = 0, end = declared_types.size(); i != end; i++) { - auto& tensor_type = tensor_types[i]; - Backend backend = declared_types[i].first; - ScalarType scalar_type = declared_types[i].second; - set_type(tensor_type, backend, scalar_type); - set_name(tensor_type, get_name(backend, scalar_type)); - } + // only initialize npu types + auto declared_types = all_declared_types_npu(); + tensor_types.resize(declared_types.size()); + + 
for (size_t i = 0, end = declared_types.size(); i != end; i++) { + auto &tensor_type = tensor_types[i]; + Backend backend = declared_types[i].first; + ScalarType scalar_type = declared_types[i].second; + set_type(tensor_type, backend, scalar_type); + set_name(tensor_type, get_name(backend, scalar_type)); + } } void _initialize_python_bindings() { - // Initialize the at::Type* pointers, name, and properties of the PyTensorType - // vector. After this call, the vector must not be resized. - initialize_npu_aten_types(tensor_types); - - // Initialize the Python metaclass for the torch.FloatTensor, etc. types. - // The metaclass handles __instancecheck__ checks and binds the dtype property - // on the type objects. - py_initialize_metaclass(metaclass); - - // Get the tp_dict of the Variable class. We copy function definitions - // onto each Tensor type object so that they can be accessed via e.g. - // `torch.npu.FloatTensor.add`. - auto tensor_dict = get_tensor_dict(); - - // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. - for (auto& tensor_type : tensor_types) { - py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); - } - - // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor - // is added to the `torch_npu` module as `FloatTensor`. Also add all the type - // objects to the set torch_npu._tensor_classes. - py_bind_tensor_types(tensor_types); + // Initialize the at::Type* pointers, name, and properties of the PyTensorType + // vector. After this call, the vector must not be resized. + initialize_npu_aten_types(tensor_types); + + // Initialize the Python metaclass for the torch.FloatTensor, etc. types. + // The metaclass handles __instancecheck__ checks and binds the dtype property + // on the type objects. + py_initialize_metaclass(metaclass); + + // Get the tp_dict of the Variable class. We copy function definitions + // onto each Tensor type object so that they can be accessed via e.g. + // `torch.npu.FloatTensor.add`. + auto tensor_dict = get_tensor_dict(); + + // Initialize each Python type object torch.npu.FloatTensor, torch.npu.DoubleTensor, etc. + for (auto &tensor_type : tensor_types) { + py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get()); + } + + // Add the type objects to their corresponding modules. e.g. torch.npu.FloatTensor + // is added to the `torch_npu` module as `FloatTensor`. Also add all the type + // objects to the set torch_npu._tensor_classes. + py_bind_tensor_types(tensor_types); } -static void py_bind_tensor_types(const std::vector& tensor_types) +static void py_bind_tensor_types(const std::vector &tensor_types) { auto torch_module = THPObjectPtr(PyImport_ImportModule("torch")); if (!torch_module) { @@ -312,7 +302,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - for (auto& tensor_type : tensor_types) { + for (auto &tensor_type : tensor_types) { auto name = std::string(tensor_type.name); auto idx = name.rfind('.'); auto type_name = name.substr(idx + 1); @@ -323,7 +313,7 @@ static void py_bind_tensor_types(const std::vector& tensor_types) throw python_error(); } - PyObject* type_obj = (PyObject*)&tensor_type; + PyObject *type_obj = (PyObject *)&tensor_type; Py_INCREF(type_obj); if (PyModule_AddObject(module_obj.get(), type_name.c_str(), type_obj) < 0) { throw python_error(); @@ -335,12 +325,12 @@ static void py_bind_tensor_types(const std::vector& tensor_types) } // Callback for python part. 
Used for additional initialization of python classes -static PyObject* THPModule_initExtension(PyObject *_unused, PyObject *noargs) +static PyObject *THPModule_initExtension(PyObject *_unused, PyObject *noargs) { - HANDLE_TH_ERRORS - _initialize_python_bindings(); - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS + HANDLE_TH_ERRORS + _initialize_python_bindings(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } // autograd methods on torch._C @@ -349,9 +339,9 @@ static PyMethodDef TorchNpuExtensionMethods[] = { {nullptr, nullptr, 0, nullptr} }; -PyMethodDef* npu_extension_functions() +PyMethodDef *npu_extension_functions() { - return TorchNpuExtensionMethods; + return TorchNpuExtensionMethods; } } } diff --git a/torch_npu/distributed/fsdp/_add_fsdp_patch.py b/torch_npu/distributed/fsdp/_add_fsdp_patch.py index 7405620686897a2199333f9eb8580c113443ee0b..62ed66cac6fa1e1e83a20eb35c67ba68aec7bb5f 100644 --- a/torch_npu/distributed/fsdp/_add_fsdp_patch.py +++ b/torch_npu/distributed/fsdp/_add_fsdp_patch.py @@ -122,7 +122,8 @@ def foreach_all_gather_copy_out_npu( # Copy to a temporary and then chunk-cat into the final all-gather # output tensors param_all_gather_outputs = [ - torch.empty_like(t) for t in param_all_gather_outputs + torch.empty_like(t) + for t in param_all_gather_outputs ] shard_i_copy_infos.append((fsdp_param, param_all_gather_outputs)) split_with_sizes_out.extend(param_all_gather_outputs)
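
The "returns new reference" annotations kept throughout the shim header imply the usual caller-side ownership pattern: every handle written through an AtenTensorHandle* out-parameter must eventually be released with aoti_torch_delete_tensor_object, which the header comments suggest doing through an RAII wrapper such as RAIIAtenTensorHandle. A minimal sketch of that pattern, assuming the shim header above is included, that AOTITorchError follows the zero-on-success convention, and with ScopedAtenTensor as a hypothetical stand-in for the real wrapper:

#include <utility>

// Hypothetical owning wrapper: releases the new reference on scope exit.
class ScopedAtenTensor {
public:
    ScopedAtenTensor() = default;
    ScopedAtenTensor(const ScopedAtenTensor &) = delete;
    ScopedAtenTensor &operator=(const ScopedAtenTensor &) = delete;
    ScopedAtenTensor(ScopedAtenTensor &&other) noexcept : handle_(std::exchange(other.handle_, nullptr)) {}
    ~ScopedAtenTensor()
    {
        if (handle_ != nullptr) {
            aoti_torch_delete_tensor_object(handle_);
        }
    }
    AtenTensorHandle get() const { return handle_; }
    AtenTensorHandle *out() { return &handle_; }  // pass to functions documented as "returns new reference"

private:
    AtenTensorHandle handle_ = nullptr;
};

// Usage sketch: deep-copy a handle and let the wrapper release it when it goes out of scope.
// Assumes AOTITorchError is zero on success.
inline ScopedAtenTensor clone_tensor(AtenTensorHandle self)
{
    ScopedAtenTensor cloned;
    AOTI_TORCH_CHECK(aoti_torch_clone(self, cloned.out()) == 0, "aoti_torch_clone failed");
    return cloned;
}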
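
For reference, the reformatted DEFINE_DTYPE_SPECIALIZATION macro maps a C++ type onto the matching aoti_torch_dtype_* query. The DEFINE_DTYPE_SPECIALIZATION(bool, bool) instantiation visible in the hunk context would expand to roughly the following sketch (assuming the aoti_torch_dtype_bool() declaration that lives elsewhere in this header):

template <> inline int32_t aoti_torch_dtype<bool>()
{
    // Resolves the dtype code for bool via the shim's per-dtype query function.
    return aoti_torch_dtype_bool();
}

Generated wrapper code can then ask for a dtype code generically, e.g. int32_t code = aoti_torch_dtype<bool>();, instead of hard-coding integer dtype values.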