From a6096c278dc375215a747ec437a7885bf466a5ee Mon Sep 17 00:00:00 2001 From: feihujiang Date: Wed, 27 Dec 2023 15:07:11 +0800 Subject: [PATCH] Maintain consistent style --- .../framework/contiguous/ContiguousOpt.cpp | 414 ++++---- .../framework/contiguous/combined_opt.cpp | 908 +++++++++--------- .../csrc/framework/contiguous/permute_opt.cpp | 408 ++++---- .../csrc/framework/contiguous/select_opt.cpp | 218 ++--- .../csrc/framework/contiguous/slice_opt.cpp | 262 ++--- 5 files changed, 1105 insertions(+), 1105 deletions(-) diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp index b6b2b788a29..bdbbc3950a1 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp +++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp @@ -3,224 +3,224 @@ #include namespace at_npu { -namespace native { - -OptimizationCases TransContiguous::optCasesDefault = {}; -OptimizationCases TransContiguous::optCasesAnyFormat = {"reshape", "slice"}; -ska::flat_hash_map TransContiguous::cached_contiguous_opt; - - -ContiguousTensorDesc TransContiguous::GetTensorDescInfo( - const at::Tensor &src, const OptimizationCases &opt_cases) { - auto src_base_info = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); - c10::SmallVector src_size_inferred; - c10::SmallVector src_stride_inferred; - c10::SmallVector src_storage_size_inferred = - src_base_info.storage_sizes_; - if (src.dim() == 0) { - src_size_inferred = {1}; - src_stride_inferred = {1}; - if (src_storage_size_inferred.size() == 0) { - src_storage_size_inferred = {1}; - } - } else { - src_size_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.sizes()); - src_stride_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.strides()); - } - ContiguousTensorDesc src_desc = { - src.is_contiguous(), src_size_inferred, - src_stride_inferred, src.storage_offset(), - src_base_info.base_sizes_, src_base_info.base_strides_, - src_storage_size_inferred, src_base_info.base_offset_, - src_base_info.npu_format_, opt_cases}; - if (src_desc.opt_cases_.empty()) { - src_desc.find_match_optimization_cases(); - } - return src_desc; -} - -bool TransContiguous::CheckClone(const at::Tensor &src, at::Tensor &self) { - // self tensor may not be temporary constructed empty tensor from src, so: - // 1. contiguous storage is needed:storage_offset and numels eq - // 2. 
full memory copy: size match between src and self - if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() && - src.sizes().equals(self.sizes()) && - self.sizes().equals(torch_npu::NPUBridge::GetNpuStorageImpl(self)->get_npu_desc().base_sizes_)) { - return true; - } - return false; -} - -bool TransContiguous::can_optimize_(ContiguousTensorDesc &tensor_desc) { - for (auto opt_case : tensor_desc.opt_cases_) { - bool res = register_opt::CopyOptRegister::GetInstance()->CanOptimize( - opt_case, tensor_desc); - if (res) { - // refresh patterns to only keep optimized pattern - tensor_desc.opt_cases_.clear(); - tensor_desc.opt_cases_.emplace_back(opt_case); - return true; - } - } - return false; -} - -bool TransContiguous::CanOptimize(ContiguousTensorDesc &tensor_desc) { - return can_optimize_(tensor_desc); -} - -bool TransContiguous::CanOptimize(const at::Tensor &tensor, - const OptimizationCases &opt_cases) { - ContiguousTensorDesc tensor_desc = GetTensorDescInfo(tensor, opt_cases); - return can_optimize_(tensor_desc); -} - -bool TransContiguous::contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) { - if (!CheckClone(src, self)) { - return false; - } - for (auto &opt_case : src_desc.opt_cases_) { - bool res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, - src, src_desc); - if (res) { - return true; + namespace native { + + OptimizationCases TransContiguous::optCasesDefault = {}; + OptimizationCases TransContiguous::optCasesAnyFormat = {"reshape", "slice"}; + ska::flat_hash_map TransContiguous::cached_contiguous_opt; + + + ContiguousTensorDesc TransContiguous::GetTensorDescInfo( + const at::Tensor &src, const OptimizationCases &opt_cases) { + auto src_base_info = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); + c10::SmallVector src_size_inferred; + c10::SmallVector src_stride_inferred; + c10::SmallVector src_storage_size_inferred = + src_base_info.storage_sizes_; + if (src.dim() == 0) { + src_size_inferred = {1}; + src_stride_inferred = {1}; + if (src_storage_size_inferred.size() == 0) { + src_storage_size_inferred = {1}; + } + } else { + src_size_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.sizes()); + src_stride_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.strides()); + } + ContiguousTensorDesc src_desc = { + src.is_contiguous(), src_size_inferred, + src_stride_inferred, src.storage_offset(), + src_base_info.base_sizes_, src_base_info.base_strides_, + src_storage_size_inferred, src_base_info.base_offset_, + src_base_info.npu_format_, opt_cases}; + if (src_desc.opt_cases_.empty()) { + src_desc.find_match_optimization_cases(); + } + return src_desc; + } + + bool TransContiguous::CheckClone(const at::Tensor &src, at::Tensor &self) { + // self tensor may not be temporary constructed empty tensor from src, so: + // 1. contiguous storage is needed:storage_offset and numels eq + // 2. 
full memory copy: size match between src and self + if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() && + src.sizes().equals(self.sizes()) && + self.sizes().equals(torch_npu::NPUBridge::GetNpuStorageImpl(self)->get_npu_desc().base_sizes_)) { + return true; + } + return false; + } + + bool TransContiguous::can_optimize_(ContiguousTensorDesc &tensor_desc) { + for (auto opt_case : tensor_desc.opt_cases_) { + bool res = register_opt::CopyOptRegister::GetInstance()->CanOptimize( + opt_case, tensor_desc); + if (res) { + // refresh patterns to only keep optimized pattern + tensor_desc.opt_cases_.clear(); + tensor_desc.opt_cases_.emplace_back(opt_case); + return true; + } + } + return false; } - } - return false; -} - - size_t GetHash_(const c10::SmallVector& small_vector_size) - { - size_t seed = 0; - for (auto i = 0; i < small_vector_size.size(); i++) { - seed ^= small_vector_size[i] + (seed << 6) + (seed >> 2); + + bool TransContiguous::CanOptimize(ContiguousTensorDesc &tensor_desc) { + return can_optimize_(tensor_desc); } - return seed; - } - - size_t GetHash_(const ContiguousTensorDesc &src_desc) - { - size_t hash_src_desc = (GetHash_(src_desc.sizes_)<<52) + - (GetHash_(src_desc.base_sizes_)<<40) + - (GetHash_(src_desc.strides_)<<28) + - (GetHash_(src_desc.base_strides_)<<16) + - (src_desc.offset_ << 4) + - src_desc.npu_format_; - return hash_src_desc; - } - - bool equalDesc(const ContiguousTensorDesc &src_desc, const ContiguousTensorDesc &desc_desc) - { - if (src_desc.sizes_ == desc_desc.sizes_ && - src_desc.base_sizes_ == desc_desc.base_sizes_ && - src_desc.strides_ == desc_desc.strides_ && - src_desc.base_strides_ == desc_desc.base_strides_ && - src_desc.offset_ == desc_desc.offset_ && - src_desc.npu_format_ == desc_desc.npu_format_) { - return true; + + bool TransContiguous::CanOptimize(const at::Tensor &tensor, + const OptimizationCases &opt_cases) { + ContiguousTensorDesc tensor_desc = GetTensorDescInfo(tensor, opt_cases); + return can_optimize_(tensor_desc); } - return false; - } - - bool TransContiguous::cached_contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) - { - // No cached, try caching - if (!CheckClone(src, self)) { + + bool TransContiguous::contiguous_optimize_with_anyformat_( + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) { + if (!CheckClone(src, self)) { + return false; + } + for (auto &opt_case : src_desc.opt_cases_) { + bool res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, + src, src_desc); + if (res) { + return true; + } + } return false; } - src_desc.hash_src_desc = GetHash_(src_desc); - auto it = TransContiguous::cached_contiguous_opt.find(src_desc.hash_src_desc); - if (it != TransContiguous::cached_contiguous_opt.end()) { - // Cached - if (equalDesc(src_desc, it->second.contiguous_tensor_desc)) { - src_desc.cached_contiguous = true; - auto &opt_case = it->second.cached_opt_case; - return register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, - src, src_desc); + + size_t GetHash_(const c10::SmallVector& small_vector_size) + { + size_t seed = 0; + for (auto i = 0; i < small_vector_size.size(); i++) { + seed ^= small_vector_size[i] + (seed << 6) + (seed >> 2); } + return seed; + } + + size_t GetHash_(const ContiguousTensorDesc &src_desc) + { + size_t hash_src_desc = (GetHash_(src_desc.sizes_)<<52) + + (GetHash_(src_desc.base_sizes_)<<40) + + (GetHash_(src_desc.strides_)<<28) + + (GetHash_(src_desc.base_strides_)<<16) + + 
(src_desc.offset_ << 4) + + src_desc.npu_format_; + return hash_src_desc; + } + + bool equalDesc(const ContiguousTensorDesc &src_desc, const ContiguousTensorDesc &desc_desc) + { + if (src_desc.sizes_ == desc_desc.sizes_ && + src_desc.base_sizes_ == desc_desc.base_sizes_ && + src_desc.strides_ == desc_desc.strides_ && + src_desc.base_strides_ == desc_desc.base_strides_ && + src_desc.offset_ == desc_desc.offset_ && + src_desc.npu_format_ == desc_desc.npu_format_) { + return true; + } + return false; + } + + bool TransContiguous::cached_contiguous_optimize_with_anyformat_( + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) + { + // No cached, try caching + if (!CheckClone(src, self)) { + return false; + } + src_desc.hash_src_desc = GetHash_(src_desc); + auto it = TransContiguous::cached_contiguous_opt.find(src_desc.hash_src_desc); + if (it != TransContiguous::cached_contiguous_opt.end()) { + // Cached + if (equalDesc(src_desc, it->second.contiguous_tensor_desc)) { + src_desc.cached_contiguous = true; + auto &opt_case = it->second.cached_opt_case; + return register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, + src, src_desc); + } + return contiguous_optimize_with_anyformat_(self, src, src_desc); + } + + for (auto &opt_case : src_desc.opt_cases_) { + bool res = false; + if (TransContiguous::cached_contiguous_opt.size() >= CachedMaxSize) { + res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, src, src_desc); + } else { + src_desc.cached_contiguous = false; + res = register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, src, src_desc); + } + if (res) { + return true; + } + } + return false; + } + + bool TransContiguous::ContiguousOptimizeWithAnyFormat( + at::Tensor &self, const at::Tensor &src, + const OptimizationCases &opt_cases) { + ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); return contiguous_optimize_with_anyformat_(self, src, src_desc); } - for (auto &opt_case : src_desc.opt_cases_) { - bool res = false; - if (TransContiguous::cached_contiguous_opt.size() >= CachedMaxSize) { - res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, src, src_desc); - } else { - src_desc.cached_contiguous = false; - res = register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, src, src_desc); + c10::optional TransContiguous::ContiguousOptimizeWithAnyFormat( + const at::Tensor &src, const OptimizationCases &opt_cases) { + TORCH_CHECK(src.device().type() == c10::DeviceType::PrivateUse1, + "Expected all tensors to be on the same device. " + "Expected NPU tensor, please check whether the input tensor device is correct."); + auto self = OpPreparation::ApplyTensorWithFormat( + src.sizes(), src.options(), torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); + ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); + if (cached_contiguous_optimize_with_anyformat_(self, src, src_desc)) { + return self; } - if (res) { - return true; + return c10::nullopt; + } + + bool TransContiguous::ContiguousOptimizeWithBaseFormat( + at::Tensor &self, const at::Tensor &src, const OptimizationCases &opt_cases, + bool OpenCombined) { + TORCH_CHECK(FormatHelper::IsBaseFormatType(src), + "ContiguousOptimizeWithBaseFormat func requires Input Tensor " + "with base format!"); + // In non-specific cases, classify the cases and simplify judgement. 
+ ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); + if (OpenCombined && + c10_npu::option::OptionsManager::CheckCombinedOptimizerEnable()) { + src_desc.add_optimization_case("combined"); } + return cached_contiguous_optimize_with_anyformat_(self, src, src_desc); } - return false; - } - -bool TransContiguous::ContiguousOptimizeWithAnyFormat( - at::Tensor &self, const at::Tensor &src, - const OptimizationCases &opt_cases) { - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - return contiguous_optimize_with_anyformat_(self, src, src_desc); -} - -c10::optional TransContiguous::ContiguousOptimizeWithAnyFormat( - const at::Tensor &src, const OptimizationCases &opt_cases) { - TORCH_CHECK(src.device().type() == c10::DeviceType::PrivateUse1, - "Expected all tensors to be on the same device. " - "Expected NPU tensor, please check whether the input tensor device is correct."); - auto self = OpPreparation::ApplyTensorWithFormat( - src.sizes(), src.options(), torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - if (cached_contiguous_optimize_with_anyformat_(self, src, src_desc)) { - return self; - } - return c10::nullopt; -} - -bool TransContiguous::ContiguousOptimizeWithBaseFormat( - at::Tensor &self, const at::Tensor &src, const OptimizationCases &opt_cases, - bool OpenCombined) { - TORCH_CHECK(FormatHelper::IsBaseFormatType(src), - "ContiguousOptimizeWithBaseFormat func requires Input Tensor " - "with base format!"); - // In non-specific cases, classify the cases and simplify judgement. - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - if (OpenCombined && - c10_npu::option::OptionsManager::CheckCombinedOptimizerEnable()) { - src_desc.add_optimization_case("combined"); - } - return cached_contiguous_optimize_with_anyformat_(self, src, src_desc); -} - - - at::Tensor TransContiguous::view_tensor(const at::Tensor& self, - int64_t offset, - const c10::IntArrayRef& sizes, - const c10::IntArrayRef& strides) - { - at::Tensor self_; - if (self.is_quantized()) { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype(), - get_qtensorimpl(self)->quantizer()); - } else { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype()); + + + at::Tensor TransContiguous::view_tensor(const at::Tensor& self, + int64_t offset, + const c10::IntArrayRef& sizes, + const c10::IntArrayRef& strides) + { + at::Tensor self_; + if (self.is_quantized()) { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype(), + get_qtensorimpl(self)->quantizer()); + } else { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype()); + } + auto* self_tmp_ = self_.unsafeGetTensorImpl(); + self_tmp_->set_storage_offset(offset); + self_tmp_->set_sizes_and_strides(sizes, strides); + at::namedinference::propagate_names(self_, self); + return self_; } - auto* self_tmp_ = self_.unsafeGetTensorImpl(); - self_tmp_->set_storage_offset(offset); - self_tmp_->set_sizes_and_strides(sizes, strides); - at::namedinference::propagate_names(self_, self); - return self_; - } - -} // namespace native + + } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp 
b/torch_npu/csrc/framework/contiguous/combined_opt.cpp index fb8255f44fa..cbd5fcefa75 100644 --- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp @@ -6,463 +6,463 @@ #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" namespace at_npu { -namespace native { - -class CombinedContiguousOpt : public ContiguousOpt { -public: - // Combined tensor == discontiguous tensor caused by combined view operators. - bool Optimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override { - // Maximum combined operators suggested: combined_cases_num = 2 - // NOTE: n-cmobined(n>2) can also be supported - int combined_cases_num = MaxCombinedCasesNum; - - ShapeStrideStack shape_stride_stacks; - OffsetStack offset_stack; - - if (can_use_combined(shape_stride_stacks, offset_stack, src_desc, - combined_cases_num)) { - RECORD_FUNCTION("contiguous_h_combined", std::vector({src})); - return pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); - } - return false; - } - - bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override - { - ShapeStrideStack shape_stride_stacks; - OffsetStack offset_stack; - if (src_desc.cached_contiguous) { - RECORD_FUNCTION("cached_contiguous_h_combined", std::vector({src})); - - CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; - shape_stride_stacks = cachedContiguousOpt.shape_stride_stack; - offset_stack = cachedContiguousOpt.offset_stack; - return pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); - } - - int combined_cases_num = MaxCombinedCasesNum; - if (can_use_combined(shape_stride_stacks, offset_stack, src_desc, - combined_cases_num)) { - ShapeStrideStack cached_shape_stride_stacks = shape_stride_stacks; - OffsetStack cached_offset_stack = offset_stack; - RECORD_FUNCTION("contiguous_h_combined", std::vector({src})); - - bool contiguousOrNot = pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); - if (contiguousOrNot) { - CachedContiguousOpt cached_opt = CachedContiguousOpt{ - "combined" - }; - cached_opt.shape_stride_stack = cached_shape_stride_stacks; - cached_opt.offset_stack = cached_offset_stack; - cached_opt.contiguous_tensor_desc = src_desc; - TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; + namespace native { + + class CombinedContiguousOpt : public ContiguousOpt { + public: + // Combined tensor == discontiguous tensor caused by combined view operators. 
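// Editor's illustration (not part of this change; shapes are hypothetical and
// assume standard ATen view semantics): a "combined" discontiguous tensor is one
// whose strides cannot be explained by a single view op, e.g.
//   auto y = x.transpose(0, 1).slice(0, 1, 5);   // permute followed by slice
// Neither the permute pattern nor the slice pattern matches y on its own;
// decomposing it into the two underlying views (see can_use_combined below)
// is what this optimizer attempts.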
+ bool Optimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override { + // Maximum combined operators suggested: combined_cases_num = 2 + // NOTE: n-cmobined(n>2) can also be supported + int combined_cases_num = MaxCombinedCasesNum; + + ShapeStrideStack shape_stride_stacks; + OffsetStack offset_stack; + + if (can_use_combined(shape_stride_stacks, offset_stack, src_desc, + combined_cases_num)) { + RECORD_FUNCTION("contiguous_h_combined", std::vector({src})); + return pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); + } + return false; } - return contiguousOrNot; - } - return false; - } -private: + bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override + { + ShapeStrideStack shape_stride_stacks; + OffsetStack offset_stack; + if (src_desc.cached_contiguous) { + RECORD_FUNCTION("cached_contiguous_h_combined", std::vector({src})); + + CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; + shape_stride_stacks = cachedContiguousOpt.shape_stride_stack; + offset_stack = cachedContiguousOpt.offset_stack; + return pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); + } + + int combined_cases_num = MaxCombinedCasesNum; + if (can_use_combined(shape_stride_stacks, offset_stack, src_desc, + combined_cases_num)) { + ShapeStrideStack cached_shape_stride_stacks = shape_stride_stacks; + OffsetStack cached_offset_stack = offset_stack; + RECORD_FUNCTION("contiguous_h_combined", std::vector({src})); + + bool contiguousOrNot = pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); + if (contiguousOrNot) { + CachedContiguousOpt cached_opt = CachedContiguousOpt{ + "combined" + }; + cached_opt.shape_stride_stack = cached_shape_stride_stacks; + cached_opt.offset_stack = cached_offset_stack; + cached_opt.contiguous_tensor_desc = src_desc; + TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; + } + return contiguousOrNot; + } + return false; + } - bool pre_combined_to_contiguous(at::Tensor &self, const at::Tensor &src, - ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stack) - { - // Record src infos for recovering after trans-contiguous - auto src_storage_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); - - at::Tensor base_tensor = - at::empty(src_storage_desc.base_sizes_, src.options()); - base_tensor.set_(src.storage()); - - // Reconstruct combined discontiguous tensor ==trans==> contiguous tensor - bool contiguousOrNot = combined_to_contiguous(self, base_tensor, shape_stride_stacks, offset_stack); - // Recover modified tensor infos of src after trans-contiguous - StorageDescHelper::CopyDesc(base_tensor, src_storage_desc); - return contiguousOrNot; - } - - bool cases_avoid(const ContiguousTensorDesc &tensor_desc) - { - for (const auto i : c10::irange(tensor_desc.sizes_.size())) { - // expand+x,x+expand - if (tensor_desc.strides_[i] == 0) { + private: + + bool pre_combined_to_contiguous(at::Tensor &self, const at::Tensor &src, + ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stack) + { + // Record src infos for recovering after trans-contiguous + auto src_storage_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); + + at::Tensor base_tensor = + at::empty(src_storage_desc.base_sizes_, src.options()); + base_tensor.set_(src.storage()); + + // Reconstruct combined discontiguous tensor ==trans==> contiguous tensor + bool 
contiguousOrNot = combined_to_contiguous(self, base_tensor, shape_stride_stacks, offset_stack); + // Recover modified tensor infos of src after trans-contiguous + StorageDescHelper::CopyDesc(base_tensor, src_storage_desc); + return contiguousOrNot; + } + + bool cases_avoid(const ContiguousTensorDesc &tensor_desc) + { + for (const auto i : c10::irange(tensor_desc.sizes_.size())) { + // expand+x,x+expand + if (tensor_desc.strides_[i] == 0) { + return true; + } + } + return false; + } + + // Unmatched tensor ==refresh(no copy)==> macthed tensor + bool reshape_without_copy_match(at::Tensor &tensor) { + if (!tensor.is_contiguous()) { + return false; + } + auto npu_desc = torch_npu::NPUBridge::GetNpuStorageImpl(tensor)->get_npu_desc(); + if ((c10::multiply_integers(tensor.sizes()) != + c10::multiply_integers(npu_desc.base_sizes_)) || + (tensor.storage_offset() != npu_desc.base_offset_)) { + return false; + } + RECORD_FUNCTION("contiguous_h_match", std::vector({tensor})); + StorageDescHelper::SetDesc(tensor, CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor.sizes()), + CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor.strides())); + return true; + } + + // Whether tensor can be optimized(no optimization). + bool can_be_optimize_from_default_cases(ContiguousTensorDesc &tensor_desc) { + OptimizationCases opt_cases{"reshape", "slice", "select"}; + tensor_desc.reset_optimization_cases(opt_cases); + return TransContiguous::CanOptimize(tensor_desc); + } + + // Conduct trans-contiguous for given optimization cases. + bool + copy_optimize_contiguous_by_given_cases(at::Tensor &self, + const at::Tensor &tensor, + OptimizationCases &optimizations) { + // Set "OpenCombined = false" to avoid recursion. + return TransContiguous::ContiguousOptimizeWithBaseFormat( + self, tensor, optimizations, false); + } + + // Weak constrains for transpose cases + bool maybe_permute(const ContiguousTensorDesc &tensor_desc) { + // tensors with nonmonotonic strides will be taken into consideration + // (Ascend): 对于特殊stride的情况例如:[*,*,1,1]这种,需要进一步分析影响 + for (const auto i : c10::irange(tensor_desc.sizes_.size() - 1)) { + if (tensor_desc.strides_[i] < tensor_desc.strides_[i + 1]) { + return true; + } + } + return false; + } + + // Weak constrains for select cases + bool maybe_select(const ContiguousTensorDesc &tensor_desc) { + for (auto i = tensor_desc.sizes_.size() - 1; i > 0; i--) { + if (tensor_desc.strides_[i - 1] % + (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != + 0) { + return false; + } + if (tensor_desc.strides_[i - 1] / + (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != + 1) { + if (tensor_desc.offset_ % + (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != + 0) { + return false; + } + // Avoid combined-cases such as squeeze+indexing at the first axis. + if (tensor_desc.strides_[0] != tensor_desc.base_strides_[0]) { + return false; + } + } + } + return true; + } + + // Weak constrains for slice cases + bool maybe_slice(const ContiguousTensorDesc &tensor_desc) { + // tensors with reduced numel will be taken into consideration. + if (c10::multiply_integers(tensor_desc.sizes_) < + c10::multiply_integers(tensor_desc.base_sizes_)) { + for (const auto i : c10::irange(tensor_desc.sizes_.size() - 2)) { + if (tensor_desc.strides_[i] % tensor_desc.strides_[i + 1] != 0) { + return false; + } + } + return true; + } + return false; + } + + /* + Kernel function of "Inference", + Key inferred infos: infer_size,infer_stride and infer_offset, + Inference order: permute, select, slice. 
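// Editor's illustration of the permute branch (hypothetical shapes, not part of
// this change): for a view with sizes [6, 4] and strides [1, 6], the
// non-monotonic strides trigger maybe_permute(); sorting the strides to [6, 1]
// and mapping each stride back to its size yields an inferred base of
// sizes [4, 6] / strides [6, 1] with infer_offset = 0, i.e. the layout the
// tensor had before it was transposed.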
+ */ + bool can_infer_view_tensor(ContiguousTensorDesc &tensor_desc, + FormatShape &infer_size, FormatShape &infer_stride, + int64_t &infer_offset) { + const auto &view_sizes = tensor_desc.sizes_; + const auto &view_strides = tensor_desc.strides_; + + if (maybe_permute(tensor_desc)) { + FormatShape &permute_size_sorted = infer_size; + FormatShape &permute_stride_sorted = infer_stride; + permute_size_sorted = view_sizes; + permute_stride_sorted = view_strides; + + // Sort stride + std::sort(permute_stride_sorted.rbegin(), permute_stride_sorted.rend()); + + // Map stride to shape + std::map map_shape_stride; + std::map label_map_shape_stride; + for (const auto i : c10::irange(view_sizes.size())) { + map_shape_stride[view_strides[i]] = view_sizes[i]; + } + // 除去第0维,其他维shape为1时,不记录对应的stride值,该stride的值会和其他维的stride有重复 + for (const auto i : c10::irange(view_sizes.size())) { + if (i == 0) { + map_shape_stride[view_strides[0]] = view_sizes[0]; + } else if (i != 0 && view_sizes[i] != 1) { + map_shape_stride[view_strides[i]] = view_sizes[i]; + } + } + // stride中有相等的情况,后面相等的stride对应的shape为1 + for (const auto i : c10::irange(view_sizes.size())) { + if (label_map_shape_stride[permute_stride_sorted[i]] != true) { + permute_size_sorted[i] = map_shape_stride[permute_stride_sorted[i]]; + label_map_shape_stride[permute_stride_sorted[i]] = true; + } else { + permute_size_sorted[i] = 1; + } + } + infer_offset = 0; + // Refresh tensor's base info to construct transposed tensor + tensor_desc.base_sizes_ = permute_size_sorted; + tensor_desc.base_strides_ = permute_stride_sorted; + // double-checking of may_permute is not required, because view strides + // does not changed. return true; + } + + if (maybe_select(tensor_desc)) { + FormatShape &select_size = infer_size; + FormatShape &select_stride = infer_stride; + // Infer base shape according to view shape and stride + select_stride = view_strides; + select_size = view_sizes; + // select_size and stride should be one more than view_size + select_size.emplace_back((int64_t)1); + select_stride.emplace_back((int64_t)1); + + int64_t i = static_cast(view_sizes.size()) - 1; + if (view_strides[i] == 1) { + select_size[i + 1] = view_sizes[i]; + select_stride[i + 1] = 1; + + for (i = i - 1; i >= 0; i--) { + if (view_strides[i] != view_strides[i + 1] * view_sizes[i + 1]) { + select_size[i + 1] = + view_strides[i] / (view_sizes[i + 1] * view_strides[i + 1]); + select_stride[i + 1] = view_sizes[i + 1] * view_strides[i + 1]; + infer_offset = tensor_desc.offset_ % view_strides[i]; + break; + } + select_size[i + 1] = view_sizes[i]; + select_stride[i + 1] = view_strides[i]; + } + } else { + select_size[i + 1] = view_strides[i]; + select_stride[i + 1] = 1; + infer_offset = tensor_desc.offset_ % view_strides[i]; + } + for (i = i - 1; i >= 0; i--) { + select_size[i + 1] = view_sizes[i + 1]; + select_stride[i + 1] = view_strides[i + 1]; + } + + select_size[0] = view_sizes[0]; + select_stride[0] = view_strides[0]; + + // Refresh tensor's base info to construct selected tensor + tensor_desc.base_sizes_ = select_size; + tensor_desc.base_strides_ = select_stride; + // Whether the construted tensor is selected? 
+ return maybe_select(tensor_desc); + } + + if (maybe_slice(tensor_desc)) { + FormatShape &slice_size = infer_size; + FormatShape &slice_stride = infer_stride; + + slice_stride = view_strides; + slice_size = view_sizes; + // Infer base shape according to base stride + for (auto i = slice_size.size() - 1; i > 0; i--) { + // Strides is not divisible means this case cannot be inferred. + if (view_strides[i] == 0 || + view_strides[i - 1] % view_strides[i] != 0) { + return false; + } + slice_size[i] = (view_strides[i - 1] / view_strides[i]); + } + slice_size[0] = 1; + slice_size[0] = (c10::multiply_integers(tensor_desc.base_sizes_) / + c10::multiply_integers(slice_size)); + infer_offset = tensor_desc.offset_; + // Refresh tensor's base info and storage info to construct sliced tensor + tensor_desc.base_sizes_ = slice_size; + tensor_desc.base_strides_ = slice_stride; + // Whether the construted tensor is sliced? + return maybe_slice(tensor_desc); + } + return false; + } + + bool stack_infer_info(ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stacks, int64_t infer_offset, + int64_t combined_cases_num, + ContiguousTensorDesc &tensor_desc) { + // Only combined_cases_num-combined Ops cases are taken into consideration + if (static_cast(shape_stride_stacks.size()) == combined_cases_num) { + return false; + } + + c10::SmallVector stack_shape_stride_part; + stack_shape_stride_part.emplace_back( + CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor_desc.sizes_)); + stack_shape_stride_part.emplace_back( + CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor_desc.strides_)); + + shape_stride_stacks.emplace_back(stack_shape_stride_part); + offset_stacks.emplace_back(infer_offset); + return true; + } + + // Conduct inferring + bool can_use_combined(ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stacks, + const ContiguousTensorDesc &src_desc, + int64_t combined_cases_num) { + // combined tensor should be discontiguous + if (src_desc.is_contiguous_ || cases_avoid(src_desc)) { + return false; + } + + // Key infos that should be inferred. + FormatShape infer_size; + FormatShape infer_stride; + int64_t infer_offset = 0; + + // Reconstruct "the discontiguous combined tensor desc" + // viewInfo = combined tensor(src)'s viewInfo + // baseInfo = combined tensor(src)'s baseInfo + // src's desc would be modified, so a local struct is created. + ContiguousTensorDesc local_src_desc = src_desc; + + // Construct "the first inferred tensor" inside "can_infer_view_tensor()" + // viewInfo = combined tensor(src)'s viewInfo + // baseInfo = inferred info(infer_size, infer_stride, infer_offset) + // If the first inferred tensor can be optimized, store its info. 
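// Editor's summary of the two passes below (descriptive only, not part of this
// change): the first stack_infer_info() call pushes the combined view's
// (sizes, strides) together with the inferred split offset; the second call
// pushes the inferred intermediate view. reconstruct_tensor() later pops the
// records in reverse, so the chain is replayed one view at a time, innermost
// view first.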
+ if (can_infer_view_tensor(local_src_desc, infer_size, infer_stride, + infer_offset) && + stack_infer_info(shape_stride_stacks, offset_stacks, infer_offset, + combined_cases_num, local_src_desc)) { + // Construct "the second inferred tensor" + // viewInfo = inferred info(infer_size, infer_stride, infer_offset) + // baseInfo = combined tensor(src)'s baseInfo + local_src_desc.sizes_ = infer_size; + local_src_desc.strides_ = infer_stride; + local_src_desc.offset_ -= infer_offset; + local_src_desc.base_sizes_ = src_desc.base_sizes_; + local_src_desc.base_strides_ = src_desc.base_strides_; + local_src_desc.refresh_contiguous_using_size_and_stride(); + // The second inferred tensor can be optimized or not + if (can_be_optimize_from_default_cases(local_src_desc) && + stack_infer_info(shape_stride_stacks, offset_stacks, + local_src_desc.offset_, combined_cases_num, + local_src_desc)) { + return true; + } + // If the second pattern is not inferred successfully, retrun false + return false; + } + // If the first pattern is not inferred successfully, retrun false + return false; + } + + // Reconstructing discontiguous tensor at trans-contiguous procedure. + bool reconstruct_tensor(at::Tensor &src, + ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stacks) { + auto stack_shape_stride = shape_stride_stacks.pop_back_val(); + auto stack_offset = offset_stacks.pop_back_val(); + // Set view info to make discontiguous tensor. + // stack_shape_stride[0]: stored shape infos in inferring procedure. + // stack_shape_stride[1]: stored stride infos in inferring procedure. + + src.set_(src.storage(), stack_offset, stack_shape_stride[0], + stack_shape_stride[1]); + + // If current tensor is sliced and the stack is still not empty: + // stored infos in the stack should be modified. 
+ if (shape_stride_stacks.size() >= 1 && + maybe_slice(TransContiguous::GetTensorDescInfo(src))) { + auto stack_shape_stride_pre = shape_stride_stacks.pop_back_val(); + + std::map map_stride_shape; + auto computed_stride = + StorageDescHelper::ComputeStrideFromShape(stack_shape_stride[0]); + // Adjust shape according to sorted stride + for (const auto i : c10::irange(stack_shape_stride_pre[0].size())) { + // if shape_i equals to shape_j, non-unique keys for "map_stride_shape" would be made; + // Temporarily, making size[i] * stride[i] to obtain unique keys; + // (Ascend): explore unique keys for any cases when "shape[i] == shape [j]" + map_stride_shape[stack_shape_stride[0][i] * stack_shape_stride[1][i]] = + computed_stride[i]; + } + + for (const auto i : c10::irange(stack_shape_stride_pre[0].size())) { + stack_shape_stride_pre[1][i] = + map_stride_shape[stack_shape_stride_pre[0][i] * + stack_shape_stride_pre[1][i]]; + } + // re-store modified infos + shape_stride_stacks.emplace_back(stack_shape_stride_pre); + } + return true; } - } - return false; - } - - // Unmatched tensor ==refresh(no copy)==> macthed tensor - bool reshape_without_copy_match(at::Tensor &tensor) { - if (!tensor.is_contiguous()) { - return false; - } - auto npu_desc = torch_npu::NPUBridge::GetNpuStorageImpl(tensor)->get_npu_desc(); - if ((c10::multiply_integers(tensor.sizes()) != - c10::multiply_integers(npu_desc.base_sizes_)) || - (tensor.storage_offset() != npu_desc.base_offset_)) { - return false; - } - RECORD_FUNCTION("contiguous_h_match", std::vector({tensor})); - StorageDescHelper::SetDesc(tensor, CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor.sizes()), - CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor.strides())); - return true; - } - - // Whether tensor can be optimized(no optimization). - bool can_be_optimize_from_default_cases(ContiguousTensorDesc &tensor_desc) { - OptimizationCases opt_cases{"reshape", "slice", "select"}; - tensor_desc.reset_optimization_cases(opt_cases); - return TransContiguous::CanOptimize(tensor_desc); - } - - // Conduct trans-contiguous for given optimization cases. - bool - copy_optimize_contiguous_by_given_cases(at::Tensor &self, - const at::Tensor &tensor, - OptimizationCases &optimizations) { - // Set "OpenCombined = false" to avoid recursion. - return TransContiguous::ContiguousOptimizeWithBaseFormat( - self, tensor, optimizations, false); - } - - // Weak constrains for transpose cases - bool maybe_permute(const ContiguousTensorDesc &tensor_desc) { - // tensors with nonmonotonic strides will be taken into consideration - // (Ascend): 对于特殊stride的情况例如:[*,*,1,1]这种,需要进一步分析影响 - for (const auto i : c10::irange(tensor_desc.sizes_.size() - 1)) { - if (tensor_desc.strides_[i] < tensor_desc.strides_[i + 1]) { - return true; - } - } - return false; - } - - // Weak constrains for select cases - bool maybe_select(const ContiguousTensorDesc &tensor_desc) { - for (auto i = tensor_desc.sizes_.size() - 1; i > 0; i--) { - if (tensor_desc.strides_[i - 1] % - (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != - 0) { - return false; - } - if (tensor_desc.strides_[i - 1] / - (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != - 1) { - if (tensor_desc.offset_ % - (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != - 0) { - return false; - } - // Avoid combined-cases such as squeeze+indexing at the first axis. 
- if (tensor_desc.strides_[0] != tensor_desc.base_strides_[0]) { - return false; - } - } - } - return true; - } - - // Weak constrains for slice cases - bool maybe_slice(const ContiguousTensorDesc &tensor_desc) { - // tensors with reduced numel will be taken into consideration. - if (c10::multiply_integers(tensor_desc.sizes_) < - c10::multiply_integers(tensor_desc.base_sizes_)) { - for (const auto i : c10::irange(tensor_desc.sizes_.size() - 2)) { - if (tensor_desc.strides_[i] % tensor_desc.strides_[i + 1] != 0) { - return false; - } - } - return true; - } - return false; - } - - /* -Kernel function of "Inference", -Key inferred infos: infer_size,infer_stride and infer_offset, -Inference order: permute, select, slice. -*/ - bool can_infer_view_tensor(ContiguousTensorDesc &tensor_desc, - FormatShape &infer_size, FormatShape &infer_stride, - int64_t &infer_offset) { - const auto &view_sizes = tensor_desc.sizes_; - const auto &view_strides = tensor_desc.strides_; - - if (maybe_permute(tensor_desc)) { - FormatShape &permute_size_sorted = infer_size; - FormatShape &permute_stride_sorted = infer_stride; - permute_size_sorted = view_sizes; - permute_stride_sorted = view_strides; - - // Sort stride - std::sort(permute_stride_sorted.rbegin(), permute_stride_sorted.rend()); - - // Map stride to shape - std::map map_shape_stride; - std::map label_map_shape_stride; - for (const auto i : c10::irange(view_sizes.size())) { - map_shape_stride[view_strides[i]] = view_sizes[i]; - } - // 除去第0维,其他维shape为1时,不记录对应的stride值,该stride的值会和其他维的stride有重复 - for (const auto i : c10::irange(view_sizes.size())) { - if (i == 0) { - map_shape_stride[view_strides[0]] = view_sizes[0]; - } else if (i != 0 && view_sizes[i] != 1) { - map_shape_stride[view_strides[i]] = view_sizes[i]; - } - } - // stride中有相等的情况,后面相等的stride对应的shape为1 - for (const auto i : c10::irange(view_sizes.size())) { - if (label_map_shape_stride[permute_stride_sorted[i]] != true) { - permute_size_sorted[i] = map_shape_stride[permute_stride_sorted[i]]; - label_map_shape_stride[permute_stride_sorted[i]] = true; - } else { - permute_size_sorted[i] = 1; - } - } - infer_offset = 0; - // Refresh tensor's base info to construct transposed tensor - tensor_desc.base_sizes_ = permute_size_sorted; - tensor_desc.base_strides_ = permute_stride_sorted; - // double-checking of may_permute is not required, because view strides - // does not changed. 
- return true; - } - - if (maybe_select(tensor_desc)) { - FormatShape &select_size = infer_size; - FormatShape &select_stride = infer_stride; - // Infer base shape according to view shape and stride - select_stride = view_strides; - select_size = view_sizes; - // select_size and stride should be one more than view_size - select_size.emplace_back((int64_t)1); - select_stride.emplace_back((int64_t)1); - - int64_t i = static_cast(view_sizes.size()) - 1; - if (view_strides[i] == 1) { - select_size[i + 1] = view_sizes[i]; - select_stride[i + 1] = 1; - - for (i = i - 1; i >= 0; i--) { - if (view_strides[i] != view_strides[i + 1] * view_sizes[i + 1]) { - select_size[i + 1] = - view_strides[i] / (view_sizes[i + 1] * view_strides[i + 1]); - select_stride[i + 1] = view_sizes[i + 1] * view_strides[i + 1]; - infer_offset = tensor_desc.offset_ % view_strides[i]; - break; - } - select_size[i + 1] = view_sizes[i]; - select_stride[i + 1] = view_strides[i]; - } - } else { - select_size[i + 1] = view_strides[i]; - select_stride[i + 1] = 1; - infer_offset = tensor_desc.offset_ % view_strides[i]; - } - for (i = i - 1; i >= 0; i--) { - select_size[i + 1] = view_sizes[i + 1]; - select_stride[i + 1] = view_strides[i + 1]; - } - - select_size[0] = view_sizes[0]; - select_stride[0] = view_strides[0]; - - // Refresh tensor's base info to construct selected tensor - tensor_desc.base_sizes_ = select_size; - tensor_desc.base_strides_ = select_stride; - // Whether the construted tensor is selected? - return maybe_select(tensor_desc); - } - - if (maybe_slice(tensor_desc)) { - FormatShape &slice_size = infer_size; - FormatShape &slice_stride = infer_stride; - - slice_stride = view_strides; - slice_size = view_sizes; - // Infer base shape according to base stride - for (auto i = slice_size.size() - 1; i > 0; i--) { - // Strides is not divisible means this case cannot be inferred. - if (view_strides[i] == 0 || - view_strides[i - 1] % view_strides[i] != 0) { - return false; - } - slice_size[i] = (view_strides[i - 1] / view_strides[i]); - } - slice_size[0] = 1; - slice_size[0] = (c10::multiply_integers(tensor_desc.base_sizes_) / - c10::multiply_integers(slice_size)); - infer_offset = tensor_desc.offset_; - // Refresh tensor's base info and storage info to construct sliced tensor - tensor_desc.base_sizes_ = slice_size; - tensor_desc.base_strides_ = slice_stride; - // Whether the construted tensor is sliced? 
- return maybe_slice(tensor_desc); - } - return false; - } - - bool stack_infer_info(ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stacks, int64_t infer_offset, - int64_t combined_cases_num, - ContiguousTensorDesc &tensor_desc) { - // Only combined_cases_num-combined Ops cases are taken into consideration - if (static_cast(shape_stride_stacks.size()) == combined_cases_num) { - return false; - } - - c10::SmallVector stack_shape_stride_part; - stack_shape_stride_part.emplace_back( - CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor_desc.sizes_)); - stack_shape_stride_part.emplace_back( - CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor_desc.strides_)); - - shape_stride_stacks.emplace_back(stack_shape_stride_part); - offset_stacks.emplace_back(infer_offset); - return true; - } - - // Conduct inferring - bool can_use_combined(ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stacks, - const ContiguousTensorDesc &src_desc, - int64_t combined_cases_num) { - // combined tensor should be discontiguous - if (src_desc.is_contiguous_ || cases_avoid(src_desc)) { - return false; - } - - // Key infos that should be inferred. - FormatShape infer_size; - FormatShape infer_stride; - int64_t infer_offset = 0; - - // Reconstruct "the discontiguous combined tensor desc" - // viewInfo = combined tensor(src)'s viewInfo - // baseInfo = combined tensor(src)'s baseInfo - // src's desc would be modified, so a local struct is created. - ContiguousTensorDesc local_src_desc = src_desc; - - // Construct "the first inferred tensor" inside "can_infer_view_tensor()" - // viewInfo = combined tensor(src)'s viewInfo - // baseInfo = inferred info(infer_size, infer_stride, infer_offset) - // If the first inferred tensor can be optimized, store its info. - if (can_infer_view_tensor(local_src_desc, infer_size, infer_stride, - infer_offset) && - stack_infer_info(shape_stride_stacks, offset_stacks, infer_offset, - combined_cases_num, local_src_desc)) { - // Construct "the second inferred tensor" - // viewInfo = inferred info(infer_size, infer_stride, infer_offset) - // baseInfo = combined tensor(src)'s baseInfo - local_src_desc.sizes_ = infer_size; - local_src_desc.strides_ = infer_stride; - local_src_desc.offset_ -= infer_offset; - local_src_desc.base_sizes_ = src_desc.base_sizes_; - local_src_desc.base_strides_ = src_desc.base_strides_; - local_src_desc.refresh_contiguous_using_size_and_stride(); - // The second inferred tensor can be optimized or not - if (can_be_optimize_from_default_cases(local_src_desc) && - stack_infer_info(shape_stride_stacks, offset_stacks, - local_src_desc.offset_, combined_cases_num, - local_src_desc)) { - return true; - } - // If the second pattern is not inferred successfully, retrun false - return false; - } - // If the first pattern is not inferred successfully, retrun false - return false; - } - - // Reconstructing discontiguous tensor at trans-contiguous procedure. - bool reconstruct_tensor(at::Tensor &src, - ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stacks) { - auto stack_shape_stride = shape_stride_stacks.pop_back_val(); - auto stack_offset = offset_stacks.pop_back_val(); - // Set view info to make discontiguous tensor. - // stack_shape_stride[0]: stored shape infos in inferring procedure. - // stack_shape_stride[1]: stored stride infos in inferring procedure. 
- - src.set_(src.storage(), stack_offset, stack_shape_stride[0], - stack_shape_stride[1]); - - // If current tensor is sliced and the stack is still not empty: - // stored infos in the stack should be modified. - if (shape_stride_stacks.size() >= 1 && - maybe_slice(TransContiguous::GetTensorDescInfo(src))) { - auto stack_shape_stride_pre = shape_stride_stacks.pop_back_val(); - - std::map map_stride_shape; - auto computed_stride = - StorageDescHelper::ComputeStrideFromShape(stack_shape_stride[0]); - // Adjust shape according to sorted stride - for (const auto i : c10::irange(stack_shape_stride_pre[0].size())) { - // if shape_i equals to shape_j, non-unique keys for "map_stride_shape" would be made; - // Temporarily, making size[i] * stride[i] to obtain unique keys; - // (Ascend): explore unique keys for any cases when "shape[i] == shape [j]" - map_stride_shape[stack_shape_stride[0][i] * stack_shape_stride[1][i]] = - computed_stride[i]; - } - - for (const auto i : c10::irange(stack_shape_stride_pre[0].size())) { - stack_shape_stride_pre[1][i] = - map_stride_shape[stack_shape_stride_pre[0][i] * - stack_shape_stride_pre[1][i]]; - } - // re-store modified infos - shape_stride_stacks.emplace_back(stack_shape_stride_pre); - } - return true; - } - - // Conduct trans-contiguous under strict constrains - bool combined_to_contiguous(at::Tensor &self, at::Tensor &src, - ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stacks) { - // Base case: the last tensor to be processed. - if (shape_stride_stacks.size() == 1) { - if (reconstruct_tensor(src, shape_stride_stacks, offset_stacks)) { - OptimizationCases opt_cases_last{"reshape", "permute", "slice", - "select"}; - return copy_optimize_contiguous_by_given_cases(self, src, - opt_cases_last); - } - return false; - } - // Construct the first tensor and judge whether it can be optimized. - if (reconstruct_tensor(src, shape_stride_stacks, offset_stacks)) { - ContiguousTensorDesc src_desc_ = TransContiguous::GetTensorDescInfo(src); - OptimizationCases opt_cases_first{"reshape", "slice", "select"}; - if (reshape_without_copy_match(src)) { - // case 1 : The first tensor is reshape-type, refresh its info is enough - return combined_to_contiguous(self, src, shape_stride_stacks, - offset_stacks); - } else if (can_be_optimize_from_default_cases(src_desc_)) { - // case 2: The first tensor is discontiguous-type, - // conduct the standard optimization procedure. - auto transfer_tensor = OpPreparation::ApplyTensorWithFormat( - src.sizes(), src.options(), - torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); - return (copy_optimize_contiguous_by_given_cases(transfer_tensor, src, - opt_cases_first) && - combined_to_contiguous(self, transfer_tensor, - shape_stride_stacks, offset_stacks)); - } - // case3 : The first tensor is contiguous or cannot be identified==>exit - return false; - } - // If the first tensor cannnot be reconstructed==>exit - return false; - } -}; // class combinedContiguousOpt - -REGISTER_COPY_OPT(combined, CombinedContiguousOpt) - -} // namespace native + + // Conduct trans-contiguous under strict constrains + bool combined_to_contiguous(at::Tensor &self, at::Tensor &src, + ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stacks) { + // Base case: the last tensor to be processed. 
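// Editor's note on the recursion (descriptive only, not part of this change):
// each call pops exactly one (sizes, strides, offset) record; with two records
// on the stack, the inferred intermediate view is first materialized into a
// transfer tensor by the standard single-pattern optimizers, after which the
// recursive call applies the original combined view on top of it and lands in
// this base case.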
+ if (shape_stride_stacks.size() == 1) { + if (reconstruct_tensor(src, shape_stride_stacks, offset_stacks)) { + OptimizationCases opt_cases_last{"reshape", "permute", "slice", + "select"}; + return copy_optimize_contiguous_by_given_cases(self, src, + opt_cases_last); + } + return false; + } + // Construct the first tensor and judge whether it can be optimized. + if (reconstruct_tensor(src, shape_stride_stacks, offset_stacks)) { + ContiguousTensorDesc src_desc_ = TransContiguous::GetTensorDescInfo(src); + OptimizationCases opt_cases_first{"reshape", "slice", "select"}; + if (reshape_without_copy_match(src)) { + // case 1 : The first tensor is reshape-type, refresh its info is enough + return combined_to_contiguous(self, src, shape_stride_stacks, + offset_stacks); + } else if (can_be_optimize_from_default_cases(src_desc_)) { + // case 2: The first tensor is discontiguous-type, + // conduct the standard optimization procedure. + auto transfer_tensor = OpPreparation::ApplyTensorWithFormat( + src.sizes(), src.options(), + torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); + return (copy_optimize_contiguous_by_given_cases(transfer_tensor, src, + opt_cases_first) && + combined_to_contiguous(self, transfer_tensor, + shape_stride_stacks, offset_stacks)); + } + // case3 : The first tensor is contiguous or cannot be identified==>exit + return false; + } + // If the first tensor cannnot be reconstructed==>exit + return false; + } + }; // class combinedContiguousOpt + + REGISTER_COPY_OPT(combined, CombinedContiguousOpt) + + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/framework/contiguous/permute_opt.cpp b/torch_npu/csrc/framework/contiguous/permute_opt.cpp index 54596531099..203a8a918df 100644 --- a/torch_npu/csrc/framework/contiguous/permute_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/permute_opt.cpp @@ -5,208 +5,208 @@ #include "op_plugin/OpInterface.h" namespace at_npu { -namespace native { - -class PermuteContiguousOpt : public ContiguousOpt { -public: - bool Optimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override { - // pattern permute - c10::SmallVector perm; - c10::SmallVector sizes; - if (can_use_permute(src_desc, perm, sizes)) { - RECORD_FUNCTION("contiguous_d_Transpose", std::vector({src})); - permute_to_contiguous(self, src, perm, sizes); - return true; - } - return false; - } - - bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { - c10::SmallVector perm; - c10::SmallVector sizes; - return can_use_permute(src_desc, perm, sizes); - } - - bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override - { - if (src_desc.cached_contiguous) { - RECORD_FUNCTION("cached_contiguous_d_Transpose", std::vector({src})); - CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; - c10::SmallVector sizes = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); - c10::SmallVector perm = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); - permute_to_contiguous(self, src, perm, sizes); - return true; - } - - // pattern permute - c10::SmallVector perm; - c10::SmallVector sizes; - if (can_use_permute(src_desc, perm, sizes)) { - RECORD_FUNCTION("contiguous_d_Transpose", std::vector({src})); - CachedContiguousOpt cached_opt = CachedContiguousOpt{ - "permute" - }; - cached_opt.cached_opt_parameters.emplace_back(perm); - cached_opt.cached_opt_parameters.emplace_back(sizes); - 
cached_opt.contiguous_tensor_desc = src_desc; - TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; - permute_to_contiguous(self, src, perm, sizes); - return true; - } - return false; - } - -private: - - void permute_to_contiguous(at::Tensor &self, const at::Tensor &src, - const c10::SmallVector &perm, - const c10::SmallVector &sizes) - { - // Refresh src Tensor to match output self Tensor - auto src_desc_stored = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); - auto &src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - src_desc.base_sizes_ = sizes; - src_desc.base_strides_ = StorageDescHelper::ComputeStrideFromShape(static_cast(sizes)); - src_desc.storage_sizes_ = sizes; - op_plugin::npu_transpose_out(src, perm, false, self); - src_desc = src_desc_stored; - } - - bool can_use_permute(const ContiguousTensorDesc &src_desc, - c10::SmallVector &perm, - c10::SmallVector &sizes) { - const auto &base_sizes = src_desc.base_sizes_; - const auto &base_strides = src_desc.base_strides_; - auto view_sizes = src_desc.sizes_; - auto view_strides = src_desc.strides_; - - c10::SmallVector indexes; - for (const auto i : c10::irange(src_desc.sizes_.size())) { - indexes.emplace_back(i); - } - - // After permute or reshape+permute, the total amount of data remains - // unchanged. - if (c10::multiply_integers(view_sizes) != c10::multiply_integers(base_sizes)) { - return false; - } - - // Reorder axes of shape and stride in descending order - for (const auto i : c10::irange(src_desc.sizes_.size() - 1)) { - for (const auto j : c10::irange(i + 1, src_desc.sizes_.size())) { - bool need_swap = (view_strides[i] < view_strides[j]) || - (view_strides[i] == view_strides[j] && - view_sizes[i] < view_sizes[j]); - if (need_swap) { - std::swap(view_strides[i], view_strides[j]); - std::swap(view_sizes[i], view_sizes[j]); - std::swap(indexes[i], indexes[j]); - } - } - } - - // After reordering, check whether the shape and stride match - auto current_stride = 1; - int64_t src_desc_sizes = static_cast(src_desc.sizes_.size()); - for (int64_t i = src_desc_sizes - 1; i >= 0; i--) { - if (current_stride != view_strides[i]) { - ASCEND_LOGD("After reordering, shape and stride still do not match, and " - "permute pattern cannot be used."); - return false; - } - current_stride *= view_sizes[i]; - } - if ((base_sizes.size() - view_sizes.size()) != - (base_strides.size() - view_strides.size())) { - ASCEND_LOGD("Reordered shape and base shape do not match, and permute " - "pattern cannot be used."); - return false; - } - - // Calculate perm and sizes for permute - for (const auto ele : view_sizes) { - sizes.emplace_back(ele); - } - perm = indexes; - for (const auto i : c10::irange(src_desc.sizes_.size())) { - perm[indexes[i]] = i; - } - return true; - } - - void optimize_permute(c10::SmallVector &perm, - c10::SmallVector &sizes) { - c10::SmallVector optimized_perm; - c10::SmallVector optimized_sizes; - if (perm.size() != sizes.size()) { - ASCEND_LOGD("Param perm and sizes do not match."); - return; - } - - // Gather index - int64_t perm_size = static_cast(perm.size()); - for (int64_t i = 0; i < perm_size; i++) { - auto temp_perm_i = perm[i]; - auto temp_sizes_i = sizes[perm[i]]; - for (const auto j : c10::irange(i + 1, perm_size)) { - if (perm[i] + 1 == perm[j]) { - temp_sizes_i *= sizes[perm[j]]; - ++i; - continue; - } - break; - } - if (temp_sizes_i == 1) { - // Optimize permute calculation for better performance, by squeezing - // permute param. 
- continue; - } - optimized_perm.emplace_back(temp_perm_i); - optimized_sizes.emplace_back(temp_sizes_i); - } - if (optimized_perm.size() == perm.size()) { - ASCEND_LOGD("No adjacent axes, cannot be optimized."); - return; - } - - // Calculate new perm and shape - c10::SmallVector perm_indexes; - for (const auto i : c10::irange(optimized_perm.size())) { - perm_indexes.emplace_back(i); - } - for (const auto i : c10::irange(optimized_perm.size() - 1)) { - for (const auto j : c10::irange(i + 1, optimized_perm.size())) { - if (optimized_perm[i] > optimized_perm[j]) { - std::swap(optimized_perm[i], optimized_perm[j]); - std::swap(perm_indexes[i], perm_indexes[j]); - } - } - } - perm = perm_indexes; - for (const auto i : c10::irange(perm_indexes.size())) { - perm[perm_indexes[i]] = i; - } - sizes = optimized_sizes; - for (const auto i : c10::irange(perm_indexes.size())) { - sizes[i] = optimized_sizes[perm_indexes[i]]; - } - } - - template void squeeze_shape_and_stride(T &shape, T &stride) { - int64_t shape_size = static_cast(shape.size()); - for (int64_t i = 0; i < shape_size; i++) { - if (shape[i] == 1) { - shape.erase(shape.begin() + i); - stride.erase(stride.begin() + i); - --i; - } - } - } -}; // class PermuteContiguousOpt - -REGISTER_COPY_OPT(permute, PermuteContiguousOpt) - -} // namespace native + namespace native { + + class PermuteContiguousOpt : public ContiguousOpt { + public: + bool Optimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override { + // pattern permute + c10::SmallVector perm; + c10::SmallVector sizes; + if (can_use_permute(src_desc, perm, sizes)) { + RECORD_FUNCTION("contiguous_d_Transpose", std::vector({src})); + permute_to_contiguous(self, src, perm, sizes); + return true; + } + return false; + } + + bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { + c10::SmallVector perm; + c10::SmallVector sizes; + return can_use_permute(src_desc, perm, sizes); + } + + bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override + { + if (src_desc.cached_contiguous) { + RECORD_FUNCTION("cached_contiguous_d_Transpose", std::vector({src})); + CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; + c10::SmallVector sizes = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); + c10::SmallVector perm = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); + permute_to_contiguous(self, src, perm, sizes); + return true; + } + + // pattern permute + c10::SmallVector perm; + c10::SmallVector sizes; + if (can_use_permute(src_desc, perm, sizes)) { + RECORD_FUNCTION("contiguous_d_Transpose", std::vector({src})); + CachedContiguousOpt cached_opt = CachedContiguousOpt{ + "permute" + }; + cached_opt.cached_opt_parameters.emplace_back(perm); + cached_opt.cached_opt_parameters.emplace_back(sizes); + cached_opt.contiguous_tensor_desc = src_desc; + TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; + permute_to_contiguous(self, src, perm, sizes); + return true; + } + return false; + } + + private: + + void permute_to_contiguous(at::Tensor &self, const at::Tensor &src, + const c10::SmallVector &perm, + const c10::SmallVector &sizes) + { + // Refresh src Tensor to match output self Tensor + auto src_desc_stored = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); + auto &src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; + src_desc.base_sizes_ = sizes; + src_desc.base_strides_ = 
StorageDescHelper::ComputeStrideFromShape(static_cast(sizes)); + src_desc.storage_sizes_ = sizes; + op_plugin::npu_transpose_out(src, perm, false, self); + src_desc = src_desc_stored; + } + + bool can_use_permute(const ContiguousTensorDesc &src_desc, + c10::SmallVector &perm, + c10::SmallVector &sizes) { + const auto &base_sizes = src_desc.base_sizes_; + const auto &base_strides = src_desc.base_strides_; + auto view_sizes = src_desc.sizes_; + auto view_strides = src_desc.strides_; + + c10::SmallVector indexes; + for (const auto i : c10::irange(src_desc.sizes_.size())) { + indexes.emplace_back(i); + } + + // After permute or reshape+permute, the total amount of data remains + // unchanged. + if (c10::multiply_integers(view_sizes) != c10::multiply_integers(base_sizes)) { + return false; + } + + // Reorder axes of shape and stride in descending order + for (const auto i : c10::irange(src_desc.sizes_.size() - 1)) { + for (const auto j : c10::irange(i + 1, src_desc.sizes_.size())) { + bool need_swap = (view_strides[i] < view_strides[j]) || + (view_strides[i] == view_strides[j] && + view_sizes[i] < view_sizes[j]); + if (need_swap) { + std::swap(view_strides[i], view_strides[j]); + std::swap(view_sizes[i], view_sizes[j]); + std::swap(indexes[i], indexes[j]); + } + } + } + + // After reordering, check whether the shape and stride match + auto current_stride = 1; + int64_t src_desc_sizes = static_cast(src_desc.sizes_.size()); + for (int64_t i = src_desc_sizes - 1; i >= 0; i--) { + if (current_stride != view_strides[i]) { + ASCEND_LOGD("After reordering, shape and stride still do not match, and " + "permute pattern cannot be used."); + return false; + } + current_stride *= view_sizes[i]; + } + if ((base_sizes.size() - view_sizes.size()) != + (base_strides.size() - view_strides.size())) { + ASCEND_LOGD("Reordered shape and base shape do not match, and permute " + "pattern cannot be used."); + return false; + } + + // Calculate perm and sizes for permute + for (const auto ele : view_sizes) { + sizes.emplace_back(ele); + } + perm = indexes; + for (const auto i : c10::irange(src_desc.sizes_.size())) { + perm[indexes[i]] = i; + } + return true; + } + + void optimize_permute(c10::SmallVector &perm, + c10::SmallVector &sizes) { + c10::SmallVector optimized_perm; + c10::SmallVector optimized_sizes; + if (perm.size() != sizes.size()) { + ASCEND_LOGD("Param perm and sizes do not match."); + return; + } + + // Gather index + int64_t perm_size = static_cast(perm.size()); + for (int64_t i = 0; i < perm_size; i++) { + auto temp_perm_i = perm[i]; + auto temp_sizes_i = sizes[perm[i]]; + for (const auto j : c10::irange(i + 1, perm_size)) { + if (perm[i] + 1 == perm[j]) { + temp_sizes_i *= sizes[perm[j]]; + ++i; + continue; + } + break; + } + if (temp_sizes_i == 1) { + // Optimize permute calculation for better performance, by squeezing + // permute param. 
+ continue; + } + optimized_perm.emplace_back(temp_perm_i); + optimized_sizes.emplace_back(temp_sizes_i); + } + if (optimized_perm.size() == perm.size()) { + ASCEND_LOGD("No adjacent axes, cannot be optimized."); + return; + } + + // Calculate new perm and shape + c10::SmallVector perm_indexes; + for (const auto i : c10::irange(optimized_perm.size())) { + perm_indexes.emplace_back(i); + } + for (const auto i : c10::irange(optimized_perm.size() - 1)) { + for (const auto j : c10::irange(i + 1, optimized_perm.size())) { + if (optimized_perm[i] > optimized_perm[j]) { + std::swap(optimized_perm[i], optimized_perm[j]); + std::swap(perm_indexes[i], perm_indexes[j]); + } + } + } + perm = perm_indexes; + for (const auto i : c10::irange(perm_indexes.size())) { + perm[perm_indexes[i]] = i; + } + sizes = optimized_sizes; + for (const auto i : c10::irange(perm_indexes.size())) { + sizes[i] = optimized_sizes[perm_indexes[i]]; + } + } + + template void squeeze_shape_and_stride(T &shape, T &stride) { + int64_t shape_size = static_cast(shape.size()); + for (int64_t i = 0; i < shape_size; i++) { + if (shape[i] == 1) { + shape.erase(shape.begin() + i); + stride.erase(stride.begin() + i); + --i; + } + } + } + }; // class PermuteContiguousOpt + + REGISTER_COPY_OPT(permute, PermuteContiguousOpt) + + } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/contiguous/select_opt.cpp b/torch_npu/csrc/framework/contiguous/select_opt.cpp index b7fe0b86a19..9e11dc1fb97 100644 --- a/torch_npu/csrc/framework/contiguous/select_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/select_opt.cpp @@ -2,127 +2,127 @@ #include "torch_npu/csrc/aten/CustomFunctions.h" namespace at_npu { -namespace native { + namespace native { -class SelectContiguousOpt : public ContiguousOpt { -public: - bool Optimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override { - // select(dim, start), length[dim] == 1 - c10::SmallVector start; - c10::SmallVector length; + class SelectContiguousOpt : public ContiguousOpt { + public: + bool Optimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override { + // select(dim, start), length[dim] == 1 + c10::SmallVector start; + c10::SmallVector length; - if (can_use_select(src_desc, start, length)) { - RECORD_FUNCTION("contiguous_d_StridedSlice", - std::vector({src})); - select_to_contiguous(self, src, start, length, src_desc); - return true; - } - return false; - } + if (can_use_select(src_desc, start, length)) { + RECORD_FUNCTION("contiguous_d_StridedSlice", + std::vector({src})); + select_to_contiguous(self, src, start, length, src_desc); + return true; + } + return false; + } - bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { - c10::SmallVector start; - c10::SmallVector length; - return can_use_select(src_desc, start, length); - } + bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { + c10::SmallVector start; + c10::SmallVector length; + return can_use_select(src_desc, start, length); + } -private: - bool can_use_select(const ContiguousTensorDesc &src_desc, - c10::SmallVector &start, - c10::SmallVector &length) { - // base info and src info - const auto &base_size = src_desc.base_sizes_; - const auto &base_stride = src_desc.base_strides_; - const auto &select_size = src_desc.sizes_; - const auto &select_stride = src_desc.strides_; + private: + bool can_use_select(const ContiguousTensorDesc &src_desc, + c10::SmallVector &start, + c10::SmallVector 
&length) { + // base info and src info + const auto &base_size = src_desc.base_sizes_; + const auto &base_stride = src_desc.base_strides_; + const auto &select_size = src_desc.sizes_; + const auto &select_stride = src_desc.strides_; - // len(base_size) - len(select_size) == 1 && len(base_stride) - - // len(select_stride) == 1 - if ((base_size.size() - select_size.size() != 1) || - (base_stride.size() - select_stride.size() != 1)) { - return false; - } + // len(base_size) - len(select_size) == 1 && len(base_stride) - + // len(select_stride) == 1 + if ((base_size.size() - select_size.size() != 1) || + (base_stride.size() - select_stride.size() != 1)) { + return false; + } - // recover src tensor info: shape and stride - c10::SmallVector temp_size; - c10::SmallVector temp_stride; - for (size_t i = 0U; i <= select_size.size(); i++) { - if (base_size[i] != select_size[i] || - base_stride[i] != select_stride[i]) { - temp_size.emplace_back(base_size[i]); - temp_stride.emplace_back(base_stride[i]); - for (const auto j : c10::irange(i + 1, select_size.size() + 1)) { - temp_size.emplace_back(select_size[j - 1]); - temp_stride.emplace_back(select_stride[j - 1]); - i = j + 1; - } - } else { - temp_size.emplace_back(select_size[i]); - temp_stride.emplace_back(select_stride[i]); - } - } + // recover src tensor info: shape and stride + c10::SmallVector temp_size; + c10::SmallVector temp_stride; + for (size_t i = 0U; i <= select_size.size(); i++) { + if (base_size[i] != select_size[i] || + base_stride[i] != select_stride[i]) { + temp_size.emplace_back(base_size[i]); + temp_stride.emplace_back(base_stride[i]); + for (const auto j : c10::irange(i + 1, select_size.size() + 1)) { + temp_size.emplace_back(select_size[j - 1]); + temp_stride.emplace_back(select_stride[j - 1]); + i = j + 1; + } + } else { + temp_size.emplace_back(select_size[i]); + temp_stride.emplace_back(select_stride[i]); + } + } - for (const auto i : c10::irange(select_size.size() + 1)) { - if (base_size[i] == temp_size[i] && base_stride[i] == temp_stride[i]) { - continue; - } else { - return false; - } - } + for (const auto i : c10::irange(select_size.size() + 1)) { + if (base_size[i] == temp_size[i] && base_stride[i] == temp_stride[i]) { + continue; + } else { + return false; + } + } - // Collect the select infos for SliceD: dim, start, length - // confirm the selected dim - int64_t dim = static_cast(base_size.size()) - 1; - for (const auto i : c10::irange(select_size.size())) { - if (base_size[i] != select_size[i] || - base_stride[i] != select_stride[i]) { - dim = i; - break; - } - } + // Collect the select infos for SliceD: dim, start, length + // confirm the selected dim + int64_t dim = static_cast(base_size.size()) - 1; + for (const auto i : c10::irange(select_size.size())) { + if (base_size[i] != select_size[i] || + base_stride[i] != select_stride[i]) { + dim = i; + break; + } + } - // Obtain start index and select length - int64_t int_index = src_desc.offset_ / base_stride[dim]; - for (const auto i : c10::irange(base_size.size())) { - if (i == dim) { - start.emplace_back(int_index); - length.emplace_back(1); - } else { - start.emplace_back(0); - length.emplace_back(base_size[i]); - } - } - return true; - } + // Obtain start index and select length + int64_t int_index = src_desc.offset_ / base_stride[dim]; + for (const auto i : c10::irange(base_size.size())) { + if (i == dim) { + start.emplace_back(int_index); + length.emplace_back(1); + } else { + start.emplace_back(0); + length.emplace_back(base_size[i]); + } + } + return true; + } - 
void select_to_contiguous(at::Tensor &self, const at::Tensor &src, - c10::SmallVector &start, - c10::SmallVector &length, - const ContiguousTensorDesc &src_desc) { - const auto &base_size = src_desc.base_sizes_; - // Recover base tensor(necessary) a = b.select(1, 1) - at::Tensor temp_src = TransContiguous::view_tensor(src, src_desc.base_offset_, base_size, src_desc.base_strides_); + void select_to_contiguous(at::Tensor &self, const at::Tensor &src, + c10::SmallVector &start, + c10::SmallVector &length, + const ContiguousTensorDesc &src_desc) { + const auto &base_size = src_desc.base_sizes_; + // Recover base tensor(necessary) a = b.select(1, 1) + at::Tensor temp_src = TransContiguous::view_tensor(src, src_desc.base_offset_, base_size, src_desc.base_strides_); - // construct StridedSlice param - int64_t axis_size = static_cast(start.size()); - c10::SmallVector strides(axis_size, 1); - c10::SmallVector end; - int64_t shrink_mask = 0; - for (int64_t i = 0; i < axis_size; ++i) { - end.emplace_back(start[i] + length[i]); - if (length[i] == 1 && temp_src.size(i) != 1) { - shrink_mask += std::pow(2, i); - } - } + // construct StridedSlice param + int64_t axis_size = static_cast(start.size()); + c10::SmallVector strides(axis_size, 1); + c10::SmallVector end; + int64_t shrink_mask = 0; + for (int64_t i = 0; i < axis_size; ++i) { + end.emplace_back(start[i] + length[i]); + if (length[i] == 1 && temp_src.size(i) != 1) { + shrink_mask += std::pow(2, i); + } + } - // call StridedSlice op to contiguous - custom_ops::npu_indexing_out(temp_src, start, end, strides, 0, 0, 0, 0, shrink_mask, self); - return; - } -}; // class SelectContiguousOpt + // call StridedSlice op to contiguous + custom_ops::npu_indexing_out(temp_src, start, end, strides, 0, 0, 0, 0, shrink_mask, self); + return; + } + }; // class SelectContiguousOpt -REGISTER_COPY_OPT(select, SelectContiguousOpt) + REGISTER_COPY_OPT(select, SelectContiguousOpt) -} // namespace native + } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/contiguous/slice_opt.cpp b/torch_npu/csrc/framework/contiguous/slice_opt.cpp index 0eb1f596af2..2753fab7d50 100644 --- a/torch_npu/csrc/framework/contiguous/slice_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/slice_opt.cpp @@ -2,148 +2,148 @@ #include "torch_npu/csrc/aten/CustomFunctions.h" namespace at_npu { -namespace native { + namespace native { -class SliceContiguousOpt : public ContiguousOpt { -public: - bool Optimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override { - // Pattern slice. - // Current pattern does not directly depend on other patterns. - // The relative sequence of this pattern and other patterns is not - // important. - c10::SmallVector offsets; - c10::SmallVector size; - if (can_use_slice(src_desc, offsets, size)) { - RECORD_FUNCTION("contiguous_d_Slice", std::vector({src})); - slice_to_contiguous(self, src, offsets, size, src_desc); - return true; - } - return false; - } + class SliceContiguousOpt : public ContiguousOpt { + public: + bool Optimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override { + // Pattern slice. + // Current pattern does not directly depend on other patterns. + // The relative sequence of this pattern and other patterns is not + // important. 
+ c10::SmallVector offsets; + c10::SmallVector size; + if (can_use_slice(src_desc, offsets, size)) { + RECORD_FUNCTION("contiguous_d_Slice", std::vector({src})); + slice_to_contiguous(self, src, offsets, size, src_desc); + return true; + } + return false; + } - bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { - c10::SmallVector offsets; - c10::SmallVector size; - return can_use_slice(src_desc, offsets, size); - } + bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { + c10::SmallVector offsets; + c10::SmallVector size; + return can_use_slice(src_desc, offsets, size); + } - bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override - { - if (src_desc.cached_contiguous) { - RECORD_FUNCTION("cached_contiguous_d_Slice", std::vector({src})); - CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; - c10::SmallVector size = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); - c10::SmallVector offsets = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); - slice_to_contiguous(self, src, offsets, size, src_desc); - return true; - } - c10::SmallVector offsets; - c10::SmallVector size; - if (can_use_slice(src_desc, offsets, size)) { - RECORD_FUNCTION("contiguous_d_Slice", std::vector({src})); - CachedContiguousOpt cached_opt = CachedContiguousOpt{ - "slice" - }; - cached_opt.cached_opt_parameters.emplace_back(offsets); - cached_opt.cached_opt_parameters.emplace_back(size); - cached_opt.contiguous_tensor_desc = src_desc; - TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; - slice_to_contiguous(self, src, offsets, size, src_desc); - return true; - } - return false; - } + bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override + { + if (src_desc.cached_contiguous) { + RECORD_FUNCTION("cached_contiguous_d_Slice", std::vector({src})); + CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; + c10::SmallVector size = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); + c10::SmallVector offsets = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); + slice_to_contiguous(self, src, offsets, size, src_desc); + return true; + } + c10::SmallVector offsets; + c10::SmallVector size; + if (can_use_slice(src_desc, offsets, size)) { + RECORD_FUNCTION("contiguous_d_Slice", std::vector({src})); + CachedContiguousOpt cached_opt = CachedContiguousOpt{ + "slice" + }; + cached_opt.cached_opt_parameters.emplace_back(offsets); + cached_opt.cached_opt_parameters.emplace_back(size); + cached_opt.contiguous_tensor_desc = src_desc; + TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; + slice_to_contiguous(self, src, offsets, size, src_desc); + return true; + } + return false; + } -private: - // npu-slice pattern cover several view ops, including chunk, split, narrow - // and part of index. Judgment logic is based on the implement of view ops in - // adapter layer. - bool can_use_slice(const ContiguousTensorDesc &src_desc, - c10::SmallVector &offsets, - c10::SmallVector &size) { - const auto &base_sizes = src_desc.base_sizes_; - const auto &base_strides = src_desc.base_strides_; - auto view_sizes = src_desc.sizes_; - auto view_strides = src_desc.strides_; + private: + // npu-slice pattern cover several view ops, including chunk, split, narrow + // and part of index. 
Judgment logic is based on the implement of view ops in + // adapter layer. + bool can_use_slice(const ContiguousTensorDesc &src_desc, + c10::SmallVector &offsets, + c10::SmallVector &size) { + const auto &base_sizes = src_desc.base_sizes_; + const auto &base_strides = src_desc.base_strides_; + auto view_sizes = src_desc.sizes_; + auto view_strides = src_desc.strides_; - // narrow+select(select at last dim) ==> single narrow - // 限制条件:1. 最后一轴stride非1==>最后一轴select;2. - // 基础格式;3.非最后一轴发生narrow(元素减少) - // 最小化影响:仅限最后一轴的select,即tensor.select(-1, 1) == - // tensor[**,1:2],select过渡到narrow - if (view_strides[view_strides.size() - 1] != 1 && - FormatHelper::IsBaseFormatType(src_desc.npu_format_) && - view_strides.size() < base_strides.size() && - c10::multiply_integers(view_sizes) < - c10::multiply_integers(base_sizes) / base_sizes[base_sizes.size() - 1]) { - view_sizes.emplace_back(1); - view_strides.emplace_back(1); - } + // narrow+select(select at last dim) ==> single narrow + // 限制条件:1. 最后一轴stride非1==>最后一轴select;2. + // 基础格式;3.非最后一轴发生narrow(元素减少) + // 最小化影响:仅限最后一轴的select,即tensor.select(-1, 1) == + // tensor[**,1:2],select过渡到narrow + if (view_strides[view_strides.size() - 1] != 1 && + FormatHelper::IsBaseFormatType(src_desc.npu_format_) && + view_strides.size() < base_strides.size() && + c10::multiply_integers(view_sizes) < + c10::multiply_integers(base_sizes) / base_sizes[base_sizes.size() - 1]) { + view_sizes.emplace_back(1); + view_strides.emplace_back(1); + } - // Strides must be the same. - if (view_strides != base_strides) { - return false; - } + // Strides must be the same. + if (view_strides != base_strides) { + return false; + } - // Only narrow dims are different. - c10::SmallVector narrow_dims; - if (view_sizes.size() != base_sizes.size()) { - return false; - } - for (const auto i : c10::irange(view_sizes.size())) { - if (view_sizes[i] == base_sizes[i]) { - narrow_dims.emplace_back(0); - } else if (view_sizes[i] < base_sizes[i]) { - narrow_dims.emplace_back(1); - } else { - return false; - } - } + // Only narrow dims are different. + c10::SmallVector narrow_dims; + if (view_sizes.size() != base_sizes.size()) { + return false; + } + for (const auto i : c10::irange(view_sizes.size())) { + if (view_sizes[i] == base_sizes[i]) { + narrow_dims.emplace_back(0); + } else if (view_sizes[i] < base_sizes[i]) { + narrow_dims.emplace_back(1); + } else { + return false; + } + } - // Calculate npu slice param. - size = view_sizes; - offsets.clear(); - int64_t storage_offsets = src_desc.offset_; - // src.storage_offset() == start[narrow_dims[i]]*stride[narrow_dims[i]] - for (const auto i : c10::irange(view_strides.size())) { - offsets.emplace_back(storage_offsets / view_strides[i]); - storage_offsets = storage_offsets % view_strides[i]; - } - if (storage_offsets != 0) { - return false; - } - for (const auto i : c10::irange(offsets.size())) { - if ((offsets[i] + view_sizes[i]) > base_sizes[i]) { - // In narrow calculation, (start + length) <= cur_size - return false; - } - if (offsets[i] != 0 && narrow_dims[i] == 0) { - // narrow_dims[i] == 0 means dim i is not involved in narrow - // calculation. offsets[i] != 0 means dim i has the start of narrow - // calculation. Two conditions are contradictory. - return false; - } - } - return true; - } + // Calculate npu slice param. 
+ size = view_sizes; + offsets.clear(); + int64_t storage_offsets = src_desc.offset_; + // src.storage_offset() == start[narrow_dims[i]]*stride[narrow_dims[i]] + for (const auto i : c10::irange(view_strides.size())) { + offsets.emplace_back(storage_offsets / view_strides[i]); + storage_offsets = storage_offsets % view_strides[i]; + } + if (storage_offsets != 0) { + return false; + } + for (const auto i : c10::irange(offsets.size())) { + if ((offsets[i] + view_sizes[i]) > base_sizes[i]) { + // In narrow calculation, (start + length) <= cur_size + return false; + } + if (offsets[i] != 0 && narrow_dims[i] == 0) { + // narrow_dims[i] == 0 means dim i is not involved in narrow + // calculation. offsets[i] != 0 means dim i has the start of narrow + // calculation. Two conditions are contradictory. + return false; + } + } + return true; + } - void slice_to_contiguous(at::Tensor &self, const at::Tensor &src, - const c10::SmallVector &offsets, - const c10::SmallVector &size, - const ContiguousTensorDesc &src_desc) { - // create contiguous tensor for npu slice - const auto &temp_tensor_size = src_desc.base_sizes_; - at::Tensor temp_src = TransContiguous::view_tensor(src, src_desc.base_offset_, temp_tensor_size, src_desc.base_strides_); + void slice_to_contiguous(at::Tensor &self, const at::Tensor &src, + const c10::SmallVector &offsets, + const c10::SmallVector &size, + const ContiguousTensorDesc &src_desc) { + // create contiguous tensor for npu slice + const auto &temp_tensor_size = src_desc.base_sizes_; + at::Tensor temp_src = TransContiguous::view_tensor(src, src_desc.base_offset_, temp_tensor_size, src_desc.base_strides_); - custom_ops::npu_slice_out(temp_src, offsets, size, self); - return; - } -}; // class SliceContiguousOpt + custom_ops::npu_slice_out(temp_src, offsets, size, self); + return; + } + }; // class SliceContiguousOpt -REGISTER_COPY_OPT(slice, SliceContiguousOpt) + REGISTER_COPY_OPT(slice, SliceContiguousOpt) -} // namespace native + } // namespace native } // namespace at_npu \ No newline at end of file -- Gitee
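
For reference, the permute pattern in permute_opt.cpp above recovers the permutation of a transposed view by sorting axes in descending stride order and then inverting that order. The standalone sketch below mirrors only that stride-sorting step outside of torch_npu; the helper name recover_perm and the use of std::sort in place of the pattern's nested swap loops are illustrative assumptions, not part of the patched code.

    // Sketch (assumption-laden, not torch_npu code): recover the permutation
    // that was applied to a contiguous base tensor, i.e. view == base.permute(perm),
    // mirroring the stride-sorting step of can_use_permute.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <numeric>
    #include <vector>

    std::vector<int64_t> recover_perm(const std::vector<int64_t> &sizes,
                                      const std::vector<int64_t> &strides) {
        std::vector<int64_t> indexes(sizes.size());
        std::iota(indexes.begin(), indexes.end(), 0);
        // Order axes by descending stride (ties broken by descending size),
        // as the pattern does with its pairwise swaps.
        std::sort(indexes.begin(), indexes.end(), [&](int64_t a, int64_t b) {
            return strides[a] != strides[b] ? strides[a] > strides[b]
                                            : sizes[a] > sizes[b];
        });
        // Invert the ordering to obtain the permutation applied to the base.
        std::vector<int64_t> perm(sizes.size());
        for (size_t i = 0; i < indexes.size(); ++i) {
            perm[indexes[i]] = static_cast<int64_t>(i);
        }
        return perm;
    }

    int main() {
        // A contiguous (2, 3, 4) tensor permuted with (2, 0, 1) yields a
        // (4, 2, 3) view with strides (1, 12, 4); the sketch recovers (2, 0, 1).
        std::vector<int64_t> sizes = {4, 2, 3};
        std::vector<int64_t> strides = {1, 12, 4};
        for (int64_t p : recover_perm(sizes, strides)) {
            std::cout << p << ' ';   // expected output: 2 0 1
        }
        std::cout << '\n';
        return 0;
    }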
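
Similarly, the slice pattern in slice_opt.cpp derives per-dimension start offsets by repeatedly dividing the storage offset by each view stride, as in its "Calculate npu slice param" step. The sketch below reproduces just that decomposition; the helper name decompose_offset is an illustrative assumption, and this is a simplified model rather than the torch_npu implementation.

    // Sketch (assumption-laden, not torch_npu code): split a storage offset
    // into per-dimension narrow start indices, mirroring can_use_slice.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Returns one start offset per dimension, or an empty vector if the
    // storage offset cannot be written as a sum of start[i] * stride[i].
    std::vector<int64_t> decompose_offset(int64_t storage_offset,
                                          const std::vector<int64_t> &strides) {
        std::vector<int64_t> offsets;
        for (int64_t stride : strides) {
            offsets.push_back(storage_offset / stride);  // start index on this dim
            storage_offset %= stride;                    // remainder for later dims
        }
        return storage_offset == 0 ? offsets : std::vector<int64_t>{};
    }

    int main() {
        // Base tensor (4, 6) with strides (6, 1); the view t[1:3, 2:5] starts
        // at storage offset 1 * 6 + 2 = 8, so the recovered starts are (1, 2).
        for (int64_t o : decompose_offset(8, {6, 1})) {
            std::cout << o << ' ';   // expected output: 1 2
        }
        std::cout << '\n';
        return 0;
    }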