From a6096c278dc375215a747ec437a7885bf466a5ee Mon Sep 17 00:00:00 2001 From: feihujiang Date: Wed, 27 Dec 2023 15:07:11 +0800 Subject: [PATCH] Maintain consistent style --- .../framework/contiguous/ContiguousOpt.cpp | 414 ++++---- .../framework/contiguous/combined_opt.cpp | 908 +++++++++--------- .../csrc/framework/contiguous/permute_opt.cpp | 408 ++++---- .../csrc/framework/contiguous/select_opt.cpp | 218 ++--- .../csrc/framework/contiguous/slice_opt.cpp | 262 ++--- 5 files changed, 1105 insertions(+), 1105 deletions(-) diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp index b6b2b788a29..bdbbc3950a1 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp +++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp @@ -3,224 +3,224 @@ #include namespace at_npu { -namespace native { - -OptimizationCases TransContiguous::optCasesDefault = {}; -OptimizationCases TransContiguous::optCasesAnyFormat = {"reshape", "slice"}; -ska::flat_hash_map TransContiguous::cached_contiguous_opt; - - -ContiguousTensorDesc TransContiguous::GetTensorDescInfo( - const at::Tensor &src, const OptimizationCases &opt_cases) { - auto src_base_info = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); - c10::SmallVector src_size_inferred; - c10::SmallVector src_stride_inferred; - c10::SmallVector src_storage_size_inferred = - src_base_info.storage_sizes_; - if (src.dim() == 0) { - src_size_inferred = {1}; - src_stride_inferred = {1}; - if (src_storage_size_inferred.size() == 0) { - src_storage_size_inferred = {1}; - } - } else { - src_size_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.sizes()); - src_stride_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.strides()); - } - ContiguousTensorDesc src_desc = { - src.is_contiguous(), src_size_inferred, - src_stride_inferred, src.storage_offset(), - src_base_info.base_sizes_, src_base_info.base_strides_, - src_storage_size_inferred, src_base_info.base_offset_, - src_base_info.npu_format_, opt_cases}; - if (src_desc.opt_cases_.empty()) { - src_desc.find_match_optimization_cases(); - } - return src_desc; -} - -bool TransContiguous::CheckClone(const at::Tensor &src, at::Tensor &self) { - // self tensor may not be temporary constructed empty tensor from src, so: - // 1. contiguous storage is needed:storage_offset and numels eq - // 2. 
full memory copy: size match between src and self - if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() && - src.sizes().equals(self.sizes()) && - self.sizes().equals(torch_npu::NPUBridge::GetNpuStorageImpl(self)->get_npu_desc().base_sizes_)) { - return true; - } - return false; -} - -bool TransContiguous::can_optimize_(ContiguousTensorDesc &tensor_desc) { - for (auto opt_case : tensor_desc.opt_cases_) { - bool res = register_opt::CopyOptRegister::GetInstance()->CanOptimize( - opt_case, tensor_desc); - if (res) { - // refresh patterns to only keep optimized pattern - tensor_desc.opt_cases_.clear(); - tensor_desc.opt_cases_.emplace_back(opt_case); - return true; - } - } - return false; -} - -bool TransContiguous::CanOptimize(ContiguousTensorDesc &tensor_desc) { - return can_optimize_(tensor_desc); -} - -bool TransContiguous::CanOptimize(const at::Tensor &tensor, - const OptimizationCases &opt_cases) { - ContiguousTensorDesc tensor_desc = GetTensorDescInfo(tensor, opt_cases); - return can_optimize_(tensor_desc); -} - -bool TransContiguous::contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) { - if (!CheckClone(src, self)) { - return false; - } - for (auto &opt_case : src_desc.opt_cases_) { - bool res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, - src, src_desc); - if (res) { - return true; + namespace native { + + OptimizationCases TransContiguous::optCasesDefault = {}; + OptimizationCases TransContiguous::optCasesAnyFormat = {"reshape", "slice"}; + ska::flat_hash_map TransContiguous::cached_contiguous_opt; + + + ContiguousTensorDesc TransContiguous::GetTensorDescInfo( + const at::Tensor &src, const OptimizationCases &opt_cases) { + auto src_base_info = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); + c10::SmallVector src_size_inferred; + c10::SmallVector src_stride_inferred; + c10::SmallVector src_storage_size_inferred = + src_base_info.storage_sizes_; + if (src.dim() == 0) { + src_size_inferred = {1}; + src_stride_inferred = {1}; + if (src_storage_size_inferred.size() == 0) { + src_storage_size_inferred = {1}; + } + } else { + src_size_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.sizes()); + src_stride_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.strides()); + } + ContiguousTensorDesc src_desc = { + src.is_contiguous(), src_size_inferred, + src_stride_inferred, src.storage_offset(), + src_base_info.base_sizes_, src_base_info.base_strides_, + src_storage_size_inferred, src_base_info.base_offset_, + src_base_info.npu_format_, opt_cases}; + if (src_desc.opt_cases_.empty()) { + src_desc.find_match_optimization_cases(); + } + return src_desc; + } + + bool TransContiguous::CheckClone(const at::Tensor &src, at::Tensor &self) { + // self tensor may not be temporary constructed empty tensor from src, so: + // 1. contiguous storage is needed:storage_offset and numels eq + // 2. 
full memory copy: size match between src and self + if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() && + src.sizes().equals(self.sizes()) && + self.sizes().equals(torch_npu::NPUBridge::GetNpuStorageImpl(self)->get_npu_desc().base_sizes_)) { + return true; + } + return false; + } + + bool TransContiguous::can_optimize_(ContiguousTensorDesc &tensor_desc) { + for (auto opt_case : tensor_desc.opt_cases_) { + bool res = register_opt::CopyOptRegister::GetInstance()->CanOptimize( + opt_case, tensor_desc); + if (res) { + // refresh patterns to only keep optimized pattern + tensor_desc.opt_cases_.clear(); + tensor_desc.opt_cases_.emplace_back(opt_case); + return true; + } + } + return false; } - } - return false; -} - - size_t GetHash_(const c10::SmallVector& small_vector_size) - { - size_t seed = 0; - for (auto i = 0; i < small_vector_size.size(); i++) { - seed ^= small_vector_size[i] + (seed << 6) + (seed >> 2); + + bool TransContiguous::CanOptimize(ContiguousTensorDesc &tensor_desc) { + return can_optimize_(tensor_desc); } - return seed; - } - - size_t GetHash_(const ContiguousTensorDesc &src_desc) - { - size_t hash_src_desc = (GetHash_(src_desc.sizes_)<<52) + - (GetHash_(src_desc.base_sizes_)<<40) + - (GetHash_(src_desc.strides_)<<28) + - (GetHash_(src_desc.base_strides_)<<16) + - (src_desc.offset_ << 4) + - src_desc.npu_format_; - return hash_src_desc; - } - - bool equalDesc(const ContiguousTensorDesc &src_desc, const ContiguousTensorDesc &desc_desc) - { - if (src_desc.sizes_ == desc_desc.sizes_ && - src_desc.base_sizes_ == desc_desc.base_sizes_ && - src_desc.strides_ == desc_desc.strides_ && - src_desc.base_strides_ == desc_desc.base_strides_ && - src_desc.offset_ == desc_desc.offset_ && - src_desc.npu_format_ == desc_desc.npu_format_) { - return true; + + bool TransContiguous::CanOptimize(const at::Tensor &tensor, + const OptimizationCases &opt_cases) { + ContiguousTensorDesc tensor_desc = GetTensorDescInfo(tensor, opt_cases); + return can_optimize_(tensor_desc); } - return false; - } - - bool TransContiguous::cached_contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) - { - // No cached, try caching - if (!CheckClone(src, self)) { + + bool TransContiguous::contiguous_optimize_with_anyformat_( + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) { + if (!CheckClone(src, self)) { + return false; + } + for (auto &opt_case : src_desc.opt_cases_) { + bool res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, + src, src_desc); + if (res) { + return true; + } + } return false; } - src_desc.hash_src_desc = GetHash_(src_desc); - auto it = TransContiguous::cached_contiguous_opt.find(src_desc.hash_src_desc); - if (it != TransContiguous::cached_contiguous_opt.end()) { - // Cached - if (equalDesc(src_desc, it->second.contiguous_tensor_desc)) { - src_desc.cached_contiguous = true; - auto &opt_case = it->second.cached_opt_case; - return register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, - src, src_desc); + + size_t GetHash_(const c10::SmallVector& small_vector_size) + { + size_t seed = 0; + for (auto i = 0; i < small_vector_size.size(); i++) { + seed ^= small_vector_size[i] + (seed << 6) + (seed >> 2); } + return seed; + } + + size_t GetHash_(const ContiguousTensorDesc &src_desc) + { + size_t hash_src_desc = (GetHash_(src_desc.sizes_)<<52) + + (GetHash_(src_desc.base_sizes_)<<40) + + (GetHash_(src_desc.strides_)<<28) + + (GetHash_(src_desc.base_strides_)<<16) + + 
(src_desc.offset_ << 4) + + src_desc.npu_format_; + return hash_src_desc; + } + + bool equalDesc(const ContiguousTensorDesc &src_desc, const ContiguousTensorDesc &desc_desc) + { + if (src_desc.sizes_ == desc_desc.sizes_ && + src_desc.base_sizes_ == desc_desc.base_sizes_ && + src_desc.strides_ == desc_desc.strides_ && + src_desc.base_strides_ == desc_desc.base_strides_ && + src_desc.offset_ == desc_desc.offset_ && + src_desc.npu_format_ == desc_desc.npu_format_) { + return true; + } + return false; + } + + bool TransContiguous::cached_contiguous_optimize_with_anyformat_( + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) + { + // No cached, try caching + if (!CheckClone(src, self)) { + return false; + } + src_desc.hash_src_desc = GetHash_(src_desc); + auto it = TransContiguous::cached_contiguous_opt.find(src_desc.hash_src_desc); + if (it != TransContiguous::cached_contiguous_opt.end()) { + // Cached + if (equalDesc(src_desc, it->second.contiguous_tensor_desc)) { + src_desc.cached_contiguous = true; + auto &opt_case = it->second.cached_opt_case; + return register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, + src, src_desc); + } + return contiguous_optimize_with_anyformat_(self, src, src_desc); + } + + for (auto &opt_case : src_desc.opt_cases_) { + bool res = false; + if (TransContiguous::cached_contiguous_opt.size() >= CachedMaxSize) { + res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, src, src_desc); + } else { + src_desc.cached_contiguous = false; + res = register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, src, src_desc); + } + if (res) { + return true; + } + } + return false; + } + + bool TransContiguous::ContiguousOptimizeWithAnyFormat( + at::Tensor &self, const at::Tensor &src, + const OptimizationCases &opt_cases) { + ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); return contiguous_optimize_with_anyformat_(self, src, src_desc); } - for (auto &opt_case : src_desc.opt_cases_) { - bool res = false; - if (TransContiguous::cached_contiguous_opt.size() >= CachedMaxSize) { - res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, src, src_desc); - } else { - src_desc.cached_contiguous = false; - res = register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, src, src_desc); + c10::optional TransContiguous::ContiguousOptimizeWithAnyFormat( + const at::Tensor &src, const OptimizationCases &opt_cases) { + TORCH_CHECK(src.device().type() == c10::DeviceType::PrivateUse1, + "Expected all tensors to be on the same device. " + "Expected NPU tensor, please check whether the input tensor device is correct."); + auto self = OpPreparation::ApplyTensorWithFormat( + src.sizes(), src.options(), torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); + ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); + if (cached_contiguous_optimize_with_anyformat_(self, src, src_desc)) { + return self; } - if (res) { - return true; + return c10::nullopt; + } + + bool TransContiguous::ContiguousOptimizeWithBaseFormat( + at::Tensor &self, const at::Tensor &src, const OptimizationCases &opt_cases, + bool OpenCombined) { + TORCH_CHECK(FormatHelper::IsBaseFormatType(src), + "ContiguousOptimizeWithBaseFormat func requires Input Tensor " + "with base format!"); + // In non-specific cases, classify the cases and simplify judgement. 
+ ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); + if (OpenCombined && + c10_npu::option::OptionsManager::CheckCombinedOptimizerEnable()) { + src_desc.add_optimization_case("combined"); } + return cached_contiguous_optimize_with_anyformat_(self, src, src_desc); } - return false; - } - -bool TransContiguous::ContiguousOptimizeWithAnyFormat( - at::Tensor &self, const at::Tensor &src, - const OptimizationCases &opt_cases) { - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - return contiguous_optimize_with_anyformat_(self, src, src_desc); -} - -c10::optional TransContiguous::ContiguousOptimizeWithAnyFormat( - const at::Tensor &src, const OptimizationCases &opt_cases) { - TORCH_CHECK(src.device().type() == c10::DeviceType::PrivateUse1, - "Expected all tensors to be on the same device. " - "Expected NPU tensor, please check whether the input tensor device is correct."); - auto self = OpPreparation::ApplyTensorWithFormat( - src.sizes(), src.options(), torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - if (cached_contiguous_optimize_with_anyformat_(self, src, src_desc)) { - return self; - } - return c10::nullopt; -} - -bool TransContiguous::ContiguousOptimizeWithBaseFormat( - at::Tensor &self, const at::Tensor &src, const OptimizationCases &opt_cases, - bool OpenCombined) { - TORCH_CHECK(FormatHelper::IsBaseFormatType(src), - "ContiguousOptimizeWithBaseFormat func requires Input Tensor " - "with base format!"); - // In non-specific cases, classify the cases and simplify judgement. - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - if (OpenCombined && - c10_npu::option::OptionsManager::CheckCombinedOptimizerEnable()) { - src_desc.add_optimization_case("combined"); - } - return cached_contiguous_optimize_with_anyformat_(self, src, src_desc); -} - - - at::Tensor TransContiguous::view_tensor(const at::Tensor& self, - int64_t offset, - const c10::IntArrayRef& sizes, - const c10::IntArrayRef& strides) - { - at::Tensor self_; - if (self.is_quantized()) { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype(), - get_qtensorimpl(self)->quantizer()); - } else { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype()); + + + at::Tensor TransContiguous::view_tensor(const at::Tensor& self, + int64_t offset, + const c10::IntArrayRef& sizes, + const c10::IntArrayRef& strides) + { + at::Tensor self_; + if (self.is_quantized()) { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype(), + get_qtensorimpl(self)->quantizer()); + } else { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype()); + } + auto* self_tmp_ = self_.unsafeGetTensorImpl(); + self_tmp_->set_storage_offset(offset); + self_tmp_->set_sizes_and_strides(sizes, strides); + at::namedinference::propagate_names(self_, self); + return self_; } - auto* self_tmp_ = self_.unsafeGetTensorImpl(); - self_tmp_->set_storage_offset(offset); - self_tmp_->set_sizes_and_strides(sizes, strides); - at::namedinference::propagate_names(self_, self); - return self_; - } - -} // namespace native + + } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/contiguous/combined_opt.cpp 
b/torch_npu/csrc/framework/contiguous/combined_opt.cpp index fb8255f44fa..cbd5fcefa75 100644 --- a/torch_npu/csrc/framework/contiguous/combined_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/combined_opt.cpp @@ -6,463 +6,463 @@ #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" namespace at_npu { -namespace native { - -class CombinedContiguousOpt : public ContiguousOpt { -public: - // Combined tensor == discontiguous tensor caused by combined view operators. - bool Optimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override { - // Maximum combined operators suggested: combined_cases_num = 2 - // NOTE: n-cmobined(n>2) can also be supported - int combined_cases_num = MaxCombinedCasesNum; - - ShapeStrideStack shape_stride_stacks; - OffsetStack offset_stack; - - if (can_use_combined(shape_stride_stacks, offset_stack, src_desc, - combined_cases_num)) { - RECORD_FUNCTION("contiguous_h_combined", std::vector({src})); - return pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); - } - return false; - } - - bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override - { - ShapeStrideStack shape_stride_stacks; - OffsetStack offset_stack; - if (src_desc.cached_contiguous) { - RECORD_FUNCTION("cached_contiguous_h_combined", std::vector({src})); - - CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; - shape_stride_stacks = cachedContiguousOpt.shape_stride_stack; - offset_stack = cachedContiguousOpt.offset_stack; - return pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); - } - - int combined_cases_num = MaxCombinedCasesNum; - if (can_use_combined(shape_stride_stacks, offset_stack, src_desc, - combined_cases_num)) { - ShapeStrideStack cached_shape_stride_stacks = shape_stride_stacks; - OffsetStack cached_offset_stack = offset_stack; - RECORD_FUNCTION("contiguous_h_combined", std::vector({src})); - - bool contiguousOrNot = pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); - if (contiguousOrNot) { - CachedContiguousOpt cached_opt = CachedContiguousOpt{ - "combined" - }; - cached_opt.shape_stride_stack = cached_shape_stride_stacks; - cached_opt.offset_stack = cached_offset_stack; - cached_opt.contiguous_tensor_desc = src_desc; - TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; + namespace native { + + class CombinedContiguousOpt : public ContiguousOpt { + public: + // Combined tensor == discontiguous tensor caused by combined view operators. 
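// Editor's illustration (not part of this change; shapes are hypothetical and
// assume standard ATen view semantics): a "combined" discontiguous tensor is one
// whose strides cannot be explained by a single view op, e.g.
//   auto y = x.transpose(0, 1).slice(0, 1, 5);   // permute followed by slice
// Neither the permute pattern nor the slice pattern matches y on its own;
// decomposing it into the two underlying views (see can_use_combined below)
// is what this optimizer attempts.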
+ bool Optimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override { + // Maximum combined operators suggested: combined_cases_num = 2 + // NOTE: n-cmobined(n>2) can also be supported + int combined_cases_num = MaxCombinedCasesNum; + + ShapeStrideStack shape_stride_stacks; + OffsetStack offset_stack; + + if (can_use_combined(shape_stride_stacks, offset_stack, src_desc, + combined_cases_num)) { + RECORD_FUNCTION("contiguous_h_combined", std::vector({src})); + return pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); + } + return false; } - return contiguousOrNot; - } - return false; - } -private: + bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override + { + ShapeStrideStack shape_stride_stacks; + OffsetStack offset_stack; + if (src_desc.cached_contiguous) { + RECORD_FUNCTION("cached_contiguous_h_combined", std::vector({src})); + + CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; + shape_stride_stacks = cachedContiguousOpt.shape_stride_stack; + offset_stack = cachedContiguousOpt.offset_stack; + return pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); + } + + int combined_cases_num = MaxCombinedCasesNum; + if (can_use_combined(shape_stride_stacks, offset_stack, src_desc, + combined_cases_num)) { + ShapeStrideStack cached_shape_stride_stacks = shape_stride_stacks; + OffsetStack cached_offset_stack = offset_stack; + RECORD_FUNCTION("contiguous_h_combined", std::vector({src})); + + bool contiguousOrNot = pre_combined_to_contiguous(self, src, shape_stride_stacks, offset_stack); + if (contiguousOrNot) { + CachedContiguousOpt cached_opt = CachedContiguousOpt{ + "combined" + }; + cached_opt.shape_stride_stack = cached_shape_stride_stacks; + cached_opt.offset_stack = cached_offset_stack; + cached_opt.contiguous_tensor_desc = src_desc; + TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; + } + return contiguousOrNot; + } + return false; + } - bool pre_combined_to_contiguous(at::Tensor &self, const at::Tensor &src, - ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stack) - { - // Record src infos for recovering after trans-contiguous - auto src_storage_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); - - at::Tensor base_tensor = - at::empty(src_storage_desc.base_sizes_, src.options()); - base_tensor.set_(src.storage()); - - // Reconstruct combined discontiguous tensor ==trans==> contiguous tensor - bool contiguousOrNot = combined_to_contiguous(self, base_tensor, shape_stride_stacks, offset_stack); - // Recover modified tensor infos of src after trans-contiguous - StorageDescHelper::CopyDesc(base_tensor, src_storage_desc); - return contiguousOrNot; - } - - bool cases_avoid(const ContiguousTensorDesc &tensor_desc) - { - for (const auto i : c10::irange(tensor_desc.sizes_.size())) { - // expand+x,x+expand - if (tensor_desc.strides_[i] == 0) { + private: + + bool pre_combined_to_contiguous(at::Tensor &self, const at::Tensor &src, + ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stack) + { + // Record src infos for recovering after trans-contiguous + auto src_storage_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); + + at::Tensor base_tensor = + at::empty(src_storage_desc.base_sizes_, src.options()); + base_tensor.set_(src.storage()); + + // Reconstruct combined discontiguous tensor ==trans==> contiguous tensor + bool 
contiguousOrNot = combined_to_contiguous(self, base_tensor, shape_stride_stacks, offset_stack); + // Recover modified tensor infos of src after trans-contiguous + StorageDescHelper::CopyDesc(base_tensor, src_storage_desc); + return contiguousOrNot; + } + + bool cases_avoid(const ContiguousTensorDesc &tensor_desc) + { + for (const auto i : c10::irange(tensor_desc.sizes_.size())) { + // expand+x,x+expand + if (tensor_desc.strides_[i] == 0) { + return true; + } + } + return false; + } + + // Unmatched tensor ==refresh(no copy)==> macthed tensor + bool reshape_without_copy_match(at::Tensor &tensor) { + if (!tensor.is_contiguous()) { + return false; + } + auto npu_desc = torch_npu::NPUBridge::GetNpuStorageImpl(tensor)->get_npu_desc(); + if ((c10::multiply_integers(tensor.sizes()) != + c10::multiply_integers(npu_desc.base_sizes_)) || + (tensor.storage_offset() != npu_desc.base_offset_)) { + return false; + } + RECORD_FUNCTION("contiguous_h_match", std::vector({tensor})); + StorageDescHelper::SetDesc(tensor, CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor.sizes()), + CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor.strides())); + return true; + } + + // Whether tensor can be optimized(no optimization). + bool can_be_optimize_from_default_cases(ContiguousTensorDesc &tensor_desc) { + OptimizationCases opt_cases{"reshape", "slice", "select"}; + tensor_desc.reset_optimization_cases(opt_cases); + return TransContiguous::CanOptimize(tensor_desc); + } + + // Conduct trans-contiguous for given optimization cases. + bool + copy_optimize_contiguous_by_given_cases(at::Tensor &self, + const at::Tensor &tensor, + OptimizationCases &optimizations) { + // Set "OpenCombined = false" to avoid recursion. + return TransContiguous::ContiguousOptimizeWithBaseFormat( + self, tensor, optimizations, false); + } + + // Weak constrains for transpose cases + bool maybe_permute(const ContiguousTensorDesc &tensor_desc) { + // tensors with nonmonotonic strides will be taken into consideration + // (Ascend): 对于特殊stride的情况例如:[*,*,1,1]这种,需要进一步分析影响 + for (const auto i : c10::irange(tensor_desc.sizes_.size() - 1)) { + if (tensor_desc.strides_[i] < tensor_desc.strides_[i + 1]) { + return true; + } + } + return false; + } + + // Weak constrains for select cases + bool maybe_select(const ContiguousTensorDesc &tensor_desc) { + for (auto i = tensor_desc.sizes_.size() - 1; i > 0; i--) { + if (tensor_desc.strides_[i - 1] % + (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != + 0) { + return false; + } + if (tensor_desc.strides_[i - 1] / + (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != + 1) { + if (tensor_desc.offset_ % + (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != + 0) { + return false; + } + // Avoid combined-cases such as squeeze+indexing at the first axis. + if (tensor_desc.strides_[0] != tensor_desc.base_strides_[0]) { + return false; + } + } + } + return true; + } + + // Weak constrains for slice cases + bool maybe_slice(const ContiguousTensorDesc &tensor_desc) { + // tensors with reduced numel will be taken into consideration. + if (c10::multiply_integers(tensor_desc.sizes_) < + c10::multiply_integers(tensor_desc.base_sizes_)) { + for (const auto i : c10::irange(tensor_desc.sizes_.size() - 2)) { + if (tensor_desc.strides_[i] % tensor_desc.strides_[i + 1] != 0) { + return false; + } + } + return true; + } + return false; + } + + /* + Kernel function of "Inference", + Key inferred infos: infer_size,infer_stride and infer_offset, + Inference order: permute, select, slice. 
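// Editor's illustration of the permute branch (hypothetical shapes, not part of
// this change): for a view with sizes [6, 4] and strides [1, 6], the
// non-monotonic strides trigger maybe_permute(); sorting the strides to [6, 1]
// and mapping each stride back to its size yields an inferred base of
// sizes [4, 6] / strides [6, 1] with infer_offset = 0, i.e. the layout the
// tensor had before it was transposed.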
+ */ + bool can_infer_view_tensor(ContiguousTensorDesc &tensor_desc, + FormatShape &infer_size, FormatShape &infer_stride, + int64_t &infer_offset) { + const auto &view_sizes = tensor_desc.sizes_; + const auto &view_strides = tensor_desc.strides_; + + if (maybe_permute(tensor_desc)) { + FormatShape &permute_size_sorted = infer_size; + FormatShape &permute_stride_sorted = infer_stride; + permute_size_sorted = view_sizes; + permute_stride_sorted = view_strides; + + // Sort stride + std::sort(permute_stride_sorted.rbegin(), permute_stride_sorted.rend()); + + // Map stride to shape + std::map map_shape_stride; + std::map label_map_shape_stride; + for (const auto i : c10::irange(view_sizes.size())) { + map_shape_stride[view_strides[i]] = view_sizes[i]; + } + // 除去第0维,其他维shape为1时,不记录对应的stride值,该stride的值会和其他维的stride有重复 + for (const auto i : c10::irange(view_sizes.size())) { + if (i == 0) { + map_shape_stride[view_strides[0]] = view_sizes[0]; + } else if (i != 0 && view_sizes[i] != 1) { + map_shape_stride[view_strides[i]] = view_sizes[i]; + } + } + // stride中有相等的情况,后面相等的stride对应的shape为1 + for (const auto i : c10::irange(view_sizes.size())) { + if (label_map_shape_stride[permute_stride_sorted[i]] != true) { + permute_size_sorted[i] = map_shape_stride[permute_stride_sorted[i]]; + label_map_shape_stride[permute_stride_sorted[i]] = true; + } else { + permute_size_sorted[i] = 1; + } + } + infer_offset = 0; + // Refresh tensor's base info to construct transposed tensor + tensor_desc.base_sizes_ = permute_size_sorted; + tensor_desc.base_strides_ = permute_stride_sorted; + // double-checking of may_permute is not required, because view strides + // does not changed. return true; + } + + if (maybe_select(tensor_desc)) { + FormatShape &select_size = infer_size; + FormatShape &select_stride = infer_stride; + // Infer base shape according to view shape and stride + select_stride = view_strides; + select_size = view_sizes; + // select_size and stride should be one more than view_size + select_size.emplace_back((int64_t)1); + select_stride.emplace_back((int64_t)1); + + int64_t i = static_cast(view_sizes.size()) - 1; + if (view_strides[i] == 1) { + select_size[i + 1] = view_sizes[i]; + select_stride[i + 1] = 1; + + for (i = i - 1; i >= 0; i--) { + if (view_strides[i] != view_strides[i + 1] * view_sizes[i + 1]) { + select_size[i + 1] = + view_strides[i] / (view_sizes[i + 1] * view_strides[i + 1]); + select_stride[i + 1] = view_sizes[i + 1] * view_strides[i + 1]; + infer_offset = tensor_desc.offset_ % view_strides[i]; + break; + } + select_size[i + 1] = view_sizes[i]; + select_stride[i + 1] = view_strides[i]; + } + } else { + select_size[i + 1] = view_strides[i]; + select_stride[i + 1] = 1; + infer_offset = tensor_desc.offset_ % view_strides[i]; + } + for (i = i - 1; i >= 0; i--) { + select_size[i + 1] = view_sizes[i + 1]; + select_stride[i + 1] = view_strides[i + 1]; + } + + select_size[0] = view_sizes[0]; + select_stride[0] = view_strides[0]; + + // Refresh tensor's base info to construct selected tensor + tensor_desc.base_sizes_ = select_size; + tensor_desc.base_strides_ = select_stride; + // Whether the construted tensor is selected? 
+ return maybe_select(tensor_desc); + } + + if (maybe_slice(tensor_desc)) { + FormatShape &slice_size = infer_size; + FormatShape &slice_stride = infer_stride; + + slice_stride = view_strides; + slice_size = view_sizes; + // Infer base shape according to base stride + for (auto i = slice_size.size() - 1; i > 0; i--) { + // Strides is not divisible means this case cannot be inferred. + if (view_strides[i] == 0 || + view_strides[i - 1] % view_strides[i] != 0) { + return false; + } + slice_size[i] = (view_strides[i - 1] / view_strides[i]); + } + slice_size[0] = 1; + slice_size[0] = (c10::multiply_integers(tensor_desc.base_sizes_) / + c10::multiply_integers(slice_size)); + infer_offset = tensor_desc.offset_; + // Refresh tensor's base info and storage info to construct sliced tensor + tensor_desc.base_sizes_ = slice_size; + tensor_desc.base_strides_ = slice_stride; + // Whether the construted tensor is sliced? + return maybe_slice(tensor_desc); + } + return false; + } + + bool stack_infer_info(ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stacks, int64_t infer_offset, + int64_t combined_cases_num, + ContiguousTensorDesc &tensor_desc) { + // Only combined_cases_num-combined Ops cases are taken into consideration + if (static_cast(shape_stride_stacks.size()) == combined_cases_num) { + return false; + } + + c10::SmallVector stack_shape_stride_part; + stack_shape_stride_part.emplace_back( + CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor_desc.sizes_)); + stack_shape_stride_part.emplace_back( + CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor_desc.strides_)); + + shape_stride_stacks.emplace_back(stack_shape_stride_part); + offset_stacks.emplace_back(infer_offset); + return true; + } + + // Conduct inferring + bool can_use_combined(ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stacks, + const ContiguousTensorDesc &src_desc, + int64_t combined_cases_num) { + // combined tensor should be discontiguous + if (src_desc.is_contiguous_ || cases_avoid(src_desc)) { + return false; + } + + // Key infos that should be inferred. + FormatShape infer_size; + FormatShape infer_stride; + int64_t infer_offset = 0; + + // Reconstruct "the discontiguous combined tensor desc" + // viewInfo = combined tensor(src)'s viewInfo + // baseInfo = combined tensor(src)'s baseInfo + // src's desc would be modified, so a local struct is created. + ContiguousTensorDesc local_src_desc = src_desc; + + // Construct "the first inferred tensor" inside "can_infer_view_tensor()" + // viewInfo = combined tensor(src)'s viewInfo + // baseInfo = inferred info(infer_size, infer_stride, infer_offset) + // If the first inferred tensor can be optimized, store its info. 
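// Editor's summary of the two passes below (descriptive only, not part of this
// change): the first stack_infer_info() call pushes the combined view's
// (sizes, strides) together with the inferred split offset; the second call
// pushes the inferred intermediate view. reconstruct_tensor() later pops the
// records in reverse, so the chain is replayed one view at a time, innermost
// view first.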
+ if (can_infer_view_tensor(local_src_desc, infer_size, infer_stride, + infer_offset) && + stack_infer_info(shape_stride_stacks, offset_stacks, infer_offset, + combined_cases_num, local_src_desc)) { + // Construct "the second inferred tensor" + // viewInfo = inferred info(infer_size, infer_stride, infer_offset) + // baseInfo = combined tensor(src)'s baseInfo + local_src_desc.sizes_ = infer_size; + local_src_desc.strides_ = infer_stride; + local_src_desc.offset_ -= infer_offset; + local_src_desc.base_sizes_ = src_desc.base_sizes_; + local_src_desc.base_strides_ = src_desc.base_strides_; + local_src_desc.refresh_contiguous_using_size_and_stride(); + // The second inferred tensor can be optimized or not + if (can_be_optimize_from_default_cases(local_src_desc) && + stack_infer_info(shape_stride_stacks, offset_stacks, + local_src_desc.offset_, combined_cases_num, + local_src_desc)) { + return true; + } + // If the second pattern is not inferred successfully, retrun false + return false; + } + // If the first pattern is not inferred successfully, retrun false + return false; + } + + // Reconstructing discontiguous tensor at trans-contiguous procedure. + bool reconstruct_tensor(at::Tensor &src, + ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stacks) { + auto stack_shape_stride = shape_stride_stacks.pop_back_val(); + auto stack_offset = offset_stacks.pop_back_val(); + // Set view info to make discontiguous tensor. + // stack_shape_stride[0]: stored shape infos in inferring procedure. + // stack_shape_stride[1]: stored stride infos in inferring procedure. + + src.set_(src.storage(), stack_offset, stack_shape_stride[0], + stack_shape_stride[1]); + + // If current tensor is sliced and the stack is still not empty: + // stored infos in the stack should be modified. 
+ if (shape_stride_stacks.size() >= 1 && + maybe_slice(TransContiguous::GetTensorDescInfo(src))) { + auto stack_shape_stride_pre = shape_stride_stacks.pop_back_val(); + + std::map map_stride_shape; + auto computed_stride = + StorageDescHelper::ComputeStrideFromShape(stack_shape_stride[0]); + // Adjust shape according to sorted stride + for (const auto i : c10::irange(stack_shape_stride_pre[0].size())) { + // if shape_i equals to shape_j, non-unique keys for "map_stride_shape" would be made; + // Temporarily, making size[i] * stride[i] to obtain unique keys; + // (Ascend): explore unique keys for any cases when "shape[i] == shape [j]" + map_stride_shape[stack_shape_stride[0][i] * stack_shape_stride[1][i]] = + computed_stride[i]; + } + + for (const auto i : c10::irange(stack_shape_stride_pre[0].size())) { + stack_shape_stride_pre[1][i] = + map_stride_shape[stack_shape_stride_pre[0][i] * + stack_shape_stride_pre[1][i]]; + } + // re-store modified infos + shape_stride_stacks.emplace_back(stack_shape_stride_pre); + } + return true; } - } - return false; - } - - // Unmatched tensor ==refresh(no copy)==> macthed tensor - bool reshape_without_copy_match(at::Tensor &tensor) { - if (!tensor.is_contiguous()) { - return false; - } - auto npu_desc = torch_npu::NPUBridge::GetNpuStorageImpl(tensor)->get_npu_desc(); - if ((c10::multiply_integers(tensor.sizes()) != - c10::multiply_integers(npu_desc.base_sizes_)) || - (tensor.storage_offset() != npu_desc.base_offset_)) { - return false; - } - RECORD_FUNCTION("contiguous_h_match", std::vector({tensor})); - StorageDescHelper::SetDesc(tensor, CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor.sizes()), - CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor.strides())); - return true; - } - - // Whether tensor can be optimized(no optimization). - bool can_be_optimize_from_default_cases(ContiguousTensorDesc &tensor_desc) { - OptimizationCases opt_cases{"reshape", "slice", "select"}; - tensor_desc.reset_optimization_cases(opt_cases); - return TransContiguous::CanOptimize(tensor_desc); - } - - // Conduct trans-contiguous for given optimization cases. - bool - copy_optimize_contiguous_by_given_cases(at::Tensor &self, - const at::Tensor &tensor, - OptimizationCases &optimizations) { - // Set "OpenCombined = false" to avoid recursion. - return TransContiguous::ContiguousOptimizeWithBaseFormat( - self, tensor, optimizations, false); - } - - // Weak constrains for transpose cases - bool maybe_permute(const ContiguousTensorDesc &tensor_desc) { - // tensors with nonmonotonic strides will be taken into consideration - // (Ascend): 对于特殊stride的情况例如:[*,*,1,1]这种,需要进一步分析影响 - for (const auto i : c10::irange(tensor_desc.sizes_.size() - 1)) { - if (tensor_desc.strides_[i] < tensor_desc.strides_[i + 1]) { - return true; - } - } - return false; - } - - // Weak constrains for select cases - bool maybe_select(const ContiguousTensorDesc &tensor_desc) { - for (auto i = tensor_desc.sizes_.size() - 1; i > 0; i--) { - if (tensor_desc.strides_[i - 1] % - (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != - 0) { - return false; - } - if (tensor_desc.strides_[i - 1] / - (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != - 1) { - if (tensor_desc.offset_ % - (tensor_desc.sizes_[i] * tensor_desc.strides_[i]) != - 0) { - return false; - } - // Avoid combined-cases such as squeeze+indexing at the first axis. 
- if (tensor_desc.strides_[0] != tensor_desc.base_strides_[0]) { - return false; - } - } - } - return true; - } - - // Weak constrains for slice cases - bool maybe_slice(const ContiguousTensorDesc &tensor_desc) { - // tensors with reduced numel will be taken into consideration. - if (c10::multiply_integers(tensor_desc.sizes_) < - c10::multiply_integers(tensor_desc.base_sizes_)) { - for (const auto i : c10::irange(tensor_desc.sizes_.size() - 2)) { - if (tensor_desc.strides_[i] % tensor_desc.strides_[i + 1] != 0) { - return false; - } - } - return true; - } - return false; - } - - /* -Kernel function of "Inference", -Key inferred infos: infer_size,infer_stride and infer_offset, -Inference order: permute, select, slice. -*/ - bool can_infer_view_tensor(ContiguousTensorDesc &tensor_desc, - FormatShape &infer_size, FormatShape &infer_stride, - int64_t &infer_offset) { - const auto &view_sizes = tensor_desc.sizes_; - const auto &view_strides = tensor_desc.strides_; - - if (maybe_permute(tensor_desc)) { - FormatShape &permute_size_sorted = infer_size; - FormatShape &permute_stride_sorted = infer_stride; - permute_size_sorted = view_sizes; - permute_stride_sorted = view_strides; - - // Sort stride - std::sort(permute_stride_sorted.rbegin(), permute_stride_sorted.rend()); - - // Map stride to shape - std::map map_shape_stride; - std::map label_map_shape_stride; - for (const auto i : c10::irange(view_sizes.size())) { - map_shape_stride[view_strides[i]] = view_sizes[i]; - } - // 除去第0维,其他维shape为1时,不记录对应的stride值,该stride的值会和其他维的stride有重复 - for (const auto i : c10::irange(view_sizes.size())) { - if (i == 0) { - map_shape_stride[view_strides[0]] = view_sizes[0]; - } else if (i != 0 && view_sizes[i] != 1) { - map_shape_stride[view_strides[i]] = view_sizes[i]; - } - } - // stride中有相等的情况,后面相等的stride对应的shape为1 - for (const auto i : c10::irange(view_sizes.size())) { - if (label_map_shape_stride[permute_stride_sorted[i]] != true) { - permute_size_sorted[i] = map_shape_stride[permute_stride_sorted[i]]; - label_map_shape_stride[permute_stride_sorted[i]] = true; - } else { - permute_size_sorted[i] = 1; - } - } - infer_offset = 0; - // Refresh tensor's base info to construct transposed tensor - tensor_desc.base_sizes_ = permute_size_sorted; - tensor_desc.base_strides_ = permute_stride_sorted; - // double-checking of may_permute is not required, because view strides - // does not changed. 
- return true; - } - - if (maybe_select(tensor_desc)) { - FormatShape &select_size = infer_size; - FormatShape &select_stride = infer_stride; - // Infer base shape according to view shape and stride - select_stride = view_strides; - select_size = view_sizes; - // select_size and stride should be one more than view_size - select_size.emplace_back((int64_t)1); - select_stride.emplace_back((int64_t)1); - - int64_t i = static_cast(view_sizes.size()) - 1; - if (view_strides[i] == 1) { - select_size[i + 1] = view_sizes[i]; - select_stride[i + 1] = 1; - - for (i = i - 1; i >= 0; i--) { - if (view_strides[i] != view_strides[i + 1] * view_sizes[i + 1]) { - select_size[i + 1] = - view_strides[i] / (view_sizes[i + 1] * view_strides[i + 1]); - select_stride[i + 1] = view_sizes[i + 1] * view_strides[i + 1]; - infer_offset = tensor_desc.offset_ % view_strides[i]; - break; - } - select_size[i + 1] = view_sizes[i]; - select_stride[i + 1] = view_strides[i]; - } - } else { - select_size[i + 1] = view_strides[i]; - select_stride[i + 1] = 1; - infer_offset = tensor_desc.offset_ % view_strides[i]; - } - for (i = i - 1; i >= 0; i--) { - select_size[i + 1] = view_sizes[i + 1]; - select_stride[i + 1] = view_strides[i + 1]; - } - - select_size[0] = view_sizes[0]; - select_stride[0] = view_strides[0]; - - // Refresh tensor's base info to construct selected tensor - tensor_desc.base_sizes_ = select_size; - tensor_desc.base_strides_ = select_stride; - // Whether the construted tensor is selected? - return maybe_select(tensor_desc); - } - - if (maybe_slice(tensor_desc)) { - FormatShape &slice_size = infer_size; - FormatShape &slice_stride = infer_stride; - - slice_stride = view_strides; - slice_size = view_sizes; - // Infer base shape according to base stride - for (auto i = slice_size.size() - 1; i > 0; i--) { - // Strides is not divisible means this case cannot be inferred. - if (view_strides[i] == 0 || - view_strides[i - 1] % view_strides[i] != 0) { - return false; - } - slice_size[i] = (view_strides[i - 1] / view_strides[i]); - } - slice_size[0] = 1; - slice_size[0] = (c10::multiply_integers(tensor_desc.base_sizes_) / - c10::multiply_integers(slice_size)); - infer_offset = tensor_desc.offset_; - // Refresh tensor's base info and storage info to construct sliced tensor - tensor_desc.base_sizes_ = slice_size; - tensor_desc.base_strides_ = slice_stride; - // Whether the construted tensor is sliced? 
- return maybe_slice(tensor_desc); - } - return false; - } - - bool stack_infer_info(ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stacks, int64_t infer_offset, - int64_t combined_cases_num, - ContiguousTensorDesc &tensor_desc) { - // Only combined_cases_num-combined Ops cases are taken into consideration - if (static_cast(shape_stride_stacks.size()) == combined_cases_num) { - return false; - } - - c10::SmallVector stack_shape_stride_part; - stack_shape_stride_part.emplace_back( - CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor_desc.sizes_)); - stack_shape_stride_part.emplace_back( - CalcuOpUtil::ConvertIntArrayRefToSmallVector(tensor_desc.strides_)); - - shape_stride_stacks.emplace_back(stack_shape_stride_part); - offset_stacks.emplace_back(infer_offset); - return true; - } - - // Conduct inferring - bool can_use_combined(ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stacks, - const ContiguousTensorDesc &src_desc, - int64_t combined_cases_num) { - // combined tensor should be discontiguous - if (src_desc.is_contiguous_ || cases_avoid(src_desc)) { - return false; - } - - // Key infos that should be inferred. - FormatShape infer_size; - FormatShape infer_stride; - int64_t infer_offset = 0; - - // Reconstruct "the discontiguous combined tensor desc" - // viewInfo = combined tensor(src)'s viewInfo - // baseInfo = combined tensor(src)'s baseInfo - // src's desc would be modified, so a local struct is created. - ContiguousTensorDesc local_src_desc = src_desc; - - // Construct "the first inferred tensor" inside "can_infer_view_tensor()" - // viewInfo = combined tensor(src)'s viewInfo - // baseInfo = inferred info(infer_size, infer_stride, infer_offset) - // If the first inferred tensor can be optimized, store its info. - if (can_infer_view_tensor(local_src_desc, infer_size, infer_stride, - infer_offset) && - stack_infer_info(shape_stride_stacks, offset_stacks, infer_offset, - combined_cases_num, local_src_desc)) { - // Construct "the second inferred tensor" - // viewInfo = inferred info(infer_size, infer_stride, infer_offset) - // baseInfo = combined tensor(src)'s baseInfo - local_src_desc.sizes_ = infer_size; - local_src_desc.strides_ = infer_stride; - local_src_desc.offset_ -= infer_offset; - local_src_desc.base_sizes_ = src_desc.base_sizes_; - local_src_desc.base_strides_ = src_desc.base_strides_; - local_src_desc.refresh_contiguous_using_size_and_stride(); - // The second inferred tensor can be optimized or not - if (can_be_optimize_from_default_cases(local_src_desc) && - stack_infer_info(shape_stride_stacks, offset_stacks, - local_src_desc.offset_, combined_cases_num, - local_src_desc)) { - return true; - } - // If the second pattern is not inferred successfully, retrun false - return false; - } - // If the first pattern is not inferred successfully, retrun false - return false; - } - - // Reconstructing discontiguous tensor at trans-contiguous procedure. - bool reconstruct_tensor(at::Tensor &src, - ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stacks) { - auto stack_shape_stride = shape_stride_stacks.pop_back_val(); - auto stack_offset = offset_stacks.pop_back_val(); - // Set view info to make discontiguous tensor. - // stack_shape_stride[0]: stored shape infos in inferring procedure. - // stack_shape_stride[1]: stored stride infos in inferring procedure. 
- - src.set_(src.storage(), stack_offset, stack_shape_stride[0], - stack_shape_stride[1]); - - // If current tensor is sliced and the stack is still not empty: - // stored infos in the stack should be modified. - if (shape_stride_stacks.size() >= 1 && - maybe_slice(TransContiguous::GetTensorDescInfo(src))) { - auto stack_shape_stride_pre = shape_stride_stacks.pop_back_val(); - - std::map map_stride_shape; - auto computed_stride = - StorageDescHelper::ComputeStrideFromShape(stack_shape_stride[0]); - // Adjust shape according to sorted stride - for (const auto i : c10::irange(stack_shape_stride_pre[0].size())) { - // if shape_i equals to shape_j, non-unique keys for "map_stride_shape" would be made; - // Temporarily, making size[i] * stride[i] to obtain unique keys; - // (Ascend): explore unique keys for any cases when "shape[i] == shape [j]" - map_stride_shape[stack_shape_stride[0][i] * stack_shape_stride[1][i]] = - computed_stride[i]; - } - - for (const auto i : c10::irange(stack_shape_stride_pre[0].size())) { - stack_shape_stride_pre[1][i] = - map_stride_shape[stack_shape_stride_pre[0][i] * - stack_shape_stride_pre[1][i]]; - } - // re-store modified infos - shape_stride_stacks.emplace_back(stack_shape_stride_pre); - } - return true; - } - - // Conduct trans-contiguous under strict constrains - bool combined_to_contiguous(at::Tensor &self, at::Tensor &src, - ShapeStrideStack &shape_stride_stacks, - OffsetStack &offset_stacks) { - // Base case: the last tensor to be processed. - if (shape_stride_stacks.size() == 1) { - if (reconstruct_tensor(src, shape_stride_stacks, offset_stacks)) { - OptimizationCases opt_cases_last{"reshape", "permute", "slice", - "select"}; - return copy_optimize_contiguous_by_given_cases(self, src, - opt_cases_last); - } - return false; - } - // Construct the first tensor and judge whether it can be optimized. - if (reconstruct_tensor(src, shape_stride_stacks, offset_stacks)) { - ContiguousTensorDesc src_desc_ = TransContiguous::GetTensorDescInfo(src); - OptimizationCases opt_cases_first{"reshape", "slice", "select"}; - if (reshape_without_copy_match(src)) { - // case 1 : The first tensor is reshape-type, refresh its info is enough - return combined_to_contiguous(self, src, shape_stride_stacks, - offset_stacks); - } else if (can_be_optimize_from_default_cases(src_desc_)) { - // case 2: The first tensor is discontiguous-type, - // conduct the standard optimization procedure. - auto transfer_tensor = OpPreparation::ApplyTensorWithFormat( - src.sizes(), src.options(), - torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); - return (copy_optimize_contiguous_by_given_cases(transfer_tensor, src, - opt_cases_first) && - combined_to_contiguous(self, transfer_tensor, - shape_stride_stacks, offset_stacks)); - } - // case3 : The first tensor is contiguous or cannot be identified==>exit - return false; - } - // If the first tensor cannnot be reconstructed==>exit - return false; - } -}; // class combinedContiguousOpt - -REGISTER_COPY_OPT(combined, CombinedContiguousOpt) - -} // namespace native + + // Conduct trans-contiguous under strict constrains + bool combined_to_contiguous(at::Tensor &self, at::Tensor &src, + ShapeStrideStack &shape_stride_stacks, + OffsetStack &offset_stacks) { + // Base case: the last tensor to be processed. 
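// Editor's note on the recursion (descriptive only, not part of this change):
// each call pops exactly one (sizes, strides, offset) record; with two records
// on the stack, the inferred intermediate view is first materialized into a
// transfer tensor by the standard single-pattern optimizers, after which the
// recursive call applies the original combined view on top of it and lands in
// this base case.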
+ if (shape_stride_stacks.size() == 1) { + if (reconstruct_tensor(src, shape_stride_stacks, offset_stacks)) { + OptimizationCases opt_cases_last{"reshape", "permute", "slice", + "select"}; + return copy_optimize_contiguous_by_given_cases(self, src, + opt_cases_last); + } + return false; + } + // Construct the first tensor and judge whether it can be optimized. + if (reconstruct_tensor(src, shape_stride_stacks, offset_stacks)) { + ContiguousTensorDesc src_desc_ = TransContiguous::GetTensorDescInfo(src); + OptimizationCases opt_cases_first{"reshape", "slice", "select"}; + if (reshape_without_copy_match(src)) { + // case 1 : The first tensor is reshape-type, refresh its info is enough + return combined_to_contiguous(self, src, shape_stride_stacks, + offset_stacks); + } else if (can_be_optimize_from_default_cases(src_desc_)) { + // case 2: The first tensor is discontiguous-type, + // conduct the standard optimization procedure. + auto transfer_tensor = OpPreparation::ApplyTensorWithFormat( + src.sizes(), src.options(), + torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); + return (copy_optimize_contiguous_by_given_cases(transfer_tensor, src, + opt_cases_first) && + combined_to_contiguous(self, transfer_tensor, + shape_stride_stacks, offset_stacks)); + } + // case3 : The first tensor is contiguous or cannot be identified==>exit + return false; + } + // If the first tensor cannnot be reconstructed==>exit + return false; + } + }; // class combinedContiguousOpt + + REGISTER_COPY_OPT(combined, CombinedContiguousOpt) + + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/framework/contiguous/permute_opt.cpp b/torch_npu/csrc/framework/contiguous/permute_opt.cpp index 54596531099..203a8a918df 100644 --- a/torch_npu/csrc/framework/contiguous/permute_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/permute_opt.cpp @@ -5,208 +5,208 @@ #include "op_plugin/OpInterface.h" namespace at_npu { -namespace native { - -class PermuteContiguousOpt : public ContiguousOpt { -public: - bool Optimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override { - // pattern permute - c10::SmallVector perm; - c10::SmallVector sizes; - if (can_use_permute(src_desc, perm, sizes)) { - RECORD_FUNCTION("contiguous_d_Transpose", std::vector({src})); - permute_to_contiguous(self, src, perm, sizes); - return true; - } - return false; - } - - bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { - c10::SmallVector perm; - c10::SmallVector sizes; - return can_use_permute(src_desc, perm, sizes); - } - - bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override - { - if (src_desc.cached_contiguous) { - RECORD_FUNCTION("cached_contiguous_d_Transpose", std::vector({src})); - CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; - c10::SmallVector sizes = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); - c10::SmallVector perm = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); - permute_to_contiguous(self, src, perm, sizes); - return true; - } - - // pattern permute - c10::SmallVector perm; - c10::SmallVector sizes; - if (can_use_permute(src_desc, perm, sizes)) { - RECORD_FUNCTION("contiguous_d_Transpose", std::vector({src})); - CachedContiguousOpt cached_opt = CachedContiguousOpt{ - "permute" - }; - cached_opt.cached_opt_parameters.emplace_back(perm); - cached_opt.cached_opt_parameters.emplace_back(sizes); - 
cached_opt.contiguous_tensor_desc = src_desc; - TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; - permute_to_contiguous(self, src, perm, sizes); - return true; - } - return false; - } - -private: - - void permute_to_contiguous(at::Tensor &self, const at::Tensor &src, - const c10::SmallVector &perm, - const c10::SmallVector &sizes) - { - // Refresh src Tensor to match output self Tensor - auto src_desc_stored = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); - auto &src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - src_desc.base_sizes_ = sizes; - src_desc.base_strides_ = StorageDescHelper::ComputeStrideFromShape(static_cast(sizes)); - src_desc.storage_sizes_ = sizes; - op_plugin::npu_transpose_out(src, perm, false, self); - src_desc = src_desc_stored; - } - - bool can_use_permute(const ContiguousTensorDesc &src_desc, - c10::SmallVector &perm, - c10::SmallVector &sizes) { - const auto &base_sizes = src_desc.base_sizes_; - const auto &base_strides = src_desc.base_strides_; - auto view_sizes = src_desc.sizes_; - auto view_strides = src_desc.strides_; - - c10::SmallVector indexes; - for (const auto i : c10::irange(src_desc.sizes_.size())) { - indexes.emplace_back(i); - } - - // After permute or reshape+permute, the total amount of data remains - // unchanged. - if (c10::multiply_integers(view_sizes) != c10::multiply_integers(base_sizes)) { - return false; - } - - // Reorder axes of shape and stride in descending order - for (const auto i : c10::irange(src_desc.sizes_.size() - 1)) { - for (const auto j : c10::irange(i + 1, src_desc.sizes_.size())) { - bool need_swap = (view_strides[i] < view_strides[j]) || - (view_strides[i] == view_strides[j] && - view_sizes[i] < view_sizes[j]); - if (need_swap) { - std::swap(view_strides[i], view_strides[j]); - std::swap(view_sizes[i], view_sizes[j]); - std::swap(indexes[i], indexes[j]); - } - } - } - - // After reordering, check whether the shape and stride match - auto current_stride = 1; - int64_t src_desc_sizes = static_cast(src_desc.sizes_.size()); - for (int64_t i = src_desc_sizes - 1; i >= 0; i--) { - if (current_stride != view_strides[i]) { - ASCEND_LOGD("After reordering, shape and stride still do not match, and " - "permute pattern cannot be used."); - return false; - } - current_stride *= view_sizes[i]; - } - if ((base_sizes.size() - view_sizes.size()) != - (base_strides.size() - view_strides.size())) { - ASCEND_LOGD("Reordered shape and base shape do not match, and permute " - "pattern cannot be used."); - return false; - } - - // Calculate perm and sizes for permute - for (const auto ele : view_sizes) { - sizes.emplace_back(ele); - } - perm = indexes; - for (const auto i : c10::irange(src_desc.sizes_.size())) { - perm[indexes[i]] = i; - } - return true; - } - - void optimize_permute(c10::SmallVector &perm, - c10::SmallVector &sizes) { - c10::SmallVector optimized_perm; - c10::SmallVector optimized_sizes; - if (perm.size() != sizes.size()) { - ASCEND_LOGD("Param perm and sizes do not match."); - return; - } - - // Gather index - int64_t perm_size = static_cast(perm.size()); - for (int64_t i = 0; i < perm_size; i++) { - auto temp_perm_i = perm[i]; - auto temp_sizes_i = sizes[perm[i]]; - for (const auto j : c10::irange(i + 1, perm_size)) { - if (perm[i] + 1 == perm[j]) { - temp_sizes_i *= sizes[perm[j]]; - ++i; - continue; - } - break; - } - if (temp_sizes_i == 1) { - // Optimize permute calculation for better performance, by squeezing - // permute param. 
- continue; - } - optimized_perm.emplace_back(temp_perm_i); - optimized_sizes.emplace_back(temp_sizes_i); - } - if (optimized_perm.size() == perm.size()) { - ASCEND_LOGD("No adjacent axes, cannot be optimized."); - return; - } - - // Calculate new perm and shape - c10::SmallVector perm_indexes; - for (const auto i : c10::irange(optimized_perm.size())) { - perm_indexes.emplace_back(i); - } - for (const auto i : c10::irange(optimized_perm.size() - 1)) { - for (const auto j : c10::irange(i + 1, optimized_perm.size())) { - if (optimized_perm[i] > optimized_perm[j]) { - std::swap(optimized_perm[i], optimized_perm[j]); - std::swap(perm_indexes[i], perm_indexes[j]); - } - } - } - perm = perm_indexes; - for (const auto i : c10::irange(perm_indexes.size())) { - perm[perm_indexes[i]] = i; - } - sizes = optimized_sizes; - for (const auto i : c10::irange(perm_indexes.size())) { - sizes[i] = optimized_sizes[perm_indexes[i]]; - } - } - - template void squeeze_shape_and_stride(T &shape, T &stride) { - int64_t shape_size = static_cast(shape.size()); - for (int64_t i = 0; i < shape_size; i++) { - if (shape[i] == 1) { - shape.erase(shape.begin() + i); - stride.erase(stride.begin() + i); - --i; - } - } - } -}; // class PermuteContiguousOpt - -REGISTER_COPY_OPT(permute, PermuteContiguousOpt) - -} // namespace native + namespace native { + + class PermuteContiguousOpt : public ContiguousOpt { + public: + bool Optimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override { + // pattern permute + c10::SmallVector perm; + c10::SmallVector sizes; + if (can_use_permute(src_desc, perm, sizes)) { + RECORD_FUNCTION("contiguous_d_Transpose", std::vector({src})); + permute_to_contiguous(self, src, perm, sizes); + return true; + } + return false; + } + + bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { + c10::SmallVector perm; + c10::SmallVector sizes; + return can_use_permute(src_desc, perm, sizes); + } + + bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override + { + if (src_desc.cached_contiguous) { + RECORD_FUNCTION("cached_contiguous_d_Transpose", std::vector({src})); + CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; + c10::SmallVector sizes = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); + c10::SmallVector perm = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); + permute_to_contiguous(self, src, perm, sizes); + return true; + } + + // pattern permute + c10::SmallVector perm; + c10::SmallVector sizes; + if (can_use_permute(src_desc, perm, sizes)) { + RECORD_FUNCTION("contiguous_d_Transpose", std::vector({src})); + CachedContiguousOpt cached_opt = CachedContiguousOpt{ + "permute" + }; + cached_opt.cached_opt_parameters.emplace_back(perm); + cached_opt.cached_opt_parameters.emplace_back(sizes); + cached_opt.contiguous_tensor_desc = src_desc; + TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; + permute_to_contiguous(self, src, perm, sizes); + return true; + } + return false; + } + + private: + + void permute_to_contiguous(at::Tensor &self, const at::Tensor &src, + const c10::SmallVector &perm, + const c10::SmallVector &sizes) + { + // Refresh src Tensor to match output self Tensor + auto src_desc_stored = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); + auto &src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; + src_desc.base_sizes_ = sizes; + src_desc.base_strides_ = 
StorageDescHelper::ComputeStrideFromShape(static_cast(sizes)); + src_desc.storage_sizes_ = sizes; + op_plugin::npu_transpose_out(src, perm, false, self); + src_desc = src_desc_stored; + } + + bool can_use_permute(const ContiguousTensorDesc &src_desc, + c10::SmallVector &perm, + c10::SmallVector &sizes) { + const auto &base_sizes = src_desc.base_sizes_; + const auto &base_strides = src_desc.base_strides_; + auto view_sizes = src_desc.sizes_; + auto view_strides = src_desc.strides_; + + c10::SmallVector indexes; + for (const auto i : c10::irange(src_desc.sizes_.size())) { + indexes.emplace_back(i); + } + + // After permute or reshape+permute, the total amount of data remains + // unchanged. + if (c10::multiply_integers(view_sizes) != c10::multiply_integers(base_sizes)) { + return false; + } + + // Reorder axes of shape and stride in descending order + for (const auto i : c10::irange(src_desc.sizes_.size() - 1)) { + for (const auto j : c10::irange(i + 1, src_desc.sizes_.size())) { + bool need_swap = (view_strides[i] < view_strides[j]) || + (view_strides[i] == view_strides[j] && + view_sizes[i] < view_sizes[j]); + if (need_swap) { + std::swap(view_strides[i], view_strides[j]); + std::swap(view_sizes[i], view_sizes[j]); + std::swap(indexes[i], indexes[j]); + } + } + } + + // After reordering, check whether the shape and stride match + auto current_stride = 1; + int64_t src_desc_sizes = static_cast(src_desc.sizes_.size()); + for (int64_t i = src_desc_sizes - 1; i >= 0; i--) { + if (current_stride != view_strides[i]) { + ASCEND_LOGD("After reordering, shape and stride still do not match, and " + "permute pattern cannot be used."); + return false; + } + current_stride *= view_sizes[i]; + } + if ((base_sizes.size() - view_sizes.size()) != + (base_strides.size() - view_strides.size())) { + ASCEND_LOGD("Reordered shape and base shape do not match, and permute " + "pattern cannot be used."); + return false; + } + + // Calculate perm and sizes for permute + for (const auto ele : view_sizes) { + sizes.emplace_back(ele); + } + perm = indexes; + for (const auto i : c10::irange(src_desc.sizes_.size())) { + perm[indexes[i]] = i; + } + return true; + } + + void optimize_permute(c10::SmallVector &perm, + c10::SmallVector &sizes) { + c10::SmallVector optimized_perm; + c10::SmallVector optimized_sizes; + if (perm.size() != sizes.size()) { + ASCEND_LOGD("Param perm and sizes do not match."); + return; + } + + // Gather index + int64_t perm_size = static_cast(perm.size()); + for (int64_t i = 0; i < perm_size; i++) { + auto temp_perm_i = perm[i]; + auto temp_sizes_i = sizes[perm[i]]; + for (const auto j : c10::irange(i + 1, perm_size)) { + if (perm[i] + 1 == perm[j]) { + temp_sizes_i *= sizes[perm[j]]; + ++i; + continue; + } + break; + } + if (temp_sizes_i == 1) { + // Optimize permute calculation for better performance, by squeezing + // permute param. 
+ continue; + } + optimized_perm.emplace_back(temp_perm_i); + optimized_sizes.emplace_back(temp_sizes_i); + } + if (optimized_perm.size() == perm.size()) { + ASCEND_LOGD("No adjacent axes, cannot be optimized."); + return; + } + + // Calculate new perm and shape + c10::SmallVector perm_indexes; + for (const auto i : c10::irange(optimized_perm.size())) { + perm_indexes.emplace_back(i); + } + for (const auto i : c10::irange(optimized_perm.size() - 1)) { + for (const auto j : c10::irange(i + 1, optimized_perm.size())) { + if (optimized_perm[i] > optimized_perm[j]) { + std::swap(optimized_perm[i], optimized_perm[j]); + std::swap(perm_indexes[i], perm_indexes[j]); + } + } + } + perm = perm_indexes; + for (const auto i : c10::irange(perm_indexes.size())) { + perm[perm_indexes[i]] = i; + } + sizes = optimized_sizes; + for (const auto i : c10::irange(perm_indexes.size())) { + sizes[i] = optimized_sizes[perm_indexes[i]]; + } + } + + template void squeeze_shape_and_stride(T &shape, T &stride) { + int64_t shape_size = static_cast(shape.size()); + for (int64_t i = 0; i < shape_size; i++) { + if (shape[i] == 1) { + shape.erase(shape.begin() + i); + stride.erase(stride.begin() + i); + --i; + } + } + } + }; // class PermuteContiguousOpt + + REGISTER_COPY_OPT(permute, PermuteContiguousOpt) + + } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/contiguous/select_opt.cpp b/torch_npu/csrc/framework/contiguous/select_opt.cpp index b7fe0b86a19..9e11dc1fb97 100644 --- a/torch_npu/csrc/framework/contiguous/select_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/select_opt.cpp @@ -2,127 +2,127 @@ #include "torch_npu/csrc/aten/CustomFunctions.h" namespace at_npu { -namespace native { + namespace native { -class SelectContiguousOpt : public ContiguousOpt { -public: - bool Optimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override { - // select(dim, start), length[dim] == 1 - c10::SmallVector start; - c10::SmallVector length; + class SelectContiguousOpt : public ContiguousOpt { + public: + bool Optimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override { + // select(dim, start), length[dim] == 1 + c10::SmallVector start; + c10::SmallVector length; - if (can_use_select(src_desc, start, length)) { - RECORD_FUNCTION("contiguous_d_StridedSlice", - std::vector({src})); - select_to_contiguous(self, src, start, length, src_desc); - return true; - } - return false; - } + if (can_use_select(src_desc, start, length)) { + RECORD_FUNCTION("contiguous_d_StridedSlice", + std::vector({src})); + select_to_contiguous(self, src, start, length, src_desc); + return true; + } + return false; + } - bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { - c10::SmallVector start; - c10::SmallVector length; - return can_use_select(src_desc, start, length); - } + bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { + c10::SmallVector start; + c10::SmallVector length; + return can_use_select(src_desc, start, length); + } -private: - bool can_use_select(const ContiguousTensorDesc &src_desc, - c10::SmallVector &start, - c10::SmallVector &length) { - // base info and src info - const auto &base_size = src_desc.base_sizes_; - const auto &base_stride = src_desc.base_strides_; - const auto &select_size = src_desc.sizes_; - const auto &select_stride = src_desc.strides_; + private: + bool can_use_select(const ContiguousTensorDesc &src_desc, + c10::SmallVector &start, + c10::SmallVector 
&length) { + // base info and src info + const auto &base_size = src_desc.base_sizes_; + const auto &base_stride = src_desc.base_strides_; + const auto &select_size = src_desc.sizes_; + const auto &select_stride = src_desc.strides_; - // len(base_size) - len(select_size) == 1 && len(base_stride) - - // len(select_stride) == 1 - if ((base_size.size() - select_size.size() != 1) || - (base_stride.size() - select_stride.size() != 1)) { - return false; - } + // len(base_size) - len(select_size) == 1 && len(base_stride) - + // len(select_stride) == 1 + if ((base_size.size() - select_size.size() != 1) || + (base_stride.size() - select_stride.size() != 1)) { + return false; + } - // recover src tensor info: shape and stride - c10::SmallVector temp_size; - c10::SmallVector temp_stride; - for (size_t i = 0U; i <= select_size.size(); i++) { - if (base_size[i] != select_size[i] || - base_stride[i] != select_stride[i]) { - temp_size.emplace_back(base_size[i]); - temp_stride.emplace_back(base_stride[i]); - for (const auto j : c10::irange(i + 1, select_size.size() + 1)) { - temp_size.emplace_back(select_size[j - 1]); - temp_stride.emplace_back(select_stride[j - 1]); - i = j + 1; - } - } else { - temp_size.emplace_back(select_size[i]); - temp_stride.emplace_back(select_stride[i]); - } - } + // recover src tensor info: shape and stride + c10::SmallVector temp_size; + c10::SmallVector temp_stride; + for (size_t i = 0U; i <= select_size.size(); i++) { + if (base_size[i] != select_size[i] || + base_stride[i] != select_stride[i]) { + temp_size.emplace_back(base_size[i]); + temp_stride.emplace_back(base_stride[i]); + for (const auto j : c10::irange(i + 1, select_size.size() + 1)) { + temp_size.emplace_back(select_size[j - 1]); + temp_stride.emplace_back(select_stride[j - 1]); + i = j + 1; + } + } else { + temp_size.emplace_back(select_size[i]); + temp_stride.emplace_back(select_stride[i]); + } + } - for (const auto i : c10::irange(select_size.size() + 1)) { - if (base_size[i] == temp_size[i] && base_stride[i] == temp_stride[i]) { - continue; - } else { - return false; - } - } + for (const auto i : c10::irange(select_size.size() + 1)) { + if (base_size[i] == temp_size[i] && base_stride[i] == temp_stride[i]) { + continue; + } else { + return false; + } + } - // Collect the select infos for SliceD: dim, start, length - // confirm the selected dim - int64_t dim = static_cast(base_size.size()) - 1; - for (const auto i : c10::irange(select_size.size())) { - if (base_size[i] != select_size[i] || - base_stride[i] != select_stride[i]) { - dim = i; - break; - } - } + // Collect the select infos for SliceD: dim, start, length + // confirm the selected dim + int64_t dim = static_cast(base_size.size()) - 1; + for (const auto i : c10::irange(select_size.size())) { + if (base_size[i] != select_size[i] || + base_stride[i] != select_stride[i]) { + dim = i; + break; + } + } - // Obtain start index and select length - int64_t int_index = src_desc.offset_ / base_stride[dim]; - for (const auto i : c10::irange(base_size.size())) { - if (i == dim) { - start.emplace_back(int_index); - length.emplace_back(1); - } else { - start.emplace_back(0); - length.emplace_back(base_size[i]); - } - } - return true; - } + // Obtain start index and select length + int64_t int_index = src_desc.offset_ / base_stride[dim]; + for (const auto i : c10::irange(base_size.size())) { + if (i == dim) { + start.emplace_back(int_index); + length.emplace_back(1); + } else { + start.emplace_back(0); + length.emplace_back(base_size[i]); + } + } + return true; + } - 
void select_to_contiguous(at::Tensor &self, const at::Tensor &src, - c10::SmallVector &start, - c10::SmallVector &length, - const ContiguousTensorDesc &src_desc) { - const auto &base_size = src_desc.base_sizes_; - // Recover base tensor(necessary) a = b.select(1, 1) - at::Tensor temp_src = TransContiguous::view_tensor(src, src_desc.base_offset_, base_size, src_desc.base_strides_); + void select_to_contiguous(at::Tensor &self, const at::Tensor &src, + c10::SmallVector &start, + c10::SmallVector &length, + const ContiguousTensorDesc &src_desc) { + const auto &base_size = src_desc.base_sizes_; + // Recover base tensor(necessary) a = b.select(1, 1) + at::Tensor temp_src = TransContiguous::view_tensor(src, src_desc.base_offset_, base_size, src_desc.base_strides_); - // construct StridedSlice param - int64_t axis_size = static_cast(start.size()); - c10::SmallVector strides(axis_size, 1); - c10::SmallVector end; - int64_t shrink_mask = 0; - for (int64_t i = 0; i < axis_size; ++i) { - end.emplace_back(start[i] + length[i]); - if (length[i] == 1 && temp_src.size(i) != 1) { - shrink_mask += std::pow(2, i); - } - } + // construct StridedSlice param + int64_t axis_size = static_cast(start.size()); + c10::SmallVector strides(axis_size, 1); + c10::SmallVector end; + int64_t shrink_mask = 0; + for (int64_t i = 0; i < axis_size; ++i) { + end.emplace_back(start[i] + length[i]); + if (length[i] == 1 && temp_src.size(i) != 1) { + shrink_mask += std::pow(2, i); + } + } - // call StridedSlice op to contiguous - custom_ops::npu_indexing_out(temp_src, start, end, strides, 0, 0, 0, 0, shrink_mask, self); - return; - } -}; // class SelectContiguousOpt + // call StridedSlice op to contiguous + custom_ops::npu_indexing_out(temp_src, start, end, strides, 0, 0, 0, 0, shrink_mask, self); + return; + } + }; // class SelectContiguousOpt -REGISTER_COPY_OPT(select, SelectContiguousOpt) + REGISTER_COPY_OPT(select, SelectContiguousOpt) -} // namespace native + } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/contiguous/slice_opt.cpp b/torch_npu/csrc/framework/contiguous/slice_opt.cpp index 0eb1f596af2..2753fab7d50 100644 --- a/torch_npu/csrc/framework/contiguous/slice_opt.cpp +++ b/torch_npu/csrc/framework/contiguous/slice_opt.cpp @@ -2,148 +2,148 @@ #include "torch_npu/csrc/aten/CustomFunctions.h" namespace at_npu { -namespace native { + namespace native { -class SliceContiguousOpt : public ContiguousOpt { -public: - bool Optimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override { - // Pattern slice. - // Current pattern does not directly depend on other patterns. - // The relative sequence of this pattern and other patterns is not - // important. - c10::SmallVector offsets; - c10::SmallVector size; - if (can_use_slice(src_desc, offsets, size)) { - RECORD_FUNCTION("contiguous_d_Slice", std::vector({src})); - slice_to_contiguous(self, src, offsets, size, src_desc); - return true; - } - return false; - } + class SliceContiguousOpt : public ContiguousOpt { + public: + bool Optimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override { + // Pattern slice. + // Current pattern does not directly depend on other patterns. + // The relative sequence of this pattern and other patterns is not + // important. 
+ c10::SmallVector offsets; + c10::SmallVector size; + if (can_use_slice(src_desc, offsets, size)) { + RECORD_FUNCTION("contiguous_d_Slice", std::vector({src})); + slice_to_contiguous(self, src, offsets, size, src_desc); + return true; + } + return false; + } - bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { - c10::SmallVector offsets; - c10::SmallVector size; - return can_use_slice(src_desc, offsets, size); - } + bool CanOptimizer(const ContiguousTensorDesc &src_desc) override { + c10::SmallVector offsets; + c10::SmallVector size; + return can_use_slice(src_desc, offsets, size); + } - bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, - const ContiguousTensorDesc &src_desc) override - { - if (src_desc.cached_contiguous) { - RECORD_FUNCTION("cached_contiguous_d_Slice", std::vector({src})); - CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; - c10::SmallVector size = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); - c10::SmallVector offsets = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); - slice_to_contiguous(self, src, offsets, size, src_desc); - return true; - } - c10::SmallVector offsets; - c10::SmallVector size; - if (can_use_slice(src_desc, offsets, size)) { - RECORD_FUNCTION("contiguous_d_Slice", std::vector({src})); - CachedContiguousOpt cached_opt = CachedContiguousOpt{ - "slice" - }; - cached_opt.cached_opt_parameters.emplace_back(offsets); - cached_opt.cached_opt_parameters.emplace_back(size); - cached_opt.contiguous_tensor_desc = src_desc; - TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; - slice_to_contiguous(self, src, offsets, size, src_desc); - return true; - } - return false; - } + bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, + const ContiguousTensorDesc &src_desc) override + { + if (src_desc.cached_contiguous) { + RECORD_FUNCTION("cached_contiguous_d_Slice", std::vector({src})); + CachedContiguousOpt cachedContiguousOpt = TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc]; + c10::SmallVector size = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); + c10::SmallVector offsets = cachedContiguousOpt.cached_opt_parameters.pop_back_val(); + slice_to_contiguous(self, src, offsets, size, src_desc); + return true; + } + c10::SmallVector offsets; + c10::SmallVector size; + if (can_use_slice(src_desc, offsets, size)) { + RECORD_FUNCTION("contiguous_d_Slice", std::vector({src})); + CachedContiguousOpt cached_opt = CachedContiguousOpt{ + "slice" + }; + cached_opt.cached_opt_parameters.emplace_back(offsets); + cached_opt.cached_opt_parameters.emplace_back(size); + cached_opt.contiguous_tensor_desc = src_desc; + TransContiguous::cached_contiguous_opt[src_desc.hash_src_desc] = cached_opt; + slice_to_contiguous(self, src, offsets, size, src_desc); + return true; + } + return false; + } -private: - // npu-slice pattern cover several view ops, including chunk, split, narrow - // and part of index. Judgment logic is based on the implement of view ops in - // adapter layer. - bool can_use_slice(const ContiguousTensorDesc &src_desc, - c10::SmallVector &offsets, - c10::SmallVector &size) { - const auto &base_sizes = src_desc.base_sizes_; - const auto &base_strides = src_desc.base_strides_; - auto view_sizes = src_desc.sizes_; - auto view_strides = src_desc.strides_; + private: + // npu-slice pattern cover several view ops, including chunk, split, narrow + // and part of index. 
Judgment logic is based on the implement of view ops in + // adapter layer. + bool can_use_slice(const ContiguousTensorDesc &src_desc, + c10::SmallVector &offsets, + c10::SmallVector &size) { + const auto &base_sizes = src_desc.base_sizes_; + const auto &base_strides = src_desc.base_strides_; + auto view_sizes = src_desc.sizes_; + auto view_strides = src_desc.strides_; - // narrow+select(select at last dim) ==> single narrow - // 限制条件:1. 最后一轴stride非1==>最后一轴select;2. - // 基础格式;3.非最后一轴发生narrow(元素减少) - // 最小化影响:仅限最后一轴的select,即tensor.select(-1, 1) == - // tensor[**,1:2],select过渡到narrow - if (view_strides[view_strides.size() - 1] != 1 && - FormatHelper::IsBaseFormatType(src_desc.npu_format_) && - view_strides.size() < base_strides.size() && - c10::multiply_integers(view_sizes) < - c10::multiply_integers(base_sizes) / base_sizes[base_sizes.size() - 1]) { - view_sizes.emplace_back(1); - view_strides.emplace_back(1); - } + // narrow+select(select at last dim) ==> single narrow + // 限制条件:1. 最后一轴stride非1==>最后一轴select;2. + // 基础格式;3.非最后一轴发生narrow(元素减少) + // 最小化影响:仅限最后一轴的select,即tensor.select(-1, 1) == + // tensor[**,1:2],select过渡到narrow + if (view_strides[view_strides.size() - 1] != 1 && + FormatHelper::IsBaseFormatType(src_desc.npu_format_) && + view_strides.size() < base_strides.size() && + c10::multiply_integers(view_sizes) < + c10::multiply_integers(base_sizes) / base_sizes[base_sizes.size() - 1]) { + view_sizes.emplace_back(1); + view_strides.emplace_back(1); + } - // Strides must be the same. - if (view_strides != base_strides) { - return false; - } + // Strides must be the same. + if (view_strides != base_strides) { + return false; + } - // Only narrow dims are different. - c10::SmallVector narrow_dims; - if (view_sizes.size() != base_sizes.size()) { - return false; - } - for (const auto i : c10::irange(view_sizes.size())) { - if (view_sizes[i] == base_sizes[i]) { - narrow_dims.emplace_back(0); - } else if (view_sizes[i] < base_sizes[i]) { - narrow_dims.emplace_back(1); - } else { - return false; - } - } + // Only narrow dims are different. + c10::SmallVector narrow_dims; + if (view_sizes.size() != base_sizes.size()) { + return false; + } + for (const auto i : c10::irange(view_sizes.size())) { + if (view_sizes[i] == base_sizes[i]) { + narrow_dims.emplace_back(0); + } else if (view_sizes[i] < base_sizes[i]) { + narrow_dims.emplace_back(1); + } else { + return false; + } + } - // Calculate npu slice param. - size = view_sizes; - offsets.clear(); - int64_t storage_offsets = src_desc.offset_; - // src.storage_offset() == start[narrow_dims[i]]*stride[narrow_dims[i]] - for (const auto i : c10::irange(view_strides.size())) { - offsets.emplace_back(storage_offsets / view_strides[i]); - storage_offsets = storage_offsets % view_strides[i]; - } - if (storage_offsets != 0) { - return false; - } - for (const auto i : c10::irange(offsets.size())) { - if ((offsets[i] + view_sizes[i]) > base_sizes[i]) { - // In narrow calculation, (start + length) <= cur_size - return false; - } - if (offsets[i] != 0 && narrow_dims[i] == 0) { - // narrow_dims[i] == 0 means dim i is not involved in narrow - // calculation. offsets[i] != 0 means dim i has the start of narrow - // calculation. Two conditions are contradictory. - return false; - } - } - return true; - } + // Calculate npu slice param. 
+ size = view_sizes; + offsets.clear(); + int64_t storage_offsets = src_desc.offset_; + // src.storage_offset() == start[narrow_dims[i]]*stride[narrow_dims[i]] + for (const auto i : c10::irange(view_strides.size())) { + offsets.emplace_back(storage_offsets / view_strides[i]); + storage_offsets = storage_offsets % view_strides[i]; + } + if (storage_offsets != 0) { + return false; + } + for (const auto i : c10::irange(offsets.size())) { + if ((offsets[i] + view_sizes[i]) > base_sizes[i]) { + // In narrow calculation, (start + length) <= cur_size + return false; + } + if (offsets[i] != 0 && narrow_dims[i] == 0) { + // narrow_dims[i] == 0 means dim i is not involved in narrow + // calculation. offsets[i] != 0 means dim i has the start of narrow + // calculation. Two conditions are contradictory. + return false; + } + } + return true; + } - void slice_to_contiguous(at::Tensor &self, const at::Tensor &src, - const c10::SmallVector &offsets, - const c10::SmallVector &size, - const ContiguousTensorDesc &src_desc) { - // create contiguous tensor for npu slice - const auto &temp_tensor_size = src_desc.base_sizes_; - at::Tensor temp_src = TransContiguous::view_tensor(src, src_desc.base_offset_, temp_tensor_size, src_desc.base_strides_); + void slice_to_contiguous(at::Tensor &self, const at::Tensor &src, + const c10::SmallVector &offsets, + const c10::SmallVector &size, + const ContiguousTensorDesc &src_desc) { + // create contiguous tensor for npu slice + const auto &temp_tensor_size = src_desc.base_sizes_; + at::Tensor temp_src = TransContiguous::view_tensor(src, src_desc.base_offset_, temp_tensor_size, src_desc.base_strides_); - custom_ops::npu_slice_out(temp_src, offsets, size, self); - return; - } -}; // class SliceContiguousOpt + custom_ops::npu_slice_out(temp_src, offsets, size, self); + return; + } + }; // class SliceContiguousOpt -REGISTER_COPY_OPT(slice, SliceContiguousOpt) + REGISTER_COPY_OPT(slice, SliceContiguousOpt) -} // namespace native + } // namespace native } // namespace at_npu \ No newline at end of file -- Gitee
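
For reference, the permute pattern in permute_opt.cpp above recovers the permutation of a transposed view by sorting axes in descending stride order and then inverting that order. The standalone sketch below mirrors only that stride-sorting step outside of torch_npu; the helper name recover_perm and the use of std::sort in place of the pattern's nested swap loops are illustrative assumptions, not part of the patched code.

    // Sketch (assumption-laden, not torch_npu code): recover the permutation
    // that was applied to a contiguous base tensor, i.e. view == base.permute(perm),
    // mirroring the stride-sorting step of can_use_permute.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <numeric>
    #include <vector>

    std::vector<int64_t> recover_perm(const std::vector<int64_t> &sizes,
                                      const std::vector<int64_t> &strides) {
        std::vector<int64_t> indexes(sizes.size());
        std::iota(indexes.begin(), indexes.end(), 0);
        // Order axes by descending stride (ties broken by descending size),
        // as the pattern does with its pairwise swaps.
        std::sort(indexes.begin(), indexes.end(), [&](int64_t a, int64_t b) {
            return strides[a] != strides[b] ? strides[a] > strides[b]
                                            : sizes[a] > sizes[b];
        });
        // Invert the ordering to obtain the permutation applied to the base.
        std::vector<int64_t> perm(sizes.size());
        for (size_t i = 0; i < indexes.size(); ++i) {
            perm[indexes[i]] = static_cast<int64_t>(i);
        }
        return perm;
    }

    int main() {
        // A contiguous (2, 3, 4) tensor permuted with (2, 0, 1) yields a
        // (4, 2, 3) view with strides (1, 12, 4); the sketch recovers (2, 0, 1).
        std::vector<int64_t> sizes = {4, 2, 3};
        std::vector<int64_t> strides = {1, 12, 4};
        for (int64_t p : recover_perm(sizes, strides)) {
            std::cout << p << ' ';   // expected output: 2 0 1
        }
        std::cout << '\n';
        return 0;
    }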
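
Similarly, the slice pattern in slice_opt.cpp derives per-dimension start offsets by repeatedly dividing the storage offset by each view stride, as in its "Calculate npu slice param" step. The sketch below reproduces just that decomposition; the helper name decompose_offset is an illustrative assumption, and this is a simplified model rather than the torch_npu implementation.

    // Sketch (assumption-laden, not torch_npu code): split a storage offset
    // into per-dimension narrow start indices, mirroring can_use_slice.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Returns one start offset per dimension, or an empty vector if the
    // storage offset cannot be written as a sum of start[i] * stride[i].
    std::vector<int64_t> decompose_offset(int64_t storage_offset,
                                          const std::vector<int64_t> &strides) {
        std::vector<int64_t> offsets;
        for (int64_t stride : strides) {
            offsets.push_back(storage_offset / stride);  // start index on this dim
            storage_offset %= stride;                    // remainder for later dims
        }
        return storage_offset == 0 ? offsets : std::vector<int64_t>{};
    }

    int main() {
        // Base tensor (4, 6) with strides (6, 1); the view t[1:3, 2:5] starts
        // at storage offset 1 * 6 + 2 = 8, so the recovered starts are (1, 2).
        for (int64_t o : decompose_offset(8, {6, 1})) {
            std::cout << o << ' ';   // expected output: 1 2
        }
        std::cout << '\n';
        return 0;
    }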