From 20d139bd26598344bfc32faa0ce20a2999ceeff7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Sun, 27 Apr 2025 02:57:49 +0000
Subject: [PATCH 01/46] =?UTF-8?q?!2651=20=E3=80=90AR20241227785719?=
 =?UTF-8?q?=E3=80=91fp8/hi8=20weight=20only=20Merge=20pull=20request=20!26?=
 =?UTF-8?q?51=20from=20=E5=BC=A0=E9=91=AB/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../hif8_fp8_weight_quantization/README_CN.md |  45 +++++
 .../requirements.txt                          |   7 +
 .../src/quantization.cfg                      |   5 +
 .../src/run_llama7b_quantization.py           | 154 ++++++++++++++++++
 .../hif8_fp8_weight_quantization/src/utils.py |  82 ++++++++++
 5 files changed, 293 insertions(+)
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py

diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md
new file mode 100644
index 000000000..04b9c0973
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md
@@ -0,0 +1,45 @@
+# FP8/HIF8量化
+
+## 1 FP8/HIF8量化前提
+
+### 1.1 安装依赖
+
+本sample依赖包可参考[requirements.txt](requirements.txt)
+
+### 1.2 模型和数据集准备
+
+本sample以Llama2-7b模型，pileval和wikitext2数据集为示例，请用户自行下载，并适配utils.py文件中加载数据集和模型的路径。当前sample中数据集保存目录需根据实际保存目录修改。
+
+### 1.3 简易量化配置
+./src/quantization.cfg文件为用户自定义的简易量化配置，具体表示信息如下：
+
+| 字段 |类型| 说明 | 默认值 | 取值范围 |
+|:--| :-: | :-- | :-: | :-: |
+|skip_layers|str|跳过量化的层 |/|/|
+|weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False|
+|weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN|
+
+## 2 FLOAT8_E4M3FN量化示例
+> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN，如果需要HIFLOAT8仅权重量化，请适配修改quantization.cfg
+
+> 如果要验证deploy模型，需要设置save_post_quant_model接口中参数mode为'deploy'，并将生成的部署模型搬到npu上进行推理
+
+### 2.1 使用接口方式调用
+
+请在当前目录执行如下命令运行示例程序，用户需根据实际情况修改示例程序中的模型和数据集路径：
+
+`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py`
+
+若出现如下信息，则说明量化成功：
+
+```none
+Test time taken:  1.0 min  38.24865388870239 s
+Score:  5.48
+```
+
+推理成功后，在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./output文件夹，该文件夹内包含以下内容：
+
+- config.json：量化配置文件，描述了如何对模型中的每一层进行量化。
+- record.txt：量化因子记录文件。
+
+> 如果output目录下已经存在量化配置文件或量化因子记录文件，再次运行示例程序时，如果新生成的文件与已有文件同名，则会覆盖已有的量化配置文件或量化因子记录文件。
diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt
new file mode 100644
index 000000000..55441d062
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/requirements.txt
@@ -0,0 +1,7 @@
+torch==2.1.0
+transformers==4.40.0
+accelerate==0.30.1
+datasets==2.19.1
+sentencepiece==0.2.0
+numpy==1.23.5
+protobuf==3.20.2
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg
new file mode 100644
index 000000000..2d8b3dcc3
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/quantization.cfg
@@ -0,0 +1,5 @@
+skip_layers: "lm_head"
+weight_only_config: {
+    weight_compress_only: True
+    wts_type: FLOAT8_E4M3FN
+}
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
new file mode 100644
index 000000000..092238d22
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
@@ -0,0 +1,154 @@
+"""
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+"""
+
+
+import os
+import copy
+import time
+import tqdm
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoConfig
+from accelerate import infer_auto_device_map, dispatch_model
+from accelerate.utils.modeling import get_balanced_memory
+
+from utils import get_loaders,  get_llama2, get_calib_dataset
+import amct_pytorch as amct
+
+
+def build_model_and_enc(model, model_path, gpu_num):
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    if "mpt" in config.__class__.__name__.lower():
+        enc = AutoTokenizer.from_pretrained(
+            config.tokenizer_name, trust_remote_code=True
+        )
+    else:
+        enc = AutoTokenizer.from_pretrained(
+            model_path, use_fast=False, trust_remote_code=True
+        )
+
+    # Move the model to GPU (as much as possible) for LM evaluation
+    # max_memory = ['0:16GiB', '1:16GiB','2:16GiB', 'cpu:30GiB'], '0' means the first GPU that you specify.
+    # Using the full 16GiB is not recommended: some space must be reserved for other tensors during calculation
+    # please see the recommended memory allocation in the Word file
+    # Adjust the max_memory sizes according to the real situation
+    # a clever way:
+
+    max_memory = []
+    for i in range(gpu_num):
+        max_memory.append(f'{i}:12GiB')
+    max_memory.append('cpu:80GiB')
+    print('Max_memory allocation: \n', max_memory)
+
+    max_memory = [v.split(":") for v in (max_memory or [])]
+    max_memory = {(int(k) if k.isdigit() else k): v for k, v in max_memory}
+    kwargs = {
+        "max_memory": get_balanced_memory(
+            model, max_memory if len(max_memory) > 0 else None
+        )
+    }
+    model.tie_weights()
+    device_map = infer_auto_device_map(
+        model,
+        no_split_module_classes=[
+            "LlamaDecoderLayer",
+        ],
+        **kwargs,
+    )
+    model = dispatch_model(model, device_map=device_map, 
+        offload_dir=os.path.join(model_path, 'offload_dir'))
+
+    return model, enc
+
+if __name__ == '__main__':
+    model, model_path = get_llama2('7b')
+    model = model.eval()
+    copied_model = copy.deepcopy(model)
+    gpu_num = torch.cuda.device_count()
+    model, enc = build_model_and_enc(model, model_path, gpu_num)
+
+    proto_path = './src/quantization.cfg'
+    config_file = './output/config.json'
+    record_file = './output/record.txt'
+
+    test_start_time = time.time()
+    # Phase1: generate quant config json
+    amct.create_post_quant_config(config_file,
+                             model,
+                             config_defination=proto_path)
+    
+    # Phase2: do weights calibration and generate calibration model
+    samples = get_calib_dataset(
+        data="pileval", tokenizer=enc, n_samples=512, block_size=256
+    )
+    samples = torch.cat(samples, dim=0)[:1,:]
+    model.config.use_cache = False
+    post_quant_model = amct.create_post_quant_model(config_file,
+                                                    record_file,
+                                                    model)
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    post_quant_model.config.use_cache = False
+    with torch.no_grad():
+        post_quant_model(samples.to(next(post_quant_model.parameters()).device))
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    test_end_time = time.time()
+    total_time = test_end_time - test_start_time
+    print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's')
+    # free memory: delete the calibration model, which is no longer needed
+    del post_quant_model
+    
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    model, enc = build_model_and_enc(copied_model, model_path, gpu_num)
+    
+    # Phase3: save fakequant model
+    testenc = get_loaders(dataset_name='wikitext2',
+                        enc=enc,
+                        seqlen=model.seqlen)
+
+    testenc = testenc.input_ids.to(model.device)
+    fake_quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant')
+    nsamples = testenc.numel() // model.seqlen
+    
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    
+    # Phase4: Test ppl result
+    nlls = []
+    test_start_time = time.time()
+    for i in tqdm.tqdm(range(nsamples), desc="evaluating..."):
+        batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(
+            model.device
+        )
+        with torch.no_grad():
+            lm_logits = fake_quant_model(batch).logits
+        shift_logits = lm_logits[:, :-1, :].contiguous().float()
+        shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:]
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(
+            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+        )
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    test_end_time = time.time()
+
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+
+    total_time = test_end_time - test_start_time
+    print('Test time taken: ', total_time // 60, 'min ', total_time%60, 's'  )
+    print('Score: ', ppl.item())
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py
new file mode 100644
index 000000000..af20318be
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py
@@ -0,0 +1,82 @@
+"""
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+"""
+
+import torch
+import torch.nn as nn
+from datasets import load_dataset,load_from_disk
+
+def get_llama2(model, seqlen=2048):
+    '''If model is specified from ['7b', '13b', '70b'], then we load official pretrained model;
+       If you want to load checkpoints other than the official ones, please specifiy the model path,
+       otherwise please choose from ['7b', '13b', '70b'] for better clarity
+    '''
+
+    def skip(*args, **kwargs):
+        pass
+
+    if model in ['7b', '13b', '70b']:
+        model_path = f'/data/Models/pytorch/Llama2/Llama2_{model}_hf'
+        print(f'Getting official pretrained Llama2-{model}')
+    else:
+        model_path = model
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    from transformers import LlamaForCausalLM
+    
+    model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, offload_folder="offload/")
+
+    model.seqlen = seqlen
+    return model, model_path
+
+
+def get_loaders(dataset_name: str, enc, seqlen):
+    if dataset_name == 'wikitext2':
+        print('Loading dataset: Wikitext2')
+        testenc = load_dataset('/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py', 'wikitext-2-raw-v1', split='test', trust_remote_code=True)
+        testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt")
+    
+    return testenc
+
+
+def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size=512):
+    if data == "pileval":
+        dataset = load_from_disk('/pile_val_backup')
+    else:
+        raise NotImplementedError
+    dataset = dataset.shuffle(seed=42)
+    samples = []
+    n_run = 0
+    for data in dataset:
+        line = data["text"]
+        line = line.strip()
+        line_encoded = tokenizer.encode(line)
+        if len(line_encoded) > 512:
+            continue
+        sample = torch.tensor([line_encoded])
+        if sample.numel() == 0:
+            continue
+        samples.append(sample)
+        n_run += 1
+        if n_run == n_samples:
+            break
+    # now concatenate all samples and split according to block size
+    cat_samples = torch.cat(samples, dim=1)
+    n_split = cat_samples.shape[1] // block_size
+    print(f" * Split into {n_split} blocks")
+    return [
+        cat_samples[:, i * block_size : (i + 1) * block_size] for i in range(n_split)
+    ]
-- 
Gitee


From fbcdc1308f223968f8d6ca2269ee31a5bb8e96c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Tue, 29 Apr 2025 07:44:34 +0000
Subject: [PATCH 02/46] =?UTF-8?q?!2652=20=E3=80=90AR20241227785719?=
 =?UTF-8?q?=E3=80=91fp8/hi8=20weight=20only=20npu=20Merge=20pull=20request?=
 =?UTF-8?q?=20!2652=20from=20=E5=BC=A0=E9=91=AB/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../hif8_fp8_weight_quantization/README_CN.md | 15 +++++--
 .../src/run_llama7b_quantization.py           | 39 ++++++++++++-------
 .../hif8_fp8_weight_quantization/src/utils.py | 31 +++++----------
 3 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md
index 04b9c0973..2c5cc0108 100644
--- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/README_CN.md
@@ -8,7 +8,7 @@
 
 ### 1.2 模型和数据集准备
 
-本sample以Llama2-7b模型，pileval和wikitext2数据集为示例，请用户自行下载，并适配utils.py文件中加载数据集和模型的路径。当前sample中数据集保存目录需根据实际保存目录修改。
+本sample以Llama2-7b模型，pileval和wikitext2数据集为示例，请用户自行下载。
 
 ### 1.3 简易量化配置
 ./src/quantization.cfg文件为用户自定义的简易量化配置，具体表示信息如下：
@@ -22,13 +22,20 @@
 ## 2 FLOAT8_E4M3FN量化示例
 > 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT8_E4M3FN，如果需要HIFLOAT8仅权重量化，请适配修改quantization.cfg
 
-> 如果要验证deploy模型，需要设置save_post_quant_model接口中参数mode为'deploy'，并将生成的部署模型搬到npu上进行推理
 
 ### 2.1 使用接口方式调用
 
-请在当前目录执行如下命令运行示例程序，用户需根据实际情况修改示例程序中的模型和数据集路径：
+请在当前目录执行如下命令运行示例程序
 
-`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py`
+验证fakequant模型脚本：
+
+`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py --test_on_npu_flag=false --calibration_data=/pile_val_backup/ --verify_data=/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py --model=/data/Models/pytorch/Llama2/Llama2_7b_hf`
+
+验证deploy模型脚本（需要适配npu相关环境）：
+
+`python3 src/run_llama7b_quantization.py --test_on_npu_flag=true`
+
+> test_on_npu_flag参数表明是否生成部署模型在npu上推理，calibration_data参数为校准集路径，verify_data为验证集的路径，model为模型存放路径
 
 若出现如下信息，则说明量化成功：
 
diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
index 092238d22..2b2f14603 100644
--- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/run_llama7b_quantization.py
@@ -14,7 +14,7 @@
 # limitations under the License. 
 """
 
-
+import argparse
 import os
 import copy
 import time
@@ -74,7 +74,14 @@ def build_model_and_enc(model, model_path, gpu_num):
     return model, enc
 
 if __name__ == '__main__':
-    model, model_path = get_llama2('7b')
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--test_on_npu_flag', type=lambda x: (str(x).lower() == 'true'))
+    parser.add_argument('--calibration_data', type=str, default='/pile_val_backup')
+    parser.add_argument('--verify_data', type=str, default='/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py')
+    parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf')
+
+    args = parser.parse_args()
+    model, model_path = get_llama2(args.model)
     model = model.eval()
     copied_model = copy.deepcopy(model)
     gpu_num = torch.cuda.device_count()
@@ -92,20 +99,20 @@ if __name__ == '__main__':
     
     # Phase2: do weights calibration and generate calibration model
     samples = get_calib_dataset(
-        data="pileval", tokenizer=enc, n_samples=512, block_size=256
+        data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256
     )
     samples = torch.cat(samples, dim=0)[:1,:]
-    model.config.use_cache = False
+
     post_quant_model = amct.create_post_quant_model(config_file,
                                                     record_file,
                                                     model)
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-    post_quant_model.config.use_cache = False
+
     with torch.no_grad():
         post_quant_model(samples.to(next(post_quant_model.parameters()).device))
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     test_end_time = time.time()
     total_time = test_end_time - test_start_time
     print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's')
@@ -117,12 +124,18 @@ if __name__ == '__main__':
     model, enc = build_model_and_enc(copied_model, model_path, gpu_num)
     
     # Phase3: save fakequant model
-    testenc = get_loaders(dataset_name='wikitext2',
+    testenc = get_loaders(data_path=args.verify_data,
                         enc=enc,
                         seqlen=model.seqlen)
 
     testenc = testenc.input_ids.to(model.device)
-    fake_quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant')
+
+    if args.test_on_npu_flag:
+        quant_model = amct.save_post_quant_model(record_file, model, mode='deploy')
+        quant_model = quant_model.npu()
+    else:
+        quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant')
+
     nsamples = testenc.numel() // model.seqlen
     
     if torch.cuda.is_available():
@@ -133,12 +146,12 @@ if __name__ == '__main__':
     test_start_time = time.time()
     for i in tqdm.tqdm(range(nsamples), desc="evaluating..."):
         batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(
-            model.device
+            quant_model.device
         )
         with torch.no_grad():
-            lm_logits = fake_quant_model(batch).logits
-        shift_logits = lm_logits[:, :-1, :].contiguous().float()
-        shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:]
+            lm_logits = quant_model(batch).logits
+        shift_logits = lm_logits[:, :-1, :].contiguous().float().cpu()
+        shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:].cpu()
         loss_fct = nn.CrossEntropyLoss()
         loss = loss_fct(
             shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
diff --git a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py
index af20318be..586916fbd 100644
--- a/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py
+++ b/python/level1_single_api/9_amct/amct_pytorch/hif8_fp8_weight_quantization/src/utils.py
@@ -18,45 +18,32 @@ import torch
 import torch.nn as nn
 from datasets import load_dataset,load_from_disk
 
-def get_llama2(model, seqlen=2048):
-    '''If model is specified from ['7b', '13b', '70b'], then we load official pretrained model;
-       If you want to load checkpoints other than the official ones, please specifiy the model path,
-       otherwise please choose from ['7b', '13b', '70b'] for better clarity
-    '''
-
+def get_llama2(model_path, seqlen=2048):
     def skip(*args, **kwargs):
         pass
 
-    if model in ['7b', '13b', '70b']:
-        model_path = f'/data/Models/pytorch/Llama2/Llama2_{model}_hf'
-        print(f'Getting official pretrained Llama2-{model}')
-    else:
-        model_path = model
     torch.nn.init.kaiming_uniform_ = skip
     torch.nn.init.uniform_ = skip
     torch.nn.init.normal_ = skip
     from transformers import LlamaForCausalLM
     
-    model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, offload_folder="offload/")
+    model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, offload_folder="offload/")
 
     model.seqlen = seqlen
     return model, model_path
 
 
-def get_loaders(dataset_name: str, enc, seqlen):
-    if dataset_name == 'wikitext2':
-        print('Loading dataset: Wikitext2')
-        testenc = load_dataset('/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py', 'wikitext-2-raw-v1', split='test', trust_remote_code=True)
-        testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt")
+def get_loaders(data_path: str, enc, seqlen):
+
+    print('Loading dataset: Wikitext2')
+    testenc = load_dataset(data_path, 'wikitext-2-raw-v1', split='test', trust_remote_code=True)
+    testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt")
     
     return testenc
 
 
-def get_calib_dataset(data="pileval", tokenizer=None, n_samples=512, block_size=512):
-    if data == "pileval":
-        dataset = load_from_disk('/pile_val_backup')
-    else:
-        raise NotImplementedError
+def get_calib_dataset(data_path, tokenizer=None, n_samples=512, block_size=512):
+    dataset = load_from_disk(data_path)
     dataset = dataset.shuffle(seed=42)
     samples = []
     n_run = 0
-- 
Gitee


From 7f26cead2c9063679ce9933fd896cb31ae59fa50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AE=81?= <lining.li@huawei.com>
Date: Tue, 13 May 2025 09:26:42 +0000
Subject: [PATCH 03/46] =?UTF-8?q?!2654=20fix=20error=20in=20torch=202.1=20?=
 =?UTF-8?q?Merge=20pull=20request=20!2654=20from=20=E6=9D=8E=E5=AE=81/mast?=
 =?UTF-8?q?er?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../dataflow/plugin/torch/torch_plugin.py     | 69 ++++++++++++-------
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/inference/dataflow/py_dflow/python/dataflow/plugin/torch/torch_plugin.py b/inference/dataflow/py_dflow/python/dataflow/plugin/torch/torch_plugin.py
index 1bbf8fb88..684a859d7 100644
--- a/inference/dataflow/py_dflow/python/dataflow/plugin/torch/torch_plugin.py
+++ b/inference/dataflow/py_dflow/python/dataflow/plugin/torch/torch_plugin.py
@@ -18,6 +18,7 @@
 import functools
 import inspect
 import traceback
+import threading
 from typing import Union, List
 import dataflow.data_type as dt
 import dataflow.dataflow as df
@@ -40,47 +41,63 @@ _npu_actor_model_support_args_ = _npu_model_support_args_ + [
     "input_descs",
 ]
 
+_df_to_torch_dtype = None
+_torch_to_df_dtype = None
+_lock = threading.Lock()
 
-def _convert_df_to_torch_tensor_dtype(df_dtype):
-    import torch
-
-    df_to_torch_dtype = {
-        dt.DT_FLOAT: torch.float32,
-        dt.DT_FLOAT16: torch.float16,
-        dt.DT_BF16: torch.bfloat16,
-        dt.DT_INT8: torch.int8,
-        dt.DT_INT16: torch.int16,
-        dt.DT_UINT16: torch.uint16,
-        dt.DT_UINT8: torch.uint8,
-        dt.DT_INT32: torch.int32,
-        dt.DT_INT64: torch.int64,
-        dt.DT_UINT32: torch.uint32,
-        dt.DT_UINT64: torch.uint64,
-        dt.DT_BOOL: torch.bool,
-        dt.DT_DOUBLE: torch.float64,
-    }
-    return df_to_torch_dtype[df_dtype]
 
-
-def _convert_torch_to_df_tensor_dtype(torch_dtype):
+def _initialize_torch_to_df_dtype():
     import torch
 
-    torch_to_df_dtype = {
+    global _torch_to_df_dtype
+    global _df_to_torch_dtype
+    _torch_to_df_dtype = {
         torch.float32: dt.DT_FLOAT,
         torch.float16: dt.DT_FLOAT16,
         torch.bfloat16: dt.DT_BF16,
         torch.int8: dt.DT_INT8,
         torch.int16: dt.DT_INT16,
-        torch.uint16: dt.DT_UINT16,
         torch.uint8: dt.DT_UINT8,
         torch.int32: dt.DT_INT32,
         torch.int64: dt.DT_INT64,
-        torch.uint32: dt.DT_UINT32,
-        torch.uint64: dt.DT_UINT64,
         torch.bool: dt.DT_BOOL,
         torch.float64: dt.DT_DOUBLE,
     }
-    return torch_to_df_dtype[torch_dtype]
+    if torch.__version__ >= "2.3":
+        _torch_to_df_dtype.update(
+            {
+                torch.uint16: dt.DT_UINT16,
+                torch.uint32: dt.DT_UINT32,
+                torch.uint64: dt.DT_UINT64,
+            }
+        )
+    _df_to_torch_dtype = {v: k for k, v in _torch_to_df_dtype.items()}
+
+
+def _convert_df_to_torch_tensor_dtype(df_dtype):
+    global _df_to_torch_dtype
+    # 使用锁来确保初始化操作是线程安全的
+    if _df_to_torch_dtype is None:
+        with _lock:  # 获取锁
+            if _df_to_torch_dtype is None:  # 双重检查，确保只有一个线程初始化
+                _initialize_torch_to_df_dtype()
+
+    if df_dtype not in _df_to_torch_dtype:
+        raise ValueError(f"df_dtype {df_dtype} is not supported")
+    return _df_to_torch_dtype[df_dtype]
+
+
+def _convert_torch_to_df_tensor_dtype(torch_dtype):
+    global _torch_to_df_dtype
+    # 使用锁来确保初始化操作是线程安全的
+    if _torch_to_df_dtype is None:
+        with _lock:  # 获取锁
+            if _torch_to_df_dtype is None:  # 双重检查，确保只有一个线程初始化
+                _initialize_torch_to_df_dtype()
+
+    if torch_dtype not in _torch_to_df_dtype:
+        raise ValueError(f"torch_dtype {torch_dtype} is not supported")
+    return _torch_to_df_dtype[torch_dtype]
 
 
 def _prepare_inputs(inputs: Union[List[fw.FlowMsg]], input_num):
-- 
Gitee


From fdf8ca5fa8d01941c33dd2307dec05f02ee903f9 Mon Sep 17 00:00:00 2001
From: renjie <renjie88@huawei.com>
Date: Thu, 22 May 2025 11:43:43 +0000
Subject: [PATCH 04/46] =?UTF-8?q?!2656=20=E3=80=90tiling=E4=B8=8B=E6=B2=89?=
 =?UTF-8?q?=E6=A0=B7=E4=BE=8B=E3=80=91=E3=80=90AR20250522891845=E3=80=91Ad?=
 =?UTF-8?q?dCustomTilingSink=E6=A0=B7=E4=BE=8B=20Merge=20pull=20request=20?=
 =?UTF-8?q?!2656=20from=20renjie/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../OpImpl/AddCustomTilingSink.json           |  40 ++++++
 .../AddCustomTilingSink/OpImpl/README.md      | 117 ++++++++++++++++++
 .../tf_plugin/tensorflow_add_custom_plugin.cc |  22 ++++
 .../AddCustomTilingSink/OpImpl/install.sh     |  57 +++++++++
 .../OpImpl/op_host/add_custom_tiling_sink.cpp |  56 +++++++++
 .../op_host/add_custom_tiling_sink_tiling.cpp |  36 ++++++
 .../op_host/add_custom_tiling_sink_tiling.h   |  25 ++++
 .../op_kernel/add_custom_tiling_sink.cpp      |  95 ++++++++++++++
 8 files changed, 448 insertions(+)
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/AddCustomTilingSink.json
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/README.md
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/framework/tf_plugin/tensorflow_add_custom_plugin.cc
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink.cpp
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.cpp
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.h
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_kernel/add_custom_tiling_sink.cpp

diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/AddCustomTilingSink.json b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/AddCustomTilingSink.json
new file mode 100644
index 000000000..1d93e1f49
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/AddCustomTilingSink.json
@@ -0,0 +1,40 @@
+[
+    {
+        "op": "AddCustomTilingSink",
+        "language": "cpp",
+        "input_desc": [
+            {
+                "name": "x",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float32"
+                ]
+            },
+            {
+                "name": "y",
+                "param_type": "optional",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float32"
+                ]
+            }
+        ],
+        "output_desc": [
+            {
+                "name": "z",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float32"
+                ]
+            }
+        ]
+    }
+]
\ No newline at end of file
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/README.md b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/README.md
new file mode 100644
index 000000000..a89d51c80
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/README.md
@@ -0,0 +1,117 @@
+
+## 概述
+本样例基于AddCustom算子工程，提供了支持Tiling下沉的自定义算子开发样例。
+若要使能tiling下沉，算子tiling函数必须独立实现，详细开发指导请参考[Tiling下沉自定义算子开发指南](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_00014.html)
+
+## 目录结构介绍
+```
+├─OpImpl										// 算子实现
+│   ├─framework									// 算子插件实现文件目录
+│   ├─op_host									// host侧实现文件
+│   │   ├─add_custom_tiling_sink.cpp			// 算子原型定义、tiling函数注册等
+│   │   │ add_custom_tiling_sink_tiling.cpp		// 算子tiling函数的所有实现(必须独立实现于cpp中)
+│   │   └─add_custom_tiling_sink_tiling.h		// 算子tiling结构体定义
+│   └─op_kernel									// kernel侧实现文件
+│  AddCustomTilingSink.json						// 算子的原型定义json文件
+│  install.sh									// 脚本，调用msOpGen生成自定义算子工程，并编译
+```
+
+## 算子描述
+Add算子实现了两个数据相加，返回相加结果的功能。对应的数学表达式为：
+```
+z = x + y
+```
+## 算子规格描述
+<table>
+<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">AddCustomTilingSink</td></tr>
+</tr>
+<tr><td rowspan="3" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
+<tr><td align="center">x</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">y</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+</tr>
+<tr><td rowspan="1" align="center">算子输出</td><td align="center">z</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">add_custom_tiling_sink</td></tr>
+</table>
+
+## 支持的产品型号
+本样例支持如下产品型号：
+- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
+- Atlas A3 训练系列产品/Atlas A3 推理系列产品
+
+## 编译运行样例算子
+针对自定义算子工程，编译运行包含如下步骤：
+- 调用msOpGen工具生成自定义算子工程；
+- 完成算子host和kernel实现；
+- 编译自定义算子工程生成自定义算子包；
+- 安装自定义算子包到自定义算子库中；
+- 调用执行自定义算子；
+
+详细操作如下所示。
+### 1. 获取源码包
+编译运行此样例前，请参考[准备：获取样例代码](../README.md#codeready)获取源码包。
+
+### 2. 生成自定义算子工程，复制host和kernel实现并编译算子<a name="operatorcompile"></a>
+  - 切换到msOpGen脚本install.sh所在目录
+    ```bash
+    # 若开发者以git命令行方式clone了master分支代码，并切换目录
+    cd ${git_clone_path}/samples/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl
+    ```
+
+  - 调用脚本，生成自定义算子工程，复制host和kernel实现并编译算子
+    - 方式一：配置环境变量运行脚本
+      请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量命令。
+      - 默认路径，root用户安装CANN软件包
+        ```bash
+        export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+        ```
+      - 默认路径，非root用户安装CANN软件包
+        ```bash
+        export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+        ```
+      - 指定路径install_path，安装CANN软件包
+        ```bash
+        export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+        ```
+        运行install.sh脚本
+        ```bash
+        bash install.sh -v [SOC_VERSION]
+        ```
+    - 方式二：指定命令行安装路径来运行脚本
+      ```bash
+      bash install.sh -v [SOC_VERSION] -i [ASCEND_INSTALL_PATH]
+      ```
+    参数说明：
+    - SOC_VERSION：昇腾AI处理器型号，如果无法确定具体的[SOC_VERSION]，则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询，在查询到的“Name”前增加Ascend信息，例如“Name”对应取值为xxxyy，实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号：
+		- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
+		- Atlas A3 训练系列产品/Atlas A3 推理系列产品
+    - ASCEND_INSTALL_PATH：CANN软件包安装路径
+
+    脚本运行成功后，会在当前目录下创建CustomOp目录，编译完成后，会在CustomOp/build_out中，生成自定义算子安装包custom_opp_\<target os>_\<target architecture>.run，例如“custom_opp_ubuntu_x86_64.run”。
+
+
+### 3. 部署自定义算子包
+- 部署自定义算子包前，请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH
+    ```bash
+    echo $ASCEND_OPP_PATH
+    # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp
+
+    # 若没有，则需导出CANN环境变量
+    source [ASCEND_INSTALL_PATH]/bin/setenv.bash
+    # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
+    ```
+    参数说明：
+    - ASCEND_INSTALL_PATH：CANN软件包安装路径，一般和上一步中指定的路径保持一致
+
+- 在自定义算子安装包所在路径下，执行如下命令安装自定义算子包
+    ```bash
+    cd CustomOp/build_out
+    ./custom_opp_<target os>_<target architecture>.run
+    ```
+  命令执行成功后，自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的vendors/customize目录中。若要执行tiling下沉样例，则算子包不支持通过--install-path指定目录安装。
+
+## 更新说明
+| 时间       | 更新事项                     |
+| ---------- | ---------------------------- |
+| 2025/5/22 | 新增AddCustomTilingSink算子样例 |
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/framework/tf_plugin/tensorflow_add_custom_plugin.cc b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/framework/tf_plugin/tensorflow_add_custom_plugin.cc
new file mode 100644
index 000000000..b96757140
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/framework/tf_plugin/tensorflow_add_custom_plugin.cc
@@ -0,0 +1,22 @@
+/* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the Apache License Version 2.0.
+ * You may not use this file except in compliance with the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Apache License for more details at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#include "register/register.h"
+
+namespace domi {
+// Register op info for the AddCustomTilingSink custom op to GE (TensorFlow plugin).
+REGISTER_CUSTOM_OP("AddCustomTilingSink")
+    .FrameworkType(TENSORFLOW)   // type: CAFFE, TENSORFLOW
+    .OriginOpType("Add")      // op type name in the TF model; NOTE(review): "Add" is also TF's built-in op name - confirm intended
+    .ParseParamsByOperatorFn(AutoMappingByOpFn);
+}  // namespace domi
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh
new file mode 100644
index 000000000..5c36ce5f4
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+SHORT=v:,i:,
+LONG=soc-version:,install-path:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+
+while :; do
+    case "$1" in
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        exit 1 # abort instead of building with a half-parsed command line
+        ;;
+    esac
+done
+
+VERSION_LIST="Ascend910B1" # accepted values for -v/--soc-version
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit 1 # exit statuses must be within 0-255; "exit -1" is out of range
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source "$_ASCEND_INSTALL_PATH/bin/setenv.bash" # quoted so an install path containing spaces still works
+export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH
+
+OP_NAME=AddCustomTilingSink
+rm -rf CustomOp
+# Generate the op framework
+msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp
+# Copy op implementation files to CustomOp
+cp -rf framework CustomOp/;cp -rf op_host CustomOp/;cp -rf op_kernel CustomOp/
+# Add Device Compile Task in op_host/CMakeLists.txt
+sed -i '$a ascendc_device_library( TARGET cust_opmaster\n                        OPTION SHARED\n                        SRC ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_tiling_sink_tiling.cpp)' CustomOp/op_host/CMakeLists.txt
+# Build CustomOp project
+(cd CustomOp && bash build.sh)
\ No newline at end of file
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink.cpp
new file mode 100644
index 000000000..c88a110b0
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink.cpp
@@ -0,0 +1,56 @@
+/**
+ * @file add_custom_tiling_sink.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "add_custom_tiling_sink_tiling.h"
+
+namespace ge {
+static graphStatus InferShape(gert::InferShapeContext *context) // Shape inference: output 0 copies the shape of input 0.
+{
+    const gert::Shape *x1_shape = context->GetInputShape(0);
+    gert::Shape *y_shape = context->GetOutputShape(0);
+    *y_shape = *x1_shape; // NOTE(review): neither pointer is null-checked before this dereference
+    return GRAPH_SUCCESS;
+}
+
+static graphStatus InferDataType(gert::InferDataTypeContext *context) // Dtype inference: output 0 takes the dtype of input 0.
+{
+    const auto inputDataType = context->GetInputDataType(0);
+    context->SetOutputDataType(0, inputDataType);
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace ge
+
+namespace ops {
+class AddCustomTilingSink : public OpDef { // Op prototype: inputs x (required) and y (optional), output z; all DT_FLOAT / FORMAT_ND.
+public:
+    explicit AddCustomTilingSink(const char *name) : OpDef(name)
+    {
+        this->Input("x")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND});
+        this->Input("y")
+            .ParamType(OPTIONAL)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND})
+            .ValueDepend(OPTIONAL, DependScope::TILING); // marks input y as a tiling value dependency
+        this->Output("z")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND});
+
+        this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType); // shape and dtype inference callbacks
+
+        this->AICore().SetTiling(optiling::AddCustomSinkTilingFunc); // tiling function declared in add_custom_tiling_sink_tiling.h
+        
+        this->AICore().AddConfig("ascend910b");
+    }
+};
+OP_ADD(AddCustomTilingSink);
+} // namespace ops
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.cpp
new file mode 100644
index 000000000..32ffb8a3e
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.cpp
@@ -0,0 +1,36 @@
+/**
+ * @file add_custom_tiling_sink_tiling.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include "add_custom_tiling_sink_tiling.h"
+#include "register/device_op_impl_registry.h"
+
+namespace optiling {
+static constexpr uint32_t BLOCK_DIM = 8;
+static constexpr uint32_t TILE_NUM = 8;
+static constexpr size_t MAX_WORKSPACE_SIZE = 32; // maximum workspace size that can be obtained
+static constexpr size_t DEFAULT_WORKSPACE_SIZE = 1;
+ge::graphStatus AddCustomSinkTilingFunc(gert::TilingContext *context) // Sets block dim, tiling data and workspace size.
+{
+    TilingSinkTilingData tiling;
+    uint32_t totalLength = context->GetInputTensor(0)->GetShapeSize(); // NOTE(review): input 0 tensor is not null-checked
+    context->SetBlockDim(BLOCK_DIM);
+    tiling.set_totalLength(totalLength);
+    tiling.set_tileNum(TILE_NUM);
+    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
+    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1); // NOTE(review): result is written to without a null check
+    currentWorkspace[0] = DEFAULT_WORKSPACE_SIZE;
+    if (context->GetInputTensor(1) != nullptr && context->GetInputTensor(1)->GetData<float>() == nullptr) { // y present, no data pointer
+        currentWorkspace[0] = MAX_WORKSPACE_SIZE;
+    }
+    return ge::GRAPH_SUCCESS;
+}
+DEVICE_IMPL_OP_OPTILING(AddCustomTilingSink).Tiling(optiling::AddCustomSinkTilingFunc); // register the tiling function for tiling sink
+} // namespace optiling
\ No newline at end of file
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.h b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.h
new file mode 100644
index 000000000..3230af7ba
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.h
@@ -0,0 +1,25 @@
+/**
+ * @file add_custom_tiling_sink_tiling.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef ADD_CUSTOM_TILING_SINK_TILING_H
+#define ADD_CUSTOM_TILING_SINK_TILING_H
+#include "register/tilingdata_base.h"
+#include "register/op_def_registry.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(TilingSinkTilingData) // Tiling data filled by AddCustomSinkTilingFunc.
+TILING_DATA_FIELD_DEF(uint32_t, totalLength); // written via set_totalLength() from input 0's shape size
+TILING_DATA_FIELD_DEF(uint32_t, tileNum); // written via set_tileNum()
+END_TILING_DATA_DEF;
+
+REGISTER_TILING_DATA_CLASS(AddCustomTilingSink, TilingSinkTilingData) // binds this tiling data class to the AddCustomTilingSink op
+
+ge::graphStatus AddCustomSinkTilingFunc(gert::TilingContext* context); // defined in add_custom_tiling_sink_tiling.cpp
+} // namespace optiling
+#endif // ADD_CUSTOM_TILING_SINK_TILING_H
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_kernel/add_custom_tiling_sink.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_kernel/add_custom_tiling_sink.cpp
new file mode 100644
index 000000000..4b1cb2f1d
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_kernel/add_custom_tiling_sink.cpp
@@ -0,0 +1,95 @@
+/**
+ * @file add_custom_tiling_sink.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "kernel_operator.h"
+#include "lib/matmul_intf.h"
+namespace AscendC {
+constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue
+
+class KernelAdd { // Element-wise z = x + y over this core's slice, one tile per CopyIn/Compute/CopyOut round.
+public:
+    __aicore__ inline KernelAdd() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
+    { // Derives per-core and per-tile lengths, then binds the global buffers and queue buffers.
+        this->blockLength = totalLength / AscendC::GetBlockNum(); // integer division; any remainder is not processed
+        this->tileNum = tileNum;
+        if (tileNum == 0 || BUFFER_NUM == 0) { // guards the division below; returns before any buffer is set up
+            return;
+        }
+        this->tileLength = this->blockLength / tileNum / BUFFER_NUM; // elements moved per loop iteration
+
+        xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z));
+    }
+    __aicore__ inline void Process()
+    { // Runs tileNum * BUFFER_NUM iterations, each handling tileLength elements.
+        int32_t loopCount = this->tileNum * BUFFER_NUM;
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    { // Copies tile `progress` of x and y from global memory into local tensors and enqueues them.
+        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
+        AscendC::LocalTensor<DTYPE_Y> yLocal = inQueueY.AllocTensor<DTYPE_Y>();
+        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
+        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); // NOTE(review): y is declared optional but is read unconditionally
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    { // Dequeues one x/y tile, adds them into a z tile, enqueues z and frees the inputs; `progress` is unused here.
+        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
+        AscendC::LocalTensor<DTYPE_Y> yLocal = inQueueY.DeQue<DTYPE_Y>();
+        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
+        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
+        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    { // Dequeues the z tile, copies it to tile `progress` of the global output and frees it.
+        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
+        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueY;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<DTYPE_X> xGm;
+    AscendC::GlobalTensor<DTYPE_Y> yGm;
+    AscendC::GlobalTensor<DTYPE_Z> zGm;
+    uint32_t blockLength; // totalLength / GetBlockNum()
+    uint32_t tileNum; // tile count passed to Init()
+    uint32_t tileLength; // blockLength / tileNum / BUFFER_NUM; left unset when Init() returns early
+};
+} // namespace AscendC
+
+extern "C" __global__ __aicore__ void add_custom_tiling_sink(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
+{ // Kernel entry: reads totalLength/tileNum from the tiling data and runs KernelAdd; `workspace` is not used here.
+    GET_TILING_DATA(tiling_data, tiling);
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIC_1_2);
+    if ASCEND_IS_AIC { // returns before any add work is set up when ASCEND_IS_AIC holds
+        return;
+    }
+    AscendC::KernelAdd op;
+    op.Init(x, y, z, tiling_data.totalLength, tiling_data.tileNum);
+    op.Process();
+}
\ No newline at end of file
-- 
Gitee


From 3337bde74b5f816fef941bcd57f92714c6fbfa78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8?= <chenweiheng1@hisilicon.com>
Date: Thu, 22 May 2025 13:43:27 +0000
Subject: [PATCH 05/46] =?UTF-8?q?!2659=20[feature]torchair=20support=20til?=
 =?UTF-8?q?ing=20custom=20op=20Merge=20pull=20request=20!2659=20from=20?=
 =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../AddCustomTilingSink/AddCustom/README.md   | 322 ++++++++++++++++++
 .../AddCustom/src/add_custom.py               |  78 +++++
 .../AddCustom/test_add_custom.py              |  34 ++
 3 files changed, 434 insertions(+)
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py

diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md
new file mode 100644
index 000000000..c2f849645
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md
@@ -0,0 +1,322 @@
+## 背景介绍
+
+Tiling下沉是在Device侧CPU做Tiling计算。由于NPU中AI Core内部存储无法完全容纳算子输入输出的所有数据，需要每次搬运一部分输入数据进行计算然后搬出，再搬运下一部分输入数据进行计算，该过程称之为Tiling；根据算子的shape等信息来确定数据切分算法相关参数（比如每次搬运的块大小，以及总共循环多少次）的计算程序，称之为Tiling实现。由于Tiling实现中完成的均为标量计算，AI Core并不擅长，故一般在Host侧CPU上执行，但是满足下述条件Tiling实现会下沉到Device侧执行：
+
+- 模型为静态shape。
+- 模型中的算子支持Tiling下沉，比如FusedInferAttentionScore、IncreFlashAttention等融合算子。
+- 支持Tiling下沉的算子值有依赖，需要满足前一个算子的值有device的执行结果；如果依赖的值是Const，则不需要下沉执行Tiling，编译时会完成Tiling。
+
+## 目录结构介绍
+
+```
+├── AddCustom   // torch注册的自定义算子
+│   ├── src
+│   │   ├── add_custom.py      // 自定义算子py文件
+│   └── test_add_custom.py    // 测试脚本
+```
+
+## 代码实现介绍
+
+新增自定义算子入图步骤，该过程可参考[torchair社区新增自定义算子入图介绍](https://gitee.com/ascend/torchair/blob/master/CONTRIBUTING.md#converter%E8%A1%A5%E9%BD%90)converter补齐第五小节：
+1.下载torchair仓，新建一个add_custom.py文件放在torchair/python/torchair/ops/add_custom.py，然后自定义算子在torch框架中注册：
+
+```python
+# add_custom.py
+import torch
+
+lib = torch.library.Library("air", "FRAGMENT")
+lib.define(
+    """
+    add_custom(Tensor x, Tensor y) -> Tensor
+    """
+)
+```
+
+2.向torch注册自定义算子meta后端实现，用来完成图模式下的shape推导:
+
+```python
+@torch.library.impl(lib, "add_custom", "Meta")
+def kernel_meta(x, y):
+    return torch.empty_like(x)
+```
+
+3.codegen生成ge构图api
+（1）将REG_OP算子原型放置到codegen/custom_op/custom_reg_op.h文件中，替换原来示例的REG_OP：
+
+```cpp
+#ifndef ASCENDADAPTER2_CUSTOM_REG_OP_H
+#define ASCENDADAPTER2_CUSTOM_REG_OP_H
+#include "graph/operator_reg.h"
+
+namespace ge {
+REG_OP(AddCustomTilingsink)
+   .INPUT(x, TensorType::ALL())
+   .INPUT(y, TensorType::ALL())
+   .OUTPUT(z, TensorType::ALL())
+   .OP_END_FACTORY_REG(AddCustomTilingsink)
+}
+
+#endif  // ASCENDADAPTER2_CUSTOM_REG_OP_H
+```
+
+（2）进入torchair仓根目录执行编译命令：
+
+```
+cd build
+cmake ..
+make generate_ge_raw_custom_ops
+```
+
+生成的ge.api函数在codegen/custom\_op/auto\_generated\_ge\_raw\_custom\_ops.py文件中, 内容如下所示：
+
+```python
+# This file is auto-generated
+# Summary: total 1, generated 1, skipped 0
+from typing import Any, Dict, List, Tuple, Union, Callable, Optional
+from torchair.ge._ge_graph import auto_convert_to_tensor, TensorType
+from torchair.ge import Tensor, DataType, attr
+from torchair._ge_concrete_graph.ge_converter import ge_op, IrDef
+
+
+# This api is auto-generated from IR AddCustomTilingsink
+@auto_convert_to_tensor([False, False], [False, False], inputs_tensor_type=[TensorType.TT_ALL, TensorType.TT_ALL])
+def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
+    """REG_OP(AddCustomTilingsink)\n
+.INPUT(x, TensorType::ALL())\n
+.INPUT(y, TensorType::ALL())\n
+.OUTPUT(z, TensorType::ALL())\n
+"""
+
+    # process inputs
+    inputs = {
+        "x": x,
+        "y": y,
+    }
+
+    # process attrs
+    attrs = {
+    }
+
+    # process outputs
+    outputs = [
+    "z",
+    ]
+
+    return ge_op(
+        op_type="AddCustomTilingsink",
+        inputs=inputs,
+        attrs=attrs,
+        outputs=outputs,
+        dependencies=dependencies,
+        ir=IrDef("AddCustomTilingsink") \
+        .input("x", "") \
+        .input("y", "") \
+        .output("z" , "")
+    )
+```
+
+需要修改`from torchair._ge_concrete_graph.ge_converter import ge_op, IrDef`
+为``from torchair._ge_concrete_graph.compat_ir import ge_op, IrDef``
+
+将上述生成内容拷贝至前面我们新建的add_custom.py文件中。
+
+4.向torchair注册自定义算子的converter：
+
+```python
+@register_fx_node_ge_converter(torch.ops.air.add_custom.default)
+def convert_add_custom(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
+    return AddCustomTilingsink(x, y)
+```
+
+5.单算子部分为用户自行注册，此处预留未实现：
+
+```python
+def kernel_impl(x, y):
+    raise NotImplementedError("torch.ops.air.add_custom kernel_impl is not implemented!")
+
+
+torch.library.impl(lib, "add_custom", "CPU")(kernel_impl)
+torch.library.impl(lib, "add_custom", "PrivateUse1")(kernel_impl)
+```
+
+6.调用，需要import前面新建的add_custom.py：
+
+```python
+import torchair.ops.add_custom
+
+def forward(self, x, y):
+    z = torch.ops.air.add_custom.default(x, y)
+    return z
+```
+
+## 运行样例算子
+
+### 1. 编译安装torchair包
+
+1.编译，进入torchair根目录，执行：
+
+```
+bash build.sh -c
+```
+
+2.安装，进入torchair根目录，执行注意pip3.x为对应Python版本：
+
+```
+pip3.x uninstall torchair
+pip3.x install output/torchair_xxxx.whl
+```
+
+3.删除环境上torch_npu模块下的torchair子模块，使得我们安装的torchair模块生效：
+
+```
+rm -rf /usr/local/python3.8.1/lib/python3.8/site-packages/torch_npu/dynamo/torchair
+```
+
+查看torch_npu路径：
+
+```
+pip3.x show torch_npu
+```
+### 2. 部署自定义算子包
+请参考[tiling下沉样例](https://gitee.com/ascend/samples/tree/master/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl)部署自定义算子包章节：
+
+### 3. 执行脚本
+
+需要脚本中先打开tiling下沉的开关
+
+```python
+from torchair.configs.compiler_config import CompilerConfig
+
+config = CompilerConfig()
+config.experimental_config.tiling_schedule_optimize = True
+```
+
+## 更新说明
+
+| 时间      | 更新事项     |
+| --------- | ------------ |
+| 2025/5/22 | 新增本readme |
+
+## add_custom.py
+
+```python
+from typing import (
+    Optional,
+    Union,
+    List,
+)
+import torch
+from torchair._ge_concrete_graph.fx2ge_converter import register_fx_node_ge_converter
+from torchair.ge._ge_graph import Tensor, TensorSpec
+
+lib = torch.library.Library("air", "FRAGMENT")
+lib.define(
+    """
+    add_custom(Tensor x, Tensor y) -> Tensor
+    """
+)
+
+
+@torch.library.impl(lib, "add_custom", "Meta")
+def kernel_meta(x, y):
+    return torch.empty_like(x)
+
+
+def kernel_impl(x, y):
+    raise NotImplementedError("torch.ops.air.add_custom kernel_impl is not implemented!")
+
+
+torch.library.impl(lib, "add_custom", "CPU")(kernel_impl)
+torch.library.impl(lib, "add_custom", "PrivateUse1")(kernel_impl)
+
+
+@register_fx_node_ge_converter(torch.ops.air.add_custom.default)
+def convert_add_custom(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
+    return AddCustomTilingsink(x, y)
+
+
+# This file is auto-generated by
+# Summary: total 1, generated 1, skipped 0
+from typing import Any, Dict, List, Tuple, Union, Callable, Optional
+from torchair.ge._ge_graph import auto_convert_to_tensor, TensorType
+from torchair.ge import Tensor, DataType, attr
+from torchair._ge_concrete_graph.compat_ir import ge_op, IrDef
+
+
+# This api is auto-generated from IR AddCustomTilingsink
+@auto_convert_to_tensor([False, False], [False, False], inputs_tensor_type=[TensorType.TT_ALL, TensorType.TT_ALL])
+def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
+    """REG_OP(AddCustomTilingsink)\n
+.INPUT(x, TensorType::ALL())\n
+.INPUT(y, TensorType::ALL())\n
+.OUTPUT(z, TensorType::ALL())\n
+"""
+
+    # process inputs
+    inputs = {
+        "x": x,
+        "y": y,
+    }
+
+    # process attrs
+    attrs = {
+    }
+
+    # process outputs
+    outputs = [
+    "z",
+    ]
+
+    return ge_op(
+        op_type="AddCustomTilingsink",
+        inputs=inputs,
+        attrs=attrs,
+        outputs=outputs,
+        dependencies=dependencies,
+        ir=IrDef("AddCustomTilingsink") \
+        .input("x", "") \
+        .input("y", "") \
+        .output("z" , "")
+    )
+
+```
+
+## test_add_custom.py
+
+```python
+import torch
+import torch_npu
+import torchair
+from torchair.configs.compiler_config import CompilerConfig
+from torchair.core.utils import logger
+import logging
+
+logger.setLevel(logging.DEBUG)
+config = CompilerConfig()
+config.debug.graph_dump.type = "pbtxt"
+config.experimental_config.tiling_schedule_optimize = True
+npu_backend = torchair.get_npu_backend(compiler_config=config)
+
+import torchair.ops._add_custom
+
+class MyModule(torch.nn.Module):
+    def __init__(self):
+        super(MyModule, self).__init__()
+
+    def forward(self, x, y):
+        z = torch.ops.air.add_custom.default(x, y)
+        return z
+
+
+# 创建并编译模块
+module = MyModule().npu()
+module = torch.compile(module, fullgraph=True, backend=npu_backend, dynamic=False)
+
+# 示例输入
+x = torch.randn(6, 64, dtype=torch.float32).npu()
+y = torch.randn(6, 64, dtype=torch.float32).npu()
+
+output = module(x, y)
+print(output)
+
+```
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py
new file mode 100644
index 000000000..4dd84002b
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py
@@ -0,0 +1,78 @@
+from typing import (
+    Optional,
+    Union,
+    List,
+)
+import torch
+from torchair._ge_concrete_graph.fx2ge_converter import register_fx_node_ge_converter
+from torchair.ge._ge_graph import Tensor, TensorSpec
+
+lib = torch.library.Library("air", "FRAGMENT")
+lib.define(
+    """
+    add_custom(Tensor x, Tensor y) -> Tensor
+    """
+)
+
+
+@torch.library.impl(lib, "add_custom", "Meta")
+def kernel_meta(x, y):
+    return torch.empty_like(x)
+
+
+def kernel_impl(x, y):
+    raise NotImplementedError("torch.ops.air.add_custom kernel_impl is not implemented!")
+
+
+torch.library.impl(lib, "add_custom", "CPU")(kernel_impl)
+torch.library.impl(lib, "add_custom", "PrivateUse1")(kernel_impl)
+
+
+@register_fx_node_ge_converter(torch.ops.air.add_custom.default)
+def convert_add_custom(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
+    return AddCustomTilingsink(x, y)
+
+
+# This file is auto-generated by
+# Summary: total 1, generated 1, skipped 0
+from typing import Any, Dict, List, Tuple, Union, Callable, Optional
+from torchair.ge._ge_graph import auto_convert_to_tensor, TensorType
+from torchair.ge import Tensor, DataType, attr
+from torchair._ge_concrete_graph.compat_ir import ge_op, IrDef
+
+
+# This api is auto-generated from IR AddCustomTilingsink
+@auto_convert_to_tensor([False, False], [False, False], inputs_tensor_type=[TensorType.TT_ALL, TensorType.TT_ALL])
+def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
+    """REG_OP(AddCustomTilingsink)\n
+.INPUT(x, TensorType::ALL())\n
+.INPUT(y, TensorType::ALL())\n
+.OUTPUT(z, TensorType::ALL())\n
+"""
+
+    # process inputs
+    inputs = {
+        "x": x,
+        "y": y,
+    }
+
+    # process attrs
+    attrs = {
+    }
+
+    # process outputs
+    outputs = [
+    "z",
+    ]
+
+    return ge_op(
+        op_type="AddCustomTilingsink",
+        inputs=inputs,
+        attrs=attrs,
+        outputs=outputs,
+        dependencies=dependencies,
+        ir=IrDef("AddCustomTilingsink") \
+        .input("x", "") \
+        .input("y", "") \
+        .output("z" , "")
+    )
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py
new file mode 100644
index 000000000..c093d75b8
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py
@@ -0,0 +1,34 @@
+import torch
+import torch_npu
+import torchair
+from torchair.configs.compiler_config import CompilerConfig
+from torchair.core.utils import logger
+import logging
+
+logger.setLevel(logging.DEBUG)
+config = CompilerConfig()
+config.debug.graph_dump.type = "pbtxt"
+config.experimental_config.tiling_schedule_optimize = True
+npu_backend = torchair.get_npu_backend(compiler_config=config)
+
+import torchair.ops._add_custom
+
+class MyModule(torch.nn.Module):
+    def __init__(self):
+        super(MyModule, self).__init__()
+
+    def forward(self, x, y):
+        z = torch.ops.air.add_custom.default(x, y)
+        return z
+
+
+# 创建并编译模块
+module = MyModule().npu()
+module = torch.compile(module, fullgraph=True, backend=npu_backend, dynamic=False)
+
+# 示例输入
+x = torch.randn(6, 64, dtype=torch.float32).npu()
+y = torch.randn(6, 64, dtype=torch.float32).npu()
+
+output = module(x, y)
+print(output.shape)
\ No newline at end of file
-- 
Gitee


From e474a6d7c2f4cb9b4e43f9d1b576c860d881891b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8?= <chenweiheng1@hisilicon.com>
Date: Fri, 23 May 2025 09:04:23 +0000
Subject: [PATCH 06/46] =?UTF-8?q?!2661=20[feature]fix=20add=5Fcustom=20Mer?=
 =?UTF-8?q?ge=20pull=20request=20!2661=20from=20=E9=99=88=E5=A8=81?=
 =?UTF-8?q?=E4=BA=A8/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../AddCustomTilingSink/AddCustom/README.md   | 39 ++++++++++---------
 .../AddCustom/src/add_custom.py               | 12 +++---
 .../AddCustom/test_add_custom.py              |  2 +-
 3 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md
index c2f849645..217cbfad8 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md
@@ -18,7 +18,7 @@ Tiling下沉是在Device侧CPU做Tiling计算。由于NPU中AI Core内部存储
 ## 代码实现介绍
 
 新增自定义算子入图步骤，该过程可参考[torchair社区新增自定义算子入图介绍](https://gitee.com/ascend/torchair/blob/master/CONTRIBUTING.md#converter%E8%A1%A5%E9%BD%90)converter补齐第五小节：
-1.下载torchair仓，新建一个add_custom.py文件放在torchair/python/torchair/ops/add_custom.py，然后自定义算子在torch框架中注册：
+1.下载[torchair仓](https://gitee.com/ascend/torchair)，新建一个add_custom.py文件放在torchair/python/torchair/ops/add_custom.py，然后在torch框架中注册自定义算子：
 
 ```python
 # add_custom.py
@@ -49,11 +49,11 @@ lib.define(
 #include "graph/operator_reg.h"
 
 namespace ge {
-REG_OP(AddCustomTilingsink)
+REG_OP(AddCustomTilingSink)
    .INPUT(x, TensorType::ALL())
    .INPUT(y, TensorType::ALL())
    .OUTPUT(z, TensorType::ALL())
-   .OP_END_FACTORY_REG(AddCustomTilingsink)
+   .OP_END_FACTORY_REG(AddCustomTilingSink)
 }
 
 #endif  // ASCENDADAPTER2_CUSTOM_REG_OP_H
@@ -78,10 +78,10 @@ from torchair.ge import Tensor, DataType, attr
 from torchair._ge_concrete_graph.ge_converter import ge_op, IrDef
 
 
-# This api is auto-generated from IR AddCustomTilingsink
+# This api is auto-generated from IR AddCustomTilingSink
 @auto_convert_to_tensor([False, False], [False, False], inputs_tensor_type=[TensorType.TT_ALL, TensorType.TT_ALL])
-def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
-    """REG_OP(AddCustomTilingsink)\n
+def AddCustomTilingSink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
+    """REG_OP(AddCustomTilingSink)\n
 .INPUT(x, TensorType::ALL())\n
 .INPUT(y, TensorType::ALL())\n
 .OUTPUT(z, TensorType::ALL())\n
@@ -103,12 +103,12 @@ def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None
     ]
 
     return ge_op(
-        op_type="AddCustomTilingsink",
+        op_type="AddCustomTilingSink",
         inputs=inputs,
         attrs=attrs,
         outputs=outputs,
         dependencies=dependencies,
-        ir=IrDef("AddCustomTilingsink") \
+        ir=IrDef("AddCustomTilingSink") \
         .input("x", "") \
         .input("y", "") \
         .output("z" , "")
@@ -116,7 +116,7 @@ def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None
 ```
 
 需要修改`from torchair._ge_concrete_graph.ge_converter import ge_op, IrDef`
-为``from torchair._ge_concrete_graph.compat_ir import ge_op, IrDef``
+为`from torchair._ge_concrete_graph.compat_ir import ge_op, IrDef`
 
 将上述生成内容拷贝至前面我们新建的add_custom.py文件中。
 
@@ -125,7 +125,7 @@ def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None
 ```python
 @register_fx_node_ge_converter(torch.ops.air.add_custom.default)
 def convert_add_custom(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
-    return AddCustomTilingsink(x, y)
+    return AddCustomTilingSink(x, y)
 ```
 
 5.单算子部分为用户自行注册，此处预留未实现：
@@ -139,7 +139,7 @@ torch.library.impl(lib, "add_custom", "CPU")(kernel_impl)
 torch.library.impl(lib, "add_custom", "PrivateUse1")(kernel_impl)
 ```
 
-6.调用，需要import前面新建的add_custom.py：
+6.调用时，需要import前面新建的add_custom.py：
 
 ```python
 import torchair.ops.add_custom
@@ -172,11 +172,12 @@ pip3.x install output/torchair_xxxx.whl
 rm -rf /usr/local/python3.8.1/lib/python3.8/site-packages/torch_npu/dynamo/torchair
 ```
 
-查看torch_npu路径：
+查看环境上安装的torch_npu的路径：
 
 ```
 pip3.x show torch_npu
 ```
+
 ### 2. 部署自定义算子包
 请参考[tiling下沉样例](https://gitee.com/ascend/samples/tree/master/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl)部署自定义算子包章节：
 
@@ -232,7 +233,7 @@ torch.library.impl(lib, "add_custom", "PrivateUse1")(kernel_impl)
 
 @register_fx_node_ge_converter(torch.ops.air.add_custom.default)
 def convert_add_custom(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
-    return AddCustomTilingsink(x, y)
+    return AddCustomTilingSink(x, y)
 
 
 # This file is auto-generated by
@@ -243,10 +244,10 @@ from torchair.ge import Tensor, DataType, attr
 from torchair._ge_concrete_graph.compat_ir import ge_op, IrDef
 
 
-# This api is auto-generated from IR AddCustomTilingsink
+# This api is auto-generated from IR AddCustomTilingSink
 @auto_convert_to_tensor([False, False], [False, False], inputs_tensor_type=[TensorType.TT_ALL, TensorType.TT_ALL])
-def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
-    """REG_OP(AddCustomTilingsink)\n
+def AddCustomTilingSink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
+    """REG_OP(AddCustomTilingSink)\n
 .INPUT(x, TensorType::ALL())\n
 .INPUT(y, TensorType::ALL())\n
 .OUTPUT(z, TensorType::ALL())\n
@@ -268,12 +269,12 @@ def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None
     ]
 
     return ge_op(
-        op_type="AddCustomTilingsink",
+        op_type="AddCustomTilingSink",
         inputs=inputs,
         attrs=attrs,
         outputs=outputs,
         dependencies=dependencies,
-        ir=IrDef("AddCustomTilingsink") \
+        ir=IrDef("AddCustomTilingSink") \
         .input("x", "") \
         .input("y", "") \
         .output("z" , "")
@@ -297,7 +298,7 @@ config.debug.graph_dump.type = "pbtxt"
 config.experimental_config.tiling_schedule_optimize = True
 npu_backend = torchair.get_npu_backend(compiler_config=config)
 
-import torchair.ops._add_custom
+import torchair.ops.add_custom
 
 class MyModule(torch.nn.Module):
     def __init__(self):
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py
index 4dd84002b..dc73f0b07 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py
@@ -30,7 +30,7 @@ torch.library.impl(lib, "add_custom", "PrivateUse1")(kernel_impl)
 
 @register_fx_node_ge_converter(torch.ops.air.add_custom.default)
 def convert_add_custom(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
-    return AddCustomTilingsink(x, y)
+    return AddCustomTilingSink(x, y)
 
 
 # This file is auto-generated by
@@ -41,10 +41,10 @@ from torchair.ge import Tensor, DataType, attr
 from torchair._ge_concrete_graph.compat_ir import ge_op, IrDef
 
 
-# This api is auto-generated from IR AddCustomTilingsink
+# This api is auto-generated from IR AddCustomTilingSink
 @auto_convert_to_tensor([False, False], [False, False], inputs_tensor_type=[TensorType.TT_ALL, TensorType.TT_ALL])
-def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
-    """REG_OP(AddCustomTilingsink)\n
+def AddCustomTilingSink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
+    """REG_OP(AddCustomTilingSink)\n
 .INPUT(x, TensorType::ALL())\n
 .INPUT(y, TensorType::ALL())\n
 .OUTPUT(z, TensorType::ALL())\n
@@ -66,12 +66,12 @@ def AddCustomTilingsink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None
     ]
 
     return ge_op(
-        op_type="AddCustomTilingsink",
+        op_type="AddCustomTilingSink",
         inputs=inputs,
         attrs=attrs,
         outputs=outputs,
         dependencies=dependencies,
-        ir=IrDef("AddCustomTilingsink") \
+        ir=IrDef("AddCustomTilingSink") \
         .input("x", "") \
         .input("y", "") \
         .output("z" , "")
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py
index c093d75b8..81bba97bb 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py
@@ -11,7 +11,7 @@ config.debug.graph_dump.type = "pbtxt"
 config.experimental_config.tiling_schedule_optimize = True
 npu_backend = torchair.get_npu_backend(compiler_config=config)
 
-import torchair.ops._add_custom
+import torchair.ops.add_custom
 
 class MyModule(torch.nn.Module):
     def __init__(self):
-- 
Gitee


From a4f649d8020d8d7841ef71a12cf3558f97b4a0d6 Mon Sep 17 00:00:00 2001
From: renjie <renjie88@huawei.com>
Date: Fri, 23 May 2025 09:17:21 +0000
Subject: [PATCH 07/46] =?UTF-8?q?!2662=20=E5=88=A0=E9=99=A4=E6=A0=A1?=
 =?UTF-8?q?=E9=AA=8C=20*=20=E5=88=A0=E9=99=A4=E6=A0=A1=E9=AA=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh     | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh
index 5c36ce5f4..d4ee2aa9a 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh
@@ -25,11 +25,6 @@ while :; do
     esac
 done
 
-VERSION_LIST="Ascend910B1"
-if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
-    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
-    exit -1
-fi
 
 if [ -n "$ASCEND_INSTALL_PATH" ]; then
     _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
-- 
Gitee


From 06fb25b16e417d0a6d4e607fa7eeba414be88e20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=87=E7=A5=96=E6=B6=9B?= <wanzutao1@h-partners.com>
Date: Mon, 26 May 2025 06:30:09 +0000
Subject: [PATCH 08/46] =?UTF-8?q?!2655=20kernel=E7=9B=B4=E8=B0=83AddCustom?=
 =?UTF-8?q?=E6=A0=B7=E4=BE=8B=E6=9B=B4=E6=96=B0=20Merge=20pull=20request?=
 =?UTF-8?q?=20!2655=20from=20=E4=B8=87=E7=A5=96=E6=B6=9B/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../AddCustomKernel/CMakeLists.txt            |  30 +++
 .../KernelLaunch/AddCustomKernel/README.md    |  77 +++++++
 .../AddCustomKernel/add_custom.cpp            |  84 ++++++++
 .../examples/CPPInvocation/CMakeLists.txt     |  34 +++
 .../examples/CPPInvocation/README.md          |  40 ++++
 .../examples/CPPInvocation/data_utils.h       | 204 ++++++++++++++++++
 .../examples/CPPInvocation/main.cpp           |  62 ++++++
 .../examples/CPPInvocation/run.sh             |  47 ++++
 .../CPPInvocation/scripts/gen_data.py         |  25 +++
 .../CPPInvocation/scripts/verify_result.py    |  53 +++++
 .../KernelLaunch/AddCustomKernel/run.sh       |  79 +++++++
 .../AddCustomTilingKernel/CMakeLists.txt      |  30 +++
 .../AddCustomTilingKernel/README.md           |  78 +++++++
 .../AddCustomTilingKernel/add_custom.cpp      |  92 ++++++++
 .../AddCustomTilingKernel/add_custom_tiling.h |  15 ++
 .../examples/CPPInvocation/CMakeLists.txt     |  35 +++
 .../examples/CPPInvocation/README.md          |  40 ++++
 .../examples/CPPInvocation/data_utils.h       | 203 +++++++++++++++++
 .../examples/CPPInvocation/main.cpp           |  63 ++++++
 .../examples/CPPInvocation/run.sh             |  47 ++++
 .../CPPInvocation/scripts/gen_data.py         |  25 +++
 .../CPPInvocation/scripts/verify_result.py    |  53 +++++
 .../examples/PythonInvocation/CMakeLists.txt  |  53 +++++
 .../examples/PythonInvocation/README.md       |  69 ++++++
 .../PythonInvocation/add_custom_test.py       |  38 ++++
 .../examples/PythonInvocation/pybind11.cpp    |  40 ++++
 .../examples/PythonInvocation/run.sh          |  25 +++
 .../KernelLaunch/AddCustomTilingKernel/run.sh |  79 +++++++
 operator_contrib/AddCustomSample/README.md    | 112 ++++++++++
 29 files changed, 1832 insertions(+)
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/CMakeLists.txt
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/README.md
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/add_custom.cpp
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/CMakeLists.txt
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/README.md
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/data_utils.h
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/main.cpp
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/run.sh
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/gen_data.py
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/verify_result.py
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/run.sh
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/CMakeLists.txt
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/README.md
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/add_custom.cpp
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/add_custom_tiling.h
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/CMakeLists.txt
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/README.md
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/data_utils.h
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/main.cpp
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/run.sh
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/gen_data.py
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/verify_result.py
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/CMakeLists.txt
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/README.md
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/add_custom_test.py
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/pybind11.cpp
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/run.sh
 create mode 100644 operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/run.sh
 create mode 100644 operator_contrib/AddCustomSample/README.md

diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/CMakeLists.txt b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/CMakeLists.txt
new file mode 100644
index 000000000..a3a200642
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.16.0)
+project(Ascend_C)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+file(GLOB SOURCES "*.cpp")
+# user-defined configuration: cache entries declared without FORCE can be overridden with -D<name>=<value>
+set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type")
+set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory")
+set(RUN_MODE "npu" CACHE STRING "run mode: npu")
+set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) # NOTE(review): FORCE overwrites a -DCMAKE_BUILD_TYPE given on the command line
+set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) # NOTE(review): FORCE likewise pins the install prefix to ./out
+set(LIBRARY_TYPE "SHARED" CACHE STRING "library type:SHARED or STATIC")
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) # probe the known locations of the AscendC cmake helpers, first match wins
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.")
+endif()
+
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+# ascendc_library builds the kernel sources into an AscendC library named "kernels" (shared or static per LIBRARY_TYPE)
+if(LIBRARY_TYPE STREQUAL "SHARED")
+    ascendc_library(kernels SHARED ${SOURCES})
+else()
+    ascendc_library(kernels STATIC ${SOURCES})
+endif()
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/README.md b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/README.md
new file mode 100644
index 000000000..449abe628
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/README.md
@@ -0,0 +1,77 @@
+## `AddCustom`自定义算子样例说明
+
+本样例通过`Ascend C`编程语言实现了`AddCustom`算子不带Tiling场景。
+
+### 算子描述
+
+`AddCustom`算子返回两个数据相加的结果。
+
+### 算子规格描述
+
+| 算子类型(OpType) | AddCustom  |          |           |        |
+| ---------------- | ---------- | -------- | --------- | ------ |
+| 算子输入         | name       | shape    | data type | format |
+| x                | 8 * 2048   | float16  | ND        |        |
+| y                | 8 * 2048   | float16  | ND        |        |
+| 算子输出         | z          | 8 * 2048 | float16   | ND     |
+| 核函数名         | add_custom |          |           |        |
+
+### 支持的产品型号
+
+本样例支持如下产品型号：
+
+- Atlas 训练系列产品
+- Atlas 推理系列产品
+- Atlas A2训练系列产品
+- Atlas 800I A2推理产品
+- Atlas 200I/500 A2推理产品
+
+### 目录结构介绍
+
+```
+├── examples                     // 调用示例目录
+├── add_custom.cpp               // 算子kernel代码
+├── CMakeLists.txt               // cmake编译文件
+├── run.sh                       // 运行脚本
+└── README.md                   // 样例指导手册 
+```
+
+### 环境要求
+
+编译运行此样例前，请参考[《CANN软件安装指南》](https://gitee.com/link?target=https%3A%2F%2Fhiascend.com%2Fdocument%2Fredirect%2FCannCommunityInstSoftware)完成开发运行环境的部署。
+
+### 算子包编译部署
+
+1.进入到样例目录
+
+```
+cd ${git_clone_path}/samples/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel
+```
+
+2.算子编译部署
+
+- 打包动态库部署
+
+  ```
+  bash run.sh -l SHARED -v Ascend***(由npu-smi info查询得到)
+  ```
+
+- 打包静态库部署
+
+  ```
+  bash run.sh -l STATIC -v Ascend***(由npu-smi info查询得到)
+  ```
+
+  
+
+### 算子调用
+
+| 目录                                                         | 描述                                     |
+| ------------------------------------------------------------ | ---------------------------------------- |
+| [CPPInvocation](./examples/CPPInvocation) | Pybind方式调用AddCustom算子。            |
+
+### 更新说明
+
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/01/06 | 新增本readme |
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/add_custom.cpp b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/add_custom.cpp
new file mode 100644
index 000000000..eb662e8aa
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/add_custom.cpp
@@ -0,0 +1,84 @@
+/**
+ * @file add_custom.cpp
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+ #include "kernel_operator.h"
+
+ constexpr int32_t TOTAL_LENGTH = 8 * 2048;                            // total number of half elements in each tensor
+ constexpr int32_t USE_CORE_NUM = 8;                                   // number of cores the data is split across
+ constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;         // number of elements processed by each core
+ constexpr int32_t TILE_NUM = 8;                                       // split data into 8 tiles for each core
+ constexpr int32_t BUFFER_NUM = 2;                                     // number of buffers backing each queue
+ constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // elements per copy/compute step; each tile is split in 2 for double buffering
+ 
+ class KernelAdd { // element-wise add of two half tensors; each core streams its own slice through the queues in TILE_LENGTH chunks
+ public:
+     __aicore__ inline KernelAdd() {}
+     __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) // x, y: input addresses, z: output address; all reinterpreted as __gm__ half arrays
+     {
+         xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); // this core's slice starts at BLOCK_LENGTH * block index
+         yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH);
+         zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH);
+         pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); // BUFFER_NUM buffers per queue, each sized for TILE_LENGTH half values
+         pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+         pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+     }
+     __aicore__ inline void Process() // runs CopyIn -> Compute -> CopyOut once per chunk of this core's slice
+     {
+         int32_t loopCount = TILE_NUM * BUFFER_NUM; // BLOCK_LENGTH / TILE_LENGTH chunks in total
+         for (int32_t i = 0; i < loopCount; i++) {
+             CopyIn(i);
+             Compute(i);
+             CopyOut(i);
+         }
+     }
+ 
+ private:
+     __aicore__ inline void CopyIn(int32_t progress) // progress: index of the chunk inside this core's slice
+     {
+         AscendC::LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+         AscendC::LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+         AscendC::DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); // copy chunk `progress` of x into the local tensor
+         AscendC::DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
+         inQueueX.EnQue(xLocal); // hand the filled tensors over to Compute
+         inQueueY.EnQue(yLocal);
+     }
+     __aicore__ inline void Compute(int32_t progress) // progress is unused here: operands are taken from the queues
+     {
+         AscendC::LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+         AscendC::LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+         AscendC::LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+         AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); // zLocal = xLocal + yLocal over TILE_LENGTH elements
+         outQueueZ.EnQue<half>(zLocal); // hand the result over to CopyOut
+         inQueueX.FreeTensor(xLocal); // give the input tensors back to their queues
+         inQueueY.FreeTensor(yLocal);
+     }
+     __aicore__ inline void CopyOut(int32_t progress) // progress: index of the chunk to write back into z
+     {
+         AscendC::LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+         AscendC::DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); // store the result chunk into this core's slice of z
+         outQueueZ.FreeTensor(zLocal);
+     }
+ 
+ private:
+     AscendC::TPipe pipe; // supplies the buffers of the three queues (see InitBuffer calls in Init)
+     AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY; // input queues for x and y
+     AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueZ; // output queue for z
+     AscendC::GlobalTensor<half> xGm; // this core's BLOCK_LENGTH-element view of x
+     AscendC::GlobalTensor<half> yGm; // this core's BLOCK_LENGTH-element view of y
+     AscendC::GlobalTensor<half> zGm; // this core's BLOCK_LENGTH-element view of z
+ };
+ 
+ extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) // kernel entry: computes z = x + y
+ {
+     KernelAdd op;
+     op.Init(x, y, z); // bind this core's slices of x, y, z and set up the queues
+     op.Process();     // run the chunked copy-in / add / copy-out loop
+ }
+ 
+ 
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/CMakeLists.txt b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/CMakeLists.txt
new file mode 100644
index 000000000..ccef61311
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/CMakeLists.txt
@@ -0,0 +1,34 @@
+cmake_minimum_required(VERSION 3.16)
+project(Ascend_c)
+
+set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest"
+    CACHE STRING "ASCEND CANN package installation directory"
+)
+if(NOT CMAKE_BUILD_TYPE) # default to Debug only when the caller did not choose a build type
+    set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
+endif()
+if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) # redirect the default prefix to ./out; an explicit prefix is kept
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
+endif()
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--copy-dt-needed-entries")
+add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp)
+
+target_compile_options(ascendc_kernels_bbit PRIVATE
+    -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror
+)
+target_link_directories(ascendc_kernels_bbit PRIVATE
+  ${ASCEND_CANN_PACKAGE_PATH}/lib64
+)
+target_include_directories(ascendc_kernels_bbit PRIVATE
+  ${ASCEND_CANN_PACKAGE_PATH}/include
+)
+target_link_libraries(ascendc_kernels_bbit PRIVATE
+    ascendcl
+    kernels # NOTE(review): presumably the "kernels" library produced by the kernel-side CMakeLists; confirm how run.sh makes it visible to the linker
+)
+
+install(TARGETS ascendc_kernels_bbit
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/README.md b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/README.md
new file mode 100644
index 000000000..df0672608
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/README.md
@@ -0,0 +1,40 @@
+## 概述
+
+通过C++接口调用核函数直调实现的不带Tiling的AddCustom算子
+
+## 目录结构介绍
+
+```
+├── CPPInvocation
+│   ├── scripts
+        └── gen_data.py       // 输入数据和标杆数据构造脚本
+        └── verify_result.py  // 标杆数据和自定义算子输出数据对比脚本
+│   ├── CMakeLists.txt        // cmake编译文件
+│   ├── main.cpp              // 算子调用代码
+│   ├── data_utils.h          // 数据类型定义,数据读取代码
+│   ├── run.sh                // 编译运行算子的脚本
+```
+
+## 运行样例算子
+
+  **请确保已根据算子包编译部署步骤完成本算子的编译部署动作。**
+
+  - 进入样例代码所在路径
+
+  ```bash
+ cd ${git_clone_path}/samples/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation
+  ```
+
+  - 样例执行
+
+    样例执行过程中会自动生成测试数据，然后编译与运行C++调用样例，最后打印运行结果。
+
+    ```bash
+    bash run.sh
+    ```
+
+## 更新说明
+
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/05/19 | 样例首次提交 |
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/data_utils.h b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/data_utils.h
new file mode 100644
index 000000000..ae9cf84f9
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/data_utils.h
@@ -0,0 +1,204 @@
+/**
+ * @file data_utils.h
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+ #ifndef DATA_UTILS_H
+ #define DATA_UTILS_H
+ #include <fcntl.h>
+ #include <sys/stat.h>
+ #include <unistd.h>
+ 
+ #include <cassert>
+ #include <cstdio>
+ #include <fstream>
+ #include <iomanip>
+ #include <iostream>
+ #include <string>
+ #include <vector>
+ 
+ #include "acl/acl.h"
+ 
+ typedef enum { // element-type tag passed to PrintData to choose how a raw buffer is interpreted
+     DT_UNDEFINED = -1, // no case in PrintData: reported as unsupported
+     FLOAT = 0,
+     HALF = 1, // printed through DoPrintHalfData (aclFloat16 converted to float)
+     INT8_T = 2,
+     INT32_T = 3,
+     UINT8_T = 4,
+     INT16_T = 6,
+     UINT16_T = 7,
+     UINT32_T = 8,
+     INT64_T = 9,
+     UINT64_T = 10,
+     DOUBLE = 11,
+     BOOL = 12,
+     STRING = 13, // no case in PrintData: reported as unsupported
+     COMPLEX64 = 16, // no case in PrintData: reported as unsupported
+     COMPLEX128 = 17, // no case in PrintData: reported as unsupported
+     BF16 = 27 // no case in PrintData: reported as unsupported
+ } printDataType;
+ 
+ #define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO]  " fmt "\n", ##args) // printf-style; fmt must be a string literal because it is concatenated with the prefix
+ #define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN]  " fmt "\n", ##args) // same as INFO_LOG with a [WARN] prefix
+ #define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR]  " fmt "\n", ##args) // same as INFO_LOG with an [ERROR] prefix; note it also writes to stdout
+ #define CHECK_ACL(x)                                                                        \
+     do {                                                                                    \
+         aclError __ret = x;                                                                 \
+         if (__ret != ACL_ERROR_NONE) {                                                      \
+             std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+         }                                                                                   \
+     } while (0);
+ 
+ /**
+  * @brief Read a whole regular file into a caller-provided buffer
+  * @param [in] filePath: path of the file to read
+  * @param [out] fileSize: number of bytes read; written only on success
+  * @param [out] buffer: destination memory, must be able to hold bufferSize bytes
+  * @param [in] bufferSize: capacity of buffer in bytes; larger files are rejected
+  * @return true on success, false (after logging the reason) on any failure
+  */
+ inline bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize)
+ { // inline: the definition lives in a header, so it must be safe to include from several translation units
+     struct stat sBuf;
+     if (stat(filePath.c_str(), &sBuf) == -1) {
+         ERROR_LOG("failed to get file");
+         return false;
+     }
+     if (S_ISREG(sBuf.st_mode) == 0) {
+         ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+         return false;
+     }
+ 
+     std::ifstream file(filePath, std::ios::binary); // closed by its destructor on every return path
+     if (!file.is_open()) {
+         ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+         return false;
+     }
+ 
+     std::filebuf *buf = file.rdbuf();
+     const std::streamoff endPos = buf->pubseekoff(0, std::ios::end, std::ios::in); // kept signed: -1 signals a failed seek
+     if (endPos <= 0) {
+         ERROR_LOG("%s", endPos == 0 ? "file size is 0" : "failed to get file size");
+         return false;
+     }
+     if (static_cast<size_t>(endPos) > bufferSize) {
+         ERROR_LOG("file size is larger than buffer size");
+         return false;
+     }
+     buf->pubseekpos(0, std::ios::in);
+     if (buf->sgetn(static_cast<char *>(buffer), endPos) != endPos) { // a short read must not be reported as success
+         ERROR_LOG("Read file failed. path = %s", filePath.c_str());
+         return false;
+     }
+     fileSize = static_cast<size_t>(endPos);
+     return true;
+ }
+ 
+ /**
+  * @brief Write data to file; the file is created if missing and truncated otherwise
+  * @param [in] filePath: file path
+  * @param [in] buffer: data to write to file, must not be nullptr
+  * @param [in] size: size to write, in bytes
+  * @return true only if all size bytes were written by the single write() call
+  */
+ inline bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+ { // inline: the definition lives in a header, so it must be safe to include from several translation units
+     if (buffer == nullptr) {
+         ERROR_LOG("Write file failed. buffer is nullptr");
+         return false;
+     }
+ 
+     int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); // S_IWUSR is the POSIX name of the legacy S_IWRITE
+     if (fd < 0) {
+         ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+         return false;
+     }
+ 
+     const ssize_t writeSize = write(fd, buffer, size); // kept signed: write() reports failure as -1
+     (void)close(fd);
+     if (writeSize < 0 || static_cast<size_t>(writeSize) != size) {
+         ERROR_LOG("Write file Failed.");
+         return false;
+     }
+ 
+     return true;
+ }
+ 
+ template <typename T> void DoPrintData(const T *data, size_t count, size_t elementsPerRow)
+ { // prints count elements to std::cout, width 10 each, ending the line after every elementsPerRow values
+     assert(elementsPerRow != 0);
+     for (size_t i = 0; i < count; ++i) {
+         std::cout << std::setw(10) << +data[i]; // unary + promotes int8_t/uint8_t so they print as numbers, not characters
+         if (i % elementsPerRow == elementsPerRow - 1) {
+             std::cout << std::endl;
+         }
+     }
+ }
+ 
+ inline void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow)
+ { // prints count half values converted to float; inline because the definition lives in a header
+     assert(elementsPerRow != 0);
+     for (size_t i = 0; i < count; ++i) {
+         std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]);
+         if (i % elementsPerRow == elementsPerRow - 1) { // end the line after every elementsPerRow values
+             std::cout << std::endl;
+         }
+     }
+ }
+ 
+ inline void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16)
+ { // prints count elements of data interpreted as dataType; inline because the definition lives in a header
+     if (data == nullptr) {
+         ERROR_LOG("Print data failed. data is nullptr");
+         return;
+     }
+ 
+     switch (dataType) { // each case reinterprets the raw buffer as the matching element type
+         case BOOL:
+             DoPrintData(static_cast<const bool *>(data), count, elementsPerRow);
+             break;
+         case INT8_T:
+             DoPrintData(static_cast<const int8_t *>(data), count, elementsPerRow);
+             break;
+         case UINT8_T:
+             DoPrintData(static_cast<const uint8_t *>(data), count, elementsPerRow);
+             break;
+         case INT16_T:
+             DoPrintData(static_cast<const int16_t *>(data), count, elementsPerRow);
+             break;
+         case UINT16_T:
+             DoPrintData(static_cast<const uint16_t *>(data), count, elementsPerRow);
+             break;
+         case INT32_T:
+             DoPrintData(static_cast<const int32_t *>(data), count, elementsPerRow);
+             break;
+         case UINT32_T:
+             DoPrintData(static_cast<const uint32_t *>(data), count, elementsPerRow);
+             break;
+         case INT64_T:
+             DoPrintData(static_cast<const int64_t *>(data), count, elementsPerRow);
+             break;
+         case UINT64_T:
+             DoPrintData(static_cast<const uint64_t *>(data), count, elementsPerRow);
+             break;
+         case HALF: // half values need an explicit conversion to float before printing
+             DoPrintHalfData(static_cast<const aclFloat16 *>(data), count, elementsPerRow);
+             break;
+         case FLOAT:
+             DoPrintData(static_cast<const float *>(data), count, elementsPerRow);
+             break;
+         case DOUBLE:
+             DoPrintData(static_cast<const double *>(data), count, elementsPerRow);
+             break;
+         default: // every other tag (e.g. STRING, BF16) is only reported, nothing is printed
+             ERROR_LOG("Unsupported type: %d", dataType);
+     }
+     std::cout << std::endl;
+ }
+ #endif // DATA_UTILS_H
+ 
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/main.cpp b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/main.cpp
new file mode 100644
index 000000000..78b372929
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/main.cpp
@@ -0,0 +1,62 @@
+/**
+ * @file main.cpp
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+ #include "data_utils.h"
+ #include "acl/acl.h"
+ #include "aclrtlaunch_add_custom.h"
+ 
+ int32_t main(int32_t argc, char *argv[])
+ {
+    uint32_t blockDim = 8;
+    size_t inputByteSize = 8 * 2048 * sizeof(uint16_t);
+    size_t outputByteSize = 8 * 2048 * sizeof(uint16_t);
+
+    CHECK_ACL(aclInit(nullptr));
+    int32_t deviceId = 0;
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    aclrtStream stream = nullptr;
+    CHECK_ACL(aclrtCreateStream(&stream));
+
+    void *xHost, *yHost, *zHost;
+    void *xDevice, *yDevice, *zDevice;
+
+    CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputByteSize));
+    CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputByteSize));
+    CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize));
+    CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+    ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize);
+    ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize);
+
+    CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    // add_custom_do(blockDim, stream, xDevice, yDevice, zDevice);
+    ACLRT_LAUNCH_KERNEL(add_custom)
+    (blockDim, stream, xDevice, yDevice, zDevice);
+    CHECK_ACL(aclrtSynchronizeStream(stream));
+
+    CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
+    WriteFile("./output/output_z.bin", zHost, outputByteSize);
+
+    CHECK_ACL(aclrtFree(xDevice));
+    CHECK_ACL(aclrtFree(yDevice));
+    CHECK_ACL(aclrtFree(zDevice));
+    CHECK_ACL(aclrtFreeHost(xHost));
+    CHECK_ACL(aclrtFreeHost(yHost));
+    CHECK_ACL(aclrtFreeHost(zHost));
+
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+    return 0;
+ }
+ 
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/run.sh b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/run.sh
new file mode 100644
index 000000000..9e80ad982
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/run.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+CURRENT_DIR=$(
+    cd $(dirname ${BASH_SOURCE:-$0})
+    pwd
+)
+cd $CURRENT_DIR
+
+BUILD_TYPE="Debug"
+INSTALL_PREFIX="${CURRENT_DIR}/out"
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+
+export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH}
+source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
+set -e
+rm -rf build out
+mkdir -p build
+cmake -B build \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
+    -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH}
+cmake --build build -j
+cmake --install build
+
+rm -f ascendc_kernels_bbit
+cp ./out/bin/ascendc_kernels_bbit ./
+rm -rf input output
+mkdir -p input output
+python3 scripts/gen_data.py
+(
+    export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH
+    msprof op --application=./ascendc_kernels_bbit
+    
+)
+md5sum output/*.bin
+python3 scripts/verify_result.py output/output_z.bin output/golden.bin
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/gen_data.py b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/gen_data.py
new file mode 100644
index 000000000..ea8ce828a
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/gen_data.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import numpy as np
+
+
+def gen_golden_data_simple():
+    input_x = np.random.uniform(1, 100, [8, 2048]).astype(np.float16)
+    input_y = np.random.uniform(1, 100, [8, 2048]).astype(np.float16)
+    golden = (input_x + input_y).astype(np.float16)
+
+    input_x.tofile("./input/input_x.bin")
+    input_y.tofile("./input/input_y.bin")
+    golden.tofile("./output/golden.bin")
+
+
+if __name__ == "__main__":
+    gen_golden_data_simple()
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/verify_result.py b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/verify_result.py
new file mode 100644
index 000000000..1a21d809a
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/verify_result.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import sys
+import numpy as np
+
+# for float16
+relative_tol = 1e-3
+absolute_tol = 1e-5
+error_tol = 1e-3
+
+
+def verify_result(output, golden):
+    output = np.fromfile(output, dtype=np.float16).reshape(-1)
+    golden = np.fromfile(golden, dtype=np.float16).reshape(-1)
+    different_element_results = np.isclose(output,
+                                           golden,
+                                           rtol=relative_tol,
+                                           atol=absolute_tol,
+                                           equal_nan=True)
+    different_element_indexes = np.where(different_element_results == False)[0]
+    for index in range(len(different_element_indexes)):
+        real_index = different_element_indexes[index]
+        golden_data = golden[real_index]
+        output_data = output[real_index]
+        print(
+            "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" %
+            (real_index, golden_data, output_data,
+             abs(output_data - golden_data) / golden_data))
+        if index == 100:
+            break
+    error_ratio = float(different_element_indexes.size) / golden.size
+    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    return error_ratio <= error_tol
+
+
+if __name__ == '__main__':
+    try:
+        res = verify_result(sys.argv[1], sys.argv[2])
+        if not res:
+            raise ValueError("[ERROR] result error")
+        else:
+            print("test pass")
+    except Exception as e:
+        print(e)
+        sys.exit(1)
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/run.sh b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/run.sh
new file mode 100644
index 000000000..cdb2e0be7
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/run.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Work from the script's own directory so the relative build/ and out/ paths below resolve correctly.
+CURRENT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")" && pwd)
+cd "$CURRENT_DIR"
+BUILD_TYPE="Debug"
+LIBRARY_TYPE="SHARED"
+INSTALL_PREFIX="${CURRENT_DIR}/out"
+RUN_MODE="npu"
+# getopt syntax: a trailing ':' means the option takes a value; short options are not comma-separated.
+SHORT=v:l:
+LONG=soc-version:,library-type:
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+SOC_VERSION="Ascend310P3"
+
+while :; do
+    case "$1" in
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -l | --library-type)
+        LIBRARY_TYPE="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
+    esac
+done
+
+
+VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4"
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
+
+LIBRARY_LIST="SHARED STATIC"
+if [[ " $LIBRARY_LIST " != *" $LIBRARY_TYPE "* ]]; then
+    echo "ERROR: LIBRARY_TYPE should be in [$LIBRARY_LIST]"
+    exit -1
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+
+export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH}
+echo "Current compile soc version is ${SOC_VERSION}"
+source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
+set -e
+rm -rf build out
+mkdir -p build
+cmake -B build \
+    -DRUN_MODE=${RUN_MODE} \
+    -DSOC_VERSION=${SOC_VERSION} \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
+    -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} \
+    -DLIBRARY_TYPE=${LIBRARY_TYPE}
+cmake --build build -j
+cmake --install build
+cp -rf out/lib/libkernels.* /usr/lib
+cp -rf out/include/kernels/aclrtlaunch_add_custom.h /usr/include
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/CMakeLists.txt b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/CMakeLists.txt
new file mode 100644
index 000000000..a3a200642
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.16.0)
+project(Ascend_C)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+file(GLOB SOURCES "*.cpp")
+# user-defined configuration
+set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type")
+set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory")
+set(RUN_MODE "npu" CACHE STRING "run mode: npu")
+set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
+set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
+set(LIBRARY_TYPE "SHARED" CACHE STRING "library type:SHARED or STATIC")
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.")
+endif()
+
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+# ascendc_library use to add kernel file to generate ascendc library
+if(LIBRARY_TYPE STREQUAL "SHARED")
+    ascendc_library(kernels SHARED ${SOURCES})
+else()
+    ascendc_library(kernels STATIC ${SOURCES})
+endif()
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/README.md b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/README.md
new file mode 100644
index 000000000..1992e4ed2
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/README.md
@@ -0,0 +1,78 @@
+## `AddCustom`自定义算子样例说明
+
+本样例通过`Ascend C`编程语言实现了`AddCustom`算子带Tiling场景。
+
+### 算子描述
+
+`AddCustom`算子返回两个数据相加的结果。
+
+### 算子规格描述
+
+| 算子类型(OpType) | AddCustom  |          |           |        |
+| ---------------- | ---------- | -------- | --------- | ------ |
+| 算子输入         | name       | shape    | data type | format |
+| x                | 8 * 2048   | float16  | ND        |        |
+| y                | 8 * 2048   | float16  | ND        |        |
+| 算子输出         | z          | 8 * 2048 | float16   | ND     |
+| 核函数名         | add_custom |          |           |        |
+
+### 支持的产品型号
+
+本样例支持如下产品型号：
+
+- Atlas 训练系列产品
+- Atlas 推理系列产品
+- Atlas A2训练系列产品
+- Atlas 800I A2推理产品
+- Atlas 200I/500 A2推理产品
+
+### 目录结构介绍
+
+```
+├── examples                     // 调用示例目录
+├── add_custom_tiling.h          // 算子tiling结构体定义
+├── add_custom.cpp               // 算子kernel代码
+├── CMakeLists.txt               // cmake编译文件
+├── run.sh                       // 运行脚本
+└── README.md                   // 样例指导手册 
+```
+
+### 环境要求
+
+编译运行此样例前，请参考[《CANN软件安装指南》](https://gitee.com/link?target=https%3A%2F%2Fhiascend.com%2Fdocument%2Fredirect%2FCannCommunityInstSoftware)完成开发运行环境的部署。
+
+### 算子包编译部署
+
+1.进入到样例目录
+
+```
+cd ${git_clone_path}/samples/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel
+```
+
+2.算子编译部署
+
+- 打包动态库部署
+
+  ```
+  bash run.sh -l SHARED -v Ascend***(由npu-smi info查询得到)
+  ```
+
+- 打包静态库部署
+
+  ```
+  bash run.sh -l STATIC -v Ascend***(由npu-smi info查询得到)
+  ```
+
+  
+
+### 算子调用
+
+| 目录                                                         | 描述                                     |
+| ------------------------------------------------------------ | ---------------------------------------- |
+| [PythonInvocation](./examples/PythonInvocation) | 通过Python调用的方式调用AddCustom算子。 |
+
+### 更新说明
+
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/01/06 | 新增本readme |
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/add_custom.cpp b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/add_custom.cpp
new file mode 100644
index 000000000..35196ea70
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/add_custom.cpp
@@ -0,0 +1,92 @@
+/**
+ * @file add_custom.cpp
+ *
+ * Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "add_custom_tiling.h"
+#include "kernel_operator.h"
+constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue
+class KernelAdd {
+public:
+    __aicore__ inline KernelAdd() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
+    {
+        this->blockLength = totalLength / AscendC::GetBlockNum();
+        this->tileNum = tileNum;
+        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
+
+        xGm.SetGlobalBuffer((__gm__ half *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        yGm.SetGlobalBuffer((__gm__ half *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        zGm.SetGlobalBuffer((__gm__ half *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(half));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(half));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(half));
+    }
+    __aicore__ inline void Process()
+    {
+        int32_t loopCount = this->tileNum * BUFFER_NUM;
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        AscendC::LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+        AscendC::LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
+        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    {
+        AscendC::LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+        AscendC::LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+        AscendC::LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
+        outQueueZ.EnQue<half>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        AscendC::LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<half> xGm;
+    AscendC::GlobalTensor<half> yGm;
+    AscendC::GlobalTensor<half> zGm;
+    uint32_t blockLength;
+    uint32_t tileNum;
+    uint32_t tileLength;
+};
+
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTiling tiling)
+{
+    KernelAdd op;
+    op.Init(x, y, z, tiling.totalLength, tiling.tileNum);
+    op.Process();
+}
+
+AddCustomTiling* GenerateAddCustomTiling(uint32_t totalLength) // builds the tiling description for totalLength elements
+{
+    AddCustomTiling* tiling = new AddCustomTiling(); // never freed in this function: the caller receives ownership
+    uint32_t tileNum = 8; // fixed tile count stored into the tiling
+    tiling->totalLength = totalLength;
+    tiling->tileNum = tileNum;
+    return tiling;
+}
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/add_custom_tiling.h b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/add_custom_tiling.h
new file mode 100644
index 000000000..9ab640d51
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/add_custom_tiling.h
@@ -0,0 +1,15 @@
+/**
+ * @file add_custom_tiling.h
+ *
+ * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#pragma once
+#include <cstdint> // uint32_t
+struct AddCustomTiling { // tiling parameters passed to the add_custom kernel
+    uint32_t tileNum;     // tile count; add_custom.cpp divides each block's length by it (and by BUFFER_NUM)
+    uint32_t totalLength; // total element count; add_custom.cpp divides it by the block count
+};
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/CMakeLists.txt b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/CMakeLists.txt
new file mode 100644
index 000000000..63e29f1bb
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/CMakeLists.txt
@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.16)
+project(Ascend_c)
+
+set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest"
+    CACHE STRING "ASCEND CANN package installation directory"
+)
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--copy-dt-needed-entries")
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
+endif()
+if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
+endif()
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp)
+
+target_compile_options(ascendc_kernels_bbit PRIVATE
+    -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror
+)
+target_link_directories(ascendc_kernels_bbit PRIVATE
+  ${ASCEND_CANN_PACKAGE_PATH}/lib64
+)
+target_include_directories(ascendc_kernels_bbit PRIVATE
+  ${ASCEND_CANN_PACKAGE_PATH}/include
+)
+target_link_libraries(ascendc_kernels_bbit PRIVATE
+    ascendcl
+    kernels
+)
+
+install(TARGETS ascendc_kernels_bbit
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/README.md b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/README.md
new file mode 100644
index 000000000..585b04f13
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/README.md
@@ -0,0 +1,40 @@
+## 概述
+
+通过C++接口调用核函数直调实现的带Tiling的AddCustom算子
+
+## 目录结构介绍
+
+```
+├── CPPInvocation
+│   ├── scripts
+│   │   ├── gen_data.py       // 输入数据和标杆数据构造脚本
+│   │   └── verify_result.py  // 标杆数据和自定义算子输出数据对比脚本
+│   ├── CMakeLists.txt        // cmake编译文件
+│   ├── main.cpp              // 算子调用代码
+│   ├── data_utils.h          // 数据类型定义,数据读取代码
+│   └── run.sh                // 编译运行算子的脚本
+```
+
+## 运行样例算子
+
+  **请确保已根据算子包编译部署步骤完成本算子的编译部署动作。**
+
+  - 进入样例代码所在路径
+
+  ```bash
+ cd ${git_clone_path}/samples/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation
+  ```
+
+  - 样例执行
+
+    样例执行过程中会自动生成测试数据，然后编译与运行C++调用样例，最后打印运行结果。
+
+    ```bash
+    bash run.sh
+    ```
+
+## 更新说明
+
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/05/19 | 样例首次提交 |
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/data_utils.h b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/data_utils.h
new file mode 100644
index 000000000..fb1363721
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/data_utils.h
@@ -0,0 +1,203 @@
+/**
+* @file data_utils.h
+*
+* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/
+#ifndef DATA_UTILS_H
+#define DATA_UTILS_H
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cstdio>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+typedef enum {
+    DT_UNDEFINED = -1,
+    FLOAT = 0,
+    HALF = 1,
+    INT8_T = 2,
+    INT32_T = 3,
+    UINT8_T = 4,
+    INT16_T = 6,
+    UINT16_T = 7,
+    UINT32_T = 8,
+    INT64_T = 9,
+    UINT64_T = 10,
+    DOUBLE = 11,
+    BOOL = 12,
+    STRING = 13,
+    COMPLEX64 = 16,
+    COMPLEX128 = 17,
+    BF16 = 27
+} printDataType;
+
+#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO]  " fmt "\n", ##args)
+#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN]  " fmt "\n", ##args)
+#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR]  " fmt "\n", ##args)
+#define CHECK_ACL(x)                                                                        \
+    do {                                                                                    \
+        aclError __ret = (x);                                                               \
+        if (__ret != ACL_ERROR_NONE) {                                                      \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+        }                                                                                   \
+    } while (0) // no ';' here: the caller's own semicolon ends the statement (safe inside if/else)
+
+/**
+* @brief Read a whole file into a caller-provided buffer
+* @param [in] filePath: file path
+* @param [out] fileSize: number of bytes read; written only on success
+* @param [out] buffer: destination memory for the file content
+* @param [in] bufferSize: capacity of buffer in bytes
+* @return true on success, false on any failure (an error is logged)
+*/
+bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize)
+{
+    struct stat sBuf;
+    if (stat(filePath.c_str(), &sBuf) == -1) {
+        ERROR_LOG("failed to get file");
+        return false;
+    }
+    if (S_ISREG(sBuf.st_mode) == 0) {
+        ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+        return false;
+    }
+
+    std::ifstream file(filePath, std::ios::binary);
+    if (!file.is_open()) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    std::filebuf *buf = file.rdbuf();
+    size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in);
+    if (size == 0) {
+        ERROR_LOG("file size is 0");
+        return false;
+    }
+    if (size > bufferSize) {
+        ERROR_LOG("file size is larger than buffer size");
+        return false;
+    }
+    buf->pubseekpos(0, std::ios::in);
+    // A null buffer or a short read must not be reported as success; the stream closes itself on return.
+    if (buffer == nullptr || buf->sgetn(static_cast<char *>(buffer), size) != static_cast<std::streamsize>(size)) {
+        ERROR_LOG("Read file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    fileSize = size;
+    return true;
+}
+
+/**
+* @brief Write data to file
+* @param [in] filePath: file path
+* @param [in] buffer: data to write to file
+* @param [in] size: size to write
+* @return write result
+*/
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Write file failed. buffer is nullptr");
+        return false;
+    }
+
+    int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE);
+    if (fd < 0) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    size_t writeSize = write(fd, buffer, size);
+    (void)close(fd);
+    if (writeSize != size) {
+        ERROR_LOG("Write file Failed.");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> void DoPrintData(const T *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << data[i];
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]);
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16)
+{
+    if (data == nullptr) {
+        ERROR_LOG("Print data failed. data is nullptr");
+        return;
+    }
+
+    switch (dataType) {
+        case BOOL:
+            DoPrintData(reinterpret_cast<const bool *>(data), count, elementsPerRow);
+            break;
+        case INT8_T:
+            DoPrintData(reinterpret_cast<const int8_t *>(data), count, elementsPerRow);
+            break;
+        case UINT8_T:
+            DoPrintData(reinterpret_cast<const uint8_t *>(data), count, elementsPerRow);
+            break;
+        case INT16_T:
+            DoPrintData(reinterpret_cast<const int16_t *>(data), count, elementsPerRow);
+            break;
+        case UINT16_T:
+            DoPrintData(reinterpret_cast<const uint16_t *>(data), count, elementsPerRow);
+            break;
+        case INT32_T:
+            DoPrintData(reinterpret_cast<const int32_t *>(data), count, elementsPerRow);
+            break;
+        case UINT32_T:
+            DoPrintData(reinterpret_cast<const uint32_t *>(data), count, elementsPerRow);
+            break;
+        case INT64_T:
+            DoPrintData(reinterpret_cast<const int64_t *>(data), count, elementsPerRow);
+            break;
+        case UINT64_T:
+            DoPrintData(reinterpret_cast<const uint64_t *>(data), count, elementsPerRow);
+            break;
+        case HALF:
+            DoPrintHalfData(reinterpret_cast<const aclFloat16 *>(data), count, elementsPerRow);
+            break;
+        case FLOAT:
+            DoPrintData(reinterpret_cast<const float *>(data), count, elementsPerRow);
+            break;
+        case DOUBLE:
+            DoPrintData(reinterpret_cast<const double *>(data), count, elementsPerRow);
+            break;
+        default:
+            ERROR_LOG("Unsupported type: %d", dataType);
+    }
+    std::cout << std::endl;
+}
+#endif // DATA_UTILS_H
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/main.cpp b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/main.cpp
new file mode 100644
index 000000000..a00e0b2bd
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/main.cpp
@@ -0,0 +1,63 @@
+/**
+* @file main.cpp
+*
+* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/
+#include "data_utils.h"
+#include "acl/acl.h"
+#include "add_custom_tiling.h"
+#include "aclrtlaunch_add_custom.h"
+extern AddCustomTiling* GenerateAddCustomTiling(uint32_t totalLength);
+int32_t main(int32_t argc, char *argv[])
+{
+   uint32_t blockDim = 8;
+   size_t inputByteSize = 8 * 2048 * sizeof(uint16_t);
+   size_t outputByteSize = 8 * 2048 * sizeof(uint16_t);
+   uint32_t totalLength = 8 * 2048; // element count, typed to match GenerateAddCustomTiling's parameter
+
+   CHECK_ACL(aclInit(nullptr));
+   int32_t deviceId = 0;
+   CHECK_ACL(aclrtSetDevice(deviceId));
+   aclrtStream stream = nullptr;
+   CHECK_ACL(aclrtCreateStream(&stream));
+   // CHECK_ACL only logs failures, so keep the pointers null rather than indeterminate if an allocation fails.
+   void *xHost = nullptr, *yHost = nullptr, *zHost = nullptr;
+   void *xDevice = nullptr, *yDevice = nullptr, *zDevice = nullptr;
+
+   CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputByteSize));
+   CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputByteSize));
+   CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize));
+   CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+   CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+   CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+   ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize);
+   ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize);
+
+   CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
+   CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
+   // GenerateAddCustomTiling returns an object allocated with new; this function owns it from here on.
+   AddCustomTiling* tiling = GenerateAddCustomTiling(totalLength);
+   ACLRT_LAUNCH_KERNEL(add_custom)(blockDim, stream, xDevice, yDevice, zDevice, tiling);
+   CHECK_ACL(aclrtSynchronizeStream(stream));
+   delete tiling; // released only after the stream has been synchronized
+
+   CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
+   WriteFile("./output/output_z.bin", zHost, outputByteSize);
+
+   CHECK_ACL(aclrtFree(xDevice));
+   CHECK_ACL(aclrtFree(yDevice));
+   CHECK_ACL(aclrtFree(zDevice));
+   CHECK_ACL(aclrtFreeHost(xHost));
+   CHECK_ACL(aclrtFreeHost(yHost));
+   CHECK_ACL(aclrtFreeHost(zHost));
+
+   CHECK_ACL(aclrtDestroyStream(stream));
+   CHECK_ACL(aclrtResetDevice(deviceId));
+   CHECK_ACL(aclFinalize());
+   return 0;
+}
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/run.sh b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/run.sh
new file mode 100644
index 000000000..600568cff
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/run.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+CURRENT_DIR=$(
+    cd $(dirname ${BASH_SOURCE:-$0})
+    pwd
+)
+cd $CURRENT_DIR
+
+BUILD_TYPE="Debug"
+INSTALL_PREFIX="${CURRENT_DIR}/out"
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+
+export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH}
+source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
+set -e
+rm -rf build out
+mkdir -p build
+cmake -B build \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
+    -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH}
+cmake --build build -j
+cmake --install build
+
+rm -f ascendc_kernels_bbit
+cp ./out/bin/ascendc_kernels_bbit ./
+rm -rf input output
+mkdir -p input output
+python3 scripts/gen_data.py
+(
+    export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH
+    #msprof op --application=./ascendc_kernels_bbit
+    ./ascendc_kernels_bbit
+)
+md5sum output/*.bin
+python3 scripts/verify_result.py output/output_z.bin output/golden.bin
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/gen_data.py b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/gen_data.py
new file mode 100644
index 000000000..ea8ce828a
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/gen_data.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import numpy as np
+
+
+def gen_golden_data_simple():
+    input_x = np.random.uniform(1, 100, [8, 2048]).astype(np.float16)
+    input_y = np.random.uniform(1, 100, [8, 2048]).astype(np.float16)
+    golden = (input_x + input_y).astype(np.float16)
+
+    input_x.tofile("./input/input_x.bin")
+    input_y.tofile("./input/input_y.bin")
+    golden.tofile("./output/golden.bin")
+
+
+if __name__ == "__main__":
+    gen_golden_data_simple()
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/verify_result.py b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/verify_result.py
new file mode 100644
index 000000000..1a21d809a
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/verify_result.py
@@ -0,0 +1,73 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import sys
+import numpy as np
+
+# Tolerances for float16 comparison.
+relative_tol = 1e-3
+absolute_tol = 1e-5
+error_tol = 1e-3
+# Upper bound on the number of mismatching elements printed individually.
+max_reported_mismatches = 100
+
+
+def verify_result(output, golden):
+    """Check a float16 result file against a float16 golden file.
+
+    Args:
+        output: path of the binary file produced by the operator.
+        golden: path of the binary file holding the expected values.
+
+    Returns:
+        True when the share of elements outside the rtol/atol tolerance is
+        at most error_tol, False otherwise.
+
+    Raises:
+        ValueError: if the golden file is empty or the two files do not
+            contain the same number of elements.
+    """
+    output = np.fromfile(output, dtype=np.float16).reshape(-1)
+    golden = np.fromfile(golden, dtype=np.float16).reshape(-1)
+    # Fail loudly instead of dividing by zero or letting numpy broadcast a
+    # one-element file against the whole golden array.
+    if golden.size == 0 or output.size != golden.size:
+        raise ValueError("[ERROR] invalid element count, output: %d, golden: %d" %
+                         (output.size, golden.size))
+    is_close = np.isclose(output,
+                          golden,
+                          rtol=relative_tol,
+                          atol=absolute_tol,
+                          equal_nan=True)
+    different_element_indexes = np.flatnonzero(~is_close)
+    for real_index in different_element_indexes[:max_reported_mismatches]:
+        golden_data = float(golden[real_index])
+        output_data = float(output[real_index])
+        abs_diff = abs(output_data - golden_data)
+        # Relative difference is undefined for a zero reference value.
+        rdiff = abs_diff / abs(golden_data) if golden_data != 0 else float("inf")
+        print(
+            "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" %
+            (real_index, golden_data, output_data, rdiff))
+    error_ratio = float(different_element_indexes.size) / golden.size
+    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    return error_ratio <= error_tol
+
+
+if __name__ == '__main__':
+    try:
+        res = verify_result(sys.argv[1], sys.argv[2])
+        if not res:
+            raise ValueError("[ERROR] result error")
+        else:
+            print("test pass")
+    except Exception as e:
+        print(e)
+        sys.exit(1)
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/CMakeLists.txt b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/CMakeLists.txt
new file mode 100644
index 000000000..549a603f8
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/CMakeLists.txt
@@ -0,0 +1,53 @@
+cmake_minimum_required(VERSION 3.16.0)
+project(Ascend_C)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+file(GLOB SOURCES "*.cpp")
+# user-defined configuration
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--copy-dt-needed-entries")
+set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory")
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+add_library(pybind11_lib SHARED pybind11.cpp)
+target_link_libraries(pybind11_lib PRIVATE
+  kernels
+  torch_npu
+)
+execute_process(COMMAND python3 -c "import os; import torch; print(os.path.dirname(torch.__file__))"
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE TORCH_PATH
+)
+message("TORCH_PATH is ${TORCH_PATH}")
+set(ENV{ASCEND_HOME_PATH} ${ASCEND_CANN_PACKAGE_PATH})
+execute_process(COMMAND python3 -c "import os; import torch_npu; print(os.path.dirname(torch_npu.__file__))"
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE TORCH_NPU_PATH
+)
+message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
+target_link_directories(pybind11_lib PRIVATE
+  ${TORCH_PATH}/lib
+  ${TORCH_NPU_PATH}/lib
+)
+target_include_directories(pybind11_lib PRIVATE
+  ${TORCH_NPU_PATH}/include
+  ${TORCH_PATH}/include
+  ${TORCH_PATH}/include/torch/csrc/api/include
+  ${ASCEND_CANN_PACKAGE_PATH}/include
+)
+execute_process(COMMAND python3 -m pybind11 --includes
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE PYBIND11_INC
+)
+string(REPLACE " " ";" PYBIND11_INC ${PYBIND11_INC})
+target_compile_options(pybind11_lib PRIVATE
+  ${PYBIND11_INC}
+  -D_GLIBCXX_USE_CXX11_ABI=0
+)
+
+execute_process(COMMAND python3-config --extension-suffix
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE PYBIND11_SUFFIX
+)
+set_target_properties(pybind11_lib PROPERTIES
+  OUTPUT_NAME add_custom${PYBIND11_SUFFIX}
+  PREFIX "" SUFFIX ""
+)
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/README.md b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/README.md
new file mode 100644
index 000000000..41b5db466
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/README.md
@@ -0,0 +1,69 @@
+## 概述
+
+通过Python接口调用核函数直调实现的带Tiling的AddCustom算子
+
+## 目录结构介绍
+
+```
+├── PythonInvocation
+│   ├── add_custom_test.py    // add_custom python调用测试代码 
+│   ├── CMakeLists.txt        // cmake编译文件
+│   ├── pybind11.cpp          // pybind绑定核函数和python接口代码
+│   ├── run.sh                // 编译运行算子的脚本
+│   ├── README.md             // 样例指导手册
+```
+
+## 运行样例算子
+  - 安装pytorch (这里使用2.1.0版本为例)
+
+    **aarch64:**
+
+    ```bash
+    pip3 install torch==2.1.0
+    ```
+
+    **x86:**
+
+    ```bash
+    pip3 install torch==2.1.0+cpu  --index-url https://download.pytorch.org/whl/cpu
+    ```
+
+  - 安装torch-npu （以Pytorch2.1.0、python3.9、CANN版本8.0.RC1.alpha002为例）
+
+    ```bash
+    git clone https://gitee.com/ascend/pytorch.git -b v6.0.rc1.alpha002-pytorch2.1.0
+    cd pytorch/
+    bash ci/build.sh --python=3.9
+    pip3 install dist/*.whl
+    ```
+
+    安装pybind11
+    ```bash
+    pip3 install pybind11
+    ```
+    安装expecttest
+    ```bash
+    pip3 install expecttest
+    ```
+
+  **请确保已根据算子包编译部署步骤完成本算子的编译部署动作。**
+
+  - 进入样例代码所在路径
+
+  ```bash
+ cd ${git_clone_path}/samples/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation
+  ```
+
+  - 样例执行
+
+    样例执行过程中会自动安装pybind11并编译Python扩展模块，然后运行add_custom_test.py测试脚本，最后打印运行结果。
+
+    ```bash
+    bash run.sh
+    ```
+
+## 更新说明
+
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/05/19 | 样例首次提交 |
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/add_custom_test.py b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/add_custom_test.py
new file mode 100644
index 000000000..efdda2537
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/add_custom_test.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import torch
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import sys, os
+
+sys.path.append(os.getcwd())
+import add_custom
+
+torch.npu.config.allow_internal_format = False
+
+
+class TestCustomAdd(TestCase):
+
+    def test_add_custom_ops(self):
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float16)
+        y = torch.rand(length, device='cpu', dtype=torch.float16)
+
+        x_npu = x.npu()
+        y_npu = y.npu()
+        output = add_custom.run_add_custom(x_npu, y_npu)
+        cpuout = torch.add(x, y)
+
+        self.assertRtolEqual(output, cpuout)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/pybind11.cpp b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/pybind11.cpp
new file mode 100644
index 000000000..629782a55
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/pybind11.cpp
@@ -0,0 +1,66 @@
+/**
+ * @file pybind11.cpp
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+ #include <cstdint>
+ #include <limits>
+ 
+ #include <pybind11/pybind11.h>
+ #include <torch/extension.h>
+ #include "add_custom_tiling.h"
+ #include "aclrtlaunch_add_custom.h"
+ 
+ #include "torch_npu/csrc/core/npu/NPUStream.h"
+ // Tiling factory declared here and defined outside this file.
+ extern AddCustomTiling* GenerateAddCustomTiling(uint32_t totalLength);
+ namespace my_add {
+ /**
+  * @brief Launch the add_custom kernel on the current NPU stream for inputs x and y.
+  *
+  * @param x First input tensor; must be contiguous with zero storage offset.
+  * @param y Second input tensor; must match x in shape and dtype, with the same layout rules.
+  * @return A new tensor created with at::empty_like(x) that is passed to the kernel as its output buffer.
+  *
+  * This function does not call any stream synchronization itself.
+  */
+ at::Tensor run_add_custom(const at::Tensor &x, const at::Tensor &y)
+ {
+     // The kernel receives raw storage pointers and one shared element count, so
+     // mismatched, strided or offset inputs would make it touch the wrong memory.
+     TORCH_CHECK(x.sizes() == y.sizes(), "run_add_custom: x and y must have the same shape");
+     TORCH_CHECK(x.scalar_type() == y.scalar_type(), "run_add_custom: x and y must have the same dtype");
+     TORCH_CHECK(x.is_contiguous() && y.is_contiguous(), "run_add_custom: x and y must be contiguous");
+     TORCH_CHECK(x.storage_offset() == 0 && y.storage_offset() == 0,
+                 "run_add_custom: x and y must start at the beginning of their storage");
+     const int64_t elementCount = x.numel();
+     TORCH_CHECK(elementCount <= static_cast<int64_t>(std::numeric_limits<uint32_t>::max()),
+                 "run_add_custom: tensor has too many elements for a 32-bit length");
+ 
+     auto acl_stream = c10_npu::getCurrentNPUStream().stream(false);
+     at::Tensor z = at::empty_like(x);
+     if (elementCount == 0) {
+         return z; // nothing to compute
+     }
+     uint32_t blockDim = 8;
+     const auto totalLength = static_cast<uint32_t>(elementCount);
+     // NOTE(review): ownership of the returned tiling object is not visible here; confirm who frees it.
+     AddCustomTiling* tiling = GenerateAddCustomTiling(totalLength);
+     TORCH_CHECK(tiling != nullptr, "run_add_custom: failed to generate tiling data");
+     ACLRT_LAUNCH_KERNEL(add_custom)
+     (blockDim, acl_stream, const_cast<void *>(x.storage().data()), const_cast<void *>(y.storage().data()),
+      const_cast<void *>(z.storage().data()), tiling);
+     return z;
+ }
+ } // namespace my_add
+ 
+ PYBIND11_MODULE(add_custom, m)
+ {
+     m.doc() = "add_custom pybind11 interfaces"; // optional module docstring
+     m.def("run_add_custom", &my_add::run_add_custom, "");
+ }
+ 
\ No newline at end of file
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/run.sh b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/run.sh
new file mode 100644
index 000000000..0cf216ce5
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/PythonInvocation/run.sh
@@ -0,0 +1,25 @@
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+
+set -e
+pip3 install pybind11
+rm -rf build
+mkdir -p build
+cmake -B build \
+    -DSOC_VERSION=${SOC_VERSION} \
+    -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH}
+cmake --build build -j
+(
+    cd build
+    python3 ../add_custom_test.py
+)
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/run.sh b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/run.sh
new file mode 100644
index 000000000..f945ec5bf
--- /dev/null
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/run.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+CURRENT_DIR=$(
+    cd $(dirname ${BASH_SOURCE:-$0})
+    pwd
+)
+BUILD_TYPE="Debug"
+LIBRARY_TYPE="SHARED"
+INSTALL_PREFIX="${CURRENT_DIR}/out"
+RUN_MODE="npu"
+SHORT=v:,l:,
+LONG=soc-version:,library-type:  # trailing ':' marks an option that takes a value, matching -l above
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+SOC_VERSION="Ascend310P3"
+
+while :; do
+    case "$1" in
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -l | --library-type)
+        LIBRARY_TYPE="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
+    esac
+done
+
+
+VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4"
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
+
+LIBRARY_LIST="SHARED STATIC"
+if [[ " $LIBRARY_LIST " != *" $LIBRARY_TYPE "* ]]; then
+    echo "ERROR: LIBRARY_TYPE should be in [$LIBRARY_LIST]"
+    exit -1
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+
+export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH}
+echo "Current compile soc version is ${SOC_VERSION}"
+source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
+set -e
+rm -rf build out
+mkdir -p build
+cmake -B build \
+    -DRUN_MODE=${RUN_MODE} \
+    -DSOC_VERSION=${SOC_VERSION} \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
+    -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} \
+    -DLIBRARY_TYPE=${LIBRARY_TYPE}
+cmake --build build -j
+cmake --install build
+cp -rf out/lib/libkernels.* /usr/lib
+cp -rf add_custom_tiling.h out/include/kernels/aclrtlaunch_add_custom.h /usr/include
diff --git a/operator_contrib/AddCustomSample/README.md b/operator_contrib/AddCustomSample/README.md
new file mode 100644
index 000000000..616f2c6cf
--- /dev/null
+++ b/operator_contrib/AddCustomSample/README.md
@@ -0,0 +1,112 @@
+## Add自定义算子样例说明
+
+本样例通过Ascend C编程语言实现了Add算子不带Tiling和带Tiling的场景，并提供了C++和Python的调用方式。
+
+- [KernelLaunch](./KernelLaunch)：使用核函数直调Add自定义算子。  
+  核函数的基础调用（Kernel Launch）方式，开发者完成算子核函数的开发和Tiling实现后，即可通过AscendCL运行时接口，完成算子的调用。
+
+本样例中包含如下调用方式：
+
+<table>
+    <th>调用方式</th><th>目录</th><th>描述</th>
+    <tr>
+        <!-- 列的方向占据4个cell -->
+        <td rowspan='4'><a href="./KernelLaunch"> KernelLaunch</td>
+    </tr>
+    <tr>
+        <td><a href="./KernelLaunch/AddCustomKernel"> AddCustomKernel</td><td>AddCustom不带Tiling场景,同时提供了静态库和动态库的算子打包方式</td>
+    </tr>
+    <tr>
+        <td><a href="./KernelLaunch/AddCustomTilingKernel"> AddCustomTilingKernel</td><td>AddCustom带Tiling场景,同时提供了静态库和动态库的算子打包方式</td>
+    </tr>
+    </tr>
+</table>
+
+
+## 算子描述
+
+Add算子实现了两个数据相加，返回相加结果的功能。对应的数学表达式为：  
+
+```
+z = x + y
+```
+
+## 算子规格描述
+
+<table>
+<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">Add</td></tr>
+</tr>
+<tr><td rowspan="3" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
+<tr><td align="center">x</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">y</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+</tr>
+<tr><td rowspan="1" align="center">算子输出</td><td align="center">z</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">add_custom</td></tr>
+</table>
+
+
+## 支持的产品型号
+
+本样例支持如下产品型号：
+
+- Atlas 训练系列产品
+- Atlas 推理系列产品AI Core
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+- Atlas 200/500 A2推理产品
+
+## 目录结构介绍
+
+```
+└── KernelLaunch            // 使用核函数直调的方式调用Add自定义算子。
+  └── AddCustomKernel       // AddCustom不带Tiling场景
+  └── AddCustomTilingKernel // AddCustom带Tiling场景
+```
+
+## 环境要求
+
+编译运行此样例前，请参考[《CANN软件安装指南》](https://hiascend.com/document/redirect/CannCommunityInstSoftware)完成开发运行环境的部署。
+
+## 编译运行样例算子
+
+### 1. 准备：获取样例代码<a name="codeready"></a>
+
+ 可以使用以下两种方式下载，请选择其中一种进行源码准备。
+
+ - 命令行方式下载（下载时间较长，但步骤简单）。
+
+   ```bash
+   # 开发环境，非root用户命令行中执行以下命令下载源码仓。git_clone_path为用户自己创建的某个目录。
+   cd ${git_clone_path}
+   git clone https://gitee.com/ascend/samples.git
+   ```
+
+   **注：如果需要切换到其它tag版本，以v0.5.0为例，可执行以下命令。**
+
+   ```bash
+   git checkout v0.5.0
+   ```
+
+ - 压缩包方式下载（下载时间较短，但步骤稍微复杂）。
+
+   **注：如果需要下载其它版本代码，请先根据前置条件说明进行samples仓分支切换。下载压缩包命名跟tag/branch相关，此处以master分支为例，下载的名字将会是samples-master.zip**
+
+   ```bash
+   # 1. samples仓右上角选择 【克隆/下载】 下拉框并选择 【下载ZIP】。
+   # 2. 将ZIP包上传到开发环境中的普通用户某个目录中，【例如：${git_clone_path}/samples-master.zip】。
+   # 3. 开发环境中，执行以下命令，解压zip包。
+   cd ${git_clone_path}
+   unzip samples-master.zip
+   ```
+
+### 2. 编译运行样例工程
+
+- 若是不带tiling场景，编译运行操作请参见[AddCustomKernel](./KernelLaunch/AddCustomKernel)。
+- 若是带tiling场景，编译运行操作请参见[AddCustomTilingKernel](./KernelLaunch/AddCustomTilingKernel)。
+
+## 更新说明
+
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/05/19 | 样例首次上仓 |
\ No newline at end of file
-- 
Gitee


From babe54f102e8917a81c7b8f80dc39f8b6c567efa Mon Sep 17 00:00:00 2001
From: wangyuqing <wangyuqing33@huawei.com>
Date: Thu, 29 May 2025 11:51:21 +0000
Subject: [PATCH 09/46] !2665 add lut4 llama7b quantization Merge pull request
 !2665 from wangyuqing/master

---
 .../lut4_quantization/README_CN.md            |  61 +++++++++
 .../lut4_quantization/config/lut4_quant.cfg   |   9 ++
 .../lut4_quantization/requirements.txt        |   7 ++
 .../src/run_llama7b_calibration.py            |  82 ++++++++++++
 .../src/save_llama7b_quant_model.py           |  79 ++++++++++++
 .../lut4_quantization/src/utils.py            | 117 ++++++++++++++++++
 6 files changed, 355 insertions(+)
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/README_CN.md
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/config/lut4_quant.cfg
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/requirements.txt
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/run_llama7b_calibration.py
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/save_llama7b_quant_model.py
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/utils.py

diff --git a/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/README_CN.md
new file mode 100644
index 000000000..eef6c9c18
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/README_CN.md
@@ -0,0 +1,61 @@
+# LUT4bit量化
+
+## 1 LUT4bit量化前提
+
+### 1.1 安装依赖
+
+本sample依赖包可参考[requirements.txt](requirements.txt)
+
+### 1.2 模型和数据集准备
+
+本sample以Llama2-7b模型,pileval和wikitext2数据集为示例，请用户自行下载
+
+### 1.3 简易量化配置
+./config/lut4_quant.cfg文件为用户自定义的简易量化配置，具体表示信息如下：
+
+| 字段 |类型| 说明 | 默认值 | 取值范围 | 注意事项 |
+|:--| :-: | :-- | :-: | :-: | :-: |
+|batch_num|uint32|量化使用的batch数量 |1|/|校准使用batch数与推理使用输入数据有关，是校准脚本中的batch_num|
+|skip_layers|str|跳过量化的层 |/|/|跳过量化层支持模糊匹配，当配置字符串为层名字串，或与层名一致时，跳过该层量化，不生成量化配置。字符串必须包含数字或字母|
+|weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False|LUT4bit量化目前仅支持权重量化，需要设置为True|
+|weight_only_config.wts_type|enum|量化后权重类型|INT8|本sample支持INT4|/|
+|weight_only_config.weight_granularity|enum|权重量化粒度|PER_TENSOR|PER_TENSOR/PER_CHANNEL/PER_GROUP|LUT4bit仅支持PER_GROUP模式|
+|weight_only_config.round_mode|enum|舍入模式|/|HYBRID/ROUND/RINT|LUT4bit仅支持RINT模式|
+|weight_only_config.lut_quantize.lut_algo|enum|lut量化算法模式|CLUSTER|CLUSTER/ATCTAN|/|
+
+## 2 LUT4量化示例
+
+### 2.1 使用接口方式调用
+
+**step 1.**  请在当前目录执行如下两条命令运行示例程序，用户需根据实际情况修改示例程序中的模型和数据集路径：
+
+校准:
+`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_calibration.py --calibration_data=/pile_val_backup/ --model=/data/Models/pytorch/Llama2/Llama2_7b_hf`
+- 校准可以使用--finetune, 入参格式是bool,用来表示做精调/粗调
+
+
+保存并推理量化模型:
+`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/save_llama7b_quant_model.py --verify_data=/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py --model=/data/Models/pytorch/Llama2/Llama2_7b_hf`
+
+若出现如下信息，则说明校准成功：
+
+```none
+Calibration success, time taken:  56.0 min  20.263916969299316 s
+```
+
+出现如下信息，说明量化成功
+
+```none
+Test time taken:  7.0 min  12.269736528396606 s
+Score:  5.595210552215576
+```
+
+**step 2.**  推理成功后，在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./output文件夹，该文件夹内包含以下内容：
+
+- config.json：量化配置文件，描述了如何对模型中的每一层进行量化。
+- record.txt：量化因子记录文件。
+- lut_result.pt：lut算法参数文件。
+
+> 如果output目录下已经存在量化配置文件或量化因子记录文件，再次运行示例程序时，如果新生成的文件与已有文件同名，则会覆盖已有的量化配置文件或量化因子记录文件。
+
+**LLMHelper:**  定义用于大语言模型量化校准的辅助类，核心参数有:校准模型，校准数据集，前向方法，校准模块，校准模块推理方法，学习率，迭代次数，是否开启量化层筛选，量化误差比例阈值，量化误差平均阈值。详细使用方式可查阅AMCT使用手册
diff --git a/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/config/lut4_quant.cfg b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/config/lut4_quant.cfg
new file mode 100644
index 000000000..6f532c21c
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/config/lut4_quant.cfg
@@ -0,0 +1,9 @@
+batch_num: 1
+skip_layers: "lm_head"
+weight_only_config: {
+    weight_compress_only: True
+    wts_type: INT4
+    lut_quantize : {
+        lut_algo: CLUSTER
+    }
+}
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/requirements.txt b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/requirements.txt
new file mode 100644
index 000000000..55441d062
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/requirements.txt
@@ -0,0 +1,7 @@
+torch==2.1.0
+transformers==4.40.0
+accelerate==0.30.1
+datasets==2.19.1
+sentencepiece==0.2.0
+numpy==1.23.5
+protobuf==3.20.2
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/run_llama7b_calibration.py b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/run_llama7b_calibration.py
new file mode 100644
index 000000000..6df231876
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/run_llama7b_calibration.py
@@ -0,0 +1,82 @@
+"""
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+"""
+
+
+import os
+import copy
+import time
+import tqdm
+import torch
+import argparse
+import torch.nn as nn
+
+from utils import get_llama2, get_calib_dataset, build_model_and_enc
+import amct_pytorch as amct
+from amct_pytorch.post_quant_calibration import LLMHelper
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--calibration_data', type=str, default='/pile_val_backup')
+    parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf')
+    parser.add_argument('--finetune', type=lambda v: v.strip().lower() in ('1', 'true', 'yes', 'y'), default=False)  # type=bool would turn any non-empty text, even "False", into True
+
+    args = parser.parse_args()
+    model, model_path = get_llama2(args.model)
+    model = model.eval()
+    gpu_num = torch.cuda.device_count()
+    model, enc = build_model_and_enc(model, model_path, gpu_num)
+
+    proto_file = './config/lut4_quant.cfg'
+    config_file = './output/config.json'
+    record_file = './output/record.txt'
+
+    test_start_time = time.time()
+    # Phase1: generate quant config json
+    amct.create_post_quant_config(config_file,
+                             model,
+                             config_defination=proto_file)
+    
+    # Phase2: generate calibration model
+    samples = get_calib_dataset(
+        data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=256
+    )
+    samples = torch.cat(samples, dim=0)[:1,:]
+    # do weights calibration without finetune
+    # Please check README.md for LLMHelper usage
+    with torch.no_grad():
+        post_quant_model = amct.create_post_quant_model(config_file,
+                                                        record_file,
+                                                        model)
+    calibration_helper = LLMHelper(post_quant_model, samples, calibration_block='LlamaDecoderLayer', layer_filter=True)
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    post_quant_model.config.use_cache = False
+    amct.quant_calibration(calibration_helper)
+     # do weights calibration with finetune
+    if args.finetune:
+        with torch.no_grad():
+            post_quant_model = amct.create_post_quant_model(config_file,
+                                                            record_file,
+                                                            post_quant_model)
+        calibration_finetune_helper = LLMHelper(post_quant_model, samples, calibration_block='LlamaDecoderLayer', layer_filter=True)                                                   
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        post_quant_model.config.use_cache = False
+        amct.quant_calibration(calibration_finetune_helper)
+    test_end_time = time.time()
+    total_time = test_end_time - test_start_time
+    print('Calibration success, time taken: ', total_time // 60, 'min ', total_time%60, 's')
diff --git a/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/save_llama7b_quant_model.py b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/save_llama7b_quant_model.py
new file mode 100644
index 000000000..11f79c1c5
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/save_llama7b_quant_model.py
@@ -0,0 +1,79 @@
+"""
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+"""
+
+
+import os
+import copy
+import time
+import tqdm
+import torch
+import argparse
+import torch.nn as nn
+
+from utils import get_loaders,  get_llama2, get_calib_dataset, build_model_and_enc
+import amct_pytorch as amct
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--verify_data', type=str, default='/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py')
+    parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf')
+
+    args = parser.parse_args()
+    model, model_path = get_llama2(args.model)
+    model = model.eval()
+    gpu_num = torch.cuda.device_count()
+
+    record_file = './output/record.txt'
+
+    test_start_time = time.time()
+    model, enc = build_model_and_enc(model, model_path, gpu_num)
+    
+    # Phase1: save fakequant model
+    testenc = get_loaders(data_path=args.verify_data,
+                        enc=enc,
+                        seqlen=model.seqlen)
+
+    testenc = testenc.input_ids.to(model.device)
+    nsamples = testenc.numel() // model.seqlen
+    fake_quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant')
+    
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    
+    # Phase2: Test ppl result
+    nlls = []
+    test_start_time = time.time()
+    for i in tqdm.tqdm(range(nsamples), desc="evaluating..."):
+        batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(
+            model.device
+        )
+        with torch.no_grad():
+            lm_logits = fake_quant_model(batch).logits
+        shift_logits = lm_logits[:, :-1, :].contiguous().float()
+        shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:]
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(
+            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+        )
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    test_end_time = time.time()
+
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+    total_time = test_end_time - test_start_time
+    print('Test time taken: ', total_time // 60, 'min ', total_time%60, 's'  )
+    print('Score: ', ppl.item())
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/utils.py
new file mode 100644
index 000000000..7bd34ba3d
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/lut4_quantization/src/utils.py
@@ -0,0 +1,117 @@
+"""
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+"""
+
+import os
+import torch
+import torch.nn as nn
+from datasets import load_dataset,load_from_disk
+
+from transformers import AutoTokenizer, AutoConfig
+from accelerate import infer_auto_device_map, dispatch_model
+from accelerate.utils.modeling import get_balanced_memory
+
+def build_model_and_enc(model, model_path, gpu_num):  # Load the tokenizer for model_path and dispatch `model` over gpu_num GPUs plus CPU; returns (model, enc).
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    if "mpt" in config.__class__.__name__.lower():  # configs whose class name contains "mpt" take the tokenizer from config.tokenizer_name
+        enc = AutoTokenizer.from_pretrained(
+            config.tokenizer_name, trust_remote_code=True
+        )
+    else:
+        enc = AutoTokenizer.from_pretrained(
+            model_path, use_fast=False, trust_remote_code=True
+        )
+
+    # Move the model to GPU (as much as possible) for LM evaluation.
+    # Example: max_memory = ['0:16GiB', '1:16GiB', '2:16GiB', 'cpu:30GiB']; '0' means the first GPU that you specify.
+    # Using the full 16GiB is not recommended: reserve some space for other tensors during calculation.
+    # Please see the recommended memory allocation in the Word file.
+    # Adjust the max_memory sizes according to the real situation.
+    # The per-device limits below are generated in a loop instead of being listed by hand:
+
+    max_memory = []
+    for i in range(gpu_num):
+        max_memory.append(f'{i}:12GiB')
+    max_memory.append('cpu:80GiB')
+    print('Max_memory allocation: \n', max_memory)
+
+    max_memory = [v.split(":") for v in (max_memory or [])]  # "0:12GiB" -> ["0", "12GiB"]
+    max_memory = {(int(k) if k.isdigit() else k): v for k, v in max_memory}  # digit keys become int GPU indices; "cpu" stays a string
+    kwargs = {
+        "max_memory": get_balanced_memory(
+            model, max_memory if len(max_memory) > 0 else None
+        )
+    }
+    model.tie_weights()
+    device_map = infer_auto_device_map(
+        model,
+        no_split_module_classes=[
+            "LlamaDecoderLayer",  # per accelerate docs, listed module classes are never split across devices
+        ],
+        **kwargs,
+    )
+    model = dispatch_model(model, device_map=device_map, 
+        offload_dir=os.path.join(model_path, 'offload_dir'))
+
+    return model, enc
+
+def get_llama2(model_path, seqlen=2048):  # Load LlamaForCausalLM in float32 from model_path; returns (model, model_path).
+    def skip(*args, **kwargs):  # no-op stand-in for the torch.nn.init functions replaced below
+        pass
+
+    torch.nn.init.kaiming_uniform_ = skip  # NOTE: these three assignments rebind attributes on the torch.nn.init module itself,
+    torch.nn.init.uniform_ = skip  # so the replacement stays in effect after this function returns
+    torch.nn.init.normal_ = skip
+    from transformers import LlamaForCausalLM
+    
+    model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32, offload_folder="offload/")
+
+    model.seqlen = seqlen  # attach the sequence length to the model object as a plain attribute
+    return model, model_path
+
+
+def get_loaders(data_path: str, enc, seqlen):  # Tokenize the 'test' split of 'wikitext-2-raw-v1' as one string; NOTE: seqlen is accepted but not used here.
+    print('Loading dataset: Wikitext2')
+    testenc = load_dataset(data_path, 'wikitext-2-raw-v1', split='test', trust_remote_code=True)
+    testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt")  # all rows joined with blank lines, then encoded in a single call
+    
+    return testenc
+
+
+def get_calib_dataset(data_path="pileval", tokenizer=None, n_samples=512, block_size=512):  # Build a list of block_size-token calibration tensors from a dataset saved on disk.
+    dataset = load_from_disk(data_path)
+    dataset = dataset.shuffle(seed=42)  # fixed seed, so the shuffle order does not vary between runs
+    samples = []
+    n_run = 0
+    for data in dataset:
+        line = data["text"]
+        line = line.strip()
+        line_encoded = tokenizer.encode(line)  # NOTE: tokenizer defaults to None, so callers must pass one
+        if len(line_encoded) > 512:  # skip long lines; the limit is a literal 512, independent of block_size
+            continue
+        sample = torch.tensor([line_encoded])  # shape (1, len(line_encoded))
+        if sample.numel() == 0:  # skip lines that encode to no tokens
+            continue
+        samples.append(sample)
+        n_run += 1  # counts accepted samples only
+        if n_run == n_samples:
+            break
+    # Concatenate all samples along dim 1 and split into block_size-token chunks; a trailing remainder is dropped.
+    cat_samples = torch.cat(samples, dim=1)
+    n_split = cat_samples.shape[1] // block_size
+    print(f" * Split into {n_split} blocks")
+    return [
+        cat_samples[:, i * block_size : (i + 1) * block_size] for i in range(n_split)
+    ]
-- 
Gitee


From 7c0d773c717383bbaf8c3ddeccc8430637517aa1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8?= <chenweiheng1@hisilicon.com>
Date: Fri, 30 May 2025 01:42:17 +0000
Subject: [PATCH 10/46] =?UTF-8?q?!2664=20update=20tiling=20sink=20Merge=20?=
 =?UTF-8?q?pull=20request=20!2664=20from=20=E9=99=88=E5=A8=81=E4=BA=A8/mas?=
 =?UTF-8?q?ter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../AddCustomTilingSink.json                  |   0
 .../{OpImpl => AddCustomTilingSink}/README.md |   0
 .../tf_plugin/tensorflow_add_custom_plugin.cc |   0
 .../install.sh                                |   0
 .../op_host/add_custom_tiling_sink.cpp        |   0
 .../op_host/add_custom_tiling_sink_tiling.cpp |   0
 .../op_host/add_custom_tiling_sink_tiling.h   |   0
 .../op_kernel/add_custom_tiling_sink.cpp      |   0
 .../README.md                                 | 228 ++++--------------
 .../src/add_custom_tiling_sink.py}            |  14 +-
 .../test_add_custom_tiling_sink.py}           |   4 +-
 .../AddCustomTilingSink/README.md             |  51 ++++
 .../2_features/17_tiling_sink/README.md       |  16 ++
 13 files changed, 124 insertions(+), 189 deletions(-)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{OpImpl => AddCustomTilingSink}/AddCustomTilingSink.json (100%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{OpImpl => AddCustomTilingSink}/README.md (100%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{OpImpl => AddCustomTilingSink}/framework/tf_plugin/tensorflow_add_custom_plugin.cc (100%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{OpImpl => AddCustomTilingSink}/install.sh (100%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{OpImpl => AddCustomTilingSink}/op_host/add_custom_tiling_sink.cpp (100%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{OpImpl => AddCustomTilingSink}/op_host/add_custom_tiling_sink_tiling.cpp (100%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{OpImpl => AddCustomTilingSink}/op_host/add_custom_tiling_sink_tiling.h (100%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{OpImpl => AddCustomTilingSink}/op_kernel/add_custom_tiling_sink.cpp (100%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{AddCustom => PytorchInvocation}/README.md (32%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{AddCustom/src/add_custom.py => PytorchInvocation/src/add_custom_tiling_sink.py} (73%)
 rename operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/{AddCustom/test_add_custom.py => PytorchInvocation/test_add_custom_tiling_sink.py} (88%)
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/README.md
 create mode 100644 operator/ascendc/2_features/17_tiling_sink/README.md

diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/AddCustomTilingSink.json b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/AddCustomTilingSink.json
similarity index 100%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/AddCustomTilingSink.json
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/AddCustomTilingSink.json
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/README.md b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/README.md
similarity index 100%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/README.md
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/README.md
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/framework/tf_plugin/tensorflow_add_custom_plugin.cc b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/framework/tf_plugin/tensorflow_add_custom_plugin.cc
similarity index 100%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/framework/tf_plugin/tensorflow_add_custom_plugin.cc
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/framework/tf_plugin/tensorflow_add_custom_plugin.cc
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/install.sh
similarity index 100%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/install.sh
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/install.sh
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink.cpp
similarity index 100%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink.cpp
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink.cpp
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
similarity index 100%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.cpp
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.h b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.h
similarity index 100%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_host/add_custom_tiling_sink_tiling.h
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.h
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_kernel/add_custom_tiling_sink.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_kernel/add_custom_tiling_sink.cpp
similarity index 100%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl/op_kernel/add_custom_tiling_sink.cpp
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_kernel/add_custom_tiling_sink.cpp
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/README.md
similarity index 32%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/README.md
index 217cbfad8..8167f74d0 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/README.md
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/README.md
@@ -1,47 +1,51 @@
-## 背景介绍
-
-Tiling下沉是在Device侧CPU做Tiling计算。由于NPU中AI Core内部存储无法完全容纳算子输入输出的所有数据，需要每次搬运一部分输入数据进行计算然后搬出，再搬运下一部分输入数据进行计算，该过程称之为Tiling；根据算子的shape等信息来确定数据切分算法相关参数（比如每次搬运的块大小，以及总共循环多少次）的计算程序，称之为Tiling实现。由于Tiling实现中完成的均为标量计算，AI Core并不擅长，故一般在Host侧CPU上执行，但是满足下述条件Tiling实现会下沉到Device侧执行：
-
-模型为静态shape。
-模型中的算子支持Tiling下沉，比如FusedInferAttentionScore、IncreFlashAttention等融合算子。
-支持Tiling下沉的算子值有依赖，需要满足前一个算子的值有device的执行结果；如果依赖的值是Const，则不需要下沉执行Tiling，编译时会完成Tiling。
-
 ## 目录结构介绍
 
 ```
-├── AddCustom   // torch注册的自定义算子
+├── PytorchInvocation       // torch注册的自定义算子
 │   ├── src
-│   │   ├── add_custom.py      // 自定义算子py文件
-│   └── test_add_custom.py    // 测试脚本
+│   │   ├── add_custom_tiling_sink.py   // 自定义算子py文件
+│   └── test_add_custom_tiling_sink.py  // 测试脚本
 ```
 
-## 代码实现介绍
+## 代码实现
+
+src/add_custom_tiling_sink.py是调用自定义算子的torch脚本，如何开发该脚本代码，具体步骤如下。
+> 注意：如需详细了解入图操作，请参考Ascend torchair仓中[converter补齐](https://gitee.com/ascend/torchair/blob/master/CONTRIBUTING.md#converter%E8%A1%A5%E9%BD%90)章节。 
 
-新增自定义算子入图步骤，该过程可参考[torchair社区新增自定义算子入图介绍](https://gitee.com/ascend/torchair/blob/master/CONTRIBUTING.md#converter%E8%A1%A5%E9%BD%90)converter补齐第五小节：
-1.下载[torchair仓](https://gitee.com/ascend/torchair)，新建一个add_custom.py文件放在torchair/python/torchair/ops/add_custom.py，然后在torch框架中注册自定义算子：
+1.下载[torchair工程源码](https://gitee.com/ascend/torchair)，并在torchair/python/torchair/ops目录下新建add_custom_tiling_sink.py空文件。  
+> 注意，请根据实际情况下载配套版本分支的torchair工程源码，版本配套关系请查看[PyTorch框架适配官网](https://www.hiascend.com/software/ai-frameworks/pytorch)。
 
+2.将自定义算子注册到PyTorch框架。
 ```python
-# add_custom.py
+# add_custom_tiling_sink.py
 import torch
 
 lib = torch.library.Library("air", "FRAGMENT")
 lib.define(
     """
-    add_custom(Tensor x, Tensor y) -> Tensor
+    add_custom_tiling_sink(Tensor x, Tensor y) -> Tensor
     """
 )
 ```
+3.实现自定义算子的单算子模式。  
+该部分目前仅为示例，当前预留为未实现，请用户根据实际需要自行定义。
+```python
+def kernel_impl(x, y):
+    raise NotImplementedError("torch.ops.air.add_custom_tiling_sink kernel_impl is not implemented!")
 
-2.向torch注册自定义算子meta后端实现，用来完成图模式下的shape推导:
+torch.library.impl(lib, "add_custom_tiling_sink", "CPU")(kernel_impl)
+torch.library.impl(lib, "add_custom_tiling_sink", "PrivateUse1")(kernel_impl)
+```
 
+4.为自定义算子注册Meta函数，通过PyTorch Meta后端完成入图时所需要的shape和data type推导。
 ```python
-@torch.library.impl(lib, "add_custom", "Meta")
-   def kernel_meta(x, y):
-       return torch.empty_like(x)
+@torch.library.impl(lib, "add_custom_tiling_sink", "Meta")
+def kernel_meta(x, y):
+    return torch.empty_like(x)
 ```
 
-3.codegen生成ge构图api
-（1）将REG_OP算子原型放置到codegen/custom_op/custom_reg_op.h文件中，替换原来示例的REG_OP：
+5.codegen生成ge构图api  
+（1）将REG_OP算子原型放置到codegen/custom_op/custom_reg_op.h文件中，替换原来示例的REG_OP
 
 ```cpp
 #ifndef ASCENDADAPTER2_CUSTOM_REG_OP_H
@@ -59,7 +63,7 @@ REG_OP(AddCustomTilingSink)
 #endif  // ASCENDADAPTER2_CUSTOM_REG_OP_H
 ```
 
-（2）进入torchair仓根目录执行编译命令：
+（2）进入torchair工程源码根目录执行编译命令，产物在codegen/custom_op/auto_generated_ge_raw_custom_ops.py文件中。
 
 ```
 cd build
@@ -67,7 +71,7 @@ cmake ..
 make generate_ge_raw_custom_ops
 ```
 
-生成的ge.api函数在codegen/custom\_op/auto\_generated\_ge\_raw\_custom\_ops.py文件中, 内容如下所示：
+生成的ge.api函数内容如下所示：
 
 ```python
 # This file is auto-generated
@@ -118,40 +122,19 @@ def AddCustomTilingSink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None
 需要修改`from torchair._ge_concrete_graph.ge_converter import ge_op, IrDef`
 为`from torchair._ge_concrete_graph.compat_ir import ge_op, IrDef`
 
-将上述生成内容拷贝至前面我们新建的add_custom.py文件中。
-
-4.向torchair注册自定义算子的converter：
-
-```python
-@register_fx_node_ge_converter(torch.ops.air.add_custom.default)
-def convert_add_custom(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
-    return AddCustomTilingSink(x, y)
-```
-
-5.单算子部分为用户自行注册，此处预留未实现：
+将上述生成内容拷贝至前面我们新建的add_custom_tiling_sink.py文件中。
 
-```python
-def kernel_impl(x, y):
-    raise NotImplementedError("torch.ops.air.add_custom kernel_impl is not implemented!")
-
-
-torch.library.impl(lib, "add_custom", "CPU")(kernel_impl)
-torch.library.impl(lib, "add_custom", "PrivateUse1")(kernel_impl)
-```
-
-6.调用时，需要import前面新建的add_custom.py：
+6.实现自定义算子converter并注册：
 
 ```python
-import torchair.ops.add_custom
-
-def forward(self, x, y):
-    z = torch.ops.air.add_custom.default(x, y)
-    return z
+@register_fx_node_ge_converter(torch.ops.air.add_custom_tiling_sink.default)
+def convert_add_custom_tiling_sink(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
+    return AddCustomTilingSink(x, y) # 此为前面生成的构图api
 ```
 
 ## 运行样例算子
 
-### 1. 编译安装torchair包
+### 编译安装torchair包
 
 1.编译，进入torchair根目录，执行：
 
@@ -178,13 +161,22 @@ rm -rf /usr/local/python3.8.1/lib/python3.8/site-packages/torch_npu/dynamo/torch
 pip3.x show torch_npu
 ```
 
-### 2. 部署自定义算子包
-请参考[tiling下沉样例](https://gitee.com/ascend/samples/tree/master/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl)部署自定义算子包章节：
+### 编译部署自定义算子包
+请参考[AddCustomTilingSink自定义算子实现](../AddCustomTilingSink/README.md)。
 
-### 3. 执行脚本
+### 执行脚本
+test_add_custom_tiling_sink.py是图模式调用算子tiling下沉测试脚本，请根据实际情况替换里面的模型定义、参数等内容。  
+该脚本有2个关键操作必须确保完成，具体如下：  
+1.测试脚本必须import自定义的add_custom_tiling_sink.py模块。
+```python
+import torchair.ops.add_custom_tiling_sink
 
-需要脚本中先打开tiling下沉的开关
+def forward(self, x, y):
+    z = torch.ops.air.add_custom_tiling_sink.default(x, y)
+    return z
+```
 
+2.测试脚本显式开启tiling_schedule_optimize配置项。
 ```python
 from torchair.configs.compiler_config import CompilerConfig
 
@@ -196,128 +188,4 @@ config.experimental_config.tiling_schedule_optimize = True
 
 | 时间      | 更新事项     |
 | --------- | ------------ |
-| 2025/5/22 | 新增本readme |
-
-## add_custom.py
-
-```python
-from typing import (
-    Optional,
-    Union,
-    List,
-)
-import torch
-from torchair._ge_concrete_graph.fx2ge_converter import register_fx_node_ge_converter
-from torchair.ge._ge_graph import Tensor, TensorSpec
-
-lib = torch.library.Library("air", "FRAGMENT")
-lib.define(
-    """
-    add_custom(Tensor x, Tensor y) -> Tensor
-    """
-)
-
-
-@torch.library.impl(lib, "add_custom", "Meta")
-def kernel_meta(x, y):
-    return torch.empty_like(x)
-
-
-def kernel_impl(x, y):
-    raise NotImplementedError("torch.ops.air.add_custom kernel_impl is not implemented!")
-
-
-torch.library.impl(lib, "add_custom", "CPU")(kernel_impl)
-torch.library.impl(lib, "add_custom", "PrivateUse1")(kernel_impl)
-
-
-@register_fx_node_ge_converter(torch.ops.air.add_custom.default)
-def convert_add_custom(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
-    return AddCustomTilingSink(x, y)
-
-
-# This file is auto-generated by
-# Summary: total 1, generated 1, skipped 0
-from typing import Any, Dict, List, Tuple, Union, Callable, Optional
-from torchair.ge._ge_graph import auto_convert_to_tensor, TensorType
-from torchair.ge import Tensor, DataType, attr
-from torchair._ge_concrete_graph.compat_ir import ge_op, IrDef
-
-
-# This api is auto-generated from IR AddCustomTilingSink
-@auto_convert_to_tensor([False, False], [False, False], inputs_tensor_type=[TensorType.TT_ALL, TensorType.TT_ALL])
-def AddCustomTilingSink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None):
-    """REG_OP(AddCustomTilingSink)\n
-.INPUT(x, TensorType::ALL())\n
-.INPUT(y, TensorType::ALL())\n
-.OUTPUT(z, TensorType::ALL())\n
-"""
-
-    # process inputs
-    inputs = {
-        "x": x,
-        "y": y,
-    }
-
-    # process attrs
-    attrs = {
-    }
-
-    # process outputs
-    outputs = [
-    "z",
-    ]
-
-    return ge_op(
-        op_type="AddCustomTilingSink",
-        inputs=inputs,
-        attrs=attrs,
-        outputs=outputs,
-        dependencies=dependencies,
-        ir=IrDef("AddCustomTilingSink") \
-        .input("x", "") \
-        .input("y", "") \
-        .output("z" , "")
-    )
-
-```
-
-## test_add_custom.py
-
-```python
-import torch
-import torch_npu
-import torchair
-from torchair.configs.compiler_config import CompilerConfig
-from torchair.core.utils import logger
-import logging
-
-logger.setLevel(logging.DEBUG)
-config = CompilerConfig()
-config.debug.graph_dump.type = "pbtxt"
-config.experimental_config.tiling_schedule_optimize = True
-npu_backend = torchair.get_npu_backend(compiler_config=config)
-
-import torchair.ops.add_custom
-
-class MyModule(torch.nn.Module):
-    def __init__(self):
-        super(MyModule, self).__init__()
-
-    def forward(self, x, y):
-        z = torch.ops.air.add_custom.default(x, y)
-        return z
-
-
-# 创建并编译模块
-module = MyModule().npu()
-module = torch.compile(module, fullgraph=True, backend=npu_backend, dynamic=False)
-
-# 示例输入
-x = torch.randn(6, 64, dtype=torch.float32).npu()
-y = torch.randn(6, 64, dtype=torch.float32).npu()
-
-output = module(x, y)
-print(output)
-
-```
+| 2025/5/22 | 新增本readme |
\ No newline at end of file
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/src/add_custom_tiling_sink.py
similarity index 73%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/src/add_custom_tiling_sink.py
index dc73f0b07..8da1ef815 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/src/add_custom.py
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/src/add_custom_tiling_sink.py
@@ -10,26 +10,26 @@ from torchair.ge._ge_graph import Tensor, TensorSpec
 lib = torch.library.Library("air", "FRAGMENT")
 lib.define(
     """
-    add_custom(Tensor x, Tensor y) -> Tensor
+    add_custom_tiling_sink(Tensor x, Tensor y) -> Tensor
     """
 )
 
 
-@torch.library.impl(lib, "add_custom", "Meta")
+@torch.library.impl(lib, "add_custom_tiling_sink", "Meta")
 def kernel_meta(x, y):
     return torch.empty_like(x)
 
 
 def kernel_impl(x, y):
-    raise NotImplementedError("torch.ops.air.add_custom kernel_impl is not implemented!")
+    raise NotImplementedError("torch.ops.air.add_custom_tiling_sink kernel_impl is not implemented!")
 
 
-torch.library.impl(lib, "add_custom", "CPU")(kernel_impl)
-torch.library.impl(lib, "add_custom", "PrivateUse1")(kernel_impl)
+torch.library.impl(lib, "add_custom_tiling_sink", "CPU")(kernel_impl)
+torch.library.impl(lib, "add_custom_tiling_sink", "PrivateUse1")(kernel_impl)
 
 
-@register_fx_node_ge_converter(torch.ops.air.add_custom.default)
-def convert_add_custom(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
+@register_fx_node_ge_converter(torch.ops.air.add_custom_tiling_sink.default)
+def convert_add_custom_tiling_sink(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
     return AddCustomTilingSink(x, y)
 
 
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/test_add_custom_tiling_sink.py
similarity index 88%
rename from operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py
rename to operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/test_add_custom_tiling_sink.py
index 81bba97bb..04aef9313 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustom/test_add_custom.py
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/test_add_custom_tiling_sink.py
@@ -11,14 +11,14 @@ config.debug.graph_dump.type = "pbtxt"
 config.experimental_config.tiling_schedule_optimize = True
 npu_backend = torchair.get_npu_backend(compiler_config=config)
 
-import torchair.ops.add_custom
+import torchair.ops.add_custom_tiling_sink
 
 class MyModule(torch.nn.Module):
     def __init__(self):
         super(MyModule, self).__init__()
 
     def forward(self, x, y):
-        z = torch.ops.air.add_custom.default(x, y)
+        z = torch.ops.air.add_custom_tiling_sink.default(x, y)
         return z
 
 
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/README.md b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/README.md
new file mode 100644
index 000000000..8d884730e
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/README.md
@@ -0,0 +1,51 @@
+## 概述
+本样例以AddCustomTilingSink自定义算子为例，介绍了在开发自定义算子时如何启用Tiling下沉，以及如何通过PyTorch在图模式下调用该自定义算子的完整流程。
+
+## 目录结构介绍
+
+```
+├── AddCustomTilingSink      
+│   ├── AddCustomTilingSink  // AscendC算子实现
+│   └── PytorchInvocation    // Pytorch调用样例
+```
+
+## 算子描述
+AddCustomTilingSink算子实现了两个数据相加，返回相加结果的功能。对应的数学表达式为：
+```
+z = x + y
+```
+## 算子规格描述
+<table>
+<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">AddCustomTilingSink</td></tr>
+</tr>
+<tr><td rowspan="3" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
+<tr><td align="center">x</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">y</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+</tr>
+<tr><td rowspan="1" align="center">算子输出</td><td align="center">z</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">add_custom_tiling_sink</td></tr>
+</table>
+
+## 支持的产品型号
+本样例支持如下产品型号：
+- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
+- Atlas A3 训练系列产品/Atlas A3 推理系列产品
+
+## 编译运行样例算子
+
+### 1. 实现Pytorch自定义算子并注册
+请参考本目录中[PytorchInvocation/README.md](./PytorchInvocation/README.md)实现Pytorch侧注册。
+
+### 2. 实现CANN自定义算子，并完成编译部署
+请参考本目录中[AddCustomTilingSink/README.md](./AddCustomTilingSink/README.md)部署自定义算子包。
+
+### 3. 执行测试脚本
+执行本目录中[PytorchInvocation/test_add_custom_tiling_sink.py](./PytorchInvocation/test_add_custom_tiling_sink.py)测试脚本验证功能。 
+
+## 更新说明
+
+| 时间      | 更新事项     |
+| --------- | ------------ |
+| 2025/5/28 | 新增本readme |
diff --git a/operator/ascendc/2_features/17_tiling_sink/README.md b/operator/ascendc/2_features/17_tiling_sink/README.md
new file mode 100644
index 000000000..3cf3b3be2
--- /dev/null
+++ b/operator/ascendc/2_features/17_tiling_sink/README.md
@@ -0,0 +1,16 @@
+## 背景介绍
+
+在静态图模式下，可以通过整图下沉优化调度性能。将完整的计算图一次性下发至Device侧，后续执行则无需Host参与，由Device自主完成计算，从而减少Host-Device交互开销，提升执行效率。部分算子的Tiling计算依赖运行时输入的具体数值（Tiling值依赖），需在执行时动态计算Tiling参数。针对该场景，可采用Tiling下沉优化方案：将Tiling计算下沉至Device侧的AI CPU上执行，从而实现计算全程在Device侧高效完成。  
+当前仅融合算子（矢量计算和矩阵计算融合）支持进行Tiling下沉。  
+
+## 算子开发样例
+当前本目录包含的所有样例如下。
+|  目录名称                                                   |  功能描述                                              |  运行环境 |
+| ------------------------------------------------------------ | ---------------------------------------------------- | -- |
+| [AddCustomTilingSink](./AddCustomTilingSink/) | 基于Ascend C的自定义Tiling下沉算子及Pytorch调用样例，通过使能Tiling下沉，实现下沉执行优化调度性能。|Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件<br>Atlas A3 训练系列产品/Atlas A3 推理系列产品|
+
+## 更新说明
+
+| 时间      | 更新事项     |
+| --------- | ------------ |
+| 2025/5/28 | 新增本readme |
-- 
Gitee


From 4bd13a6999b4e930ed0e7991e5fc0ac4e5f2e37d Mon Sep 17 00:00:00 2001
From: renjie <renjie88@huawei.com>
Date: Tue, 3 Jun 2025 09:32:57 +0000
Subject: [PATCH 11/46] =?UTF-8?q?!2667=20=E3=80=90tiling=E4=B8=8B=E6=B2=89?=
 =?UTF-8?q?=E6=A0=B7=E4=BE=8B=E3=80=91=E3=80=90AR20250522891845=E3=80=91RE?=
 =?UTF-8?q?ADME=E4=BF=AE=E6=94=B9=20Merge=20pull=20request=20!2667=20from?=
 =?UTF-8?q?=20renjie/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../AddCustomTilingSink.json                  |  2 +-
 .../AddCustomTilingSink/README.md             | 50 ++++++++++++-------
 .../tf_plugin/tensorflow_add_custom_plugin.cc | 22 --------
 .../op_host/add_custom_tiling_sink.cpp        |  6 ++-
 .../op_host/add_custom_tiling_sink_tiling.cpp |  8 +--
 .../op_kernel/add_custom_tiling_sink.cpp      |  2 +-
 6 files changed, 42 insertions(+), 48 deletions(-)
 delete mode 100644 operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/framework/tf_plugin/tensorflow_add_custom_plugin.cc

diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/AddCustomTilingSink.json b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/AddCustomTilingSink.json
index 1d93e1f49..9a1ee691b 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/AddCustomTilingSink.json
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/AddCustomTilingSink.json
@@ -15,7 +15,7 @@
             },
             {
                 "name": "y",
-                "param_type": "optional",
+                "param_type": "required",
                 "format": [
                     "ND"
                 ],
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/README.md b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/README.md
index a89d51c80..16e430cc8 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/README.md
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/README.md
@@ -1,29 +1,28 @@
-
 ## 概述
-本样例基于AddCustom算子工程，提供了支持Tiling下沉的自定义算子开发样例。
-若要使能tiling下沉，算子tiling函数必须独立实现，详细开发指导请参考[Tiling下沉自定义算子开发指南](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_00014.html)
+本样例基于AddCustom算子工程，提供支持Tiling下沉的自定义算子开发样例。
+若要使能Tiling下沉，算子Tiling函数必须独立实现，详细开发指导请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)手册中的Tiling下沉章节。
 
 ## 目录结构介绍
 ```
-├─OpImpl										// 算子实现
-│   ├─framework									// 算子插件实现文件目录
-│   ├─op_host									// host侧实现文件
-│   │   ├─add_custom_tiling_sink.cpp			// 算子原型定义、tiling函数注册等
-│   │   │ add_custom_tiling_sink_tiling.cpp		// 算子tiling函数的所有实现(必须独立实现于cpp中)
-│   │   └─add_custom_tiling_sink_tiling.h		// 算子tiling结构体定义
-│   └─op_kernel									// kernel侧实现文件
-│  AddCustomTilingSink.json						// 算子的原型定义json文件
-│  install.sh									// 脚本，调用msOpGen生成自定义算子工程，并编译
-```
+├─op_host									// host侧实现文件
+│   ├─add_custom_tiling_sink.cpp			// 算子原型定义、Tiling函数注册等
+│   │ add_custom_tiling_sink_tiling.cpp		// 算子Tiling函数的所有实现(必须独立实现于cpp中)
+│   └─add_custom_tiling_sink_tiling.h		// 算子Tiling结构体定义
+├─op_kernel									// kernel侧实现文件
+├─AddCustomTilingSink.json					// 算子的原型定义json文件
+├─install.sh								// 脚本，调用msOpGen生成自定义算子工程，并编译
 
+```
 ## 算子描述
-Add算子实现了两个数据相加，返回相加结果的功能。对应的数学表达式为：
+AddCustomTilingSink算子实现了两个数据相加，返回相加结果的功能。对应的数学表达式为：
 ```
+
 z = x + y
+
 ```
 ## 算子规格描述
 <table>
-<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">Add</td></tr>
+<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">AddCustomTilingSink</td></tr>
 </tr>
 <tr><td rowspan="3" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
 <tr><td align="center">x</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
@@ -32,9 +31,22 @@ z = x + y
 </tr>
 <tr><td rowspan="1" align="center">算子输出</td><td align="center">z</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
 </tr>
-<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">add_custom</td></tr>
+<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">add_custom_tiling_sink</td></tr>
 </table>
 
+## 代码实现介绍
+本样例基于AddCustom算子工程，为使能Tiling下沉做出了以下修改：
+- 算子原型定义：在op_host/add_custom_tiling_sink.cpp中，定义了算子原型，指定输入"y"为Tiling值依赖。
+- Tiling函数逻辑：添加判断逻辑，通过判断值依赖InputTensor的Data是否为空指针，确认当前是否处于编译期。若处于编译期，需要设置最大的workspace用于内存分配。
+- Tiling函数下沉注册：将所有的Tiling函数逻辑单独在op_host/add_custom_tiling_sink_tiling.cpp中实现，并通过DEVICE_IMPL_OP_OPTILING接口注册下沉的Tiling函数。(DEVICE_IMPL_OP_OPTILING接口定义在头文件device_op_impl_registry.h中)
+- 算子host侧CMakeLists.txt：Tiling下沉需要添加device侧的编译任务，本样例通过install.sh脚本添加，具体添加内容如下。
+```
+ascendc_device_library( TARGET cust_opmaster
+                        OPTION SHARED
+                        SRC ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_tiling_sink_tiling.cpp)
+```
+- 算子kernel实现：通过KERNEL_TASK_TYPE_DEFAULT接口将算子强制指定在AIC、AIV混合场景运行，满足Tiling下沉算子条件。
+
 ## 支持的产品型号
 本样例支持如下产品型号：
 - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
@@ -56,7 +68,7 @@ z = x + y
   - 切换到msOpGen脚本install.sh所在目录
     ```bash
     # 若开发者以git命令行方式clone了master分支代码，并切换目录
-    cd ${git_clone_path}/samples/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/OpImpl
+    cd ${git_clone_path}/samples/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink
     ```
 
   - 调用脚本，生成自定义算子工程，复制host和kernel实现并编译算子
@@ -109,9 +121,9 @@ z = x + y
     cd CustomOp/build_out
     ./custom_opp_<target os>_<target architecture>.run
     ```
-  命令执行成功后，自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的的vendors/customize目录中。若要执行tiling下沉样例，则算子包不支持通过--install-path指定目录安装。
+  命令执行成功后，自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的vendors/customize目录中。若要执行Tiling下沉样例，则算子包不支持通过--install-path指定目录安装。
 
 ## 更新说明
 | 时间       | 更新事项                     |
 | ---------- | ---------------------------- |
-| 2025/5/22 | 新增AddCustomTilingSink算子样例 |
+| 2025/5/22 | 新增AddCustomTilingSink算子样例 |
\ No newline at end of file
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/framework/tf_plugin/tensorflow_add_custom_plugin.cc b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/framework/tf_plugin/tensorflow_add_custom_plugin.cc
deleted file mode 100644
index b96757140..000000000
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/framework/tf_plugin/tensorflow_add_custom_plugin.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the Apache License Version 2.0.
- * You may not use this file except in compliance with the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Apache License for more details at
- * http://www.apache.org/licenses/LICENSE-2.0
- */
-
-#include "register/register.h"
-
-namespace domi {
-// register op info to GE
-REGISTER_CUSTOM_OP("AddCustomTilingSink")
-    .FrameworkType(TENSORFLOW)   // type: CAFFE, TENSORFLOW
-    .OriginOpType("Add")      // name in tf module
-    .ParseParamsByOperatorFn(AutoMappingByOpFn);
-}  // namespace domi
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink.cpp
index c88a110b0..ea682bb3c 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink.cpp
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink.cpp
@@ -36,7 +36,7 @@ public:
             .DataType({ge::DT_FLOAT})
             .Format({ge::FORMAT_ND});
         this->Input("y")
-            .ParamType(OPTIONAL)
+            .ParamType(REQUIRED)
             .DataType({ge::DT_FLOAT})
             .Format({ge::FORMAT_ND})
             .ValueDepend(OPTIONAL, DependScope::TILING); // 表示输入y为Tiling值依赖
@@ -49,7 +49,9 @@ public:
 
         this->AICore().SetTiling(optiling::AddCustomSinkTilingFunc);
         
-        this->AICore().AddConfig("ascend910b");
+        this->AICore()
+            .AddConfig("ascend910b")
+            .AddConfig("ascend910_93");
     }
 };
 OP_ADD(AddCustomTilingSink);
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
index 32ffb8a3e..563ba0b63 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
@@ -14,8 +14,8 @@
 namespace optiling {
 static constexpr uint32_t BLOCK_DIM = 8;
 static constexpr uint32_t TILE_NUM = 8;
-static constexpr size_t MAX_WORKSPACE_SIZE = 32; // 能获取到的最大workspace大小
-static constexpr size_t DEFAULT_WORKSPACE_SIZE = 1;
+static constexpr size_t MAX_WORKSPACE_SIZE = 32; // 算子所需workspace的最大值，AddCustomTilingSink样例不需要workspace，不涉及设置，此处设置为固定值仅作为示例
+static constexpr size_t DEFAULT_WORKSPACE_SIZE = 0;
 ge::graphStatus AddCustomSinkTilingFunc(gert::TilingContext *context)
 {
     TilingSinkTilingData tiling;
@@ -26,8 +26,10 @@ ge::graphStatus AddCustomSinkTilingFunc(gert::TilingContext *context)
     tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
     context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
     size_t *currentWorkspace = context->GetWorkspaceSizes(1);
-    currentWorkspace[0] = DEFAULT_WORKSPACE_SIZE;
+    currentWorkspace[0] = DEFAULT_WORKSPACE_SIZE; // 设置运行时workspace大小
     if (context->GetInputTensor(1) != nullptr && context->GetInputTensor(1)->GetData<float>() == nullptr) {
+        // 通过判断值依赖InputTensor的Data是否为空指针来确认当前是否处于编译期。
+        // Tiling下沉场景，编译期需要为算子分配内存，包括其所需的workspace。为了保证运行时的高效性，编译期应根据算子的执行需求，合理设置所需的workspace最大值，以避免内存不足或浪费。
         currentWorkspace[0] = MAX_WORKSPACE_SIZE;
     }
     return ge::GRAPH_SUCCESS;
diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_kernel/add_custom_tiling_sink.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_kernel/add_custom_tiling_sink.cpp
index 4b1cb2f1d..d8b3738ce 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_kernel/add_custom_tiling_sink.cpp
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_kernel/add_custom_tiling_sink.cpp
@@ -85,7 +85,7 @@ private:
 extern "C" __global__ __aicore__ void add_custom_tiling_sink(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
 {
     GET_TILING_DATA(tiling_data, tiling);
-    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIC_1_2);
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIC_1_2); // 将算子强制指定在AIC、AIV混合场景运行，模拟融合算子场景
     if ASCEND_IS_AIC {
         return;
     }
-- 
Gitee


From 6f4cf00910fa1650c2b96590b2f909175c6c4372 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8?= <chenweiheng1@hisilicon.com>
Date: Wed, 4 Jun 2025 02:34:45 +0000
Subject: [PATCH 12/46] =?UTF-8?q?!2668=20[bugfix]fix=20ModuleNotFoundError?=
 =?UTF-8?q?:torch=5Fnpu.meta=20Merge=20pull=20request=20!2668=20from=20?=
 =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../test_ops_custom_register_in_graph.py             | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/PytorchInvocation/test_ops_custom_register_in_graph.py b/operator/ascendc/0_introduction/1_add_frameworklaunch/PytorchInvocation/test_ops_custom_register_in_graph.py
index a8f095457..f9bed9c44 100644
--- a/operator/ascendc/0_introduction/1_add_frameworklaunch/PytorchInvocation/test_ops_custom_register_in_graph.py
+++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/PytorchInvocation/test_ops_custom_register_in_graph.py
@@ -16,7 +16,17 @@ import torchair
 from torch_npu.testing.testcase import TestCase, run_tests
 from torchair import register_fx_node_ge_converter
 from torchair.ge import Tensor
-from torch_npu.meta._meta_registrations import m
+try:
+    from torch_npu.meta._meta_registrations import m
+except ModuleNotFoundError:
+    try:
+        from torch_npu.op_plugin.meta import _meta_registrations as m
+    except ModuleNotFoundError:
+        raise ModuleNotFoundError(
+            "Failed to import '_meta_registrations' module. "
+            "Neither 'torch_npu.meta._meta_registrations' "
+            "nor 'torch_npu.op_plugin.meta._meta_registrations' could be found. "
+        )
 
 
 @impl(m, "npu_add_custom")
-- 
Gitee


From fc7a1cc09da29bb2cd41e36b6e7a213b89cdfab0 Mon Sep 17 00:00:00 2001
From: PengC <chupeng5@huawei.com>
Date: Tue, 10 Jun 2025 01:45:07 +0000
Subject: [PATCH 13/46] =?UTF-8?q?!2672=20=E4=BF=AE=E6=94=B9=E7=9C=9F?=
 =?UTF-8?q?=E5=80=BC=E7=94=9F=E6=88=90=E7=B1=BB=E5=9E=8B=20Merge=20pull=20?=
 =?UTF-8?q?request=20!2672=20from=20PengC/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../MmadBiasInvocation/scripts/gen_data.py                  | 6 +++---
 .../20_mmad_kernellaunch/MmadInvocation/scripts/gen_data.py | 4 ++--
 .../DumpTensorCube/AclNNInvocation/scripts/gen_data.py      | 4 ++--
 .../DumpTensorKernelInvocationCube/scripts/gen_data.py      | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/scripts/gen_data.py b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/scripts/gen_data.py
index 0fdd40e64..4fcd9b96b 100644
--- a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/scripts/gen_data.py
+++ b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/scripts/gen_data.py
@@ -17,9 +17,9 @@ def gen_golden_data():
     N = 32
     K = 32
 
-    x1_gm = np.random.randint(1, 10, [M, K]).astype(np.float16)
-    x2_gm = np.random.randint(1, 10, [K, N]).astype(np.float16)
-    bias_gm = np.random.randint(1, 10, [N]).astype(np.float16)
+    x1_gm = np.random.uniform(1, 10, [M, K]).astype(np.float16)
+    x2_gm = np.random.uniform(1, 10, [K, N]).astype(np.float16)
+    bias_gm = np.random.uniform(1, 10, [N]).astype(np.float16)
     golden = (np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32)) + bias_gm.astype(np.float32)).astype(np.float32)
     os.system("mkdir -p input")
     os.system("mkdir -p output")
diff --git a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/scripts/gen_data.py b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/scripts/gen_data.py
index d4cb3e7d2..dc82df2a1 100644
--- a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/scripts/gen_data.py
+++ b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/scripts/gen_data.py
@@ -17,8 +17,8 @@ def gen_golden_data():
     N = 32
     K = 32
 
-    x1_gm = np.random.randint(1, 10, [M, K]).astype(np.float16)
-    x2_gm = np.random.randint(1, 10, [K, N]).astype(np.float16)
+    x1_gm = np.random.uniform(1, 10, [M, K]).astype(np.float16)
+    x2_gm = np.random.uniform(1, 10, [K, N]).astype(np.float16)
     golden = (np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32))).astype(np.float32)
     os.system("mkdir -p input")
     os.system("mkdir -p output")
diff --git a/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorCube/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorCube/AclNNInvocation/scripts/gen_data.py
index bf5be8383..d773c163b 100644
--- a/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorCube/AclNNInvocation/scripts/gen_data.py
+++ b/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorCube/AclNNInvocation/scripts/gen_data.py
@@ -17,8 +17,8 @@ def gen_golden_data():
     N = 32
     K = 32
 
-    x1_gm = np.random.randint(1, 10, [M, K]).astype(np.float16)
-    x2_gm = np.random.randint(1, 10, [K, N]).astype(np.float16)
+    x1_gm = np.random.uniform(1, 10, [M, K]).astype(np.float16)
+    x2_gm = np.random.uniform(1, 10, [K, N]).astype(np.float16)
     golden = (np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32))).astype(np.float32)
     os.system("mkdir -p input")
     os.system("mkdir -p output")
diff --git a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/scripts/gen_data.py b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/scripts/gen_data.py
index 88b51c629..e00c3067e 100644
--- a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/scripts/gen_data.py
+++ b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/scripts/gen_data.py
@@ -17,8 +17,8 @@ def gen_golden_data():
     N = 32
     K = 32
 
-    x1_gm = np.random.randint(1, 10, [M, K]).astype(np.float16)
-    x2_gm = np.random.randint(1, 10, [K, N]).astype(np.float16)
+    x1_gm = np.random.uniform(1, 10, [M, K]).astype(np.float16)
+    x2_gm = np.random.uniform(1, 10, [K, N]).astype(np.float16)
     golden = (np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32))).astype(np.float32)
     os.system("mkdir -p input")
     os.system("mkdir -p output")
-- 
Gitee


From 5e1236944f5a90058758a918fdbf7d3884d3cd57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E9=81=93=E6=98=8E?= <lidaoming1@huawei.com>
Date: Tue, 10 Jun 2025 03:17:40 +0000
Subject: [PATCH 14/46] =?UTF-8?q?!2671=20add=20limit=20for=20mc2=20Merge?=
 =?UTF-8?q?=20pull=20request=20!2671=20from=20=E6=9D=8E=E9=81=93=E6=98=8E/?=
 =?UTF-8?q?fix=5Flimit=5Fmc2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../4_best_practices/21_all_gather_matmul_custom/README.md   | 3 +++
 .../22_matmul_reduce_scatter_custom/README.md                | 5 ++++-
 .../4_best_practices/23_matmul_all_reduce_custom/README.md   | 5 ++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/operator/ascendc/4_best_practices/21_all_gather_matmul_custom/README.md b/operator/ascendc/4_best_practices/21_all_gather_matmul_custom/README.md
index 074c90feb..c121aac07 100644
--- a/operator/ascendc/4_best_practices/21_all_gather_matmul_custom/README.md
+++ b/operator/ascendc/4_best_practices/21_all_gather_matmul_custom/README.md
@@ -75,6 +75,8 @@ CANN软件包中提供了工程创建工具msopgen，AllGatherMatmulCustom算子
 ### 1. 获取源码包
 编译运行此样例前，请参考[准备：获取样例代码](../README.md#codeready)获取源码包。
 
+注意：本样例的执行依赖8卡集成环境。为保证样例的正常执行，请预先安装2.1版本的torch和torch_npu安装包。
+
 ### 2. 生成自定义算子工程，复制host和kernel实现并编译算子<a name="operatorcompile"></a>
   - 切换到msOpGen脚本install.sh所在目录
     ```bash
@@ -141,3 +143,4 @@ CANN软件包中提供了工程创建工具msopgen，AllGatherMatmulCustom算子
 | 时间       | 更新事项                     |
 | ---------- | ---------------------------- |
 | 2024/12/19 | 新增样例 |
+| 2025/06/09 | 添加算子执行环境备注 |
diff --git a/operator/ascendc/4_best_practices/22_matmul_reduce_scatter_custom/README.md b/operator/ascendc/4_best_practices/22_matmul_reduce_scatter_custom/README.md
index 0d85f3188..36f47216b 100644
--- a/operator/ascendc/4_best_practices/22_matmul_reduce_scatter_custom/README.md
+++ b/operator/ascendc/4_best_practices/22_matmul_reduce_scatter_custom/README.md
@@ -75,6 +75,8 @@ CANN软件包中提供了工程创建工具msopgen，MatmulReduceScatterCustom
 ### 1. 获取源码包
 编译运行此样例前，请参考[准备：获取样例代码](../README.md#codeready)获取源码包。
 
+注意：本样例的执行依赖8卡集成环境。为保证样例的正常执行，请预先安装2.1版本的torch和torch_npu安装包。
+
 ### 2. 生成自定义算子工程，复制host和kernel实现并编译算子<a name="operatorcompile"></a>
   - 切换到msOpGen脚本install.sh所在目录
     ```bash
@@ -140,4 +142,5 @@ CANN软件包中提供了工程创建工具msopgen，MatmulReduceScatterCustom
 ## 更新说明
 | 时间       | 更新事项                     |
 | ---------- | ---------------------------- |
-| 2024/12/19 | 新增样例 |
\ No newline at end of file
+| 2024/12/19 | 新增样例 |
+| 2025/06/09 | 添加算子执行环境备注 |
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/README.md b/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/README.md
index 3bc513470..155dcc322 100644
--- a/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/README.md
+++ b/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/README.md
@@ -75,6 +75,8 @@ CANN软件包中提供了工程创建工具msopgen，MatmulAllReduceCustom算子
 ### 1. 获取源码包
 编译运行此样例前，请参考[准备：获取样例代码](../README.md#codeready)获取源码包。
 
+注意：本样例的执行依赖8卡集成环境。为保证样例的正常执行，请预先安装2.1版本的torch和torch_npu安装包。
+
 ### 2. 生成自定义算子工程，复制host和kernel实现并编译算子<a name="operatorcompile"></a>
   - 切换到msOpGen脚本install.sh所在目录
     ```bash
@@ -140,4 +142,5 @@ CANN软件包中提供了工程创建工具msopgen，MatmulAllReduceCustom算子
 ## 更新说明
 | 时间       | 更新事项                     |
 | ---------- | ---------------------------- |
-| 2024/12/19 | 新增样例 |
\ No newline at end of file
+| 2024/12/19 | 新增样例 |
+| 2025/06/09 | 添加算子执行环境备注 |
\ No newline at end of file
-- 
Gitee


From 575b5fc5ddc5d44e0623e284ee891aedf07c8d02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8?= <chenweiheng1@hisilicon.com>
Date: Wed, 11 Jun 2025 02:39:02 +0000
Subject: [PATCH 15/46] =?UTF-8?q?!2676=20update=2017=5Ftiling=5Fsink=20rea?=
 =?UTF-8?q?dme=20Merge=20pull=20request=20!2676=20from=20=E9=99=88?=
 =?UTF-8?q?=E5=A8=81=E4=BA=A8/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../AddCustomTilingSink/PytorchInvocation/README.md    | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/README.md b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/README.md
index 8167f74d0..4c7468bcb 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/README.md
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/PytorchInvocation/README.md
@@ -127,9 +127,17 @@ def AddCustomTilingSink(x: Tensor, y: Tensor, *, dependencies=[], node_name=None
 6.实现自定算子converetr并注册：
 
 ```python
+from typing import (
+    Optional,
+    Union,
+    List,
+)
+from torchair._ge_concrete_graph.fx2ge_converter import register_fx_node_ge_converter
+from torchair.ge._ge_graph import Tensor, TensorSpec
+
 @register_fx_node_ge_converter(torch.ops.air.add_custom_tiling_sink.default)
 def convert_add_custom_tiling_sink(x: torch.Tensor, y: torch.Tensor, meta_outputs: Union[TensorSpec, List[TensorSpec]] = None):
-    return AddCustomTilingSink(x, y) # 此为前面生产的构图api
+    return AddCustomTilingSink(x, y) # 此为前面生成的构图api
 ```
 
 ## 运行样例算子
-- 
Gitee


From 618064c42bbc2ef4150d60083a5db9a0e50c5cd9 Mon Sep 17 00:00:00 2001
From: wangyuqing <wangyuqing33@huawei.com>
Date: Thu, 12 Jun 2025 01:38:08 +0000
Subject: [PATCH 16/46] !2675 update mobilenet_v2_1.0_224.tgz download url
 Merge pull request !2675 from wangyuqing/master

---
 .../9_amct/amct_tensorflow/auto_calibration/README_CN.md    | 2 +-
 .../9_amct/amct_tensorflow/calibration/README_CN.md         | 2 +-
 .../9_amct/amct_tensorflow/cmd/README_CN.md                 | 2 +-
 .../9_amct/amct_tensorflow/convert_model/README_CN.md       | 2 +-
 .../9_amct/amct_tensorflow/mobilenet_v2/README_CN.md        | 6 +++---
 .../9_amct/amct_tensorflow_ascend/mobilenetv2/README_CN.md  | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/level1_single_api/9_amct/amct_tensorflow/auto_calibration/README_CN.md b/python/level1_single_api/9_amct/amct_tensorflow/auto_calibration/README_CN.md
index 1c9649844..1a541b9ca 100644
--- a/python/level1_single_api/9_amct/amct_tensorflow/auto_calibration/README_CN.md
+++ b/python/level1_single_api/9_amct/amct_tensorflow/auto_calibration/README_CN.md
@@ -5,7 +5,7 @@
 ### 1.1 量化前提
 
 + **模型准备**  
-请点击下载 [MobileNet V2](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
+请点击下载 [MobileNet V2](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com:443/resource/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
 
 + **数据集准备**  
 自动量化回退过程中，需要不断的对模型进行校准和测试，因此需要用户准备数据集，本示例所采用的数据集为标准 TFRecord 格式的 ImageNet 的 子集 ILSVRC-2012-CLS 的验证集，共有 50000 张图片，如果采用其他数据集，则需要用户自行修改 sample 文件中的数据预处理部分以匹配模型输入。
diff --git a/python/level1_single_api/9_amct/amct_tensorflow/calibration/README_CN.md b/python/level1_single_api/9_amct/amct_tensorflow/calibration/README_CN.md
index 2eb39eb04..a6ea8bceb 100644
--- a/python/level1_single_api/9_amct/amct_tensorflow/calibration/README_CN.md
+++ b/python/level1_single_api/9_amct/amct_tensorflow/calibration/README_CN.md
@@ -5,7 +5,7 @@
 ### 1.1 量化前提
 
 + **模型准备**  
-请点击下载 [MobileNet V2](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
+请点击下载 [MobileNet V2](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com:443/resource/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
 
 + **数据集准备**  
 使用昇腾模型压缩工具对模型完成量化后，需要对模型进行推理，以测试量化数据的精度。推理过程中需要使用和模型相匹配的数据集。请下载测试图片 [classification.jpg](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/models/mobilenet_v2_calibration/classification.jpg)，并将该图片放到 [data](./data/) 目录下。
diff --git a/python/level1_single_api/9_amct/amct_tensorflow/cmd/README_CN.md b/python/level1_single_api/9_amct/amct_tensorflow/cmd/README_CN.md
index f6795fd29..d4b58cc67 100644
--- a/python/level1_single_api/9_amct/amct_tensorflow/cmd/README_CN.md
+++ b/python/level1_single_api/9_amct/amct_tensorflow/cmd/README_CN.md
@@ -5,7 +5,7 @@
 ### 1.1 量化前提
 
 + **模型准备**  
-请点击下载 [MobileNet V2](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
+请点击下载 [MobileNet V2](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com:443/resource/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
 
 + **准备校准数据集**
 校准集用来产生量化因子，保证精度。计算量化参数的过程被称为“校准 (calibration)”。校准过程需要使用一部分图片来针对性计算量化参数，使用一个或多个 batch 对量化后的网络模型进行推理即可完成校准。为了保证量化精度，校准集与测试精度的数据集来源应一致。
diff --git a/python/level1_single_api/9_amct/amct_tensorflow/convert_model/README_CN.md b/python/level1_single_api/9_amct/amct_tensorflow/convert_model/README_CN.md
index f4efdce91..58ec4706e 100644
--- a/python/level1_single_api/9_amct/amct_tensorflow/convert_model/README_CN.md
+++ b/python/level1_single_api/9_amct/amct_tensorflow/convert_model/README_CN.md
@@ -5,7 +5,7 @@
 ### 1.1 量化前提
 
 + **模型准备**  
-请点击下载 [MobileNet V2](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
+请点击下载 [MobileNet V2](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com:443/resource/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
 
 + **数据集准备**  
 使用昇腾模型压缩工具对模型完成量化后，需要对模型进行推理，以测试量化数据的精度。推理过程中需要使用和模型相匹配的数据集。请下载测试图片 [classification.jpg](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/models/mobilenet_v2_calibration/classification.jpg)，并将该图片放到 [data](./data/) 目录下。
diff --git a/python/level1_single_api/9_amct/amct_tensorflow/mobilenet_v2/README_CN.md b/python/level1_single_api/9_amct/amct_tensorflow/mobilenet_v2/README_CN.md
index f33272652..1ae0c9539 100644
--- a/python/level1_single_api/9_amct/amct_tensorflow/mobilenet_v2/README_CN.md
+++ b/python/level1_single_api/9_amct/amct_tensorflow/mobilenet_v2/README_CN.md
@@ -5,7 +5,7 @@
 ### 1.1 量化前提
 
 + **模型准备**  
-请点击下载 [MobileNet V2](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
+请点击下载 [MobileNet V2](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com:443/resource/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
 
 + **数据集准备**  
 使用昇腾模型压缩工具对模型完成量化后，需要对模型进行推理，以测试量化数据的精度。推理过程中需要使用和模型相匹配的数据集。请下载测试图片 [classification.jpg](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/models/mobilenet_v2_calibration/classification.jpg)，并将该图片放到 [data](./data/) 目录下。
@@ -60,7 +60,7 @@ Quantized Model Prediction:
 ### 2.1 量化前提
 
 + **模型准备**  
-请点击下载 [MobileNet V2](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
+请点击下载 [MobileNet V2](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com:443/resource/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
 
 + **数据集准备**  
 使用昇腾模型压缩工具对模型完成量化后，需要对模型进行推理，以测试量化数据的精度。推理过程中需要使用和模型相匹配的数据集。请下载测试图片 [classification.jpg](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/models/mobilenet_v2_calibration/classification.jpg)，并将该图片放到 [data](./data/) 目录下。
@@ -158,7 +158,7 @@ Quantized Model Prediction:
 ### 4.1 量化前提
 
 + **模型准备**  
-请点击下载 [MobileNet V2](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
+请点击下载 [MobileNet V2](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com:443/resource/mobilenet_v2_1.0_224.tgz) 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放到 [model](./model/) 目录下。
 
 + **数据集准备**  
 自动量化回退过程中，需要不断的对模型进行校准和测试，因此需要用户准备数据集，本示例所采用的数据集为标准 TFRecord 格式的 ImageNet 的 子集 ILSVRC-2012-CLS 的验证集，共有 50000 张图片，如果采用其他数据集，则需要用户自行修改 sample 文件中的数据预处理部分以匹配模型输入。
diff --git a/python/level1_single_api/9_amct/amct_tensorflow_ascend/mobilenetv2/README_CN.md b/python/level1_single_api/9_amct/amct_tensorflow_ascend/mobilenetv2/README_CN.md
index f6619b1b4..06fb0f66f 100644
--- a/python/level1_single_api/9_amct/amct_tensorflow_ascend/mobilenetv2/README_CN.md
+++ b/python/level1_single_api/9_amct/amct_tensorflow_ascend/mobilenetv2/README_CN.md
@@ -7,7 +7,7 @@
 请按照手册准备好环境并安装好amct_tensorflow_ascend工具包。
 ##### 模型准备
 请至
-[Tensorflow-models](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz)
+[Tensorflow-models](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com:443/resource/mobilenet_v2_1.0_224.tgz)
 下载 MobileNetV2 模型文件。解压并将其中的 mobilenet_v2_1.0_224_frozen.pb 文件放入[pre_model](./pre_model)文件夹中。
 ##### 数据集准备
 可以对量化前后的模型进行推理，以测试量化对精度的影响，推理过程中需要使用和模型相匹配的数据集。请下载[测试图片](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/models/mobilenet_v2_calibration/classification.jpg)，并将该图片放到 [data](./data/) 目录下。
-- 
Gitee


From ecb188c9da7b863f1307c706edd18038c9b2ac31 Mon Sep 17 00:00:00 2001
From: alpaca12345UUU <zhanglong170@huawei.com>
Date: Thu, 12 Jun 2025 09:28:28 +0000
Subject: [PATCH 17/46] !2677 add tbufpool sample Merge pull request !2677 from
 alpaca12345UUU/master

---
 .../2_features/2_tbufpool/CMakeLists.txt      |  76 +++++++
 .../ascendc/2_features/2_tbufpool/README.md   |  87 +++++++-
 .../2_features/2_tbufpool/cmake/cpu_lib.cmake |  26 +++
 .../2_features/2_tbufpool/cmake/npu_lib.cmake |  12 +
 .../2_features/2_tbufpool/data_utils.h        | 211 ++++++++++++++++++
 .../ascendc/2_features/2_tbufpool/main.cpp    | 172 ++++++++++++++
 .../op_host/tbufpool_custom_tiling.cpp        |  19 ++
 .../op_host/tbufpool_custom_tiling.h          |  18 ++
 .../2_tbufpool/op_kernel/tbufpool_custom.cpp  |  20 ++
 .../2_tbufpool/op_kernel/tbufpool_custom.h    | 128 +++++++++++
 operator/ascendc/2_features/2_tbufpool/run.sh |  48 ++++
 .../2_features/2_tbufpool/scripts/gen_data.py |  32 +++
 operator/ascendc/2_features/README.md         |   1 +
 13 files changed, 849 insertions(+), 1 deletion(-)
 create mode 100644 operator/ascendc/2_features/2_tbufpool/CMakeLists.txt
 create mode 100644 operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake
 create mode 100644 operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake
 create mode 100644 operator/ascendc/2_features/2_tbufpool/data_utils.h
 create mode 100644 operator/ascendc/2_features/2_tbufpool/main.cpp
 create mode 100644 operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp
 create mode 100644 operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h
 create mode 100644 operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp
 create mode 100644 operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h
 create mode 100644 operator/ascendc/2_features/2_tbufpool/run.sh
 create mode 100644 operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py

diff --git a/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt b/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt
new file mode 100644
index 000000000..060c0adc0
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+# ======================================================================================================================
+
+cmake_minimum_required(VERSION 3.16)
+project(Ascend_c)
+if(NOT DEFINED RUN_MODE)  # apply the default only when the caller did not pass -DRUN_MODE
+    set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+endif()
+if(NOT DEFINED SOC_VERSION)  # apply the default only when the caller did not pass -DSOC_VERSION
+    set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type")
+endif()
+
+set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory")
+
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Debug"  CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
+endif()
+
+if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out"  CACHE STRING "path for install()" FORCE)
+endif()
+
+file(GLOB KERNEL_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/tbufpool_custom.cpp
+)
+set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3")
+
+if("${RUN_MODE}" STREQUAL "cpu")
+    include(cmake/cpu_lib.cmake)
+elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu")
+    include(cmake/npu_lib.cmake)
+else()
+    message("invalid RUN_MODE: ${RUN_MODE}")
+endif()
+
+add_executable(tbufpool_direct_kernel_op
+    ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/op_host/tbufpool_custom_tiling.cpp
+)
+
+target_compile_options(tbufpool_direct_kernel_op PRIVATE
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:-g>>
+    -O2
+    -std=c++17
+    -D_GLIBCXX_USE_CXX11_ABI=0
+)
+
+target_compile_definitions(tbufpool_direct_kernel_op PRIVATE
+    $<$<BOOL:$<IN_LIST:${SOC_VERSION},${CUSTOM_ASCEND310P_LIST}>>:CUSTOM_ASCEND310P>
+)
+
+target_include_directories(tbufpool_direct_kernel_op PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:${ASCEND_CANN_PACKAGE_PATH}/include>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>>
+)
+
+target_link_libraries(tbufpool_direct_kernel_op PRIVATE
+    $<BUILD_INTERFACE:$<$<OR:$<STREQUAL:${RUN_MODE},npu>,$<STREQUAL:${RUN_MODE},sim>>:host_intf_pub>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:tikicpulib::${SOC_VERSION}>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:ascendcl>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:c_sec>>
+    ascendc_kernels_${RUN_MODE}
+    tiling_api
+    register
+    platform
+    ascendalog
+    dl
+    graph_base
+)
+
+install(TARGETS tbufpool_direct_kernel_op
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
\ No newline at end of file
diff --git a/operator/ascendc/2_features/2_tbufpool/README.md b/operator/ascendc/2_features/2_tbufpool/README.md
index 5af80e6c5..b87611f13 100644
--- a/operator/ascendc/2_features/2_tbufpool/README.md
+++ b/operator/ascendc/2_features/2_tbufpool/README.md
@@ -1 +1,86 @@
-tbufpool（待补充）
\ No newline at end of file
+## 目录结构介绍
+```
+├── 2_tbufpool
+│   ├── cmake                               // 编译工程文件
+│   ├── op_host                             // 本样例tiling代码实现
+│   │   ├── tbufpool_custom_tiling.cpp
+│   │   ├── tbufpool_custom_tiling.h
+│   ├── op_kernel                           // 本样例kernel侧代码实现
+│   │   ├── tbufpool_custom.cpp
+│   │   ├── tbufpool_custom.h
+│   ├── scripts
+│   │   ├── gen_data.py                     // 输入数据和真值数据生成脚本    
+│   ├── CMakeLists.txt                      // 编译工程文件
+│   ├── data_utils.h                        // 数据读入写出函数
+│   ├── main.cpp                            // 主函数，调用算子的应用程序，含CPU域及NPU域调用
+│   └── run.sh                              // 编译运行算子的脚本
+```
+## 代码实现介绍
+数据量较大且内存有限时，无法一次完成所有数据搬运，需要拆分成多个阶段计算，每次计算使用其中的一部分数据，可以通过TBufPool资源池进行内存地址复用。本例中，通过调用InitBufPool基础API对Add算子和Sub算子实现过程进行内存管理。从TPipe划分出资源池tbufPool0，tbufPool0为src0Gm分配空间后，继续分配了资源池tbufPool1，指定tbufPool1与tbufPool2复用并分别运用于第一、二轮计算，此时tbufPool1及tbufPool2共享起始地址及长度。
+
+- kernel实现  
+  Add算子的数学表达式为：
+  ```
+  z = x + y
+  ```
+  Sub算子的数学表达式为：
+  ```
+  z = x - y
+  ```
+
+  计算逻辑是：Ascend C提供的矢量计算接口的操作元素都为LocalTensor，首先启用tbufPool1，将部分输入数据src0Gm，部分输入数据src1Gm搬运进片上储存，调用计算接口完成相加计算，搬出到外部存储上。之后切换到tbufPool2进行剩余数据相减计算，得到最终结果，再搬出到外部存储上。  
+
+  本样例算子的实现流程分为6个基本任务：CopyIn，Compute，CopyOut，CopyIn1，Compute1，CopyOut1。
+  - CopyIn任务负责将Global Memory上的部分输入Tensor src0Gm和src1Gm搬运到Local Memory，分别存储在src0Local、src1Local；
+  - Compute任务负责对src0Local、src1Local执行加法操作，计算结果存储在dstLocal中；
+  - CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm的前半部分；
+  - CopyIn1任务负责将Global Memory上的剩余输入Tensor src0Gm和src1Gm搬运到Local Memory，分别存储在src0Local、src1Local；
+  - Compute1任务负责对src0Local、src1Local执行剩余数据减法操作，计算结果存储在dstLocal中；
+  - CopyOut1任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm的后半部分。
+
+- 调用实现
+  1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成；
+  2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。
+
+  应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。
+
+## 运行样例算子
+  - 打开样例目录   
+    以命令行方式下载样例代码，master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/2_features/2_tbufpool
+    ```
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量的命令。
+    - 默认路径，root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+      ```
+    - 默认路径，非root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+      ```
+    - 指定路径install_path，安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+      ```
+    
+    配置仿真模式日志文件目录，默认为sim_log。
+    ```bash
+    export CAMODEL_LOG_PATH=./sim_log
+    ```
+
+  - 样例执行
+
+    ```bash
+    bash run.sh -r [RUN_MODE] -v  [SOC_VERSION]
+    ```
+    - RUN_MODE：编译方式，可选择CPU调试，NPU仿真，NPU上板。支持参数为[cpu / sim / npu]。
+    - SOC_VERSION：昇腾AI处理器型号，如果无法确定具体的[SOC_VERSION]，则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询，在查询到的“Name”前增加Ascend信息，例如“Name”对应取值为xxxyy，实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号：
+      - Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+    示例如下，Ascendxxxyy请替换为实际的AI处理器型号。
+    ```bash
+    bash run.sh -r cpu -v Ascendxxxyy
+    ```
\ No newline at end of file
diff --git a/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake b/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake
new file mode 100644
index 000000000..693f15ac1
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake
@@ -0,0 +1,26 @@
+# CPU-debug build of the kernel library.
+# Unless the CMAKE_PREFIX_PATH environment variable is defined, point CMake at
+# the tikicpulib package config shipped under the CANN package path.
+if(NOT DEFINED ENV{CMAKE_PREFIX_PATH})
+    set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake)
+endif()
+find_package(tikicpulib REQUIRED)
+
+# KERNEL_FILES is expected to be set by the including CMakeLists.txt.
+add_library(ascendc_kernels_${RUN_MODE} SHARED
+    ${KERNEL_FILES}
+)
+
+target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE
+    tikicpulib::${SOC_VERSION}
+)
+
+# Define CUSTOM_ASCEND310P when SOC_VERSION is one of CUSTOM_ASCEND310P_LIST.
+target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE
+    $<$<BOOL:$<IN_LIST:${SOC_VERSION},${CUSTOM_ASCEND310P_LIST}>>:CUSTOM_ASCEND310P>
+)
+
+# Debug info and no optimisation for the CPU-debug kernel library.
+target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE
+    -g
+    -O0
+    -std=c++17
+)
+
+install(TARGETS ascendc_kernels_${RUN_MODE}
+DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
\ No newline at end of file
diff --git a/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake b/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake
new file mode 100644
index 000000000..8ad136f38
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake
@@ -0,0 +1,12 @@
+# NPU / simulator build of the kernel library.
+# Look for the ascendc_kernel_cmake directory under compiler/ first, then under
+# tools/, inside the CANN package; abort configuration if neither exists.
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed")
+endif()
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+# ascendc_library() comes from the ascendc.cmake included above; KERNEL_FILES is
+# expected to be set by the including CMakeLists.txt.
+ascendc_library(ascendc_kernels_${RUN_MODE} STATIC
+    ${KERNEL_FILES}
+)
\ No newline at end of file
diff --git a/operator/ascendc/2_features/2_tbufpool/data_utils.h b/operator/ascendc/2_features/2_tbufpool/data_utils.h
new file mode 100644
index 000000000..05590dd72
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/data_utils.h
@@ -0,0 +1,211 @@
+/**
+ * @file data_utils.h
+ *
+ * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef DATA_UTILS_H
+#define DATA_UTILS_H
+#include <iostream>
+#include <fstream>
+#include <cstdio>
+#include <string>
+#include <vector>
+#include <iomanip>
+#include <cassert>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#ifndef ASCENDC_CPU_DEBUG
+#include "acl/acl.h"
+#endif
+
+// Element-type selector passed to PrintData().
+// NOTE(review): the numeric values look like they mirror a framework data-type
+// enumeration (gaps at 5, 14, 15, ...) - confirm before renumbering.
+typedef enum {
+    DT_UNDEFINED = -1,
+    FLOAT = 0,
+    HALF = 1,
+    INT8_T = 2,
+    INT32_T = 3,
+    UINT8_T = 4,
+    INT16_T = 6,
+    UINT16_T = 7,
+    UINT32_T = 8,
+    INT64_T = 9,
+    UINT64_T = 10,
+    DOUBLE = 11,
+    BOOL = 12,
+    STRING = 13,
+    COMPLEX64 = 16,
+    COMPLEX128 = 17,
+    BF16 = 27
+} printDataType;
+
+// printf-style logging helpers; all three levels write to stdout and append a newline.
+#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO]  " fmt "\n", ##args)
+#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN]  " fmt "\n", ##args)
+#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR]  " fmt "\n", ##args)
+
+#ifndef ASCENDC_CPU_DEBUG
+/**
+ * @brief Evaluate an ACL runtime call once and report a failing status on stderr.
+ *
+ * The macro only logs (file, line, error code); it neither aborts nor returns.
+ * It expands to a single statement WITHOUT a trailing semicolon, so that
+ * `if (c) CHECK_ACL(f()); else ...` parses as intended. The argument is
+ * parenthesised so that any expression can be passed safely.
+ */
+#define CHECK_ACL(x)                                                                        \
+    do {                                                                                    \
+        aclError __ret = (x);                                                               \
+        if (__ret != ACL_ERROR_NONE) {                                                      \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+        }                                                                                   \
+    } while (0)
+#endif
+
+/**
+* @brief Read a whole regular file into a caller-supplied buffer
+* @param [in] filePath: file path
+* @param [out] fileSize: number of bytes read; written only on success
+* @param [out] buffer: destination buffer, must not be nullptr
+* @param [in] bufferSize: capacity of buffer in bytes
+* @return true on success; false if the path is not a regular file, cannot be
+*         opened, is empty, does not fit into the buffer, or cannot be read completely
+*/
+inline bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Read file failed. buffer is nullptr");
+        return false;
+    }
+
+    struct stat sBuf;
+    int fileStatus = stat(filePath.c_str(), &sBuf);
+    if (fileStatus == -1) {
+        ERROR_LOG("failed to get file");
+        return false;
+    }
+    if (S_ISREG(sBuf.st_mode) == 0) {
+        ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+        return false;
+    }
+
+    // The stream closes itself when it goes out of scope on every return path.
+    std::ifstream file;
+    file.open(filePath, std::ios::binary);
+    if (!file.is_open()) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    std::filebuf *buf = file.rdbuf();
+    // A failed seek yields -1; check it before converting to an unsigned size.
+    const std::streamoff endPos = buf->pubseekoff(0, std::ios::end, std::ios::in);
+    if (endPos < 0) {
+        ERROR_LOG("Get file size failed. path = %s", filePath.c_str());
+        return false;
+    }
+    const size_t size = static_cast<size_t>(endPos);
+    if (size == 0) {
+        ERROR_LOG("file size is 0");
+        return false;
+    }
+    if (size > bufferSize) {
+        ERROR_LOG("file size is larger than buffer size");
+        return false;
+    }
+    buf->pubseekpos(0, std::ios::in);
+    const std::streamsize readSize = buf->sgetn(static_cast<char *>(buffer), static_cast<std::streamsize>(size));
+    if (readSize < 0 || static_cast<size_t>(readSize) != size) {
+        ERROR_LOG("Read file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    fileSize = size;
+    return true;
+}
+
+/**
+* @brief Write data to file (the file is created or truncated, mode 0600)
+* @param [in] filePath: file path
+* @param [in] buffer: data to write to file, must not be nullptr
+* @param [in] size: size to write
+* @return true when all size bytes were written, false otherwise
+*/
+inline bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Write file failed. buffer is nullptr");
+        return false;
+    }
+
+    int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+    if (fd < 0) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    // write() may transfer fewer bytes than requested, so continue until the
+    // whole buffer is on disk or the call reports an error / no progress.
+    const char *cursor = static_cast<const char *>(buffer);
+    size_t remaining = size;
+    while (remaining > 0) {
+        const ssize_t written = write(fd, cursor, remaining);
+        if (written <= 0) {
+            break;
+        }
+        cursor += written;
+        remaining -= static_cast<size_t>(written);
+    }
+    (void) close(fd);
+    if (remaining != 0) {
+        ERROR_LOG("Write file Failed.");
+        return false;
+    }
+
+    return true;
+}
+
+/**
+* @brief Print count elements to stdout, each right-aligned in a 10-column field
+* @param [in] data: elements to print
+* @param [in] count: number of elements
+* @param [in] elementsPerRow: a line break is emitted after every elementsPerRow elements; must not be 0
+*/
+template <typename T>
+void DoPrintData(const T *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        // Unary plus promotes bool/int8_t/uint8_t to int so they are printed as
+        // numbers instead of raw characters; wider types print as before.
+        std::cout << std::setw(10) << +data[i];
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+#ifndef ASCENDC_CPU_DEBUG
+/**
+* @brief Print count half-precision elements to stdout after converting each to float
+* @param [in] data: aclFloat16 elements to print
+* @param [in] count: number of elements
+* @param [in] elementsPerRow: a line break is emitted after every elementsPerRow elements; must not be 0
+*/
+// inline: this function is defined in a header and must not produce duplicate
+// symbols when the header is included from more than one translation unit.
+inline void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]);
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+#endif
+
+/**
+* @brief Print a raw buffer to stdout, interpreting it as count elements of dataType
+* @param [in] data: buffer to print; an error is logged and nothing is printed when nullptr
+* @param [in] count: number of elements in data
+* @param [in] dataType: element type; HALF is only available when ASCENDC_CPU_DEBUG is not defined,
+*                       any other unsupported value is reported through ERROR_LOG
+* @param [in] elementsPerRow: number of elements printed per line
+*/
+// inline: defined in a header, so it must be safe to include from several translation units.
+inline void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow=16)
+{
+    if (data == nullptr) {
+        ERROR_LOG("Print data failed. data is nullptr");
+        return;
+    }
+
+    switch (dataType) {
+        case BOOL:
+            DoPrintData(static_cast<const bool *>(data), count, elementsPerRow);
+            break;
+        case INT8_T:
+            DoPrintData(static_cast<const int8_t *>(data), count, elementsPerRow);
+            break;
+        case UINT8_T:
+            DoPrintData(static_cast<const uint8_t *>(data), count, elementsPerRow);
+            break;
+        case INT16_T:
+            DoPrintData(static_cast<const int16_t *>(data), count, elementsPerRow);
+            break;
+        case UINT16_T:
+            DoPrintData(static_cast<const uint16_t *>(data), count, elementsPerRow);
+            break;
+        case INT32_T:
+            DoPrintData(static_cast<const int32_t *>(data), count, elementsPerRow);
+            break;
+        case UINT32_T:
+            DoPrintData(static_cast<const uint32_t *>(data), count, elementsPerRow);
+            break;
+        case INT64_T:
+            DoPrintData(static_cast<const int64_t *>(data), count, elementsPerRow);
+            break;
+        case UINT64_T:
+            DoPrintData(static_cast<const uint64_t *>(data), count, elementsPerRow);
+            break;
+#ifndef ASCENDC_CPU_DEBUG
+        case HALF:
+            DoPrintHalfData(static_cast<const aclFloat16 *>(data), count, elementsPerRow);
+            break;
+#endif
+        case FLOAT:
+            DoPrintData(static_cast<const float *>(data), count, elementsPerRow);
+            break;
+        case DOUBLE:
+            DoPrintData(static_cast<const double *>(data), count, elementsPerRow);
+            break;
+        default:
+            ERROR_LOG("Unsupported type: %d", dataType);
+            break;
+    }
+    std::cout << std::endl;
+}
+#endif // DATA_UTILS_H
diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp
new file mode 100644
index 000000000..ba4f849dd
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/main.cpp
@@ -0,0 +1,172 @@
+/**
+ * @file main.cpp
+ *
+ * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include "data_utils.h"
+#include "./op_host/tbufpool_custom_tiling.h"
+#ifndef ASCENDC_CPU_DEBUG
+#include "acl/acl.h"
+#include "aclrtlaunch_tbufpool_custom.h"
+#include "tiling/platform/platform_ascendc.h"
+#else
+#include "tikicpulib.h"
+extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_ADDR zAdd, TbufPoolTilingData tiling);
+#endif
+
+namespace {
+constexpr uint32_t USED_CORE_NUM = 1;
+constexpr uint32_t TOTAL_LENGTH = 2048;
+constexpr uint32_t DST_LENGTH = 1024;
+constexpr uint32_t TILING_SIZE = 1;
+}
+
+extern void GenerateTilingData(const uint32_t totalLength, uint8_t *tilingBuf);
+
+/**
+ * @brief Compare kernel output with ../output/golden.bin, element by element as float.
+ *
+ * An element counts as wrong when both its absolute and its relative error are
+ * not within 1e-4 (a NaN result therefore counts as wrong too).
+ *
+ * @param [in] outputData: kernel output buffer holding outSize bytes of float data
+ * @param [in] outSize: size of outputData in bytes; golden.bin must have exactly this size
+ * @return true when golden.bin could be read and every element matches, false otherwise
+ */
+static bool CompareResult(const void *outputData, int64_t outSize) {
+    if (outputData == nullptr || outSize <= 0) {
+        printf("test failed!\n");
+        return false;
+    }
+    const size_t expectedSize = static_cast<size_t>(outSize);
+
+    void *goldenData = nullptr;
+#ifdef ASCENDC_CPU_DEBUG
+    goldenData = (uint8_t *)AscendC::GmAlloc(outSize);
+#else
+    CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize));
+#endif
+
+    size_t goldenSize = 0;
+    bool matched = (goldenData != nullptr) &&
+                   ReadFile("../output/golden.bin", goldenSize, goldenData, expectedSize);
+    if (matched) {
+        printf("ReadFile golden.bin success!\n");
+    } else {
+        printf("test failed!\n");
+    }
+    // A shorter golden file would leave the tail of goldenData uninitialised.
+    if (matched && goldenSize != expectedSize) {
+        printf("golden.bin size mismatch!\n");
+        matched = false;
+    }
+
+    if (matched) {
+        constexpr float EPS = 1e-4;
+        const float *output = static_cast<const float *>(outputData);
+        const float *golden = static_cast<const float *>(goldenData);
+        const size_t elementCount = expectedSize / sizeof(float);
+        int64_t wrongNum = 0;
+
+        for (size_t i = 0; i < elementCount; i++) {
+            float a = output[i];
+            float b = golden[i];
+            float ae = std::abs(a - b);
+            float re = ae / std::abs(b);
+            // Written as a negation so that NaN errors are reported instead of slipping through.
+            if (!(ae <= EPS || re <= EPS)) {
+                printf(" %lf CompareResult failed output is %lf, golden is %lf\n", static_cast<float>(i), a, b);
+                wrongNum++;
+            }
+        }
+        matched = (wrongNum == 0);
+    }
+
+    // Release the golden buffer on every path, including read failures.
+    if (goldenData != nullptr) {
+#ifdef ASCENDC_CPU_DEBUG
+        AscendC::GmFree((void *)goldenData);
+#else
+        CHECK_ACL(aclrtFreeHost(goldenData));
+#endif
+    }
+
+    if (matched) {
+        printf("CompareResult golden.bin success!\n");
+    }
+    return matched;
+}
+
+/**
+ * @brief Host program: run tbufpool_custom once and verify its output against golden.bin.
+ *
+ * With ASCENDC_CPU_DEBUG the kernel is run through ICPU_RUN_KF, otherwise it is
+ * launched through ACLRT_LAUNCH_KERNEL. Inputs are read from ../input, the result
+ * is written to ../output/output.bin. The kernel is skipped when the inputs
+ * cannot be read completely.
+ *
+ * @return 0 when the output matches the golden data, 1 otherwise
+ */
+int32_t main(int32_t argc, char *argv[]) {
+    // The tiling buffer is reinterpreted as TbufPoolTilingData below, so it must be big enough.
+    static_assert(TILING_SIZE * sizeof(uint32_t) >= sizeof(TbufPoolTilingData), "tiling buffer is too small");
+    size_t tilingSize = TILING_SIZE * sizeof(uint32_t);
+    size_t inputSize = TOTAL_LENGTH * sizeof(float);
+    size_t outputSizeAdd = inputSize;
+    // ReadFile() overwrites its fileSize argument, so inputSize itself must not be passed for it.
+    size_t xFileSize = 0;
+    size_t yFileSize = 0;
+    bool goldenResult = false;
+
+#ifdef ASCENDC_CPU_DEBUG
+    uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize);
+    uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputSize);
+    uint8_t *zAdd = (uint8_t *)AscendC::GmAlloc(outputSizeAdd);
+    uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize);
+
+    bool inputReady = ReadFile("../input/input_x.bin", xFileSize, x, inputSize) &&
+                      ReadFile("../input/input_y.bin", yFileSize, y, inputSize) &&
+                      xFileSize == inputSize && yFileSize == inputSize;
+    if (inputReady) {
+        GenerateTilingData(TOTAL_LENGTH, tiling);
+
+        AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode
+
+        ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, *reinterpret_cast<TbufPoolTilingData *>(tiling)); // use this macro for cpu debug
+
+        WriteFile("../output/output.bin", zAdd, outputSizeAdd);
+
+        goldenResult = CompareResult(zAdd, outputSizeAdd);
+    } else {
+        ERROR_LOG("Read input data failed, kernel is not executed");
+    }
+
+    AscendC::GmFree((void *)x);
+    AscendC::GmFree((void *)y);
+    AscendC::GmFree((void *)zAdd);
+    AscendC::GmFree((void *)tiling);
+#else
+    CHECK_ACL(aclInit(nullptr));
+    int32_t deviceId = 0;
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    aclrtStream stream = nullptr;
+    CHECK_ACL(aclrtCreateStream(&stream));
+
+    uint8_t *xHost = nullptr;
+    uint8_t *yHost = nullptr;
+    uint8_t *zHostAdd = nullptr;
+    uint8_t *tiling = nullptr;
+    uint8_t *xDevice = nullptr;
+    uint8_t *yDevice = nullptr;
+    uint8_t *zDeviceAdd = nullptr;
+
+    CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize));
+    CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputSize));
+    CHECK_ACL(aclrtMallocHost((void **)(&zHostAdd), outputSizeAdd));
+    CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize));
+
+    CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMalloc((void **)&yDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMalloc((void **)&zDeviceAdd, outputSizeAdd, ACL_MEM_MALLOC_HUGE_FIRST));
+
+    bool inputReady = ReadFile("../input/input_x.bin", xFileSize, xHost, inputSize) &&
+                      ReadFile("../input/input_y.bin", yFileSize, yHost, inputSize) &&
+                      xFileSize == inputSize && yFileSize == inputSize;
+    if (inputReady) {
+        GenerateTilingData(TOTAL_LENGTH, tiling);
+
+        // Copy host memory to device memory
+        CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE));
+        CHECK_ACL(aclrtMemcpy(yDevice, inputSize, yHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+        // Execute the kernel
+        ACLRT_LAUNCH_KERNEL(tbufpool_custom)
+        (USED_CORE_NUM, stream, xDevice, yDevice, zDeviceAdd, reinterpret_cast<TbufPoolTilingData *>(tiling));
+
+        // Wait for the stop event to complete
+        CHECK_ACL(aclrtSynchronizeStream(stream));
+
+        // Copy result to host memory and write to output file
+        CHECK_ACL(aclrtMemcpy(zHostAdd, outputSizeAdd, zDeviceAdd, outputSizeAdd, ACL_MEMCPY_DEVICE_TO_HOST));
+        WriteFile("../output/output.bin", zHostAdd, outputSizeAdd);
+
+        // Compare the result with the golden result
+        goldenResult = CompareResult(zHostAdd, outputSizeAdd);
+    } else {
+        ERROR_LOG("Read input data failed, kernel is not executed");
+    }
+
+    // Clean up memory
+    CHECK_ACL(aclrtFree(xDevice));
+    CHECK_ACL(aclrtFree(yDevice));
+    CHECK_ACL(aclrtFree(zDeviceAdd));
+
+    CHECK_ACL(aclrtFreeHost(xHost));
+    CHECK_ACL(aclrtFreeHost(yHost));
+    CHECK_ACL(aclrtFreeHost(zHostAdd));
+
+    CHECK_ACL(aclrtFreeHost(tiling));
+
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+#endif
+
+    if (goldenResult) {
+        printf("test pass!\n");
+    } else {
+        printf("test failed!\n");
+    }
+    // Report the verification result to the caller (scripts, CI) through the exit code.
+    return goldenResult ? 0 : 1;
+}
+  
\ No newline at end of file
diff --git a/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp
new file mode 100644
index 000000000..0bc2f1c1d
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp
@@ -0,0 +1,19 @@
+/**
+ * @file tbufpool_custom_tiling.cpp
+ *
+ * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include "tiling/tiling_api.h"
+#include "tbufpool_custom_tiling.h"
+
+
+/**
+ * @brief Fill the host-side tiling buffer for the tbufpool_custom kernel.
+ * @param [in] totalLength: value stored into TbufPoolTilingData::totalLength
+ * @param [out] tilingBuf: buffer of at least sizeof(TbufPoolTilingData) bytes;
+ *                         the call is a no-op when it is nullptr
+ */
+void GenerateTilingData(uint32_t totalLength, uint8_t* tilingBuf)
+{
+    // The caller allocates this buffer and only logs allocation errors, so guard against nullptr.
+    if (tilingBuf == nullptr) {
+        return;
+    }
+    TbufPoolTilingData *tiling = reinterpret_cast<TbufPoolTilingData *>(tilingBuf);
+    tiling->totalLength = totalLength;
+}
\ No newline at end of file
diff --git a/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h
new file mode 100644
index 000000000..63c60d78c
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h
@@ -0,0 +1,18 @@
+/**
+ * @file tbufpool_custom_tiling.h
+ *
+ * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef EXAMPLES_ACTIVATION_TBUFPOOL_CUSTOM_TILING_H
+#define EXAMPLES_ACTIVATION_TBUFPOOL_CUSTOM_TILING_H
+#include <cstdint>
+
+// Tiling parameters handed from the host program to the tbufpool_custom kernel.
+struct TbufPoolTilingData {
+    uint32_t totalLength; // filled by GenerateTilingData() from its totalLength argument
+};
+#endif
diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp
new file mode 100644
index 000000000..d17a4d185
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp
@@ -0,0 +1,20 @@
+/**
+ * @file tbufpool_custom.cpp
+ *
+ * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include "./tbufpool_custom.h"
+#include "kernel_operator.h"
+
+// Kernel entry point: creates a TPipe, initialises TbufPoolImpl with the two
+// source addresses, the destination address and the tiling data, then runs
+// its Process() (Add stage followed by Sub stage).
+extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR src0Gm, GM_ADDR src1Gm, GM_ADDR dstGm, TbufPoolTilingData tiling)
+{
+    AscendC::TPipe pipe;
+    MyCustomKernel::TbufPoolImpl op;
+    op.Init(src0Gm, src1Gm, dstGm, tiling, &pipe);
+    op.Process();
+}
\ No newline at end of file
diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h
new file mode 100644
index 000000000..9c3559512
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h
@@ -0,0 +1,128 @@
+/**
+ * @file tbufpool_custom.h
+ *
+ * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef EXAMPLES_ACTIVATION_INITBUFPOOL_CUSTOM_H
+#define EXAMPLES_ACTIVATION_INITBUFPOOL_CUSTOM_H
+#include "../op_host/tbufpool_custom_tiling.h"
+#include "kernel_operator.h"
+
+
+namespace MyCustomKernel {
+constexpr int32_t BUFFER_NUM = 1; // buffer count passed when srcQue0 is initialised from tbufPool0
+constexpr int32_t BUFFER_NUM_T1 = 1; // buffer count for the queues initialised from tbufPool1
+constexpr int32_t BUFFER_NUM_T2 = 1; // buffer count for the queues initialised from tbufPool2
+constexpr int32_t BUFFER_LENGTH = 4096 * sizeof(float); // bytes requested from the pipe for tbufPool0
+constexpr int32_t BUFF_POOL_LENGTH = 2048 * sizeof(float); // bytes for srcQue0 and for each sub-pool
+constexpr int32_t INIT_TENSOR_LENGTH = 1024 * sizeof(float); // bytes per queue inside a sub-pool
+constexpr int32_t COMPUTE_LENGTH = 1024; // float elements copied and computed per stage
+
+// Two-stage sample kernel: stage 1 adds the first COMPUTE_LENGTH elements of the
+// inputs using the queues of tbufPool1, stage 2 subtracts the next COMPUTE_LENGTH
+// elements using the queues of tbufPool2. tbufPool2 is initialised with tbufPool1
+// as an extra argument; per the sample README the two pools then share the same
+// start address and length.
+class TbufPoolImpl {
+    public:
+        __aicore__ inline TbufPoolImpl() {}
+        // Binds the global tensors and carves up the buffer pools and queues.
+        // The tiling argument is accepted but not read here.
+        __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm, 
+                                     TbufPoolTilingData tiling, AscendC::TPipe* pipeIn)
+        {
+            pipe = pipeIn;
+            src0Global.SetGlobalBuffer((__gm__ float*)src0Gm);
+            src1Global.SetGlobalBuffer((__gm__ float*)src1Gm);
+            dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm);
+            pipe->InitBufPool(tbufPool0, BUFFER_LENGTH);
+            tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0
+            tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH);
+            tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); // tbufPool2 reuses tbufPool1 (see README)
+            tbufPool1.InitBuffer(srcQue1, BUFFER_NUM_T1, INIT_TENSOR_LENGTH);
+            tbufPool1.InitBuffer(dstQue0, BUFFER_NUM_T1, INIT_TENSOR_LENGTH);
+            tbufPool2.InitBuffer(srcQue2, BUFFER_NUM_T2, INIT_TENSOR_LENGTH);
+            tbufPool2.InitBuffer(dstQue1, BUFFER_NUM_T2, INIT_TENSOR_LENGTH);
+        }
+        // Runs the Add stage, resets tbufPool1, runs the Sub stage, then resets
+        // tbufPool2 and tbufPool0.
+        // NOTE(review): Reset() presumably releases the pool's buffers so the shared
+        // memory can be reused by the other pool - confirm against the TBufPool docs.
+        __aicore__ inline void Process()
+        {
+            //stage 1
+            CopyIn();
+            Compute();
+            CopyOut();
+            tbufPool1.Reset();
+            //stage 2
+            CopyIn1();
+            Compute1();
+            CopyOut1();
+            tbufPool2.Reset();
+            tbufPool0.Reset();
+        }
+  
+    private:
+        // Stage 1: copy the first COMPUTE_LENGTH elements of both inputs into
+        // local tensors from srcQue0 / srcQue1 and enqueue them.
+        __aicore__ inline void CopyIn()
+        {
+            AscendC::LocalTensor<float> src0Local = srcQue0.AllocTensor<float>();
+            AscendC::LocalTensor<float> src1Local = srcQue1.AllocTensor<float>();
+            AscendC::DataCopy(src0Local, src0Global, COMPUTE_LENGTH);
+            AscendC::DataCopy(src1Local, src1Global, COMPUTE_LENGTH);
+            srcQue0.EnQue(src0Local);
+            srcQue1.EnQue(src1Local);
+        }
+        // Stage 1: dst = src0 + src1; the result goes to dstQue0 and both source tensors are freed.
+        __aicore__ inline void Compute()
+        {
+            AscendC::LocalTensor<float> src0Local = srcQue0.DeQue<float>();
+            AscendC::LocalTensor<float> src1Local = srcQue1.DeQue<float>();
+            AscendC::LocalTensor<float> dstLocal = dstQue0.AllocTensor<float>();
+            AscendC::Add(dstLocal, src0Local, src1Local, COMPUTE_LENGTH);
+            dstQue0.EnQue<float>(dstLocal);
+            srcQue0.FreeTensor(src0Local);
+            srcQue1.FreeTensor(src1Local);
+        }
+        // Stage 1: copy the sum to the start of dstGlobal.
+        __aicore__ inline void CopyOut()
+        {
+            AscendC::LocalTensor<float> dstLocal = dstQue0.DeQue<float>();
+            AscendC::DataCopy(dstGlobal, dstLocal, COMPUTE_LENGTH);
+            dstQue0.FreeTensor(dstLocal);
+        }
+        // Stage 2: copy the elements starting at offset COMPUTE_LENGTH of both
+        // inputs into local tensors from srcQue0 / srcQue2 and enqueue them.
+        __aicore__ inline void CopyIn1()
+        {
+            AscendC::LocalTensor<float> src0Local = srcQue0.AllocTensor<float>();
+            AscendC::LocalTensor<float> src1Local = srcQue2.AllocTensor<float>();
+            AscendC::DataCopy(src0Local, src0Global[COMPUTE_LENGTH], COMPUTE_LENGTH);
+            AscendC::DataCopy(src1Local, src1Global[COMPUTE_LENGTH], COMPUTE_LENGTH);
+            srcQue0.EnQue(src0Local);
+            srcQue2.EnQue(src1Local);
+        }
+        // Stage 2: dst = src0 - src1; the result goes to dstQue1 and both source tensors are freed.
+        __aicore__ inline void Compute1()
+        {
+            AscendC::LocalTensor<float> src0Local = srcQue0.DeQue<float>();
+            AscendC::LocalTensor<float> src1Local = srcQue2.DeQue<float>();
+            AscendC::LocalTensor<float> dstLocal = dstQue1.AllocTensor<float>();
+            AscendC::Sub(dstLocal, src0Local, src1Local, COMPUTE_LENGTH);
+            dstQue1.EnQue<float>(dstLocal);
+            srcQue0.FreeTensor(src0Local);
+            srcQue2.FreeTensor(src1Local);
+        }
+        // Stage 2: copy the difference to dstGlobal at offset COMPUTE_LENGTH.
+        __aicore__ inline void CopyOut1()
+        {
+            AscendC::LocalTensor<float> dstLocal = dstQue1.DeQue<float>();
+            AscendC::DataCopy(dstGlobal[COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH);
+            dstQue1.FreeTensor(dstLocal);
+        }
+    private:
+        AscendC::TPipe* pipe; // not owned; supplied by the kernel entry through Init()
+        AscendC::TBufPool<AscendC::TPosition::VECCALC> tbufPool0; 
+        AscendC::TBufPool<AscendC::TPosition::VECCALC> tbufPool1; 
+        AscendC::TBufPool<AscendC::TPosition::VECCALC> tbufPool2;
+        AscendC::TQue<AscendC::TPosition::VECIN, 1> srcQue0; 
+        AscendC::TQue<AscendC::TPosition::VECIN, 1> srcQue1; 
+        AscendC::TQue<AscendC::TPosition::VECIN, 1> srcQue2;
+        AscendC::TQue<AscendC::TPosition::VECOUT, 1> dstQue0; 
+        AscendC::TQue<AscendC::TPosition::VECOUT, 1> dstQue1;
+        AscendC::GlobalTensor<float> src0Global; 
+        AscendC::GlobalTensor<float> src1Global; 
+        AscendC::GlobalTensor<float> dstGlobal;
+    };
+}// namespace MyCustomKernel
+
+#endif
+    
\ No newline at end of file
diff --git a/operator/ascendc/2_features/2_tbufpool/run.sh b/operator/ascendc/2_features/2_tbufpool/run.sh
new file mode 100644
index 000000000..5ae89dbe9
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/run.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Build and run the tbufpool kernel-launch sample.
+# Usage: bash run.sh -r [cpu|sim|npu] -v [SOC_VERSION]
+# Must be started from the sample directory: it uses ./scripts, ./input,
+# ./output and ./build relative to the current working directory.
+SHORT=r:,v:,
+LONG=run-mode:,soc-version:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+while :
+do
+    case "$1" in
+        (-r | --run-mode )
+            RUN_MODE="$2"
+            shift 2;;
+        (-v | --soc-version )
+            SOC_VERSION="$2"
+            shift 2;;
+        (--)
+            shift;
+            break;;
+        (*)
+            echo "[ERROR] Unexpected option: $1";
+            exit 1;;
+    esac
+done
+
+case "${RUN_MODE}" in
+    (cpu | sim | npu) ;;
+    (*)
+        echo "[ERROR] RUN_MODE must be one of cpu / sim / npu, got: '${RUN_MODE}'"
+        exit 1;;
+esac
+if [ -z "${SOC_VERSION}" ]; then
+    echo "[ERROR] SOC_VERSION is not set, use -v [SOC_VERSION]"
+    exit 1
+fi
+
+# The README asks users to export ASCEND_INSTALL_PATH; fall back to it (and then
+# to ASCEND_HOME_PATH) when ASCEND_HOME_DIR itself is not set.
+ASCEND_HOME_DIR=${ASCEND_HOME_DIR:-${ASCEND_INSTALL_PATH:-${ASCEND_HOME_PATH}}}
+if [ -z "${ASCEND_HOME_DIR}" ]; then
+    echo "[ERROR] CANN path is unknown, please export ASCEND_INSTALL_PATH"
+    exit 1
+fi
+
+# main.cpp reads ../input/*.bin and ../output/golden.bin relative to ./build,
+# so generate them here before building.
+python3 scripts/gen_data.py || { echo "[ERROR] Generate input and golden data failed"; exit 1; }
+
+rm -rf build
+mkdir build
+cd build || exit 1
+
+# in case of running op in simulator, use stub so instead
+if [ "${RUN_MODE}" = "sim" ]; then
+    export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g')
+    export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH
+fi
+
+source $ASCEND_HOME_DIR/bin/setenv.bash
+export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+
+cmake  -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION}  -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. || exit 1
+make -j16 || exit 1
+
+if [ "${RUN_MODE}" = "npu" ]; then
+    ./tbufpool_direct_kernel_op
+elif [ "${RUN_MODE}" = "sim" ]; then
+    export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR}
+    export ASCEND_HOME_PATH=${ASCEND_HOME_DIR}
+    msprof op simulator --application=./tbufpool_direct_kernel_op
+elif [ "${RUN_MODE}" = "cpu" ]; then
+    ./tbufpool_direct_kernel_op
+fi
\ No newline at end of file
diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py
new file mode 100644
index 000000000..fb3dc7143
--- /dev/null
+++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+import os
+import numpy as np
+
+def gen_golden_data_simple():
+    dtype = np.float32
+
+    input_shape = [8, 256]
+    input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype)
+    input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype)
+    rows = input_shape[0]
+    mid = rows // 2
+    top_half = input_x[:mid] + input_y[:mid]
+    bottom_half = input_x[mid:] - input_y[mid:]
+    golden = np.vstack((top_half, bottom_half))
+
+    os.system("mkdir -p ./input")
+    input_x.tofile("./input/input_x.bin")
+    input_y.tofile("./input/input_y.bin")
+    os.system("mkdir -p ./output")
+    golden.tofile("./output/golden.bin")
+
+if __name__ == "__main__":
+    gen_golden_data_simple()
\ No newline at end of file
diff --git a/operator/ascendc/2_features/README.md b/operator/ascendc/2_features/README.md
index 8c843758b..b5ccf828a 100644
--- a/operator/ascendc/2_features/README.md
+++ b/operator/ascendc/2_features/README.md
@@ -15,6 +15,7 @@ Ascend C相关特性的样例。特性样例逐步补充中。
 当前本目录包含的所有样例如下。
 |  目录名称                                                   |  功能描述                                              |  运行环境 |
 | ------------------------------------------------------------ | ---------------------------------------------------- | -- |
+| [2_tbufpool](./2_tbufpool) | 基于Ascend C的自定义Vector算子及kernel直调样例，通过TBufPool实现Add算子和Sub算子计算过程中的内存复用，提高计算效率。|Atlas A2训练系列产品/Atlas 800I A2推理产品|
 | [12_cube_group](./12_cube_group) | 基于Ascend C的自定义算子及FrameworkLaunch调用样例，通过软同步控制AIC和AIV之间进行通讯，实现AI Core计算资源分组。|Atlas A2训练系列产品/Atlas 800I A2推理产品|
 | [13_matmul_api_ibshare](./13_matmul_api_ibshare) | 基于Ascend C的自定义Cube算子及Kernellaunch调用样例，通过A矩阵与B矩阵使能IBSHARE，实现算子性能提升|Atlas A2训练系列产品/Atlas 800I A2推理产品|
 | [14_matmul_api_constant](./14_matmul_api_constant) | 基于Ascend C的自定义Cube算子及FrameworkLaunch调用样例，通过使用全量常量化的MatmulApiStaticTiling模板参数，替代非常量的TCubeTiling参数，以减少Scalar计算开销，实现算子性能提升|Atlas A2训练系列产品/Atlas 800I A2推理产品|
-- 
Gitee


From 4675ce1f0dd15ad85c003bb35c4785b9087bea41 Mon Sep 17 00:00:00 2001
From: PengC <chupeng5@huawei.com>
Date: Wed, 18 Jun 2025 06:14:45 +0000
Subject: [PATCH 18/46] !2678 fix tolerance Merge pull request !2678 from
 PengC/master

---
 .../AclNNInvocation/scripts/verify_result.py                    | 2 +-
 .../MatmulInvocationNeo/scripts/verify_result.py                | 2 +-
 .../AclNNInvocation/scripts/verify_result.py                    | 2 +-
 .../MatmulLeakyReluInvocation/scripts/verify_result.py          | 2 +-
 .../MatmulLeakyReluInvocationAsync/scripts/verify_result.py     | 2 +-
 .../AclOfflineModel/scripts/verify_result.py                    | 2 +-
 .../AbsDuplicateKernelInvocation/scripts/verify_result.py       | 2 +-
 .../AbsGatherMaskKernelInvocation/scripts/verify_result.py      | 2 +-
 .../AbsPadKernelInvocation/scripts/verify_result.py             | 2 +-
 .../AbsUnPadKernelInvocation/scripts/verify_result.py           | 2 +-
 .../ReduceMinKernelInvocation/scripts/verify_result.py          | 2 +-
 .../WholeReduceSumKernelInvocation/scripts/verify_result.py     | 2 +-
 .../AclNNInvocation/scripts/verify_result.py                    | 2 +-
 .../AclOfflineModel/scripts/verify_result.py                    | 2 +-
 .../AclOnlineModel/scripts/verify_result.py                     | 2 +-
 .../MmadBiasInvocation/scripts/verify_result.py                 | 2 +-
 .../MmadInvocation/scripts/verify_result.py                     | 2 +-
 .../VectorAddMultiCoreWithTiling/scripts/verify_result.py       | 2 +-
 .../scripts/verify_result.py                                    | 2 +-
 .../VectorAddSingleCore/scripts/verify_result.py                | 2 +-
 .../VectorAddSingleCoreWithTmpbuf/scripts/verify_result.py      | 2 +-
 .../AddKernelInvocationNeo/scripts/verify_result.py             | 2 +-
 .../AddKernelInvocationTilingNeo/scripts/verify_result.py       | 2 +-
 .../AclNNInvocation/scripts/verify_result.py                    | 2 +-
 .../0_introduction/5_addn_kernellaunch/scripts/verify_result.py | 2 +-
 .../AclNNInvocation/scripts/verify_result.py                    | 2 +-
 .../AclNNInvocation/scripts/verify_result.py                    | 2 +-
 .../FrameworkLaunch/AclNNInvocation/scripts/verify_result.py    | 2 +-
 .../KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py   | 2 +-
 .../FrameworkLaunch/AclNNInvocation/scripts/verify_result.py    | 2 +-
 .../KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py   | 2 +-
 .../DumpTensorCube/AclNNInvocation/scripts/verify_result.py     | 2 +-
 .../DumpTensorVector/AclNNInvocation/scripts/verify_result.py   | 2 +-
 .../DumpTensorKernelInvocationCube/scripts/verify_result.py     | 2 +-
 .../DumpTensorKernelInvocationVector/scripts/verify_result.py   | 2 +-
 .../12_cube_group/AclNNInvocation/scripts/verify_result.py      | 2 +-
 .../MatmulABshareInvocation/scripts/verify_result.py            | 2 +-
 .../AclNNInvocation/scripts/verify_result.py                    | 2 +-
 .../16_group_barrier/AclNNInvocation/scripts/verify_result.py   | 2 +-
 .../6_group_matmul/KernelLaunch/scripts/verify_result.py        | 2 +-
 .../FrameworkLaunch/AclNNInvocation/scripts/verify_result.py    | 2 +-
 .../FrameworkLaunch/AclOfflineModel/scripts/verify_result.py    | 2 +-
 .../FrameworkLaunch/AclOnlineModel/scripts/verify_result.py     | 2 +-
 .../AddKernelInvocationNeo/scripts/verify_result.py             | 2 +-
 .../AddKernelInvocationTilingNeo/scripts/verify_result.py       | 2 +-
 .../FrameworkLaunch/AclNNInvocation/scripts/verify_result.py    | 2 +-
 .../KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py   | 2 +-
 .../FrameworkLaunch/AclNNInvocation/scripts/verify_result.py    | 2 +-
 .../MatmulLeakyReluInvocation/scripts/verify_result.py          | 2 +-
 .../MatmulLeakyReluInvocationAsync/scripts/verify_result.py     | 2 +-
 .../examples/CPPInvocation/scripts/verify_result.py             | 2 +-
 .../examples/CPPInvocation/scripts/verify_result.py             | 2 +-
 .../AxpySample/AclNNInvocation/scripts/verify_result.py         | 2 +-
 .../FrameworkLaunch/AclNNInvocation/scripts/verify_result.py    | 2 +-
 .../AclNNInvocation/scripts/verify_result.py                    | 2 +-
 55 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/AclNNInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/11_matmul_kernellaunch/MatmulInvocationNeo/scripts/verify_result.py b/operator/ascendc/0_introduction/11_matmul_kernellaunch/MatmulInvocationNeo/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/0_introduction/11_matmul_kernellaunch/MatmulInvocationNeo/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/11_matmul_kernellaunch/MatmulInvocationNeo/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/12_matmulleakyrelu_frameworklaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/12_matmulleakyrelu_frameworklaunch/AclNNInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/0_introduction/12_matmulleakyrelu_frameworklaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/12_matmulleakyrelu_frameworklaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocationAsync/scripts/verify_result.py b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocationAsync/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocationAsync/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocationAsync/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/15_sub_frameworklaunch/AclOfflineModel/scripts/verify_result.py b/operator/ascendc/0_introduction/15_sub_frameworklaunch/AclOfflineModel/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/15_sub_frameworklaunch/AclOfflineModel/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/15_sub_frameworklaunch/AclOfflineModel/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsDuplicateKernelInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsDuplicateKernelInvocation/scripts/verify_result.py
index e3ecffb22..6a700ca94 100644
--- a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsDuplicateKernelInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsDuplicateKernelInvocation/scripts/verify_result.py
@@ -38,7 +38,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsGatherMaskKernelInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsGatherMaskKernelInvocation/scripts/verify_result.py
index e3ecffb22..6a700ca94 100644
--- a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsGatherMaskKernelInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsGatherMaskKernelInvocation/scripts/verify_result.py
@@ -38,7 +38,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsPadKernelInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsPadKernelInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsPadKernelInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsPadKernelInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsUnPadKernelInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsUnPadKernelInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsUnPadKernelInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsUnPadKernelInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/17_unaligned_reducemin_kernellaunch/ReduceMinKernelInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/17_unaligned_reducemin_kernellaunch/ReduceMinKernelInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/17_unaligned_reducemin_kernellaunch/ReduceMinKernelInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/17_unaligned_reducemin_kernellaunch/ReduceMinKernelInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/scripts/verify_result.py
index e3ecffb22..6a700ca94 100644
--- a/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/scripts/verify_result.py
@@ -38,7 +38,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AclOfflineModel/scripts/verify_result.py b/operator/ascendc/0_introduction/1_add_frameworklaunch/AclOfflineModel/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/1_add_frameworklaunch/AclOfflineModel/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AclOfflineModel/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AclOnlineModel/scripts/verify_result.py b/operator/ascendc/0_introduction/1_add_frameworklaunch/AclOnlineModel/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/1_add_frameworklaunch/AclOnlineModel/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AclOnlineModel/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/verify_result.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/verify_result.py
index 277d94780..0c51a2cc3 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/verify_result.py
@@ -42,7 +42,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/verify_result.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/verify_result.py
index 277d94780..0c51a2cc3 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/verify_result.py
@@ -42,7 +42,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/verify_result.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/verify_result.py
index 4e1c4ad45..7cf2a635e 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/verify_result.py
@@ -41,7 +41,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/verify_result.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/verify_result.py
index 4e1c4ad45..7cf2a635e 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/verify_result.py
@@ -41,7 +41,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/scripts/verify_result.py b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/4_addn_frameworklaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/4_addn_frameworklaunch/AclNNInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/4_addn_frameworklaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/4_addn_frameworklaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/5_addn_kernellaunch/scripts/verify_result.py b/operator/ascendc/0_introduction/5_addn_kernellaunch/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/5_addn_kernellaunch/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/5_addn_kernellaunch/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/6_addtemplate_frameworklaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/6_addtemplate_frameworklaunch/AclNNInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/0_introduction/6_addtemplate_frameworklaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/6_addtemplate_frameworklaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/0_introduction/9_leakyrelu_frameworklaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/9_leakyrelu_frameworklaunch/AclNNInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/0_introduction/9_leakyrelu_frameworklaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/0_introduction/9_leakyrelu_frameworklaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/1_utilities/0_printf/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/1_utilities/0_printf/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
index b63a4a5e1..455426365 100644
--- a/operator/ascendc/1_utilities/0_printf/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/1_utilities/0_printf/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/1_utilities/0_printf/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py b/operator/ascendc/1_utilities/0_printf/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py
index b63a4a5e1..455426365 100644
--- a/operator/ascendc/1_utilities/0_printf/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py
+++ b/operator/ascendc/1_utilities/0_printf/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/1_utilities/3_assert/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/1_utilities/3_assert/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
index b63a4a5e1..455426365 100644
--- a/operator/ascendc/1_utilities/3_assert/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/1_utilities/3_assert/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/1_utilities/3_assert/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py b/operator/ascendc/1_utilities/3_assert/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py
index b63a4a5e1..455426365 100644
--- a/operator/ascendc/1_utilities/3_assert/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py
+++ b/operator/ascendc/1_utilities/3_assert/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorCube/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorCube/AclNNInvocation/scripts/verify_result.py
index b63a4a5e1..455426365 100644
--- a/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorCube/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorCube/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorVector/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorVector/AclNNInvocation/scripts/verify_result.py
index 604d92996..2caf6cdd4 100644
--- a/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorVector/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/1_utilities/7_dumptensor/FrameworkLaunch/DumpTensorVector/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/scripts/verify_result.py b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/scripts/verify_result.py
index b63a4a5e1..455426365 100644
--- a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/scripts/verify_result.py
+++ b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationVector/scripts/verify_result.py b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationVector/scripts/verify_result.py
index 604d92996..2caf6cdd4 100644
--- a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationVector/scripts/verify_result.py
+++ b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationVector/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/2_features/12_cube_group/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/2_features/12_cube_group/AclNNInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/2_features/12_cube_group/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/2_features/12_cube_group/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/2_features/13_matmul_api_ibshare/MatmulABshareInvocation/scripts/verify_result.py b/operator/ascendc/2_features/13_matmul_api_ibshare/MatmulABshareInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/2_features/13_matmul_api_ibshare/MatmulABshareInvocation/scripts/verify_result.py
+++ b/operator/ascendc/2_features/13_matmul_api_ibshare/MatmulABshareInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/2_features/14_matmul_api_constant/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/2_features/14_matmul_api_constant/AclNNInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/2_features/14_matmul_api_constant/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/2_features/14_matmul_api_constant/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/2_features/16_group_barrier/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/2_features/16_group_barrier/AclNNInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/2_features/16_group_barrier/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/2_features/16_group_barrier/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/4_best_practices/6_group_matmul/KernelLaunch/scripts/verify_result.py b/operator/ascendc/4_best_practices/6_group_matmul/KernelLaunch/scripts/verify_result.py
index ab58c2333..1cbe396b5 100644
--- a/operator/ascendc/4_best_practices/6_group_matmul/KernelLaunch/scripts/verify_result.py
+++ b/operator/ascendc/4_best_practices/6_group_matmul/KernelLaunch/scripts/verify_result.py
@@ -36,7 +36,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclOfflineModel/scripts/verify_result.py b/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclOfflineModel/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclOfflineModel/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclOfflineModel/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclOnlineModel/scripts/verify_result.py b/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclOnlineModel/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclOnlineModel/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/AddCustomSample/FrameworkLaunch/AclOnlineModel/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/scripts/verify_result.py b/operator/ascendc/tutorials/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/tutorials/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/AddCustomSample/KernelLaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py b/operator/ascendc/tutorials/AddCustomSample/KernelLaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator/ascendc/tutorials/AddCustomSample/KernelLaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/AddCustomSample/KernelLaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/MatmulCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/tutorials/MatmulCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/tutorials/MatmulCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/MatmulCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/MatmulCustomSample/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py b/operator/ascendc/tutorials/MatmulCustomSample/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/tutorials/MatmulCustomSample/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/MatmulCustomSample/KernelLaunch/MatmulInvocationNeo/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/scripts/verify_result.py b/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAsync/scripts/verify_result.py b/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAsync/scripts/verify_result.py
index a325cfcc6..24b30f8d4 100644
--- a/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAsync/scripts/verify_result.py
+++ b/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAsync/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/verify_result.py b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/verify_result.py
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomKernel/examples/CPPInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/verify_result.py b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/verify_result.py
index 1a21d809a..2dd46f803 100644
--- a/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/verify_result.py
+++ b/operator_contrib/AddCustomSample/KernelLaunch/AddCustomTilingKernel/examples/CPPInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator_contrib/AxpySample/AclNNInvocation/scripts/verify_result.py b/operator_contrib/AxpySample/AclNNInvocation/scripts/verify_result.py
index 2c7ab7c6d..3349011da 100644
--- a/operator_contrib/AxpySample/AclNNInvocation/scripts/verify_result.py
+++ b/operator_contrib/AxpySample/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator_contrib/HighPerfMatMul/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py b/operator_contrib/HighPerfMatMul/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
index 6770149f3..4f57f01b9 100644
--- a/operator_contrib/HighPerfMatMul/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
+++ b/operator_contrib/HighPerfMatMul/FrameworkLaunch/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 100:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
diff --git a/operator_contrib/MatmulLeakyReluCustom/AclNNInvocation/scripts/verify_result.py b/operator_contrib/MatmulLeakyReluCustom/AclNNInvocation/scripts/verify_result.py
index 0e65d9813..74d469705 100644
--- a/operator_contrib/MatmulLeakyReluCustom/AclNNInvocation/scripts/verify_result.py
+++ b/operator_contrib/MatmulLeakyReluCustom/AclNNInvocation/scripts/verify_result.py
@@ -37,7 +37,7 @@ def verify_result(output, golden):
         if index == 10:
             break
     error_ratio = float(different_element_indexes.size) / golden.size
-    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
     return error_ratio <= error_tol
 
 
-- 
Gitee


From a12d29bcb79d751d4dc765bff3aa5f671e22d9e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B1=9F=E6=B3=A2?= <jiangbo74@huawei.com>
Date: Thu, 19 Jun 2025 08:14:19 +0000
Subject: [PATCH 19/46] =?UTF-8?q?!2680=20=E6=9B=BF=E6=8D=A2<strong>CCE=5FK?=
 =?UTF-8?q?T=5FTEST</strong>=20Merge=20pull=20request=20!2680=20from=20?=
 =?UTF-8?q?=E6=B1=9F=E6=B3=A2/br=5Fj00600688=5FfixDefinedWord?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../AllGatherMatmulCustom/op_kernel/gather_mm.h                 | 2 +-
 .../op_kernel/matmul_reduce_scatter_custom_common.h             | 2 +-
 .../op_kernel/matmul_all_reduce_custom_common.h                 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/operator/ascendc/4_best_practices/21_all_gather_matmul_custom/AllGatherMatmulCustom/op_kernel/gather_mm.h b/operator/ascendc/4_best_practices/21_all_gather_matmul_custom/AllGatherMatmulCustom/op_kernel/gather_mm.h
index 9b662b32b..891f1082e 100644
--- a/operator/ascendc/4_best_practices/21_all_gather_matmul_custom/AllGatherMatmulCustom/op_kernel/gather_mm.h
+++ b/operator/ascendc/4_best_practices/21_all_gather_matmul_custom/AllGatherMatmulCustom/op_kernel/gather_mm.h
@@ -11,7 +11,7 @@
 #ifndef MC2_GATHER_MM_H
 #define MC2_GATHER_MM_H
 
-#if defined(__CCE_KT_TEST__)
+#if defined ASCENDC_CPU_DEBUG
 #define SET_G_CORE_TYPE_IS_AIV thread_local int g_coreType = 2
 #define SET_G_CORE_TYPE_IS_AIC thread_local int g_coreType = 1
 #define DTYPE_X1 half
diff --git a/operator/ascendc/4_best_practices/22_matmul_reduce_scatter_custom/MatmulReduceScatterCustom/op_kernel/matmul_reduce_scatter_custom_common.h b/operator/ascendc/4_best_practices/22_matmul_reduce_scatter_custom/MatmulReduceScatterCustom/op_kernel/matmul_reduce_scatter_custom_common.h
index 3d323216d..bb561cf03 100644
--- a/operator/ascendc/4_best_practices/22_matmul_reduce_scatter_custom/MatmulReduceScatterCustom/op_kernel/matmul_reduce_scatter_custom_common.h
+++ b/operator/ascendc/4_best_practices/22_matmul_reduce_scatter_custom/MatmulReduceScatterCustom/op_kernel/matmul_reduce_scatter_custom_common.h
@@ -11,7 +11,7 @@
 #ifndef MC2_ALLREDUCE_COMM_H
 #define MC2_ALLREDUCE_COMM_H
 
-#if defined(__CCE_KT_TEST__)
+#if defined ASCENDC_CPU_DEBUG
 #define SET_G_CORE_TYPE_IS_AIV thread_local int g_coreType = 2
 #define SET_G_CORE_TYPE_IS_AIC thread_local int g_coreType = 1
 #define DTYPE_X1 half
diff --git a/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/MatmulAllReduceCustom/op_kernel/matmul_all_reduce_custom_common.h b/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/MatmulAllReduceCustom/op_kernel/matmul_all_reduce_custom_common.h
index 95605f718..4dbf9e704 100644
--- a/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/MatmulAllReduceCustom/op_kernel/matmul_all_reduce_custom_common.h
+++ b/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/MatmulAllReduceCustom/op_kernel/matmul_all_reduce_custom_common.h
@@ -11,7 +11,7 @@
 #ifndef MC2_ALLREDUCE_COMM_H
 #define MC2_ALLREDUCE_COMM_H
 
-#if defined(__CCE_KT_TEST__)
+#if defined ASCENDC_CPU_DEBUG
 #define SET_G_CORE_TYPE_IS_AIV thread_local int g_coreType = 2
 #define SET_G_CORE_TYPE_IS_AIC thread_local int g_coreType = 1
 #define DTYPE_X1 half
-- 
Gitee


From 7708812f854e743429e9126554903e83c63e9f18 Mon Sep 17 00:00:00 2001
From: alpaca12345UUU <zhanglong170@huawei.com>
Date: Fri, 20 Jun 2025 07:34:30 +0000
Subject: [PATCH 20/46] =?UTF-8?q?!2683=20=E4=BF=AE=E6=94=B9tbufpool=20READ?=
 =?UTF-8?q?ME=20Merge=20pull=20request=20!2683=20from=20alpaca12345UUU/mas?=
 =?UTF-8?q?ter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 operator/ascendc/2_features/2_tbufpool/README.md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/operator/ascendc/2_features/2_tbufpool/README.md b/operator/ascendc/2_features/2_tbufpool/README.md
index b87611f13..964f96712 100644
--- a/operator/ascendc/2_features/2_tbufpool/README.md
+++ b/operator/ascendc/2_features/2_tbufpool/README.md
@@ -28,15 +28,15 @@
   z = x - y
   ```
 
-  计算逻辑是：Ascend C提供的矢量计算接口的操作元素都为LocalTensor，首先启用tbufool1，将部分输入数据src0Gm，部分输入数据src1Gm搬运进片上储存，调用计算接口完成相加计算，搬出到外部存储上。之后切换到tbufpool2进行剩余数据相减计算，得到最终结果，再搬出到外部存储上。  
+  计算逻辑是：Ascend C提供的矢量计算接口的操作元素都为LocalTensor，首先启用tbufpool1，将部分输入数据src0Gm和src1Gm搬运进片上存储，调用计算接口完成相加计算，并将结果搬出到外部存储上。之后切换到tbufpool2对剩余数据进行相减计算，得到最终结果，再搬出到外部存储上。  
 
   本样例算子的实现流程分为6个基本任务：CopyIn，Compute，CopyOut，CopyIn1，Compute1，CopyOut1。
   - CopyIn任务负责将Global Memory上的部分输入Tensor src0Gm和src1Gm搬运到Local Memory，分别存储在src0Local、src1Local；
   - Compute任务负责对src0Local、src1Local执行加法操作，计算结果存储在dstLocal中；
-  - CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm0中。
+  - CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGlobal中。
   - CopyIn1任务负责将Global Memory上的剩余输入Tensor src0Gm和src1Gm搬运到Local Memory，分别存储在src0Local、src1Local；
   - Compute1任务负责对src0Local、src1Local执行剩余数据减法操作，计算结果存储在dstLocal中；
-  - CopyOut1任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm1中。
+  - CopyOut1任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGlobal中。
 
 - 调用实现
   1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成；
@@ -71,6 +71,13 @@
     export CAMODEL_LOG_PATH=./sim_log
     ```
 
+  - 生成输入和真值
+
+    执行如下命令后，将在当前目录下生成input和output目录，分别用于存放输入数据和真值数据。
+    ```
+    python3 scripts/gen_data.py
+    ```
+    
   - 样例执行
 
     ```bash
-- 
Gitee


From 68e759710909bdd5afbac6d573be345cdbcfc19a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=94=90=E7=92=9E?= <tangpu2@h-partners.com>
Date: Fri, 20 Jun 2025 09:26:11 +0000
Subject: [PATCH 21/46] =?UTF-8?q?!2685=20readme=E4=BF=AE=E6=94=B9=20Merge?=
 =?UTF-8?q?=20pull=20request=20!2685=20from=20=E5=94=90=E7=92=9E/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 74 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index cb6675ca6..f4843ebb8 100644
--- a/README.md
+++ b/README.md
@@ -23,32 +23,18 @@
 - 请参考[CANN社区版文档](https://hiascend.com/document/redirect/CannCommunityInstWizard?utm_source=gitee&utm_medium=sample&utm_campaign=samples)相关章节，对昇腾硬件、CANN软件及相应深度学习框架进行安装准备。
 - 本源码仓会适配CANN软件版本创建相应的标签并发行，关于CANN软件版本与本源码仓中标签的配套关系可参见["本源码仓标签与CANN版本配套表"](docs/MATCH.md#cannversionmap)。**需要注意，为确保您的源码定制开发顺利进行，请选择配套的CANN版本与Gitee标签源码，使用master分支可能存在版本不匹配的风险。**
 
-## 推荐样例
-
-|  **样例名称**  |  **样例介绍**  |  **开发语言**  |
-|---|---|---|
-| [DVPP接口样例](https://gitee.com/ascend/samples/tree/master/cplusplus/level2_simple_inference/0_data_process) | 图像视频处理（DVPP）单接口样例，包含图片视频解码（vdec/jpegd）、缩放（resize）、抠图（crop）、转换（vpc）等功能 |  C++ |
-| [单算子样例](https://gitee.com/ascend/samples/tree/master/cplusplus/level1_single_api/4_op_dev/2_verify_op) | 自定义算子开发介绍，单算子调用样例，包含Add/batchnorm/conv2d/lstm/matmul/reshape等算子 |  C++ |
-| [Ascend C单算子样例](https://gitee.com/ascend/samples/tree/master/operator) | 自定义Ascend C算子开发介绍，单算子调用样例，包含Add/LayerNorm/MatMul/MatMulLeakyRelu/MoeSoftMaxTopK等算子 |  Ascend C |
-| [sampleResnetQuickStart](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleResnetQuickStart) | :+1:推理应用入门样例，基于Resnet50模型实现的图像分类应用 | C++/Python |
-| [sampleResnetAIPP](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleResnetAIPP) | AIPP特性使用，基于Resnet50模型实现的图像分类应用 | C++/Python |
-| [sampleResnetDVPP](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleResnetDVPP)  | DVPP特性使用，基于Resnet50模型实现的图像分类应用 | C++/Python |
-| [sampleYOLOV7](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleYOLOV7) | 使能DVPP+AIPP特性，基于YoloV7模型实现的物体检测应用 |  C++ |
-| [sampleResnetRtsp](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleResnetRtsp)  | RTSP视频流输入，基于Resnet50模型实现的图像分类应用 |  C++ |
-| [sampleCarColor](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleCarColor) | 多模型串接，基于YoloV7模型和颜色分类模型实现的检测分类应用 |  C++ |
-| [sampleYOLOV7MultiInput](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleYOLOV7MultiInput)  | :+1:多路输入综合样例，基于YoloV7模型实现的物体检测应用，支持多路RTSP流/视频输入、支持多卡并行 |  C++ |
-| [sampleCrowdCounting](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleCrowdCounting) | 统计图片人头数量，基于CrowdCounting模型实现的推理应用 |  Python |
-| [sampleYOLOV7NMSONNX](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleYOLOV7NMSONNX) | 后处理使用CANN算子进行加速，基于YoloV7模型实现的物体检测应用 |  Python |
-
-
 ## 仓库结构
 
 ```
+- /best_practices：CANN最佳实践样例
+- /common：samples仓公共文件目录
 - /cplusplus：C++样例归档路径（待迁移至/inference）
     |--/contrib：外部贡献推理样例
     |--/level1_single_api：CANN AscendCL接口、DVPP接口、Graph接口等单接口样例
     |--/level2_simple_inference：AscendCL推理应用样例
+    |--/...
 - /docs：CANN Samples样例使用相关说明、FAQ文档
+- /growthpath：CANN开发者学习路径，帮助开发者快速掌握每一阶段的知识点及开发技巧
 - /inference：推理应用相关样例
     |--/ACLHelloWorld：AscendCL推理应用入门“Hello World”
     |--/acllite：AscendCL高阶封装接口，包含C++及Python版本
@@ -56,29 +42,53 @@
     |--/mediaProcess：媒体（音视频）接口相关样例
     |--/memoryManagement：AscendCL内存管理样例
     |--/modelInference：推理应用样例目录，包含C++及Python版本
-- /operator：算子开发与使用相关样例
-    |--/AddCustomSample：Ascend C算子开发Add样例
-       |--FrameworkLaunch: 单算子工程及调用样例
-          |--AclNNInvocation: 单算子API执行样例
-          |--AclOfflineModel: 单算子模型执行样例
-          |--AclOnlineModel: 单算子模型执行样例
-          |--AddCustom: 单算子工程
-          |--CppExtensions: pybind调用样例
-          |--PytorchInvocation: pytorch调用样例
-          |--TensorFlowInvocation: tensorflow调用样例
-       |--KernelLaunch: 内核调试调用样例
-          |--AddKernelInvocation: 内核调试调用样例
-          |--AddKernelInvocationNeo: Kernel Launch调试样例
-          |--AddKernelInvocationTilingNeo: 带Tiling的Kernel Launch调试样例
     |--/...
+- /operator：Ascend C算子开发与使用相关样例
+    |--/ascendc
+        |--/0_introduction：简单的示例，适合初学者
+        |--/1_utilities：编译工程和自定义工程、assert及debug功能、硬件平台信息的查询能力等
+        |--/2_features：Ascend C的特性
+        |--/3_libraries：类库的使用示例，包括数学库、激活函数等
+        |--/4_best_practices：最佳实践示例
+        |--/tutorials：生态教学的示例
+            |--/AddCustomSample：Ascend C算子开发Add样例
+- /operator_contrib：Ascend C算子开发者贡献样例
+    |--/UnalignAddCustomSample：Ascend C算子开发Add算子（非对齐）样例
     |--/...
 - /python：Python样例归档路径（待迁移至/inference）
     |--/contrib：外部贡献推理样例
     |--/level1_single_api：CANN AscendCL接口、DVPP接口、Graph接口等单接口样例
     |--/level2_simple_inference：AscendCL推理应用样例
     |--/level3_multi_model：多模型串接综合样例
+- /robot：昇腾开发板智能车实际应用样例
+- /st：样例测试用例，主要用于样例的功能性验证
+- /training：训练应用样例
+
 ```
 
+## 算子开发样例
+|  **样例名称**  |  **样例介绍**  |  **开发语言**  |
+|---|---|---|
+| [AddCustomSample](https://gitee.com/ascend/samples/tree/master/operator/ascendc/tutorials/AddCustomSample) | 基于Ascend C的Add自定义Vector算子及调用样例 | C++ |
+| [HelloWorldSample](https://gitee.com/ascend/samples/tree/master/operator/ascendc/tutorials/HelloWorldSample) | 基于Ascend C的自定义算子调用结构演示样例 | C++ |
+| [MatmulCustomSample](https://gitee.com/ascend/samples/tree/master/operator/ascendc/tutorials/MatmulCustomSample) | 基于Ascend C的Matmul自定义Cube算子及调用样例 | C++ |
+| [MatmulLeakyReluCustomSample](https://gitee.com/ascend/samples/tree/master/operator/ascendc/tutorials/MatmulLeakyReluCustomSample) | 基于Ascend C的MatmulLeakyRelu自定义Cube+Vector算子及调用样例 | C++ |
+| [UnalignAddCustomSample](https://gitee.com/ascend/samples/tree/master/operator_contrib/UnalignAddCustomSample) | 基于Ascend C的Add（非对齐）自定义算子及调用样例 | C++ |
+
+## 推理开发样例
+|  **样例名称**  |  **样例介绍**  |  **开发语言**  |
+|---|---|---|
+| [DVPP接口样例](https://gitee.com/ascend/samples/tree/master/cplusplus/level2_simple_inference/0_data_process) | 图像视频处理（DVPP）单接口样例，包含图片视频解码（vdec/jpegd）、缩放（resize）、抠图（crop）、转换（vpc）等功能 |  C++ |
+| [sampleResnetQuickStart](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleResnetQuickStart) | 推理应用入门样例，基于Resnet50模型实现的图像分类应用 | C++/Python |
+| [sampleResnetAIPP](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleResnetAIPP) | AIPP特性使用，基于Resnet50模型实现的图像分类应用 | C++/Python |
+| [sampleResnetDVPP](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleResnetDVPP)  | DVPP特性使用，基于Resnet50模型实现的图像分类应用 | C++/Python |
+| [sampleYOLOV7](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleYOLOV7) | 使能DVPP+AIPP特性，基于YoloV7模型实现的物体检测应用 |  C++ |
+| [sampleResnetRtsp](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleResnetRtsp)  | RTSP视频流输入，基于Resnet50模型实现的图像分类应用 |  C++ |
+| [sampleCarColor](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleCarColor) | 多模型串接，基于YoloV7模型和颜色分类模型实现的检测分类应用 |  C++ |
+| [sampleYOLOV7MultiInput](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleYOLOV7MultiInput)  | :+1:多路输入综合样例，基于YoloV7模型实现的物体检测应用，支持多路RTSP流/视频输入、支持多卡并行 |  C++ |
+| [sampleCrowdCounting](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleCrowdCounting) | 统计图片人头数量，基于CrowdCounting模型实现的推理应用 |  Python |
+| [sampleYOLOV7NMSONNX](https://gitee.com/ascend/samples/tree/master/inference/modelInference/sampleYOLOV7NMSONNX) | 后处理使用CANN算子进行加速，基于YoloV7模型实现的物体检测应用 |  Python |
+
 
 ## 变更日志
   
-- 
Gitee


From 6bfdb584d600369ee0cd0ea1ea088b106faac4d5 Mon Sep 17 00:00:00 2001
From: shinoda <zhuyuchen7@huawei.com>
Date: Sat, 21 Jun 2025 07:08:05 +0000
Subject: [PATCH 22/46] !2686 fix README. * fix README.

---
 .../0_introduction/20_mmad_kernellaunch/MmadInvocation/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/README.md b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/README.md
index 84477c6ef..7bb83671c 100644
--- a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/README.md
+++ b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/README.md
@@ -22,6 +22,7 @@
 <tr><td align="center">b</td><td align="center">K * N</td><td align="center">float16</td><td align="center">ND</td></tr>
 </tr>
 </tr>
+<tr></tr>
 <tr><td rowspan="1" align="center">算子输出</td><td align="center">c</td><td align="center">M * N</td><td align="center">float</td><td align="center">ND</td></tr>
 </tr>
 <tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">mmad_custom</td></tr>
-- 
Gitee


From 72cc3b1a7b6497e1c3754601ac45684edda26277 Mon Sep 17 00:00:00 2001
From: youxiao <youxiao@huawei.com>
Date: Mon, 23 Jun 2025 11:47:37 +0000
Subject: [PATCH 23/46] !2687 change llm datadist sample Merge pull request
 !2687 from youxiao/master

---
 cplusplus/level1_single_api/11_llm_data_dist/readme.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cplusplus/level1_single_api/11_llm_data_dist/readme.md b/cplusplus/level1_single_api/11_llm_data_dist/readme.md
index 10914d52e..f02114570 100644
--- a/cplusplus/level1_single_api/11_llm_data_dist/readme.md
+++ b/cplusplus/level1_single_api/11_llm_data_dist/readme.md
@@ -15,8 +15,8 @@
 ## 目录结构
 
 ```
-├── prompt_sampe.cpp                // prompt样例main函数
-├── decoder_sampe.cpp               // decoder样例main函数
+├── prompt_sample.cpp                // prompt样例main函数
+├── decoder_sample.cpp               // decoder样例main函数
 ├── CMakeLists.txt                  // 编译脚本 
 ```
 
@@ -71,7 +71,7 @@
 
 3. 在运行环境执行可执行文件。
 
-    - 执行prompt_sample, 参数为device_id与local_ip其中device_id为prompt要使用的device_id, local_ip为prompt所在device的ip，如:
+    - 执行prompt_sample, 参数为device_id与local_ip, 其中device_id为prompt要使用的device_id, local_ip为prompt所在device的ip，如:
         ```
         ./prompt_sample 0 10.10.10.1
         ```
-- 
Gitee


From fb4ad4383846aa727da977973358f8d7ec3f6c09 Mon Sep 17 00:00:00 2001
From: xujiuxu <xujiuxu1@huawei.com>
Date: Tue, 24 Jun 2025 09:37:22 +0000
Subject: [PATCH 24/46] !2691 change some readme Merge pull request !2691 from
 xujiuxu/master

---
 inference/dataflow/cpluscplus/README.md    | 4 ++--
 inference/dataflow/py_dflow/README.md      | 3 ++-
 inference/dataflow/python/README.md        | 9 ++++++++-
 inference/dataflow/udf_workspace/README.md | 2 +-
 4 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/inference/dataflow/cpluscplus/README.md b/inference/dataflow/cpluscplus/README.md
index 9170a3ebf..d0f31887f 100644
--- a/inference/dataflow/cpluscplus/README.md
+++ b/inference/dataflow/cpluscplus/README.md
@@ -26,7 +26,7 @@ python 版本要求：python3.9
 
 ## 程序编译
 ```bash
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source {HOME}/Ascend/ascend-toolkit/set_env.sh # "{HOME}/Ascend"为CANN软件包安装目录，请根据实际安装路径进行替换。
 mkdir build
 cd build
 cmake ..
@@ -41,7 +41,7 @@ cd ..
 export ASCEND_GLOBAL_LOG_LEVEL=3       #0 debug 1 info 2 warn 3 error 不设置默认error级别
 export ASCEND_SLOG_PRINT_TO_STDOUT=1   # 日志打屏，不设置日志落盘默认路径
 # 必选
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source {HOME}/Ascend/ascend-toolkit/set_env.sh # "{HOME}/Ascend"为CANN软件包安装目录，请根据实际安装路径进行替换。
 export RESOURCE_CONFIG_PATH=xxx/xxx/xxx/numa_config.json
 
 cd output
diff --git a/inference/dataflow/py_dflow/README.md b/inference/dataflow/py_dflow/README.md
index ddc4c3490..59a708e71 100644
--- a/inference/dataflow/py_dflow/README.md
+++ b/inference/dataflow/py_dflow/README.md
@@ -33,9 +33,10 @@ py_dflow
 `PyDFlow`提供一键式编译能力，可通过如下命令进行编译：
 
 ```shell
-  source /usr/local/Ascend/ascend-toolkit/set_env.sh
+  source {HOME}/Ascend/ascend-toolkit/set_env.sh # "{HOME}/Ascend"为CANN软件包安装目录，请根据实际安装路径进行替换。
   bash build.sh --ascend_install_path=${ASCEND_HOME_PATH} --python_path=python3.9
 ```
+"{HOME}/Ascend"为CANN软件包安装目录，请根据实际安装路径进行替换。
 
 - `--ascend_install_path`选项的默认值为`/usr/local/Ascend/ascend-toolkit/latest`，可根据实际安装的路径指定。
 
diff --git a/inference/dataflow/python/README.md b/inference/dataflow/python/README.md
index 7b5ecc1f8..3a5290d85 100644
--- a/inference/dataflow/python/README.md
+++ b/inference/dataflow/python/README.md
@@ -20,6 +20,13 @@
 ├── udf_py   
 │   ├── udf_add.py 使用python实现udf多func功能  
 │   └── udf_control.py 使用python实现udf功能，用于控制udf_add中多func实际执行的func  
+└── udf_py_ws_sample 完整样例用于说明python udf实现     
+    ├── CMakeLists.txt udf python完整工程cmake文件样例   
+    ├── func_add.json  udf python完整工程配置文件样例   
+    ├── src_cpp   
+    │   └── func_add.cpp udf python完整工程C++源码文件样例    
+    └── src_python   
+        └── func_add.py  udf python完整工程python源码文件样例   
 
 
 ## 环境准备
@@ -35,7 +42,7 @@ sample_pytorch.py、sample_npu_model.py样例依赖pytorch和torchvision包,推
 export ASCEND_GLOBAL_LOG_LEVEL=3       #0 debug 1 info 2 warn 3 error 不设置默认error级别
 export ASCEND_SLOG_PRINT_TO_STDOUT=1   # 日志打屏，不设置日志落盘默认路径
 # 必选
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source {HOME}/Ascend/ascend-toolkit/set_env.sh # "{HOME}/Ascend"为CANN软件包安装目录，请根据实际安装路径进行替换。
 export RESOURCE_CONFIG_PATH=xxx/xxx/xxx/numa_config.json
 
 python3.9 sample1.py
diff --git a/inference/dataflow/udf_workspace/README.md b/inference/dataflow/udf_workspace/README.md
index 816635097..08b682493 100644
--- a/inference/dataflow/udf_workspace/README.md
+++ b/inference/dataflow/udf_workspace/README.md
@@ -35,7 +35,7 @@ FLOW_FUNC_REGISTRAR(AddFlowFunc)
 ## 编译指导
 UDF函数开发完成后，可以使用以下编译指令查看CMakeLists文件及cpp源码是否存在问题。
 ```bash
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source {HOME}/Ascend/ascend-toolkit/set_env.sh # "{HOME}/Ascend"为CANN软件包安装目录，请根据实际安装路径进行替换。
 # 以01_udf_add为例
 cd 01_udf_add
 mkdir build
-- 
Gitee


From 31d775a16e864e543e61c209662f3f6a8e0b25a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=94=90=E7=92=9E?= <tangpu2@h-partners.com>
Date: Tue, 24 Jun 2025 11:55:16 +0000
Subject: [PATCH 25/46] =?UTF-8?q?!2692=20=E2=80=9C=E5=A2=9E=E5=8A=A0return?=
 =?UTF-8?q?=E2=80=9D=20Merge=20pull=20request=20!2692=20from=20=E5=94=90?=
 =?UTF-8?q?=E7=92=9E/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../level2_simple_inference/0_data_process/venc/src/main.cpp     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cplusplus/level2_simple_inference/0_data_process/venc/src/main.cpp b/cplusplus/level2_simple_inference/0_data_process/venc/src/main.cpp
index 5c31df3a3..92c32c76b 100644
--- a/cplusplus/level2_simple_inference/0_data_process/venc/src/main.cpp
+++ b/cplusplus/level2_simple_inference/0_data_process/venc/src/main.cpp
@@ -184,6 +184,7 @@ Result InitResource()
         ERROR_LOG("acl get run mode failed");
         return FAILED;
     }
+    return SUCCESS;
 }
 
 Result Init(int imgWidth, int imgHeight)
-- 
Gitee


From d45ccbe545823837b3d15b23903a25062c401408 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Thu, 26 Jun 2025 05:54:24 +0000
Subject: [PATCH 26/46] =?UTF-8?q?!2690=20float4=20weight=20quantization=20?=
 =?UTF-8?q?sample=20Merge=20pull=20request=20!2690=20from=20=E5=BC=A0?=
 =?UTF-8?q?=E9=91=AB/zhangxin0623?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../fp4_weight_quantization/README_CN.md      |  50 ++++++
 .../fp4_weight_quantization/requirements.txt  |   7 +
 .../src/quantization.cfg                      |   8 +
 .../src/run_llama7b_quantization.py           | 162 ++++++++++++++++++
 .../fp4_weight_quantization/src/utils.py      |  69 ++++++++
 5 files changed, 296 insertions(+)
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
 create mode 100644 python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py

diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
new file mode 100644
index 000000000..93ea0a9ce
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
@@ -0,0 +1,50 @@
+# FP4伪量化
+
+## 1 FP4伪量化前提
+
+### 1.1 安装依赖
+
+本sample依赖包可参考[requirements.txt](requirements.txt)
+
+### 1.2 模型和数据集准备
+
+本sample以Llama2-7b模型，pileval和wikitext2数据集为示例，请用户自行下载。
+
+### 1.3 简易量化配置
+./src/quantization.cfg文件为用户自定义的简易量化配置，具体表示信息如下：
+
+| 字段 |类型| 说明 | 默认值 | 取值范围 |
+|:--| :-: | :-- | :-: | :-: |
+|skip_layers|str|跳过量化的层 |/|/|
+|weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False|
+|weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/FLOAT4_E2M1/HIFLOAT8/FLOAT8_E4M3FN|
+|weight_only_config.awq_quantize.grids_num|uint32|awq搜索格点数量|20|/|
+
+## 2 FLOAT4_E2M1量化示例
+> 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT4_E2M1
+
+
+### 2.1 使用接口方式调用
+
+请在当前目录执行如下命令运行示例程序
+
+验证fakequant模型脚本：
+
+`CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 python3 src/run_llama7b_quantization.py --calibration_data=/pile_val_backup/ --verify_data=/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py --model=/data/Models/pytorch/Llama2/Llama2_7b_hf`
+
+
+若出现如下信息，则说明量化成功：
+
+```none
+Test time taken:  9.0 min  38.24865388870239 s
+Score:  5.657759
+```
+
+推理成功后，在当前目录会生成量化日志文件./amct_log/amct_pytorch.log和./output文件夹，该文件夹内包含以下内容：
+
+- config.json：量化配置文件，描述了如何对模型中的每一层进行量化。
+- record.txt：量化因子记录文件。
+- awq_result.pt：存储了awq算法的scale和clip。
+- quant_factor.pt：存储量化缩放因子
+
+> 如果output目录下已经存在量化配置文件或量化因子记录文件，再次运行示例程序时，如果新生成的文件与已有文件同名，则会覆盖已有的量化配置文件或量化因子记录文件。
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt
new file mode 100644
index 000000000..55441d062
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/requirements.txt
@@ -0,0 +1,7 @@
+torch==2.1.0
+transformers==4.40.0
+accelerate==0.30.1
+datasets==2.19.1
+sentencepiece==0.2.0
+numpy==1.23.5
+protobuf==3.20.2
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg
new file mode 100644
index 000000000..a43152ad3
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/quantization.cfg
@@ -0,0 +1,8 @@
+skip_layers: "lm_head"
+weight_only_config: {
+    weight_compress_only: True
+    wts_type: FLOAT4_E2M1
+    awq_quantize:{
+        grids_num: 20
+    }
+}
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
new file mode 100644
index 000000000..4aac4fad9
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
@@ -0,0 +1,162 @@
+"""
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+"""
+
+import argparse
+import os
+import copy
+import time
+import tqdm
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoConfig
+from accelerate import infer_auto_device_map, dispatch_model
+from accelerate.utils.modeling import get_balanced_memory
+
+from utils import get_loaders,  get_llama2, get_calib_dataset
+import amct_pytorch as amct
+
+
+def build_model_and_enc(model, model_path, gpu_num):
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    if "mpt" in config.__class__.__name__.lower():
+        enc = AutoTokenizer.from_pretrained(
+            config.tokenizer_name, trust_remote_code=True
+        )
+    else:
+        enc = AutoTokenizer.from_pretrained(
+            model_path, use_fast=False, trust_remote_code=True
+        )
+
+    # Move the model to GPU (as much as possible) for LM evaluation
+    # max_memory = ['0:16GiB', '1:16GiB','2:16GiB', 'cpu:30GiB'], '0' means the first GPU that you specify.
+    # Using the full 16GiB is not recommended; reserve some space for other tensors during calculation
+    # please see the recommended memory allocation in the Word file
+    # Adjust the max_size according to the real situation
+    # a clever way:
+
+    max_memory = []
+    for i in range(gpu_num):
+        max_memory.append(f'{i}:12GiB')
+    max_memory.append('cpu:80GiB')
+    print('Max_memory allocation: \n', max_memory)
+
+    max_memory = [v.split(":") for v in (max_memory or [])]
+    max_memory = {(int(k) if k.isdigit() else k): v for k, v in max_memory}
+    kwargs = {
+        "max_memory": get_balanced_memory(
+            model, max_memory if len(max_memory) > 0 else None
+        )
+    }
+    model.tie_weights()
+    device_map = infer_auto_device_map(
+        model,
+        no_split_module_classes=[
+            "LlamaDecoderLayer",
+        ],
+        **kwargs,
+    )
+    model = dispatch_model(model, device_map=device_map, 
+        offload_dir=os.path.join(model_path, 'offload_dir'))
+
+    return model, enc
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--calibration_data', type=str, default='/pile_val_backup')
+    parser.add_argument('--verify_data', type=str, default='/data/Datasets/wikitext/wikitext-2-raw-v1/wikitext-2-raw/wikiscript.py')
+    parser.add_argument('--model', type=str, default='/data/Models/pytorch/Llama2/Llama2_7b_hf')
+
+    args = parser.parse_args()
+    model, model_path = get_llama2(args.model)
+    model = model.eval()
+    copied_model = copy.deepcopy(model)
+    gpu_num = torch.cuda.device_count()
+    model, enc = build_model_and_enc(model, model_path, gpu_num)
+
+    proto_path = './src/quantization.cfg'
+    config_file = './output/config.json'
+    record_file = './output/record.txt'
+
+    test_start_time = time.time()
+    # Phase1: generate quant config json
+    amct.create_post_quant_config(config_file,
+                             model,
+                             config_defination=proto_path)
+    
+    # Phase2: do weights calibration and generate calibration model
+    samples = get_calib_dataset(
+        data_path=args.calibration_data, tokenizer=enc, n_samples=512, block_size=518
+    )
+    samples = torch.cat(samples, dim=0)[:1,:]
+
+    post_quant_model = amct.create_post_quant_model(config_file,
+                                                    record_file,
+                                                    model)
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    with torch.no_grad():
+        post_quant_model(samples.to(next(post_quant_model.parameters()).device))
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    test_end_time = time.time()
+    total_time = test_end_time - test_start_time
+    print('Calibration time taken: ', total_time // 60, 'min ', total_time%60, 's')
+    # save memory: delete the model that is no longer used
+    del post_quant_model
+    
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    model, enc = build_model_and_enc(copied_model, model_path, gpu_num)
+    
+    # Phase3: save fakequant model
+    testenc = get_loaders(data_path=args.verify_data,
+                        enc=enc,
+                        seqlen=model.seqlen)
+
+    testenc = testenc.input_ids.to(model.device)
+
+    quant_model = amct.save_post_quant_model(record_file, model, mode='fakequant')
+
+    nsamples = testenc.numel() // model.seqlen
+    
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    
+    # Phase4: Test ppl result
+    nlls = []
+    test_start_time = time.time()
+    for i in tqdm.tqdm(range(nsamples), desc="evaluating..."):
+        batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(
+            quant_model.device
+        )
+        with torch.no_grad():
+            lm_logits = quant_model(batch).logits
+        shift_logits = lm_logits[:, :-1, :].contiguous().float().cpu()
+        shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:].cpu()
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(
+            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+        )
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    test_end_time = time.time()
+
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+
+    total_time = test_end_time - test_start_time
+    print('Test time taken: ', total_time // 60, 'min ', total_time%60, 's'  )
+    print('Score: ', ppl.item())
\ No newline at end of file
diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py
new file mode 100644
index 000000000..474a5b618
--- /dev/null
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/utils.py
@@ -0,0 +1,69 @@
+"""
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+"""
+
+import torch
+import torch.nn as nn
+from datasets import load_dataset,load_from_disk
+
+def get_llama2(model_path, seqlen=2048):
+    def skip(*args, **kwargs):
+        pass
+
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    from transformers import LlamaForCausalLM
+    
+    model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, offload_folder="offload/")
+
+    model.seqlen = seqlen
+    return model, model_path
+
+
+def get_loaders(data_path: str, enc, seqlen):
+
+    print('Loading dataset: Wikitext2')
+    testenc = load_dataset(data_path, 'wikitext-2-raw-v1', split='test', trust_remote_code=True)
+    testenc = enc("\n\n".join(testenc["text"]), return_tensors="pt")
+    
+    return testenc
+
+
+def get_calib_dataset(data_path, tokenizer=None, n_samples=512, block_size=512):
+    dataset = load_from_disk(data_path)
+    dataset = dataset.shuffle(seed=42)
+    samples = []
+    n_run = 0
+    for data in dataset:
+        line = data["text"]
+        line = line.strip()
+        line_encoded = tokenizer.encode(line)
+        if len(line_encoded) > 512:
+            continue
+        sample = torch.tensor([line_encoded])
+        if sample.numel() == 0:
+            continue
+        samples.append(sample)
+        n_run += 1
+        if n_run == n_samples:
+            break
+    # now concatenate all samples and split according to block size
+    cat_samples = torch.cat(samples, dim=1)
+    n_split = cat_samples.shape[1] // block_size
+    print(f" * Split into {n_split} blocks")
+    return [
+        cat_samples[:, i * block_size : (i + 1) * block_size] for i in range(n_split)
+    ]
-- 
Gitee


From 39eef173d06f5414e858610c524dd15bbb66a057 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Fri, 27 Jun 2025 03:20:13 +0000
Subject: [PATCH 27/46] =?UTF-8?q?!2694=20fix=20fp4=20weight=20quant=20samp?=
 =?UTF-8?q?le=20Merge=20pull=20request=20!2694=20from=20=E5=BC=A0=E9=91=AB?=
 =?UTF-8?q?/zhangxin0627?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../fp4_weight_quantization/src/run_llama7b_quantization.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
index 4aac4fad9..37c78da8d 100644
--- a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/src/run_llama7b_quantization.py
@@ -107,7 +107,7 @@ if __name__ == '__main__':
                                                     model)
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-
+    post_quant_model.config.use_cache = False
     with torch.no_grad():
         post_quant_model(samples.to(next(post_quant_model.parameters()).device))
     if torch.cuda.is_available():
-- 
Gitee


From d846deb65f12c69aa9144a0ef8adc4b72c9a9f67 Mon Sep 17 00:00:00 2001
From: shinoda <zhuyuchen7@huawei.com>
Date: Sat, 28 Jun 2025 03:40:45 +0000
Subject: [PATCH 28/46] !2696 remove sim_log configuration and fix tbufpool
 run.sh Merge pull request !2696 from shinoda/master

---
 .../MatmulInvocationNeo/run.sh                |   7 -
 .../MatmulLeakyReluInvocation/run.sh          |   7 -
 .../MatmulLeakyReluInvocationAsync/run.sh     |   7 -
 .../AbsDuplicateKernelInvocation/README.md    |   5 +-
 .../AbsGatherMaskKernelInvocation/README.md   |   5 +-
 .../AbsPadKernelInvocation/README.md          |   5 +-
 .../AbsUnPadKernelInvocation/README.md        |   5 +-
 .../ReduceMinKernelInvocation/README.md       |   5 +-
 .../WholeReduceSumKernelInvocation/README.md  |   5 +-
 .../WholeReduceSumKernelInvocation/run.sh     |   7 -
 .../MmadBiasInvocation/run.sh                 |   7 -
 .../MmadInvocation/run.sh                     |   7 -
 .../VectorAddMultiCoreWithTiling/README.md    |   5 +-
 .../VectorAddMultiCoreWithTiling/run.sh       |   7 -
 .../README.md                                 |   5 +-
 .../run.sh                                    |   7 -
 .../VectorAddSingleCore/README.md             |   5 +-
 .../VectorAddSingleCore/run.sh                |   7 -
 .../VectorAddSingleCoreWithTmpbuf/README.md   |   5 +-
 .../VectorAddSingleCoreWithTmpbuf/run.sh      |   7 -
 .../AddKernelInvocationNeo/README.md          |   5 +-
 .../AddKernelInvocationNeo/run.sh             |   7 -
 .../AddKernelInvocationTilingNeo/README.md    |   5 +-
 .../AddKernelInvocationTilingNeo/run.sh       |   7 -
 .../5_addn_kernellaunch/README.md             |   5 +-
 .../0_introduction/5_addn_kernellaunch/run.sh |   7 -
 .../DumpTensorKernelInvocationCube/run.sh     |   7 -
 .../DumpTensorKernelInvocationVector/run.sh   |   7 -
 .../MatmulABshareInvocation/run.sh            |   7 -
 .../ascendc/2_features/2_tbufpool/README.md   |   5 +-
 .../ascendc/2_features/2_tbufpool/main.cpp    |  14 +-
 operator/ascendc/2_features/2_tbufpool/run.sh | 138 +++++++++++++-----
 .../6_group_matmul/KernelLaunch/run.sh        |   7 -
 33 files changed, 123 insertions(+), 218 deletions(-)

diff --git a/operator/ascendc/0_introduction/11_matmul_kernellaunch/MatmulInvocationNeo/run.sh b/operator/ascendc/0_introduction/11_matmul_kernellaunch/MatmulInvocationNeo/run.sh
index d36adef16..dbca0e151 100755
--- a/operator/ascendc/0_introduction/11_matmul_kernellaunch/MatmulInvocationNeo/run.sh
+++ b/operator/ascendc/0_introduction/11_matmul_kernellaunch/MatmulInvocationNeo/run.sh
@@ -77,13 +77,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/run.sh b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/run.sh
index d36adef16..dbca0e151 100755
--- a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/run.sh
+++ b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/run.sh
@@ -77,13 +77,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocationAsync/run.sh b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocationAsync/run.sh
index 2fc9bfdcc..9e5b60ada 100755
--- a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocationAsync/run.sh
+++ b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocationAsync/run.sh
@@ -77,13 +77,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsDuplicateKernelInvocation/README.md b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsDuplicateKernelInvocation/README.md
index 269648acd..abdf7863a 100644
--- a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsDuplicateKernelInvocation/README.md
+++ b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsDuplicateKernelInvocation/README.md
@@ -51,10 +51,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
 
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsGatherMaskKernelInvocation/README.md b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsGatherMaskKernelInvocation/README.md
index 91d619e5d..add51272e 100644
--- a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsGatherMaskKernelInvocation/README.md
+++ b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsGatherMaskKernelInvocation/README.md
@@ -50,10 +50,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
 
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsPadKernelInvocation/README.md b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsPadKernelInvocation/README.md
index bead79954..5b2be9c26 100644
--- a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsPadKernelInvocation/README.md
+++ b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsPadKernelInvocation/README.md
@@ -51,10 +51,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
 
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsUnPadKernelInvocation/README.md b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsUnPadKernelInvocation/README.md
index a80082e68..a5d3c5607 100644
--- a/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsUnPadKernelInvocation/README.md
+++ b/operator/ascendc/0_introduction/16_unaligned_abs_kernellaunch/AbsUnPadKernelInvocation/README.md
@@ -49,10 +49,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
 
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/17_unaligned_reducemin_kernellaunch/ReduceMinKernelInvocation/README.md b/operator/ascendc/0_introduction/17_unaligned_reducemin_kernellaunch/ReduceMinKernelInvocation/README.md
index 32012880a..3c751e543 100644
--- a/operator/ascendc/0_introduction/17_unaligned_reducemin_kernellaunch/ReduceMinKernelInvocation/README.md
+++ b/operator/ascendc/0_introduction/17_unaligned_reducemin_kernellaunch/ReduceMinKernelInvocation/README.md
@@ -49,10 +49,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
 
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/README.md b/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/README.md
index 46e2f9776..72a6e06d4 100644
--- a/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/README.md
+++ b/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/README.md
@@ -98,10 +98,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
     
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/run.sh b/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/run.sh
index c4f01fdfd..f239a9a44 100755
--- a/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/run.sh
+++ b/operator/ascendc/0_introduction/19_unaligned_wholereduces_kernellaunch/WholeReduceSumKernelInvocation/run.sh
@@ -77,13 +77,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/run.sh b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/run.sh
index 0c9c7f40b..3359bc3fa 100644
--- a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/run.sh
+++ b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadBiasInvocation/run.sh
@@ -75,13 +75,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/run.sh b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/run.sh
index 0c9c7f40b..3359bc3fa 100644
--- a/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/run.sh
+++ b/operator/ascendc/0_introduction/20_mmad_kernellaunch/MmadInvocation/run.sh
@@ -75,13 +75,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/README.md
index 4a21d3854..e198055e3 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/README.md
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/README.md
@@ -62,10 +62,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
 
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/run.sh b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/run.sh
index 8fcd59730..eb66d5395 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/run.sh
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/run.sh
@@ -78,13 +78,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/README.md
index 3feee5e51..e2d449c67 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/README.md
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/README.md
@@ -63,10 +63,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
 
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/run.sh b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/run.sh
index 8fcd59730..eb66d5395 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/run.sh
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/run.sh
@@ -78,13 +78,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/README.md
index a3b82f9c0..db52b3f34 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/README.md
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/README.md
@@ -49,10 +49,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
     
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/run.sh b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/run.sh
index 8fcd59730..eb66d5395 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/run.sh
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/run.sh
@@ -78,13 +78,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/README.md
index 26353571a..f7a51c3e6 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/README.md
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/README.md
@@ -52,10 +52,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
     
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/run.sh b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/run.sh
index 8fcd59730..eb66d5395 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/run.sh
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/run.sh
@@ -78,13 +78,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/README.md b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/README.md
index 32e2f1008..3149e087d 100644
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/README.md
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/README.md
@@ -50,10 +50,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
 
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/run.sh b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/run.sh
index 8c6cb9c61..9bdf07910 100755
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/run.sh
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationNeo/run.sh
@@ -77,13 +77,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/README.md b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/README.md
index 447cf9219..8a409bdef 100644
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/README.md
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/README.md
@@ -52,10 +52,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
     
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/run.sh b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/run.sh
index 8c6cb9c61..9bdf07910 100755
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/run.sh
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationTilingNeo/run.sh
@@ -77,13 +77,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/0_introduction/5_addn_kernellaunch/README.md b/operator/ascendc/0_introduction/5_addn_kernellaunch/README.md
index e3eaed64a..1d39d13ac 100644
--- a/operator/ascendc/0_introduction/5_addn_kernellaunch/README.md
+++ b/operator/ascendc/0_introduction/5_addn_kernellaunch/README.md
@@ -105,10 +105,7 @@ kernel侧:
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
 
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 样例执行
 
diff --git a/operator/ascendc/0_introduction/5_addn_kernellaunch/run.sh b/operator/ascendc/0_introduction/5_addn_kernellaunch/run.sh
index 8c6cb9c61..9bdf07910 100755
--- a/operator/ascendc/0_introduction/5_addn_kernellaunch/run.sh
+++ b/operator/ascendc/0_introduction/5_addn_kernellaunch/run.sh
@@ -77,13 +77,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/run.sh b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/run.sh
index a755887a9..b38325a40 100755
--- a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/run.sh
+++ b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationCube/run.sh
@@ -75,13 +75,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationVector/run.sh b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationVector/run.sh
index e4cd2e80f..7ff642101 100755
--- a/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationVector/run.sh
+++ b/operator/ascendc/1_utilities/7_dumptensor/KernelLaunch/DumpTensorKernelInvocationVector/run.sh
@@ -77,13 +77,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/2_features/13_matmul_api_ibshare/MatmulABshareInvocation/run.sh b/operator/ascendc/2_features/13_matmul_api_ibshare/MatmulABshareInvocation/run.sh
index 18d24d6fb..b60d42817 100644
--- a/operator/ascendc/2_features/13_matmul_api_ibshare/MatmulABshareInvocation/run.sh
+++ b/operator/ascendc/2_features/13_matmul_api_ibshare/MatmulABshareInvocation/run.sh
@@ -75,13 +75,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
diff --git a/operator/ascendc/2_features/2_tbufpool/README.md b/operator/ascendc/2_features/2_tbufpool/README.md
index 964f96712..fe4e7becd 100644
--- a/operator/ascendc/2_features/2_tbufpool/README.md
+++ b/operator/ascendc/2_features/2_tbufpool/README.md
@@ -66,10 +66,7 @@
       export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
       ```
     
-    配置仿真模式日志文件目录，默认为sim_log。
-    ```bash
-    export CAMODEL_LOG_PATH=./sim_log
-    ```
+
 
   - 生成输入和真值
 
diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp
index ba4f849dd..1f6813a45 100644
--- a/operator/ascendc/2_features/2_tbufpool/main.cpp
+++ b/operator/ascendc/2_features/2_tbufpool/main.cpp
@@ -36,7 +36,7 @@ static bool CompareResult(const void *outputData, int64_t outSize) {
     CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize));
 #endif
     size_t goldenSize = outSize;
-    bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize);
+    bool ret = ReadFile("./output/golden.bin", goldenSize, goldenData, goldenSize);
     if (ret) {
         printf("ReadFile golden.bin success!\n");
     } else {
@@ -80,8 +80,8 @@ int32_t main(int32_t argc, char *argv[]) {
     uint8_t *zAdd = (uint8_t *)AscendC::GmAlloc(outputSizeAdd);
     uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize);
 
-    ReadFile("../input/input_x.bin", inputSize, x, inputSize);
-    ReadFile("../input/input_y.bin", inputSize, y, inputSize);
+    ReadFile("./input/input_x.bin", inputSize, x, inputSize);
+    ReadFile("./input/input_y.bin", inputSize, y, inputSize);
 
     GenerateTilingData(TOTAL_LENGTH, tiling);
 
@@ -89,7 +89,7 @@ int32_t main(int32_t argc, char *argv[]) {
 
     ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, *reinterpret_cast<TbufPoolTilingData *>(tiling)); // use this macro for cpu debug
 
-    WriteFile("../output/output.bin", zAdd, outputSizeAdd);
+    WriteFile("./output/output.bin", zAdd, outputSizeAdd);
 
     bool goldenResult = true;
     goldenResult = CompareResult(zAdd, outputSizeAdd);
@@ -122,8 +122,8 @@ int32_t main(int32_t argc, char *argv[]) {
     CHECK_ACL(aclrtMalloc((void **)&yDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST));
     CHECK_ACL(aclrtMalloc((void **)&zDeviceAdd, outputSizeAdd, ACL_MEM_MALLOC_HUGE_FIRST));
 
-    ReadFile("../input/input_x.bin", inputSize, xHost, inputSize);
-    ReadFile("../input/input_y.bin", inputSize, yHost, inputSize);
+    ReadFile("./input/input_x.bin", inputSize, xHost, inputSize);
+    ReadFile("./input/input_y.bin", inputSize, yHost, inputSize);
 
     GenerateTilingData(TOTAL_LENGTH, tiling);
 
@@ -140,7 +140,7 @@ int32_t main(int32_t argc, char *argv[]) {
 
     // Copy result to host memory and write to output file
     CHECK_ACL(aclrtMemcpy(zHostAdd, outputSizeAdd, zDeviceAdd, outputSizeAdd, ACL_MEMCPY_DEVICE_TO_HOST));
-    WriteFile("../output/output.bin", zHostAdd, outputSizeAdd);
+    WriteFile("./output/output.bin", zHostAdd, outputSizeAdd);
 
     // Compare the result with the golden result
     bool goldenResult = true;
diff --git a/operator/ascendc/2_features/2_tbufpool/run.sh b/operator/ascendc/2_features/2_tbufpool/run.sh
index 5ae89dbe9..04d5fd9fc 100644
--- a/operator/ascendc/2_features/2_tbufpool/run.sh
+++ b/operator/ascendc/2_features/2_tbufpool/run.sh
@@ -1,48 +1,114 @@
 #!/bin/bash
-SHORT=r:,v:,
-LONG=run-mode:,soc-version:,
+CURRENT_DIR=$(
+    cd $(dirname ${BASH_SOURCE:-$0})
+    pwd
+)
+
+BUILD_TYPE="Debug"
+INSTALL_PREFIX="${CURRENT_DIR}/out"
+
+SHORT=r:,v:,i:,b:,p:,
+LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:,
 OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
 eval set -- "$OPTS"
-while :
-do
+SOC_VERSION="Ascend310P3"
+
+while :; do
     case "$1" in
-        (-r | --run-mode )
-            RUN_MODE="$2"
-            shift 2;;
-        (-v | --soc-version )
-            SOC_VERSION="$2"
-            shift 2;;
-        (--)
-            shift;
-            break;;
-        (*)
-            echo "[ERROR] Unexpected option: $1";
-            break;;
+    -r | --run-mode)
+        RUN_MODE="$2"
+        shift 2
+        ;;
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2"
+        shift 2
+        ;;
+    -b | --build-type)
+        BUILD_TYPE="$2"
+        shift 2
+        ;;
+    -p | --install-prefix)
+        INSTALL_PREFIX="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
     esac
 done
 
-rm -rf build
-mkdir build
-cd build
-
-# in case of running op in simulator, use stub so instead
-if [ "${RUN_MODE}" = "sim" ]; then
-    export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g')
-    export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH
+RUN_MODE_LIST="cpu sim npu"
+if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then
+    echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!"
+    exit -1
 fi
 
-source $ASCEND_HOME_DIR/bin/setenv.bash
-export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4"
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
 
-cmake  -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION}  -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} ..
-make -j16
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
 
-if [ "${RUN_MODE}" = "npu" ]; then
-    ./tbufpool_direct_kernel_op
-elif [ "${RUN_MODE}" = "sim" ]; then
-    export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR}
-    export ASCEND_HOME_PATH=${ASCEND_HOME_DIR}
-    msprof op simulator --application=./tbufpool_direct_kernel_op
+export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH}
+echo "Current compile soc version is ${SOC_VERSION}"
+source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
+if [ "${RUN_MODE}" = "sim" ]; then
+    # in case of running op in simulator, use stub .so instead
+    export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
-    ./tbufpool_direct_kernel_op
-fi
\ No newline at end of file
+    export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+fi
+
+set -e
+rm -rf build out
+mkdir -p build
+cmake -B build \
+    -DRUN_MODE=${RUN_MODE} \
+    -DSOC_VERSION=${SOC_VERSION} \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
+    -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH}
+cmake --build build -j
+cmake --install build
+
+rm -f tbufpool_direct_kernel_op
+cp ./out/bin/tbufpool_direct_kernel_op ./
+rm -rf input output
+mkdir -p input output
+python3 scripts/gen_data.py
+(
+    export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH
+    if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then
+        if [ "${RUN_MODE}" = "npu" ]; then
+            msprof op --application=./tbufpool_direct_kernel_op
+        elif [ "${RUN_MODE}" = "sim" ]; then
+            msprof op simulator --application=./tbufpool_direct_kernel_op
+        elif [ "${RUN_MODE}" = "cpu" ]; then
+            ./tbufpool_direct_kernel_op
+        fi
+    else
+        ./tbufpool_direct_kernel_op
+    fi
+)
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/6_group_matmul/KernelLaunch/run.sh b/operator/ascendc/4_best_practices/6_group_matmul/KernelLaunch/run.sh
index 9bed4b408..ef12dd68e 100644
--- a/operator/ascendc/4_best_practices/6_group_matmul/KernelLaunch/run.sh
+++ b/operator/ascendc/4_best_practices/6_group_matmul/KernelLaunch/run.sh
@@ -75,13 +75,6 @@ source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
 if [ "${RUN_MODE}" = "sim" ]; then
     # in case of running op in simulator, use stub .so instead
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
-    if [ ! $CAMODEL_LOG_PATH ]; then
-        export CAMODEL_LOG_PATH=$(pwd)/sim_log
-    fi
-    if [ -d "$CAMODEL_LOG_PATH" ]; then
-        rm -rf $CAMODEL_LOG_PATH
-    fi
-    mkdir -p $CAMODEL_LOG_PATH
 elif [ "${RUN_MODE}" = "cpu" ]; then
     export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
 fi
-- 
Gitee


From 0209698dfeab87b7e8910f942200779f01cc6f1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Sat, 28 Jun 2025 06:06:15 +0000
Subject: [PATCH 29/46] =?UTF-8?q?!2700=20complement=20readme=20Merge=20pul?=
 =?UTF-8?q?l=20request=20!2700=20from=20=E5=BC=A0=E9=91=AB/zhangxin0628?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
index 93ea0a9ce..807a7c044 100644
--- a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
@@ -18,7 +18,7 @@
 |skip_layers|str|跳过量化的层 |/|/|
 |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False|
 |weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN|
-|weight_only_config.awq_quantize.grids_num|uint32|awq搜索格点数量|20|/|/|
+|weight_only_config.awq_quantize.grids_num|uint32|awq搜索格点数量|20|1~4294967295（整数）|
 
 ## 2 FLOAT4_E2M1量化示例
 > 当前quantization.cfg文件中weight_only_config.wts_type设置的值为FLOAT4_E2M1
-- 
Gitee


From ba1b6ebd485c56d8d93b519ef5088fe475792ba7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E9=91=AB?= <18810011293@163.com>
Date: Mon, 30 Jun 2025 06:12:54 +0000
Subject: [PATCH 30/46] =?UTF-8?q?!2701=20fix=20fp4=20readme=20Merge=20pull?=
 =?UTF-8?q?=20request=20!2701=20from=20=E5=BC=A0=E9=91=AB/fix0628?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
index 807a7c044..51cb57c93 100644
--- a/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
+++ b/python/level1_single_api/9_amct/amct_pytorch/fp4_weight_quantization/README_CN.md
@@ -17,7 +17,7 @@
 |:--| :-: | :-- | :-: | :-: |
 |skip_layers|str|跳过量化的层 |/|/|
 |weight_only_config.weight_compress_only|bool|是否为仅权重量化|False|True/False|
-|weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN|
+|weight_only_config.wts_type|enum|量化后权重类型|INT8|INT8/MXFP4_E2M1/HIFLOAT8/FLOAT8_E4M3FN/FLOAT4_E2M1/FLOAT4_E1M2|
 |weight_only_config.awq_quantize.grids_num|uint32|awq搜索格点数量|20|1~4294967295（整数）|
 
 ## 2 FLOAT4_E2M1量化示例
-- 
Gitee


From b1a865151ed52b7aff1b254bc4706ed10a40ac72 Mon Sep 17 00:00:00 2001
From: Y_keven <yingkaidi@huawei.com>
Date: Mon, 30 Jun 2025 09:29:44 +0000
Subject: [PATCH 31/46] =?UTF-8?q?!2695=20=E6=96=B0=E5=A2=9EpyACL=E5=BF=AB?=
 =?UTF-8?q?=E9=80=9F=E5=85=A5=E9=97=A8=E6=A0=B7=E4=BE=8B=20resnet50=5Ffirs?=
 =?UTF-8?q?tapp=20Merge=20pull=20request=20!2695=20from=20Y=5Fkeven/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../resnet50_firstapp/README.md               | 156 +++++++++++++++++
 .../resnet50_firstapp/data/.keep              |   1 +
 .../resnet50_firstapp/model/.keep             |   1 +
 .../resnet50_firstapp/src/constant.py         |  22 +++
 .../resnet50_firstapp/src/firstapp.py         | 164 ++++++++++++++++++
 5 files changed, 344 insertions(+)
 create mode 100644 python/level2_simple_inference/1_classification/resnet50_firstapp/README.md
 create mode 100644 python/level2_simple_inference/1_classification/resnet50_firstapp/data/.keep
 create mode 100644 python/level2_simple_inference/1_classification/resnet50_firstapp/model/.keep
 create mode 100644 python/level2_simple_inference/1_classification/resnet50_firstapp/src/constant.py
 create mode 100644 python/level2_simple_inference/1_classification/resnet50_firstapp/src/firstapp.py

diff --git a/python/level2_simple_inference/1_classification/resnet50_firstapp/README.md b/python/level2_simple_inference/1_classification/resnet50_firstapp/README.md
new file mode 100644
index 000000000..466780ba3
--- /dev/null
+++ b/python/level2_simple_inference/1_classification/resnet50_firstapp/README.md
@@ -0,0 +1,156 @@
+# 快速入门
+在本节中，您可以通过一个简单的图片分类应用了解使用AscendCL接口开发应用的基本过程以及开发过程中涉及的关键概念。
+
+## 什么是图片分类应用？
+
+“图片分类应用”，从名称上，我们也能直观地看出它的作用：按图片所属的类别来区分图片。
+
+![输入图片说明](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/resource/pyacl_resnet50_firstapp.png)
+
+但“图片分类应用”是怎么做到这一点的呢？当然得先有一个能做到图片分类的模型，我们可以直接使用一些训练好的开源模型，也可以基于开源模型的源码进行修改、重新训练，还可以自己基于算法、框架构建适合自己的模型。
+
+鉴于当前我们是入门内容，此处我们直接获取已训练好的开源模型，毕竟这种最简单、最快。此处我们选择的是ONNX框架的ResNet-50模型。
+
+ResNet-50模型的基本介绍如下：
+
+-   输入数据：RGB格式、224\*224分辨率的输入图片
+-   输出数据：图片的类别标签及其对应置信度
+
+> **说明：** 
+> -   置信度是指图片属于某个类别的可能性。
+> -   类别标签和类别的对应关系与训练模型时使用的数据集有关，需要查阅对应数据集的标签及类别的对应关系。
+
+## 环境要求
+
+-   操作系统及架构：CentOS 7.6 x86\_64、CentOS aarch64、Ubuntu 18.04 x86\_64、EulerOS x86、EulerOS aarch64
+-   芯片：Atlas 200/300/500 推理产品、Atlas 推理系列产品、Atlas 训练系列产品
+-   python及依赖的库：python3.7.5以上，Pillow、Numpy库
+-   已在环境上部署昇腾AI软件栈，并配置对应的环境变量，请参见[Link](https://www.hiascend.com/document/redirect/CannCommunityInstSoftware)中对应版本的CANN安装指南。  
+    
+    以下步骤中，开发环境指开发代码的环境，运行环境指运行算子、推理或训练等程序的环境，运行环境上必须带昇腾AI处理器。开发环境和运行环境可以合设在同一台服务器上，也可以分设。
+
+## 下载样例
+
+请选择其中一种样例下载方式：
+
+-   压缩包方式下载（下载时间较短，但步骤稍微复杂）
+
+    ```
+    # 1. samples仓右上角选择 【克隆/下载】 下拉框并选择 【下载ZIP】。     
+    # 2. 将ZIP包上传到开发环境中的普通用户家目录中，【例如：${HOME}/ascend-samples-master.zip】。      
+    # 3. 开发环境中，执行以下命令，解压zip包。      
+    cd ${HOME}     
+    unzip ascend-samples-master.zip
+    ```
+
+    注：如果需要下载其它版本代码，请先根据前置条件说明进行samples仓分支切换。
+
+-   命令行方式下载（下载时间较长，但步骤简单）
+
+    ```
+    # 开发环境，非root用户命令行中执行以下命令下载源码仓。    
+    cd ${HOME}     
+    git clone https://gitee.com/ascend/samples.git
+    ```
+
+    注：如果需要切换到其它tag版本，以v0.5.0为例，可执行以下命令。
+
+    ```
+    git checkout v0.5.0
+    ```
+
+下载成功后，切换到“ <SAMPLE_DIR>/python/level2_simple_inference/1_classification/resnet50_firstapp”目录下，查看该样例的目录结构，**下文所有的操作步骤均需先切换到resnet50_firstapp目录**：
+
+```
+resnet50_firstapp
+├── data                                // 用于存放测试图片的目录
+├── model                               // 用于存放模型文件的目录                 
+├── src
+│   ├── constant.py                     // 常量定义文件
+│   └── firstapp.py                     // 图片分类样例的运行文件
+```
+
+## 准备模型
+
+1.  以运行用户登录开发环境。
+
+2.  下载模型数据。
+
+    执行以下命令，将ONNX模型下载至“model”目录下，命令中的“***<SAMPLE_DIR>***”请根据实际样例包的存放目录替换
+    ```
+    cd <SAMPLE_DIR>/python/level2_simple_inference/1_classification/resnet50_firstapp/model
+    wget https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/003_Atc_Models/resnet50/resnet50.onnx
+    ```
+
+3.  执行模型转换。
+
+    执行以下命令（以 Atlas 推理系列产品为例），将原始模型转换为昇腾AI处理器能识别的\*.om模型文件。请注意，执行命令的用户需具有命令中相关路径的可读、可写权限。以下命令中的“***<soc_version>***”请根据实际昇腾AI处理器版本替换。
+
+    ```
+    atc --model=resnet50.onnx --framework=5 --output=resnet50 --input_shape="actual_input_1:1,3,224,224"  --soc_version=<soc_version>
+    ```
+    
+    -   --model：ResNet-50网络的模型文件路径。
+    -   --framework：原始框架类型。5表示ONNX。
+    -   --output：resnet50.om模型文件的路径。若此处修改模型文件名及存储路径，则需要同步修改src/firstapp.py中模型加载处的模型文件名及存储路径，即model_path变量值。
+    -   --soc\_version：昇腾AI处理器的版本。
+    
+    关于各参数的详细解释，请参见[《ATC离线模型编译工具》](https://www.hiascend.com/document/redirect/AscendTensorCompiler)。
+
+## 准备测试图片
+
+本次样例需要使用两张动物图片，请执行以下命令将图片下载至“data”目录，或通过以下链接获取后放至“data”目录。若此处修改测试图片文件名，则需要同步修改src/firstapp.py中读取图片处的文件名，即image_paths变量值。
+
+-   [测试图片1](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/models/aclsample/dog1_1024_683.jpg)
+
+    ```
+    cd <SAMPLE_DIR>/python/level2_simple_inference/1_classification/resnet50_firstapp/data
+    wget https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/models/aclsample/dog1_1024_683.jpg
+    ```
+
+-   [测试图片2](https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/models/aclsample/dog2_1024_683.jpg)
+
+    ```
+    cd <SAMPLE_DIR>/python/level2_simple_inference/1_classification/resnet50_firstapp/data
+    wget https://obs-9be7.obs.cn-east-2.myhuaweicloud.com/models/aclsample/dog2_1024_683.jpg
+    ```
+
+## 运行应用
+以运行用户将resnet50_firstapp目录放至运行环境，以运行用户登录运行环境，切换到resnet50_firstapp目录下，检查环境变量配置是否正确，执行以下命令。
+
+```
+python3 src/firstapp.py
+```
+可以得到如下输出，分别为两张测试图片的top5分类信息。
+
+其中[161]: 0.810220表示的是类别标识索引“161”的置信度为“0.810220”。
+
+```
+======== top5 inference results: =============
+[161]: 0.810220
+[162]: 0.103008
+[178]: 0.017485
+[166]: 0.013941
+[212]: 0.009581
+======== top5 inference results: =============
+[267]: 0.728255
+[266]: 0.101687
+[265]: 0.100111
+[151]: 0.004214
+[160]: 0.002731
+```
+
+>**说明：** 
+>类别标签和类别的对应关系与训练模型时使用的数据集有关，本样例使用的模型是基于imagenet数据集进行训练的，您可以在互联网上查阅对应数据集的标签及类别的对应关系。
+>
+>当前屏显信息中的类别标识与类别的对应关系如下：
+>
+>"161": ["basset", "basset hound"]
+>
+>"162": ["beagle"]
+>
+>"163": ["bloodhound", "sleuthhound"]
+>
+>"166": ["Walker hound", "Walker foxhound"]
+>
+>"167": ["English foxhound"]
\ No newline at end of file
diff --git a/python/level2_simple_inference/1_classification/resnet50_firstapp/data/.keep b/python/level2_simple_inference/1_classification/resnet50_firstapp/data/.keep
new file mode 100644
index 000000000..8d1c8b69c
--- /dev/null
+++ b/python/level2_simple_inference/1_classification/resnet50_firstapp/data/.keep
@@ -0,0 +1 @@
+ 
diff --git a/python/level2_simple_inference/1_classification/resnet50_firstapp/model/.keep b/python/level2_simple_inference/1_classification/resnet50_firstapp/model/.keep
new file mode 100644
index 000000000..8d1c8b69c
--- /dev/null
+++ b/python/level2_simple_inference/1_classification/resnet50_firstapp/model/.keep
@@ -0,0 +1 @@
+ 
diff --git a/python/level2_simple_inference/1_classification/resnet50_firstapp/src/constant.py b/python/level2_simple_inference/1_classification/resnet50_firstapp/src/constant.py
new file mode 100644
index 000000000..6b389277f
--- /dev/null
+++ b/python/level2_simple_inference/1_classification/resnet50_firstapp/src/constant.py
@@ -0,0 +1,22 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Memory allocation policy value that firstapp.py passes to acl.rt.malloc
+ACL_MEM_MALLOC_HUGE_FIRST = 0
+
+# Copy-direction values; firstapp.py passes them as the last argument of acl.rt.memcpy
+ACL_MEMCPY_HOST_TO_HOST = 0
+ACL_MEMCPY_HOST_TO_DEVICE = 1
+ACL_MEMCPY_DEVICE_TO_HOST = 2
+ACL_MEMCPY_DEVICE_TO_DEVICE = 3
diff --git a/python/level2_simple_inference/1_classification/resnet50_firstapp/src/firstapp.py b/python/level2_simple_inference/1_classification/resnet50_firstapp/src/firstapp.py
new file mode 100644
index 000000000..363becfb5
--- /dev/null
+++ b/python/level2_simple_inference/1_classification/resnet50_firstapp/src/firstapp.py
@@ -0,0 +1,164 @@
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import acl
+import numpy as np
+from PIL import Image
+from constant import ACL_MEM_MALLOC_HUGE_FIRST, \
+    ACL_MEMCPY_HOST_TO_DEVICE, ACL_MEMCPY_DEVICE_TO_HOST
+
+
+class Net:
+    def __init__(self, model_path):
+        # Set up ACL, load the offline model and allocate I/O buffers (return codes are not checked in this sample)
+        self.device_id = 0
+        # step1: initialize ACL
+        ret = acl.init()
+        # Select the device that runs the computation
+        ret = acl.rt.set_device(self.device_id)
+        # step2: load the model (ResNet-50 in this sample)
+        # Load the offline model file; returns the ID that identifies the model
+        self.model_id, ret = acl.mdl.load_from_file(model_path)
+        # Create an empty model description and keep its handle
+        self.model_desc = acl.mdl.create_desc()
+        # Fill model_desc with the description of the model identified by model_id
+        ret = acl.mdl.get_desc(self.model_desc, self.model_id)
+        # step3: create the input and output datasets
+        # Create the input dataset
+        self.input_dataset, self.input_data = self.prepare_dataset('input')
+        # Create the output dataset
+        self.output_dataset, self.output_data = self.prepare_dataset('output')
+
+    def prepare_dataset(self, io_type):
+        # Build a dataset with one buffer per model input (io_type == "input") or per model output (anything else)
+        if io_type == "input":
+            # Number of model inputs
+            io_num = acl.mdl.get_num_inputs(self.model_desc)
+            acl_mdl_get_size_by_index = acl.mdl.get_input_size_by_index
+        else:
+            # Number of model outputs
+            io_num = acl.mdl.get_num_outputs(self.model_desc)
+            acl_mdl_get_size_by_index = acl.mdl.get_output_size_by_index
+        # Create the aclmdlDataset object that describes the model's inputs or outputs
+        dataset = acl.mdl.create_dataset()
+        datas = []
+        for i in range(io_num):
+            # Buffer size required for this input/output
+            buffer_size = acl_mdl_get_size_by_index(self.model_desc, i)
+            # Allocate the buffer memory
+            buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST)
+            # Wrap the memory in a data buffer object
+            data_buffer = acl.create_data_buffer(buffer, buffer_size)
+            # Append the data buffer to the dataset
+            _, ret = acl.mdl.add_dataset_buffer(dataset, data_buffer)
+            datas.append({"buffer": buffer, "data": data_buffer, "size": buffer_size})
+        return dataset, datas
+
+    def forward(self, inputs):
+        # Run one inference on `inputs` (list of arrays, one per model input); returns softmax values as a flat array
+        # Copy every input into its matching buffer
+        input_num = len(inputs)
+        for i in range(input_num):
+            bytes_data = inputs[i].tobytes()
+            bytes_ptr = acl.util.bytes_to_ptr(bytes_data)
+            # Transfer the image data from host to device
+            ret = acl.rt.memcpy(self.input_data[i]["buffer"],   # destination address (device)
+                                self.input_data[i]["size"],     # destination size
+                                bytes_ptr,                      # source address (host)
+                                len(bytes_data),                # source size
+                                ACL_MEMCPY_HOST_TO_DEVICE)      # direction: host to device
+        # Execute the model
+        ret = acl.mdl.execute(self.model_id, self.input_dataset, self.output_dataset)
+        # Collect the model outputs; the caller derives the top-5 classes from the returned array
+        inference_result = []
+        for item in self.output_data:
+            buffer_host, ret = acl.rt.malloc_host(item["size"])
+            # Transfer the inference output from device to host
+            ret = acl.rt.memcpy(buffer_host,                    # destination address (host)
+                                item["size"],                   # destination size
+                                item["buffer"],                 # source address (device)
+                                item["size"],                   # source size
+                                ACL_MEMCPY_DEVICE_TO_HOST)      # direction: device to host
+            # Read the host memory back as a bytes object
+            bytes_out = acl.util.ptr_to_bytes(buffer_host, item["size"])
+            # Interpret the bytes as a float32 numpy array
+            data = np.frombuffer(bytes_out, dtype=np.float32)
+            inference_result.append(data)
+            # Release the host memory
+            ret = acl.rt.free_host(buffer_host)
+        vals = np.array(inference_result).flatten()
+        # Softmax; subtracting the max first keeps np.exp from overflowing on large logits (result is unchanged)
+        vals = np.exp(vals - np.max(vals))
+        vals = vals / np.sum(vals)
+
+        return vals
+
+    def __del__(self):
+        # Destructor: release resources in the reverse order of their creation
+        # Destroy the data buffers that belong to the input and output datasets
+        for dataset in [self.input_data, self.output_data]:
+            while dataset:
+                item = dataset.pop()
+                ret = acl.destroy_data_buffer(item["data"])    # destroy the data buffer object
+                ret = acl.rt.free(item["buffer"])              # free the buffer memory
+        ret = acl.mdl.destroy_dataset(self.input_dataset)      # destroy the input dataset
+        ret = acl.mdl.destroy_dataset(self.output_dataset)     # destroy the output dataset
+        # Destroy the model description
+        ret = acl.mdl.destroy_desc(self.model_desc)
+        # Unload the model
+        ret = acl.mdl.unload(self.model_id)
+        # Release the device
+        ret = acl.rt.reset_device(self.device_id)
+        # Finalize ACL
+        ret = acl.finalize()
+
+def transfer_pic(input_path):
+    # Preprocess one image file into a float32 array of shape (1, 3, 224, 224)
+    input_path = os.path.abspath(input_path)
+    with Image.open(input_path) as image_file:
+        # Force 3-channel RGB (grayscale/RGBA/palette files would break the per-channel math), then resize to 224*224
+        img = image_file.convert("RGB").resize((224, 224))
+        # Convert to a float32 ndarray
+        img = np.array(img).astype(np.float32)
+    # Normalize each pixel with the ImageNet per-channel mean and standard deviation
+    img -= [123.675, 116.28, 103.53]
+    img /= [58.395, 57.12, 57.375]
+    # Reverse the channel order from RGB to BGR
+    img = img[:, :, ::-1]
+    # Move the color channel first (HWC -> CHW), the layout this ResNet-50 model takes
+    img = img.transpose((2, 0, 1))
+    # Add the batch dimension and return
+    return np.array([img])
+
+def print_top_5(data):  # print the five largest entries of `data` as "[index]: value"; used on the flat array from Net.forward
+    top_5 = data.argsort()[::-1][:5]  # indices ordered by descending value, first five kept
+    print("======== top5 inference results: =============")
+    for j in top_5:
+        print("[%d]: %f" % (j, data[j]))
+
+if __name__ == "__main__":
+    image_paths = ["./data/dog1_1024_683.jpg", "./data/dog2_1024_683.jpg"]
+    model_path = './model/resnet50.om'
+    resnet50 = Net(model_path)
+    
+    for path in image_paths:
+        # Image preprocessing; for reference only, adapt it to your own needs
+        image = transfer_pic(path)
+        # Pass the data as a list ordered by model input; the ResNet-50 model in this sample has a single input
+        result = resnet50.forward([image])
+        # Print the top-5 results
+        print_top_5(result)
+
+    del resnet50
-- 
Gitee


From 07a3f06baeb89a819da81f391f943df94dd35f54 Mon Sep 17 00:00:00 2001
From: ruoshuisixue <lishangfan@h-partners.com>
Date: Tue, 1 Jul 2025 11:17:13 +0000
Subject: [PATCH 32/46] !2704 sc clean Merge pull request !2704 from
 ruoshuisixue/master

---
 .../AddTemplateCustom/op_kernel/tiling_key_add_custom.h     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/operator/ascendc/0_introduction/6_addtemplate_frameworklaunch/AddTemplateCustom/op_kernel/tiling_key_add_custom.h b/operator/ascendc/0_introduction/6_addtemplate_frameworklaunch/AddTemplateCustom/op_kernel/tiling_key_add_custom.h
index 61dcb08ce..1cc3d7700 100644
--- a/operator/ascendc/0_introduction/6_addtemplate_frameworklaunch/AddTemplateCustom/op_kernel/tiling_key_add_custom.h
+++ b/operator/ascendc/0_introduction/6_addtemplate_frameworklaunch/AddTemplateCustom/op_kernel/tiling_key_add_custom.h
@@ -56,14 +56,14 @@ ASCENDC_TPL_SEL(
     ASCENDC_TPL_DTYPE_SEL(D_T_Y, ADD_TPL_FP16),
     ASCENDC_TPL_DTYPE_SEL(D_T_Z, ADD_TPL_FP16),
     ASCENDC_TPL_UINT_SEL(TILE_NUM, ASCENDC_TPL_UI_LIST, 1, 8),
-    ASCENDC_TPL_BOOL_SEL(IS_SPLIT, 0, 1),
+    ASCENDC_TPL_BOOL_SEL(IS_SPLIT, 0, 1)
     ),
     ASCENDC_TPL_ARGS_SEL(
     ASCENDC_TPL_DTYPE_SEL(D_T_X, ADD_TPL_FP32),
     ASCENDC_TPL_DTYPE_SEL(D_T_Y, ADD_TPL_FP32),
     ASCENDC_TPL_DTYPE_SEL(D_T_Z, ADD_TPL_FP32),
     ASCENDC_TPL_UINT_SEL(TILE_NUM, ASCENDC_TPL_UI_LIST, 1, 8),
-    ASCENDC_TPL_BOOL_SEL(IS_SPLIT, 0, 1),
-    ),
+    ASCENDC_TPL_BOOL_SEL(IS_SPLIT, 0, 1)
+    )
 );
 #endif
\ No newline at end of file
-- 
Gitee


From 4ff91413ea15980d4623fb12381b558fcb39f75b Mon Sep 17 00:00:00 2001
From: zhanghao0689 <zhanghao152@huawei.com>
Date: Wed, 2 Jul 2025 07:28:50 +0000
Subject: [PATCH 33/46] !2705 add bank conflict cases Merge pull request !2705
 from zhanghao0689/master

---
 .../KernelLaunch/CMakeLists.txt               |  47 ++++
 .../4_bank_conflict/KernelLaunch/README.md    |  88 ++++++++
 .../KernelLaunch/add_custom_v1.cpp            |  86 ++++++++
 .../KernelLaunch/add_custom_v2.cpp            |  90 ++++++++
 .../KernelLaunch/cmake/cpu_lib.cmake          |   9 +
 .../KernelLaunch/cmake/npu_lib.cmake          |  11 +
 .../4_bank_conflict/KernelLaunch/data_utils.h | 203 ++++++++++++++++++
 .../4_bank_conflict/KernelLaunch/main.cpp     | 127 +++++++++++
 .../4_bank_conflict/KernelLaunch/run.sh       | 113 ++++++++++
 .../KernelLaunch/scripts/gen_data.py          |  25 +++
 .../KernelLaunch/scripts/verify_result.py     |  53 +++++
 .../4_bank_conflict/README.md                 |  70 +++++-
 operator/ascendc/4_best_practices/README.md   |   4 +-
 13 files changed, 924 insertions(+), 2 deletions(-)
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/CMakeLists.txt
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/README.md
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/add_custom_v1.cpp
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/add_custom_v2.cpp
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/cmake/cpu_lib.cmake
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/cmake/npu_lib.cmake
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/data_utils.h
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/main.cpp
 create mode 100755 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/run.sh
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/scripts/gen_data.py
 create mode 100644 operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/scripts/verify_result.py

diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/CMakeLists.txt b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/CMakeLists.txt
new file mode 100644
index 000000000..392189fe1
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/CMakeLists.txt
@@ -0,0 +1,47 @@
+cmake_minimum_required(VERSION 3.16)
+project(Ascend_c)
+
+set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type")
+set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest"
+    CACHE STRING "ASCEND CANN package installation directory"
+)
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
+endif()
+if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
+endif()
+
+# KERNEL_FILES lists the Ascend C kernel sources that are compiled into the kernel library.
+# The library target is created by cmake/npu_lib.cmake (ascendc_library) or cmake/cpu_lib.cmake (add_library).
+file(GLOB KERNEL_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v1.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v2.cpp
+)
+
+if("${RUN_MODE}" STREQUAL "cpu")
+    include(cmake/cpu_lib.cmake)
+elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu")
+    include(cmake/npu_lib.cmake)
+else()  # no kernel library target exists for any other mode, so stop configuring instead of failing later at link time
+    message(FATAL_ERROR "invalid RUN_MODE: ${RUN_MODE}")
+endif()
+add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp)
+
+target_compile_options(ascendc_kernels_bbit PRIVATE
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:-g>>
+    -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror
+)
+
+target_link_libraries(ascendc_kernels_bbit PRIVATE
+    $<BUILD_INTERFACE:$<$<OR:$<STREQUAL:${RUN_MODE},npu>,$<STREQUAL:${RUN_MODE},sim>>:host_intf_pub>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:ascendcl>>
+    ascendc_kernels_${RUN_MODE}
+)
+
+install(TARGETS ascendc_kernels_bbit
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/README.md b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/README.md
new file mode 100644
index 000000000..f72b521cd
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/README.md
@@ -0,0 +1,88 @@
+## 目录结构介绍
+
+```
+├── KernelLaunch
+│   ├── cmake                   // 编译工程文件
+│   ├── scripts
+│   │   ├── gen_data.py         // 输入数据和真值数据生成脚本
+│   │   └── verify_result.py    // 验证输出数据和真值数据是否一致的验证脚本
+│   ├── add_custom_v1.cpp       // 算子kernel实现1：未优化前实现
+│   ├── add_custom_v2.cpp       // 算子kernel实现2：优化地址分配，消除Bank冲突后的实现
+│   ├── CMakeLists.txt          // 编译工程文件
+│   ├── data_utils.h            // 数据读入写出函数
+│   ├── main.cpp                // 主函数，调用算子的应用程序，含CPU域及NPU域调用
+│   └── run.sh                  // 编译运行算子的脚本
+```
+
+## 代码实现介绍
+
+本样例中实现的是固定shape为1*4096的Add算子。
+
+- kernel实现
+
+  Add算子的数学表达式为：
+
+  ```
+  z = x + y
+  ```
+
+  计算逻辑是：Ascend C提供的矢量计算接口的操作元素都为LocalTensor，输入数据需要先搬运进片上存储，然后使用计算接口完成两个输入参数相加，得到最终结果，再搬出到外部存储上。
+
+  Add算子的实现流程分为3个基本任务：CopyIn，Compute，CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory，分别存储在xLocal、yLocal，Compute任务负责对xLocal、yLocal执行加法操作，计算结果存储在zLocal中，CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。
+
+  实现1：请参考[add_custom_v1.cpp](./add_custom_v1.cpp)，xLocal地址为0，yLocal地址为0x4000，zLocal地址为0x8000。xLocal与yLocal存在读读冲突，xLocal与zLocal存在读写冲突。
+
+  实现2：请参考[add_custom_v2.cpp](./add_custom_v2.cpp)，为了避免Bank冲突，通过配置InitBuffer时的bufferSize来调整Tensor地址，xLocal地址为0，yLocal地址为0x4100，zLocal地址为0x10000。
+- 调用实现
+
+  1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成；
+  2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。
+
+  应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。
+
+## 运行样例算子
+
+- 打开样例目录
+  以命令行方式下载样例代码，master分支为例。
+
+  ```bash
+  cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch
+  ```
+- 配置环境变量
+
+  请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量的命令。
+
+  - 默认路径，root用户安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    ```
+  - 默认路径，非root用户安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    ```
+  - 指定路径install_path，安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+    ```
+- 样例执行
+
+  ```bash
+  bash run.sh -r [RUN_MODE] -v  [SOC_VERSION]
+  ```
+
+  - RUN_MODE：编译方式，可选择CPU调试，NPU仿真，NPU上板。支持参数为[cpu / sim / npu]
+  - SOC_VERSION：昇腾AI处理器型号，如果无法确定具体的[SOC_VERSION]，则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询，在查询到的“Name”前增加Ascend信息，例如“Name”对应取值为xxxyy，实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号：
+    - Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+  示例如下，Ascendxxxyy请替换为实际的AI处理器型号。
+
+  ```bash
+  bash run.sh -r cpu -v Ascendxxxyy
+  ```
+
+## 更新说明
+
+
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/07/01 | 新增本readme |
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/add_custom_v1.cpp
new file mode 100644
index 000000000..9d9774405
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/add_custom_v1.cpp
@@ -0,0 +1,86 @@
+/**
+ * @file add_custom_v1.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "kernel_operator.h"
+
+using AscendC::TPosition;
+namespace {
+constexpr int32_t TOTAL_LENGTH = 4096;                            // number of float elements in each of x, y and z
+constexpr int32_t BUFFER_NUM = 1;                                 // number of tensors held by each queue
+} // namespace
+
+class KernelAdd { // un-optimized Add kernel (z = x + y over TOTAL_LENGTH floats); baseline that exhibits the bank conflicts
+public:
+    __aicore__ inline KernelAdd() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) // bind the global tensors and size the three queues
+    {
+        xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH);
+        yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH);
+        zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TOTAL_LENGTH * sizeof(float)); // per the sample README, xLocal lands at 0
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TOTAL_LENGTH * sizeof(float)); // per the sample README, yLocal lands at 0x4000
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TOTAL_LENGTH * sizeof(float)); // per the sample README, zLocal lands at 0x8000
+    }
+    __aicore__ inline void Process() // one pass: copy in, add, copy out
+    {
+        CopyIn();
+        Compute();
+        CopyOut();
+    }
+
+private:
+    __aicore__ inline void CopyIn() // copy x and y from global memory into local tensors and enqueue them
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
+        AscendC::DataCopy(xLocal, xGm, TOTAL_LENGTH);
+        AscendC::DataCopy(yLocal, yGm, TOTAL_LENGTH);
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute() // zLocal = xLocal + yLocal, then hand zLocal to the output queue
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
+        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
+        AscendC::Add(zLocal, xLocal, yLocal, TOTAL_LENGTH);
+        outQueueZ.EnQue<float>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut() // copy the result from the local tensor back to global memory
+    {
+        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
+        AscendC::DataCopy(zGm, zLocal, TOTAL_LENGTH);
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueY;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<float> xGm;
+    AscendC::GlobalTensor<float> yGm;
+    AscendC::GlobalTensor<float> zGm;
+};
+
+extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z) // kernel entry: z = x + y via KernelAdd
+{
+    KernelAdd op;
+    op.Init(x, y, z);
+    op.Process();
+}
+
+#ifndef ASCENDC_CPU_DEBUG // the <<<>>> launch wrapper is only compiled when ASCENDC_CPU_DEBUG is not defined
+void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) // launch add_custom_v1 on `stream`
+{
+    add_custom_v1<<<blockDim, nullptr, stream>>>(x, y, z);
+}
+#endif
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/add_custom_v2.cpp
new file mode 100644
index 000000000..65e7dd7e5
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/add_custom_v2.cpp
@@ -0,0 +1,90 @@
+/**
+ * @file add_custom_v2.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "kernel_operator.h"
+
+using AscendC::TPosition;
+namespace {
+constexpr int32_t TOTAL_LENGTH = 4096;        // number of float elements in each of x, y and z
+constexpr int32_t BUFFER_NUM = 1;             // number of tensors held by each queue
+constexpr int32_t BANKGROUP_SIZE = 1024 * 64; // 64 KB in bytes: 16 banks of 4 KB each
+constexpr int32_t ONE_REPEAT_SIZE = 256;      // bytes processed per repeat; used as padding after xLocal
+} // namespace
+
+class KernelAdd { // Add kernel (z = x + y over TOTAL_LENGTH floats) whose queue sizes are chosen to avoid bank conflicts
+public:
+    __aicore__ inline KernelAdd() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) // bind the global tensors and size the three queues
+    {
+        xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH);
+        yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH);
+        zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH);
+        // Pad xLocal by one repeat (256 bytes) so yLocal starts at 0x4100 (per the sample README); avoids the read-read conflict
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE);
+        // Size yLocal as 64 KB minus xLocal's size so zLocal starts at 0x10000 (per the sample README); avoids the read-write conflict
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, BANKGROUP_SIZE - (TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TOTAL_LENGTH * sizeof(float));
+    }
+    __aicore__ inline void Process() // one pass: copy in, add, copy out
+    {
+        CopyIn();
+        Compute();
+        CopyOut();
+    }
+
+private:
+    __aicore__ inline void CopyIn() // copy x and y from global memory into local tensors and enqueue them
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
+        AscendC::DataCopy(xLocal, xGm, TOTAL_LENGTH);
+        AscendC::DataCopy(yLocal, yGm, TOTAL_LENGTH);
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute() // zLocal = xLocal + yLocal, then hand zLocal to the output queue
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
+        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
+        AscendC::Add(zLocal, xLocal, yLocal, TOTAL_LENGTH);
+        outQueueZ.EnQue<float>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut() // copy the result from the local tensor back to global memory
+    {
+        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
+        AscendC::DataCopy(zGm, zLocal, TOTAL_LENGTH);
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueY;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<float> xGm;
+    AscendC::GlobalTensor<float> yGm;
+    AscendC::GlobalTensor<float> zGm;
+};
+
+extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z) // kernel entry: z = x + y via KernelAdd
+{
+    KernelAdd op;
+    op.Init(x, y, z);
+    op.Process();
+}
+
+#ifndef ASCENDC_CPU_DEBUG // the <<<>>> launch wrapper is only compiled when ASCENDC_CPU_DEBUG is not defined
+void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) // launch add_custom_v2 on `stream`
+{
+    add_custom_v2<<<blockDim, nullptr, stream>>>(x, y, z);
+}
+#endif
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/cmake/cpu_lib.cmake b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/cmake/cpu_lib.cmake
new file mode 100644
index 000000000..5362c8b5a
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/cmake/cpu_lib.cmake
@@ -0,0 +1,9 @@
+if(NOT DEFINED ENV{CMAKE_PREFIX_PATH})  # no CMAKE_PREFIX_PATH in the environment: point at the CANN package's tikicpulib
+    set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake)
+endif()
+find_package(tikicpulib REQUIRED)
+
+add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES})  # kernel sources built as an ordinary shared library
+target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION})
+target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17)  # debug info, no optimization
+install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/cmake/npu_lib.cmake b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/cmake/npu_lib.cmake
new file mode 100644
index 000000000..f92b095d1
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/cmake/npu_lib.cmake
@@ -0,0 +1,11 @@
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)  # first candidate: compiler/tikcpp
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)  # second candidate: tools/tikcpp
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed")
+endif()
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+# ascendc_library (called here after including ascendc.cmake) builds the kernel sources into the Ascend C kernel library
+ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES})
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/data_utils.h b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/data_utils.h
new file mode 100644
index 000000000..09d906371
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/data_utils.h
@@ -0,0 +1,203 @@
+/**
+ * @file data_utils.h
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef DATA_UTILS_H
+#define DATA_UTILS_H
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cstdio>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+typedef enum {
+    DT_UNDEFINED = -1,
+    FLOAT = 0,
+    HALF = 1,
+    INT8_T = 2,
+    INT32_T = 3,
+    UINT8_T = 4,
+    INT16_T = 6,
+    UINT16_T = 7,
+    UINT32_T = 8,
+    INT64_T = 9,
+    UINT64_T = 10,
+    DOUBLE = 11,
+    BOOL = 12,
+    STRING = 13,
+    COMPLEX64 = 16,
+    COMPLEX128 = 17,
+    BF16 = 27
+} printDataType;
+
+#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO]  " fmt "\n", ##args)
+#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN]  " fmt "\n", ##args)
+#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR]  " fmt "\n", ##args)
+#define CHECK_ACL(x)                                                                        \
+    do { /* Logs a failed ACL call and continues; it does not abort. */                     \
+        aclError __ret = (x);                                                               \
+        if (__ret != ACL_ERROR_NONE) {                                                      \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+        }                                                                                   \
+    } while (0)
+
+/**
+ * @brief Read a whole regular file into a caller-provided buffer
+ * @param [in] filePath: file path
+ * @param [out] fileSize: number of bytes read, set only on success
+ * @param [out] buffer: destination memory, must be non-null and hold bufferSize bytes
+ * @param [in] bufferSize: capacity of buffer in bytes
+ * @return true on success, false on failure (an error is logged)
+ */
+inline bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize)
+{
+    struct stat sBuf;
+    if (stat(filePath.c_str(), &sBuf) == -1) {
+        ERROR_LOG("failed to get file");
+        return false;
+    }
+    if (S_ISREG(sBuf.st_mode) == 0) {
+        ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+        return false;
+    }
+    // The stream closes itself when it goes out of scope, so no return path needs an explicit close().
+    std::ifstream file(filePath, std::ios::binary);
+    if (!file.is_open()) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    std::filebuf *buf = file.rdbuf();
+    size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in);
+    if (size == 0) {
+        ERROR_LOG("file size is 0");
+        return false;
+    }
+    if (size > bufferSize) {
+        ERROR_LOG("file size is larger than buffer size");
+        return false;
+    }
+    buf->pubseekpos(0, std::ios::in);
+    // sgetn() reports how many bytes it delivered; a short read must not be reported as success.
+    if (static_cast<size_t>(buf->sgetn(static_cast<char *>(buffer), size)) != size) {
+        ERROR_LOG("Read file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    fileSize = size;
+    return true;
+}
+
+/**
+ * @brief Create (or truncate) a file with owner read/write permission and write a buffer to it
+ * @param [in] filePath: file path
+ * @param [in] buffer: data to write to file, must be non-null
+ * @param [in] size: number of bytes to write
+ * @return true only if all size bytes were written by the single write() call
+ */
+inline bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Write file failed. buffer is nullptr");
+        return false;
+    }
+
+    int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+    if (fd < 0) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    // write() returns -1 on error, so keep the result signed and test the failure explicitly.
+    ssize_t writeSize = write(fd, buffer, size);
+    (void)close(fd);
+    if (writeSize < 0 || static_cast<size_t>(writeSize) != size) {
+        ERROR_LOG("Write file Failed.");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T> void DoPrintData(const T *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0); // elementsPerRow is used as a modulus below
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << +data[i]; // unary plus: print 8-bit integers as numbers, not characters
+        if (i % elementsPerRow == elementsPerRow - 1) { // last element of a row: end the line
+            std::cout << std::endl;
+        }
+    }
+}
+
+void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]);
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16)
+{
+    if (data == nullptr) {
+        ERROR_LOG("Print data failed. data is nullptr");
+        return;
+    }
+
+    switch (dataType) {
+        case BOOL:
+            DoPrintData(reinterpret_cast<const bool *>(data), count, elementsPerRow);
+            break;
+        case INT8_T:
+            DoPrintData(reinterpret_cast<const int8_t *>(data), count, elementsPerRow);
+            break;
+        case UINT8_T:
+            DoPrintData(reinterpret_cast<const uint8_t *>(data), count, elementsPerRow);
+            break;
+        case INT16_T:
+            DoPrintData(reinterpret_cast<const int16_t *>(data), count, elementsPerRow);
+            break;
+        case UINT16_T:
+            DoPrintData(reinterpret_cast<const uint16_t *>(data), count, elementsPerRow);
+            break;
+        case INT32_T:
+            DoPrintData(reinterpret_cast<const int32_t *>(data), count, elementsPerRow);
+            break;
+        case UINT32_T:
+            DoPrintData(reinterpret_cast<const uint32_t *>(data), count, elementsPerRow);
+            break;
+        case INT64_T:
+            DoPrintData(reinterpret_cast<const int64_t *>(data), count, elementsPerRow);
+            break;
+        case UINT64_T:
+            DoPrintData(reinterpret_cast<const uint64_t *>(data), count, elementsPerRow);
+            break;
+        case HALF:
+            DoPrintHalfData(reinterpret_cast<const aclFloat16 *>(data), count, elementsPerRow);
+            break;
+        case FLOAT:
+            DoPrintData(reinterpret_cast<const float *>(data), count, elementsPerRow);
+            break;
+        case DOUBLE:
+            DoPrintData(reinterpret_cast<const double *>(data), count, elementsPerRow);
+            break;
+        default:
+            ERROR_LOG("Unsupported type: %d", dataType);
+    }
+    std::cout << std::endl;
+}
+#endif // DATA_UTILS_H
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/main.cpp b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/main.cpp
new file mode 100644
index 000000000..8a65f8fa6
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/main.cpp
@@ -0,0 +1,127 @@
+/**
+ * @file main.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "data_utils.h"
+#ifndef ASCENDC_CPU_DEBUG
+#include "acl/acl.h"
+extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
+extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
+using KernelEntry = void(*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *);
+#else
+#include "tikicpulib.h"
+extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z);
+extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z);
+using KernelEntry = void(*)(GM_ADDR, GM_ADDR, GM_ADDR);
+
+#endif
+
+struct ArgInfo {
+    std::string fileName;
+    size_t length;
+};
+
+#ifndef ASCENDC_CPU_DEBUG
+
+void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::vector<ArgInfo> &inputsInfo,
+                std::vector<ArgInfo> &outputsInfo)
+{
+    std::vector<uint8_t *> inputHost(inputsInfo.size());
+    std::vector<uint8_t *> inputDevice(inputsInfo.size());
+    std::vector<uint8_t *> outputHost(outputsInfo.size());
+    std::vector<uint8_t *> outputDevice(outputsInfo.size());
+
+    for (uint32_t i = 0; i < inputsInfo.size(); i++) {
+        CHECK_ACL(aclrtMallocHost((void **)(&inputHost[i]), inputsInfo[i].length));
+        CHECK_ACL(aclrtMalloc((void **)(&inputDevice[i]), inputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST));
+        ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, inputHost[i], inputsInfo[i].length);
+        CHECK_ACL(aclrtMemcpy(inputDevice[i], inputsInfo[i].length, inputHost[i], inputsInfo[i].length,
+                              ACL_MEMCPY_HOST_TO_DEVICE));
+    }
+
+    for (uint32_t i = 0; i < outputsInfo.size(); i++) {
+        CHECK_ACL(aclrtMallocHost((void **)(&outputHost[i]), outputsInfo[i].length));
+        CHECK_ACL(aclrtMalloc((void **)(&outputDevice[i]), outputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST));
+    }
+
+    kernelEntry(blockDim, stream, inputDevice[0], inputDevice[1], outputDevice[0]);
+    CHECK_ACL(aclrtSynchronizeStream(stream));
+    for (uint32_t i = 0; i < outputsInfo.size(); i++) {
+        CHECK_ACL(aclrtMemcpy(outputHost[i], outputsInfo[i].length, outputDevice[i], outputsInfo[i].length,
+                              ACL_MEMCPY_DEVICE_TO_HOST));
+        WriteFile(outputsInfo[i].fileName, outputHost[i], outputsInfo[i].length);
+        CHECK_ACL(aclrtFree(outputDevice[i]));
+        CHECK_ACL(aclrtFreeHost(outputHost[i]));
+    }
+
+    for (uint32_t i = 0; i < inputsInfo.size(); i++) {
+        CHECK_ACL(aclrtFree(inputDevice[i]));
+        CHECK_ACL(aclrtFreeHost(inputHost[i]));
+    }
+}
+
+#else
+
+#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo)                                  \
+    {                                                                                               \
+        std::vector<uint8_t *> input(inputsInfo.size());                                            \
+        std::vector<uint8_t *> output(outputsInfo.size());                                          \
+                                                                                                    \
+        for (uint32_t i = 0; i < inputsInfo.size(); i++) {                                          \
+            input[i] = (uint8_t *)AscendC::GmAlloc(inputsInfo[i].length);                           \
+            ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, input[i], inputsInfo[i].length); \
+        }                                                                                           \
+                                                                                                    \
+        for (uint32_t i = 0; i < outputsInfo.size(); i++) {                                         \
+            output[i] = (uint8_t *)AscendC::GmAlloc(outputsInfo[i].length);                         \
+        }                                                                                           \
+                                                                                                    \
+        AscendC::SetKernelMode(KernelMode::AIV_MODE);                                               \
+        ICPU_RUN_KF(kernelEntry, blockDim, input[0], input[1], output[0]);                          \
+        for (uint32_t i = 0; i < inputsInfo.size(); i++) {                                          \
+            AscendC::GmFree((void *)input[i]);                                                      \
+        }                                                                                           \
+                                                                                                    \
+        for (uint32_t i = 0; i < outputsInfo.size(); i++) {                                         \
+            WriteFile(outputsInfo[i].fileName, output[i], outputsInfo[i].length);                   \
+            AscendC::GmFree((void *)output[i]);                                                     \
+        }                                                                                           \
+    }
+
+#endif
+
+int32_t main(int32_t argc, char *argv[])
+{
+    uint32_t blockDim = 1;
+    uint32_t dataLen = 4096;
+    size_t inputByteSize = dataLen * sizeof(float);
+    size_t outputByteSize = dataLen * sizeof(float);
+
+    std::vector<ArgInfo> inputsInfo = {{"./input/input_x.bin", inputByteSize}, {"./input/input_y.bin", inputByteSize}};
+    std::vector<ArgInfo> outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}};
+    std::vector<ArgInfo> outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}};
+
+#ifndef ASCENDC_CPU_DEBUG
+    CHECK_ACL(aclInit(nullptr));
+    int32_t deviceId = 0;
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    aclrtStream stream = nullptr;
+    CHECK_ACL(aclrtCreateStream(&stream));
+
+    KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info);
+    KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info);
+
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+#else
+    KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info);
+    KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info);
+#endif
+    return 0;
+}
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/run.sh b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/run.sh
new file mode 100755
index 000000000..0c5aef144
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/run.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+CURRENT_DIR=$(
+    cd $(dirname ${BASH_SOURCE:-$0})
+    pwd
+)
+
+BUILD_TYPE="Debug"
+INSTALL_PREFIX="${CURRENT_DIR}/out"
+
+SHORT=r:,v:,i:,b:,p:,
+LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+SOC_VERSION="Ascend310P3"
+
+while :; do
+    case "$1" in
+    -r | --run-mode)
+        RUN_MODE="$2"
+        shift 2
+        ;;
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2"
+        shift 2
+        ;;
+    -b | --build-type)
+        BUILD_TYPE="$2"
+        shift 2
+        ;;
+    -p | --install-prefix)
+        INSTALL_PREFIX="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
+    esac
+done
+
+RUN_MODE_LIST="cpu sim npu"
+if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then
+    echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!"
+    exit -1
+fi
+
+VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4"
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+
+export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH}
+echo "Current compile soc version is ${SOC_VERSION}"
+source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
+if [ "${RUN_MODE}" = "sim" ]; then
+    # in case of running op in simulator, use stub .so instead
+    export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+elif [ "${RUN_MODE}" = "cpu" ]; then
+    export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+fi
+
+set -e
+rm -rf build out
+mkdir -p build
+cmake -B build \
+    -DRUN_MODE=${RUN_MODE} \
+    -DSOC_VERSION=${SOC_VERSION} \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
+    -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH}
+cmake --build build -j
+cmake --install build
+
+rm -f ascendc_kernels_bbit
+cp ./out/bin/ascendc_kernels_bbit ./
+rm -rf input output
+mkdir -p input output
+python3 scripts/gen_data.py
+(
+    export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH
+    if [ "${RUN_MODE}" = "npu" ]; then
+        msprof op --launch-count=2 --output=./prof ./ascendc_kernels_bbit
+    elif [ "${RUN_MODE}" = "sim" ]; then
+        msprof op simulator --launch-count=2 --output=./prof ./ascendc_kernels_bbit
+    elif [ "${RUN_MODE}" = "cpu" ]; then
+        ./ascendc_kernels_bbit
+    fi
+)
+md5sum output/*.bin
+python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin
+python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/scripts/gen_data.py b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/scripts/gen_data.py
new file mode 100644
index 000000000..86bbba89d
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/scripts/gen_data.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import numpy as np
+
+
+def gen_golden_data_simple():
+    input_x = np.random.uniform(1, 100, [1, 4096]).astype(np.float32)
+    input_y = np.random.uniform(1, 100, [1, 4096]).astype(np.float32)
+    golden = (input_x + input_y).astype(np.float32)
+
+    input_x.tofile("./input/input_x.bin")
+    input_y.tofile("./input/input_y.bin")
+    golden.tofile("./output/golden.bin")
+
+
+if __name__ == "__main__":
+    gen_golden_data_simple()
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/scripts/verify_result.py b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/scripts/verify_result.py
new file mode 100644
index 000000000..6a38a3b2b
--- /dev/null
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch/scripts/verify_result.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import sys
+import numpy as np
+
+# for float32
+relative_tol = 1e-4
+absolute_tol = 1e-5
+error_tol = 1e-4
+
+
+def verify_result(output, golden):
+    output = np.fromfile(output, dtype=np.float32).reshape(-1)
+    golden = np.fromfile(golden, dtype=np.float32).reshape(-1)
+    different_element_results = np.isclose(output,
+                                           golden,
+                                           rtol=relative_tol,
+                                           atol=absolute_tol,
+                                           equal_nan=True)
+    different_element_indexes = np.where(different_element_results == False)[0]
+    for index in range(len(different_element_indexes)):
+        real_index = different_element_indexes[index]
+        golden_data = golden[real_index]
+        output_data = output[real_index]
+        print(
+            "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" %
+            (real_index, golden_data, output_data,
+             abs(output_data - golden_data) / golden_data))
+        if index == 100:
+            break
+    error_ratio = float(different_element_indexes.size) / golden.size
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
+    return error_ratio <= error_tol
+
+
+if __name__ == '__main__':
+    try:
+        res = verify_result(sys.argv[1], sys.argv[2])
+        if not res:
+            raise ValueError("[ERROR] result error")
+        else:
+            print("test pass")
+    except Exception as e:
+        print(e)
+        sys.exit(1)
diff --git a/operator/ascendc/4_best_practices/4_bank_conflict/README.md b/operator/ascendc/4_best_practices/4_bank_conflict/README.md
index 71c50671d..f2b828892 100644
--- a/operator/ascendc/4_best_practices/4_bank_conflict/README.md
+++ b/operator/ascendc/4_best_practices/4_bank_conflict/README.md
@@ -1 +1,69 @@
-减少bank冲突（待补充）
\ No newline at end of file
+## 概述
+
+本样例介绍基于Add算子优化bank冲突的实现，并提供核函数直调方法。
+
+## 目录结构介绍
+
+```
+├── 4_bank_conflict      // 使用核函数直调的方式调用Add自定义算子
+│   └── KernelLaunch     // Kernel Launch方式调用核函数样例
+```
+
+## 算子描述
+
+算子实现的是固定shape为1×4096的Add算子。
+
+Add的计算公式为：
+
+```python
+z = x + y
+```
+
+- x：输入，形状为\[1, 4096]，数据类型为float；
+- y：输入，形状为\[1, 4096]，数据类型为float；
+- z：输出，形状为\[1, 4096]，数据类型为float；
+
+## 算子规格描述
+
+<table>
+<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">Add</td></tr>
+</tr>
+<tr><td rowspan="3" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
+<tr><td align="center">x</td><td align="center">1 * 4096</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">y</td><td align="center">1 * 4096</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+</tr>
+<tr><td rowspan="1" align="center">算子输出</td><td align="center">z</td><td align="center">1 * 4096</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">add_custom_v1 / add_custom_v2</td></tr>
+</table>
+
+## 支持的产品型号
+
+本样例支持如下产品型号：
+
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+## 编译运行样例算子
+
+针对自定义算子工程，编译运行包含如下步骤：
+
+- 编译自定义算子工程；
+- 调用执行自定义算子；
+
+详细操作如下所示。
+
+### 1. 获取源码包
+
+编译运行此样例前，请参考[准备：获取样例代码](../README.md#codeready)获取源码包。
+
+### 2. 编译运行样例工程
+
+- [KernelLaunch样例运行](./KernelLaunch/README.md)
+
+## 更新说明
+
+
+| 时间       | 更新事项         |
+| ---------- | ---------------- |
+| 2025/07/01 | 新增直调方式样例 |
diff --git a/operator/ascendc/4_best_practices/README.md b/operator/ascendc/4_best_practices/README.md
index 653d01b0c..f5379bbbf 100644
--- a/operator/ascendc/4_best_practices/README.md
+++ b/operator/ascendc/4_best_practices/README.md
@@ -6,6 +6,7 @@
 
 | 目录名称                        | 功能描述                                   | 运行环境                                   |
 | ------------------------------- | ------------------------------------------ | ------------------------------------------ |
+| [4_bank_conflict](./4_bank_conflict) | 基于Ascend C的bank冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
 | [6_group_matmul](./6_group_matmul) | 基于Ascend C的group matmul算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
 | [21_all_gather_matmul_custom](./21_all_gather_matmul_custom) | 基于Ascend C的AllGatherMatmul算子性能调优样例 | Atlas A2训练系列产品 |
 | [22_matmul_reduce_scatter_custom](./22_matmul_reduce_scatter_custom) | 基于Ascend C的MatmulReduceScatter算子性能调优样例 | Atlas A2训练系列产品 |
@@ -43,7 +44,8 @@
 ## 更新说明
 | 时间       | 更新事项                                     |
 | ---------- | -------------------------------------------- |
+| 2025/07/01 | 新增4_bank_conflict样例         |
 | 2024/12/19 | 新增23_matmul_all_reduce_custom样例         |
 | 2024/12/19 | 新增22_matmul_reduce_scatter_custom样例         |
 | 2024/12/19 | 新增21_all_gather_matmul_custom样例         |
-| 2024/11/20 | 新增6_group_matmul样例                     |
\ No newline at end of file
+| 2024/11/20 | 新增6_group_matmul样例                     |
-- 
Gitee


From e120936690a2ea9220fea64fb29431e8c21316c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B6=8A=E9=99=8C=E5=BA=A6=E9=98=A1?=
 <gaoming39@huawei.com>
Date: Tue, 8 Jul 2025 11:37:19 +0000
Subject: [PATCH 34/46] =?UTF-8?q?!2708=20=E5=A2=9E=E5=8A=A0=E4=B8=A4?=
 =?UTF-8?q?=E7=A7=8D=E5=BD=92=E7=BA=A6=E6=8C=87=E4=BB=A4=E7=9A=84=E6=A0=B7?=
 =?UTF-8?q?=E4=BE=8B=20Merge=20pull=20request=20!2708=20from=20=E8=B6=8A?=
 =?UTF-8?q?=E9=99=8C=E5=BA=A6=E9=98=A1/Regulation=5FDirective=5FReduce=5FC?=
 =?UTF-8?q?ustom?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../AclNNInvocationNaive/main.cpp             |  26 +--
 .../14_reduce_frameworklaunch/README.md       |   5 +
 .../ReduceCustom/op_host/reduce_custom.cpp    |  33 +--
 .../ReduceCustom/op_kernel/reduce_custom.cpp  | 204 +++++++++++++-----
 4 files changed, 189 insertions(+), 79 deletions(-)

diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp
index 7ecffbc7e..734d48798 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp
@@ -95,7 +95,7 @@ void DestroyResources(std::vector<void *> tensors, std::vector<void *> deviceAdd
 int main(int argc, char **argv)
 {
     constexpr int64_t inputShape = 4096;
-    constexpr float resFloat = 4096.0;
+    float resFloat = 0;
     // 1. (Fixed code) Initialize device / stream, refer to the list of external interfaces of acl
     // Update deviceId to your own device id
     int32_t deviceId = 0;
@@ -110,21 +110,21 @@ int main(int argc, char **argv)
     void *outputZDeviceAddr = nullptr;
     aclTensor *inputX = nullptr;
     aclTensor *outputZ = nullptr;
-    std::vector<aclFloat16> inputXHostData(inputXShape[0]);
-    std::vector<aclFloat16> outputZHostData(outputZShape[0]);
+    std::vector<float> inputXHostData(inputXShape[0], 1.0);
+    std::vector<float> outputZHostData(outputZShape[0], 0);
+
     for (int i = 0; i < inputXShape[0]; ++i) {
-        inputXHostData[i] = aclFloatToFloat16(1.0);
-    }
-    for (int i = 0; i < outputZShape[0]; ++i) {
-        outputZHostData[i] = aclFloatToFloat16(resFloat);
+        inputXHostData[i] = 1.0;
+        resFloat += 1.0;
     }
+
     std::vector<void *> tensors = {inputX, outputZ};
     std::vector<void *> deviceAddrs = {inputXDeviceAddr, outputZDeviceAddr};
     // Create inputX aclTensor
-    ret = CreateAclTensor(inputXHostData, inputXShape, &inputXDeviceAddr, aclDataType::ACL_FLOAT16, &inputX);
+    ret = CreateAclTensor(inputXHostData, inputXShape, &inputXDeviceAddr, aclDataType::ACL_FLOAT, &inputX);
     CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED);
     // Create outputZ aclTensor
-    ret = CreateAclTensor(outputZHostData, outputZShape, &outputZDeviceAddr, aclDataType::ACL_FLOAT16, &outputZ);
+    ret = CreateAclTensor(outputZHostData, outputZShape, &outputZDeviceAddr, aclDataType::ACL_FLOAT, &outputZ);
     CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED);
 
     // 3. Call the API of the custom operator library
@@ -154,9 +154,9 @@ int main(int argc, char **argv)
     // 5. Get the output value, copy the result from device memory to host memory, need to modify according to the
     // interface of the API
     auto size = GetShapeSize(outputZShape);
-    std::vector<aclFloat16> resultData(size, 0);
+    std::vector<float> resultData(size, 0);
     ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputZDeviceAddr,
-                      size * sizeof(aclFloat16), ACL_MEMCPY_DEVICE_TO_HOST);
+                      size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST);
     CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret);
               DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return FAILED);
 
@@ -164,11 +164,11 @@ int main(int argc, char **argv)
     DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr);
 
     // print the output result
-    std::vector<aclFloat16> goldenData(size, aclFloatToFloat16(resFloat));
+    std::vector<float> goldenData(size, resFloat);
 
     LOG_PRINT("result is:\n");
     for (int64_t i = 0; i < 10; i++) {
-        LOG_PRINT("%.1f ", aclFloat16ToFloat(resultData[i]));
+        LOG_PRINT("%.1f ", resultData[i]);
     }
     LOG_PRINT("\n");
     if (std::equal(resultData.begin(), resultData.begin() + 1, goldenData.begin())) {
diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md
index 6f9bc094b..04e13268d 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md
@@ -25,6 +25,10 @@ z = sum(x)
 
 3、长度在float输入(2KB,16KB]，或者half输入(4KB,32KB]时。由于一条WholeReduceSum的累加效率比使用两条BlockReduceSum的累加效率更高。所以采用两条WholeReduceSum（而不是两条BlockReduceSum+一条WholeReduceSum），得到这段buffer的累加和。
 
+4、float输入长度为10000时，对应WholeReduceSumImpl中的处理方法：在Counter模式下采用WholeReduceSum指令，循环处理二维数据中的每一行，得到每一行的归约结果。
+
+5、float输入长度为20000时，对应BinaryReduceSumImpl中的处理方法：在Counter模式下，先将运算数据一分为二，使用Add指令将两部分数据相加，如此循环往复，最后用一条WholeReduceSum指令得到归约结果。相比WholeReduceSum单指令操作的方式，这种方式在数据量较大、循环次数较多的场景下性能更优。
+
 注意代码中使用了Counter模式。
 
 ## 算子规格描述
@@ -134,3 +138,4 @@ CANN软件包中提供了工程创建工具msOpGen，ReduceCustom算子工程可
 | ---------- | ---------------------------- |
 | 2024/09/14 | 新增ReduceCustom样例 |
 | 2024/11/18 | 算子工程改写为由msOpGen生成 |
+| 2025/07/07 | 增加两种归约操作样例 |
diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp
index 5bec0d17e..743fb162b 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp
@@ -9,21 +9,26 @@
  */
 #include "reduce_custom_tiling.h"
 #include "register/op_def_registry.h"
-#define REDUCE_TILING_0 1
-#define REDUCE_TILING_1 2
-#define REDUCE_TILING_2 3
 
 namespace optiling {
+constexpr uint32_t REDUCE_TILING_1 = 1;
+constexpr uint32_t REDUCE_TILING_2 = 2;
+constexpr uint32_t REDUCE_TILING_3 = 3;
+constexpr uint32_t REDUCE_TILING_4 = 4;
+constexpr uint32_t REDUCE_TILING_5 = 5;
+
 constexpr uint32_t BLOCK_DIM = 1;
 constexpr uint32_t ONE_REPEAT_LEN = 256;
 constexpr uint32_t ONE_BLOCK_LEN = 32;
 constexpr uint32_t OUT_SHAPE = 32;
-constexpr uint32_t HALF_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(uint16_t);
-constexpr uint32_t FLOAT_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(float);
-constexpr uint32_t HALF_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_BLOCK_LEN / sizeof(uint16_t);
-constexpr uint32_t FLOAT_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(float) * ONE_BLOCK_LEN / sizeof(float);
-constexpr uint32_t HALF_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_REPEAT_LEN / sizeof(uint16_t);
-constexpr uint32_t FLOAT_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(float) * ONE_REPEAT_LEN / sizeof(float);
+constexpr uint32_t HALF_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(uint16_t); // 128
+constexpr uint32_t FLOAT_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(float); // 64
+constexpr uint32_t HALF_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_BLOCK_LEN / sizeof(uint16_t); // 2048
+constexpr uint32_t FLOAT_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(float) * ONE_BLOCK_LEN / sizeof(float); //512
+constexpr uint32_t HALF_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_REPEAT_LEN / sizeof(uint16_t); // 16384
+constexpr uint32_t FLOAT_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(float) * ONE_REPEAT_LEN / sizeof(float); // 4096
+constexpr uint32_t WHOLEREDUCESUM_SIGLE_MODE = 10000;
+constexpr uint32_t BINARYREDUCESUM_SIGLE_MODE = 20000;
 static ge::graphStatus TilingFunc(gert::TilingContext *context)
 {
     TilingData tiling;
@@ -32,15 +37,19 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context)
     // Only WholeReduceSum is used under 256B.
     if ((totalLength <= HALF_THRESHOLD0 && inputDtype == ge::DT_FLOAT16) ||
         (totalLength <= FLOAT_THRESHOLD0 && inputDtype == ge::DT_FLOAT)) {
-        context->SetTilingKey(REDUCE_TILING_0);
+        context->SetTilingKey(REDUCE_TILING_1);
     // One WholeReduceSum and one BlockReduceSum are used in (256B,2KB](for float input) and (256B,4KB](for half input).
     } else if ((totalLength <= HALF_THRESHOLD1 && inputDtype == ge::DT_FLOAT16) ||
         (totalLength <= FLOAT_THRESHOLD1 && inputDtype == ge::DT_FLOAT)) {
-        context->SetTilingKey(REDUCE_TILING_1);
+        context->SetTilingKey(REDUCE_TILING_2);
     // Two WholeReduceSum are used in (2KB,16KB](for float input) and (4KB,32KB](for half input).
     } else if ((totalLength <= HALF_THRESHOLD2 && inputDtype == ge::DT_FLOAT16) ||
         (totalLength <= FLOAT_THRESHOLD2 && inputDtype == ge::DT_FLOAT)) {
-        context->SetTilingKey(REDUCE_TILING_2);
+        context->SetTilingKey(REDUCE_TILING_3);
+    } else if (totalLength == WHOLEREDUCESUM_SIGLE_MODE) {
+        context->SetTilingKey(REDUCE_TILING_4);
+    } else if (totalLength == BINARYREDUCESUM_SIGLE_MODE) {
+        context->SetTilingKey(REDUCE_TILING_5);
     }
     context->SetBlockDim(BLOCK_DIM);
     tiling.set_totalLength(totalLength);
diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp
index c4ac235d3..d8d631332 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp
@@ -8,15 +8,20 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  */
 #include "kernel_operator.h"
-#define REDUCE_TILING_0 1
-#define REDUCE_TILING_1 2
-#define REDUCE_TILING_2 3
+#define REDUCE_TILING_1 1
+#define REDUCE_TILING_2 2
+#define REDUCE_TILING_3 3
+#define REDUCE_TILING_4 4
+#define REDUCE_TILING_5 5
 
+template<typename DTYPE>
 class KernelReduce {
 static constexpr uint32_t DEFAULT_BLK_STRIDE = 1;
 static constexpr uint32_t DEFAULT_REP_STRIDE = 8;
 static constexpr uint32_t REP_LEN = 256;
 static constexpr uint32_t BLK_LEN = 32;
+static constexpr uint32_t ONE_REPEAT_FLOAT_SIZE = REP_LEN / 4;
+static constexpr uint32_t BINARY_BOUNDARY = DEFAULT_REP_STRIDE * 2;
 public:
     __aicore__ inline KernelReduce() {}
     __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, uint32_t totalLength, uint32_t outLength)
@@ -24,105 +29,192 @@ public:
         this->totalLength = totalLength;
         this->outLength = outLength;
 
-        xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x, totalLength);
-        zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z, outLength);
-        pipe.InitBuffer(inQueueX, 1, totalLength * sizeof(DTYPE_X));
-        pipe.InitBuffer(outQueueZ, 1, outLength * sizeof(DTYPE_Z));
+        xGm.SetGlobalBuffer((__gm__ DTYPE *)x, totalLength);
+        zGm.SetGlobalBuffer((__gm__ DTYPE *)z, outLength);
+        pipe.InitBuffer(inQueueX, 1, totalLength * sizeof(DTYPE));
+        pipe.InitBuffer(outQueueZ, 1, outLength * sizeof(DTYPE));
     }
-    __aicore__ inline void Process1()
+
+    template<size_t ComputeKey = 0>
+    __aicore__ inline void Compute()
     {
-        CopyIn();
-        Compute1();
-        CopyOut();
+        if constexpr (ComputeKey == REDUCE_TILING_1) {
+            Compute1();
+        } else if constexpr (ComputeKey == REDUCE_TILING_2) {
+            Compute2();
+        } else if constexpr (ComputeKey == REDUCE_TILING_3) {
+            Compute3();
+        } else if constexpr (ComputeKey == REDUCE_TILING_4) {
+            Compute4();
+        } else if constexpr (ComputeKey == REDUCE_TILING_5) {
+            Compute5();
+        }
     }
-    __aicore__ inline void Process2()
-    {
-        CopyIn();
-        Compute2();
-        CopyOut();
-    }
-    __aicore__ inline void Process3()
+
+    template<size_t ComputeKey = 0>
+    __aicore__ inline void Process()
     {
         CopyIn();
-        Compute3();
+        Compute<ComputeKey>();
         CopyOut();
     }
 
 private:
     __aicore__ inline void CopyIn()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.AllocTensor<DTYPE>();
         AscendC::DataCopy(xLocal, xGm, totalLength);
         inQueueX.EnQue(xLocal);
     }
     // Only WholeReduceSum is used under 256B.
     __aicore__ inline void Compute1()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
-        constexpr int64_t maskLen = REP_LEN / sizeof(DTYPE_X);
-        AscendC::WholeReduceSum<DTYPE_X>(zLocal, xLocal, maskLen, 1,
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+
+        constexpr int64_t maskLen = REP_LEN / sizeof(DTYPE);
+        AscendC::WholeReduceSum<DTYPE>(zLocal, xLocal, maskLen, 1,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
-        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
         inQueueX.FreeTensor(xLocal);
     }
     // One WholeReduceSum and one BlockReduceSum are used in (256B,2KB](for float input) and (256B,4KB](for half input).
     __aicore__ inline void Compute2()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
-        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE_X));
-        AscendC::LocalTensor<DTYPE_X> tempTensor1 = calcBuf.Get<DTYPE_X>();
-        constexpr uint32_t c0Count = BLK_LEN / sizeof(DTYPE_X);
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE));
+        AscendC::LocalTensor<DTYPE> tempTensor1 = calcBuf.Get<DTYPE>();
+        constexpr uint32_t c0Count = BLK_LEN / sizeof(DTYPE);
         const uint32_t blockNum0 = (totalLength + c0Count - 1) / c0Count;
+
         AscendC::SetMaskCount();
-        AscendC::SetVectorMask<DTYPE_X>(0, totalLength);
-        AscendC::BlockReduceSum<DTYPE_X, false>(tempTensor1, xLocal, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE>(0, totalLength);
+        AscendC::BlockReduceSum<DTYPE, false>(tempTensor1, xLocal, 1, AscendC::MASK_PLACEHOLDER,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
-        AscendC::SetVectorMask<DTYPE_X>(0, blockNum0);
-        AscendC::WholeReduceSum<DTYPE_X, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE>(0, blockNum0);
+        AscendC::WholeReduceSum<DTYPE, false>(zLocal, tempTensor1, 1, AscendC::MASK_PLACEHOLDER,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
         AscendC::SetMaskNorm();
-        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
         inQueueX.FreeTensor(xLocal);
     }
     // Two WholeReduceSum are used in (2KB,16KB](for float input) and (4KB,32KB](for half input).
     __aicore__ inline void Compute3()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
-        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE_X));
-        AscendC::LocalTensor<DTYPE_X> tempTensor1 = calcBuf.Get<DTYPE_X>();
-        const uint32_t repeatNum = (totalLength * sizeof(DTYPE_X) + REP_LEN - 1) / REP_LEN;
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE));
+        AscendC::LocalTensor<DTYPE> tempTensor1 = calcBuf.Get<DTYPE>();
+        const uint32_t repeatNum = (totalLength * sizeof(DTYPE) + REP_LEN - 1) / REP_LEN;
+
         AscendC::SetMaskCount();
-        AscendC::SetVectorMask<DTYPE_X>(0, totalLength);
-        AscendC::WholeReduceSum<DTYPE_X, false>(tempTensor1, xLocal, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE>(0, totalLength);
+        AscendC::WholeReduceSum<DTYPE, false>(tempTensor1, xLocal, 1, AscendC::MASK_PLACEHOLDER,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
-        AscendC::SetVectorMask<DTYPE_X>(0, repeatNum);
-        AscendC::WholeReduceSum<DTYPE_X, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE>(0, repeatNum);
+        AscendC::WholeReduceSum<DTYPE, false>(zLocal, tempTensor1, 1, AscendC::MASK_PLACEHOLDER,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
         AscendC::SetMaskNorm();
-        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+
+    __aicore__ inline void Compute4()
+    {
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+
+        int64_t start = AscendC::GetSystemCycle();
+        WholeReduceSumImpl(zLocal, xLocal, 1, totalLength);
+        int64_t runCycle = AscendC::GetSystemCycle() - start;
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
         inQueueX.FreeTensor(xLocal);
     }
+
+    __aicore__ inline void Compute5()
+    {
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+
+        int64_t start = AscendC::GetSystemCycle();
+        BinaryReduceSumImpl(zLocal, xLocal, 1, totalLength);
+        int64_t runCycle = AscendC::GetSystemCycle() - start;
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+
     __aicore__ inline void CopyOut()
     {
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.DeQue<DTYPE>();
         AscendC::DataCopy(zGm, zLocal, this->outLength);
         outQueueZ.FreeTensor(zLocal);
     }
 
+    __aicore__ inline void WholeReduceSumImpl(const AscendC::LocalTensor<float>& dst, const AscendC::LocalTensor<float>& src,
+        const uint32_t bsLength, const uint32_t hLength)
+    { 
+        AscendC::SetMaskCount();
+        for (uint32_t i = 0; i < bsLength; i++) {
+            uint32_t totalNum = hLength;
+            AscendC::LocalTensor<float> srcTmp = src[i * hLength];
+            AscendC::LocalTensor<float> dstTmp = dst[i * hLength];
+            while (totalNum > 1) {
+                AscendC::SetVectorMask<uint8_t, AscendC::MaskMode::COUNTER>(0, totalNum);
+                AscendC::WholeReduceSum<float, false>(dstTmp, srcTmp, AscendC::MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE,
+                    DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
+                AscendC::PipeBarrier<PIPE_V>();
+                totalNum = AscendC::DivCeil(totalNum, ONE_REPEAT_FLOAT_SIZE);
+                srcTmp = dstTmp;
+            }
+        }
+        AscendC::ResetMask();
+        AscendC::SetMaskNorm();
+    }
+
+    __aicore__ inline void BinaryReduceSumImpl(const AscendC::LocalTensor<float>& dst, const AscendC::LocalTensor<float>& src,
+    const uint32_t bsLength, const uint32_t hLength)
+    {
+        AscendC::BinaryRepeatParams binaryParams;
+        AscendC::UnaryRepeatParams unaryParams;
+        AscendC::SetMaskCount();
+        for (uint32_t i = 0; i < bsLength; i++) {
+            uint32_t totalNum = hLength;
+            AscendC::LocalTensor<float> srcTmp = src[i * hLength];
+            AscendC::LocalTensor<float> dstTmp = dst[i * hLength];
+            while (totalNum > ONE_REPEAT_FLOAT_SIZE) {
+                uint32_t halfNum = AscendC::DivCeil(totalNum, BINARY_BOUNDARY) * DEFAULT_REP_STRIDE;
+                AscendC::SetVectorMask<uint8_t, AscendC::MaskMode::COUNTER>(0, totalNum - halfNum);
+                AscendC::Add<float, false>(dstTmp, srcTmp, srcTmp[halfNum], AscendC::MASK_PLACEHOLDER, 1, binaryParams);
+                AscendC::PipeBarrier<PIPE_V>();
+                totalNum = halfNum;
+                srcTmp = dstTmp;
+            }
+            AscendC::SetVectorMask<uint8_t, AscendC::MaskMode::COUNTER>(0, totalNum);
+            AscendC::WholeReduceSum<float, false>(dstTmp, srcTmp, AscendC::MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE,
+                DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
+            AscendC::PipeBarrier<PIPE_V>();
+        }
+        AscendC::ResetMask();
+        AscendC::SetMaskNorm();
+    }
+
 private:
     AscendC::TPipe pipe;
     AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueX;
     AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueZ;
     AscendC::TBuf<AscendC::TPosition::VECCALC> calcBuf;
-    AscendC::GlobalTensor<DTYPE_X> xGm;
-    AscendC::GlobalTensor<DTYPE_Z> zGm;
+    AscendC::GlobalTensor<DTYPE> xGm;
+    AscendC::GlobalTensor<DTYPE> zGm;
     uint32_t totalLength;
     uint32_t outLength;
 };
@@ -130,14 +222,18 @@ private:
 extern "C" __global__ __aicore__ void reduce_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
 {
     GET_TILING_DATA(tiling_data, tiling);
-    KernelReduce op;
+    KernelReduce<float> op;
     op.Init(x, z, tiling_data.totalLength, tiling_data.outLength);
-    if (TILING_KEY_IS(REDUCE_TILING_0)) {
-        op.Process1();
-    } else if (TILING_KEY_IS(REDUCE_TILING_1)) {
-        op.Process2();
+    if (TILING_KEY_IS(REDUCE_TILING_1)) {
+        op.Process<REDUCE_TILING_1>();
     } else if (TILING_KEY_IS(REDUCE_TILING_2)) {
-        op.Process3();
+        op.Process<REDUCE_TILING_2>();
+    } else if (TILING_KEY_IS(REDUCE_TILING_3)) {
+        op.Process<REDUCE_TILING_3>();
+    } else if (TILING_KEY_IS(REDUCE_TILING_4)) {
+        op.Process<REDUCE_TILING_4>();
+    } else if (TILING_KEY_IS(REDUCE_TILING_5)) {
+        op.Process<REDUCE_TILING_5>();
     }
 }
 
-- 
Gitee


From 6f032b5e57709de1067c07f1af259119659cd941 Mon Sep 17 00:00:00 2001
From: hehongan <hehongan@h-partners.com>
Date: Thu, 10 Jul 2025 11:58:32 +0000
Subject: [PATCH 35/46] =?UTF-8?q?!2713=20matmul=5Fleakyrelu=5Fcustom?=
 =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=86=E6=B3=A8=E9=87=8A=20ASCEND310P=20?=
 =?UTF-8?q?=E5=B9=B6=E5=88=A0=E9=99=A4=E4=BA=86=E5=86=97=E4=BD=99=E5=8F=A5?=
 =?UTF-8?q?=E5=8F=B7=20Merge=20pull=20request=20!2713=20from=20hehongan/ma?=
 =?UTF-8?q?ster?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp       | 2 +-
 .../MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp
index 3b78451b5..a65b6e230 100644
--- a/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp
+++ b/operator/ascendc/0_introduction/13_matmulleakyrelu_kernellaunch/MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp
@@ -102,7 +102,7 @@ __aicore__ inline void MatmulLeakyKernel<aType, bType, cType, biasType>::Process
     uint32_t computeRound = 0;
 
 #ifdef CUSTOM_ASCEND310P
-    // Set temp UB space when on SCEND310P .
+    // Set temp UB space when on ASCEND310P
     AscendC::TBuf<> tmpMMFormatUb;
     AscendC::LocalTensor<uint8_t> mmformatUb;
     pipe->InitBuffer(tmpMMFormatUb, tiling.baseM * tiling.baseN * sizeof(cType));
diff --git a/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp b/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp
index 4a291ae6f..62f9f3668 100644
--- a/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp
+++ b/operator/ascendc/tutorials/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/matmul_leakyrelu_custom.cpp
@@ -102,7 +102,7 @@ __aicore__ inline void MatmulLeakyKernel<aType, bType, cType, biasType>::Process
     uint32_t computeRound = 0;
 
 #ifdef CUSTOM_ASCEND310P
-    // Set temp UB space when on SCEND310P .
+    // Set temp UB space when on ASCEND310P
     AscendC::TBuf<> tmpMMFormatUb;
     AscendC::LocalTensor<uint8_t> mmformatUb;
     pipe->InitBuffer(tmpMMFormatUb, tiling.baseM * tiling.baseN * sizeof(cType));
-- 
Gitee


From cfded7e064ea46fe417fdc4c72d8b1149cb530f8 Mon Sep 17 00:00:00 2001
From: zhanghao0689 <zhanghao152@huawei.com>
Date: Mon, 14 Jul 2025 06:09:30 +0000
Subject: [PATCH 36/46] !2706 add gm conflict case Merge pull request !2706
 from zhanghao0689/master

---
 .../AclNNInvocation/README.md                 |  75 +++
 .../AclNNInvocation/inc/common.h              |  45 ++
 .../AclNNInvocation/inc/op_runner.h           | 188 +++++++
 .../AclNNInvocation/inc/operator_desc.h       |  57 +++
 .../AclNNInvocation/run.sh                    |  78 +++
 .../AclNNInvocation/scripts/acl.json          |   1 +
 .../AclNNInvocation/scripts/gen_data.py       |  23 +
 .../AclNNInvocation/scripts/verify_result.py  |  53 ++
 .../AclNNInvocation/src/CMakeLists.txt        |  65 +++
 .../AclNNInvocation/src/common.cpp            |  80 +++
 .../AclNNInvocation/src/main.cpp              | 163 ++++++
 .../AclNNInvocation/src/op_runner.cpp         | 462 ++++++++++++++++++
 .../AclNNInvocation/src/operator_desc.cpp     |  51 ++
 .../15_mata_address_conflict/AddsCustom.json  |  37 ++
 .../AddsCustom/op_host/adds_custom.cpp        |  56 +++
 .../AddsCustom/op_kernel/adds_custom.cpp      |  33 ++
 .../AddsCustom/op_kernel/adds_custom_tiling.h |  22 +
 .../AddsCustom/op_kernel/adds_custom_v1.h     |  88 ++++
 .../AddsCustom/op_kernel/adds_custom_v2.h     |  94 ++++
 .../AddsCustom/op_kernel/adds_custom_v3.h     |  89 ++++
 .../15_mata_address_conflict/README.md        | 164 ++++++-
 .../15_mata_address_conflict/install.sh       |  58 +++
 operator/ascendc/4_best_practices/README.md   |   1 +
 23 files changed, 1982 insertions(+), 1 deletion(-)
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h
 create mode 100755 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h
 create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h
 create mode 100755 operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh

diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md
new file mode 100644
index 000000000..5c1ffb4d2
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md
@@ -0,0 +1,75 @@
+## 目录结构介绍
+
+```
+├── AclNNInvocation             //通过单算子API调用的方式调用AddsCustom算子
+│   ├── inc                     // 头文件目录
+│   │   ├── common.h            // 声明公共方法类，用于读取二进制文件
+│   │   ├── op_runner.h         // 算子运行相关信息声明文件，包含算子输入/输出个数，输入/输出大小等
+│   │   └── operator_desc.h     // 算子描述声明文件，包含算子输入/输出，算子类型以及输入描述与输出描述
+│   ├── input                   // 存放脚本生成的输入数据目录
+│   ├── scripts
+│   │   ├── acl.json            // acl配置文件
+│   │   ├── gen_data.py         // 输入数据和真值数据生成脚本
+│   │   └── verify_result.py    // 精度校验脚本
+│   ├── src
+│   │   ├── CMakeLists.txt      // 编译规则文件
+│   │   ├── common.cpp          // 公共函数，读取二进制文件函数的实现文件
+│   │   ├── main.cpp            // 单算子调用应用的入口
+│   │   ├── op_runner.cpp       // 单算子调用主体流程实现文件
+│   │   └── operator_desc.cpp   // 构造算子的输入与输出描述
+│   └── run.sh                  // 执行命令脚本
+```
+
+## 代码实现介绍
+
+完成自定义算子的开发部署后，可以通过单算子调用的方式来验证单算子的功能。src/main.cpp代码为单算子API执行方式。单算子API执行是基于C语言的API执行算子，无需提供单算子描述文件进行离线模型的转换，直接调用单算子API接口。
+
+自定义算子编译部署后，会自动生成单算子API，可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”，形如：
+
+```cpp
+ // 获取算子使用的workspace空间大小
+ aclnnStatus aclnnAddsCustomGetWorkspaceSize(
+     const aclTensor *x,
+     int64_t caseId,
+     const aclTensor *out,
+     uint64_t *workspaceSize,
+     aclOpExecutor **executor);
+ // 执行算子
+ aclnnStatus aclnnAddsCustom(
+     void *workspace,
+     uint64_t workspaceSize,
+     aclOpExecutor *executor,
+     aclrtStream stream);
+```
+
+其中aclnnAddsCustomGetWorkspaceSize为第一段接口，主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后，按照workspaceSize大小申请Device侧内存，然后调用第二段接口aclnnAddsCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。
+
+## 运行样例算子
+
+### 1. 编译算子工程
+
+运行此样例前，请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。
+
+### 2. 单算子API调用样例运行
+
+- 进入到样例目录
+
+  以命令行方式下载样例代码，master分支为例。
+
+  ```bash
+  cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation
+  ```
+- 样例执行
+
+  样例执行过程中会自动生成测试数据，然后编译与运行单算子API调用样例，最后检验运行结果。具体过程可参见run.sh脚本。
+
+  ```bash
+  bash run.sh
+  ```
+
+## 更新说明
+
+
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/07/03 | 新增本readme |
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h
new file mode 100644
index 000000000..fadb5c808
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h
@@ -0,0 +1,45 @@
+/**
+ * @file common.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <cstdio>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+#define SUCCESS 0
+#define FAILED 1
+
+#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO]  " fmt "\n", ##args)
+#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN]  " fmt "\n", ##args)
+#define ERROR_LOG(fmt, args...) fprintf(stderr, "[ERROR]  " fmt "\n", ##args)
+
+/**
+ * @brief Read data from file
+ * @param [in] filePath: file path
+ * @param [in] fileSize: file size
+ * @return read result
+ */
+bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize);
+
+/**
+ * @brief Write data to file
+ * @param [in] filePath: file path
+ * @param [in] buffer: data to write to file
+ * @param [in] size: size to write
+ * @return write result
+ */
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size);
+
+#endif // COMMON_H
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h
new file mode 100644
index 000000000..7b98d5730
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h
@@ -0,0 +1,188 @@
+/**
+ * @file op_runner.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef OP_RUNNER_H
+#define OP_RUNNER_H
+
+#include "acl/acl.h"
+#include "aclnn/acl_meta.h"
+#include "common.h"
+#include "operator_desc.h"
+
+/**
+ * Op Runner
+ */
+class OpRunner {
+public:
+    /**
+     * @brief Constructor
+     * @param [in] opDesc: op description
+     */
+    explicit OpRunner(OperatorDesc *opDesc);
+
+    /**
+     * @brief Destructor
+     */
+    virtual ~OpRunner();
+
+    /**
+     * @brief Init op runner
+     */
+    bool Init();
+
+    /**
+     * @brief Get number of inputs
+     * @return number of inputs
+     */
+    const size_t NumInputs();
+
+    /**
+     * @brief Get number of outputs
+     * @return number of outputs
+     */
+    const size_t NumOutputs();
+
+    /**
+     * @brief Get input size by index
+     * @param [in] index: input index
+     * @return size of the input
+     */
+    const size_t GetInputSize(size_t index) const;
+    const size_t GetInputNumDims(size_t index) const;
+    aclDataType GetInputDataType(size_t index) const;
+    aclFormat GetInputFormat(size_t index) const;
+
+    /**
+     * @brief Get output size by index
+     * @param [in] index: output index
+     * @return size of the output
+     */
+    size_t GetOutputSize(size_t index) const;
+    const size_t GetOutputNumDims(size_t index) const;
+    aclDataType GetOutputDataType(size_t index) const;
+    aclFormat GetOutputFormat(size_t index) const;
+
+    /**
+     * @brief Get input element count by index
+     * @param [in] index: input index
+     * @return element count of the input
+     */
+    size_t GetInputElementCount(size_t index) const;
+
+    /**
+     * @brief Get output element count by index
+     * @param [in] index: output index
+     * @return element count of the output
+     */
+    size_t GetOutputElementCount(size_t index) const;
+
+    /**
+     * @brief Get input shape by index
+     * @param [in] index: input index
+     * @return shape of the input
+     */
+    std::vector<int64_t> GetInputShape(size_t index) const;
+
+    /**
+     * @brief Get output shape by index
+     * @param [in] index: output index
+     * @return shape of the output
+     */
+    std::vector<int64_t> GetOutputShape(size_t index) const;
+
+    /**
+     * @brief Get input buffer(host memory) by index
+     * @tparam T: data type
+     * @param [in] index: input index
+     * @return host address of the input
+     */
+    template <typename T> T *GetInputBuffer(size_t index)
+    {
+        if (index >= numInputs_) {
+            ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+            return nullptr;
+        }
+        return reinterpret_cast<T *>(hostInputs_[index]);
+    }
+
+    /**
+     * @brief Get output buffer(host memory) by index
+     * @tparam T: data type
+     * @param [in] index: output index
+     * @return host address of the output
+     */
+    template <typename T> const T *GetOutputBuffer(size_t index)
+    {
+        if (index >= numOutputs_) {
+            ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+            return nullptr;
+        }
+
+        return reinterpret_cast<T *>(hostOutputs_[index]);
+    }
+
+    /**
+     * @brief Print readable input by index
+     * @param [in] index: input index
+     * @param [in] elementsPerRow: number of elements per row
+     */
+    void PrintInput(size_t index, size_t elementsPerRow = 16);
+
+    /**
+     * @brief Print readable output by index
+     * @param [in] index: output index
+     * @param [in] elementsPerRow: number of elements per row
+     */
+    void PrintOutput(size_t index, size_t elementsPerRow = 16);
+
+    /**
+     * @brief Compile static op
+     * @return compile result
+     */
+    bool CompileStaticOp();
+
+    /**
+     * @brief Compile dynamic op
+     * @return compile result
+     */
+    bool CompileDynamicOp();
+
+    /**
+     * @brief Run op
+     * @return run result
+     */
+    bool RunOp(int64_t caseId);
+
+    /**
+     * @brief Get case index
+     * @return case index by user input
+     */
+    int64_t GetCaseId();
+
+private:
+    size_t numInputs_;
+    size_t numOutputs_;
+    void *workspace_;
+    int64_t caseId_;
+
+    std::vector<aclDataBuffer *> inputBuffers_;
+    std::vector<aclDataBuffer *> outputBuffers_;
+
+    std::vector<void *> devInputs_;
+    std::vector<void *> devOutputs_;
+
+    std::vector<void *> hostInputs_;
+    std::vector<void *> hostOutputs_;
+
+    std::vector<aclTensor *> inputTensor_;
+    std::vector<aclTensor *> outputTensor_;
+    OperatorDesc *opDesc_;
+};
+
+#endif // OP_RUNNER_H
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h
new file mode 100644
index 000000000..cf02d7cec
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h
@@ -0,0 +1,57 @@
+/**
+ * @file operator_desc.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef OPERATOR_DESC_H
+#define OPERATOR_DESC_H
+
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+/**
+ * Op description
+ */
+struct OperatorDesc {
+    /**
+     * Constructor
+     */
+    explicit OperatorDesc();
+
+    /**
+     * Destructor
+     */
+    virtual ~OperatorDesc();
+
+    /**
+     * Add an input tensor description
+     * @param [in] dataType: data type
+     * @param [in] numDims: number of dims
+     * @param [in] dims: dims
+     * @param [in] format: format
+     * @return OperatorDesc
+     */
+    OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);
+
+    /**
+     * Add an output tensor description
+     * @param [in] dataType: data type
+     * @param [in] numDims: number of dims
+     * @param [in] dims: dims
+     * @param [in] format: format
+     * @return OperatorDesc
+     */
+    OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);
+
+    std::string opType;
+    std::vector<aclTensorDesc *> inputDesc;
+    std::vector<aclTensorDesc *> outputDesc;
+};
+
+#endif // OPERATOR_DESC_H
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh
new file mode 100755
index 000000000..d5eac7c1d
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Resolve the directory holding this script so that later steps can cd back to it.
+CURRENT_DIR=$(
+    cd "$(dirname "${BASH_SOURCE:-$0}")"
+    pwd
+)
+
+# Toolkit root: ASCEND_INSTALL_PATH, then ASCEND_HOME_PATH, then the per-user install, then the system one.
+_ASCEND_INSTALL_PATH=${ASCEND_INSTALL_PATH:-$ASCEND_HOME_PATH}
+if [ -z "$_ASCEND_INSTALL_PATH" ]; then
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+# DDK_PATH and NPU_HOST_LIB are read by src/CMakeLists.txt; paths are quoted so that spaces are tolerated.
+source "$_ASCEND_INSTALL_PATH/bin/setenv.bash"
+export DDK_PATH=$_ASCEND_INSTALL_PATH
+export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib
+
+function main {
+    # 1. Remove leftover generated files and logs; work inside the script directory from the start
+    cd "$CURRENT_DIR" || return 1
+    rm -rf "$HOME"/ascend/log/*
+    rm -rf ./input && mkdir -p ./input
+    rm -rf ./output && mkdir -p ./output
+
+    # 2. Generate the input data and the golden (expected) data
+    python3 scripts/gen_data.py
+    if [ $? -ne 0 ]; then
+        echo "ERROR: generate input data failed!"
+        return 1
+    fi
+    echo "INFO: generate input data success!"
+
+    # 3. Build the executable
+    cd "$CURRENT_DIR"
+    rm -rf build
+    mkdir -p build
+    cd build
+    cmake ../src -DCMAKE_SKIP_RPATH=TRUE
+    if [ $? -ne 0 ]; then
+        echo "ERROR: cmake failed!"
+        return 1
+    fi
+    echo "INFO: cmake success!"
+    make
+    if [ $? -ne 0 ]; then
+        echo "ERROR: make failed!"
+        return 1
+    fi
+    echo "INFO: make success!"
+
+    # 4. Run the executable under msprof
+    export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH
+    export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+    cd "$CURRENT_DIR/output"
+    echo "INFO: execute op!"
+    msprof op --launch-count=3 --output=./prof ./execute_adds_op
+    if [ $? -ne 0 ]; then
+        echo "ERROR: acl executable run failed! please check your project!"
+        return 1
+    fi
+    echo "INFO: acl executable run success!"
+
+    # 5. Compare the output of every case with the golden data; && makes any failed comparison count
+    cd "$CURRENT_DIR"
+    python3 scripts/verify_result.py output/output_z_1.bin output/golden.bin &&
+        python3 scripts/verify_result.py output/output_z_2.bin output/golden.bin &&
+        python3 scripts/verify_result.py output/output_z_3.bin output/golden.bin
+    if [ $? -ne 0 ]; then
+        echo "ERROR: verify result failed!"
+        return 1
+    fi
+}
+
+main
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json
new file mode 100644
index 000000000..9e26dfeeb
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py
new file mode 100644
index 000000000..9c4ecbe6e
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import numpy as np
+
+
+def gen_golden_data_simple():
+    input_x = np.random.uniform(1, 100, [8192, 128]).astype(np.float32)
+    golden = (input_x + 2.0).astype(np.float32)
+
+    input_x.tofile("./input/input_x.bin")
+    golden.tofile("./output/golden.bin")
+
+
+if __name__ == "__main__":
+    gen_golden_data_simple()
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py
new file mode 100644
index 000000000..a5019f30f
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import sys
+import numpy as np
+
+# for float32
+relative_tol = 1e-4
+absolute_tol = 1e-5
+error_tol = 1e-4
+
+
+def verify_result(output, golden):
+    output = np.fromfile(output, dtype=np.float32).reshape(-1)
+    golden = np.fromfile(golden, dtype=np.float32).reshape(-1)
+    different_element_results = np.isclose(output,
+                                           golden,
+                                           rtol=relative_tol,
+                                           atol=absolute_tol,
+                                           equal_nan=True)
+    different_element_indexes = np.where(different_element_results == False)[0]
+    for index in range(len(different_element_indexes)):
+        real_index = different_element_indexes[index]
+        golden_data = golden[real_index]
+        output_data = output[real_index]
+        print(
+            "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" %
+            (real_index, golden_data, output_data,
+             abs(output_data - golden_data) / golden_data))
+        if index == 100:
+            break
+    error_ratio = float(different_element_indexes.size) / golden.size
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
+    return error_ratio <= error_tol
+
+
+if __name__ == '__main__':
+    try:
+        res = verify_result(sys.argv[1], sys.argv[2])
+        if not res:
+            raise ValueError("[ERROR] result error")
+        else:
+            print("test pass")
+    except Exception as e:
+        print(e)
+        sys.exit(1)
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt
new file mode 100644
index 000000000..8d0ae1bd3
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt
@@ -0,0 +1,65 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+
+# Minimum CMake version required to configure this project
+cmake_minimum_required(VERSION 3.5.1)
+
+# Project name
+project(acl_execute_adds)
+
+# Compile options: everything is built with -std=c++11
+add_compile_options(-std=c++11)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output")  # the execute_adds_op binary is placed here
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output")
+
+set(INC_PATH $ENV{DDK_PATH})  # toolkit root from the environment; a default is applied below when unset
+
+if (NOT DEFINED ENV{DDK_PATH})
+    set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest")
+    message(STATUS "set default INC_PATH: ${INC_PATH}")
+else ()
+    message(STATUS "env INC_PATH: ${INC_PATH}")
+endif()
+
+set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api")  # headers and library of the custom op API
+
+set(LIB_PATH $ENV{NPU_HOST_LIB})  # host library directory from the environment; default applied below
+
+# Default host library directory: <toolkit>/<processor>-<lower-case system name>/devlib
+if (NOT DEFINED ENV{NPU_HOST_LIB})
+    string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER)
+    set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib")
+    message(STATUS "set default LIB_PATH: ${LIB_PATH}")
+else ()
+    message(STATUS "env LIB_PATH: ${LIB_PATH}")
+endif()
+
+# Header search paths: sample headers, toolkit headers, custom op API headers
+include_directories(
+    ../inc
+    ${INC_PATH}/include
+    ${CUST_PKG_PATH}/include
+)
+
+# Library search paths: host libraries and the custom op API library
+link_directories(
+    ${LIB_PATH}
+    ${CUST_PKG_PATH}/lib
+)
+
+add_executable(execute_adds_op
+    operator_desc.cpp
+    op_runner.cpp
+    main.cpp
+    common.cpp
+)
+
+target_link_libraries(execute_adds_op
+    ascendcl
+    cust_opapi
+    acl_op_compiler
+    nnopbase
+    stdc++
+)
+
+install(TARGETS execute_adds_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp
new file mode 100644
index 000000000..d58716122
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp
@@ -0,0 +1,80 @@
+/**
+ * @file common.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "common.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <fstream>
+
+extern bool g_isDevice;
+
+// Read the whole regular file filePath into buffer (capacity bufferSize bytes); false on any failure.
+// NOTE: fileSize is taken by value, so the size stored into it below never reaches the caller.
+bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize)
+{
+    struct stat sBuf;
+    if (stat(filePath.c_str(), &sBuf) == -1) {
+        ERROR_LOG("failed to get file %s", filePath.c_str());
+        return false;
+    }
+    if (S_ISREG(sBuf.st_mode) == 0) {
+        ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+        return false;
+    }
+
+    std::ifstream file(filePath, std::ios::binary);  // closed automatically on every return path
+    if (!file.is_open()) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    std::filebuf *buf = file.rdbuf();
+    size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in);
+    if (size == 0) {
+        ERROR_LOG("file size is 0");
+        return false;
+    }
+    if (size > bufferSize) {
+        ERROR_LOG("file size is larger than buffer size");
+        return false;
+    }
+    buf->pubseekpos(0, std::ios::in);
+    if (static_cast<size_t>(buf->sgetn(static_cast<char *>(buffer), size)) != size) {  // short read
+        ERROR_LOG("Read file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    fileSize = size;
+    return true;
+}
+
+// Write size bytes from buffer to filePath (created or truncated); returns false on any failure.
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Write file failed. buffer is nullptr");
+        return false;
+    }
+    const int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE);
+    if (fd < 0) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    // A failed write() (-1 converted to size_t) and a short write are both expected to differ from size.
+    const size_t written = write(fd, buffer, size);
+    (void)close(fd);
+    if (written == size) {
+        return true;
+    }
+    ERROR_LOG("Write file Failed.");
+    return false;
+}
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp
new file mode 100644
index 000000000..b70950642
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp
@@ -0,0 +1,163 @@
+/**
+ * @file main.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <cstdint>
+#include <iostream>
+#include "acl/acl.h"
+#include "common.h"
+#include "op_runner.h"
+
+bool g_isDevice = false;  // set by InitResource(): true when aclrtGetRunMode reports ACL_DEVICE
+int deviceId = 0;  // device id passed to aclrtSetDevice and aclrtResetDevice
+
+// Build the operator description used by every case: one input and one output tensor,
+// both ACL_FLOAT in ACL_FORMAT_ND with shape [8192, 128].
+OperatorDesc CreateOpDesc()
+{
+    const std::vector<int64_t> dims{8192, 128};
+    OperatorDesc desc;
+    // The Add* calls return the description itself, so they can be chained (input first, then output).
+    desc.AddInputTensorDesc(ACL_FLOAT, dims.size(), dims.data(), ACL_FORMAT_ND)
+        .AddOutputTensorDesc(ACL_FLOAT, dims.size(), dims.data(), ACL_FORMAT_ND);
+    return desc;
+}
+
+bool SetInputData(OpRunner &runner)  // loads ../input/input_x.bin into the host buffer of input 0
+{
+    size_t fileSize = 0;  // ReadFile takes this by value; it is not used afterwards
+    bool ok = ReadFile("../input/input_x.bin", fileSize, runner.GetInputBuffer<void>(0), runner.GetInputSize(0));
+    if (ok) { INFO_LOG("Set input success"); }
+    return ok;  // false when the file could not be read (ReadFile logs the reason)
+}
+
+bool ProcessOutputData(OpRunner &runner)
+{
+    // One file per case: ../output/output_z_<caseId>.bin. A failed write is reported to the caller.
+    const std::string path = "../output/output_z_" + std::to_string(runner.GetCaseId()) + ".bin";
+    bool ok = WriteFile(path, runner.GetOutputBuffer<void>(0), runner.GetOutputSize(0));
+    if (ok) { INFO_LOG("Write output success"); }
+    return ok;
+}
+
+// Reset the device and finalize ACL. Both steps always run and each failure is logged;
+// "Reset Device success" is only printed when the reset really succeeded.
+void DestroyResource()
+{
+    const bool resetOk = (aclrtResetDevice(deviceId) == ACL_SUCCESS);
+    if (resetOk) {
+        INFO_LOG("Reset Device success");
+    } else {
+        ERROR_LOG("Reset device %d failed", deviceId);
+    }
+    const bool finalizeOk = (aclFinalize() == ACL_SUCCESS);
+    if (!finalizeOk) { ERROR_LOG("Finalize acl failed"); }
+    if (resetOk && finalizeOk) {
+        INFO_LOG("Destroy resource success");
+    } else {
+        ERROR_LOG("Destroy resource failed");
+    }
+}
+
+// Initialise ACL, select deviceId and record in g_isDevice whether the app runs on the device.
+// Returns false on failure, after undoing the steps that had already succeeded.
+bool InitResource()
+{
+    // acl.json is dump or profiling config file
+    if (aclInit("../scripts/acl.json") != ACL_SUCCESS) {
+        ERROR_LOG("acl init failed");
+        return false;
+    }
+
+    if (aclrtSetDevice(deviceId) != ACL_SUCCESS) {
+        ERROR_LOG("Set device failed. deviceId is %d", deviceId);
+        (void)aclFinalize();
+        return false;
+    }
+    INFO_LOG("Set device[%d] success", deviceId);
+
+    // runMode is ACL_HOST which represents app is running in host
+    // runMode is ACL_DEVICE which represents app is running in device
+    aclrtRunMode runMode;
+    if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) {
+        ERROR_LOG("Get run mode failed");
+        DestroyResource();
+        return false;
+    }
+    g_isDevice = (runMode == ACL_DEVICE);
+    INFO_LOG("Get RunMode[%d] success", runMode);
+
+    return true;
+}
+
+/**
+ * Run one case from start to finish.
+ * Steps, in order: create the operator description, initialise an OpRunner on it, load the input
+ * file, execute the operator and write the output file. The first failing step is logged and
+ * ends the run.
+ * @param [in] caseId: case identifier forwarded to OpRunner::RunOp
+ * @return true when every step succeeded
+ */
+bool RunOp(int64_t caseId)
+{
+    // desc is declared before runner, so it is still alive when runner is destroyed
+    OperatorDesc desc = CreateOpDesc();
+    OpRunner runner(&desc);
+
+    // The chain stops at the first step that returns false and remembers its message.
+    const char *failure = nullptr;
+    if (!runner.Init()) {
+        failure = "Init OpRunner failed";
+    } else if (!SetInputData(runner)) {
+        failure = "Set input data failed";
+    } else if (!runner.RunOp(caseId)) {
+        failure = "Run op failed";
+    } else if (!ProcessOutputData(runner)) {
+        failure = "Process output data failed";
+    }
+    if (failure != nullptr) {
+        ERROR_LOG("%s", failure);
+        return false;
+    }
+
+    INFO_LOG("Run op success");
+    return true;
+}
+
+/**
+ * Program entry: initialise ACL, run cases 1, 2 and 3 in order, then release the resources.
+ * A failing case stops the sequence; the resources are still released before returning.
+ * @param [in] argc: unused
+ * @param [in] argv: unused
+ * @return SUCCESS when all cases passed, FAILED otherwise
+ */
+int main(int argc, char **argv)
+{
+    if (!InitResource()) {
+        ERROR_LOG("Init resource failed");
+        return FAILED;
+    }
+    INFO_LOG("Init resource success");
+
+    int status = SUCCESS;
+    const int64_t lastCase = 3;
+    for (int64_t caseId = 1; caseId <= lastCase; ++caseId) {
+        if (!RunOp(caseId)) {
+            status = FAILED;
+            break;
+        }
+    }
+
+    // Single clean-up point shared by the success path and the failure path.
+    DestroyResource();
+
+    return status;
+}
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp
new file mode 100644
index 000000000..d7bde46d6
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp
@@ -0,0 +1,462 @@
+/**
+ * @file op_runner.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "op_runner.h"
+
+#include <cassert>
+#include <limits>
+
+#include "acl/acl_op_compiler.h"
+#include "aclnn_adds_custom.h"
+#include "common.h"
+
+using namespace std;
+
+extern bool g_isDevice;
+
+// Remember the operator description (not owned) and cache how many inputs and outputs it describes.
+OpRunner::OpRunner(OperatorDesc *opDesc)
+    : opDesc_(opDesc), numInputs_(opDesc->inputDesc.size()), numOutputs_(opDesc->outputDesc.size()),
+      workspace_(nullptr)
+{
+}
+
+// Release everything acquired by Init() and RunOp(). Every container is walked by its own length:
+// when Init() fails part-way the containers hold fewer than numInputs_/numOutputs_ entries, and
+// indexing them up to those counts would read past their end.
+OpRunner::~OpRunner()
+{
+    if (workspace_ != nullptr) {
+        (void)aclrtFree(workspace_);
+    }
+    // Host-side buffers were taken from aclrtMalloc when g_isDevice is set, else from aclrtMallocHost.
+    auto freeHost = [](void *mem) {
+        if (g_isDevice) {
+            (void)aclrtFree(mem);
+        } else {
+            (void)aclrtFreeHost(mem);
+        }
+    };
+    // Tensors and data buffers are destroyed before the memory they refer to is freed.
+    for (auto *tensor : inputTensor_) { (void)aclDestroyTensor(tensor); }
+    for (auto *buffer : inputBuffers_) { (void)aclDestroyDataBuffer(buffer); }
+    for (auto *devMem : devInputs_) { (void)aclrtFree(devMem); }
+    for (auto *hostMem : hostInputs_) { freeHost(hostMem); }
+
+    for (auto *tensor : outputTensor_) { (void)aclDestroyTensor(tensor); }
+    for (auto *buffer : outputBuffers_) { (void)aclDestroyDataBuffer(buffer); }
+    for (auto *devMem : devOutputs_) { (void)aclrtFree(devMem); }
+    for (auto *hostMem : hostOutputs_) { freeHost(hostMem); }
+}
+
+/**
+ * Allocate the resources needed to run the operator. For every input and every output of the
+ * operator description this creates a device buffer with its data-buffer wrapper, a host-side
+ * staging buffer and an aclTensor bound to the device buffer.
+ * Host-side buffers come from aclrtMalloc when g_isDevice is set and from aclrtMallocHost otherwise.
+ * @return true on success; false, after logging, as soon as one step fails. Whatever was acquired
+ *         before the failure stays in the member containers.
+ */
+bool OpRunner::Init()
+{
+    // Inputs
+    for (size_t i = 0; i < numInputs_; ++i) {
+        auto size = GetInputSize(i);
+        // Device buffer for this input
+        void *devMem = nullptr;
+        if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
+            ERROR_LOG("Malloc device memory for input[%zu] failed", i);
+            return false;
+        }
+        devInputs_.emplace_back(devMem);
+        inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size));
+
+        // Host-side staging buffer; a failed call and a null result are reported the same way
+        void *hostInput = nullptr;
+        auto hostRet = g_isDevice ? aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST)
+                                  : aclrtMallocHost(&hostInput, size);
+        if (hostRet != ACL_SUCCESS || hostInput == nullptr) {
+            ERROR_LOG("Malloc host memory for input[%zu] failed", i);
+            return false;
+        }
+        hostInputs_.emplace_back(hostInput);
+
+        // The same shape is passed for both shape arguments; the tensor refers to the device buffer.
+        aclTensor *inputTensor =
+            aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), nullptr, 0,
+                            GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]);
+        if (inputTensor == nullptr) {
+            ERROR_LOG("Create Tensor for input[%zu] failed", i);
+            return false;
+        }
+        inputTensor_.emplace_back(inputTensor);
+    }
+
+    // Outputs
+    for (size_t i = 0; i < numOutputs_; ++i) {
+        auto size = GetOutputSize(i);
+        // Device buffer for this output
+        void *devMem = nullptr;
+        if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
+            ERROR_LOG("Malloc device memory for output[%zu] failed", i);
+            return false;
+        }
+        devOutputs_.emplace_back(devMem);
+        outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size));
+
+        // Host-side staging buffer; a failed call and a null result are reported the same way
+        void *hostOutput = nullptr;
+        auto hostRet = g_isDevice ? aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST)
+                                  : aclrtMallocHost(&hostOutput, size);
+        if (hostRet != ACL_SUCCESS || hostOutput == nullptr) {
+            ERROR_LOG("Malloc host memory for output[%zu] failed", i);
+            return false;
+        }
+        hostOutputs_.emplace_back(hostOutput);
+
+        // The same shape is passed for both shape arguments; the tensor refers to the device buffer.
+        aclTensor *outputTensor =
+            aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), nullptr, 0,
+                            GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]);
+        if (outputTensor == nullptr) {
+            ERROR_LOG("Create Tensor for output[%zu] failed", i);
+            return false;
+        }
+        outputTensor_.emplace_back(outputTensor);
+    }
+
+    return true;
+}
+
+const size_t OpRunner::NumInputs()  // number of inputs, as cached by the constructor
+{
+    return numInputs_;
+}
+
+const size_t OpRunner::NumOutputs()  // number of outputs, as cached by the constructor
+{
+    return numOutputs_;
+}
+
+// Size of input `index` as reported by aclGetTensorDescSize; logs and returns 0 for a bad index.
+const size_t OpRunner::GetInputSize(size_t index) const
+{
+    if (index < numInputs_) {
+        return aclGetTensorDescSize(opDesc_->inputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return 0;
+}
+
+// Number of dimensions of input `index` from its tensor description; logs and returns 0 for a bad index.
+const size_t OpRunner::GetInputNumDims(size_t index) const
+{
+    if (index < numInputs_) {
+        return aclGetTensorDescNumDims(opDesc_->inputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return 0;
+}
+
+// Data type of input `index` from its tensor description; logs and returns ACL_DT_UNDEFINED for a bad index.
+aclDataType OpRunner::GetInputDataType(size_t index) const
+{
+    if (index < numInputs_) {
+        return aclGetTensorDescType(opDesc_->inputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return ACL_DT_UNDEFINED;
+}
+
+// Format of input `index` from its tensor description; logs and returns ACL_FORMAT_UNDEFINED for a bad index.
+aclFormat OpRunner::GetInputFormat(size_t index) const
+{
+    if (index < numInputs_) {
+        return aclGetTensorDescFormat(opDesc_->inputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return ACL_FORMAT_UNDEFINED;
+}
+
+// Dimensions of input `index` read from its tensor description.
+// Returns an empty vector (after logging) when index is out of range or a dimension cannot be read.
+std::vector<int64_t> OpRunner::GetInputShape(size_t index) const
+{
+    std::vector<int64_t> shape;
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return shape;
+    }
+    auto desc = opDesc_->inputDesc[index];
+    const size_t numDims = aclGetTensorDescNumDims(desc);  // queried once, not on every iteration
+    for (size_t i = 0; i < numDims; ++i) {
+        int64_t dimSize = 0;
+        if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) {
+            ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i);
+            return std::vector<int64_t>();
+        }
+        shape.emplace_back(dimSize);
+    }
+    return shape;
+}
+
+// Size of output `index` as reported by aclGetTensorDescSize; logs and returns 0 for a bad index.
+size_t OpRunner::GetOutputSize(size_t index) const
+{
+    if (index < numOutputs_) {
+        return aclGetTensorDescSize(opDesc_->outputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return 0;
+}
+
+// Number of dimensions of output `index` from its tensor description; logs and returns 0 for a bad index.
+const size_t OpRunner::GetOutputNumDims(size_t index) const
+{
+    if (index < numOutputs_) {
+        return aclGetTensorDescNumDims(opDesc_->outputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return 0;
+}
+
+// Data type of output `index` from its tensor description; logs and returns ACL_DT_UNDEFINED for a bad index.
+aclDataType OpRunner::GetOutputDataType(size_t index) const
+{
+    if (index < numOutputs_) {
+        return aclGetTensorDescType(opDesc_->outputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return ACL_DT_UNDEFINED;
+}
+
+// Format of output `index` from its tensor description; logs and returns ACL_FORMAT_UNDEFINED for a bad index.
+aclFormat OpRunner::GetOutputFormat(size_t index) const
+{
+    if (index < numOutputs_) {
+        return aclGetTensorDescFormat(opDesc_->outputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return ACL_FORMAT_UNDEFINED;
+}
+
+// Dimensions of output `index`; an empty vector (after logging) on a bad index or an unreadable dimension.
+std::vector<int64_t> OpRunner::GetOutputShape(size_t index) const
+{
+    std::vector<int64_t> shape;
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return shape;
+    }
+    auto desc = opDesc_->outputDesc[index];
+    const size_t numDims = aclGetTensorDescNumDims(desc);  // queried once, not on every iteration
+    for (size_t i = 0; i < numDims; ++i) {
+        int64_t dimSize = 0;
+        if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) {
+            ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i);
+            return std::vector<int64_t>();
+        }
+        shape.emplace_back(dimSize);
+    }
+    return shape;
+}
+
+// Element count of input `index` from its tensor description; logs and returns 0 for a bad index.
+size_t OpRunner::GetInputElementCount(size_t index) const
+{
+    if (index < opDesc_->inputDesc.size()) {
+        return aclGetTensorDescElementCount(opDesc_->inputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return 0;
+}
+
+// Element count of output `index` from its tensor description; logs and returns 0 for a bad index.
+size_t OpRunner::GetOutputElementCount(size_t index) const
+{
+    if (index < opDesc_->outputDesc.size()) {
+        return aclGetTensorDescElementCount(opDesc_->outputDesc[index]);
+    }
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return 0;
+}
+
+// Run the operator once: copy the inputs to the device, call aclnnAddsCustomGetWorkspaceSize and
+// aclnnAddsCustom on a temporary stream, wait for the stream and copy the outputs back to the host.
+// caseId is stored for GetCaseId() and passed to the operator. Returns false after logging a failed step.
+bool OpRunner::RunOp(int64_t caseId)
+{
+    caseId_ = caseId;
+    const aclrtMemcpyKind toDevice = g_isDevice ? ACL_MEMCPY_DEVICE_TO_DEVICE : ACL_MEMCPY_HOST_TO_DEVICE;
+    for (size_t i = 0; i < numInputs_; ++i) {
+        auto size = GetInputSize(i);
+        if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, toDevice) != ACL_SUCCESS) {
+            ERROR_LOG("Copy input[%zu] failed", i);
+            return false;
+        }
+        INFO_LOG("Copy input[%zu] success", i);
+    }
+
+    aclrtStream stream = nullptr;
+    if (aclrtCreateStream(&stream) != ACL_SUCCESS) {
+        ERROR_LOG("Create stream failed");
+        return false;
+    }
+    INFO_LOG("Create stream success");
+
+    size_t workspaceSize = 0;
+    aclOpExecutor *handle = nullptr;
+    auto ret = aclnnAddsCustomGetWorkspaceSize(inputTensor_[0], caseId, outputTensor_[0], &workspaceSize, &handle);
+    if (ret != ACL_SUCCESS) {
+        (void)aclrtDestroyStream(stream);
+        ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast<int32_t>(ret));
+        return false;
+    }
+    INFO_LOG("Execute aclnnAddsCustomGetWorkspaceSize success, workspace size %zu", workspaceSize);
+
+    if (workspaceSize != 0) {
+        // Without the requested workspace the operator must not be launched, so a failure ends the run.
+        if (aclrtMalloc(&workspace_, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
+            ERROR_LOG("Malloc device memory failed");
+            (void)aclrtDestroyStream(stream);
+            return false;
+        }
+    }
+
+    ret = aclnnAddsCustom(workspace_, workspaceSize, handle, stream);
+    if (ret != ACL_SUCCESS) {
+        (void)aclrtDestroyStream(stream);
+        ERROR_LOG("Execute Operator failed. error code is %d", static_cast<int32_t>(ret));
+        return false;
+    }
+    INFO_LOG("Execute aclnnAddsCustom success");
+
+    // The unit of 5000 is ms.
+    ret = aclrtSynchronizeStreamWithTimeout(stream, 5000);
+    if (ret != ACL_SUCCESS) {
+        ERROR_LOG("Synchronize stream failed. error code is %d", static_cast<int32_t>(ret));
+        (void)aclrtDestroyStream(stream);
+        return false;
+    }
+    INFO_LOG("Synchronize stream success");
+
+    const aclrtMemcpyKind toHost = g_isDevice ? ACL_MEMCPY_DEVICE_TO_DEVICE : ACL_MEMCPY_DEVICE_TO_HOST;
+    for (size_t i = 0; i < numOutputs_; ++i) {
+        auto size = GetOutputSize(i);
+        if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, toHost) != ACL_SUCCESS) {
+            ERROR_LOG("Copy output[%zu] failed", i);
+            (void)aclrtDestroyStream(stream);
+            return false;
+        }
+        INFO_LOG("Copy output[%zu] success", i);
+    }
+
+    (void)aclrtDestroyStream(stream);
+    return true;
+}
+
+int64_t OpRunner::GetCaseId()  // case id stored by the most recent RunOp() call
+{
+    return caseId_;
+}
+
+// Print count elements of data to std::cout, each in a field of width 10, elementsPerRow per line.
+// Values are promoted with unary plus so that (u)int8_t elements print as numbers, not characters.
+template <typename T> void DoPrintData(const T *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << +data[i];
+        if ((i + 1) % elementsPerRow == 0) { std::cout << std::endl; }
+    }
+}
+
+// Print count half-precision values, converted with aclFloat16ToFloat, in fields of width 10 with
+// precision 4, elementsPerRow values per line.
+void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]);
+        if ((i + 1) % elementsPerRow == 0) { std::cout << std::endl; }
+    }
+}
+
+/**
+ * Print a raw buffer as elements of the given ACL data type.
+ * @param [in] data: buffer to print; a null pointer is reported and nothing is printed
+ * @param [in] count: number of elements in data
+ * @param [in] dataType: element type selecting how data is interpreted
+ * @param [in] elementsPerRow: number of elements printed per output line
+ */
+void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow)
+{
+    if (data == nullptr) {
+        ERROR_LOG("Print data failed. data is nullptr");
+        return;
+    }
+
+    // Each case hands the buffer, viewed as the matching element type, to the printing helper.
+    switch (dataType) {
+        // boolean
+        case ACL_BOOL:
+            return DoPrintData(static_cast<const bool *>(data), count, elementsPerRow);
+        // signed and unsigned integers
+        case ACL_INT8:
+            return DoPrintData(static_cast<const int8_t *>(data), count, elementsPerRow);
+        case ACL_UINT8:
+            return DoPrintData(static_cast<const uint8_t *>(data), count, elementsPerRow);
+        case ACL_INT16:
+            return DoPrintData(static_cast<const int16_t *>(data), count, elementsPerRow);
+        case ACL_UINT16:
+            return DoPrintData(static_cast<const uint16_t *>(data), count, elementsPerRow);
+        case ACL_INT32:
+            return DoPrintData(static_cast<const int32_t *>(data), count, elementsPerRow);
+        case ACL_UINT32:
+            return DoPrintData(static_cast<const uint32_t *>(data), count, elementsPerRow);
+        case ACL_INT64:
+            return DoPrintData(static_cast<const int64_t *>(data), count, elementsPerRow);
+        case ACL_UINT64:
+            return DoPrintData(static_cast<const uint64_t *>(data), count, elementsPerRow);
+        // floating point
+        case ACL_FLOAT16:
+            return DoPrintFp16Data(static_cast<const aclFloat16 *>(data), count, elementsPerRow);
+        case ACL_FLOAT:
+            return DoPrintData(static_cast<const float *>(data), count, elementsPerRow);
+        case ACL_DOUBLE:
+            return DoPrintData(static_cast<const double *>(data), count, elementsPerRow);
+        default:
+            ERROR_LOG("Unsupported type: %d", dataType);
+            break;
+    }
+}
+
+// Print the host copy of input `index`, numElementsPerRow values per line; logs and returns on a bad index.
+void OpRunner::PrintInput(size_t index, size_t numElementsPerRow)
+{
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return;
+    }
+    // The element type comes from the input's tensor description.
+    const aclDataType type = aclGetTensorDescType(opDesc_->inputDesc[index]);
+    PrintData(hostInputs_[index], GetInputElementCount(index), type, numElementsPerRow);
+}
+
+// Print the host copy of output `index`, numElementsPerRow values per line; logs and returns on a bad index.
+void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow)
+{
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return;
+    }
+    // The element type comes from the output's tensor description.
+    const aclDataType type = aclGetTensorDescType(opDesc_->outputDesc[index]);
+    PrintData(hostOutputs_[index], GetOutputElementCount(index), type, numElementsPerRow);
+}
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp
new file mode 100644
index 000000000..90e0ac343
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp
@@ -0,0 +1,51 @@
+/**
+ * @file operator_desc.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "operator_desc.h"
+
+#include "common.h"
+
+using namespace std;
+
+OperatorDesc::OperatorDesc() {}  // nothing to set up: the string and vector members start out empty
+
+// Destroy every tensor description that was added, inputs first and then outputs.
+OperatorDesc::~OperatorDesc()
+{
+    for (size_t i = 0; i < inputDesc.size(); ++i) {
+        aclDestroyTensorDesc(inputDesc[i]);
+    }
+    for (size_t i = 0; i < outputDesc.size(); ++i) {
+        aclDestroyTensorDesc(outputDesc[i]);
+    }
+}
+
+// Create an ACL tensor description and append it to inputDesc; on failure log and leave inputDesc unchanged.
+OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format)
+{
+    aclTensorDesc *created = aclCreateTensorDesc(dataType, numDims, dims, format);
+    if (created != nullptr) {
+        inputDesc.emplace_back(created);
+    } else {
+        ERROR_LOG("create tensor failed");
+    }
+    return *this;
+}
+
+// Create an ACL tensor description and append it to outputDesc; on failure log and leave outputDesc unchanged.
+OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims,
+                                                aclFormat format)
+{
+    aclTensorDesc *created = aclCreateTensorDesc(dataType, numDims, dims, format);
+    if (created != nullptr) {
+        outputDesc.emplace_back(created);
+    } else {
+        ERROR_LOG("create tensor failed");
+    }
+    return *this;
+}
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json
new file mode 100644
index 000000000..a54432512
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json
@@ -0,0 +1,37 @@
+[
+    {
+        "op": "AddsCustom",
+        "language": "cpp",
+        "input_desc": [
+            {
+                "name": "x",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float"
+                ]
+            }
+        ],
+        "output_desc": [
+            {
+                "name": "z",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float"
+                ]
+            }
+        ],
+        "attr": [
+            {
+                "name": "case_id",
+                "type": "int",
+                "value": 1
+            }
+        ]
+    }
+]
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp
new file mode 100644
index 000000000..6c91c15b5
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp
@@ -0,0 +1,56 @@
+/**
+ * @file adds_custom.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "../op_kernel/adds_custom_tiling.h"
+#include "register/op_def_registry.h"
+
+namespace optiling {
+static ge::graphStatus TilingFunc(gert::TilingContext *context)
+{
+    // Fixed problem size of this sample: an M x N float matrix processed in TILE_M x TILE_N tiles.
+    constexpr uint32_t BLOCK_DIM = 16;
+    constexpr uint32_t M = 8192;
+    constexpr uint32_t N = 128;
+    constexpr uint32_t TILE_M = 512;
+    constexpr uint32_t TILE_N = 8;
+    constexpr uint32_t LOOP_ONE_CORE = M / TILE_M;
+
+    // Fetch every pointer that may come back null before writing anything, so a missing
+    // attribute or buffer is reported as GRAPH_FAILED instead of crashing the host process.
+    auto attrs = context->GetAttrs();
+    const int64_t *caseId = (attrs != nullptr) ? attrs->GetInt(0) : nullptr;
+    AddsCustomTilingData *tiling = context->GetTilingData<AddsCustomTilingData>();
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    if (caseId == nullptr || tiling == nullptr || currentWorkspace == nullptr) {
+        return ge::GRAPH_FAILED;
+    }
+    context->SetBlockDim(BLOCK_DIM);
+    context->SetTilingKey(*caseId); // attribute 0 (case_id) is used directly as the tiling key
+    tiling->m = M;
+    tiling->n = N;
+    tiling->tileM = TILE_M;
+    tiling->tileN = TILE_N;
+    tiling->loopOneCore = LOOP_ONE_CORE;
+    currentWorkspace[0] = 0; // workspace size is set to zero
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+namespace ops {
+class AddsCustom : public OpDef { // Operator prototype: one required float ND input "x", one required float ND output "z".
+public:
+    explicit AddsCustom(const char *name) : OpDef(name)
+    {
+        this->Input("x").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND});
+        this->Output("z").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND});
+        this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b"); // tiling callback plus the "ascend910b" config
+        this->Attr("case_id").Int(1); // int attribute declared with value 1; TilingFunc reads attribute 0 as the tiling key
+    }
+};
+OP_ADD(AddsCustom);
+} // namespace ops
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp
new file mode 100644
index 000000000..8d0ad4cd9
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp
@@ -0,0 +1,33 @@
+/**
+ * @file adds_custom.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "kernel_operator.h"
+#include "adds_custom_v1.h"
+#include "adds_custom_v2.h"
+#include "adds_custom_v3.h"
+
+extern "C" __global__ __aicore__ void adds_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) // kernel entry; workspace is not used
+{
+    REGISTER_TILING_DEFAULT(AddsCustomTilingData);
+    GET_TILING_DATA(tilingData, tiling); // makes the tiling available as tilingData for the Init calls below
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0);
+    if (TILING_KEY_IS(1UL)) { // tiling key 1 -> KernelAddsV1
+        KernelAddsV1 op;
+        op.Init(x, z, &tilingData);
+        op.Process();
+    } else if (TILING_KEY_IS(2UL)) { // tiling key 2 -> KernelAddsV2
+        KernelAddsV2 op;
+        op.Init(x, z, &tilingData);
+        op.Process();
+    } else if (TILING_KEY_IS(3UL)) { // tiling key 3 -> KernelAddsV3; no branch handles any other key
+        KernelAddsV3 op;
+        op.Init(x, z, &tilingData);
+        op.Process();
+    }
+}
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h
new file mode 100644
index 000000000..8730ae528
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h
@@ -0,0 +1,22 @@
+/**
+ * @file adds_custom_tiling.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef ADDS_CUSTOM_TILING_H
+#define ADDS_CUSTOM_TILING_H
+#include <cstdint>
+
+class AddsCustomTilingData { // Tiling parameters written by the host tiling function and read by the kernels.
+public:
+    uint32_t m; // first dimension of the input (the host tiling sets 8192)
+    uint32_t n; // second dimension of the input (the host tiling sets 128); the kernels use it as the row pitch
+    uint32_t tileM; // tile extent along m
+    uint32_t tileN; // tile extent along n
+    uint32_t loopOneCore; // iterations each core runs in Process() (the host tiling sets m / tileM)
+};
+#endif // ADDS_CUSTOM_TILING_H
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h
new file mode 100644
index 000000000..70d86c001
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h
@@ -0,0 +1,88 @@
+/**
+ * @file adds_custom_v1.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef ADDS_CUSTOM_V1_H
+#define ADDS_CUSTOM_V1_H
+#include "kernel_operator.h"
+#include "adds_custom_tiling.h"
+
+using AscendC::TPosition;
+class KernelAddsV1 { // Baseline variant: column split, every core visits the row tiles in the same order 0, 1, 2, ...
+public:
+    __aicore__ inline KernelAddsV1() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData *tilingPtr) // binds GM buffers and allocates the queues
+    {
+        tiling = tilingPtr;
+        xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); // core k starts k * tileN elements into x
+        zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileN); // same element offset for the output
+        // A GM address conflict occurs when multiple cores access the same 512-byte address range.
+        // The L2 cache is disabled here so that the influence of the GM address conflict stands out.
+        xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE);
+        zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); // each buffer holds one tileM x tileN float tile
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float));
+    }
+    __aicore__ inline void Process() // runs loopOneCore iterations of CopyIn -> Compute -> CopyOut
+    {
+        for (int32_t i = 0; i < tiling->loopOneCore; i++) {
+            // The two SyncAll calls below are not actually required in this sample;
+            // they are only used to highlight the influence of the GM address conflict in every iteration.
+            AscendC::SyncAll();
+            CopyIn(i);
+            Compute();
+            AscendC::SyncAll();
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress) // loads tile `progress` from xGm into a local tensor
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+        AscendC::DataCopyParams params;
+        params.blockCount = tiling->tileM; // tileM blocks per copy
+        params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; // tileN floats expressed in BLOCK_SIZE-byte units
+        params.srcStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; // the other (n - tileN) floats, in BLOCK_SIZE-byte units
+        params.dstStride = 0;
+        AscendC::DataCopy(xLocal, xGm[progress * tiling->tileM * tiling->n], params); // source offset advances tileM * n elements per iteration
+        inQueueX.EnQue(xLocal);
+    }
+    __aicore__ inline void Compute() // adds the constant 2.0 to one tile
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
+        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
+        constexpr float scale = 2.0; // the scalar passed to Adds (an addend, despite the name)
+        AscendC::Adds(zLocal, xLocal, scale, tiling->tileM * tiling->tileN);
+        outQueueZ.EnQue<float>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress) // stores tile `progress` to zGm, mirroring CopyIn
+    {
+        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
+        AscendC::DataCopyParams params;
+        params.blockCount = tiling->tileM;
+        params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE;
+        params.srcStride = 0;
+        params.dstStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; // here the stride is on the GM (destination) side
+        AscendC::DataCopy(zGm[progress * tiling->tileM * tiling->n], zLocal, params);
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    static constexpr int32_t BUFFER_NUM = 2; // buffers per queue
+    static constexpr int32_t BLOCK_SIZE = 32; // byte unit used for the DataCopyParams lengths and strides
+
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<float> xGm;
+    AscendC::GlobalTensor<float> zGm;
+    AddsCustomTilingData *tiling; // not owned; points at the tiling data passed to Init
+};
+#endif // ADDS_CUSTOM_V1_H
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h
new file mode 100644
index 000000000..ae5314a90
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h
@@ -0,0 +1,94 @@
+/**
+ * @file adds_custom_v2.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef ADDS_CUSTOM_V2_H
+#define ADDS_CUSTOM_V2_H
+#include "kernel_operator.h"
+#include "adds_custom_tiling.h"
+
+using AscendC::TPosition;
+class KernelAddsV2 { // Same column split as the baseline, but each core starts at a different tile (rotated by its block index).
+public:
+    __aicore__ inline KernelAddsV2() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData *tilingPtr) // binds GM buffers and allocates the queues
+    {
+        tiling = tilingPtr;
+        xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); // core k starts k * tileN elements into x
+        zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileN); // same element offset for the output
+        // A GM address conflict occurs when multiple cores access the same 512-byte address range.
+        // The L2 cache is disabled here so that the influence of the GM address conflict stands out.
+        xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE);
+        zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); // each buffer holds one tileM x tileN float tile
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float));
+    }
+    __aicore__ inline void Process() // runs loopOneCore iterations of CopyIn -> Compute -> CopyOut in a rotated order
+    {
+        for (int32_t i = 0; i < tiling->loopOneCore; i++) {
+            // Adjust the loop order to avoid the GM address conflict (example with 16 cores and 16 iterations):
+            // the loop order of core0  : 0, 1, 2, 3, ..., 13, 14, 15
+            // the loop order of core1  : 1, 2, 3, 4, ..., 14, 15, 0
+            // ...
+            // the loop order of core15 : 15, 0, 1, 2, ..., 12, 13, 14
+            int32_t newProgress = (i + AscendC::GetBlockIdx()) % tiling->loopOneCore;
+            // The two SyncAll calls below are not actually required in this sample;
+            // they are only used to highlight the influence of the GM address conflict in every iteration.
+            AscendC::SyncAll();
+            CopyIn(newProgress);
+            Compute();
+            AscendC::SyncAll();
+            CopyOut(newProgress);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress) // loads tile `progress` from xGm into a local tensor
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+        AscendC::DataCopyParams params;
+        params.blockCount = tiling->tileM; // tileM blocks per copy
+        params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; // tileN floats expressed in BLOCK_SIZE-byte units
+        params.srcStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; // the other (n - tileN) floats, in BLOCK_SIZE-byte units
+        params.dstStride = 0;
+        AscendC::DataCopy(xLocal, xGm[progress * tiling->tileM * tiling->n], params); // source offset is progress * tileM * n elements
+        inQueueX.EnQue(xLocal);
+    }
+    __aicore__ inline void Compute() // adds the constant 2.0 to one tile
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
+        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
+        constexpr float scale = 2.0; // the scalar passed to Adds (an addend, despite the name)
+        AscendC::Adds(zLocal, xLocal, scale, tiling->tileM * tiling->tileN);
+        outQueueZ.EnQue<float>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress) // stores tile `progress` to zGm, mirroring CopyIn
+    {
+        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
+        AscendC::DataCopyParams params;
+        params.blockCount = tiling->tileM;
+        params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE;
+        params.srcStride = 0;
+        params.dstStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; // here the stride is on the GM (destination) side
+        AscendC::DataCopy(zGm[progress * tiling->tileM * tiling->n], zLocal, params);
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    static constexpr int32_t BUFFER_NUM = 2; // buffers per queue
+    static constexpr int32_t BLOCK_SIZE = 32; // byte unit used for the DataCopyParams lengths and strides
+
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<float> xGm;
+    AscendC::GlobalTensor<float> zGm;
+    AddsCustomTilingData *tiling; // not owned; points at the tiling data passed to Init
+};
+#endif // ADDS_CUSTOM_V2_H
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h
new file mode 100644
index 000000000..caecdef5e
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h
@@ -0,0 +1,89 @@
+/**
+ * @file adds_custom_v3.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef ADDS_CUSTOM_V3_H
+#define ADDS_CUSTOM_V3_H
+#include "kernel_operator.h"
+#include "adds_custom_tiling.h"
+
+using AscendC::TPosition;
+class KernelAddsV3 { // Row-split variant: each core gets its own block of tileM rows and steps across it tileN elements at a time.
+public:
+    __aicore__ inline KernelAddsV3() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData *tilingPtr) // binds GM buffers and allocates the queues
+    {
+        tiling = tilingPtr;
+        // Change the tiling method from column split to row split.
+        xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileM * tiling->n); // core k starts k * tileM * n elements into x
+        zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileM * tiling->n); // same element offset for the output
+        // A GM address conflict occurs when multiple cores access the same 512-byte address range.
+        // The L2 cache is disabled here so that the influence of the GM address conflict stands out.
+        xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE);
+        zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); // each buffer holds one tileM x tileN float tile
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float));
+    }
+    __aicore__ inline void Process() // runs loopOneCore iterations of CopyIn -> Compute -> CopyOut
+    {
+        for (int32_t i = 0; i < tiling->loopOneCore; i++) {
+            // The two SyncAll calls below are not actually required in this sample;
+            // they are only used to highlight the influence of the GM address conflict in every iteration.
+            AscendC::SyncAll();
+            CopyIn(i);
+            Compute();
+            AscendC::SyncAll();
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress) // loads tile `progress` from xGm into a local tensor
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+        AscendC::DataCopyParams params;
+        params.blockCount = tiling->tileM; // tileM blocks per copy
+        params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; // tileN floats expressed in BLOCK_SIZE-byte units
+        params.srcStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; // the other (n - tileN) floats, in BLOCK_SIZE-byte units
+        params.dstStride = 0;
+        AscendC::DataCopy(xLocal, xGm[progress * tiling->tileN], params); // source offset advances only tileN elements per iteration
+        inQueueX.EnQue(xLocal);
+    }
+    __aicore__ inline void Compute() // adds the constant 2.0 to one tile
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
+        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
+        constexpr float scale = 2.0; // the scalar passed to Adds (an addend, despite the name)
+        AscendC::Adds(zLocal, xLocal, scale, tiling->tileM * tiling->tileN);
+        outQueueZ.EnQue<float>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress) // stores tile `progress` to zGm, mirroring CopyIn
+    {
+        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
+        AscendC::DataCopyParams params;
+        params.blockCount = tiling->tileM;
+        params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE;
+        params.srcStride = 0;
+        params.dstStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; // here the stride is on the GM (destination) side
+        AscendC::DataCopy(zGm[progress * tiling->tileN], zLocal, params);
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    static constexpr int32_t BUFFER_NUM = 2; // buffers per queue
+    static constexpr int32_t BLOCK_SIZE = 32; // byte unit used for the DataCopyParams lengths and strides
+
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<float> xGm;
+    AscendC::GlobalTensor<float> zGm;
+    AddsCustomTilingData *tiling; // not owned; points at the tiling data passed to Init
+};
+#endif // ADDS_CUSTOM_V3_H
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md
index 34c96391e..1ebba2146 100644
--- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md
@@ -1 +1,163 @@
-MATA地址冲突（待补充）
\ No newline at end of file
+## 概述
+
+本样例基于AddsCustom算子工程，介绍了同地址冲突的影响以及两种解决方法。
+
+## 目录结构介绍
+
+```
+├── 15_mata_address_conflict   // 同地址冲突样例工程目录
+│   ├── AclNNInvocation        // 通过单算子API调用的方式调用AddsCustom算子
+│   ├── AddsCustom             // AddsCustom算子工程
+│   ├── AddsCustom.json        // AddsCustom算子的原型定义json文件
+│   └── install.sh             // 脚本，调用msOpGen生成自定义算子工程，并编译
+```
+
+## 算子描述
+
+Adds算子实现了一个Tensor与标量值2.0相加，返回相加结果的功能。对应的数学表达式为：
+
+```
+z = x + 2.0
+```
+
+本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响，在Global Memory的数据访问中，数据访问请求(读/写)在AI 处理器内部会按照512 Bytes对齐进行地址转换，同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内，出于数据一致性的要求，AI 处理器会对落入同一个512Bytes范围内的请求进行串行处理，导致搬运效率降低，即发生了同地址冲突。
+本样例中共有3个实现版本：
+adds_custom_v1.h：基础实现版本，每个核的计算顺序一致，存在同地址冲突，带宽效率较差
+adds_custom_v2.h：通过调整每个核的计算顺序，避免发生同地址冲突
+adds_custom_v3.h：通过调整切分顺序，避免发生同地址冲突
+
+当前算子执行机制保证用户kernel入参（包括workspace/tiling）的地址是512 Bytes对齐的，因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。
+
+## 算子规格描述
+
+<table>
+<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">Adds</td></tr>
+</tr>
+<tr><td rowspan="2" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
+<tr><td align="center">x</td><td align="center">8192 * 128</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+</tr>
+<tr><td rowspan="1" align="center">算子输出</td><td align="center">z</td><td align="center">8192 * 128</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">adds_custom</td></tr>
+</table>
+
+## 支持的产品型号
+
+本样例支持如下产品型号：
+
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+## 算子工程介绍
+
+其中，算子工程目录AddsCustom包含算子的实现文件，如下所示：
+
+```
+├── AddsCustom               // AddsCustom自定义算子工程
+│   ├── op_host              // host侧实现文件
+│   └── op_kernel            // kernel侧实现文件
+```
+
+CANN软件包中提供了工程创建工具msOpGen，AddsCustom算子工程可通过AddsCustom.json自动创建，自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。
+
+创建完自定义算子工程后，开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程，本样例已在AddsCustom目录中准备好了必要的算子实现，install.sh脚本会创建一个CustomOp目录，并将算子实现文件复制到对应目录下，再编译算子。
+
+备注：CustomOp目录为生成目录，每次执行install.sh脚本都会删除该目录并重新生成，切勿在该目录下编码算子，会存在丢失风险。
+
+## 编译运行样例算子
+
+针对自定义算子工程，编译运行包含如下步骤：
+
+- 调用msOpGen工具生成自定义算子工程；
+- 完成算子host和kernel实现；
+- 编译自定义算子工程生成自定义算子包；
+- 安装自定义算子包到自定义算子库中；
+- 调用执行自定义算子；
+
+详细操作如下所示。
+
+### 1. 获取源码包
+
+编译运行此样例前，请参考[准备：获取样例代码](../README.md#codeready)获取源码包。
+
+### 2. 生成自定义算子工程，复制host和kernel实现并编译算子<a name="operatorcompile"></a>
+
+- 切换到msOpGen脚本install.sh所在目录
+
+  ```bash
+  # 若开发者以git命令行方式clone了master分支代码，并切换目录
+  cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict
+  ```
+- 调用脚本，生成自定义算子工程，复制host和kernel实现并编译算子
+
+  - 方式一：配置环境变量运行脚本
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量命令。
+    - 默认路径，root用户安装CANN软件包
+
+      ```bash
+      export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+      ```
+    - 默认路径，非root用户安装CANN软件包
+
+      ```bash
+      export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+      ```
+    - 指定路径install_path，安装CANN软件包
+
+      ```bash
+      export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+      ```
+
+      运行install.sh脚本
+
+      ```bash
+      bash install.sh -v [SOC_VERSION]
+      ```
+  - 方式二：指定命令行安装路径来运行脚本
+    ```bash
+    bash install.sh -v [SOC_VERSION] -i [ASCEND_INSTALL_PATH]
+    ```
+
+  参数说明：
+
+  - SOC_VERSION：昇腾AI处理器型号，如果无法确定具体的[SOC_VERSION]，则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询，在查询到的“Name”前增加Ascend信息，例如“Name”对应取值为xxxyy，实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号：
+    - Atlas A2训练系列产品/Atlas 800I A2推理产品
+  - ASCEND_INSTALL_PATH：CANN软件包安装路径
+
+  脚本运行成功后，会在当前目录下创建CustomOp目录，编译完成后，会在CustomOp/build_out中，生成自定义算子安装包custom_opp_\<target os>_\<target architecture>.run，例如“custom_opp_ubuntu_x86_64.run”。
+
+### 3. 部署自定义算子包
+
+- 部署自定义算子包前，请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH
+
+  ```bash
+  echo $ASCEND_OPP_PATH
+  # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp
+
+  # 若没有，则需导出CANN环境变量
+  source [ASCEND_INSTALL_PATH]/bin/setenv.bash
+  # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
+  ```
+
+  参数说明：
+
+  - ASCEND_INSTALL_PATH：CANN软件包安装路径，一般和上一步中指定的路径保持一致
+- 在自定义算子安装包所在路径下，执行如下命令安装自定义算子包
+
+  ```bash
+  cd CustomOp/build_out
+  ./custom_opp_<target os>_<target architecture>.run
+  ```
+
+  命令执行成功后，自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的vendors/customize目录中。
+
+### 4. 调用执行算子工程
+
+- [单算子API调用AddsCustom算子工程](./AclNNInvocation/README.md)
+
+## 更新说明
+
+
+| 时间       | 更新事项 |
+| ---------- | -------- |
+| 2025/07/03 | 新增样例 |
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh
new file mode 100755
index 000000000..24a0c35a2
--- /dev/null
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+SHORT=v:,i:, # short options -v and -i, each followed by a value
+LONG=soc-version:,install-path:, # long options --soc-version and --install-path, each followed by a value
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS" # replace the positional parameters with getopt's output
+
+while :; do
+    case "$1" in
+    -v | --soc-version)
+        SOC_VERSION="$2" # checked against VERSION_LIST further down
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2" # takes priority over ASCEND_HOME_PATH and the default locations below
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
+    esac
+done
+
+VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" # space-separated whitelist of supported SoC versions
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit 1 # exit statuses must be in 0-255; the previous -1 only worked by wrapping to 255
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then # 1st choice: -i option or exported ASCEND_INSTALL_PATH
+    _ASCEND_INSTALL_PATH="$ASCEND_INSTALL_PATH"
+elif [ -n "$ASCEND_HOME_PATH" ]; then # 2nd choice: ASCEND_HOME_PATH from the environment
+    _ASCEND_INSTALL_PATH="$ASCEND_HOME_PATH"
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then # 3rd choice: per-user install directory
+        _ASCEND_INSTALL_PATH="$HOME/Ascend/ascend-toolkit/latest"
+    else # last resort: system-wide default location
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source "$_ASCEND_INSTALL_PATH/bin/setenv.bash" # quoted so an install path containing spaces is not word-split
+export ASCEND_HOME_PATH="$_ASCEND_INSTALL_PATH"
+
+OP_NAME=AddsCustom
+# Generate the op framework; stop here if generation fails instead of copying into a missing project
+rm -rf CustomOp && msopgen gen -i "$OP_NAME.json" -c "ai_core-${SOC_VERSION}" -lan cpp -out CustomOp || exit 1
+# Replace the generated op_host/op_kernel sources with the sample implementation
+rm -rf CustomOp/op_host/*.cpp
+rm -rf CustomOp/op_kernel/*.h && rm -rf CustomOp/op_kernel/*.cpp
+cp -rf "$OP_NAME/op_kernel" CustomOp/
+cp -rf "$OP_NAME/op_host" CustomOp/
+
+# Build CustomOp project
+(cd CustomOp && bash build.sh)
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/README.md b/operator/ascendc/4_best_practices/README.md
index f5379bbbf..c40fe61a7 100644
--- a/operator/ascendc/4_best_practices/README.md
+++ b/operator/ascendc/4_best_practices/README.md
@@ -8,6 +8,7 @@
 | ------------------------------- | ------------------------------------------ | ------------------------------------------ |
 | [4_bank_conflict](./4_bank_conflict) | 基于Ascend C的bank冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
 | [6_group_matmul](./6_group_matmul) | 基于Ascend C的group matmul算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
+| [15_mata_address_conflict](./15_mata_address_conflict) | 基于Ascend C的同地址冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
 | [21_all_gather_matmul_custom](./21_all_gather_matmul_custom) | 基于Ascend C的AllGatherMatmul算子性能调优样例 | Atlas A2训练系列产品 |
 | [22_matmul_reduce_scatter_custom](./22_matmul_reduce_scatter_custom) | 基于Ascend C的MatmulReduceScatter算子性能调优样例 | Atlas A2训练系列产品 |
 | [23_matmul_all_reduce_custom](./23_matmul_all_reduce_custom) | 基于Ascend C的MatmulAllReduce算子性能调优样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
-- 
Gitee


From 2fc9b125b6c4a6d4e8924c08c1303b58bafba819 Mon Sep 17 00:00:00 2001
From: Y_keven <yingkaidi@huawei.com>
Date: Mon, 14 Jul 2025 08:23:22 +0000
Subject: [PATCH 37/46] =?UTF-8?q?!2714=20=E4=BF=AE=E5=A4=8Dpython/resnet50?=
 =?UTF-8?q?=E5=BC=82=E6=AD=A5=E6=8E=A8=E7=90=86=E7=A4=BA=E4=BE=8B=E7=9A=84?=
 =?UTF-8?q?=E5=8F=98=E9=87=8F=E5=90=8D=E6=8B=BC=E5=86=99=E9=94=99=E8=AF=AF?=
 =?UTF-8?q?=20Merge=20pull=20request=20!2714=20from=20Y=5Fkeven/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/acl_net.py                             | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/level2_simple_inference/1_classification/resnet50_async_imagenet_classification/src/acl_net.py b/python/level2_simple_inference/1_classification/resnet50_async_imagenet_classification/src/acl_net.py
index a70a7f7dd..ef60270b6 100644
--- a/python/level2_simple_inference/1_classification/resnet50_async_imagenet_classification/src/acl_net.py
+++ b/python/level2_simple_inference/1_classification/resnet50_async_imagenet_classification/src/acl_net.py
@@ -51,7 +51,7 @@ class Net(object):
         self.model_id = None            # pointer
         self.context = None             # pointer
         self.stream = None              # pointer
-        self.excute_times = execute_times
+        self.execute_times = execute_times
         self.callback_interval = callback_interval
         self.is_callback = True if callback_interval else False
         self.memory_pool = memory_pool
@@ -169,8 +169,8 @@ class Net(object):
         for idx in range(self.memory_pool):
             img_idx = idx % len(images_dataset_list)
             img_input = self._load_input_data(images_dataset_list[img_idx])
-            infer_ouput = self._load_output_data()
-            self.dataset_list.append([img_input, infer_ouput])
+            infer_output = self._load_output_data()
+            self.dataset_list.append([img_input, infer_output])
         print("data interaction from host to device success")
 
     def _destroy_dataset_and_databuf(self, ):
@@ -226,16 +226,16 @@ class Net(object):
     def _get_callback(self, idx):
         if (idx + 1) % self.callback_interval == 0:
             acl.rt.launch_callback(self.callback_func,
-                                   self.excute_dataset,
+                                   self.execute_dataset,
                                    1,
                                    self.stream)
-            self.dataset_list.extend(self.excute_dataset)
-            self.excute_dataset = []
+            self.dataset_list.extend(self.execute_dataset)
+            self.execute_dataset = []
 
     def forward(self):
         print('execute stage:')
-        self.excute_dataset = []
-        for idx in range(self.excute_times):
+        self.execute_dataset = []
+        for idx in range(self.execute_times):
             img_data, infer_output = self.dataset_list.pop(0)
             ret = acl.mdl.execute_async(self.model_id,
                                         img_data,
@@ -244,7 +244,7 @@ class Net(object):
             check_ret("acl.mdl.execute_async", ret)
 
             if self.is_callback:
-                self.excute_dataset.append([img_data, infer_output])
+                self.execute_dataset.append([img_data, infer_output])
                 self._get_callback(idx)
         print('execute stage success')
 
-- 
Gitee


From 79e7fc50b314a84d9b4124e8c8a732ec34234f66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BB=98=E8=B1=AA?= <fuhao16@huawei.com>
Date: Tue, 15 Jul 2025 08:35:37 +0000
Subject: [PATCH 38/46] =?UTF-8?q?!2709=20[fix]meta=E6=9C=AA=E6=88=90?=
 =?UTF-8?q?=E5=8A=9F=E6=B3=A8=E5=86=8C=20Merge=20pull=20request=20!2709=20?=
 =?UTF-8?q?from=20=E4=BB=98=E8=B1=AA/master?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../PytorchInvocation/test_ops_custom_register_in_graph.py    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/PytorchInvocation/test_ops_custom_register_in_graph.py b/operator/ascendc/0_introduction/1_add_frameworklaunch/PytorchInvocation/test_ops_custom_register_in_graph.py
index f9bed9c44..9e8397463 100644
--- a/operator/ascendc/0_introduction/1_add_frameworklaunch/PytorchInvocation/test_ops_custom_register_in_graph.py
+++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/PytorchInvocation/test_ops_custom_register_in_graph.py
@@ -29,7 +29,7 @@ except ModuleNotFoundError:
         )
 
 
-@impl(m, "npu_add_custom")
+@impl(m.m, "npu_add_custom")
 def npu_add_custom_meta(x, y):
     return torch.empty_like(x)
 
@@ -63,7 +63,7 @@ class TestTorchCompileCustomAdd(TestCase):
 
             def forward(self, x, y):
                 return torch_npu.npu_add_custom(x, y)
-        mod = torch.compile(Module().npu(), backend=npu_backend)
+        mod = torch.compile(Module().npu(), backend=npu_backend, fullgraph=True)
         output = mod(x, y)
         print(output)
         self.assertRtolEqual(output, (x + y))
-- 
Gitee


From de4c1bb09b62d1bb596919aadfc2878517c27e26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B5=B5=E6=99=BA=E6=85=A7?= <zhaozhihui5@huawei.com>
Date: Tue, 15 Jul 2025 12:52:23 +0000
Subject: [PATCH 39/46] =?UTF-8?q?!2717=20add=20llm=20datadist=20v2=20sampl?=
 =?UTF-8?q?e=20Merge=20pull=20request=20!2717=20from=20=E8=B5=B5=E6=99=BA?=
 =?UTF-8?q?=E6=85=A7/zzh=5F0714=5Fadd=5Fllm=5Fdatadist=5Fv2=5Fsample?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../11_llm_data_dist/CMakeLists.txt           |  50 +++
 .../11_llm_data_dist/decoder_sample2.cpp      | 286 ++++++++++++++++++
 .../11_llm_data_dist/prompt_sample2.cpp       | 262 ++++++++++++++++
 .../11_llm_data_dist/readme.md                |  15 +
 .../10_llm_data_dist/README.md                |  12 +-
 .../switch_role_sample.py                     | 178 +++++++++++
 6 files changed, 802 insertions(+), 1 deletion(-)
 create mode 100644 cplusplus/level1_single_api/11_llm_data_dist/decoder_sample2.cpp
 create mode 100644 cplusplus/level1_single_api/11_llm_data_dist/prompt_sample2.cpp
 create mode 100644 python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/switch_role_sample.py

diff --git a/cplusplus/level1_single_api/11_llm_data_dist/CMakeLists.txt b/cplusplus/level1_single_api/11_llm_data_dist/CMakeLists.txt
index 603b6e968..25addfeab 100644
--- a/cplusplus/level1_single_api/11_llm_data_dist/CMakeLists.txt
+++ b/cplusplus/level1_single_api/11_llm_data_dist/CMakeLists.txt
@@ -68,4 +68,54 @@ target_link_directories(decoder_sample PRIVATE
 target_link_libraries(decoder_sample PRIVATE
         llm_engine
         graph
+)
+
+add_executable(prompt_sample2 "prompt_sample2.cpp")  # prompt-side executable of the second (switch-role) sample
+
+target_compile_options(prompt_sample2 PRIVATE
+        ${common_compile_options}
+)
+
+target_compile_definitions(prompt_sample2 PRIVATE
+        ${common_compile_definitions}
+)
+
+target_include_directories(prompt_sample2 PRIVATE
+        ${INCLUDE_DIR}
+        ${INCLUDE_DIR}/external/ge_common
+)
+
+target_link_directories(prompt_sample2 PRIVATE
+        ${ASCEND_PATH}/lib64
+)
+
+target_link_libraries(prompt_sample2 PRIVATE
+        llm_engine
+        graph
+        ascendcl  # prompt_sample2.cpp calls aclrtMalloc/aclrtMemcpy/aclrtFree directly
+)
+
+add_executable(decoder_sample2 "decoder_sample2.cpp")  # decoder-side executable of the second (switch-role) sample
+
+target_compile_options(decoder_sample2 PRIVATE
+        ${common_compile_options}
+)
+
+target_compile_definitions(decoder_sample2 PRIVATE
+        ${common_compile_definitions}
+)
+
+target_include_directories(decoder_sample2 PRIVATE
+        ${INCLUDE_DIR}
+        ${INCLUDE_DIR}/external/ge_common
+)
+
+target_link_directories(decoder_sample2 PRIVATE
+        ${ASCEND_PATH}/lib64
+)
+
+target_link_libraries(decoder_sample2 PRIVATE
+        llm_engine
+        graph
+        ascendcl  # decoder_sample2.cpp calls aclrtMalloc/aclrtMemcpy/aclrtFree directly
 )
\ No newline at end of file
diff --git a/cplusplus/level1_single_api/11_llm_data_dist/decoder_sample2.cpp b/cplusplus/level1_single_api/11_llm_data_dist/decoder_sample2.cpp
new file mode 100644
index 000000000..c4a186e96
--- /dev/null
+++ b/cplusplus/level1_single_api/11_llm_data_dist/decoder_sample2.cpp
@@ -0,0 +1,286 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <numeric>
+#include <cstdio>
+#include <thread>
+#include <iostream>
+#include "acl/acl.h"
+#include "llm_datadist/llm_datadist.h"
+
+using namespace llm_datadist;
+namespace {  // file-local constants and helpers of the decoder sample
+constexpr uint16_t PROMPT_LISTEN_PORT = 26000;   // remote port used when linking to / unlinking from the prompt
+constexpr uint16_t DECODER_LISTEN_PORT = 26001;  // local listen port configured in SetRole
+constexpr uint16_t PROMPT_CLUSTER_ID = 0;        // cluster id placed in the CacheIndex when pulling
+constexpr uint16_t DECODER_CLUSTER_ID = 1;       // cluster id this process passes to the LlmDataDist constructor
+constexpr uint32_t NUM_TENSORS = 4U;             // number of device buffers allocated and registered
+constexpr size_t TENSOR_SIZE = 8 * 16 * sizeof(int32_t);  // bytes per device buffer (8 x 16 int32 elements)
+const std::vector<int64_t> TENSOR_SHAPE = {8, 16};        // shape stored in the CacheDesc
+constexpr size_t TENSOR_BLOCK_ELEMENT_NUM = 16;  // elements verified per block index in CheckBuffers
+constexpr int32_t WAIT_PROMPT_TIME = 5;          // seconds slept before linking to the prompt
+constexpr int32_t EXPECTED_ARG_CNT = 4;          // argc value: program name + 3 arguments
+constexpr uint32_t ARG_INDEX_DEVICE_ID = 1;      // argv index of the device id
+constexpr uint32_t ARG_INDEX_LOCAL_IP = 2;       // argv index of the local host ip
+constexpr uint32_t ARG_INDEX_REMOTE_IP = 3;      // argv index of the remote host ip
+
+#define CHECK_ACL(x)                                                                        \
+    do {                                                                                    \
+        aclError aclRc = (x);                                                               \
+        if (aclRc != ACL_ERROR_NONE) {                                                      \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << aclRc << std::endl; \
+        }                                                                                   \
+    } while (0)
+}  // namespace; CHECK_ACL logs a failing ACL call but does not abort, and the caller supplies the ';'
+
+int Initialize(LlmDataDist &llmDataDist, const std::string &deviceId)  // Initializes llmDataDist with the device id and LOCAL_COMM_RES.
+{  // Returns LLM_SUCCESS on success, -1 if LOCAL_COMM_RES is unset or LlmDataDist::Initialize fails.
+    std::map<AscendString, AscendString> options;
+    options[OPTION_DEVICE_ID] = deviceId.c_str();
+    if (std::getenv("LOCAL_COMM_RES") == nullptr) {  // the communication-resource option is taken from the environment only
+        printf("[ERROR] env:LOCAL_COMM_RES not set\n");
+        return -1;
+    }
+    options[OPTION_LOCAL_COMM_RES] = std::getenv("LOCAL_COMM_RES");
+    auto ret = llmDataDist.Initialize(options);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] Initialize failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] Initialize success\n");
+    return LLM_SUCCESS;
+}
+
+int32_t SetRole(LlmDataDist &llmDataDist, LlmRole role, const char *localIp)  // Switches llmDataDist to `role`; returns 0 on success, -1 on failure.
+{
+    std::map<AscendString, AscendString> options;
+    options[OPTION_LISTEN_IP_INFO] = (std::string(localIp) + ":" + std::to_string(DECODER_LISTEN_PORT)).c_str();  // "<localIp>:<DECODER_LISTEN_PORT>"
+    auto ret = llmDataDist.SetRole(role, options);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] SetRole failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] SetRole success\n");
+    return 0;
+}
+
+int Link(LlmDataDist &llmDataDist, const char *remoteIp)  // Links to the peer at remoteIp:PROMPT_LISTEN_PORT; returns 0 on success, -1 on failure.
+{
+    std::vector<Status> rets;  // output argument of LinkLlmClusters; its contents are not inspected here
+    std::vector<ClusterInfo> clusters;
+    ClusterInfo clusterInfo;  // NOTE(review): remote_cluster_id is left at its default here although Unlink sets it - confirm this is intended
+    IpInfo remoteIpInfo;
+    remoteIpInfo.ip = remoteIp;
+    remoteIpInfo.port = PROMPT_LISTEN_PORT;
+    clusterInfo.remote_ip_infos.emplace_back(std::move(remoteIpInfo));
+    clusters.emplace_back(std::move(clusterInfo));
+    auto ret = llmDataDist.LinkLlmClusters(clusters, rets);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] LinkLlmClusters failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] LinkLlmClusters success\n");
+    return 0;
+}
+
+int Unlink(LlmDataDist &llmDataDist, const char *remoteIp)  // Unlinks from the peer at remoteIp:PROMPT_LISTEN_PORT; returns 0 on success, -1 on failure.
+{
+    std::vector<Status> rets;  // output argument of UnlinkLlmClusters; its contents are not inspected here
+    std::vector<ClusterInfo> clusters;
+    ClusterInfo clusterInfo;
+    clusterInfo.remote_cluster_id = 0;  // same value as PROMPT_CLUSTER_ID
+    IpInfo remoteIpInfo;
+    remoteIpInfo.ip = remoteIp;
+    remoteIpInfo.port = PROMPT_LISTEN_PORT;
+    clusterInfo.remote_ip_infos.emplace_back(std::move(remoteIpInfo));
+    clusters.emplace_back(std::move(clusterInfo));
+    auto ret = llmDataDist.UnlinkLlmClusters(clusters, rets);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] UnlinkLlmClusters failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] UnlinkLlmClusters success\n");
+    return 0;
+}
+
+int32_t CheckBuffers(const std::vector<void *> &buffers, const std::vector<uint32_t> &checkIndexList)  // Verifies selected blocks of every device buffer.
+{  // Returns 0 when all checked elements match, -1 at the first mismatch.
+    for (auto buffer : buffers) {
+        std::vector<int32_t> hostBuffer(TENSOR_SIZE / sizeof(int32_t));  // zero-initialized host staging copy
+        CHECK_ACL(aclrtMemcpy(&hostBuffer[0], TENSOR_SIZE, buffer, TENSOR_SIZE, ACL_MEMCPY_DEVICE_TO_HOST));  // only logs on failure; the comparison below still runs
+        for (auto checkIndex : checkIndexList) {
+            for (size_t i = 0U; i < TENSOR_BLOCK_ELEMENT_NUM; ++i) {
+                auto expect = checkIndex * TENSOR_BLOCK_ELEMENT_NUM + i;  // flat element index, which is also the expected value
+                if (hostBuffer[expect] != expect) {
+                    printf("[ERROR] Buffer check failed, index = %zu, val = %d, expect val = %zu\n",
+                           expect, hostBuffer[expect], expect);
+                    return -1;
+                }
+            }
+        }
+    }
+    printf("[INFO] CheckBuffers success\n");
+    return 0;
+}
+
+int32_t PullCache(LlmDataDist &llmDataDist, int64_t cacheId)  // Pulls data into the local cache `cacheId`; returns 0 on success, -1 on the first failure.
+{
+    std::vector<uint64_t> promptBlocks {1, 2, 3};
+    std::vector<uint64_t> decoderBlocks {1, 2, 3};
+    CacheIndex cacheIndex{PROMPT_CLUSTER_ID, 1, 0};  // NOTE(review): presumably {cluster id, cache id, batch index} - confirm against the CacheIndex definition
+    // PullKvBlocks can pull the data of multiple blocks
+    Cache cache{};
+    cache.cache_id = cacheId;
+    auto ret = llmDataDist.PullKvBlocks(cacheIndex, cache, promptBlocks, decoderBlocks);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] PullKvBlocks failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] PullKvBlocks success\n");
+    // PullKvCache can alternatively pull the contiguous data of one batch
+    cacheIndex.batch_index = 0;
+    ret = llmDataDist.PullKvCache(cacheIndex, cache, 0);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] PullKvCache failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] PullKvCache success\n");
+    return 0;
+}
+
+void Finalize(LlmDataDist &llmDataDist, int64_t cacheId, bool linked, const char *remoteIp,  // Best-effort teardown: unlink, unregister, free, finalize.
+              const std::vector<void *> &buffers)  // taken by reference: the vector is only read, so no copy is needed
+{
+    if (linked) {  // only unlink when a link was actually established
+        auto ret = Unlink(llmDataDist, remoteIp);
+        if (ret != 0) {
+            printf("[ERROR] Unlink failed, ret = %d\n", ret);
+        } else {
+            printf("[INFO] Unlink success\n");
+        }
+    }
+    if (cacheId > 0) {  // NOTE(review): callers start from cacheId = -1; an id of 0 would be skipped - confirm valid ids start at 1
+        auto ret = llmDataDist.UnregisterKvCache(cacheId);
+        if (ret != LLM_SUCCESS) {
+            printf("[ERROR] UnregisterKvCache failed, ret = %u\n", ret);
+        } else {
+            printf("[INFO] UnregisterKvCache success\n");
+        }
+    }
+    for (auto buffer : buffers) {  // release every device buffer, whatever happened above
+        aclrtFree(buffer);
+    }
+    llmDataDist.Finalize();
+}
+
+int32_t RunDecoderSample(const char *deviceId, const char *localIp, const char *remoteIp)  // Runs the whole decoder-side flow; returns 0 on success, -1 on any failure.
+{
+    printf("[INFO] Decoder Sample start\n");
+    // 1. Initialize
+    LlmDataDist llmDataDist(DECODER_CLUSTER_ID, LlmRole::kDecoder);
+    if (Initialize(llmDataDist, deviceId) != 0) {
+        return -1;
+    }
+
+    // 2. Register memory addresses
+    CacheDesc cacheDesc{};
+    cacheDesc.num_tensors = NUM_TENSORS;
+    cacheDesc.data_type = DT_INT32;
+    cacheDesc.shape = TENSOR_SHAPE;
+    std::vector<uint64_t> tensorAddrs;
+    std::vector<void *> buffers;
+    for (uint32_t i = 0U; i < NUM_TENSORS; ++i) {
+        int32_t *buffer = nullptr;
+        CHECK_ACL(aclrtMalloc(reinterpret_cast<void **>(&buffer), TENSOR_SIZE, ACL_MEM_MALLOC_HUGE_ONLY));  // NOTE(review): only logs; a failed allocation still registers a null address
+        tensorAddrs.emplace_back(reinterpret_cast<uint64_t>(buffer));
+        buffers.emplace_back(reinterpret_cast<void *>(buffer));
+    }
+    int64_t cacheId = -1;
+    bool linked = false;
+    auto ret = llmDataDist.RegisterKvCache(cacheDesc, tensorAddrs, {}, cacheId);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] RegisterKvCache failed, ret = %u\n", ret);
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+    // 3. After RegisterKvCache succeeds, the address of each tensor in the cache is available for later operations
+    printf("[INFO] RegisterKvCache success\n");
+    for (size_t i = 0U; i < tensorAddrs.size(); ++i) {
+        printf("[INFO] Tensor[%zu] addr = %p\n", i, reinterpret_cast<void *>(tensorAddrs[i]));
+    }
+
+    // 4. Wait for the prompt to finish writing the cache; a real service would use a proper notification mechanism
+    std::this_thread::sleep_for(std::chrono::seconds(WAIT_PROMPT_TIME));
+
+    // 5. Link to the prompt
+    if (Link(llmDataDist, remoteIp) != 0) {
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+    linked = true;
+
+    // 6. Pull the cache from the prompt
+    if (PullCache(llmDataDist, cacheId) != 0) {
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+
+    if (CheckBuffers(buffers, {0, 1, 2, 3}) != 0) {
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+
+    // 7. Unlink
+    if (Unlink(llmDataDist, remoteIp) != 0) {
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+    linked = false;
+
+    // 8. Switch role
+    if (SetRole(llmDataDist, LlmRole::kPrompt, localIp) != 0) {
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+
+    // 9. Wait for the prompt to push the cache; a real service would use a proper notification mechanism
+    std::this_thread::sleep_for(std::chrono::seconds(30));
+
+    if (CheckBuffers(buffers, {4, 5, 6, 7}) != 0) {
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+
+    // 10. Release the cache, the device buffers and LlmDataDist (same helper as the error paths, so nothing leaks on success)
+    Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+    printf("[INFO] Finalize success\n");
+    printf("[INFO] Decoder Sample end\n");
+    return 0;
+}
+
+int main(int32_t argc, char **argv)  // Entry point: expects deviceId, localHostIp and remoteHostIp, then runs the decoder sample.
+{
+    if (argc != EXPECTED_ARG_CNT) {  // program name plus exactly three arguments
+        printf("[ERROR] expect 3 args(deviceId, localHostIp, remoteHostIp), but got %d\n", argc - 1);
+        return -1;
+    }
+    const auto deviceId = argv[ARG_INDEX_DEVICE_ID];
+    const auto localIp = argv[ARG_INDEX_LOCAL_IP];
+    const auto remoteIp = argv[ARG_INDEX_REMOTE_IP];
+    printf("[INFO] deviceId = %s, localIp = %s, remoteIp = %s\n", deviceId, localIp, remoteIp);
+    auto ret = RunDecoderSample(deviceId, localIp, remoteIp);
+    return ret;  // 0 on success, -1 on failure
+}
\ No newline at end of file
diff --git a/cplusplus/level1_single_api/11_llm_data_dist/prompt_sample2.cpp b/cplusplus/level1_single_api/11_llm_data_dist/prompt_sample2.cpp
new file mode 100644
index 000000000..033463d71
--- /dev/null
+++ b/cplusplus/level1_single_api/11_llm_data_dist/prompt_sample2.cpp
@@ -0,0 +1,262 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <numeric>
+#include <cstdio>
+#include <thread>
+#include <iostream>
+#include "acl/acl.h"
+#include "llm_datadist/llm_datadist.h"
+
+using namespace llm_datadist;
+namespace {  // file-local constants and helpers of the prompt sample
+constexpr uint16_t PROMPT_LISTEN_PORT = 26000;   // local listen port configured in Initialize
+constexpr uint16_t DECODER_LISTEN_PORT = 26001;  // remote port used when linking to / unlinking from the decoder
+constexpr uint16_t PROMPT_CLUSTER_ID = 0;        // cluster id this process passes to the LlmDataDist constructor
+constexpr uint32_t NUM_TENSORS = 4U;             // number of device buffers allocated and registered
+constexpr size_t TENSOR_SIZE = 8 * 16 * sizeof(int32_t);  // bytes per device buffer (8 x 16 int32 elements)
+const std::vector<int64_t> TENSOR_SHAPE = {8, 16};        // shape stored in the CacheDesc
+constexpr int32_t WAIT_TIME = 10;                // seconds slept while the decoder pulls the cache
+constexpr int32_t EXPECTED_ARG_CNT = 4;          // argc value: program name + 3 arguments
+constexpr uint32_t ARG_INDEX_DEVICE_ID = 1;      // argv index of the device id
+constexpr uint32_t ARG_INDEX_LOCAL_IP = 2;       // argv index of the local host ip
+constexpr uint32_t ARG_INDEX_REMOTE_IP = 3;      // argv index of the remote host ip
+
+#define CHECK_ACL(x)                                                                        \
+    do {                                                                                    \
+        aclError aclRc = (x);                                                               \
+        if (aclRc != ACL_ERROR_NONE) {                                                      \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << aclRc << std::endl; \
+        }                                                                                   \
+    } while (0)
+}  // namespace; CHECK_ACL logs a failing ACL call but does not abort, and the caller supplies the ';'
+
+int Initialize(LlmDataDist &llmDataDist, const std::string &deviceId, const std::string &localIp)  // Initializes llmDataDist with device id, listen address and LOCAL_COMM_RES.
+{  // Returns LLM_SUCCESS on success, -1 if LOCAL_COMM_RES is unset or LlmDataDist::Initialize fails.
+    std::map<AscendString, AscendString> options;
+    options[OPTION_DEVICE_ID] = deviceId.c_str();
+    options[OPTION_LISTEN_IP_INFO] = (localIp + ":" + std::to_string(PROMPT_LISTEN_PORT)).c_str();  // "<localIp>:<PROMPT_LISTEN_PORT>"
+    if (std::getenv("LOCAL_COMM_RES") == nullptr) {  // the communication-resource option is taken from the environment only
+        printf("[ERROR] env:LOCAL_COMM_RES not set\n");
+        return -1;
+    }
+    options[OPTION_LOCAL_COMM_RES] = std::getenv("LOCAL_COMM_RES");
+    auto ret = llmDataDist.Initialize(options);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] Initialize failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] Initialize success\n");
+    return LLM_SUCCESS;
+}
+
+int32_t SetRole(LlmDataDist &llmDataDist, LlmRole role)  // Switches llmDataDist to `role`; returns 0 on success, -1 on failure.
+{
+    std::map<AscendString, AscendString> options;  // intentionally empty: no extra options are passed with the role switch
+    auto ret = llmDataDist.SetRole(role, options);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] SetRole failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] SetRole success\n");
+    return 0;
+}
+
+int Link(LlmDataDist &llmDataDist, const char *remoteIp)  // Links to the peer at remoteIp:DECODER_LISTEN_PORT; returns 0 on success, -1 on failure.
+{
+    std::vector<Status> rets;  // output argument of LinkLlmClusters; its contents are not inspected here
+    std::vector<ClusterInfo> clusters;
+    ClusterInfo clusterInfo;  // NOTE(review): remote_cluster_id is left at its default here although Unlink sets it to 1 - confirm this is intended
+    IpInfo remoteIpInfo;
+    remoteIpInfo.ip = remoteIp;
+    remoteIpInfo.port = DECODER_LISTEN_PORT;
+    clusterInfo.remote_ip_infos.emplace_back(std::move(remoteIpInfo));
+    clusters.emplace_back(std::move(clusterInfo));
+    auto ret = llmDataDist.LinkLlmClusters(clusters, rets);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] LinkLlmClusters failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] LinkLlmClusters success\n");
+    return 0;
+}
+
+int Unlink(LlmDataDist &llmDataDist, const char *remoteIp)  // Unlinks from the peer at remoteIp:DECODER_LISTEN_PORT; returns 0 on success, -1 on failure.
+{
+    std::vector<Status> rets;  // output argument of UnlinkLlmClusters; its contents are not inspected here
+    std::vector<ClusterInfo> clusters;
+    ClusterInfo clusterInfo;
+    clusterInfo.remote_cluster_id = 1;  // NOTE(review): presumably the decoder's cluster id - confirm against the decoder sample
+    IpInfo remoteIpInfo;
+    remoteIpInfo.ip = remoteIp;
+    remoteIpInfo.port = DECODER_LISTEN_PORT;
+    clusterInfo.remote_ip_infos.emplace_back(std::move(remoteIpInfo));
+    clusters.emplace_back(std::move(clusterInfo));
+    auto ret = llmDataDist.UnlinkLlmClusters(clusters, rets);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] UnlinkLlmClusters failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] UnlinkLlmClusters success\n");
+    return 0;
+}
+
+int32_t PushCache(LlmDataDist &llmDataDist, int64_t cacheId)  // Pushes data from the local cache `cacheId`; returns 0 on success, -1 on the first failure.
+{
+    std::vector<uint64_t> promptBlocks {5, 6, 7};
+    std::vector<uint64_t> decoderBlocks {5, 6, 7};
+    // PushKvBlocks can push the data of multiple blocks
+    Cache cache{};
+    cache.cache_id = cacheId;
+    auto ret = LLM_SUCCESS;
+    CacheIndex cacheIndex{1, 1};  // NOTE(review): presumably {destination cluster id, cache id} - confirm against the CacheIndex definition
+    for (uint32_t i = 0U; i < NUM_TENSORS; ++i) {  // one push per tensor, each restricted to the layer range [i, i]
+        KvCacheExtParam param{};
+        param.src_layer_range = std::pair<int32_t, int32_t>(i, i);
+        param.dst_layer_range = std::pair<int32_t, int32_t>(i, i);
+        param.tensor_num_per_layer = 1;
+        ret = llmDataDist.PushKvBlocks(cache, cacheIndex, promptBlocks, decoderBlocks, param);
+        if (ret != LLM_SUCCESS) {
+            printf("[ERROR] PushKvBlocks failed, ret = %u\n", ret);
+            return -1;
+        }
+    }
+    printf("[INFO] PushKvBlocks success\n");
+
+    // PushKvCache can alternatively push the contiguous data of one batch
+    CacheIndex cacheIndex2{1, 1, 4};
+    KvCacheExtParam param2{};
+    param2.src_layer_range = std::pair<int32_t, int32_t>(0, 0);
+    param2.dst_layer_range = std::pair<int32_t, int32_t>(0, 0);
+    param2.tensor_num_per_layer = 4;  // all four tensors are treated as a single layer here
+    ret = llmDataDist.PushKvCache(cache, cacheIndex2, 4, -1, param2);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] PushKvCache failed, ret = %u\n", ret);
+        return -1;
+    }
+    printf("[INFO] PushKvCache success\n");
+    return 0;
+}
+
+void Finalize(LlmDataDist &llmDataDist, int64_t cacheId, bool linked, const char *remoteIp,  // Best-effort teardown: unlink, unregister, free, finalize.
+              const std::vector<void *> &buffers)  // taken by reference: the vector is only read, so no copy is needed
+{
+    if (linked) {  // only unlink when a link was actually established
+        auto ret = Unlink(llmDataDist, remoteIp);
+        if (ret != 0) {
+            printf("[ERROR] Unlink failed, ret = %d\n", ret);
+        } else {
+            printf("[INFO] Unlink success\n");
+        }
+    }
+    if (cacheId > 0) {  // NOTE(review): callers start from cacheId = -1; an id of 0 would be skipped - confirm valid ids start at 1
+        auto ret = llmDataDist.UnregisterKvCache(cacheId);
+        if (ret != LLM_SUCCESS) {
+            printf("[ERROR] UnregisterKvCache failed, ret = %u\n", ret);
+        } else {
+            printf("[INFO] UnregisterKvCache success\n");
+        }
+    }
+    for (auto buffer : buffers) {  // release every device buffer, whatever happened above
+        aclrtFree(buffer);
+    }
+    llmDataDist.Finalize();
+}
+
+int32_t RunPromptSample(const char *deviceId, const char *localIp, const char *remoteIp)  // Runs the whole prompt-side flow; returns 0 on success, -1 on any failure.
+{
+    printf("[INFO] Prompt Sample start\n");
+    // 1. Initialize
+    LlmDataDist llmDataDist(PROMPT_CLUSTER_ID, LlmRole::kPrompt);
+    if (Initialize(llmDataDist, deviceId, localIp) != 0) {
+        printf("[ERROR] Initialize LlmDataDist failed\n");
+        return -1;
+    }
+    // 2. Register memory addresses
+    CacheDesc cacheDesc{};
+    cacheDesc.num_tensors = NUM_TENSORS;
+    cacheDesc.data_type = DT_INT32;
+    cacheDesc.shape = TENSOR_SHAPE;
+    std::vector<uint64_t> tensorAddrs;
+    std::vector<void *> buffers;
+    for (uint32_t i = 0U; i < NUM_TENSORS; ++i) {
+        int32_t *buffer = nullptr;
+        CHECK_ACL(aclrtMalloc(reinterpret_cast<void **>(&buffer), TENSOR_SIZE, ACL_MEM_MALLOC_HUGE_ONLY));  // NOTE(review): only logs; a failed allocation still registers a null address
+
+        // init device buffer: element k holds the value k
+        std::vector<int32_t> hostBuffer(TENSOR_SIZE / sizeof(int32_t));
+        std::iota(hostBuffer.begin(), hostBuffer.end(), 0);
+        CHECK_ACL(aclrtMemcpy(buffer, TENSOR_SIZE, &hostBuffer[0], TENSOR_SIZE, ACL_MEMCPY_HOST_TO_DEVICE));
+
+        tensorAddrs.emplace_back(reinterpret_cast<uint64_t>(buffer));
+        buffers.emplace_back(reinterpret_cast<void *>(buffer));
+    }
+    int64_t cacheId = -1;
+    bool linked = false;
+    auto ret = llmDataDist.RegisterKvCache(cacheDesc, tensorAddrs, {}, cacheId);
+    if (ret != LLM_SUCCESS) {
+        printf("[ERROR] RegisterKvCache failed, ret = %u\n", ret);
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+    // 3. After RegisterKvCache succeeds, the address of each tensor in the cache is available for later operations
+    printf("[INFO] RegisterKvCache success\n");
+    for (size_t i = 0U; i < tensorAddrs.size(); ++i) {
+        printf("[INFO] Tensor[%zu] addr = %p\n", i, reinterpret_cast<void *>(tensorAddrs[i]));
+    }
+
+    // 4. Wait for the decoder to pull the cache
+    std::this_thread::sleep_for(std::chrono::seconds(WAIT_TIME));
+
+    // 5. Switch role
+    if (SetRole(llmDataDist, LlmRole::kDecoder) != 0) {
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+
+    // 6. Link to the decoder
+    if (Link(llmDataDist, remoteIp) != 0) {
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+    linked = true;
+
+    // 7. Push the cache to the decoder
+    if (PushCache(llmDataDist, cacheId) != 0) {
+        Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+        return -1;
+    }
+
+    // 8. Unlink, then release the cache, the device buffers and LlmDataDist (same helper as the error paths, so nothing leaks on success)
+    Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
+    printf("[INFO] Finalize success\n");
+    printf("[INFO] Prompt Sample end\n");
+    return 0;
+}
+
+int main(int32_t argc, char **argv)  // Entry point: expects deviceId, localHostIp and remoteHostIp, then runs the prompt sample.
+{
+    if (argc != EXPECTED_ARG_CNT) {  // program name plus exactly three arguments
+        printf("[ERROR] expect 3 args(deviceId, localHostIp, remoteHostIp), but got %d\n", argc - 1);
+        return -1;
+    }
+    const auto deviceId = argv[ARG_INDEX_DEVICE_ID];
+    const auto localIp = argv[ARG_INDEX_LOCAL_IP];
+    const auto remoteIp = argv[ARG_INDEX_REMOTE_IP];
+    printf("[INFO] deviceId = %s, localIp = %s, remoteIp = %s\n", deviceId, localIp, remoteIp);
+    auto ret = RunPromptSample(deviceId, localIp, remoteIp);
+    return ret;  // 0 on success, -1 on failure
+}
\ No newline at end of file
diff --git a/cplusplus/level1_single_api/11_llm_data_dist/readme.md b/cplusplus/level1_single_api/11_llm_data_dist/readme.md
index f02114570..1a9cf9f80 100644
--- a/cplusplus/level1_single_api/11_llm_data_dist/readme.md
+++ b/cplusplus/level1_single_api/11_llm_data_dist/readme.md
@@ -71,6 +71,8 @@
 
 3. 在运行环境执行可执行文件。
 
+    3.1 执行sample1
+
     - 执行prompt_sample, 参数为device_id与local_ip, 其中device_id为prompt要使用的device_id, local_ip为prompt所在device的ip，如:
         ```
         ./prompt_sample 0 10.10.10.1
@@ -80,3 +82,16 @@
         ```
         ./decoder_sample 4 10.10.10.5 10.10.10.1
         ```
+
+    3.2 执行sample2
+
+    - 执行prompt_sample2, 参数为device_id、local_host_ip和remote_host_ip, 其中device_id为prompt要使用的device_id, local_host_ip为prompt所在host的ip, remote_host_ip为decoder所在host的ip，如:
+        ```
+        LOCAL_COMM_RES='{"status": "completed", "version": "1.0", "server_list": [{"server_id": "node_1", "device": [{"device_id": "0", "device_ip": "10.10.10.1"}]}]}' ./prompt_sample2 0 10.170.10.1 10.170.10.2
+        ```
+
+    - 执行decoder_sample2, 参数为device_id、local_host_ip和remote_host_ip, 其中device_id为decoder要使用的device_id, local_host_ip为decoder所在host的ip，remote_host_ip为prompt所在host的ip，如:
+        ```
+        LOCAL_COMM_RES='{"status": "completed", "version": "1.0", "server_list": [{"server_id": "node_1", "device": [{"device_id": "1", "device_ip": "10.10.10.2"}]}]}' ./decoder_sample2 1 10.170.10.2 10.170.10.1
+        ```
+    **注**：LOCAL_COMM_RES为sample2执行所需环境变量，配置了当前进程所需的通信资源，将传递给llm_datadist作为初始化option; 配置格式与HCCL的ranktable一致，只需要配置本进程第一个参数device_id对应的信息，其中ranktable中的rank_id和server_count字段不需要配置，当前用例配置为A2的ranktable格式，其他环境需参考对应环境的ranktable格式进行配置
\ No newline at end of file
diff --git a/python/level1_single_api/10_llm_data_dist/README.md b/python/level1_single_api/10_llm_data_dist/README.md
index f3393ab60..76c3225e5 100644
--- a/python/level1_single_api/10_llm_data_dist/README.md
+++ b/python/level1_single_api/10_llm_data_dist/README.md
@@ -96,5 +96,15 @@
       python pull_from_cache_to_blocks.py --device_id 0 --cluster_id 1
       # Decoder主机:
       python pull_from_cache_to_blocks.py --device_id 0 --cluster_id 2
-      ```      
+      ```
+    - 执行switch role样例程序，此样例程序使用单侧建链方式，首先torch自行申请内存并注册blocks,
+      decoder发起建链并pull blocks, 然后两侧切换角色, 并prompt发起建链， decoder进行push_blocks：
+      分别在Prompt主机与Decoder主机，执行样例程序：
+      ```
+      # Prompt主机:
+      LOCAL_COMM_RES='{"status": "completed", "version": "1.0", "server_list": [{"server_id": "node_1", "device": [{"device_id": "0", "device_ip": "10.10.10.1"}]}]}' GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python switch_role_sample.py --device_id 0 --role p --local_host_ip 10.170.10.1 --remote_host_ip 10.170.10.2
+      # Decoder主机:
+      LOCAL_COMM_RES='{"status": "completed", "version": "1.0", "server_list": [{"server_id": "node_1", "device": [{"device_id": "1", "device_ip": "10.10.10.2"}]}]}' GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python switch_role_sample.py --device_id 1 --role d --local_host_ip 10.170.10.2 --remote_host_ip 10.170.10.1
+      ```
+      **注**：**LOCAL_COMM_RES**为单侧建链方式执行所需环境变量，配置了当前进程所需的通信资源，将传递给llm_datadist作为初始化option; 配置格式与HCCL的ranktable一致，只需要配置本进程参数device_id对应的信息，其中ranktable中的rank_id和server_count字段不需要配置，当前用例配置为A2的ranktable格式，其他环境需参考对应环境的ranktable格式进行配置；**GLOO_SOCKET_IFNAME**为本地网卡名，可通过ifconfig查询；**HCCL_INTRA_ROCE_ENABLE=1**代表使用roce方式进行通信；
 
diff --git a/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/switch_role_sample.py b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/switch_role_sample.py
new file mode 100644
index 000000000..616e62eee
--- /dev/null
+++ b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/switch_role_sample.py
@@ -0,0 +1,178 @@
+"""
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import argparse
+import os
+import logging
+import datetime
+from llm_datadist import LLMDataDist, LLMRole, LLMConfig, CacheDesc, DataType, BlocksCacheKey, \
+    Placement, LLMClusterInfo, LLMStatusCode
+import torch
+import torch.distributed as dist
+import torch_npu
+import torchair
+
+logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
+
+NUM_TENSORS = 2
+BLOCKS_NUM = 3
+KV_SHAPE = 10
+PROMPT_CLUSTER_ID = 0
+DECODER_CLUSTER_ID = 1
+
+def init_process_group(rank, world_size, master_ip, backend='gloo'):
+    os.environ['MASTER_ADDR'] = master_ip
+    os.environ['MASTER_PORT'] = '29500'
+    # MASTER_ADDR/MASTER_PORT are exported before the group is created; a 30-second timeout is passed to it.
+    logging.info(f"init group begin, {rank=}, {world_size=}, {master_ip=}")
+    dist.init_process_group(backend=backend, rank=rank, world_size=world_size, timeout=datetime.timedelta(seconds=30))
+    logging.info(f"init group success, {rank=}, {world_size=}, {master_ip=}")
+
+
+def init_llm_datadist(role: LLMRole, cluster_id, device_id: int, local_host_ip, remote_host_ip) -> LLMDataDist:
+    init_process_group(cluster_id, 2, min(local_host_ip, remote_host_ip))  # rank is cluster_id; master is the smaller IP string
+    datadist = LLMDataDist(role, cluster_id)
+    llm_config = LLMConfig()
+    llm_config.device_id = device_id
+    if os.getenv('LOCAL_COMM_RES') is None:  # required: forwarded below as local_comm_res
+        raise Exception("env:LOCAL_COMM_RES is not set")
+    llm_config.local_comm_res = os.getenv('LOCAL_COMM_RES')
+    if role == LLMRole.PROMPT:  # only the PROMPT role gets a listen address (port 26000) at init
+        llm_config.listen_ip_info = f"{local_host_ip}:26000"
+    llm_options = llm_config.generate_options()
+    datadist.init(llm_options)
+    logging.info(f"init {role} success, {cluster_id=}")
+    return datadist
+
+
+def run_prompt_sample(datadist, remote_host_ip):
+    cache_manager = datadist.cache_manager
+    cache_desc = CacheDesc(num_tensors=NUM_TENSORS, shape=[BLOCKS_NUM, KV_SHAPE], data_type=DataType.DT_FLOAT,
+                           placement=Placement.DEVICE)
+    tensor = torch.ones(BLOCKS_NUM, KV_SHAPE, dtype=torch.float).npu()
+    tensor2 = torch.ones(BLOCKS_NUM, KV_SHAPE, dtype=torch.float).npu()
+    addr = int(tensor.data_ptr())
+    addr2 = int(tensor2.data_ptr())
+    cache = cache_manager.register_blocks_cache(cache_desc, [addr, addr2], BlocksCacheKey(PROMPT_CLUSTER_ID, 0))
+    logging.info('register_blocks_cache success')
+    dist.barrier() # register end
+    logging.info('wait decoder link and pull...')
+    dist.barrier() # decoder unlink
+
+    datadist.switch_role(LLMRole.DECODER)
+    dist.barrier() # prompt switch role end, stops listening
+    dist.barrier() # decoder switch role end, starts listening
+
+    cluster = LLMClusterInfo()
+    cluster.append_remote_ip_info(remote_host_ip, 26000)
+    ret, _ = datadist.link_clusters([cluster], 5000)
+    if ret != LLMStatusCode.LLM_SUCCESS:
+        raise Exception("link failed")
+    logging.info('link success, wait decoder push...')
+    dist.barrier() # prompt link end
+    dist.barrier() # decoder push blocks end
+    logging.info(f'after decoder push, {tensor=}')
+    logging.info(f'after decoder push, {tensor2=}')
+
+    cluster = LLMClusterInfo()
+    cluster.remote_cluster_id = DECODER_CLUSTER_ID
+    ret, _ = datadist.unlink_clusters([cluster], 5000, force=True)
+    if ret != LLMStatusCode.LLM_SUCCESS:
+        raise Exception("unlink failed")
+
+    cache_manager.unregister_cache(cache.cache_id)
+    datadist.finalize()
+    logging.info('[finalize] success')
+
+
+def run_decoder_sample(datadist, local_host_ip, remote_host_ip):
+    cache_manager = datadist.cache_manager
+    cache_desc = CacheDesc(num_tensors=NUM_TENSORS, shape=[BLOCKS_NUM, KV_SHAPE], data_type=DataType.DT_FLOAT,
+                           placement=Placement.DEVICE)
+    tensor = torch.full((BLOCKS_NUM, KV_SHAPE), 0, dtype=torch.float).npu()
+    tensor2 = torch.full((BLOCKS_NUM, KV_SHAPE), 0, dtype=torch.float).npu()
+    addr = int(tensor.data_ptr())
+    addr2 = int(tensor2.data_ptr())
+    cache = cache_manager.register_blocks_cache(cache_desc, [addr, addr2], BlocksCacheKey(DECODER_CLUSTER_ID, 0))
+    logging.info('register_blocks_cache success')
+    dist.barrier() # register end
+
+    cluster = LLMClusterInfo()
+    cluster.append_remote_ip_info(remote_host_ip, 26000)
+    ret, _ = datadist.link_clusters([cluster], 5000)
+    if ret != LLMStatusCode.LLM_SUCCESS:
+        raise Exception("unlink failed")
+
+    cache_manager.pull_blocks(BlocksCacheKey(PROMPT_CLUSTER_ID, 0), cache, src_blocks=[0, 1], dst_blocks=[0, 2])
+    logging.info(f'after decoder pull, {tensor=}')
+    logging.info(f'after decoder pull, {tensor2=}')
+
+    cluster = LLMClusterInfo()
+    cluster.remote_cluster_id = PROMPT_CLUSTER_ID
+    cluster.append_remote_ip_info(remote_host_ip, 26000)
+    ret, _ = datadist.unlink_clusters([cluster], 5000)
+    if ret != LLMStatusCode.LLM_SUCCESS:
+        raise Exception("unlink failed")
+    
+    dist.barrier() # decoder unlink
+    dist.barrier() # prompt switch role end, close lisen
+    llm_config = LLMConfig()
+    llm_config.listen_ip_info = f"{local_host_ip}:26000"
+    llm_options = llm_config.generate_options()
+    datadist.switch_role(LLMRole.PROMPT, llm_options)
+    logging.info('decoder link, pull, unlink, switch role success, wait prompt link...')
+    dist.barrier() # decoder switch role end, lisen
+    dist.barrier() # prompt link end
+
+    cache_manager.push_blocks(BlocksCacheKey(PROMPT_CLUSTER_ID, 0), cache, src_blocks=[0, 1, 2], dst_blocks=[0, 1,2],
+                              src_layer_range=range(0, 2), dst_layer_range=range(0, 2), tensor_num_per_layer=1)
+    dist.barrier() # decoder push blocks end
+    cluster = LLMClusterInfo()
+    cluster.remote_cluster_id = PROMPT_CLUSTER_ID
+    ret, _ = datadist.unlink_clusters([cluster], 5000, force=True)
+    if ret != LLMStatusCode.LLM_SUCCESS:
+        raise Exception("unlink failed")
+
+    cache_manager.unregister_cache(cache.cache_id)
+    datadist.finalize()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device_id", type=int, default=0, help='device id')
+    parser.add_argument("--role", type=str, default=1, help='role type, support p/d')
+    parser.add_argument("--local_host_ip", type=str, help='local host ip')
+    parser.add_argument("--remote_host_ip", type=str, help='remote host ip')
+    args = parser.parse_args()
+    if args.role not in ['p', 'd']:
+        raise RuntimeError("Not supported cluster id")
+    if args.device_id not in [0, 1, 2, 3, 4, 5, 6, 7]:
+        raise RuntimeError("Not supported device id")
+    if args.local_host_ip is None:
+        raise RuntimeError("local_host_ip is not set")
+    if args.remote_host_ip is None:
+        raise RuntimeError("remote_host_ip is not set")
+    logging.info(f'Sample start, device_id = {args.device_id}, role = {args.role}')
+
+    torch.npu.set_device(args.device_id)
+    role = LLMRole.PROMPT if args.role == 'p' else LLMRole.DECODER
+    cluster_id = PROMPT_CLUSTER_ID if args.role == 'p' else DECODER_CLUSTER_ID
+    datadist = init_llm_datadist(role, cluster_id, args.device_id, args.local_host_ip, args.remote_host_ip)
+    if role == LLMRole.PROMPT:
+        run_prompt_sample(datadist, args.remote_host_ip)
+    else:
+        run_decoder_sample(datadist, args.local_host_ip, args.remote_host_ip)
+    logging.info('Sample end')
-- 
Gitee


From 464b844ab5cd1c7ee6f18be645ba3a050320671f Mon Sep 17 00:00:00 2001
From: zhanghao0689 <zhanghao152@huawei.com>
Date: Wed, 16 Jul 2025 01:02:04 +0000
Subject: [PATCH 40/46] !2716 add l2 bypass case Merge pull request !2716 from
 zhanghao0689/master

---
 .../12_cachemiss_preload_dcci/README.md       |   1 -
 .../AclNNInvocation/README.md                 |  76 +++
 .../AclNNInvocation/inc/common.h              |  45 ++
 .../AclNNInvocation/inc/op_runner.h           | 188 +++++++
 .../AclNNInvocation/inc/operator_desc.h       |  57 +++
 .../12_l2_cache_bypass/AclNNInvocation/run.sh |  77 +++
 .../AclNNInvocation/scripts/acl.json          |   1 +
 .../AclNNInvocation/scripts/gen_data.py       |  28 ++
 .../AclNNInvocation/scripts/verify_result.py  |  53 ++
 .../AclNNInvocation/src/CMakeLists.txt        |  65 +++
 .../AclNNInvocation/src/common.cpp            |  80 +++
 .../AclNNInvocation/src/main.cpp              | 163 ++++++
 .../AclNNInvocation/src/op_runner.cpp         | 462 ++++++++++++++++++
 .../AclNNInvocation/src/operator_desc.cpp     |  51 ++
 .../12_l2_cache_bypass/AddCustom.json         |  47 ++
 .../AddCustom/op_host/add_custom.cpp          |  49 ++
 .../AddCustom/op_kernel/add_custom.cpp        |  28 ++
 .../AddCustom/op_kernel/add_custom_tiling.h   |  18 +
 .../AddCustom/op_kernel/add_custom_v1.h       | 102 ++++
 .../AddCustom/op_kernel/add_custom_v2.h       | 105 ++++
 .../12_l2_cache_bypass/README.md              | 162 ++++++
 .../12_l2_cache_bypass/install.sh             |  59 +++
 .../15_mata_address_conflict/README.md        |   9 +-
 operator/ascendc/4_best_practices/README.md   |   3 +
 24 files changed, 1924 insertions(+), 5 deletions(-)
 delete mode 100644 operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/common.h
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/op_runner.h
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/operator_desc.h
 create mode 100755 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/run.sh
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/acl.json
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/gen_data.py
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/verify_result.py
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/common.cpp
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/main.cpp
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/op_runner.cpp
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom.json
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_host/add_custom.cpp
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h
 create mode 100644 operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md
 create mode 100755 operator/ascendc/4_best_practices/12_l2_cache_bypass/install.sh

diff --git a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md b/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md
deleted file mode 100644
index ae31f00d7..000000000
--- a/operator/ascendc/4_best_practices/12_cachemiss_preload_dcci/README.md
+++ /dev/null
@@ -1 +0,0 @@
-CACHE MISS优化 preload dcci（待补充）
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md
new file mode 100644
index 000000000..d3e63bedf
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/README.md
@@ -0,0 +1,76 @@
+## 目录结构介绍
+
+```
+├── AclNNInvocation             //通过单算子API调用的方式调用AddCustom算子
+│   ├── inc                     // 头文件目录
+│   │   ├── common.h            // 声明公共方法类，用于读取二进制文件
+│   │   ├── op_runner.h         // 算子运行相关信息声明文件，包含算子输入/输出个数，输入/输出大小等
+│   │   └── operator_desc.h     // 算子描述声明文件，包含算子输入/输出，算子类型以及输入描述与输出描述
+│   ├── input                   // 存放脚本生成的输入数据目录
+│   ├── scripts
+│   │   ├── acl.json            // acl配置文件
+│   │   ├── gen_data.py         // 输入数据和真值数据生成脚本
+│   │   └── verify_result.py    // 精度校验脚本
+│   ├── src
+│   │   ├── CMakeLists.txt      // 编译规则文件
+│   │   ├── common.cpp          // 公共函数，读取二进制文件函数的实现文件
+│   │   ├── main.cpp            // 单算子调用应用的入口
+│   │   ├── op_runner.cpp       // 单算子调用主体流程实现文件
+│   │   └── operator_desc.cpp   // 构造算子的输入与输出描述
+│   └── run.sh                  // 执行命令脚本
+```
+
+## 代码实现介绍
+
+完成自定义算子的开发部署后，可以通过单算子调用的方式来验证单算子的功能。src/main.cpp代码为单算子API执行方式。单算子API执行是基于C语言的API执行算子，无需提供单算子描述文件进行离线模型的转换，直接调用单算子API接口。
+
+自定义算子编译部署后，会自动生成单算子API，可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”，形如：
+
+```cpp
+// 获取算子使用的workspace空间大小
+aclnnStatus aclnnAddCustomGetWorkspaceSize(
+    const aclTensor *x,
+    const aclTensor *y,
+    int64_t caseId,
+    const aclTensor *out,
+    uint64_t *workspaceSize,
+    aclOpExecutor **executor);
+// 执行算子
+aclnnStatus aclnnAddCustom(
+    void *workspace,
+    uint64_t workspaceSize,
+    aclOpExecutor *executor,
+    aclrtStream stream);
+```
+
+其中aclnnAddCustomGetWorkspaceSize为第一段接口，主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后，按照workspaceSize大小申请Device侧内存，然后调用第二段接口aclnnAddCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。
+
+## 运行样例算子
+
+### 1. 编译算子工程
+
+运行此样例前，请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。
+
+### 2. 单算子API调用样例运行
+
+- 进入到样例目录
+
+  以命令行方式下载样例代码，master分支为例。
+
+  ```bash
+  cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation
+  ```
+- 样例执行
+
+  样例执行过程中会自动生成测试数据，然后编译与运行单算子API调用样例，最后检验运行结果。具体过程可参见run.sh脚本。
+
+  ```bash
+  bash run.sh
+  ```
+
+## 更新说明
+
+
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/07/14 | 新增本readme |
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/common.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/common.h
new file mode 100644
index 000000000..fadb5c808
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/common.h
@@ -0,0 +1,45 @@
+/**
+ * @file common.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <cstdio>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+#define SUCCESS 0
+#define FAILED 1
+
+#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO]  " fmt "\n", ##args)
+#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN]  " fmt "\n", ##args)
+#define ERROR_LOG(fmt, args...) fprintf(stderr, "[ERROR]  " fmt "\n", ##args)
+
+/** @brief Read data from file into a caller-provided buffer
+ * @param [in] filePath: file path
+ * @param [in] fileSize: taken by value, so the file size is not returned to the caller
+ * @param [out] buffer: receives the file content; the read fails if the file is larger than bufferSize
+ * @return read result
+ */
+bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize);
+
+/**
+ * @brief Write data to file
+ * @param [in] filePath: file path
+ * @param [in] buffer: data to write to file
+ * @param [in] size: size to write
+ * @return write result
+ */
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size);
+
+#endif // COMMON_H
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/op_runner.h
new file mode 100644
index 000000000..7b98d5730
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/op_runner.h
@@ -0,0 +1,188 @@
+/**
+ * @file op_runner.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef OP_RUNNER_H
+#define OP_RUNNER_H
+
+#include "acl/acl.h"
+#include "aclnn/acl_meta.h"
+#include "common.h"
+#include "operator_desc.h"
+
+/**
+ * Op Runner
+ */
+class OpRunner {
+public:
+    /**
+     * @brief Constructor
+     * @param [in] opDesc: op description
+     */
+    explicit OpRunner(OperatorDesc *opDesc);
+
+    /**
+     * @brief Destructor
+     */
+    virtual ~OpRunner();
+
+    /**
+     * @brief Init op runner
+     */
+    bool Init();
+
+    /**
+     * @brief Get number of inputs
+     * @return number of inputs
+     */
+    const size_t NumInputs();
+
+    /**
+     * @brief Get number of outputs
+     * @return number of outputs
+     */
+    const size_t NumOutputs();
+
+    /**
+     * @brief Get input size by index
+     * @param [in] index: input index
+     * @return size of the input
+     */
+    const size_t GetInputSize(size_t index) const;
+    const size_t GetInputNumDims(size_t index) const;
+    aclDataType GetInputDataType(size_t index) const;
+    aclFormat GetInputFormat(size_t index) const;
+
+    /**
+     * @brief Get output size by index
+     * @param [in] index: output index
+     * @return size of the output
+     */
+    size_t GetOutputSize(size_t index) const;
+    const size_t GetOutputNumDims(size_t index) const;
+    aclDataType GetOutputDataType(size_t index) const;
+    aclFormat GetOutputFormat(size_t index) const;
+
+    /**
+     * @brief Get input element count by index
+     * @param [in] index: input index
+     * @return element count of the input
+     */
+    size_t GetInputElementCount(size_t index) const;
+
+    /**
+     * @brief Get output element count by index
+     * @param [in] index: output index
+     * @return element count of the output
+     */
+    size_t GetOutputElementCount(size_t index) const;
+
+    /**
+     * @brief Get input shape by index
+     * @param [in] index: input index
+     * @return shape of the input
+     */
+    std::vector<int64_t> GetInputShape(size_t index) const;
+
+    /**
+     * @brief Get output shape by index
+     * @param [in] index: output index
+     * @return shape of the output
+     */
+    std::vector<int64_t> GetOutputShape(size_t index) const;
+
+    /**
+     * @brief Get input buffer(host memory) by index
+     * @tparam T: data type
+     * @param [in] index: input index
+     * @return host address of the input
+     */
+    template <typename T> T *GetInputBuffer(size_t index)
+    {
+        if (index >= numInputs_) {
+            ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+            return nullptr;
+        }
+        return reinterpret_cast<T *>(hostInputs_[index]);
+    }
+
+    /**
+     * @brief Get output buffer(host memory) by index
+     * @tparam T: data type
+     * @param [in] index: output index
+     * @return host address of the output
+     */
+    template <typename T> const T *GetOutputBuffer(size_t index)
+    {
+        if (index >= numOutputs_) {
+            ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+            return nullptr;
+        }
+
+        return reinterpret_cast<T *>(hostOutputs_[index]);
+    }
+
+    /**
+     * @brief Print readable input by index
+     * @param [in] index: input index
+     * @param [in] elementsPerRow: number of elements per row
+     */
+    void PrintInput(size_t index, size_t elementsPerRow = 16);
+
+    /**
+     * @brief Print readable output by index
+     * @param [in] index: output index
+     * @param [in] elementsPerRow: number of elements per row
+     */
+    void PrintOutput(size_t index, size_t elementsPerRow = 16);
+
+    /**
+     * @brief Compile static op
+     * @return compile result
+     */
+    bool CompileStaticOp();
+
+    /**
+     * @brief Compile dynamic op
+     * @return compile result
+     */
+    bool CompileDynamicOp();
+
+    /**
+     * @brief Run op
+     * @return run result
+     */
+    bool RunOp(int64_t caseId);
+
+    /**
+     * @brief Get case index
+     * @return case index by user input
+     */
+    int64_t GetCaseId();
+
+private:
+    size_t numInputs_;
+    size_t numOutputs_;
+    void *workspace_;
+    int64_t caseId_;
+
+    std::vector<aclDataBuffer *> inputBuffers_;
+    std::vector<aclDataBuffer *> outputBuffers_;
+
+    std::vector<void *> devInputs_;
+    std::vector<void *> devOutputs_;
+
+    std::vector<void *> hostInputs_;
+    std::vector<void *> hostOutputs_;
+
+    std::vector<aclTensor *> inputTensor_;
+    std::vector<aclTensor *> outputTensor_;
+    OperatorDesc *opDesc_;
+};
+
+#endif // OP_RUNNER_H
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/operator_desc.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/operator_desc.h
new file mode 100644
index 000000000..cf02d7cec
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/inc/operator_desc.h
@@ -0,0 +1,57 @@
+/**
+ * @file operator_desc.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef OPERATOR_DESC_H
+#define OPERATOR_DESC_H
+
+#include <string>
+#include <vector>
+
+#include "acl/acl.h"
+
+/**
+ * Op description: the operator type plus its input and output tensor descriptions
+ */
+struct OperatorDesc {
+    /**
+     * Constructor
+     */
+    explicit OperatorDesc();
+
+    /**
+     * Destructor
+     */
+    virtual ~OperatorDesc();
+
+    /**
+     * Add an input tensor description
+     * @param [in] dataType: data type
+     * @param [in] numDims: number of dims
+     * @param [in] dims: dims
+     * @param [in] format: format
+     * @return OperatorDesc
+     */
+    OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);
+
+    /**
+     * Add an output tensor description
+     * @param [in] dataType: data type
+     * @param [in] numDims: number of dims
+     * @param [in] dims: dims
+     * @param [in] format: format
+     * @return OperatorDesc
+     */
+    OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);
+
+    std::string opType;
+    std::vector<aclTensorDesc *> inputDesc;
+    std::vector<aclTensorDesc *> outputDesc;
+};
+
+#endif // OPERATOR_DESC_H
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/run.sh
new file mode 100755
index 000000000..894fec61c
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/run.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+CURRENT_DIR=$(
+    cd $(dirname ${BASH_SOURCE:-$0})
+    pwd
+)
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+export DDK_PATH=$_ASCEND_INSTALL_PATH
+export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib
+
+function main {
+    # 1. Enter the script directory first, then clean leftover generated files and logs
+    cd "$CURRENT_DIR" || return 1
+    rm -rf "$HOME"/ascend/log/*
+    rm -rf ./input && mkdir -p ./input
+    rm -rf ./output && mkdir -p ./output
+
+    # 2. Generate input data and golden data
+    python3 scripts/gen_data.py
+    if [ $? -ne 0 ]; then
+        echo "ERROR: generate input data failed!"
+        return 1
+    fi
+    echo "INFO: generate input data success!"
+
+    # 3. Build the executable
+    cd "$CURRENT_DIR"
+    rm -rf build
+    mkdir -p build
+    cd build
+    cmake ../src -DCMAKE_SKIP_RPATH=TRUE
+    if [ $? -ne 0 ]; then
+        echo "ERROR: cmake failed!"
+        return 1
+    fi
+    echo "INFO: cmake success!"
+    make
+    if [ $? -ne 0 ]; then
+        echo "ERROR: make failed!"
+        return 1
+    fi
+    echo "INFO: make success!"
+
+    # 4. Run the executable under msprof from the output directory
+    export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH
+    export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+    cd "$CURRENT_DIR/output"
+    echo "INFO: execute op!"
+    msprof --application="./execute_add_op" --ai-core=on --l2=on --output=./prof
+    if [ $? -ne 0 ]; then
+        echo "ERROR: acl executable run failed! please check your project!"
+        return 1
+    fi
+    echo "INFO: acl executable run success!"
+
+    # 5. Precision check: chained with && so a failure of either output is reported
+    cd "$CURRENT_DIR"
+    python3 scripts/verify_result.py output/output_z_1.bin output/golden.bin &&
+        python3 scripts/verify_result.py output/output_z_2.bin output/golden.bin
+    if [ $? -ne 0 ]; then
+        echo "ERROR: verify result failed!"
+        return 1
+    fi
+}
+
+main
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/acl.json b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/acl.json
new file mode 100644
index 000000000..9e26dfeeb
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/acl.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/gen_data.py
new file mode 100644
index 000000000..17b3d7119
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/gen_data.py
@@ -0,0 +1,28 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import numpy as np
+
+
+def gen_golden_data_simple():
+    row = 5120
+    col = 5120
+    input_x = np.random.uniform(1, 10, [row, col]).astype(np.float32)
+    input_y = np.random.uniform(1, 10, [row, col * 3]).astype(np.float32)
+    y_blocks = np.split(input_y, 3, axis=1)
+    result_blocks = [input_x + block for block in y_blocks]
+    golden = np.hstack(result_blocks)
+    input_x.tofile("./input/input_x.bin")
+    input_y.tofile("./input/input_y.bin")
+    golden.tofile("./output/golden.bin")
+
+
+if __name__ == "__main__":
+    gen_golden_data_simple()
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/verify_result.py
new file mode 100644
index 000000000..a5019f30f
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/scripts/verify_result.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import sys
+import numpy as np
+
+# for float32
+relative_tol = 1e-4
+absolute_tol = 1e-5
+error_tol = 1e-4
+
+
+def verify_result(output, golden):
+    output = np.fromfile(output, dtype=np.float32).reshape(-1)
+    golden = np.fromfile(golden, dtype=np.float32).reshape(-1)
+    different_element_results = np.isclose(output,
+                                           golden,
+                                           rtol=relative_tol,
+                                           atol=absolute_tol,
+                                           equal_nan=True)
+    different_element_indexes = np.where(different_element_results == False)[0]
+    for index in range(len(different_element_indexes)):
+        real_index = different_element_indexes[index]
+        golden_data = golden[real_index]
+        output_data = output[real_index]
+        print(
+            "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" %
+            (real_index, golden_data, output_data,
+             abs(output_data - golden_data) / golden_data))
+        if index == 100:
+            break
+    error_ratio = float(different_element_indexes.size) / golden.size
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
+    return error_ratio <= error_tol
+
+
+if __name__ == '__main__':
+    try:
+        res = verify_result(sys.argv[1], sys.argv[2])
+        if not res:
+            raise ValueError("[ERROR] result error")
+        else:
+            print("test pass")
+    except Exception as e:
+        print(e)
+        sys.exit(1)
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt
new file mode 100644
index 000000000..32bed518d
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/CMakeLists.txt
@@ -0,0 +1,65 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+
+# CMake lowest version requirement
+cmake_minimum_required(VERSION 3.5.1)
+
+# project information
+project(acl_execute_add)
+
+# Compile options
+add_compile_options(-std=c++11)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output")
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output")
+
+set(INC_PATH $ENV{DDK_PATH})
+
+if (NOT DEFINED ENV{DDK_PATH})
+    set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest")
+    message(STATUS "set default INC_PATH: ${INC_PATH}")
+else ()
+    message(STATUS "env INC_PATH: ${INC_PATH}")
+endif()
+
+set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api")
+
+set(LIB_PATH $ENV{NPU_HOST_LIB})
+
+# Dynamic libraries in the stub directory can only be used for compilation
+if (NOT DEFINED ENV{NPU_HOST_LIB})
+    string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER)
+    set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib")
+    message(STATUS "set default LIB_PATH: ${LIB_PATH}")
+else ()
+    message(STATUS "env LIB_PATH: ${LIB_PATH}")
+endif()
+
+# Header path
+include_directories(
+    ../inc
+    ${INC_PATH}/include
+    ${CUST_PKG_PATH}/include
+)
+
+# add host lib path
+link_directories(
+    ${LIB_PATH}
+    ${CUST_PKG_PATH}/lib
+)
+
+add_executable(execute_add_op
+    operator_desc.cpp
+    op_runner.cpp
+    main.cpp
+    common.cpp
+)
+
+target_link_libraries(execute_add_op
+    ascendcl
+    cust_opapi
+    acl_op_compiler
+    nnopbase
+    stdc++
+)
+
+install(TARGETS execute_add_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/common.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/common.cpp
new file mode 100644
index 000000000..d58716122
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/common.cpp
@@ -0,0 +1,80 @@
+/**
+ * @file common.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "common.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <fstream>
+
+extern bool g_isDevice;
+
+// Reads the whole regular file at filePath into buffer (capacity bufferSize bytes); returns true on success.
+bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize)
+{
+    struct stat sBuf;
+    int fileStatus = stat(filePath.data(), &sBuf);
+    if (fileStatus == -1) {
+        ERROR_LOG("failed to get file %s", filePath.c_str());
+        return false;
+    }
+    if (S_ISREG(sBuf.st_mode) == 0) {
+        ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+        return false;
+    }
+
+    std::ifstream file(filePath, std::ios::binary); // closed by its destructor on every return path
+    if (!file.is_open()) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    std::filebuf *buf = file.rdbuf();
+    size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in);
+    if (size == 0) {
+        ERROR_LOG("file size is 0");
+        return false;
+    }
+    if (size > bufferSize) {
+        ERROR_LOG("file size is larger than buffer size");
+        return false;
+    }
+    buf->pubseekpos(0, std::ios::in);
+    if (buf->sgetn(static_cast<char *>(buffer), size) != static_cast<std::streamsize>(size)) {
+        ERROR_LOG("Read file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    (void)fileSize; // passed by value, so the byte count cannot be reported back to the caller
+    return true;
+}
+
+// Writes size bytes from buffer to filePath (created or truncated); returns true only if every byte was written.
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Write file failed. buffer is nullptr");
+        return false;
+    }
+    int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+    if (fd < 0) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    size_t done = 0; // bytes written so far; write() may accept fewer bytes than requested, hence the loop
+    for (ssize_t n = 1; done < size && n > 0; done += (n > 0) ? static_cast<size_t>(n) : 0) {
+        n = write(fd, static_cast<const char *>(buffer) + done, size - done);
+    }
+    (void)close(fd);
+    if (done != size) {
+        ERROR_LOG("Write file Failed.");
+    }
+    return done == size;
+}
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/main.cpp
new file mode 100644
index 000000000..d727b0a29
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/main.cpp
@@ -0,0 +1,163 @@
+/**
+ * @file main.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <cstdint>
+#include <iostream>
+#include "acl/acl.h"
+#include "common.h"
+#include "op_runner.h"
+
+bool g_isDevice = false;
+int deviceId = 0;
+
+OperatorDesc CreateOpDesc()
+{
+    // Describe the operator: float ND tensors x[ROW, COL], y[ROW, COL*3] and z[ROW, COL*3]
+    constexpr uint32_t ROW = 5120;
+    constexpr uint32_t COL = 5120;
+    std::vector<int64_t> shapeX{ROW, COL};
+    std::vector<int64_t> shapeY{ROW, COL*3};
+    std::vector<int64_t> shapeZ{ROW, COL*3};
+    aclDataType dataType = ACL_FLOAT;
+    aclFormat format = ACL_FORMAT_ND;
+    OperatorDesc opDesc;
+    opDesc.AddInputTensorDesc(dataType, shapeX.size(), shapeX.data(), format);  // input 0: x
+    opDesc.AddInputTensorDesc(dataType, shapeY.size(), shapeY.data(), format);  // input 1: y
+    opDesc.AddOutputTensorDesc(dataType, shapeZ.size(), shapeZ.data(), format); // output 0: z
+    return opDesc; // NOTE(review): OperatorDesc frees its descs in its destructor; confirm this by-value return cannot double-free
+}
+
+bool SetInputData(OpRunner &runner) // loads input_x.bin / input_y.bin into the runner's input buffers 0 and 1
+{
+    size_t fileSize = 0; // required by ReadFile's signature; not read back afterwards
+    bool ok = ReadFile("../input/input_x.bin", fileSize, runner.GetInputBuffer<void>(0), runner.GetInputSize(0)) &&
+              ReadFile("../input/input_y.bin", fileSize, runner.GetInputBuffer<void>(1), runner.GetInputSize(1));
+    if (ok) { INFO_LOG("Set input success"); } // on failure ReadFile has already logged the reason
+    return ok;
+}
+
+// Writes the runner's output buffer 0 to ../output/output_z_<caseId>.bin; returns false when the write fails.
+bool ProcessOutputData(OpRunner &runner)
+{
+    const std::string path = "../output/output_z_" + std::to_string(runner.GetCaseId()) + ".bin";
+    const bool ok = WriteFile(path, runner.GetOutputBuffer<void>(0), runner.GetOutputSize(0));
+    if (ok) { INFO_LOG("Write output success"); }
+    return ok;
+}
+
+void DestroyResource() // resets the device and finalizes ACL; both steps are always attempted
+{
+    const bool resetOk = (aclrtResetDevice(deviceId) == ACL_SUCCESS);
+    if (resetOk) {
+        INFO_LOG("Reset Device success");
+    } else {
+        ERROR_LOG("Reset device %d failed", deviceId);
+    }
+    const bool finalizeOk = (aclFinalize() == ACL_SUCCESS);
+    if (!finalizeOk) {
+        ERROR_LOG("Finalize acl failed");
+    }
+    if (resetOk && finalizeOk) {
+        INFO_LOG("Destroy resource success");
+    } else {
+        ERROR_LOG("Destroy resource failed");
+    }
+}
+
+// Initializes ACL from ../scripts/acl.json, selects deviceId and stores in g_isDevice whether the
+// process runs in ACL_DEVICE mode. Returns false (after undoing the completed steps) when any call fails.
+bool InitResource()
+{
+    // acl.json is dump or profiling config file
+    if (aclInit("../scripts/acl.json") != ACL_SUCCESS) {
+        ERROR_LOG("acl init failed");
+        return false;
+    }
+
+    if (aclrtSetDevice(deviceId) != ACL_SUCCESS) {
+        ERROR_LOG("Set device failed. deviceId is %d", deviceId);
+        (void)aclFinalize();
+        return false;
+    }
+    INFO_LOG("Set device[%d] success", deviceId);
+
+    // runMode is ACL_HOST which represents app is running in host
+    // runMode is ACL_DEVICE which represents app is running in device
+    aclrtRunMode runMode;
+    if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) {
+        ERROR_LOG("Get run mode failed");
+        DestroyResource();
+        return false;
+    }
+    g_isDevice = (runMode == ACL_DEVICE);
+    INFO_LOG("Get RunMode[%d] success", runMode);
+
+    return true;
+}
+
+bool RunOp(int64_t caseId)
+{
+    // Describe the operator's tensors
+    OperatorDesc desc = CreateOpDesc();
+
+    // Build a runner for that description and let it set itself up
+    OpRunner runner(&desc);
+    if (!runner.Init()) {
+        ERROR_LOG("Init OpRunner failed");
+        return false;
+    }
+
+    // Fill the runner's inputs from the input files
+    if (!SetInputData(runner)) {
+        ERROR_LOG("Set input data failed");
+        return false;
+    }
+
+    // Execute the operator for the requested case
+    if (!runner.RunOp(caseId)) {
+        ERROR_LOG("Run op failed");
+        return false;
+    }
+
+    // Persist the result
+    if (!ProcessOutputData(runner)) {
+        ERROR_LOG("Process output data failed");
+        return false;
+    }
+
+    INFO_LOG("Run op success");
+    return true;
+}
+
+// Entry point: initializes ACL, runs the operator once per case id (1, then 2) and releases the
+// ACL resources. Returns SUCCESS only when both cases run to completion; the first failing case
+// stops the run. The command-line arguments are not used.
+int main(int argc, char **argv)
+{
+    if (!InitResource()) {
+        ERROR_LOG("Init resource failed");
+        return FAILED;
+    }
+    INFO_LOG("Init resource success");
+
+    constexpr int64_t FIRST_CASE_ID = 1;
+    constexpr int64_t LAST_CASE_ID = 2;
+    for (int64_t caseId = FIRST_CASE_ID; caseId <= LAST_CASE_ID; ++caseId) {
+        if (!RunOp(caseId)) {
+            DestroyResource();
+            return FAILED;
+        }
+    }
+
+    DestroyResource();
+    return SUCCESS;
+}
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/op_runner.cpp
new file mode 100644
index 000000000..36d197bc5
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/op_runner.cpp
@@ -0,0 +1,462 @@
+/**
+ * @file op_runner.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "op_runner.h"
+
+#include <cassert>
+#include <limits>
+
+#include "acl/acl_op_compiler.h"
+#include "aclnn_add_custom.h"
+#include "common.h"
+
+using namespace std;
+
+extern bool g_isDevice;
+
+OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc)
+{
+    workspace_ = nullptr; // nothing is allocated here; RunOp allocates the workspace when one is needed
+    numOutputs_ = opDesc->outputDesc.size();
+    numInputs_ = opDesc->inputDesc.size();
+}
+
+// Releases the workspace, tensors, data buffers and device/staging memory held by this runner.
+OpRunner::~OpRunner()
+{
+    // Init() can fail part-way and leave these containers shorter than numInputs_/numOutputs_, so each
+    // loop walks the container itself instead of indexing it by the expected count.
+    if (workspace_ != nullptr) {
+        (void)aclrtFree(workspace_);
+    }
+
+    for (auto *tensor : inputTensor_) { (void)aclDestroyTensor(tensor); }
+    for (auto *tensor : outputTensor_) { (void)aclDestroyTensor(tensor); }
+    for (auto *buffer : inputBuffers_) { (void)aclDestroyDataBuffer(buffer); }
+    for (auto *buffer : outputBuffers_) { (void)aclDestroyDataBuffer(buffer); }
+    for (auto *devMem : devInputs_) { (void)aclrtFree(devMem); }
+    for (auto *devMem : devOutputs_) { (void)aclrtFree(devMem); }
+
+    // Staging buffers were obtained with aclrtMalloc when g_isDevice is set and with aclrtMallocHost otherwise.
+    auto freeStaging = [](void *mem) {
+        if (g_isDevice) {
+            (void)aclrtFree(mem);
+        } else {
+            (void)aclrtFreeHost(mem);
+        }
+    };
+    for (auto *hostMem : hostInputs_) { freeStaging(hostMem); }
+    for (auto *hostMem : hostOutputs_) { freeStaging(hostMem); }
+}
+
+bool OpRunner::Init() // allocates device and staging memory and creates an aclTensor for every input and output
+{
+    for (size_t i = 0; i < numInputs_; ++i) {
+        auto size = GetInputSize(i);
+        void *devMem = nullptr;
+        if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
+            ERROR_LOG("Malloc device memory for input[%zu] failed", i);
+            return false;
+        }
+        devInputs_.emplace_back(devMem);
+        inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size));
+        // Staging buffer: allocated with aclrtMalloc when g_isDevice is set, with aclrtMallocHost otherwise.
+        void *hostInput = nullptr;
+        if (g_isDevice) {
+            if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
+                ERROR_LOG("Malloc device memory for input[%zu] failed", i);
+                return false;
+            }
+        } else {
+            if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) {
+                ERROR_LOG("Malloc host memory for input[%zu] failed", i);
+                return false;
+            }
+        }
+        if (hostInput == nullptr) {
+            ERROR_LOG("Malloc memory for input[%zu] failed", i);
+            return false;
+        }
+        hostInputs_.emplace_back(hostInput);
+
+        aclTensor *inputTensor =
+            aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), nullptr, 0,
+                            GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]);
+        if (inputTensor == nullptr) {
+            ERROR_LOG("Create Tensor for input[%zu] failed", i);
+            return false;
+        }
+        inputTensor_.emplace_back(inputTensor);
+    }
+
+    for (size_t i = 0; i < numOutputs_; ++i) {
+        auto size = GetOutputSize(i);
+        void *devMem = nullptr;
+        if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
+            ERROR_LOG("Malloc device memory for output[%zu] failed", i);
+            return false;
+        }
+        devOutputs_.emplace_back(devMem);
+        outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size));
+        // Staging buffer for the result, allocated the same way as the input staging buffers above.
+        void *hostOutput = nullptr;
+        if (g_isDevice) {
+            if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
+                ERROR_LOG("Malloc device memory for output[%zu] failed", i);
+                return false;
+            }
+        } else {
+            if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) {
+                ERROR_LOG("Malloc host memory for output[%zu] failed", i);
+                return false;
+            }
+        }
+        if (hostOutput == nullptr) {
+            ERROR_LOG("Malloc host memory for output[%zu] failed", i);
+            return false;
+        }
+        hostOutputs_.emplace_back(hostOutput);
+
+        aclTensor *outputTensor =
+            aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), nullptr, 0,
+                            GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]);
+        if (outputTensor == nullptr) {
+            ERROR_LOG("Create Tensor for output[%zu] failed", i);
+            return false;
+        }
+        outputTensor_.emplace_back(outputTensor);
+    }
+
+    return true;
+}
+
+const size_t OpRunner::NumInputs() // number of input descriptions, as captured by the constructor
+{
+    return numInputs_;
+}
+
+const size_t OpRunner::NumOutputs() // number of output descriptions, as captured by the constructor
+{
+    return numOutputs_;
+}
+
+const size_t OpRunner::GetInputSize(size_t index) const
+{
+    if (index < numInputs_) {
+        return aclGetTensorDescSize(opDesc_->inputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to 0.
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return 0;
+}
+
+const size_t OpRunner::GetInputNumDims(size_t index) const
+{
+    if (index < numInputs_) {
+        return aclGetTensorDescNumDims(opDesc_->inputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to 0.
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return 0;
+}
+
+aclDataType OpRunner::GetInputDataType(size_t index) const
+{
+    if (index < numInputs_) {
+        return aclGetTensorDescType(opDesc_->inputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to the undefined type.
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return ACL_DT_UNDEFINED;
+}
+
+aclFormat OpRunner::GetInputFormat(size_t index) const
+{
+    if (index < numInputs_) {
+        return aclGetTensorDescFormat(opDesc_->inputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to the undefined format.
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return ACL_FORMAT_UNDEFINED;
+}
+
+std::vector<int64_t> OpRunner::GetInputShape(size_t index) const
+{
+    // Returns the dimensions of input `index`; empty when the index is invalid or a dimension query fails.
+    std::vector<int64_t> shape;
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return shape;
+    }
+    auto tensorDesc = opDesc_->inputDesc[index];
+    for (size_t dim = 0; dim < aclGetTensorDescNumDims(tensorDesc); ++dim) {
+        int64_t extent;
+        if (aclGetTensorDescDimV2(tensorDesc, dim, &extent) == ACL_SUCCESS) {
+            shape.emplace_back(extent);
+            continue;
+        }
+        ERROR_LOG("get dims from tensor desc failed. dims index = %zu", dim);
+        shape.clear();
+        return shape;
+    }
+    return shape;
+}
+
+size_t OpRunner::GetOutputSize(size_t index) const
+{
+    if (index < numOutputs_) {
+        return aclGetTensorDescSize(opDesc_->outputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to 0.
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return 0;
+}
+
+const size_t OpRunner::GetOutputNumDims(size_t index) const
+{
+    if (index < numOutputs_) {
+        return aclGetTensorDescNumDims(opDesc_->outputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to 0.
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return 0;
+}
+
+aclDataType OpRunner::GetOutputDataType(size_t index) const
+{
+    if (index < numOutputs_) {
+        return aclGetTensorDescType(opDesc_->outputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to the undefined type.
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return ACL_DT_UNDEFINED;
+}
+
+aclFormat OpRunner::GetOutputFormat(size_t index) const
+{
+    if (index < numOutputs_) {
+        return aclGetTensorDescFormat(opDesc_->outputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to the undefined format.
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return ACL_FORMAT_UNDEFINED;
+}
+
+std::vector<int64_t> OpRunner::GetOutputShape(size_t index) const
+{
+    std::vector<int64_t> shape; // stays empty when the index is invalid or a dimension query fails
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return shape;
+    }
+    auto tensorDesc = opDesc_->outputDesc[index];
+    for (size_t dim = 0; dim < aclGetTensorDescNumDims(tensorDesc); ++dim) {
+        int64_t extent;
+        if (aclGetTensorDescDimV2(tensorDesc, dim, &extent) == ACL_SUCCESS) {
+            shape.emplace_back(extent);
+            continue;
+        }
+        ERROR_LOG("get dims from tensor desc failed. dims index = %zu", dim);
+        shape.clear();
+        return shape;
+    }
+    return shape;
+}
+
+size_t OpRunner::GetInputElementCount(size_t index) const
+{
+    if (index < opDesc_->inputDesc.size()) {
+        return aclGetTensorDescElementCount(opDesc_->inputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to 0.
+    ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+    return 0;
+}
+
+size_t OpRunner::GetOutputElementCount(size_t index) const
+{
+    if (index < opDesc_->outputDesc.size()) {
+        return aclGetTensorDescElementCount(opDesc_->outputDesc[index]);
+    }
+    // Out-of-range index: report it and fall back to 0.
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+    return 0;
+}
+
+// Copies the staged inputs to the device, launches aclnnAddCustom for the given case on a temporary
+// stream, waits for completion and copies the outputs back. Returns false (after logging) on any failure.
+bool OpRunner::RunOp(int64_t caseId)
+{
+    caseId_ = caseId;
+    for (size_t i = 0; i < numInputs_; ++i) {
+        auto size = GetInputSize(i);
+        aclrtMemcpyKind kind = g_isDevice ? ACL_MEMCPY_DEVICE_TO_DEVICE : ACL_MEMCPY_HOST_TO_DEVICE;
+        if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) {
+            ERROR_LOG("Copy input[%zu] failed", i);
+            return false;
+        }
+        INFO_LOG("Copy input[%zu] success", i);
+    }
+
+    aclrtStream stream = nullptr;
+    if (aclrtCreateStream(&stream) != ACL_SUCCESS) {
+        ERROR_LOG("Create stream failed");
+        return false;
+    }
+    INFO_LOG("Create stream success");
+
+    size_t workspaceSize = 0;
+    aclOpExecutor *handle = nullptr;
+    auto ret = aclnnAddCustomGetWorkspaceSize(inputTensor_[0], inputTensor_[1], caseId, outputTensor_[0],
+                                              &workspaceSize, &handle);
+    if (ret != ACL_SUCCESS) {
+        (void)aclrtDestroyStream(stream);
+        ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast<int32_t>(ret));
+        return false;
+    }
+    INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %zu", workspaceSize);
+
+    // The operator must not be launched without the workspace it asked for, so an allocation failure is fatal.
+    if (workspaceSize != 0) {
+        if (aclrtMalloc(&workspace_, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
+            ERROR_LOG("Malloc device memory failed");
+            (void)aclrtDestroyStream(stream);
+            return false;
+        }
+    }
+
+    ret = aclnnAddCustom(workspace_, workspaceSize, handle, stream);
+    if (ret != ACL_SUCCESS) {
+        (void)aclrtDestroyStream(stream);
+        ERROR_LOG("Execute Operator failed. error code is %d", static_cast<int32_t>(ret));
+        return false;
+    }
+    INFO_LOG("Execute aclnnAddCustom success");
+
+    // The unit of 5000 is ms.
+    ret = aclrtSynchronizeStreamWithTimeout(stream, 5000);
+    if (ret != SUCCESS) {
+        ERROR_LOG("Synchronize stream failed. error code is %d", static_cast<int32_t>(ret));
+        (void)aclrtDestroyStream(stream);
+        return false;
+    }
+    INFO_LOG("Synchronize stream success");
+
+    for (size_t i = 0; i < numOutputs_; ++i) {
+        auto size = GetOutputSize(i);
+        aclrtMemcpyKind kind = g_isDevice ? ACL_MEMCPY_DEVICE_TO_DEVICE : ACL_MEMCPY_DEVICE_TO_HOST;
+        if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) {
+            ERROR_LOG("Copy output[%zu] failed", i);
+            (void)aclrtDestroyStream(stream);
+            return false;
+        }
+        INFO_LOG("Copy output[%zu] success", i);
+    }
+
+    (void)aclrtDestroyStream(stream);
+    return true;
+}
+
+int64_t OpRunner::GetCaseId() // case id stored by the most recent RunOp call
+{
+    return caseId_;
+}
+
+// Prints count elements of data in 10-character-wide columns, starting a new line after every elementsPerRow values.
+template <typename T> void DoPrintData(const T *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        // Unary plus promotes 8-bit integers to int so they print as numbers instead of raw characters.
+        std::cout << std::setw(10) << +data[i];
+        if (i % elementsPerRow == elementsPerRow - 1) { std::cout << std::endl; }
+    }
+}
+
+void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t pos = 0; pos < count; ++pos) {
+        std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[pos]);
+        if ((pos + 1) % elementsPerRow == 0) { // the element just printed closes a row
+            std::cout << std::endl;
+        }
+    }
+}
+
+void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow)
+{
+    if (data == nullptr) {
+        ERROR_LOG("Print data failed. data is nullptr");
+        return;
+    }
+    // Reinterpret the untyped buffer according to dataType and print it; unsupported types are only logged.
+    switch (dataType) {
+        case ACL_BOOL:
+            DoPrintData(reinterpret_cast<const bool *>(data), count, elementsPerRow);
+            break;
+        case ACL_INT8:
+            DoPrintData(reinterpret_cast<const int8_t *>(data), count, elementsPerRow);
+            break;
+        case ACL_UINT8:
+            DoPrintData(reinterpret_cast<const uint8_t *>(data), count, elementsPerRow);
+            break;
+        case ACL_INT16:
+            DoPrintData(reinterpret_cast<const int16_t *>(data), count, elementsPerRow);
+            break;
+        case ACL_UINT16:
+            DoPrintData(reinterpret_cast<const uint16_t *>(data), count, elementsPerRow);
+            break;
+        case ACL_INT32:
+            DoPrintData(reinterpret_cast<const int32_t *>(data), count, elementsPerRow);
+            break;
+        case ACL_UINT32:
+            DoPrintData(reinterpret_cast<const uint32_t *>(data), count, elementsPerRow);
+            break;
+        case ACL_INT64:
+            DoPrintData(reinterpret_cast<const int64_t *>(data), count, elementsPerRow);
+            break;
+        case ACL_UINT64:
+            DoPrintData(reinterpret_cast<const uint64_t *>(data), count, elementsPerRow);
+            break;
+        case ACL_FLOAT16: // half precision is converted with aclFloat16ToFloat before printing
+            DoPrintFp16Data(reinterpret_cast<const aclFloat16 *>(data), count, elementsPerRow);
+            break;
+        case ACL_FLOAT:
+            DoPrintData(reinterpret_cast<const float *>(data), count, elementsPerRow);
+            break;
+        case ACL_DOUBLE:
+            DoPrintData(reinterpret_cast<const double *>(data), count, elementsPerRow);
+            break;
+        default: // any other aclDataType is reported and nothing is printed
+            ERROR_LOG("Unsupported type: %d", dataType);
+    }
+}
+
+void OpRunner::PrintInput(size_t index, size_t numElementsPerRow)
+{
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return;
+    }
+    // Print the staged copy of the input, interpreted with the data type from its tensor description.
+    auto desc = opDesc_->inputDesc[index];
+    PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow);
+}
+
+void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow)
+{
+    // Only an in-range index is printed; anything else is reported and ignored.
+    if (index < numOutputs_) {
+        const auto elemType = aclGetTensorDescType(opDesc_->outputDesc[index]);
+        PrintData(hostOutputs_[index], GetOutputElementCount(index), elemType, numElementsPerRow);
+        return;
+    }
+    ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+}
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp
new file mode 100644
index 000000000..90e0ac343
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AclNNInvocation/src/operator_desc.cpp
@@ -0,0 +1,51 @@
+/**
+ * @file operator_desc.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "operator_desc.h"
+
+#include "common.h"
+
+using namespace std;
+
+OperatorDesc::OperatorDesc() {} // starts empty; descriptions are added through AddInputTensorDesc/AddOutputTensorDesc
+
+OperatorDesc::~OperatorDesc()
+{
+    // Destroy every tensor description this object collected: inputs first, then outputs.
+    for (size_t i = 0; i < inputDesc.size(); ++i) {
+        aclDestroyTensorDesc(inputDesc[i]);
+    }
+    for (size_t i = 0; i < outputDesc.size(); ++i) {
+        aclDestroyTensorDesc(outputDesc[i]);
+    }
+}
+
+OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format)
+{
+    aclTensorDesc *tensorDesc = aclCreateTensorDesc(dataType, numDims, dims, format);
+    if (tensorDesc != nullptr) {
+        inputDesc.emplace_back(tensorDesc);
+    } else {
+        ERROR_LOG("create tensor failed");
+    }
+    return *this; // returned on success and on failure alike, so calls can be chained
+}
+
+OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims,
+                                                aclFormat format)
+{
+    // Create the description and keep it only when creation succeeded; *this is returned either way.
+    aclTensorDesc *tensorDesc = aclCreateTensorDesc(dataType, numDims, dims, format);
+    if (tensorDesc != nullptr) {
+        outputDesc.emplace_back(tensorDesc);
+    } else {
+        ERROR_LOG("create tensor failed");
+    }
+    return *this;
+}
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom.json b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom.json
new file mode 100644
index 000000000..b76e8928f
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom.json
@@ -0,0 +1,47 @@
+[
+    {
+        "op": "AddCustom",
+        "language": "cpp",
+        "input_desc": [
+            {
+                "name": "x",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float"
+                ]
+            },
+            {
+                "name": "y",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float"
+                ]
+            }
+        ],
+        "output_desc": [
+            {
+                "name": "z",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float"
+                ]
+            }
+        ],
+        "attr": [
+            {
+                "name": "case_id",
+                "type": "int",
+                "value": 1
+            }
+        ]
+    }
+]
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_host/add_custom.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_host/add_custom.cpp
new file mode 100644
index 000000000..b9cb652e0
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_host/add_custom.cpp
@@ -0,0 +1,49 @@
+/**
+ * @file add_custom.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "../op_kernel/add_custom_tiling.h"
+#include "register/op_def_registry.h"
+
+namespace optiling {
+// Host-side tiling: fixes the block dimension, uses attribute 0 (case_id) as the tiling key and fills
+// AddCustomTilingData. Returns GRAPH_FAILED instead of dereferencing a null pointer handed back by the context.
+static ge::graphStatus TilingFunc(gert::TilingContext *context)
+{
+    constexpr uint32_t BLOCK_DIM = 40;
+    context->SetBlockDim(BLOCK_DIM);
+    auto attrs = context->GetAttrs();
+    const int64_t *caseId = (attrs == nullptr) ? nullptr : attrs->GetInt(0);
+    AddCustomTilingData *tiling = context->GetTilingData<AddCustomTilingData>();
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    if (caseId == nullptr || tiling == nullptr || currentWorkspace == nullptr) {
+        return ge::GRAPH_FAILED;
+    }
+    // set tiling_key
+    context->SetTilingKey(*caseId);
+    // x shape is [5120, 5120], y shape is [5120, 15360], so we set outer loop to 3
+    tiling->loopOuter = 3U;
+    currentWorkspace[0] = 0; // set workspace size
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+namespace ops {
+class AddCustom : public OpDef { // operator prototype: required float ND inputs x and y, required float ND output z
+public:
+    explicit AddCustom(const char *name) : OpDef(name)
+    {
+        this->Input("x").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND});
+        this->Input("y").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND});
+        this->Output("z").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND});
+        this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b"); // tiling callback and SoC config
+        this->Attr("case_id").Int(1); // integer attribute read by TilingFunc as attribute 0; presumably defaults to 1 - confirm
+    }
+};
+OP_ADD(AddCustom);
+} // namespace ops
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp
new file mode 100644
index 000000000..895e6444f
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom.cpp
@@ -0,0 +1,28 @@
+/**
+ * @file add_custom.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "kernel_operator.h"
+#include "add_custom_v1.h"
+#include "add_custom_v2.h"
+
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
+{ // Kernel entry: picks the KernelAddV1 or KernelAddV2 implementation according to the tiling key.
+    REGISTER_TILING_DEFAULT(AddCustomTilingData); // presumably declares AddCustomTilingData as the tiling struct - confirm
+    GET_TILING_DATA(tilingData, tiling);          // presumably materializes tilingData from the tiling address - confirm
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+    if (TILING_KEY_IS(1UL)) { // tiling key 1 -> KernelAddV1
+        KernelAddV1 op;
+        op.Init(x, y, z, &tilingData);
+        op.Process();
+    } else if (TILING_KEY_IS(2UL)) { // tiling key 2 -> KernelAddV2
+        KernelAddV2 op;
+        op.Init(x, y, z, &tilingData);
+        op.Process();
+    } // any other tiling key falls through without doing any work
+}
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h
new file mode 100644
index 000000000..d865aba89
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_tiling.h
@@ -0,0 +1,18 @@
+/**
+ * @file add_custom_tiling.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef ADD_CUSTOM_TILING_H
+#define ADD_CUSTOM_TILING_H
+#include <cstdint>
+
+class AddCustomTilingData { // tiling parameters filled by the host tiling function and read by the kernels
+public:
+    uint32_t loopOuter; // outer-loop trip count; the host tiling function sets it to 3
+};
+#endif // ADD_CUSTOM_TILING_H
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h
new file mode 100644
index 000000000..086bca4f0
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v1.h
@@ -0,0 +1,102 @@
+/**
+ * @file add_custom_v1.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef ADD_CUSTOM_V1_H
+#define ADD_CUSTOM_V1_H
+#include "kernel_operator.h"
+#include "add_custom_tiling.h"
+
+using AscendC::TPosition;
+class KernelAddV1 { // baseline tiled float add (z = x + y) on TILE_M x TILE_N tiles; no L2 cache hint is set on any buffer
+public:
+    __aicore__ inline KernelAddV1() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData *tilingPtr) // binds the GM buffers at a per-block offset and allocates the three queues
+    {
+        tiling = tilingPtr; // pointer is stored, not copied; it must stay valid while Process runs
+        xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); // each block starts TILE_N floats after the previous block
+        yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * TILE_N);
+        zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); // BUFFER_NUM buffers, each sized for one full tile
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_M * TILE_N * sizeof(float));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float));
+    }
+    __aicore__ inline void Process() // runs CopyIn -> Compute -> CopyOut once per (i, j) tile
+    {
+        for (uint32_t i = 0; i < tiling->loopOuter; i++) { // i shifts the y/z column window by N_A floats
+            for (uint32_t j = 0; j < M_A / TILE_M; j++) { // j walks the M_A rows in steps of TILE_M
+                CopyIn(i, j);
+                Compute();
+                CopyOut(i, j);
+            }
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(uint32_t progressOuter, uint32_t progressInner) // loads one x tile and one y tile from GM and enqueues them
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
+        AscendC::DataCopyParams paramsX;
+        paramsX.blockCount = TILE_M; // one contiguous chunk per tile row
+        paramsX.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; // chunk length in BLOCK_SIZE-byte units
+        paramsX.srcStride = (N_A - TILE_N) * sizeof(float) / BLOCK_SIZE; // rest of an N_A-wide source row, in BLOCK_SIZE-byte units
+        paramsX.dstStride = 0; // no gap between rows in the local tile
+        AscendC::DataCopy(xLocal, xGm[progressInner * TILE_M * N_A], paramsX); // x offset ignores progressOuter, so the same x tiles are read again for every outer iteration
+
+        AscendC::DataCopyParams paramsY;
+        paramsY.blockCount = TILE_M;
+        paramsY.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE;
+        paramsY.srcStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; // y rows are N_B floats wide
+        paramsY.dstStride = 0;
+        AscendC::DataCopy(yLocal, yGm[progressOuter * N_A + progressInner * TILE_M * N_B], paramsY); // column window selected by progressOuter, row band by progressInner
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute() // consumes one queued x/y tile pair and enqueues the resulting z tile
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
+        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
+        AscendC::Add(zLocal, xLocal, yLocal, TILE_M * TILE_N); // zLocal = xLocal + yLocal over the whole tile
+        outQueueZ.EnQue<float>(zLocal);
+        inQueueX.FreeTensor(xLocal); // input buffers go back to their queues for reuse
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progressOuter, int32_t progressInner) // NOTE(review): parameters are int32_t here but uint32_t in CopyIn; Process passes uint32_t values
+    {
+        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
+        AscendC::DataCopyParams paramsZ;
+        paramsZ.blockCount = TILE_M;
+        paramsZ.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE;
+        paramsZ.srcStride = 0; // local tile rows are packed back to back
+        paramsZ.dstStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; // z rows are N_B floats wide, same layout as y
+        AscendC::DataCopy(zGm[progressOuter * N_A + progressInner * TILE_M * N_B], zLocal, paramsZ); // same offset formula as the y load in CopyIn
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    static constexpr int32_t BUFFER_NUM = 2; // buffers per queue
+    static constexpr int32_t BLOCK_SIZE = 32; // byte unit used for the DataCopyParams lengths and strides above
+    static constexpr uint32_t M_A = 5120U; // row count iterated by Process
+    static constexpr uint32_t N_A = M_A; // row width of x in floats
+    static constexpr uint32_t M_B = M_A; // not referenced elsewhere in this class
+    static constexpr uint32_t N_B = N_A * 3U; // row width of y and z in floats
+    static constexpr uint32_t TILE_M = 64U; // tile height in rows
+    static constexpr uint32_t TILE_N = 128U; // tile width in floats; also the per-block column offset used in Init
+
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueY;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<float> xGm;
+    AscendC::GlobalTensor<float> yGm;
+    AscendC::GlobalTensor<float> zGm;
+    AddCustomTilingData *tiling; // set in Init; only loopOuter is read
+};
+#endif // ADD_CUSTOM_V1_H
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h
new file mode 100644
index 000000000..1f790e84d
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/AddCustom/op_kernel/add_custom_v2.h
@@ -0,0 +1,105 @@
+/**
+ * @file add_custom_v2.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef ADD_CUSTOM_V2_H
+#define ADD_CUSTOM_V2_H
+#include "kernel_operator.h"
+#include "add_custom_tiling.h"
+
+using AscendC::TPosition;
+class KernelAddV2 { // same tiled float add as KernelAddV1, except that Init sets the CACHE_MODE_DISABLE L2 cache hint on y and z
+public:
+    __aicore__ inline KernelAddV2() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData *tilingPtr) // binds the GM buffers at a per-block offset, sets cache hints and allocates the three queues
+    {
+        tiling = tilingPtr; // pointer is stored, not copied; it must stay valid while Process runs
+        xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); // each block starts TILE_N floats after the previous block
+        yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * TILE_N);
+        zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N);
+        // disable the l2 cache mode of y and z
+        yGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); // x keeps the default cache mode; it is the only operand read again across outer iterations
+        zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); // BUFFER_NUM buffers, each sized for one full tile
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_M * TILE_N * sizeof(float));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float));
+    }
+    __aicore__ inline void Process() // runs CopyIn -> Compute -> CopyOut once per (i, j) tile
+    {
+        for (uint32_t i = 0; i < tiling->loopOuter; i++) { // i shifts the y/z column window by N_A floats
+            for (uint32_t j = 0; j < M_A / TILE_M; j++) { // j walks the M_A rows in steps of TILE_M
+                CopyIn(i, j);
+                Compute();
+                CopyOut(i, j);
+            }
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(uint32_t progressOuter, uint32_t progressInner) // loads one x tile and one y tile from GM and enqueues them
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
+        AscendC::DataCopyParams paramsX;
+        paramsX.blockCount = TILE_M; // one contiguous chunk per tile row
+        paramsX.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; // chunk length in BLOCK_SIZE-byte units
+        paramsX.srcStride = (N_A - TILE_N) * sizeof(float) / BLOCK_SIZE; // rest of an N_A-wide source row, in BLOCK_SIZE-byte units
+        paramsX.dstStride = 0; // no gap between rows in the local tile
+        AscendC::DataCopy(xLocal, xGm[progressInner * TILE_M * N_A], paramsX); // x offset ignores progressOuter, so the same x tiles are read again for every outer iteration
+
+        AscendC::DataCopyParams paramsY;
+        paramsY.blockCount = TILE_M;
+        paramsY.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE;
+        paramsY.srcStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; // y rows are N_B floats wide
+        paramsY.dstStride = 0;
+        AscendC::DataCopy(yLocal, yGm[progressOuter * N_A + progressInner * TILE_M * N_B], paramsY); // column window selected by progressOuter, row band by progressInner
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute() // consumes one queued x/y tile pair and enqueues the resulting z tile
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
+        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
+        AscendC::Add(zLocal, xLocal, yLocal, TILE_M * TILE_N); // zLocal = xLocal + yLocal over the whole tile
+        outQueueZ.EnQue<float>(zLocal);
+        inQueueX.FreeTensor(xLocal); // input buffers go back to their queues for reuse
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progressOuter, int32_t progressInner) // NOTE(review): parameters are int32_t here but uint32_t in CopyIn; Process passes uint32_t values
+    {
+        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
+        AscendC::DataCopyParams paramsZ;
+        paramsZ.blockCount = TILE_M;
+        paramsZ.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE;
+        paramsZ.srcStride = 0; // local tile rows are packed back to back
+        paramsZ.dstStride = (N_B - TILE_N) * sizeof(float) / BLOCK_SIZE; // z rows are N_B floats wide, same layout as y
+        AscendC::DataCopy(zGm[progressOuter * N_A + progressInner * TILE_M * N_B], zLocal, paramsZ); // same offset formula as the y load in CopyIn
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    static constexpr int32_t BUFFER_NUM = 2; // buffers per queue
+    static constexpr int32_t BLOCK_SIZE = 32; // byte unit used for the DataCopyParams lengths and strides above
+    static constexpr uint32_t M_A = 5120U; // row count iterated by Process
+    static constexpr uint32_t N_A = M_A; // row width of x in floats
+    static constexpr uint32_t M_B = M_A; // not referenced elsewhere in this class
+    static constexpr uint32_t N_B = N_A * 3U; // row width of y and z in floats
+    static constexpr uint32_t TILE_M = 64U; // tile height in rows
+    static constexpr uint32_t TILE_N = 128U; // tile width in floats; also the per-block column offset used in Init
+
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueY;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<float> xGm;
+    AscendC::GlobalTensor<float> yGm;
+    AscendC::GlobalTensor<float> zGm;
+    AddCustomTilingData *tiling; // set in Init; only loopOuter is read
+};
+#endif // ADD_CUSTOM_V2_H
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md b/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md
new file mode 100644
index 000000000..22f239d00
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/README.md
@@ -0,0 +1,162 @@
+## 概述
+
+本样例基于AddCustom算子工程，介绍了设置L2 CacheMode的方法以及其影响场景。
+
+## 目录结构介绍
+
+```
+├── l2_cache_bypass            // L2 CacheMode样例工程目录
+│   ├── AclNNInvocation        // 通过单算子API调用的方式调用AddCustom算子
+│   ├── AddCustom              // AddCustom算子工程
+│   ├── AddCustom.json         // AddCustom算子的原型定义json文件
+│   └── install.sh             // 脚本，调用msOpGen生成自定义算子工程，并编译
+```
+
+## 算子描述
+
+Add算子实现了两个Shape不相同的Tensor相加，返回相加结果的功能。对应的数学表达式为：
+
+```
+z = x + y
+```
+
+本样例主要介绍数据搬运中设置合理CacheMode对搬运效率的影响，在Global Memory的数据访问中，如果数据只需要访问一次，后续不需要重复读取，那么这种场景下可以设置Global Memory的CacheMode为CACHE_MODE_DISABLE，在这种模式下数据访问将不经过L2 Cache，避免影响需要重复访问的数据，从而提升数据访问效率。
+
+本样例中共有2个实现版本：     
+add_custom_v1.h：基础版本，从列方向切分，每个核计算5120×128的数据量，共有40个核参与计算。           
+add_custom_v2.h：在add_custom_v1基础上，设置y/z的CacheMode为CACHE_MODE_DISABLE，避免替换已进入Cache的x数据，影响搬运效率。             
+
+## 算子规格描述
+
+<table>
+<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">Add</td></tr>
+</tr>
+<tr><td rowspan="3" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
+<tr><td align="center">x</td><td align="center">5120 * 5120</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">y</td><td align="center">5120 * 15360</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+</tr>
+<tr><td rowspan="1" align="center">算子输出</td><td align="center">z</td><td align="center">5120 * 15360</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">add_custom</td></tr>
+</table>
+
+## 支持的产品型号
+
+本样例支持如下产品型号：
+
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+## 算子工程介绍
+
+其中，算子工程目录AddCustom包含算子的实现文件，如下所示：
+
+```
+├── AddCustom                // AddCustom自定义算子工程
+│   ├── op_host              // host侧实现文件
+│   └── op_kernel            // kernel侧实现文件
+```
+
+CANN软件包中提供了工程创建工具msOpGen，AddCustom算子工程可通过AddCustom.json自动创建，自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。
+
+创建完自定义算子工程后，开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程，本样例已在AddCustom目录中准备好了必要的算子实现，install.sh脚本会创建一个CustomOp目录，并将算子实现文件复制到对应目录下，再编译算子。
+
+备注：CustomOp目录为生成目录，每次执行install.sh脚本都会删除该目录并重新生成，切勿在该目录下编码算子，会存在丢失风险。
+
+## 编译运行样例算子
+
+针对自定义算子工程，编译运行包含如下步骤：
+
+- 调用msOpGen工具生成自定义算子工程；
+- 完成算子host和kernel实现；
+- 编译自定义算子工程生成自定义算子包；
+- 安装自定义算子包到自定义算子库中；
+- 调用执行自定义算子；
+
+详细操作如下所示。
+
+### 1. 获取源码包
+
+编译运行此样例前，请参考[准备：获取样例代码](../README.md#codeready)获取源码包。
+
+### 2. 生成自定义算子工程，复制host和kernel实现并编译算子<a name="operatorcompile"></a>
+
+- 切换到msOpGen脚本install.sh所在目录
+
+  ```bash
+  # 若开发者以git命令行方式clone了master分支代码，并切换目录
+  cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/12_l2_cache_bypass/
+  ```
+- 调用脚本，生成自定义算子工程，复制host和kernel实现并编译算子
+
+  - 方式一：配置环境变量运行脚本
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量命令。
+    - 默认路径，root用户安装CANN软件包
+
+      ```bash
+      export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+      ```
+    - 默认路径，非root用户安装CANN软件包
+
+      ```bash
+      export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+      ```
+    - 指定路径install_path，安装CANN软件包
+
+      ```bash
+      export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+      ```
+
+      运行install.sh脚本
+
+      ```bash
+      bash install.sh -v [SOC_VERSION]
+      ```
+  - 方式二：指定命令行安装路径来运行脚本
+    ```bash
+    bash install.sh -v [SOC_VERSION] -i [ASCEND_INSTALL_PATH]
+    ```
+
+  参数说明：
+
+  - SOC_VERSION：昇腾AI处理器型号，如果无法确定具体的[SOC_VERSION]，则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询，在查询到的“Name”前增加Ascend信息，例如“Name”对应取值为xxxyy，实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号：
+    - Atlas A2训练系列产品/Atlas 800I A2推理产品
+  - ASCEND_INSTALL_PATH：CANN软件包安装路径
+
+  脚本运行成功后，会在当前目录下创建CustomOp目录，编译完成后，会在CustomOp/build_out中，生成自定义算子安装包custom_opp_\<target os>_\<target architecture>.run，例如“custom_opp_ubuntu_x86_64.run”。
+
+### 3. 部署自定义算子包
+
+- 部署自定义算子包前，请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH
+
+  ```bash
+  echo $ASCEND_OPP_PATH
+  # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp
+
+  # 若没有，则需导出CANN环境变量
+  source [ASCEND_INSTALL_PATH]/bin/setenv.bash
+  # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
+  ```
+
+  参数说明：
+
+  - ASCEND_INSTALL_PATH：CANN软件包安装路径，一般和上一步中指定的路径保持一致
+- 在自定义算子安装包所在路径下，执行如下命令安装自定义算子包
+
+  ```bash
+  cd CustomOp/build_out
+  ./custom_opp_<target os>_<target architecture>.run
+  ```
+
+  命令执行成功后，自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的vendors/customize目录中。
+
+### 4. 调用执行算子工程
+
+- [单算子API调用AddCustom算子工程](./AclNNInvocation/README.md)
+
+## 更新说明
+
+
+| 时间       | 更新事项 |
+| ---------- | -------- |
+| 2025/07/14 | 新增样例 |
diff --git a/operator/ascendc/4_best_practices/12_l2_cache_bypass/install.sh b/operator/ascendc/4_best_practices/12_l2_cache_bypass/install.sh
new file mode 100755
index 000000000..09c8bf0aa
--- /dev/null
+++ b/operator/ascendc/4_best_practices/12_l2_cache_bypass/install.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+SHORT=v:,i:, # short options -v and -i, each followed by a colon so that it takes a value
+LONG=soc-version:,install-path:, # long-form equivalents of -v and -i
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS" # replace the positional parameters with getopt's normalized output
+
+while :; do
+    case "$1" in
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break # NOTE(review): this only leaves the parsing loop; the script keeps running afterwards
+        ;;
+    esac
+done
+
+# only Ascend910B2 is supported, since the cache size differs between SoC versions
+VERSION_LIST="Ascend910B2"
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then # space-padded substring match against the supported list
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then # precedence: ASCEND_INSTALL_PATH (-i or environment), then ASCEND_HOME_PATH, then the default install locations
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then # per-user install is preferred over the system-wide one
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH
+
+OP_NAME=AddCustom
+# Generate the op framework
+rm -rf CustomOp && msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp
+# Copy op implementation files to CustomOp
+rm -rf CustomOp/op_host/*.cpp
+rm -rf CustomOp/op_kernel/*.h && rm -rf CustomOp/op_kernel/*.cpp
+cp -rf $OP_NAME/op_kernel CustomOp/
+cp -rf $OP_NAME/op_host CustomOp/
+
+# Build CustomOp project
+(cd CustomOp && bash build.sh) # subshell keeps the caller's working directory unchanged
\ No newline at end of file
diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md
index 1ebba2146..bd20372ab 100644
--- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md
+++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md
@@ -21,10 +21,11 @@ z = x + 2.0
 ```
 
 本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响，在Global Memory的数据访问中，数据访问请求(读/写)在AI 处理器内部会按照512 Bytes对齐进行地址转换，同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内，出于数据一致性的要求，AI 处理器会对落入同一个512Bytes范围内的请求进行串行处理，导致搬运效率降低，即发生了同地址访问现象。
-本样例中共有3个实现版本：
-adds_custom_v1.h：基础实现版本，每个核的计算顺序一致，存在同地址冲突，带宽效率较差
-adds_custom_v2.h：通过调整每个核的计算顺序，避免发生同地址冲突
-adds_custom_v3.h：通过调整切分顺序，避免发生同地址冲突
+
+本样例中共有3个实现版本：     
+adds_custom_v1.h：基础实现版本，每个核的计算顺序一致，存在同地址冲突，带宽效率较差。    
+adds_custom_v2.h：通过调整每个核的计算顺序，避免发生同地址冲突。   
+adds_custom_v3.h：通过调整切分顺序，避免发生同地址冲突。
 
 当前算子执行机制保证用户kernel入参（包括workspace/tiling）的地址是512 Bytes对齐的，因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。
 
diff --git a/operator/ascendc/4_best_practices/README.md b/operator/ascendc/4_best_practices/README.md
index c40fe61a7..926e4a6ef 100644
--- a/operator/ascendc/4_best_practices/README.md
+++ b/operator/ascendc/4_best_practices/README.md
@@ -8,6 +8,7 @@
 | ------------------------------- | ------------------------------------------ | ------------------------------------------ |
 | [4_bank_conflict](./4_bank_conflict) | 基于Ascend C的bank冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
 | [6_group_matmul](./6_group_matmul) | 基于Ascend C的group matmul算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
+| [12_l2_cache_bypass](./12_l2_cache_bypass) | 基于Ascend C的L2 CacheMode算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
 | [15_mata_address_conflict](./15_mata_address_conflict) | 基于Ascend C的同地址冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 |
 | [21_all_gather_matmul_custom](./21_all_gather_matmul_custom) | 基于Ascend C的AllGatherMatmul算子性能调优样例 | Atlas A2训练系列产品 |
 | [22_matmul_reduce_scatter_custom](./22_matmul_reduce_scatter_custom) | 基于Ascend C的MatmulReduceScatter算子性能调优样例 | Atlas A2训练系列产品 |
@@ -45,6 +46,8 @@
 ## 更新说明
 | 时间       | 更新事项                                     |
 | ---------- | -------------------------------------------- |
+| 2025/07/14 | 新增12_l2_cache_bypass样例         |
+| 2025/07/03 | 新增15_mata_address_conflict样例         |
 | 2025/07/01 | 新增4_bank_conflict样例         |
 | 2024/12/19 | 新增23_matmul_all_reduce_custom样例         |
 | 2024/12/19 | 新增22_matmul_reduce_scatter_custom样例         |
-- 
Gitee


From 435971102b39845b258f842b670c3744bd9f8af6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B5=B5=E6=99=BA=E6=85=A7?= <zhaozhihui5@huawei.com>
Date: Wed, 16 Jul 2025 10:10:16 +0000
Subject: [PATCH 41/46] =?UTF-8?q?!2720=20fix=20comment=20Merge=20pull=20re?=
 =?UTF-8?q?quest=20!2720=20from=20=E8=B5=B5=E6=99=BA=E6=85=A7/zzh=5F0716?=
 =?UTF-8?q?=5Ffix=5Fcomment?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../11_llm_data_dist/decoder_sample2.cpp                  | 8 ++++----
 .../level1_single_api/11_llm_data_dist/prompt_sample2.cpp | 6 +++---
 cplusplus/level1_single_api/11_llm_data_dist/readme.md    | 6 ++++--
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/cplusplus/level1_single_api/11_llm_data_dist/decoder_sample2.cpp b/cplusplus/level1_single_api/11_llm_data_dist/decoder_sample2.cpp
index c4a186e96..909be6ddd 100644
--- a/cplusplus/level1_single_api/11_llm_data_dist/decoder_sample2.cpp
+++ b/cplusplus/level1_single_api/11_llm_data_dist/decoder_sample2.cpp
@@ -222,7 +222,7 @@ int32_t RunDecoderSample(const char *deviceId, const char *localIp, const char *
         printf("[INFO] Tensor[%zu] addr = %p\n", i, reinterpret_cast<void *>(tensorAddrs[i]));
     }
 
-    // 4. 等待Prompt写完cache，实际业务场景可通过合适方式实现通知
+    // 4. 等待prompt写完cache，实际业务场景可通过合适方式实现通知
     std::this_thread::sleep_for(std::chrono::seconds(WAIT_PROMPT_TIME));
 
     // 5. 与prompt建链
@@ -232,7 +232,7 @@ int32_t RunDecoderSample(const char *deviceId, const char *localIp, const char *
     }
     linked = true;
 
-    // 6. 从prompt拉取Cache
+    // 6. 从prompt拉取cache
     if (PullCache(llmDataDist, cacheId) != 0) {
         Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
         return -1;
@@ -256,7 +256,7 @@ int32_t RunDecoderSample(const char *deviceId, const char *localIp, const char *
         return -1;
     }
 
-    // 9. 等待Prompt push cache，实际业务场景可通过合适方式实现通知
+    // 9. 等待prompt push cache，实际业务场景可通过合适方式实现通知
     std::this_thread::sleep_for(std::chrono::seconds(30));
 
     if (CheckBuffers(buffers, {4, 5, 6, 7}) != 0) {
@@ -264,7 +264,7 @@ int32_t RunDecoderSample(const char *deviceId, const char *localIp, const char *
         return -1;
     }
 
-    // 10. 释放Cache与LlmDatadist
+    // 10. 释放cache与llmDataDist
     llmDataDist.Finalize();
     printf("[INFO] Finalize success\n");
     printf("[INFO] Decoder Sample end\n");
diff --git a/cplusplus/level1_single_api/11_llm_data_dist/prompt_sample2.cpp b/cplusplus/level1_single_api/11_llm_data_dist/prompt_sample2.cpp
index 033463d71..52abdafc4 100644
--- a/cplusplus/level1_single_api/11_llm_data_dist/prompt_sample2.cpp
+++ b/cplusplus/level1_single_api/11_llm_data_dist/prompt_sample2.cpp
@@ -218,7 +218,7 @@ int32_t RunPromptSample(const char *deviceId, const char *localIp, const char *r
         printf("[INFO] Tensor[%zu] addr = %p\n", i, reinterpret_cast<void *>(tensorAddrs[i]));
     }
 
-    // 4. 等待Decoder拉取cache
+    // 4. 等待decoder拉取cache
     std::this_thread::sleep_for(std::chrono::seconds(WAIT_TIME));
 
     // 5. 切换角色
@@ -234,13 +234,13 @@ int32_t RunPromptSample(const char *deviceId, const char *localIp, const char *r
     }
     linked = true;
 
-    // 7. 与decoder建链
+    // 7. 向decoder push cache
     if (PushCache(llmDataDist, cacheId) != 0) {
         Finalize(llmDataDist, cacheId, linked, remoteIp, buffers);
         return -1;
     }
 
-    // 8. 释放Cache与LlmDatadist
+    // 8. 释放cache与llmDataDist
     llmDataDist.Finalize();
     printf("[INFO] Finalize success\n");
     printf("[INFO] Prompt Sample end\n");
diff --git a/cplusplus/level1_single_api/11_llm_data_dist/readme.md b/cplusplus/level1_single_api/11_llm_data_dist/readme.md
index 1a9cf9f80..9c5546e3a 100644
--- a/cplusplus/level1_single_api/11_llm_data_dist/readme.md
+++ b/cplusplus/level1_single_api/11_llm_data_dist/readme.md
@@ -85,13 +85,15 @@
 
     3.2 执行sample2
 
+    此样例使用了单边操作的方式输出kv, p/d两侧注册kv后，decoder向prompt发起建链，然后pull kv，然后两侧切换角色，prompt向decoder发起建链，并向decoder push kv。
+
     - 执行prompt_sample2, 参数为device_id、local_host_ip和remote_host_ip, 其中device_id为prompt要使用的device_id, local_host_ip为prompt所在host的ip, remote_host_ip为decoder所在host的ip，如:
         ```
+        LOCAL_COMM_RES='{"status": "completed", "version": "1.0", "server_list": [{"server_id": "node_1", "device": [{"device_id": "0", "device_ip": "10.10.10.1"}]}]}' ./prompt_sample2 0 10.170.10.1 10.170.10.2
+        LOCAL_COMM_RES='{"status": "completed", "version": "1.0", "server_list": [{"server_id": "node_1", "device": [{"device_id": "0", "device_ip": "10.10.10.1"}]}]}' ./prompt_sample2 0 10.10.170.1 10.170.10.2
         ```
 
     - 执行decoder_sample2, 参数为device_id、local_host_ip和remote_host_ip, 其中device_id为decoder要使用的device_id, local_host_ip为decoder所在host的ip，remote_host_ip为prompt所在host的ip，如:
         ```
-        LOCAL_COMM_RES='{"status": "completed", "version": "1.0", "server_list": [{"server_id": "node_1", "device": [{"device_id": "1", "device_ip": "10.10.10.2"}]}]}' ./decoder_sample 1 10.170.10.2 10.170.10.1
+        LOCAL_COMM_RES='{"status": "completed", "version": "1.0", "server_list": [{"server_id": "node_1", "device": [{"device_id": "1", "device_ip": "10.10.10.2"}]}]}' ./decoder_sample2 1 10.170.10.2 10.170.10.1
         ```
     **注**：LOCAL_COMM_RES为sample2执行所需环境变量，配置了当前进程所需的通信资源，将传递给llm_datadist作为初始化option; 配置格式与HCCL的ranktable一致，只需要配置本进程第一个参数device_id对应的信息，其中ranktable中的rank_id和server_count字段不需要配置，当前用例配置为A2的ranktable格式，其他环境需参考对应环境的ranktable格式进行配置
\ No newline at end of file
-- 
Gitee


From 934dc736007e24735996d1c14077b0d72b64297a Mon Sep 17 00:00:00 2001
From: renjie <renjie88@huawei.com>
Date: Thu, 17 Jul 2025 01:27:49 +0000
Subject: [PATCH 42/46] !2718 add workspace * add workspace

---
 .../op_host/add_custom_tiling_sink_tiling.cpp             | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
index 563ba0b63..5116eb258 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
@@ -10,7 +10,7 @@
 
 #include "add_custom_tiling_sink_tiling.h"
 #include "register/device_op_impl_registry.h"
-
+#include "tiling/platform/platform_ascendc.h"
 namespace optiling {
 static constexpr uint32_t BLOCK_DIM = 8;
 static constexpr uint32_t TILE_NUM = 8;
@@ -26,11 +26,13 @@ ge::graphStatus AddCustomSinkTilingFunc(gert::TilingContext *context)
     tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
     context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
     size_t *currentWorkspace = context->GetWorkspaceSizes(1);
-    currentWorkspace[0] = DEFAULT_WORKSPACE_SIZE; // 设置运行时workspace大小
+    auto platform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
+    size_t sysWorkspaceSize = platform.GetLibApiWorkSpaceSize();
+    currentWorkspace[0] = sysWorkspaceSize + DEFAULT_WORKSPACE_SIZE; // 设置运行时workspace大小
     if (context->GetInputTensor(1) != nullptr && context->GetInputTensor(1)->GetData<float>() == nullptr) {
         // 通过判断值依赖InputTensor的Data是否为空指针来确认当前是否处于编译期。
         // Tiling下沉场景，编译期需要为算子分配内存，包括其所需的workspace。为了保证运行时的高效性，编译期应根据算子的执行需求，合理设置所需的workspace最大值，以避免内存不足或浪费。
-        currentWorkspace[0] = MAX_WORKSPACE_SIZE;
+        currentWorkspace[0] = sysWorkspaceSize + MAX_WORKSPACE_SIZE;
     }
     return ge::GRAPH_SUCCESS;
 }
-- 
Gitee


From 159d68f900ca9e4246ebaca4f39afea35d1db0e8 Mon Sep 17 00:00:00 2001
From: renjie <renjie88@huawei.com>
Date: Thu, 17 Jul 2025 03:07:33 +0000
Subject: [PATCH 43/46] !2721 fix tilingsink sample comment * fix tilingsink
 sample comment

---
 .../op_host/add_custom_tiling_sink_tiling.cpp               | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
index 5116eb258..24f17126b 100644
--- a/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
+++ b/operator/ascendc/2_features/17_tiling_sink/AddCustomTilingSink/AddCustomTilingSink/op_host/add_custom_tiling_sink_tiling.cpp
@@ -14,7 +14,7 @@
 namespace optiling {
 static constexpr uint32_t BLOCK_DIM = 8;
 static constexpr uint32_t TILE_NUM = 8;
-static constexpr size_t MAX_WORKSPACE_SIZE = 32; // 算子所需workspace的最大值，AddCustomTilingSink样例不需要workspace，不涉及设置，此处设置为固定值仅作为示例
+static constexpr size_t MAX_WORKSPACE_SIZE = 32; // 算子所需用户workspace空间最大值，AddCustomTilingSink算子本身逻辑无需用户workspace空间，此处设置为固定值仅作为示例
 static constexpr size_t DEFAULT_WORKSPACE_SIZE = 0;
 ge::graphStatus AddCustomSinkTilingFunc(gert::TilingContext *context)
 {
@@ -28,11 +28,11 @@ ge::graphStatus AddCustomSinkTilingFunc(gert::TilingContext *context)
     size_t *currentWorkspace = context->GetWorkspaceSizes(1);
     auto platform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
     size_t sysWorkspaceSize = platform.GetLibApiWorkSpaceSize();
-    currentWorkspace[0] = sysWorkspaceSize + DEFAULT_WORKSPACE_SIZE; // 设置运行时workspace大小
+    currentWorkspace[0] = sysWorkspaceSize + DEFAULT_WORKSPACE_SIZE; // 设置运行时workspace大小，此处为系统workspace空间 + 用户workspace空间
     if (context->GetInputTensor(1) != nullptr && context->GetInputTensor(1)->GetData<float>() == nullptr) {
         // 通过判断值依赖InputTensor的Data是否为空指针来确认当前是否处于编译期。
         // Tiling下沉场景，编译期需要为算子分配内存，包括其所需的workspace。为了保证运行时的高效性，编译期应根据算子的执行需求，合理设置所需的workspace最大值，以避免内存不足或浪费。
-        currentWorkspace[0] = sysWorkspaceSize + MAX_WORKSPACE_SIZE;
+        currentWorkspace[0] = sysWorkspaceSize + MAX_WORKSPACE_SIZE; // 设置编译期workspace大小，此处为系统workspace空间 + 用户workspace空间最大值
     }
     return ge::GRAPH_SUCCESS;
 }
-- 
Gitee


From 39409db816ccf0c088cb3c33404e8b4cbc734e71 Mon Sep 17 00:00:00 2001
From: shinoda <zhuyuchen7@huawei.com>
Date: Fri, 18 Jul 2025 08:24:25 +0000
Subject: [PATCH 44/46] !2715 change to new tiling definitions Merge pull
 request !2715 from shinoda/master

---
 .../op_host/matmul_custom.cpp                  | 16 +++++++---------
 .../op_kernel/matmul_custom.cpp                |  4 +++-
 .../matmul_custom_tiling.h                     | 18 +++++++-----------
 .../10_matmul_frameworklaunch/README.md        |  3 +++
 4 files changed, 20 insertions(+), 21 deletions(-)
 rename operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/{op_host => op_kernel}/matmul_custom_tiling.h (50%)

diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_host/matmul_custom.cpp b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_host/matmul_custom.cpp
index 49bc45d64..f1911480c 100644
--- a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_host/matmul_custom.cpp
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_host/matmul_custom.cpp
@@ -7,7 +7,7 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  */
-#include "matmul_custom_tiling.h"
+#include "../op_kernel/matmul_custom_tiling.h"
 #include "register/op_def_registry.h"
 #include "tiling/platform/platform_ascendc.h"
 #include "tiling/tiling_api.h"
@@ -45,27 +45,25 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context)
     }
     cubeTiling.SetBias(true);
     cubeTiling.SetBufferSpace(-1, -1, -1);
-    MatmulCustomTilingData tiling;
-    if (cubeTiling.GetTiling(tiling.cubeTilingData) == -1) {
+    MatmulCustomTilingData *tiling = context->GetTilingData<MatmulCustomTilingData>();
+    if (cubeTiling.GetTiling(tiling->cubeTilingData) == -1) {
         return ge::GRAPH_FAILED;
     }
 
     uint64_t localMemSize;
     ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, localMemSize);
-    tiling.set_localMemSize(localMemSize);
+    tiling->localMemSize = localMemSize;
 
     if (ascendcPlatform.GetSocVersion() == platform_ascendc::SocVersion::ASCEND310P) {
         context->SetBlockDim(2);
         context->SetTilingKey(2);
     } else {
-        /* SetBlockDim here refers to the number of cube cores, so for separated arch(AIC:AIV=1:2), 
-            vector cores number is set 48 by SetDim, cube core number need to be set 24 here.*/ 
-        context->SetBlockDim(24); 
+        /* SetBlockDim here refers to the number of cube cores, so for separated arch(AIC:AIV=1:2),
+            vector cores number is set 48 by SetDim, cube core number need to be set 24 here.*/
+        context->SetBlockDim(24);
         context->SetTilingKey(1);
     }
 
-    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
-    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
     size_t userWorkspaceSize = 0;
     size_t systemWorkspaceSize = static_cast<size_t>(ascendcPlatform.GetLibApiWorkSpaceSize());
     size_t *currentWorkspace = context->GetWorkspaceSizes(1);
diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom.cpp b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom.cpp
index 77a323fca..d0d86f000 100644
--- a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom.cpp
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom.cpp
@@ -9,6 +9,7 @@
  */
 #include "kernel_operator.h"
 #include "lib/matmul_intf.h"
+#include "matmul_custom_tiling.h"
 
 using namespace matmul;
 
@@ -141,12 +142,13 @@ MatmulKernel<aType, bType, cType, biasType>::CalcOffset(int32_t blockIdx, const
   * @param  bias: Bias gm addr.
   * @param  c: C matrix gm addr.
   * @param  workspace: Temporary gm space addr required by matmul calc.
-  * @param  tiling: Tiling data addr. 
+  * @param  tiling: Tiling data addr.
   * @retval None
   */
 extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace,
                                                     GM_ADDR tiling)
 {
+    REGISTER_TILING_DEFAULT(MatmulCustomTilingData);
     GET_TILING_DATA(tilingData, tiling);
     MatmulKernel<half, half, float, float> matmulKernel;
     AscendC::TPipe pipe;
diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_host/matmul_custom_tiling.h b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h
similarity index 50%
rename from operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_host/matmul_custom_tiling.h
rename to operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h
index fd898cba9..8f32f3418 100644
--- a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_host/matmul_custom_tiling.h
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h
@@ -10,16 +10,12 @@
 #ifndef MATMUL_TILING_H
 #define MATMUL_TILING_H
 
-#include "register/tilingdata_base.h"
-#include "tiling/tiling_api.h"
+#include <cstdint>
+#include "kernel_tiling/kernel_tiling.h"
 
-namespace optiling {
-BEGIN_TILING_DATA_DEF(MatmulCustomTilingData)
-TILING_DATA_FIELD_DEF(uint64_t, localMemSize);
-TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, cubeTilingData);
-END_TILING_DATA_DEF;
+struct MatmulCustomTilingData {
+    uint64_t localMemSize;
+    TCubeTiling cubeTilingData;
+};
 
-REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulCustomTilingData)
-} // namespace optiling
-
-#endif
\ No newline at end of file
+#endif  // MATMUL_TILING_H
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/README.md b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/README.md
index 3b58d140e..05aeaa0c3 100644
--- a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/README.md
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/README.md
@@ -46,6 +46,8 @@ C = A * B + Bias
 ## 算子工程介绍
 本样例介绍了多核场景（[MatmulCustomMultiCore](./MatmulCustomMultiCore/)）和单核场景（[MatmulCustomSingleCore](./MatmulCustomSingleCore/)）两种MamMul算子实现。可以根据使用场景，自行选择多核算子工程或单核算子工程，并在编译算子工程时，进入选择的算子实现工程中完成编译和安装。
 
+其中[MatmulCustomMultiCore](./MatmulCustomMultiCore/)使用标准C++语法定义Tiling结构体，[MatmulCustomSingleCore](./MatmulCustomSingleCore/)使用宏定义方式定义Tiling结构体。相较于使用宏定义方式，标准C++语法定义Tiling结构体不仅更符合C++开发者的开发习惯，并且提供了强大的灵活性。
+
 以单核算子工程为例，算子工程目录MatmulCustomSingleCore包含算子的实现文件，如下所示：
 ```
 ├── MatmulCustomSingleCore  // Matmul自定义算子工程
@@ -140,3 +142,4 @@ CANN软件包中提供了工程创建工具msOpGen，MatmulCustom算子工程可
 | 2024/05/27 | 更新readme              |
 | 2024/11/11 | 样例目录调整 |
 | 2024/11/18 | 算子工程改写为由msOpGen生成 |
+| 2025/07/14 | MatmulCustomMultiCore使用标准C++语法定义Tiling结构体 |
-- 
Gitee


From 3158b9c6ee7d189119db0562033a4dd6feab616d Mon Sep 17 00:00:00 2001
From: hujiawenKaven <hujiawen5@hisilicon.com>
Date: Tue, 22 Jul 2025 07:39:04 +0000
Subject: [PATCH 45/46] !2669 add aclrt launch kernel add_custom sample Merge
 pull request !2669 from hujiawenKaven/aclKernelLaunch

---
 .../AddKernelInvocationAcl/CMakeLists.txt     |  44 ++++
 .../AddKernelInvocationAcl/README.md          |  72 ++++++
 .../AddKernelInvocationAcl/add_custom.cpp     |  82 ++++++
 .../cmake/cpu_lib.cmake                       |   9 +
 .../cmake/npu_lib.cmake                       |  10 +
 .../AddKernelInvocationAcl/data_utils.h       | 240 ++++++++++++++++++
 .../AddKernelInvocationAcl/main.cpp           |  97 +++++++
 .../AddKernelInvocationAcl/run.sh             | 113 +++++++++
 .../scripts/gen_data.py                       |  25 ++
 .../scripts/verify_result.py                  |  53 ++++
 .../3_add_kernellaunch/README.md              |   3 +
 11 files changed, 748 insertions(+)
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/CMakeLists.txt
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/README.md
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/add_custom.cpp
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/data_utils.h
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/main.cpp
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/run.sh
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/scripts/gen_data.py
 create mode 100644 operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/scripts/verify_result.py

diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/CMakeLists.txt b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/CMakeLists.txt
new file mode 100644
index 000000000..ec0da5217
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/CMakeLists.txt
@@ -0,0 +1,44 @@
+cmake_minimum_required(VERSION 3.16)
+project(Ascend_c)
+
+set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type")
+set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest"
+    CACHE STRING "ASCEND CANN package installation directory"
+)
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
+endif()
+if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
+endif()
+
+file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom.cpp)
+
+if("${RUN_MODE}" STREQUAL "cpu") # cpu: build the kernel as a host shared library
+    include(cmake/cpu_lib.cmake)
+elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") # sim/npu: build the kernel fatbin
+    include(cmake/npu_lib.cmake)
+else()
+    message(FATAL_ERROR "invalid RUN_MODE: ${RUN_MODE}") # stop here: neither kernel cmake file was included
+endif()
+add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp)
+
+target_compile_options(ascendc_kernels_bbit PRIVATE
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:-g>>
+    -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror
+)
+
+target_link_libraries(ascendc_kernels_bbit PRIVATE
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},npu>:host_intf_pub>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:tikicpulib::${SOC_VERSION}>>
+    ascendcl
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:c_sec>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:ascendc_kernels>>
+)
+
+install(TARGETS ascendc_kernels_bbit
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/README.md b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/README.md
new file mode 100644
index 000000000..ce8d471cb
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/README.md
@@ -0,0 +1,72 @@
+## 目录结构介绍
+```
+├── AddKernelInvocationAcl
+│   ├── cmake                   // 编译工程文件
+│   ├── scripts
+│   │   ├── gen_data.py         // 输入数据和真值数据生成脚本
+│   │   └── verify_result.py    // 验证输出数据和真值数据是否一致的验证脚本
+│   ├── add_custom.cpp          // 算子kernel实现
+│   ├── CMakeLists.txt          // 编译工程文件
+│   ├── data_utils.h            // 数据读入写出函数
+│   ├── main.cpp                // 主函数，调用算子的应用程序，含CPU域及NPU域调用
+│   └── run.sh                  // 编译运行算子的脚本
+```
+## 代码实现介绍
+本调用样例中实现的是固定shape为8*2048的Add算子。
+- kernel实现  
+  Add算子的数学表达式为：
+  ```
+  z = x + y
+  ```
+  计算逻辑是：Ascend C提供的矢量计算接口的操作元素都为LocalTensor，输入数据需要先搬运进片上存储，然后使用计算接口完成两个输入参数相加，得到最终结果，再搬出到外部存储上。
+
+  Add算子的实现流程分为3个基本任务：CopyIn，Compute，CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory，分别存储在xLocal、yLocal，Compute任务负责对xLocal、yLocal执行加法操作，计算结果存储在zLocal中，CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。具体请参考[add_custom.cpp](./add_custom.cpp)。
+
+- 调用实现
+  1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成；
+  2. NPU侧运行验证主要通过使用aclrtLaunchKernelWithConfig函数调用来完成。
+
+  应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。
+
+## 运行样例算子
+  - 打开样例目录   
+    以命令行方式下载样例代码，master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl
+    ```
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量的命令。
+    - 默认路径，root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+      ```
+    - 默认路径，非root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+      ```
+    - 指定路径install_path，安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+      ```
+
+  - 样例执行
+
+    ```bash
+    bash run.sh -r [RUN_MODE] -v  [SOC_VERSION]
+    ```
+    - RUN_MODE：编译方式，可选择CPU调试，NPU上板。支持参数为[cpu / npu]
+    - SOC_VERSION：昇腾AI处理器型号，如果无法确定具体的[SOC_VERSION]，则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询，在查询到的“Name”前增加Ascend信息，例如“Name”对应取值为xxxyy，实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号：
+      - Atlas 训练系列产品
+      - Atlas 推理系列产品AI Core
+      - Atlas A2训练系列产品/Atlas 800I A2推理产品
+      - Atlas 200/500 A2推理产品
+
+    示例如下，Ascendxxxyy请替换为实际的AI处理器型号。
+    ```bash
+    bash run.sh -r cpu -v Ascendxxxyy
+    ```
+## 更新说明
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/06/05 | 新增本readme |
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/add_custom.cpp b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/add_custom.cpp
new file mode 100644
index 000000000..96b37a7c3
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/add_custom.cpp
@@ -0,0 +1,82 @@
+/**
+ * @file add_custom.cpp
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "kernel_operator.h"
+
+constexpr int32_t TOTAL_LENGTH = 8 * 2048;                            // total length of data
+constexpr int32_t USE_CORE_NUM = 8;                                   // num of core used
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;         // length computed of each core
+constexpr int32_t TILE_NUM = 8;                                       // split data into 8 tiles for each core
+constexpr int32_t BUFFER_NUM = 2;                                     // tensor num for each queue
+constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // separate to 2 parts, due to double buffer
+
+class KernelAdd { // element-wise half-precision add: z = x + y, processed tile by tile through three queues
+public:
+    __aicore__ inline KernelAdd() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) // bind this block's GM slices and size the queues
+    {
+        xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH);
+        yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH);
+        zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); // BUFFER_NUM buffers of one tile each
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+    }
+    __aicore__ inline void Process() // run CopyIn -> Compute -> CopyOut once per tile
+    {
+        int32_t loopCount = TILE_NUM * BUFFER_NUM; // loopCount * TILE_LENGTH == BLOCK_LENGTH
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress) // progress: tile index within this block's slice
+    {
+        AscendC::LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+        AscendC::LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+        AscendC::DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); // GM tile -> local tensor
+        AscendC::DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
+        inQueueX.EnQue(xLocal); // hand the filled tiles to Compute
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress) // progress is not used in this stage
+    {
+        AscendC::LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+        AscendC::LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+        AscendC::LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+        AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); // zLocal = xLocal + yLocal over TILE_LENGTH elements
+        outQueueZ.EnQue<half>(zLocal); // hand the result tile to CopyOut
+        inQueueX.FreeTensor(xLocal); // give the input tensors back to their queues
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress) // progress: tile index within this block's slice
+    {
+        AscendC::LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+        AscendC::DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); // local tensor -> GM tile
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    AscendC::TPipe pipe; // provides the buffers behind the three queues (see Init)
+    AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY; // CopyIn -> Compute
+    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueZ; // Compute -> CopyOut
+    AscendC::GlobalTensor<half> xGm; // this block's BLOCK_LENGTH-element slice of x
+    AscendC::GlobalTensor<half> yGm; // this block's BLOCK_LENGTH-element slice of y
+    AscendC::GlobalTensor<half> zGm; // this block's BLOCK_LENGTH-element slice of z
+};
+
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) // kernel entry: z = x + y
+{
+    KernelAdd op;
+    op.Init(x, y, z); // bind the GM addresses and set up the queues
+    op.Process(); // process every tile of this block's slice
+}
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake
new file mode 100644
index 000000000..751a11941
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake
@@ -0,0 +1,9 @@
+if(NOT DEFINED ENV{CMAKE_PREFIX_PATH})
+    set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake)
+endif()
+find_package(tikicpulib REQUIRED)
+
+add_library(ascendc_kernels SHARED ${KERNEL_FILES})
+target_link_libraries(ascendc_kernels PUBLIC tikicpulib::${SOC_VERSION})
+target_compile_options(ascendc_kernels PRIVATE -g -O0 -std=c++17)
+install(TARGETS ascendc_kernels DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake
new file mode 100644
index 000000000..d862f0064
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake
@@ -0,0 +1,10 @@
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed")
+endif()
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+ascendc_fatbin_library(ascendc_kernels ${KERNEL_FILES})
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/data_utils.h b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/data_utils.h
new file mode 100644
index 000000000..1d43459ef
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/data_utils.h
@@ -0,0 +1,240 @@
+/**
+ * @file data_utils.h
+ *
+ * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef DATA_UTILS_H
+#define DATA_UTILS_H
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cstdio>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <memory>
+#include <cstring>
+#include <cerrno>
+
+#include "acl/acl.h"
+
+typedef enum {
+    DT_UNDEFINED = -1,
+    FLOAT = 0,
+    HALF = 1,
+    INT8_T = 2,
+    INT32_T = 3,
+    UINT8_T = 4,
+    INT16_T = 6,
+    UINT16_T = 7,
+    UINT32_T = 8,
+    INT64_T = 9,
+    UINT64_T = 10,
+    DOUBLE = 11,
+    BOOL = 12,
+    STRING = 13,
+    COMPLEX64 = 16,
+    COMPLEX128 = 17,
+    BF16 = 27
+} printDataType;
+
+#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO]  " fmt "\n", ##args)
+#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN]  " fmt "\n", ##args)
+#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR]  " fmt "\n", ##args)
+#define CHECK_ACL(x) /* run ACL call x once; on failure log file:line and continue */       \
+    do {                                                                                    \
+        aclError aclRc = (x);                                                               \
+        if (aclRc != ACL_ERROR_NONE) {                                                      \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << aclRc << std::endl; \
+        }                                                                                   \
+    } while (0) /* no trailing ';' here, so CHECK_ACL(...); is one statement even inside if/else */
+
+/**
+ * @brief Read a whole regular file into a caller-provided buffer
+ * @param [in] filePath: file path
+ * @param [out] fileSize: number of bytes read, written only on success
+ * @param [out] buffer: destination buffer (must not be nullptr)
+ * @param [in] bufferSize: capacity of buffer in bytes
+ * @return true on success, false on any failure (an error is logged)
+ */
+inline bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize)
+{
+    struct stat sBuf;
+    if (stat(filePath.c_str(), &sBuf) == -1) {
+        ERROR_LOG("failed to get file");
+        return false;
+    }
+    if (S_ISREG(sBuf.st_mode) == 0) {
+        ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+        return false;
+    }
+
+    std::ifstream file(filePath, std::ios::binary); // closed by its destructor on every return path
+    if (!file.is_open()) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    std::filebuf *buf = file.rdbuf();
+    size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); // seek to the end to learn the size
+    if (size == 0) {
+        ERROR_LOG("file size is 0");
+        return false;
+    }
+    if (size > bufferSize) {
+        ERROR_LOG("file size is larger than buffer size");
+        return false;
+    }
+    buf->pubseekpos(0, std::ios::in);
+    if (static_cast<size_t>(buf->sgetn(static_cast<char *>(buffer), size)) != size) { // reject short reads
+        ERROR_LOG("Read file failed. path = %s", filePath.c_str());
+        return false;
+    }
+    fileSize = size;
+    return true;
+}
+
+/**
+ * @brief Write data to file (the file is created if missing and truncated otherwise)
+ * @param [in] filePath: file path
+ * @param [in] buffer: data to write to file, must not be nullptr
+ * @param [in] size: size to write
+ * @return true only if exactly size bytes were written by the single write() call
+ */
+inline bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Write file failed. buffer is nullptr");
+        return false;
+    }
+
+    int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); // owner read/write
+    if (fd < 0) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    ssize_t writeSize = write(fd, buffer, size); // signed: write() returns -1 on error
+    (void)close(fd);
+    if (writeSize < 0 || static_cast<size_t>(writeSize) != size) {
+        ERROR_LOG("Write file Failed.");
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * @brief Reads a binary file into memory.
+ *
+ * Opens the file, determines its length with fseek/ftell, and reads the whole content into a newly
+ * allocated buffer. If the final read fails, outBuffer and outSize have already been overwritten.
+ *
+ * @param filePath The path to the binary file to be read.
+ * @param outBuffer A reference to a unique pointer that will hold the file data.
+ * @param outSize A reference to a size_t that will hold the size of the file.
+ * @return true if the file was read successfully, false otherwise.
+ */
+inline bool ReadBinaryFile(const char *filePath, std::unique_ptr<char[]> &outBuffer, size_t &outSize)
+{
+    std::unique_ptr<FILE, int (*)(FILE *)> file(fopen(filePath, "rb"), &fclose); // fclose runs on every exit path
+    if (!file) {
+        ERROR_LOG("Error opening file: %s", strerror(errno));
+        return false;
+    }
+
+    const long length = (fseek(file.get(), 0, SEEK_END) == 0) ? ftell(file.get()) : -1L;
+    if (length < 0) { // seek/tell failed: do not let -1 become a huge allocation size
+        ERROR_LOG("Error getting file size: %s", strerror(errno));
+        return false;
+    }
+    rewind(file.get());
+    outSize = static_cast<size_t>(length);
+    outBuffer.reset(new char[outSize]);
+    if (fread(outBuffer.get(), 1, outSize, file.get()) != outSize) {
+        ERROR_LOG("Error reading file.");
+        return false;
+    }
+    return true;
+}
+
+template <typename T> void DoPrintData(const T *data, size_t count, size_t elementsPerRow) // print count elements
+{
+    assert(elementsPerRow != 0); // guards the modulo below
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << +data[i]; // unary + promotes int8_t/uint8_t so they print as numbers, not chars
+        if (i % elementsPerRow == elementsPerRow - 1) { // last element of a row
+            std::cout << std::endl;
+        }
+    }
+}
+
+inline void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) // inline: defined in a header
+{
+    assert(elementsPerRow != 0); // guards the modulo below
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); // convert to float to print
+        if (i % elementsPerRow == elementsPerRow - 1) { // last element of a row
+            std::cout << std::endl;
+        }
+    }
+}
+
+inline void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) // inline: header
+{
+    if (data == nullptr) {
+        ERROR_LOG("Print data failed. data is nullptr");
+        return;
+    }
+
+    switch (dataType) { // reinterpret the raw buffer as count elements of the requested type
+        case BOOL:
+            DoPrintData(reinterpret_cast<const bool *>(data), count, elementsPerRow);
+            break;
+        case INT8_T:
+            DoPrintData(reinterpret_cast<const int8_t *>(data), count, elementsPerRow);
+            break;
+        case UINT8_T:
+            DoPrintData(reinterpret_cast<const uint8_t *>(data), count, elementsPerRow);
+            break;
+        case INT16_T:
+            DoPrintData(reinterpret_cast<const int16_t *>(data), count, elementsPerRow);
+            break;
+        case UINT16_T:
+            DoPrintData(reinterpret_cast<const uint16_t *>(data), count, elementsPerRow);
+            break;
+        case INT32_T:
+            DoPrintData(reinterpret_cast<const int32_t *>(data), count, elementsPerRow);
+            break;
+        case UINT32_T:
+            DoPrintData(reinterpret_cast<const uint32_t *>(data), count, elementsPerRow);
+            break;
+        case INT64_T:
+            DoPrintData(reinterpret_cast<const int64_t *>(data), count, elementsPerRow);
+            break;
+        case UINT64_T:
+            DoPrintData(reinterpret_cast<const uint64_t *>(data), count, elementsPerRow);
+            break;
+        case HALF: // half values need conversion to float before printing
+            DoPrintHalfData(reinterpret_cast<const aclFloat16 *>(data), count, elementsPerRow);
+            break;
+        case FLOAT:
+            DoPrintData(reinterpret_cast<const float *>(data), count, elementsPerRow);
+            break;
+        case DOUBLE:
+            DoPrintData(reinterpret_cast<const double *>(data), count, elementsPerRow);
+            break;
+        default: // every other printDataType value is reported and nothing is printed
+            ERROR_LOG("Unsupported type: %d", dataType);
+    }
+    std::cout << std::endl;
+}
+#endif // DATA_UTILS_H
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/main.cpp b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/main.cpp
new file mode 100644
index 000000000..322546543
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/main.cpp
@@ -0,0 +1,97 @@
+/**
+ * @file main.cpp
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "data_utils.h"
+#ifndef ASCENDC_CPU_DEBUG
+#include "acl/acl.h"
+#else
+#include "tikicpulib.h"
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z);
+#endif
+
+int32_t main(int32_t argc, char *argv[]) // runs add_custom on CPU (ASCENDC_CPU_DEBUG) or on the NPU through ACL
+{
+    uint32_t blockDim = 8; // number of blocks the kernel is launched with
+    size_t inputByteSize = 8 * 2048 * sizeof(uint16_t); // 8 * 2048 two-byte elements per input
+    size_t outputByteSize = 8 * 2048 * sizeof(uint16_t);
+
+#ifdef ASCENDC_CPU_DEBUG
+    uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputByteSize);
+    uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputByteSize);
+    uint8_t *z = (uint8_t *)AscendC::GmAlloc(outputByteSize);
+
+    ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize); // NOTE(review): return value is ignored
+    ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize); // inputByteSize is also the out fileSize
+
+    AscendC::SetKernelMode(KernelMode::AIV_MODE);
+    ICPU_RUN_KF(add_custom, blockDim, x, y, z); // use this macro for cpu debug
+
+    WriteFile("./output/output_z.bin", z, outputByteSize);
+
+    AscendC::GmFree((void *)x);
+    AscendC::GmFree((void *)y);
+    AscendC::GmFree((void *)z);
+#else
+    CHECK_ACL(aclInit(nullptr)); // CHECK_ACL only logs a failure; execution continues afterwards
+    int32_t deviceId = 0;
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    aclrtStream stream = nullptr;
+    CHECK_ACL(aclrtCreateStream(&stream));
+
+    uint8_t *xHost, *yHost, *zHost;
+    uint8_t *xDevice, *yDevice, *zDevice;
+
+    CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputByteSize)); // host staging buffers
+    CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputByteSize));
+    CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize));
+    CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); // device buffers
+    CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+    ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize); // NOTE(review): return value is ignored
+    ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize); // inputByteSize is also the out fileSize
+
+    CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    aclrtBinHandle binHandle = nullptr;
+    aclrtFuncHandle funcHandle = nullptr;
+    aclrtArgsHandle argsHandle = nullptr;
+    aclrtParamHandle paramHandle = nullptr;
+
+    const char *filePath = "./out/fatbin/ascendc_kernels/ascendc_kernels.o"; // relative to the working directory
+    CHECK_ACL(aclrtBinaryLoadFromFile(filePath, nullptr, &binHandle));
+    CHECK_ACL(aclrtBinaryGetFunction(binHandle, "add_custom", &funcHandle)); // look the kernel up by name
+    CHECK_ACL(aclrtKernelArgsInit(funcHandle, &argsHandle));
+
+    CHECK_ACL(aclrtKernelArgsAppend(argsHandle, (void **)&xDevice, sizeof(uintptr_t), &paramHandle)); // x, y, z order
+    CHECK_ACL(aclrtKernelArgsAppend(argsHandle, (void **)&yDevice, sizeof(uintptr_t), &paramHandle));
+    CHECK_ACL(aclrtKernelArgsAppend(argsHandle, (void **)&zDevice, sizeof(uintptr_t), &paramHandle));
+    CHECK_ACL(aclrtKernelArgsFinalize(argsHandle));
+
+    CHECK_ACL(aclrtLaunchKernelWithConfig(funcHandle, blockDim, stream, nullptr, argsHandle, nullptr));
+    CHECK_ACL(aclrtSynchronizeStream(stream)); // wait on the stream before copying the result back
+
+    CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
+    WriteFile("./output/output_z.bin", zHost, outputByteSize);
+
+    CHECK_ACL(aclrtBinaryUnLoad(binHandle));
+    CHECK_ACL(aclrtFree(xDevice));
+    CHECK_ACL(aclrtFree(yDevice));
+    CHECK_ACL(aclrtFree(zDevice));
+    CHECK_ACL(aclrtFreeHost(xHost));
+    CHECK_ACL(aclrtFreeHost(yHost));
+    CHECK_ACL(aclrtFreeHost(zHost));
+
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+#endif
+    return 0; // NOTE(review): always 0, even when a read, write or ACL call above failed
+}
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/run.sh b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/run.sh
new file mode 100644
index 000000000..6b6d23964
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/run.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+CURRENT_DIR=$(
+    cd $(dirname ${BASH_SOURCE:-$0})
+    pwd
+)
+cd $CURRENT_DIR
+
+BUILD_TYPE="Debug"
+INSTALL_PREFIX="${CURRENT_DIR}/out"
+
+SHORT=r:,v:,i:,b:,p:,
+LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+
+while :; do
+    case "$1" in
+    -r | --run-mode)
+        RUN_MODE="$2"
+        shift 2
+        ;;
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2"
+        shift 2
+        ;;
+    -b | --build-type)
+        BUILD_TYPE="$2"
+        shift 2
+        ;;
+    -p | --install-prefix)
+        INSTALL_PREFIX="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
+    esac
+done
+
+RUN_MODE_LIST="cpu npu"
+if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then
+    echo "ERROR: RUN_MODE error, This sample only support specify cpu or npu!"
+    exit -1
+fi
+
+VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4"
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+
+export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH}
+echo "Current compile soc version is ${SOC_VERSION}"
+source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
+if [ "${RUN_MODE}" = "cpu" ]; then
+    export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+fi
+
+set -e
+rm -rf build out
+mkdir -p build
+cmake -B build \
+    -DRUN_MODE=${RUN_MODE} \
+    -DSOC_VERSION=${SOC_VERSION} \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+    -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
+    -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH}
+cmake --build build -j
+cmake --install build
+
+rm -f ascendc_kernels_bbit
+cp ./out/bin/ascendc_kernels_bbit ./
+rm -rf input output
+mkdir -p input output
+python3 scripts/gen_data.py
+(
+    export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH
+    if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then
+        if [ "${RUN_MODE}" = "npu" ]; then
+            msprof op --application=./ascendc_kernels_bbit
+        elif [ "${RUN_MODE}" = "sim" ]; then
+            msprof op simulator --application=./ascendc_kernels_bbit
+        elif [ "${RUN_MODE}" = "cpu" ]; then
+            ./ascendc_kernels_bbit
+        fi
+    else
+        ./ascendc_kernels_bbit
+    fi
+)
+md5sum output/*.bin
+python3 scripts/verify_result.py output/output_z.bin output/golden.bin
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/scripts/gen_data.py b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/scripts/gen_data.py
new file mode 100644
index 000000000..ea8ce828a
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/scripts/gen_data.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import numpy as np
+
+
+def gen_golden_data_simple():
+    input_x = np.random.uniform(1, 100, [8, 2048]).astype(np.float16)
+    input_y = np.random.uniform(1, 100, [8, 2048]).astype(np.float16)
+    golden = (input_x + input_y).astype(np.float16)
+
+    input_x.tofile("./input/input_x.bin")
+    input_y.tofile("./input/input_y.bin")
+    golden.tofile("./output/golden.bin")
+
+
+if __name__ == "__main__":  # generate the data files only when executed as a script
+    gen_golden_data_simple()
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/scripts/verify_result.py b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/scripts/verify_result.py
new file mode 100644
index 000000000..1a21d809a
--- /dev/null
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/AddKernelInvocationAcl/scripts/verify_result.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import sys
+import numpy as np
+
+# Comparison thresholds used by verify_result (values intended for float16 data)
+relative_tol = 1e-3  # rtol passed to np.isclose
+absolute_tol = 1e-5  # atol passed to np.isclose
+error_tol = 1e-3  # maximum accepted fraction of mismatching elements
+
+
+def verify_result(output, golden):
+    output = np.fromfile(output, dtype=np.float16).reshape(-1)
+    golden = np.fromfile(golden, dtype=np.float16).reshape(-1)
+    different_element_results = np.isclose(output,
+                                           golden,
+                                           rtol=relative_tol,
+                                           atol=absolute_tol,
+                                           equal_nan=True)
+    different_element_indexes = np.where(different_element_results == False)[0]
+    for index in range(len(different_element_indexes)):
+        real_index = different_element_indexes[index]
+        golden_data = golden[real_index]
+        output_data = output[real_index]
+        print(
+            "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" %
+            (real_index, golden_data, output_data,
+             abs(output_data - golden_data) / golden_data))
+        if index == 100:
+            break
+    error_ratio = float(different_element_indexes.size) / golden.size
+    print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol))
+    return error_ratio <= error_tol
+
+
+if __name__ == '__main__':
+    try:
+        res = verify_result(sys.argv[1], sys.argv[2])
+        if not res:
+            raise ValueError("[ERROR] result error")
+        else:
+            print("test pass")
+    except Exception as e:
+        print(e)
+        sys.exit(1)
diff --git a/operator/ascendc/0_introduction/3_add_kernellaunch/README.md b/operator/ascendc/0_introduction/3_add_kernellaunch/README.md
index a0dffa144..dd44718ab 100644
--- a/operator/ascendc/0_introduction/3_add_kernellaunch/README.md
+++ b/operator/ascendc/0_introduction/3_add_kernellaunch/README.md
@@ -3,6 +3,7 @@
 ## 目录结构介绍
 ```
 ├── 3_add_kernellaunch                // 使用核函数直调的方式调用Add自定义算子
+│   ├── AddKernelInvocationAcl        // 使用aclrtLaunchKernelWithConfig接口调用核函数样例
 │   ├── AddKernelInvocationNeo        // Kernel Launch方式调用核函数样例
 │   ├── AddKernelInvocationTilingNeo  // Kernel Launch方式调用核函数样例，带有Tiling
 │   └── CppExtensions                 // pybind方式调用核函数样例，带有Tiling
@@ -43,6 +44,7 @@ z = x + y
 ### 1. 获取源码包
 编译运行此样例前，请参考[准备：获取样例代码](../README.md#codeready)获取源码包。
 ### 2. 编译运行样例工程
+- [AddKernelInvocationAcl样例运行](./AddKernelInvocationAcl/README.md)
 - [AddKernelInvocationNeo样例运行](./AddKernelInvocationNeo/README.md)
 - [AddKernelInvocationTilingNeo样例运行](./AddKernelInvocationTilingNeo/README.md)
 - [CppExtensions样例运行](./CppExtensions/README.md)
@@ -57,3 +59,4 @@ z = x + y
 | 2024/06/06 | AddKernelInvocation样例转维护，不再更新，不推荐使用 |
 | 2024/08/11 | 删除AddKernelInvocation样例 |
 | 2024/11/11 | 样例目录调整 |   |
+| 2025/06/05 | 新增AddKernelInvocationAcl样例 |   |
-- 
Gitee


From 2cf94acaf1688eaf21c72a07567353a1fa5405c9 Mon Sep 17 00:00:00 2001
From: hehongan <hehongan@h-partners.com>
Date: Thu, 24 Jul 2025 06:24:29 +0000
Subject: [PATCH 46/46] =?UTF-8?q?!2719=20=E5=88=A0=E9=99=A4=E4=BA=86?=
 =?UTF-8?q?=E5=86=97=E4=BD=99=E7=9A=84=E5=88=A4=E6=96=AD=E6=9D=A1=E4=BB=B6?=
 =?UTF-8?q?=20Merge=20pull=20request=20!2719=20from=20hehongan/fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../VectorAddMultiCoreWithTiling/add_custom.cpp      | 12 ++++++------
 .../add_custom.cpp                                   | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp
index 8b267ea1d..df072c8e6 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp
@@ -80,7 +80,7 @@ private:
     {
         AscendC::LocalTensor<bfloat16_t> xLocal = inQueueX.AllocTensor<bfloat16_t>();
         AscendC::LocalTensor<bfloat16_t> yLocal = inQueueY.AllocTensor<bfloat16_t>();
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
                 this->tileLength);
             AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
@@ -114,7 +114,7 @@ private:
     __aicore__ inline void CopyOut(int32_t progress)
     {
         AscendC::LocalTensor<bfloat16_t> zLocal = outQueueZ.DeQue<bfloat16_t>();
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal,
                 this->tileLength);
         } else {
@@ -199,7 +199,7 @@ private:
     {
         AscendC::LocalTensor<int8_t> xLocal = inQueueX.AllocTensor<int8_t>();
         AscendC::LocalTensor<int8_t> yLocal = inQueueY.AllocTensor<int8_t>();
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
                 this->tileLength);
             AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
@@ -233,7 +233,7 @@ private:
     __aicore__ inline void CopyOut(int32_t progress)
     {
         AscendC::LocalTensor<int8_t> zLocal = outQueueZ.DeQue<int8_t>();
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal,
                 this->tileLength);
         } else {
@@ -315,7 +315,7 @@ private:
     {
         AscendC::LocalTensor<dataType> xLocal = inQueueX.AllocTensor<dataType>();
         AscendC::LocalTensor<dataType> yLocal = inQueueY.AllocTensor<dataType>();
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
                 this->tileLength);
             AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
@@ -342,7 +342,7 @@ private:
     __aicore__ inline void CopyOut(int32_t progress)
     {
         AscendC::LocalTensor<dataType> zLocal = outQueueZ.DeQue<dataType>();
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal,
                 this->tileLength);
         } else {
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp
index 733e162c3..6baf8e693 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp
@@ -464,7 +464,7 @@ private:
         AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t) / this->coef), 0, 0, 0};
         AscendC::DataCopyPadExtParams<bfloat16_t> padParams = {false, 0, 0, 0};
 
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopyPad<bfloat16_t>(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
                 copyXParams, padParams);
             AscendC::DataCopyPad<bfloat16_t>(yLocal, yGm[((progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength) / this->coef],
@@ -504,7 +504,7 @@ private:
     {
         AscendC::LocalTensor<bfloat16_t> zLocal = outQueueZ.DeQue<bfloat16_t>();
         AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0};
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopyPad<bfloat16_t>(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams);
         } else {
             AscendC::DataCopyPad<bfloat16_t>(zGm[progress * this->tileLength], zLocal, copyParams);
@@ -606,7 +606,7 @@ private:
         AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t) / this->coef), 0, 0, 0};
         AscendC::DataCopyPadExtParams<int8_t> padParams = {false, 0, 0, 0};
 
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopyPad<int8_t>(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
                 copyXParams, padParams);
             AscendC::DataCopyPad<int8_t>(yLocal, yGm[((progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength) / this->coef],
@@ -646,7 +646,7 @@ private:
     {
         AscendC::LocalTensor<int8_t> zLocal = outQueueZ.DeQue<int8_t>();
         AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0};
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopyPad<int8_t>(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams);
         } else {
             AscendC::DataCopyPad<int8_t>(zGm[progress * this->tileLength], zLocal, copyParams);
@@ -745,7 +745,7 @@ private:
         AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->tileLength * sizeof(dataType) / this->coef), 0, 0, 0};
         AscendC::DataCopyPadExtParams<dataType> padParams = {false, 0, 0, 0};
 
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopyPad<dataType>(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
                 copyXParams, padParams);
             AscendC::DataCopyPad<dataType>(yLocal, yGm[((progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength) / this->coef],
@@ -778,7 +778,7 @@ private:
     {
         AscendC::LocalTensor<dataType> zLocal = outQueueZ.DeQue<dataType>();
         AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0};
-        if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) {
+        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
             AscendC::DataCopyPad<dataType>(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams);
         } else {
             AscendC::DataCopyPad<dataType>(zGm[progress * this->tileLength], zLocal, copyParams);
-- 
Gitee