diff --git a/examples/mte_put_perf/CMakeLists.txt b/examples/mte_put_perf/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c8d6f29af4ea7d03d71ea80ad4e4f69758662e20
--- /dev/null
+++ b/examples/mte_put_perf/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+shmem_add_collective_example(mte_put_perf)
\ No newline at end of file
diff --git a/examples/mte_put_perf/main.cpp b/examples/mte_put_perf/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8aaee25641fe846f9b79912a9eab9f575d11373e
--- /dev/null
+++ b/examples/mte_put_perf/main.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <vector>
+
+#include "../utils/perf_utils.h"
+
+typedef float datatype;
+
+int g_npus = 8;
+const char *ipport;
+const char *fuc_data_type;
+int f_rank = 0;
+int f_npu = 0;
+extern void device_put_demo(uint32_t block_dim, void* stream, uint8_t* gva, uint8_t* src_gva, uint8_t* time_gva, int elements, int ubsize);
+
+int test_shmem_team_all_gather(int rank_id, int n_ranks, uint64_t local_mem_size)
+{
+    // 初始化ACL和SHMEM
+    int32_t device_id = rank_id % g_npus + f_npu;
+    int status = 0;
+    aclrtStream stream = nullptr;
+
+    status = aclInit(nullptr);
+    status = aclrtSetDevice(device_id);
+    status = aclrtCreateStream(&stream);
+
+    shmem_init_attr_t *attributes;
+    status = shmem_set_attr(rank_id, n_ranks, local_mem_size, ipport, &attributes);
+    status = shmem_init_attr(attributes);
+    std::vector<int> extra_vec = {64};
+
+    MTE_PERF_FRAME(device_put_demo, stream, 3, 25, extra_vec, 1, 8);
+
+    status = shmem_finalize();
+    status = aclrtDestroyStream(stream);
+    status = aclrtResetDevice(device_id);
+    status = aclFinalize();
+
+    return 0;
+}
+
+int main(int argc, char *argv[])
+{
+    int status = 0;
+    int n_ranks = atoi(argv[1]);
+    int rank_id = atoi(argv[2]);
+    ipport = argv[3];
+    g_npus = atoi(argv[4]);
+    f_rank = atoi(argv[5]);
+    f_npu = atoi(argv[6]);
+    fuc_data_type = argv[7];
+    uint64_t local_mem_size = 1024UL * 1024UL * 1024;
+    int32_t ret = shmem_set_conf_store_tls(false, nullptr, 0);
+    std::cout << "[SUCCESS] demo run start in rank " << rank_id << std::endl;
+    status = test_shmem_team_all_gather(rank_id, n_ranks, local_mem_size);
+    std::cout << "[SUCCESS] demo run success in rank " << rank_id << std::endl;
+    
+    return 0;
+}
diff --git a/examples/mte_put_perf/mte_put_perf_kernel.cpp b/examples/mte_put_perf/mte_put_perf_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9a85e4a38f54b6875ca8bb3d6dde51893a3126e3
--- /dev/null
+++ b/examples/mte_put_perf/mte_put_perf_kernel.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "kernel_operator.h"
+#include "shmem_api.h"
+#include "internal/host_device/shmemi_types.h"
+#include "internal/device/shmemi_device_common.h"
+#include "../utils/perf_utils.h"
+typedef float datatype;
+
+DEFINE_SHMEM_FUNC(device_put_demo, shmem_mte_put_mem_nbi, MTE2, MTE3, 1, 1, 5, 100);
\ No newline at end of file
diff --git a/examples/utils/perf_data_process.py b/examples/utils/perf_data_process.py
new file mode 100644
index 0000000000000000000000000000000000000000..98eca0c2a7b1749317e50e0f0492d63264370134
--- /dev/null
+++ b/examples/utils/perf_data_process.py
@@ -0,0 +1,328 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+#
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+import seaborn as sns
+import math
+import argparse
+import os
+import glob
+from math import ceil
+
+def bytes_to_human_readable(x, pos):
+    """Convert bytes to human-readable units (B, KB, MB, etc.)"""
+    if x < 1024:
+        return f"{x} B"
+    elif x < 1024 ** 2:
+        return f"{x // 1024} KB"
+    elif x < 1024 ** 3:
+        return f"{x // (1024 ** 2)} MB"
+    else:
+        return f"{x // (1024 ** 3)} GB"
+
+def parse_arguments():
+    """Parse command line arguments"""
+    parser = argparse.ArgumentParser(description='Process CSV data and generate performance analysis charts')
+    # Directory argument: process all CSV files in the directory
+    parser.add_argument(
+        '--dir', '-d',
+        type=str,
+        default=None,
+        help='Directory path containing CSV files (process all .csv files in this directory)'
+    )
+    # CSV path argument: process a single CSV file
+    parser.add_argument(
+        '--csv-path', '-c',
+        type=str,
+        default=None,
+        help='Full/relative path of a single CSV file (process this specific file)'
+    )
+    # Optional UBsize filter
+    parser.add_argument(
+        '--ubsize', '-u',
+        type=int,
+        nargs='+',
+        default=None,
+        help='Filter UBsize values to plot (multiple values allowed, e.g., -u 16 32)'
+    )
+    # Optional Coresize filter
+    parser.add_argument(
+        '--coresize', '-s',
+        type=int,
+        nargs='+',
+        default=None,
+        help='Filter Coresize values to plot (multiple values allowed, e.g., -s 8 16)'
+    )
+    args = parser.parse_args()
+
+    # Validate mutual exclusivity of -d and -c
+    if args.dir is None and args.csv_path is None:
+        parser.error('Either --dir (-d) or --csv-path (-c) must be specified')
+    if args.dir is not None and args.csv_path is not None:
+        parser.error('Only one of --dir (-d) or --csv-path (-c) can be specified')
+
+    return args
+
+def get_subplot_layout(n, default_cols=5):
+    """
+    Calculate subplot rows and columns based on the number of items
+    :param n: Number of items to plot
+    :param default_cols: Default number of columns for layout
+    :return: (rows, cols)
+    """
+    if n == 0:
+        return 1, 1  # Fallback layout for no data
+    cols = min(default_cols, n)
+    rows = ceil(n / cols)
+    return rows, cols
+
+def process_csv(csv_file, output_dir, ubsize_filter, coresize_filter):
+    """Process a single CSV file and generate charts to the specified output directory"""
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Read CSV file with error handling
+    try:
+        df = pd.read_csv(csv_file, skiprows=1, usecols=range(5))
+    except Exception as e:
+        print(f"Error reading CSV file {csv_file}: {e}")
+        return
+
+    # Calculate throughput (y value in GB/S)
+    def calculate_new_value(row):
+        col0_value = row.iloc[0]
+        col2_value = row.iloc[2]
+        col4_value = row.iloc[4]
+        return col0_value * col2_value / col4_value * 1000000 / (1024 ** 3)
+        
+    df['y'] = df.apply(calculate_new_value, axis=1)
+    ymin, ymax = 0, df['y'].max()
+    ymax = math.ceil(ymax / 10) * 10  # Round up to nearest 10
+
+    # Get CSV basename for chart naming
+    csv_basename = os.path.splitext(os.path.basename(csv_file))[0]
+
+    # -------------------------- Figure 1: UBsize_compare (CoreSize subplots) --------------------------
+    # Filter Coresize values (3rd column)
+    col3_values = sorted(df.iloc[:, 2].unique())
+    if coresize_filter is not None:
+        col3_values = [v for v in col3_values if v in coresize_filter]
+
+    if col3_values:
+        # Dynamic subplot layout (5 columns by default for Coresize)
+        core_rows, core_cols = get_subplot_layout(len(col3_values), default_cols=5)
+        fig, axs = plt.subplots(core_rows, core_cols, figsize=(core_cols*4, core_rows*4))
+        fig.subplots_adjust(right=0.8, hspace=0.5, wspace=0.3)
+
+        # Flatten axs to 1D array for easy iteration
+        if core_rows == 1 and core_cols == 1:
+            axs = np.array([axs])
+        elif core_rows == 1 or core_cols == 1:
+            axs = axs.flatten()
+        else:
+            axs = axs.flatten()
+
+        # Plot each Coresize subplot
+        for i, col3 in enumerate(col3_values):
+            if i >= len(axs):
+                break
+            ax = axs[i]
+            subset = df[df.iloc[:, 2] == col3]
+            # Plot each UBsize line (with filter)
+            for col4, group in subset.groupby(subset.columns[3]):
+                if ubsize_filter is not None and col4 not in ubsize_filter:
+                    continue
+                ax.plot(group.iloc[:, 0], group['y'], label=f'Col4={col4}')
+            # Set subplot properties
+            ax.set_title(f'core_size={col3}')
+            ax.set_xlabel('DataSize')
+            ax.set_ylabel('throughput(GB/S)')
+            ax.set_ylim(ymin, ymax)
+            ax.set_xscale('log')
+            ax.grid(True)
+            # Set x-axis ticks (power of 2)
+            max_val = subset.iloc[:, 0].max()
+            min_val = subset.iloc[:, 0].min()
+            ticks = [2 ** i for i in range(int(np.log2(min_val)), int(np.log2(max_val)) + 1)]
+            ax.xaxis.set_major_formatter(ticker.FuncFormatter(bytes_to_human_readable))
+            ax.xaxis.set_major_locator(ticker.FixedLocator(ticks))
+            ax.tick_params(axis='x', rotation=45)
+
+        # Hide unused subplots
+        for i in range(len(col3_values), len(axs)):
+            axs[i].set_visible(False)
+
+        # Add legend (if available)
+        if len(axs) > 0:
+            handles, labels = axs[0].get_legend_handles_labels()
+            if labels:
+                labels = [f'ubsize {label[4:]}' for label in labels]
+                fig.legend(handles, labels, loc='center right', bbox_to_anchor=(0.98, 0.5), fontsize=8)
+
+        # Save and close figure
+        fig1_path = os.path.join(output_dir, f'{csv_basename}_UBsize_compare.png')
+        plt.tight_layout(rect=[0, 0, 0.85, 1])
+        plt.savefig(fig1_path, bbox_inches='tight')
+        plt.close()
+        print(f"Generated: {fig1_path}")
+    else:
+        print(f"WARN: {csv_file} - No eligible Coresize values found, skipping UBsize_compare chart")
+
+    # -------------------------- Figure 2: Core_compare (UBsize subplots) --------------------------
+    # Filter UBsize values (4th column)
+    col4_values = sorted(df.iloc[:, 3].unique())
+    if ubsize_filter is not None:
+        col4_values = [v for v in col4_values if v in ubsize_filter]
+
+    if col4_values:
+        # Dynamic subplot layout (2 columns by default for UBsize)
+        ub_rows, ub_cols = get_subplot_layout(len(col4_values), default_cols=2)
+        fig, axs = plt.subplots(ub_rows, ub_cols, figsize=(ub_cols*8, ub_rows*8))
+        fig.subplots_adjust(right=0.8, hspace=0.5, wspace=0.3)
+
+        # Flatten axs to 1D array
+        if ub_rows == 1 and ub_cols == 1:
+            axs = np.array([axs])
+        elif ub_rows == 1 or ub_cols == 1:
+            axs = axs.flatten()
+        else:
+            axs = axs.flatten()
+
+        # Colormap configuration
+        yc = np.arange(21)
+        cmap = plt.cm.viridis
+        norm = plt.Normalize(vmin=np.min(yc), vmax=np.max(yc))
+
+        # Plot each UBsize subplot
+        for i, col4 in enumerate(col4_values):
+            if i >= len(axs):
+                break
+            ax = axs[i]
+            subset = df[df.iloc[:, 3] == col4]
+            # Plot each Coresize line (with filter)
+            for col3, group in subset.groupby(subset.columns[2]):
+                if coresize_filter is not None and col3 not in coresize_filter:
+                    continue
+                ax.plot(group.iloc[:, 0], group['y'], color=cmap(norm(col3)), label=f'Col3={col3}')
+            # Set subplot properties
+            ax.set_title(f'UBsize={col4}')
+            ax.set_xlabel('DataSize')
+            ax.set_ylabel('throughput(GB/S)')
+            ax.set_ylim(ymin, ymax)
+            ax.set_xscale('log')
+            ax.grid(True)
+            # Set x-axis ticks (power of 2)
+            max_val = subset.iloc[:, 0].max()
+            min_val = subset.iloc[:, 0].min()
+            ticks = [2 ** i for i in range(int(np.log2(min_val)), int(np.log2(max_val)) + 1)]
+            ax.xaxis.set_major_formatter(ticker.FuncFormatter(bytes_to_human_readable))
+            ax.xaxis.set_major_locator(ticker.FixedLocator(ticks))
+            ax.tick_params(axis='x', rotation=45)
+
+        # Hide unused subplots
+        for i in range(len(col4_values), len(axs)):
+            axs[i].set_visible(False)
+
+        # Add legend (if available)
+        if len(axs) > 0:
+            handles, labels = axs[0].get_legend_handles_labels()
+            if labels:
+                labels = [f'blockdims {label[4:]}' for label in labels]
+                fig.legend(handles, labels, loc='center right', bbox_to_anchor=(0.98, 0.5), fontsize=8)
+
+        # Save and close figure
+        fig2_path = os.path.join(output_dir, f'{csv_basename}_Core_compare.png')
+        plt.tight_layout(rect=[0, 0, 0.8, 1])
+        plt.savefig(fig2_path, bbox_inches='tight')
+        plt.close()
+        print(f"Generated: {fig2_path}")
+    else:
+        print(f"WARN: {csv_file} - No eligible UBsize values found, skipping Core_compare chart")
+
+    # -------------------------- Heatmaps (Filtered Data) --------------------------
+    # Filter data for heatmaps
+    heatmap_df = df.copy()
+    if ubsize_filter is not None:
+        heatmap_df = heatmap_df[heatmap_df.iloc[:, 3].isin(ubsize_filter)]
+    if coresize_filter is not None:
+        heatmap_df = heatmap_df[heatmap_df.iloc[:, 2].isin(coresize_filter)]
+
+    if heatmap_df.empty:
+        print(f"WARN: {csv_file} - No data left after filtering, skipping heatmaps")
+        return
+
+    # -------------------------- Heatmap 1: Max Throughput --------------------------
+    max_values = heatmap_df.groupby([heatmap_df.iloc[:, 2], heatmap_df.iloc[:, 3]])['y'].max().reset_index()
+    if not max_values.empty:
+        max_values_pivot = max_values.pivot(index=df.columns[3], columns=df.columns[2], values='y')
+        plt.figure(figsize=(20, 8))
+        ax = sns.heatmap(max_values_pivot, annot=True, cmap='coolwarm', fmt='.2f')
+        ax.invert_yaxis()
+        plt.title('Throughput max ')
+        plt.xlabel('Blockdim')
+        plt.ylabel('UBsize')
+        heatmap1_path = os.path.join(output_dir, f'{csv_basename}_y_max_heatmap.png')
+        plt.savefig(heatmap1_path, dpi=300, bbox_inches='tight')
+        plt.close()
+        print(f"Generated: {heatmap1_path}")
+    else:
+        print(f"WARN: {csv_file} - No max throughput data available, skipping max heatmap")
+
+    # -------------------------- Heatmap 2: Mean Throughput (>1MB) --------------------------
+    filtered_data = heatmap_df[heatmap_df.iloc[:, 0] >= 1048576].groupby([heatmap_df.iloc[:, 2], heatmap_df.iloc[:, 3]])['y'].mean().reset_index()
+    if not filtered_data.empty:
+        filtered_data_pivot = filtered_data.pivot(index=df.columns[3], columns=df.columns[2], values='y')
+        plt.figure(figsize=(20, 8))
+        ax = sns.heatmap(filtered_data_pivot, annot=True, cmap='coolwarm', fmt='.2f')
+        ax.invert_yaxis()
+        plt.title('Throughput mean (total data > 1 MB) ')
+        plt.xlabel('Blockdim')
+        plt.ylabel('UBsize')
+        heatmap2_path = os.path.join(output_dir, f'{csv_basename}_y_mean_heatmap.png')
+        plt.savefig(heatmap2_path, dpi=300, bbox_inches='tight')
+        plt.close()
+        print(f"Generated: {heatmap2_path}")
+    else:
+        print(f"WARN: {csv_file} - No mean throughput data available, skipping mean heatmap")
+
+def main():
+    args = parse_arguments()
+    ubsize_filter = args.ubsize
+    coresize_filter = args.coresize
+
+    if args.dir:
+        # Process all CSV files in directory
+        dir_path = os.path.abspath(args.dir)
+        if not os.path.isdir(dir_path):
+            print(f"Error: Directory {dir_path} does not exist")
+            return
+        csv_files = glob.glob(os.path.join(dir_path, '*.csv'))
+        if not csv_files:
+            print(f"Warning: No CSV files found in directory {dir_path}")
+            return
+        output_dir = os.path.join(dir_path, 'picture')
+        for csv_file in csv_files:
+            print(f"\nProcessing CSV file: {csv_file}")
+            process_csv(csv_file, output_dir, ubsize_filter, coresize_filter)
+    elif args.csv_path:
+        # Process single CSV file
+        csv_file = os.path.abspath(args.csv_path)
+        if not os.path.isfile(csv_file):
+            print(f"Error: CSV file {csv_file} does not exist")
+            return
+        output_dir = os.path.dirname(csv_file)
+        print(f"\nProcessing CSV file: {csv_file}")
+        process_csv(csv_file, output_dir, ubsize_filter, coresize_filter)
+
+    print("\nAll CSV files processed successfully!")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/utils/perf_utils.h b/examples/utils/perf_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..e40b52c6b5cfc5eae0037f24cb1856e63f49c6d7
--- /dev/null
+++ b/examples/utils/perf_utils.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef PERF_UTILS_H
+#define PERF_UTILS_H
+
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include "acl/acl.h"
+#include "shmem_api.h"
+enum ProcessType {
+    PERF_MTE_DEFAULT,
+    PERF_MTE_DEFAULT_PINGPONG,
+    PERF_MTE_CUSTOM,
+};
+// 定义一个函数，将数据写入 CSV 文件
+void writeCSV(const std::string& filename, const std::vector<std::vector<std::string>>& data) {
+    // 打开文件
+    std::ofstream outFile(filename);
+
+    // 检查文件是否成功打开
+    if (!outFile.is_open()) {
+        std::cerr << "Error: Unable to open file " << filename << std::endl;
+        return;
+    }
+
+    // 遍历数据并写入文件
+    for (const auto& row : data) {
+        for (size_t i = 0; i < row.size(); ++i) {
+            outFile << row[i];
+            if (i < row.size() - 1) {
+                outFile << ","; // 字段之间用逗号分隔
+            }
+        }
+        outFile << "\n"; // 每行结束后换行
+    }
+
+    // 关闭文件
+    outFile.close();
+}
+
+std::string intToString(int value) {
+    std::ostringstream oss;
+    oss << value;
+    return oss.str();
+}
+
+// 辅助函数：将 float 转换为字符串
+std::string floatToString(float value) {
+    std::ostringstream oss;
+    oss << value;
+    return oss.str();
+}
+
+
+#define MTE_PERF_FRAME(FUNC, STREAM, EXP_MIN, EXP_MAX, EXTRA_VEC, BLOCK_MIN, BLOCK_MAX)\
+do { \
+    int FRAME_status = 0;\
+    int FRAME_rank_id = shmem_my_pe();\
+    /* 1. 初始化CSV数据，包含固定表头 */ \
+    std::vector<std::vector<std::string>> csv_data = { \
+        {"DataSize/B", "Npus", "Blocks", "UBsize/KB", "CoreMaxTime/us", "CoreMinTime/us", "CoreMeanTime/us", "SingleCoreTime/us"} \
+    }; \
+    /* 2. 生成2的幂次数组powers_of_two */ \
+    std::vector<int> powers_of_two; \
+    for (int exponent = (EXP_MIN); exponent <= (EXP_MAX); ++exponent) { \
+        int value = static_cast<int>(std::pow(2, exponent)); \
+        powers_of_two.push_back(value); \
+    } \
+    std::vector<int> UBSizeLst = (EXTRA_VEC);\
+    /* 3. 嵌套循环：block_size(1~20) 遍历 + powers_of_two 遍历 */ \
+    for (int block_size = (BLOCK_MIN); block_size <= (BLOCK_MAX); block_size++) { \
+        if (block_size <= 0) continue;\
+        for (int datasize : powers_of_two) { \
+            if (datasize <= 0) continue; \
+            for (int ubsize : UBSizeLst){   \
+                if (ubsize <= 0) continue; \
+                void *dst_ptr = shmem_malloc(datasize * block_size);\
+                void *src_ptr = shmem_malloc(datasize * block_size);\
+                void *time_ptr = nullptr;\
+                FRAME_status = aclrtMalloc((void **) &(time_ptr), sizeof(datatype) * 2048, ACL_MEM_MALLOC_HUGE_FIRST);\
+                int all_size = (datasize * block_size);\
+                int trans_size = all_size / sizeof(datatype);\
+                std::vector<datatype> input(trans_size, 0);\
+                for (int i = 0; i < trans_size; i++) {\
+                    input[i] = (FRAME_rank_id + 10);\
+                }\
+                FRAME_status = aclrtMemcpy(src_ptr, all_size, input.data(), all_size, ACL_MEMCPY_HOST_TO_DEVICE);\
+                FUNC(block_size, (STREAM), (uint8_t *)dst_ptr, (uint8_t *)src_ptr,  (uint8_t *)time_ptr, trans_size, ubsize); \
+                FRAME_status = aclrtSynchronizeStream((STREAM));\
+                int32_t *src_host;\
+                FRAME_status = aclrtMallocHost(reinterpret_cast<void**>(&src_host), all_size);\
+                int32_t *dst_host;\
+                FRAME_status = aclrtMallocHost(reinterpret_cast<void**>(&dst_host), all_size);\
+                float *time_host;\
+                FRAME_status = aclrtMallocHost(reinterpret_cast<void**>(&time_host), sizeof(float)* 2048);\
+                FRAME_status = aclrtMemcpy(time_host, sizeof(float)*2048, time_ptr, sizeof(float)*2048, ACL_MEMCPY_DEVICE_TO_HOST);\
+                float *usetime = (float*)time_host;\
+                std::cout << "rank: " << FRAME_rank_id << " time: " << *usetime << " size: " << sizeof(float) * trans_size << std::endl;\
+                FRAME_status = aclrtMemcpy(src_host, sizeof(datatype), src_ptr, sizeof(datatype), ACL_MEMCPY_DEVICE_TO_HOST);\
+                FRAME_status = aclrtMemcpy(dst_host, sizeof(datatype), dst_ptr, sizeof(datatype), ACL_MEMCPY_DEVICE_TO_HOST);\
+                std::cout << "rank: " << FRAME_rank_id << " src: " << *(datatype*)src_host << "address: " << std::hex << static_cast<void*>(src_ptr) << std::endl;\
+                std::cout << "rank: " << FRAME_rank_id << " dst: " << *(datatype*)dst_host << "address: " << std::hex << static_cast<void*>(dst_ptr) << std::endl;\
+                float max_core_time = 0.0f;\
+                float min_core_time = 0.0f;\
+                float mean_core_time = 0.0f;\
+                std::vector<std::string> sub_data = {intToString(datasize), intToString(g_npus), intToString(block_size), intToString(ubsize), floatToString(max_core_time), floatToString(min_core_time), floatToString(mean_core_time)};\
+                for (int i = 0; i < block_size; i++) {\
+                    if (i == 0) {\
+                        max_core_time = *(usetime+i*16);\
+                        min_core_time = *(usetime+i*16);\
+                        mean_core_time = *(usetime+i*16);\
+                    } else {\
+                        if (*(usetime+i*16) > max_core_time){\
+                            max_core_time = *(usetime+i*16);\
+                        } \
+                        if (*(usetime+i*16) < min_core_time){\
+                            min_core_time = *(usetime+i*16);\
+                        }\
+                        mean_core_time += *(usetime+i*16);\
+                    }\
+                    sub_data.push_back(floatToString(*(usetime+i*16)));\
+                }\
+                sub_data[4] = floatToString(max_core_time);\
+                sub_data[5] = floatToString(min_core_time);\
+                sub_data[6] = floatToString(mean_core_time/block_size);\
+                csv_data.push_back(sub_data);\
+                FRAME_status = aclrtFreeHost(src_host);\
+                FRAME_status = aclrtFreeHost(dst_host);\
+                FRAME_status = aclrtFreeHost(time_host);\
+                FRAME_status = aclrtFree(time_ptr);\
+                shmem_free(dst_ptr);\
+                shmem_free(src_ptr);\
+                FRAME_status = aclrtSynchronizeStream(stream);\
+            }\
+        } \
+    } \
+    /* 7. 调用writeCSV函数落盘，文件名拼接rank_id生成唯一名称 */ \
+    writeCSV("output" + intToString(FRAME_rank_id) + ".csv", csv_data); \
+    std::cout << "CSV文件已生成：output" << FRAME_rank_id << ".csv\n"; \
+} while (0)
+
+#define BASIC_PARAM()\
+    rank = shmem_my_pe();\
+    rank_size = shmem_n_pes();\
+    gva_gm = (__gm__ datatype *)gva;\
+    dev_gm = (__gm__ datatype *)src_gva;\
+    time_gm = (__gm__ float *)time_gva;\
+    device_state = shmemi_get_state();\
+
+#define BASIC_PROCESS_PARAM(WARMUP, LOOP)\
+    copy_ub = device_state->mte_config.shmem_ub;\
+    uint32_t copy_ub_size = device_state->mte_config.ub_size;\
+    copy_ub_size = ubsize * 1024;\
+    copy_event_id = (AscendC::TEventID)device_state->mte_config.event_id;\
+    int32_t tile_length;\
+    tile_length = (int32_t)(elements/AscendC::GetBlockNum());\
+    uint64_t t11, t22;\
+    int32_t all_times = 0;\
+    int warmup = (WARMUP);\
+    int loop_test = (LOOP);\
+    uint64_t block_size = tile_length * sizeof(datatype);\
+
+#define BASIC_PINGPONG_PARAM(WARMUP, LOOP)\
+    BASIC_PROCESS_PARAM(WARMUP, LOOP);\
+    int pingpongId = 0;\
+    uint32_t copy_ub_num = copy_ub_size / sizeof(datatype);\
+    __ubuf__ datatype *ping_buff = reinterpret_cast<__ubuf__ datatype *>(uint64_t(copy_ub));\
+    __ubuf__ datatype *pong_buff = reinterpret_cast<__ubuf__ datatype *>(uint64_t(copy_ub + copy_ub_size));\
+
+#define MY_CUSTOM_PARAM(WARMUP, LOOP)\
+
+
+#define GENERAL_PROCESS(START_FLAG, END_FLAG, CUSTOM_PARAM, CUSTOM_PROCESS, WARMUP, LOOP)\
+        CUSTOM_PARAM((WARMUP), (LOOP));\
+        AscendC::PipeBarrier<PIPE_ALL>();\
+        for (int i = 0; i < (warmup + loop_test); ++i) {\
+            if (AscendC::GetBlockIdx()>=elements){\
+                break;\
+            }\
+            t11 = get_sys_cnt();\
+            AscendC::SetFlag<AscendC::HardEvent::S_##START_FLAG>(EVENT_ID0);\
+            AscendC::WaitFlag<AscendC::HardEvent::S_##START_FLAG>(EVENT_ID0);\
+            CUSTOM_PROCESS;\
+            AscendC::SetFlag<AscendC::HardEvent::END_FLAG##_S>(EVENT_ID0);\
+            AscendC::WaitFlag<AscendC::HardEvent::END_FLAG##_S>(EVENT_ID0);\
+            t22 = get_sys_cnt();\
+            if (i >= warmup) {\
+                all_times += (int32_t)(t22 - t11);\
+            }\
+        }\
+        float time_us1 = ((int32_t)(all_times))*(float)(0.02)/loop_test;\
+        if (AscendC::GetBlockIdx()>=elements){\
+                time_us1 = 0.0f;\
+        }\
+        *((__gm__ float*)time_gm + AscendC::GetBlockIdx()*16) = (float)time_us1;\
+
+#define GENERAL_PINGPONG_PROCESS(START_FLAG, END_FLAG, CUSTOM_PARAM, CUSTOM_PROCESS, WARMUP, LOOP)\
+        CUSTOM_PARAM((WARMUP), (LOOP));\
+        AscendC::PipeBarrier<PIPE_ALL>();\
+        for (int i = 0; i < (warmup + loop_test); ++i) {\
+            if (AscendC::GetBlockIdx()>=elements){\
+                break;\
+            }\
+            int32_t num_total = tile_length;\
+            int32_t offset = 0;\
+            t11 = get_sys_cnt();\
+            for (int i = 0; num_total > 0; i++) {\
+                AscendC::TEventID EVENT_ID = pingpongId == 0 ? EVENT_ID0 : EVENT_ID1;\
+                __ubuf__ datatype *buf = pingpongId == 0 ? ping_buff : pong_buff;\
+                uint32_t copy_num = num_total > copy_ub_num ? copy_ub_num : num_total;\
+                AscendC::SetFlag<AscendC::HardEvent::S_##START_FLAG>(EVENT_ID);\
+                AscendC::WaitFlag<AscendC::HardEvent::S_##START_FLAG>(EVENT_ID);\
+                CUSTOM_PROCESS;\
+                AscendC::SetFlag<AscendC::HardEvent::END_FLAG##_S>(EVENT_ID);\
+                AscendC::WaitFlag<AscendC::HardEvent::END_FLAG##_S>(EVENT_ID);\
+                offset += copy_num;\
+                num_total -= copy_num;\
+                pingpongId = 1 - pingpongId;\
+            }\
+            t22 = get_sys_cnt();\
+            if (i >= warmup) {\
+                all_times += (int32_t)(t22 - t11);\
+            }\
+        }\
+        float time_us1 = ((int32_t)(all_times))*(float)(0.02)/loop_test;\
+        if (AscendC::GetBlockIdx()>=elements){\
+                time_us1 = 0.0f;\
+        }\
+        *((__gm__ float*)time_gm + AscendC::GetBlockIdx()*16) = (float)time_us1;\
+
+#define CUSTOM_PROCESS(START_FLAG, END_FLAG, CUSTOM_PARAM, CUSTOM_PROCESS, WARMUP, LOOP)\
+
+#define DEFINE_SHMEM_FUNC(TEST_FUNC_NAME, SHMEM_MTE_API, START_FLAG, END_FLAG, NPU_OFFSET, PERF_TYPE, WARMUP, LOOP)\
+class TEST_FUNC_NAME##_kernel_op {\
+public:\
+    __aicore__ inline TEST_FUNC_NAME##_kernel_op() {}\
+    __aicore__ inline void Init(GM_ADDR gva, GM_ADDR src_gva, GM_ADDR time_gva)\
+    {\
+        BASIC_PARAM();\
+    }\
+    __aicore__ inline void Process(int elements, int ubsize)\
+    {\
+        GENERAL_PROCESS(START_FLAG, END_FLAG, BASIC_PROCESS_PARAM,\
+                        SHMEM_MTE_API(gva_gm + tile_length * AscendC::GetBlockIdx(), dev_gm + tile_length * AscendC::GetBlockIdx(), reinterpret_cast<__ubuf__ datatype*>(copy_ub),\
+                            copy_ub_size, tile_length, (rank_size + rank + (NPU_OFFSET))%rank_size, copy_event_id),\
+                        WARMUP, LOOP);\
+    }\
+    \
+    __aicore__ inline void Process_PingPong(int elements, int ubsize)\
+    {\
+        GENERAL_PINGPONG_PROCESS(START_FLAG, END_FLAG, BASIC_PINGPONG_PARAM,\
+                        SHMEM_MTE_API(gva_gm + tile_length * AscendC::GetBlockIdx() + offset, dev_gm + tile_length * AscendC::GetBlockIdx() + offset, \
+                                        buf, copy_ub_size, copy_num, (rank_size + rank + (NPU_OFFSET))%rank_size, EVENT_ID),\
+                        WARMUP, LOOP);\
+    }\
+    __aicore__ inline void Process_Custom(int elements, int ubsize)\
+    {\
+        CUSTOM_PROCESS(START_FLAG, END_FLAG, MY_CUSTOM_PARAM,\
+                        SHMEM_MTE_API,\
+                        WARMUP, LOOP);\
+    }\
+private:\
+    __gm__ datatype *gva_gm;\
+    __gm__ datatype *dev_gm;\
+    __gm__ float *time_gm;\
+    __gm__ shmemi_device_host_state_t *device_state;\
+    uint64_t copy_ub;\
+    uint32_t copy_ub_size;\
+    AscendC::TEventID copy_event_id;\
+    int64_t rank;\
+    int64_t rank_size;\
+};\
+extern "C" __global__ __aicore__ void TEST_FUNC_NAME##_kernel(GM_ADDR gva, GM_ADDR src_gva, GM_ADDR time_gva, int elements, int ubsize)\
+{\
+    TEST_FUNC_NAME##_kernel_op op;\
+    op.Init(gva, src_gva, time_gva);\
+    if ((PERF_TYPE) == PERF_MTE_DEFAULT) {\
+        op.Process(elements, ubsize);\
+    } else if ((PERF_TYPE) == PERF_MTE_DEFAULT_PINGPONG) {\
+        op.Process_PingPong(elements, ubsize);\
+    } else if ((PERF_TYPE) == PERF_MTE_CUSTOM) {\
+        op.Process_Custom(elements, ubsize);\
+    } else {\
+        return;\
+    }\
+}\
+void TEST_FUNC_NAME(uint32_t block_dim, void* stream, uint8_t* gva, uint8_t* src_gva, uint8_t* time_gva, int elements, int ubsize)\
+{\
+    TEST_FUNC_NAME##_kernel<<<block_dim, nullptr, stream>>>(gva, src_gva, time_gva, elements, ubsize);\
+}\
+
+#endif
\ No newline at end of file
diff --git a/examples/utils/utils.py b/examples/utils/utils.py
index 36914cfd86bc4cfc335f53043acef1f1596c502b..84fd35f90423d90b293cb5fa5691ad04d27236c4 100644
--- a/examples/utils/utils.py
+++ b/examples/utils/utils.py
@@ -1,3 +1,12 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+#
 from enum import IntEnum
 import numpy as np
 import torch