diff --git a/examples/mte_put_perf/CMakeLists.txt b/examples/mte_put_perf/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8d6f29af4ea7d03d71ea80ad4e4f69758662e20 --- /dev/null +++ b/examples/mte_put_perf/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +shmem_add_collective_example(mte_put_perf) \ No newline at end of file diff --git a/examples/mte_put_perf/main.cpp b/examples/mte_put_perf/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8aaee25641fe846f9b79912a9eab9f575d11373e --- /dev/null +++ b/examples/mte_put_perf/main.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include + +#include "../utils/perf_utils.h" + +typedef float datatype; + +int g_npus = 8; +const char *ipport; +const char *fuc_data_type; +int f_rank = 0; +int f_npu = 0; +extern void device_put_demo(uint32_t block_dim, void* stream, uint8_t* gva, uint8_t* src_gva, uint8_t* time_gva, int elements, int ubsize); + +int test_shmem_team_all_gather(int rank_id, int n_ranks, uint64_t local_mem_size) +{ + // 初始化ACL和SHMEM + int32_t device_id = rank_id % g_npus + f_npu; + int status = 0; + aclrtStream stream = nullptr; + + status = aclInit(nullptr); + status = aclrtSetDevice(device_id); + status = aclrtCreateStream(&stream); + + shmem_init_attr_t *attributes; + status = shmem_set_attr(rank_id, n_ranks, local_mem_size, ipport, &attributes); + status = shmem_init_attr(attributes); + std::vector extra_vec = {64}; + + MTE_PERF_FRAME(device_put_demo, stream, 3, 25, extra_vec, 1, 8); + + status = shmem_finalize(); + status = aclrtDestroyStream(stream); + status = aclrtResetDevice(device_id); + status = aclFinalize(); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int status = 0; + int n_ranks = atoi(argv[1]); + int rank_id = atoi(argv[2]); + ipport = argv[3]; + g_npus = atoi(argv[4]); + f_rank = atoi(argv[5]); + f_npu = atoi(argv[6]); + fuc_data_type = argv[7]; + uint64_t local_mem_size = 1024UL * 1024UL * 1024; + int32_t ret = shmem_set_conf_store_tls(false, nullptr, 0); + std::cout << "[SUCCESS] demo run start in rank " << rank_id << std::endl; + status = test_shmem_team_all_gather(rank_id, n_ranks, local_mem_size); + std::cout << "[SUCCESS] demo run success in rank " << rank_id << std::endl; + + return 0; +} diff --git a/examples/mte_put_perf/mte_put_perf_kernel.cpp b/examples/mte_put_perf/mte_put_perf_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9a85e4a38f54b6875ca8bb3d6dde51893a3126e3 --- /dev/null +++ b/examples/mte_put_perf/mte_put_perf_kernel.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "shmem_api.h" +#include "internal/host_device/shmemi_types.h" +#include "internal/device/shmemi_device_common.h" +#include "../utils/perf_utils.h" +typedef float datatype; + +DEFINE_SHMEM_FUNC(device_put_demo, shmem_mte_put_mem_nbi, MTE2, MTE3, 1, 1, 5, 100); \ No newline at end of file diff --git a/examples/utils/perf_data_process.py b/examples/utils/perf_data_process.py new file mode 100644 index 0000000000000000000000000000000000000000..98eca0c2a7b1749317e50e0f0492d63264370134 --- /dev/null +++ b/examples/utils/perf_data_process.py @@ -0,0 +1,328 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import numpy as np +import seaborn as sns +import math +import argparse +import os +import glob +from math import ceil + +def bytes_to_human_readable(x, pos): + """Convert bytes to human-readable units (B, KB, MB, etc.)""" + if x < 1024: + return f"{x} B" + elif x < 1024 ** 2: + return f"{x // 1024} KB" + elif x < 1024 ** 3: + return f"{x // (1024 ** 2)} MB" + else: + return f"{x // (1024 ** 3)} GB" + +def parse_arguments(): + """Parse command line arguments""" + parser = argparse.ArgumentParser(description='Process CSV data and generate performance analysis charts') + # Directory argument: process all CSV files in the directory + parser.add_argument( + '--dir', '-d', + type=str, + default=None, + help='Directory path containing CSV files (process all .csv files in this directory)' + ) + # CSV path argument: process a single CSV file + parser.add_argument( + '--csv-path', '-c', + type=str, + default=None, + help='Full/relative path of a single CSV file (process this specific file)' + ) + # Optional UBsize filter + parser.add_argument( + '--ubsize', '-u', + type=int, + nargs='+', + default=None, + help='Filter UBsize values to plot (multiple values allowed, e.g., -u 16 32)' + ) + # Optional Coresize filter + parser.add_argument( + '--coresize', '-s', + type=int, + nargs='+', + default=None, + help='Filter Coresize values to plot (multiple values allowed, e.g., -s 8 16)' + ) + args = parser.parse_args() + + # Validate mutual exclusivity of -d and -c + if args.dir is None and args.csv_path is None: + parser.error('Either --dir (-d) or --csv-path (-c) must be specified') + if args.dir is not None and args.csv_path is not None: + parser.error('Only one of --dir (-d) or --csv-path (-c) can be specified') + + return args + +def get_subplot_layout(n, default_cols=5): + """ + Calculate subplot rows and columns based on the number of items + :param n: Number of items to plot + :param default_cols: Default number of columns for layout + :return: (rows, cols) + """ + if n == 0: + return 1, 1 # Fallback layout for no data + cols = min(default_cols, n) + rows = ceil(n / cols) + return rows, cols + +def process_csv(csv_file, output_dir, ubsize_filter, coresize_filter): + """Process a single CSV file and generate charts to the specified output directory""" + os.makedirs(output_dir, exist_ok=True) + + # Read CSV file with error handling + try: + df = pd.read_csv(csv_file, skiprows=1, usecols=range(5)) + except Exception as e: + print(f"Error reading CSV file {csv_file}: {e}") + return + + # Calculate throughput (y value in GB/S) + def calculate_new_value(row): + col0_value = row.iloc[0] + col2_value = row.iloc[2] + col4_value = row.iloc[4] + return col0_value * col2_value / col4_value * 1000000 / (1024 ** 3) + + df['y'] = df.apply(calculate_new_value, axis=1) + ymin, ymax = 0, df['y'].max() + ymax = math.ceil(ymax / 10) * 10 # Round up to nearest 10 + + # Get CSV basename for chart naming + csv_basename = os.path.splitext(os.path.basename(csv_file))[0] + + # -------------------------- Figure 1: UBsize_compare (CoreSize subplots) -------------------------- + # Filter Coresize values (3rd column) + col3_values = sorted(df.iloc[:, 2].unique()) + if coresize_filter is not None: + col3_values = [v for v in col3_values if v in coresize_filter] + + if col3_values: + # Dynamic subplot layout (5 columns by default for Coresize) + core_rows, core_cols = get_subplot_layout(len(col3_values), default_cols=5) + fig, axs = plt.subplots(core_rows, core_cols, figsize=(core_cols*4, core_rows*4)) + fig.subplots_adjust(right=0.8, hspace=0.5, wspace=0.3) + + # Flatten axs to 1D array for easy iteration + if core_rows == 1 and core_cols == 1: + axs = np.array([axs]) + elif core_rows == 1 or core_cols == 1: + axs = axs.flatten() + else: + axs = axs.flatten() + + # Plot each Coresize subplot + for i, col3 in enumerate(col3_values): + if i >= len(axs): + break + ax = axs[i] + subset = df[df.iloc[:, 2] == col3] + # Plot each UBsize line (with filter) + for col4, group in subset.groupby(subset.columns[3]): + if ubsize_filter is not None and col4 not in ubsize_filter: + continue + ax.plot(group.iloc[:, 0], group['y'], label=f'Col4={col4}') + # Set subplot properties + ax.set_title(f'core_size={col3}') + ax.set_xlabel('DataSize') + ax.set_ylabel('throughput(GB/S)') + ax.set_ylim(ymin, ymax) + ax.set_xscale('log') + ax.grid(True) + # Set x-axis ticks (power of 2) + max_val = subset.iloc[:, 0].max() + min_val = subset.iloc[:, 0].min() + ticks = [2 ** i for i in range(int(np.log2(min_val)), int(np.log2(max_val)) + 1)] + ax.xaxis.set_major_formatter(ticker.FuncFormatter(bytes_to_human_readable)) + ax.xaxis.set_major_locator(ticker.FixedLocator(ticks)) + ax.tick_params(axis='x', rotation=45) + + # Hide unused subplots + for i in range(len(col3_values), len(axs)): + axs[i].set_visible(False) + + # Add legend (if available) + if len(axs) > 0: + handles, labels = axs[0].get_legend_handles_labels() + if labels: + labels = [f'ubsize {label[4:]}' for label in labels] + fig.legend(handles, labels, loc='center right', bbox_to_anchor=(0.98, 0.5), fontsize=8) + + # Save and close figure + fig1_path = os.path.join(output_dir, f'{csv_basename}_UBsize_compare.png') + plt.tight_layout(rect=[0, 0, 0.85, 1]) + plt.savefig(fig1_path, bbox_inches='tight') + plt.close() + print(f"Generated: {fig1_path}") + else: + print(f"WARN: {csv_file} - No eligible Coresize values found, skipping UBsize_compare chart") + + # -------------------------- Figure 2: Core_compare (UBsize subplots) -------------------------- + # Filter UBsize values (4th column) + col4_values = sorted(df.iloc[:, 3].unique()) + if ubsize_filter is not None: + col4_values = [v for v in col4_values if v in ubsize_filter] + + if col4_values: + # Dynamic subplot layout (2 columns by default for UBsize) + ub_rows, ub_cols = get_subplot_layout(len(col4_values), default_cols=2) + fig, axs = plt.subplots(ub_rows, ub_cols, figsize=(ub_cols*8, ub_rows*8)) + fig.subplots_adjust(right=0.8, hspace=0.5, wspace=0.3) + + # Flatten axs to 1D array + if ub_rows == 1 and ub_cols == 1: + axs = np.array([axs]) + elif ub_rows == 1 or ub_cols == 1: + axs = axs.flatten() + else: + axs = axs.flatten() + + # Colormap configuration + yc = np.arange(21) + cmap = plt.cm.viridis + norm = plt.Normalize(vmin=np.min(yc), vmax=np.max(yc)) + + # Plot each UBsize subplot + for i, col4 in enumerate(col4_values): + if i >= len(axs): + break + ax = axs[i] + subset = df[df.iloc[:, 3] == col4] + # Plot each Coresize line (with filter) + for col3, group in subset.groupby(subset.columns[2]): + if coresize_filter is not None and col3 not in coresize_filter: + continue + ax.plot(group.iloc[:, 0], group['y'], color=cmap(norm(col3)), label=f'Col3={col3}') + # Set subplot properties + ax.set_title(f'UBsize={col4}') + ax.set_xlabel('DataSize') + ax.set_ylabel('throughput(GB/S)') + ax.set_ylim(ymin, ymax) + ax.set_xscale('log') + ax.grid(True) + # Set x-axis ticks (power of 2) + max_val = subset.iloc[:, 0].max() + min_val = subset.iloc[:, 0].min() + ticks = [2 ** i for i in range(int(np.log2(min_val)), int(np.log2(max_val)) + 1)] + ax.xaxis.set_major_formatter(ticker.FuncFormatter(bytes_to_human_readable)) + ax.xaxis.set_major_locator(ticker.FixedLocator(ticks)) + ax.tick_params(axis='x', rotation=45) + + # Hide unused subplots + for i in range(len(col4_values), len(axs)): + axs[i].set_visible(False) + + # Add legend (if available) + if len(axs) > 0: + handles, labels = axs[0].get_legend_handles_labels() + if labels: + labels = [f'blockdims {label[4:]}' for label in labels] + fig.legend(handles, labels, loc='center right', bbox_to_anchor=(0.98, 0.5), fontsize=8) + + # Save and close figure + fig2_path = os.path.join(output_dir, f'{csv_basename}_Core_compare.png') + plt.tight_layout(rect=[0, 0, 0.8, 1]) + plt.savefig(fig2_path, bbox_inches='tight') + plt.close() + print(f"Generated: {fig2_path}") + else: + print(f"WARN: {csv_file} - No eligible UBsize values found, skipping Core_compare chart") + + # -------------------------- Heatmaps (Filtered Data) -------------------------- + # Filter data for heatmaps + heatmap_df = df.copy() + if ubsize_filter is not None: + heatmap_df = heatmap_df[heatmap_df.iloc[:, 3].isin(ubsize_filter)] + if coresize_filter is not None: + heatmap_df = heatmap_df[heatmap_df.iloc[:, 2].isin(coresize_filter)] + + if heatmap_df.empty: + print(f"WARN: {csv_file} - No data left after filtering, skipping heatmaps") + return + + # -------------------------- Heatmap 1: Max Throughput -------------------------- + max_values = heatmap_df.groupby([heatmap_df.iloc[:, 2], heatmap_df.iloc[:, 3]])['y'].max().reset_index() + if not max_values.empty: + max_values_pivot = max_values.pivot(index=df.columns[3], columns=df.columns[2], values='y') + plt.figure(figsize=(20, 8)) + ax = sns.heatmap(max_values_pivot, annot=True, cmap='coolwarm', fmt='.2f') + ax.invert_yaxis() + plt.title('Throughput max ') + plt.xlabel('Blockdim') + plt.ylabel('UBsize') + heatmap1_path = os.path.join(output_dir, f'{csv_basename}_y_max_heatmap.png') + plt.savefig(heatmap1_path, dpi=300, bbox_inches='tight') + plt.close() + print(f"Generated: {heatmap1_path}") + else: + print(f"WARN: {csv_file} - No max throughput data available, skipping max heatmap") + + # -------------------------- Heatmap 2: Mean Throughput (>1MB) -------------------------- + filtered_data = heatmap_df[heatmap_df.iloc[:, 0] >= 1048576].groupby([heatmap_df.iloc[:, 2], heatmap_df.iloc[:, 3]])['y'].mean().reset_index() + if not filtered_data.empty: + filtered_data_pivot = filtered_data.pivot(index=df.columns[3], columns=df.columns[2], values='y') + plt.figure(figsize=(20, 8)) + ax = sns.heatmap(filtered_data_pivot, annot=True, cmap='coolwarm', fmt='.2f') + ax.invert_yaxis() + plt.title('Throughput mean (total data > 1 MB) ') + plt.xlabel('Blockdim') + plt.ylabel('UBsize') + heatmap2_path = os.path.join(output_dir, f'{csv_basename}_y_mean_heatmap.png') + plt.savefig(heatmap2_path, dpi=300, bbox_inches='tight') + plt.close() + print(f"Generated: {heatmap2_path}") + else: + print(f"WARN: {csv_file} - No mean throughput data available, skipping mean heatmap") + +def main(): + args = parse_arguments() + ubsize_filter = args.ubsize + coresize_filter = args.coresize + + if args.dir: + # Process all CSV files in directory + dir_path = os.path.abspath(args.dir) + if not os.path.isdir(dir_path): + print(f"Error: Directory {dir_path} does not exist") + return + csv_files = glob.glob(os.path.join(dir_path, '*.csv')) + if not csv_files: + print(f"Warning: No CSV files found in directory {dir_path}") + return + output_dir = os.path.join(dir_path, 'picture') + for csv_file in csv_files: + print(f"\nProcessing CSV file: {csv_file}") + process_csv(csv_file, output_dir, ubsize_filter, coresize_filter) + elif args.csv_path: + # Process single CSV file + csv_file = os.path.abspath(args.csv_path) + if not os.path.isfile(csv_file): + print(f"Error: CSV file {csv_file} does not exist") + return + output_dir = os.path.dirname(csv_file) + print(f"\nProcessing CSV file: {csv_file}") + process_csv(csv_file, output_dir, ubsize_filter, coresize_filter) + + print("\nAll CSV files processed successfully!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/utils/perf_utils.h b/examples/utils/perf_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..e40b52c6b5cfc5eae0037f24cb1856e63f49c6d7 --- /dev/null +++ b/examples/utils/perf_utils.h @@ -0,0 +1,304 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef PERF_UTILS_H +#define PERF_UTILS_H + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "shmem_api.h" +enum ProcessType { + PERF_MTE_DEFAULT, + PERF_MTE_DEFAULT_PINGPONG, + PERF_MTE_CUSTOM, +}; +// 定义一个函数,将数据写入 CSV 文件 +void writeCSV(const std::string& filename, const std::vector>& data) { + // 打开文件 + std::ofstream outFile(filename); + + // 检查文件是否成功打开 + if (!outFile.is_open()) { + std::cerr << "Error: Unable to open file " << filename << std::endl; + return; + } + + // 遍历数据并写入文件 + for (const auto& row : data) { + for (size_t i = 0; i < row.size(); ++i) { + outFile << row[i]; + if (i < row.size() - 1) { + outFile << ","; // 字段之间用逗号分隔 + } + } + outFile << "\n"; // 每行结束后换行 + } + + // 关闭文件 + outFile.close(); +} + +std::string intToString(int value) { + std::ostringstream oss; + oss << value; + return oss.str(); +} + +// 辅助函数:将 float 转换为字符串 +std::string floatToString(float value) { + std::ostringstream oss; + oss << value; + return oss.str(); +} + + +#define MTE_PERF_FRAME(FUNC, STREAM, EXP_MIN, EXP_MAX, EXTRA_VEC, BLOCK_MIN, BLOCK_MAX)\ +do { \ + int FRAME_status = 0;\ + int FRAME_rank_id = shmem_my_pe();\ + /* 1. 初始化CSV数据,包含固定表头 */ \ + std::vector> csv_data = { \ + {"DataSize/B", "Npus", "Blocks", "UBsize/KB", "CoreMaxTime/us", "CoreMinTime/us", "CoreMeanTime/us", "SingleCoreTime/us"} \ + }; \ + /* 2. 生成2的幂次数组powers_of_two */ \ + std::vector powers_of_two; \ + for (int exponent = (EXP_MIN); exponent <= (EXP_MAX); ++exponent) { \ + int value = static_cast(std::pow(2, exponent)); \ + powers_of_two.push_back(value); \ + } \ + std::vector UBSizeLst = (EXTRA_VEC);\ + /* 3. 嵌套循环:block_size(1~20) 遍历 + powers_of_two 遍历 */ \ + for (int block_size = (BLOCK_MIN); block_size <= (BLOCK_MAX); block_size++) { \ + if (block_size <= 0) continue;\ + for (int datasize : powers_of_two) { \ + if (datasize <= 0) continue; \ + for (int ubsize : UBSizeLst){ \ + if (ubsize <= 0) continue; \ + void *dst_ptr = shmem_malloc(datasize * block_size);\ + void *src_ptr = shmem_malloc(datasize * block_size);\ + void *time_ptr = nullptr;\ + FRAME_status = aclrtMalloc((void **) &(time_ptr), sizeof(datatype) * 2048, ACL_MEM_MALLOC_HUGE_FIRST);\ + int all_size = (datasize * block_size);\ + int trans_size = all_size / sizeof(datatype);\ + std::vector input(trans_size, 0);\ + for (int i = 0; i < trans_size; i++) {\ + input[i] = (FRAME_rank_id + 10);\ + }\ + FRAME_status = aclrtMemcpy(src_ptr, all_size, input.data(), all_size, ACL_MEMCPY_HOST_TO_DEVICE);\ + FUNC(block_size, (STREAM), (uint8_t *)dst_ptr, (uint8_t *)src_ptr, (uint8_t *)time_ptr, trans_size, ubsize); \ + FRAME_status = aclrtSynchronizeStream((STREAM));\ + int32_t *src_host;\ + FRAME_status = aclrtMallocHost(reinterpret_cast(&src_host), all_size);\ + int32_t *dst_host;\ + FRAME_status = aclrtMallocHost(reinterpret_cast(&dst_host), all_size);\ + float *time_host;\ + FRAME_status = aclrtMallocHost(reinterpret_cast(&time_host), sizeof(float)* 2048);\ + FRAME_status = aclrtMemcpy(time_host, sizeof(float)*2048, time_ptr, sizeof(float)*2048, ACL_MEMCPY_DEVICE_TO_HOST);\ + float *usetime = (float*)time_host;\ + std::cout << "rank: " << FRAME_rank_id << " time: " << *usetime << " size: " << sizeof(float) * trans_size << std::endl;\ + FRAME_status = aclrtMemcpy(src_host, sizeof(datatype), src_ptr, sizeof(datatype), ACL_MEMCPY_DEVICE_TO_HOST);\ + FRAME_status = aclrtMemcpy(dst_host, sizeof(datatype), dst_ptr, sizeof(datatype), ACL_MEMCPY_DEVICE_TO_HOST);\ + std::cout << "rank: " << FRAME_rank_id << " src: " << *(datatype*)src_host << "address: " << std::hex << static_cast(src_ptr) << std::endl;\ + std::cout << "rank: " << FRAME_rank_id << " dst: " << *(datatype*)dst_host << "address: " << std::hex << static_cast(dst_ptr) << std::endl;\ + float max_core_time = 0.0f;\ + float min_core_time = 0.0f;\ + float mean_core_time = 0.0f;\ + std::vector sub_data = {intToString(datasize), intToString(g_npus), intToString(block_size), intToString(ubsize), floatToString(max_core_time), floatToString(min_core_time), floatToString(mean_core_time)};\ + for (int i = 0; i < block_size; i++) {\ + if (i == 0) {\ + max_core_time = *(usetime+i*16);\ + min_core_time = *(usetime+i*16);\ + mean_core_time = *(usetime+i*16);\ + } else {\ + if (*(usetime+i*16) > max_core_time){\ + max_core_time = *(usetime+i*16);\ + } \ + if (*(usetime+i*16) < min_core_time){\ + min_core_time = *(usetime+i*16);\ + }\ + mean_core_time += *(usetime+i*16);\ + }\ + sub_data.push_back(floatToString(*(usetime+i*16)));\ + }\ + sub_data[4] = floatToString(max_core_time);\ + sub_data[5] = floatToString(min_core_time);\ + sub_data[6] = floatToString(mean_core_time/block_size);\ + csv_data.push_back(sub_data);\ + FRAME_status = aclrtFreeHost(src_host);\ + FRAME_status = aclrtFreeHost(dst_host);\ + FRAME_status = aclrtFreeHost(time_host);\ + FRAME_status = aclrtFree(time_ptr);\ + shmem_free(dst_ptr);\ + shmem_free(src_ptr);\ + FRAME_status = aclrtSynchronizeStream(stream);\ + }\ + } \ + } \ + /* 7. 调用writeCSV函数落盘,文件名拼接rank_id生成唯一名称 */ \ + writeCSV("output" + intToString(FRAME_rank_id) + ".csv", csv_data); \ + std::cout << "CSV文件已生成:output" << FRAME_rank_id << ".csv\n"; \ +} while (0) + +#define BASIC_PARAM()\ + rank = shmem_my_pe();\ + rank_size = shmem_n_pes();\ + gva_gm = (__gm__ datatype *)gva;\ + dev_gm = (__gm__ datatype *)src_gva;\ + time_gm = (__gm__ float *)time_gva;\ + device_state = shmemi_get_state();\ + +#define BASIC_PROCESS_PARAM(WARMUP, LOOP)\ + copy_ub = device_state->mte_config.shmem_ub;\ + uint32_t copy_ub_size = device_state->mte_config.ub_size;\ + copy_ub_size = ubsize * 1024;\ + copy_event_id = (AscendC::TEventID)device_state->mte_config.event_id;\ + int32_t tile_length;\ + tile_length = (int32_t)(elements/AscendC::GetBlockNum());\ + uint64_t t11, t22;\ + int32_t all_times = 0;\ + int warmup = (WARMUP);\ + int loop_test = (LOOP);\ + uint64_t block_size = tile_length * sizeof(datatype);\ + +#define BASIC_PINGPONG_PARAM(WARMUP, LOOP)\ + BASIC_PROCESS_PARAM(WARMUP, LOOP);\ + int pingpongId = 0;\ + uint32_t copy_ub_num = copy_ub_size / sizeof(datatype);\ + __ubuf__ datatype *ping_buff = reinterpret_cast<__ubuf__ datatype *>(uint64_t(copy_ub));\ + __ubuf__ datatype *pong_buff = reinterpret_cast<__ubuf__ datatype *>(uint64_t(copy_ub + copy_ub_size));\ + +#define MY_CUSTOM_PARAM(WARMUP, LOOP)\ + + +#define GENERAL_PROCESS(START_FLAG, END_FLAG, CUSTOM_PARAM, CUSTOM_PROCESS, WARMUP, LOOP)\ + CUSTOM_PARAM((WARMUP), (LOOP));\ + AscendC::PipeBarrier();\ + for (int i = 0; i < (warmup + loop_test); ++i) {\ + if (AscendC::GetBlockIdx()>=elements){\ + break;\ + }\ + t11 = get_sys_cnt();\ + AscendC::SetFlag(EVENT_ID0);\ + AscendC::WaitFlag(EVENT_ID0);\ + CUSTOM_PROCESS;\ + AscendC::SetFlag(EVENT_ID0);\ + AscendC::WaitFlag(EVENT_ID0);\ + t22 = get_sys_cnt();\ + if (i >= warmup) {\ + all_times += (int32_t)(t22 - t11);\ + }\ + }\ + float time_us1 = ((int32_t)(all_times))*(float)(0.02)/loop_test;\ + if (AscendC::GetBlockIdx()>=elements){\ + time_us1 = 0.0f;\ + }\ + *((__gm__ float*)time_gm + AscendC::GetBlockIdx()*16) = (float)time_us1;\ + +#define GENERAL_PINGPONG_PROCESS(START_FLAG, END_FLAG, CUSTOM_PARAM, CUSTOM_PROCESS, WARMUP, LOOP)\ + CUSTOM_PARAM((WARMUP), (LOOP));\ + AscendC::PipeBarrier();\ + for (int i = 0; i < (warmup + loop_test); ++i) {\ + if (AscendC::GetBlockIdx()>=elements){\ + break;\ + }\ + int32_t num_total = tile_length;\ + int32_t offset = 0;\ + t11 = get_sys_cnt();\ + for (int i = 0; num_total > 0; i++) {\ + AscendC::TEventID EVENT_ID = pingpongId == 0 ? EVENT_ID0 : EVENT_ID1;\ + __ubuf__ datatype *buf = pingpongId == 0 ? ping_buff : pong_buff;\ + uint32_t copy_num = num_total > copy_ub_num ? copy_ub_num : num_total;\ + AscendC::SetFlag(EVENT_ID);\ + AscendC::WaitFlag(EVENT_ID);\ + CUSTOM_PROCESS;\ + AscendC::SetFlag(EVENT_ID);\ + AscendC::WaitFlag(EVENT_ID);\ + offset += copy_num;\ + num_total -= copy_num;\ + pingpongId = 1 - pingpongId;\ + }\ + t22 = get_sys_cnt();\ + if (i >= warmup) {\ + all_times += (int32_t)(t22 - t11);\ + }\ + }\ + float time_us1 = ((int32_t)(all_times))*(float)(0.02)/loop_test;\ + if (AscendC::GetBlockIdx()>=elements){\ + time_us1 = 0.0f;\ + }\ + *((__gm__ float*)time_gm + AscendC::GetBlockIdx()*16) = (float)time_us1;\ + +#define CUSTOM_PROCESS(START_FLAG, END_FLAG, CUSTOM_PARAM, CUSTOM_PROCESS, WARMUP, LOOP)\ + +#define DEFINE_SHMEM_FUNC(TEST_FUNC_NAME, SHMEM_MTE_API, START_FLAG, END_FLAG, NPU_OFFSET, PERF_TYPE, WARMUP, LOOP)\ +class TEST_FUNC_NAME##_kernel_op {\ +public:\ + __aicore__ inline TEST_FUNC_NAME##_kernel_op() {}\ + __aicore__ inline void Init(GM_ADDR gva, GM_ADDR src_gva, GM_ADDR time_gva)\ + {\ + BASIC_PARAM();\ + }\ + __aicore__ inline void Process(int elements, int ubsize)\ + {\ + GENERAL_PROCESS(START_FLAG, END_FLAG, BASIC_PROCESS_PARAM,\ + SHMEM_MTE_API(gva_gm + tile_length * AscendC::GetBlockIdx(), dev_gm + tile_length * AscendC::GetBlockIdx(), reinterpret_cast<__ubuf__ datatype*>(copy_ub),\ + copy_ub_size, tile_length, (rank_size + rank + (NPU_OFFSET))%rank_size, copy_event_id),\ + WARMUP, LOOP);\ + }\ + \ + __aicore__ inline void Process_PingPong(int elements, int ubsize)\ + {\ + GENERAL_PINGPONG_PROCESS(START_FLAG, END_FLAG, BASIC_PINGPONG_PARAM,\ + SHMEM_MTE_API(gva_gm + tile_length * AscendC::GetBlockIdx() + offset, dev_gm + tile_length * AscendC::GetBlockIdx() + offset, \ + buf, copy_ub_size, copy_num, (rank_size + rank + (NPU_OFFSET))%rank_size, EVENT_ID),\ + WARMUP, LOOP);\ + }\ + __aicore__ inline void Process_Custom(int elements, int ubsize)\ + {\ + CUSTOM_PROCESS(START_FLAG, END_FLAG, MY_CUSTOM_PARAM,\ + SHMEM_MTE_API,\ + WARMUP, LOOP);\ + }\ +private:\ + __gm__ datatype *gva_gm;\ + __gm__ datatype *dev_gm;\ + __gm__ float *time_gm;\ + __gm__ shmemi_device_host_state_t *device_state;\ + uint64_t copy_ub;\ + uint32_t copy_ub_size;\ + AscendC::TEventID copy_event_id;\ + int64_t rank;\ + int64_t rank_size;\ +};\ +extern "C" __global__ __aicore__ void TEST_FUNC_NAME##_kernel(GM_ADDR gva, GM_ADDR src_gva, GM_ADDR time_gva, int elements, int ubsize)\ +{\ + TEST_FUNC_NAME##_kernel_op op;\ + op.Init(gva, src_gva, time_gva);\ + if ((PERF_TYPE) == PERF_MTE_DEFAULT) {\ + op.Process(elements, ubsize);\ + } else if ((PERF_TYPE) == PERF_MTE_DEFAULT_PINGPONG) {\ + op.Process_PingPong(elements, ubsize);\ + } else if ((PERF_TYPE) == PERF_MTE_CUSTOM) {\ + op.Process_Custom(elements, ubsize);\ + } else {\ + return;\ + }\ +}\ +void TEST_FUNC_NAME(uint32_t block_dim, void* stream, uint8_t* gva, uint8_t* src_gva, uint8_t* time_gva, int elements, int ubsize)\ +{\ + TEST_FUNC_NAME##_kernel<<>>(gva, src_gva, time_gva, elements, ubsize);\ +}\ + +#endif \ No newline at end of file diff --git a/examples/utils/utils.py b/examples/utils/utils.py index 36914cfd86bc4cfc335f53043acef1f1596c502b..84fd35f90423d90b293cb5fa5691ad04d27236c4 100644 --- a/examples/utils/utils.py +++ b/examples/utils/utils.py @@ -1,3 +1,12 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# from enum import IntEnum import numpy as np import torch