From e08723e7081fbdff714118763ed12f9d5275ae89 Mon Sep 17 00:00:00 2001 From: kong0808 Date: Fri, 25 Jul 2025 15:36:30 +0800 Subject: [PATCH] add matmul_quant, matmul_channelsplit, matmul_unitflag examples --- examples/matrix/basic_block_matmul/README.md | 2 +- .../scripts/gen_data.py | 9 +- examples/matrix/batch_matmul/README.md | 2 +- .../scripts/gen_data.py | 6 +- examples/matrix/common_scripts/exec_utils.py | 12 + examples/matrix/common_scripts/gen_data.py | 28 +- examples/matrix/matmul/README.md | 2 +- examples/matrix/matmul_a2b2share/README.md | 4 +- examples/matrix/matmul_a2b2share/main.cpp | 3 +- .../matrix/matmul_channelsplit/CMakeLists.txt | 73 +++++ examples/matrix/matmul_channelsplit/README.md | 107 ++++++++ .../matmul_channelsplit/cmake/cpu_lib.cmake | 31 +++ .../matmul_channelsplit/cmake/npu_lib.cmake | 27 ++ examples/matrix/matmul_channelsplit/main.cpp | 229 ++++++++++++++++ .../matmul_channelsplit_custom_tiling.cpp | 50 ++++ .../matmul_channelsplit_custom_tiling.h | 37 +++ .../matmul_channelsplit_custom_kernel.cpp | 156 +++++++++++ .../matmul_channelsplit_custom_kernel.h | 78 ++++++ examples/matrix/matmul_channelsplit/run.sh | 96 +++++++ .../matmul_channelsplit/scripts/exec_test.py | 130 +++++++++ .../matmul_channelsplit/testcase/case.csv | 1 + examples/matrix/matmul_constant/README.md | 2 +- .../matmul_constant/scripts/exec_test.py | 19 +- examples/matrix/matmul_l0c_extend/README.md | 2 +- .../matmul_l0c_extend/scripts/exec_test.py | 19 +- examples/matrix/matmul_l0cache/README.md | 2 +- examples/matrix/matmul_l2cache/README.md | 4 +- examples/matrix/matmul_l2cache/main.cpp | 3 +- .../matrix/matmul_mixdualmaster/README.md | 2 +- examples/matrix/matmul_mixdualmaster/run.sh | 4 +- .../matmul_mixdualmaster/scripts/exec_test.py | 19 +- examples/matrix/matmul_mndb/README.md | 2 +- .../matrix/matmul_mndb/scripts/exec_test.py | 19 +- examples/matrix/matmul_nbuffer33/README.md | 10 +- examples/matrix/matmul_nbuffer33/main.cpp | 10 +- 
.../matmul_nbuffer33_custom_tiling.cpp | 14 +- .../op_host/matmul_nbuffer33_custom_tiling.h | 9 - examples/matrix/matmul_nbuffer33/run.sh | 2 +- .../matmul_nbuffer33/scripts/exec_test.py | 82 +----- .../matrix/matmul_nbuffer33/testcase/case.csv | 2 +- examples/matrix/matmul_nz/README.md | 2 +- .../matrix/matmul_nz/scripts/exec_test.py | 19 +- .../matrix/matmul_partial_output/README.md | 6 +- .../matmul_partial_output_custom_kernel.cpp | 1 + .../scripts/exec_test.py | 19 +- examples/matrix/matmul_preload/README.md | 2 +- examples/matrix/matmul_preload/main.cpp | 40 +-- .../op_host/matmul_preload_custom_tiling.cpp | 15 + .../op_host/matmul_preload_custom_tiling.h | 9 + .../matmul_preload/scripts/exec_test.py | 50 ++-- .../matrix/matmul_preload/testcase/case.csv | 4 +- examples/matrix/matmul_quant/CMakeLists.txt | 73 +++++ examples/matrix/matmul_quant/README.md | 120 ++++++++ .../matrix/matmul_quant/cmake/cpu_lib.cmake | 31 +++ .../matrix/matmul_quant/cmake/npu_lib.cmake | 27 ++ examples/matrix/matmul_quant/main.cpp | 258 ++++++++++++++++++ .../op_host/matmul_quant_custom_tiling.cpp | 57 ++++ .../op_host/matmul_quant_custom_tiling.h | 38 +++ .../op_kernel/matmul_quant_custom_kernel.cpp | 156 +++++++++++ .../op_kernel/matmul_quant_custom_kernel.h | 76 ++++++ examples/matrix/matmul_quant/run.sh | 96 +++++++ .../matrix/matmul_quant/scripts/exec_test.py | 124 +++++++++ .../matmul_quant/scripts/gen_quant_data.py | 97 +++++++ .../matrix/matmul_quant/testcase/case.csv | 4 + examples/matrix/matmul_sparse/README.md | 8 +- .../matrix/matmul_sparse/scripts/exec_test.py | 19 +- .../matmul_sparse/scripts/gen_sparse_data.py | 4 +- examples/matrix/matmul_splitk/README.md | 2 +- .../op_kernel/matmul_splitk_custom.cpp | 4 +- .../op_kernel/matmul_splitk_custom_impl.h | 8 +- .../matrix/matmul_splitk/scripts/exec_test.py | 19 +- .../matrix/matmul_splitk/testcase/case.csv | 3 +- examples/matrix/matmul_triangle/README.md | 4 +- examples/matrix/matmul_triangle/main.cpp | 3 +- 
examples/matrix/matmul_tscm/README.md | 6 +- .../matrix/matmul_tscm/scripts/exec_test.py | 19 +- examples/matrix/matmul_unaligned/README.md | 4 +- examples/matrix/matmul_unaligned/main.cpp | 3 +- .../matrix/matmul_unitflag/CMakeLists.txt | 74 +++++ examples/matrix/matmul_unitflag/README.md | 112 ++++++++ .../matmul_unitflag/cmake/cpu_lib.cmake | 35 +++ .../matmul_unitflag/cmake/npu_lib.cmake | 28 ++ examples/matrix/matmul_unitflag/main.cpp | 225 +++++++++++++++ .../op_host/matmul_unitflag_custom_tiling.cpp | 49 ++++ .../op_host/matmul_unitflag_custom_tiling.h | 37 +++ .../matmul_unitflag_custom_kernel.cpp | 146 ++++++++++ .../op_kernel/matmul_unitflag_custom_kernel.h | 85 ++++++ examples/matrix/matmul_unitflag/run.sh | 105 +++++++ .../matmul_unitflag/scripts/exec_test.py | 124 +++++++++ .../matrix/matmul_unitflag/testcase/case.csv | 1 + examples/readme.md | 43 +-- 91 files changed, 3504 insertions(+), 299 deletions(-) create mode 100644 examples/matrix/matmul_channelsplit/CMakeLists.txt create mode 100644 examples/matrix/matmul_channelsplit/README.md create mode 100644 examples/matrix/matmul_channelsplit/cmake/cpu_lib.cmake create mode 100644 examples/matrix/matmul_channelsplit/cmake/npu_lib.cmake create mode 100644 examples/matrix/matmul_channelsplit/main.cpp create mode 100644 examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.cpp create mode 100644 examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.h create mode 100644 examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.cpp create mode 100644 examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.h create mode 100644 examples/matrix/matmul_channelsplit/run.sh create mode 100644 examples/matrix/matmul_channelsplit/scripts/exec_test.py create mode 100644 examples/matrix/matmul_channelsplit/testcase/case.csv create mode 100644 examples/matrix/matmul_quant/CMakeLists.txt create mode 100644 
examples/matrix/matmul_quant/README.md create mode 100644 examples/matrix/matmul_quant/cmake/cpu_lib.cmake create mode 100644 examples/matrix/matmul_quant/cmake/npu_lib.cmake create mode 100644 examples/matrix/matmul_quant/main.cpp create mode 100644 examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.cpp create mode 100644 examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.h create mode 100644 examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.cpp create mode 100644 examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.h create mode 100644 examples/matrix/matmul_quant/run.sh create mode 100644 examples/matrix/matmul_quant/scripts/exec_test.py create mode 100644 examples/matrix/matmul_quant/scripts/gen_quant_data.py create mode 100644 examples/matrix/matmul_quant/testcase/case.csv create mode 100644 examples/matrix/matmul_unitflag/CMakeLists.txt create mode 100644 examples/matrix/matmul_unitflag/README.md create mode 100644 examples/matrix/matmul_unitflag/cmake/cpu_lib.cmake create mode 100644 examples/matrix/matmul_unitflag/cmake/npu_lib.cmake create mode 100644 examples/matrix/matmul_unitflag/main.cpp create mode 100644 examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.cpp create mode 100644 examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.h create mode 100644 examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.cpp create mode 100644 examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.h create mode 100644 examples/matrix/matmul_unitflag/run.sh create mode 100644 examples/matrix/matmul_unitflag/scripts/exec_test.py create mode 100644 examples/matrix/matmul_unitflag/testcase/case.csv diff --git a/examples/matrix/basic_block_matmul/README.md b/examples/matrix/basic_block_matmul/README.md index a6f39f4f..b661cb33 100644 --- a/examples/matrix/basic_block_matmul/README.md +++ b/examples/matrix/basic_block_matmul/README.md @@ -67,7 +67,7 
@@ Framework调用样例中实现的是固定shape为[M, N, K] = [4096, 5120, 4096] - 调用End接口,结束矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/scripts/gen_data.py b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/scripts/gen_data.py index 378bf267..fc65f297 100644 --- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/scripts/gen_data.py +++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/scripts/gen_data.py @@ -10,8 +10,9 @@ # See LICENSE in the root of the software repository for the full text of the License. # ====================================================================================================================== -import numpy as np import os +import subprocess +import numpy as np def gen_golden_data(): @@ -26,8 +27,10 @@ def gen_golden_data(): x2_gm = np.random.randint(1, 10, [k, n]).astype(x2_gm_type) golden = np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32)).astype(np.float32) x1_gm = x1_gm.transpose() # A is transposed - os.system("mkdir -p input") - os.system("mkdir -p output") + if not os.path.exists("input"): + os.makedirs("input") + if not os.path.exists("output"): + os.makedirs("output") x1_gm.tofile("./input/x1_gm.bin") x2_gm.tofile("./input/x2_gm.bin") golden.tofile("./output/golden.bin") diff --git a/examples/matrix/batch_matmul/README.md b/examples/matrix/batch_matmul/README.md index 02dd08e1..c5f5afb1 100644 --- a/examples/matrix/batch_matmul/README.md +++ b/examples/matrix/batch_matmul/README.md @@ -64,7 +64,7 @@ Framework调用样例中实现的是固定shape为[M, N, K] = [192, 1536, 64], b - 结束矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul 
kernel计算时所需的 Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的 Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 根据输入输出LayOut设置单核计算的A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/scripts/gen_data.py b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/scripts/gen_data.py index e20796c2..2a5fbae6 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/scripts/gen_data.py +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/scripts/gen_data.py @@ -50,8 +50,10 @@ def gen_golden_data(): b_broadcast = np.broadcast_to(b_t, b_broadcast_shape) golden = np.matmul(a_broadcast, b_broadcast).astype(np.float32) golden = np.transpose(golden, axes=(0, 3, 1, 2, 4)) - os.system("mkdir -p input") - os.system("mkdir -p output") + if not os.path.exists("input"): + os.makedirs("input") + if not os.path.exists("output"): + os.makedirs("output") x1_gm.tofile("./input/x1_gm.bin") x2_gm.tofile("./input/x2_gm.bin") golden.tofile("./output/golden.bin") diff --git a/examples/matrix/common_scripts/exec_utils.py b/examples/matrix/common_scripts/exec_utils.py index 2eaedf07..9eaf3a6d 100644 --- a/examples/matrix/common_scripts/exec_utils.py +++ b/examples/matrix/common_scripts/exec_utils.py @@ -13,6 +13,8 @@ import os import sys import csv import logging +import subprocess +import shlex logging.basicConfig(level=logging.INFO) @@ -89,3 +91,13 @@ def clear_file_cache(file_work_dir): os.system("rm -rf " + rm_files) rm_files = file_work_dir + "/bin/sim_out/*" os.system("rm -rf " + rm_files) + + +def get_process_case_cmd(kernel_name, params_str, is_perf, run_mode): + if is_perf: + cmd = f"msprof op --application=\"./{kernel_name} {params_str}\" --output=./prof_out" + elif run_mode == "sim": + cmd = f"msprof op simulator --application=\"./{kernel_name} {params_str}\" --output=./sim_out" + 
else: + cmd = f"./{kernel_name} {params_str}" + return shlex.split(cmd) \ No newline at end of file diff --git a/examples/matrix/common_scripts/gen_data.py b/examples/matrix/common_scripts/gen_data.py index 1c757a27..445c666a 100644 --- a/examples/matrix/common_scripts/gen_data.py +++ b/examples/matrix/common_scripts/gen_data.py @@ -69,12 +69,26 @@ class MatmulGenData: y_gm_fp32 = MatmulGenData.due_overflow(res_tf) return y_gm_fp32 - def gen_golden_data_fp16(self, work_dir, dst_type=np.float32): - src_type = np.float16 + + def gen_c_data_nz_format(self, y_gm_fp32, dst_type, c0size): + nz_fractal_m = 16 + nz_fractal_n = 16 if self.is_channel_split: + nz_fractal_n = 8 c0size = 8 - else: - c0size = 16 + align_m = int(int((self.m + nz_fractal_m - 1) / nz_fractal_m) * nz_fractal_m) + align_n = int(int((self.n + nz_fractal_n - 1) / nz_fractal_n) * nz_fractal_n) + y_gm_pad = np.zeros([align_m, align_n]) + y_gm_pad[0:self.m, 0:self.n] = y_gm_fp32 + y_gm = y_gm_pad.astype(dst_type) + y_shape = [self.b, align_m, align_n] + y_gm = MatmulGenData.nd_to_nz(y_gm, y_shape, dst_type, c0size) + return y_gm + + + def gen_golden_data_fp16(self, work_dir, dst_type=np.float32): + src_type = np.float16 + c0size = 16 if self.is_trans_a: x1_shape = [self.b, self.k, self.m] else: @@ -83,7 +97,6 @@ class MatmulGenData: x2_shape = [self.b, self.n, self.k] else: x2_shape = [self.b, self.k, self.n] - y_shape = [self.b, self.m, self.n] x1_gm = np.random.uniform(-1, 1, x1_shape).astype(src_type) x1_gm_fp32 = x1_gm.astype(np.float32) x2_gm = np.random.uniform(-1, 1, x2_shape).astype(src_type) @@ -96,14 +109,15 @@ class MatmulGenData: y_gm_fp32 = self.tf_matmul(x1_gm_fp32, x2_gm_fp32, bias_gm_fp32) else: y_gm_fp32 = self.tf_matmul(x1_gm_fp32, x2_gm_fp32) - y_gm = y_gm_fp32.astype(dst_type) if self.a_format == "NZ": x1_gm = MatmulGenData.nd_to_nz(x1_gm, x1_shape, src_type, c0size) if self.b_format == "NZ": x2_gm = MatmulGenData.nd_to_nz(x2_gm, x2_shape, src_type, c0size) if self.c_format == "NZ": - 
y_gm = MatmulGenData.nd_to_nz(y_gm, y_shape, dst_type, c0size) + y_gm = self.gen_c_data_nz_format(y_gm_fp32, dst_type, c0size) + else: + y_gm = y_gm_fp32.astype(dst_type) x1_gm.tofile(work_dir + "/input/x1_gm.bin") x2_gm.tofile(work_dir + "/input/x2_gm.bin") diff --git a/examples/matrix/matmul/README.md b/examples/matrix/matmul/README.md index 3b5af64a..ed090c23 100644 --- a/examples/matrix/matmul/README.md +++ b/examples/matrix/matmul/README.md @@ -65,7 +65,7 @@ matmul算子规格 - 结束矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_a2b2share/README.md b/examples/matrix/matmul_a2b2share/README.md index 34afa5b2..765ad29e 100644 --- a/examples/matrix/matmul_a2b2share/README.md +++ b/examples/matrix/matmul_a2b2share/README.md @@ -15,7 +15,7 @@ | [cmake](cmake) | 编译工程文件 | | [op_host](op_host) | 本样例tiling代码实现 | | [op_kernel](op_kernel) | 本样例kernel代码实现 | -| [scripts](scripts) | 执行文件 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | | [testcase](testcase) | 用例文件,配置用例的计算shape信息 | | [CMakeLists.txt](CMakeLists.txt) | 编译工程文件 | | [main.cpp](main.cpp) | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | @@ -92,7 +92,7 @@ - 结束两次矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_a2b2share/main.cpp b/examples/matrix/matmul_a2b2share/main.cpp index 5e209873..1db885d5 100644 --- a/examples/matrix/matmul_a2b2share/main.cpp +++ b/examples/matrix/matmul_a2b2share/main.cpp @@ -224,7 +224,8 @@ 
int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3]}; + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG MatmulHost::TestMatmulCpu(caseParams); diff --git a/examples/matrix/matmul_channelsplit/CMakeLists.txt b/examples/matrix/matmul_channelsplit/CMakeLists.txt new file mode 100644 index 00000000..6f448fe5 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/CMakeLists.txt @@ -0,0 +1,73 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if (${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_channelsplit_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_channelsplit_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_channelsplit_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_channelsplit_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_channelsplit_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_matmul_channelsplit_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_channelsplit_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/matmul_channelsplit/README.md 
b/examples/matrix/matmul_channelsplit/README.md new file mode 100644 index 00000000..79369120 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/README.md @@ -0,0 +1,107 @@ + +## 概述 + +本样例介绍了调用Matmul API实现矩阵乘输出Channel拆分功能的单算子。当Matmul计算结果C矩阵的格式为NZ时,C矩阵采用分形存储,若C矩阵数据类型为float,默认情况下分形大小为16 * 16。Channel拆分功能可以将该场景下C矩阵的每个16 * 16分形切分为16 * 8的分形,使C矩阵按照16 * 8的分形进行存储。关于ChannelSplit的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 矩阵乘输出的Channel拆分”章节。 + +本样例以直调的方式调用算子核函数。 +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 执行脚本文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + + +## 算子描述 +- 算子功能 + MatmulChannelSplitCustom算子调用Matmul API计算时,通过配置MatmulConfig中的isEnableChannelSplit参数为true,使能矩阵乘输出的Channel拆分功能,对输入的A、B矩阵做矩阵乘和加bias偏置。 + +- 算子规格 + + + + + + + + + + + + +
算子类型(OpType)MatmulChannelSplitCustom
算子输入nameshapedata typeformatisTrans
a-float16NDfalse
b-float16NDfalse
bias-floatND-
算子输出c-floatNZ-
核函数名matmul_channelsplit_custom
+ + +## 算子实现介绍 +- 约束条件 + 开启ChannelSplit功能需满足: + - C矩阵的数据排布格式为CubeFormat::NZ。 + - C矩阵的数据类型为float。 + - C矩阵的内存逻辑位置为Gloabl Memory。 + + +- 算子Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + 创建Matmul对象时,自定义MatmulConfig参数,将其中的isEnableChannelSplit参数设置为true,使能矩阵乘输出的Channel拆分功能,获得自定义的使用Norm模板的Matmul对象。 + ``` + constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; + constexpr static MatmulFuncParams funcParamsChannelSplit{ + false, false, false, false, 0, IterateOrder::ORDER_M, ScheduleType::INNER_PRODUCT, + true, false, false, false, true/*isEnableChannelSplit*/ + }; + constexpr static MatmulConfig CFG_NORM = GetMMConfig(funcParamsChannelSplit); + ``` + - 初始化操作。 + - 完成矩阵乘操作。 + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。 + - 只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No 
newline at end of file diff --git a/examples/matrix/matmul_channelsplit/cmake/cpu_lib.cmake b/examples/matrix/matmul_channelsplit/cmake/cpu_lib.cmake new file mode 100644 index 00000000..487a91e0 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_channelsplit/cmake/npu_lib.cmake b/examples/matrix/matmul_channelsplit/cmake/npu_lib.cmake new file mode 100644 index 00000000..bc803099 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_channelsplit/main.cpp b/examples/matrix/matmul_channelsplit/main.cpp new file mode 100644 index 00000000..2a3ec442 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/main.cpp @@ -0,0 +1,229 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/matmul_channelsplit_custom_tiling.h" + +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void matmul_channelsplit_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#else +#include "tikicpulib.h" +extern "C" void matmul_channelsplit_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#endif + +namespace { +constexpr bool IS_BIAS = true; +constexpr bool IS_A_TRANS = false; +constexpr bool IS_B_TRANS = false; +constexpr int32_t BLOCK_SIZE = 16; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias = nullptr; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + 
uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + size_t tilingFileSize = sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + ICPU_RUN_KF(matmul_channelsplit_custom, tilingData.usedCoreNum, x1, x2, bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, + void* stream = nullptr) +{ + // Init args + uint8_t* workspaceDevice = nullptr; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void**)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost = nullptr; + uint8_t* tilingDevice = nullptr; + size_t tilingFileSize = sizeof(TCubeTiling); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + 
static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + matmul_channelsplit_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t* x1Host = nullptr; + uint8_t* x1Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); 
+ CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host = nullptr; + uint8_t* x2Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char* args[]) +{ + int64_t inputParams[3] = {1, 1, 1}; + for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 used for inputParams loop + std::stringstream ss(args[i]); + ss >> inputParams[i - 1]; + } + auto ascendcPlatform = 
platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + int64_t M = inputParams[0]; + int64_t N = inputParams[1]; + int64_t K = inputParams[2]; + MatrixFileSize matrixFileSize; + // uint16_t represent half + matrixFileSize.x1FileSize = static_cast(M * K) * sizeof(uint16_t); + matrixFileSize.x2FileSize = static_cast(K * N) * sizeof(uint16_t); + // float NZ cMatrix output, 16*8 fractal + int32_t alignM = (static_cast(M) + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE; + int32_t alignN = (static_cast(N) + BLOCK_SIZE / 2 - 1) / (BLOCK_SIZE / 2) * (BLOCK_SIZE / 2); + matrixFileSize.yFileSize = static_cast(alignM * alignN) * sizeof(float); + matrixFileSize.biasFileSize = static_cast(1 * N) * sizeof(float); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(M, N, K, matrixFileSize); +#else + MatmulHost::TestMatmul(M, N, K, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.cpp b/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.cpp new file mode 100644 index 00000000..cfa68371 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.cpp @@ -0,0 +1,50 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "matmul_channelsplit_custom_tiling.h" +#include + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t M = testCaseParams.m; + uint32_t N = testCaseParams.n; + uint32_t K = testCaseParams.k; + uint32_t blockDim = testCaseParams.usedCoreNum; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + // CTYPE is NZ format + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::NZ, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + cubeTiling.SetBufferSpace(-1, -1, -1); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed." << std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.h b/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.h new file mode 100644 index 00000000..38bf87ab --- /dev/null +++ b/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.h @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_HOST_MATMUL_CHANNELSPLIT_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_HOST_MATMUL_CHANNELSPLIT_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. + */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_HOST_MATMUL_CHANNELSPLIT_CUSTOM_TILING_H diff --git a/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.cpp b/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.cpp new file mode 100644 index 00000000..12a7866b --- /dev/null +++ b/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.cpp @@ -0,0 +1,156 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_channelsplit_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. + * @retval None + */ +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +template +__aicore__ inline T Ceil(T num1, T num2) +{ + if (num2 == 0) { + return num1; + } + return (num1 + num2 - 1) / num2; +} + +constexpr int32_t BLOCK_SIZE = 16; +} + +namespace MatmulChannelSplitCustom { +template +__aicore__ inline void MatmulKernel::Init( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + // float NZ cMatrix output, 16*8 fractal + uint32_t alignM = Ceil(tiling.M, BLOCK_SIZE) * BLOCK_SIZE; + uint32_t alignN = Ceil(tiling.N, BLOCK_SIZE / 2) * (BLOCK_SIZE / 2); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), alignM * alignN); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + this->isTransA = isTransA; + this->isTransB = isTransB; + 
CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void MatmulKernel::Process() +{ + if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) { + return; + } + + // process with tail block + int tailM = tiling.M - mCoreIndex * tiling.singleCoreM; + tailM = tailM < tiling.singleCoreM ? tailM : tiling.singleCoreM; + int tailN = tiling.N - nCoreIndex * tiling.singleCoreN; + tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; + if (tailM < tiling.singleCoreM || tailN < tiling.singleCoreN) { + matmulObj.SetTail(tailM, tailN); + } + + matmulObj.SetTensorA(aGlobal, isTransA); + matmulObj.SetTensorB(bGlobal, isTransB); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +template +__aicore__ inline void MatmulKernel::CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) +{ + const TCubeTiling& tiling = this->tiling; + auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores + mCoreIndex = blockIdx % mSingleBlocks; + nCoreIndex = blockIdx / mSingleBlocks; + + if (isTransA) { + offsetA = mCoreIndex * tiling.singleCoreM; + } else { + offsetA = mCoreIndex * tiling.Ka * tiling.singleCoreM; + } + if (isTransB) { + offsetB = nCoreIndex * tiling.Kb * tiling.singleCoreN; + } else { + offsetB = nCoreIndex * tiling.singleCoreN; + } + // output C matrix is NZ format + offsetC = tiling.M * nCoreIndex * tiling.singleCoreN + mCoreIndex * tiling.singleCoreM * BLOCK_SIZE / 2; + offsetBias = nCoreIndex * tiling.singleCoreN; +} +} // namespace MatmulChannelSplitCustom + +/** + * @brief matmul kernel function. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. 
+ * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tilingGm: Tiling data addr. + * @retval None + */ +extern "C" __global__ __aicore__ void matmul_channelsplit_custom(GM_ADDR a, GM_ADDR b, + GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + // prepare tiling + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + // define matmul kernel + MatmulChannelSplitCustom::MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel, isTransA=false, isTransB=false + matmulKernel.Init(a, b, bias, c, tiling, false, false); + // matmul kernel process + matmulKernel.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_channelsplit_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_channelsplit_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif diff --git a/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.h b/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.h new file mode 100644 index 00000000..e88082a2 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.h @@ -0,0 +1,78 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_KERNEL_MATMUL_CHANNELSPLIT_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_KERNEL_MATMUL_CHANNELSPLIT_CUSTOM_KERNEL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace MatmulChannelSplitCustom { +constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; +// Set isEnableChannelSplit=true +constexpr static MatmulFuncParams funcParamsChannelSplit{ + false, false, false, false, 0, IterateOrder::ORDER_M, ScheduleType::INNER_PRODUCT, + true, false, false, false, true/*isEnableChannelSplit*/ +}; +constexpr static MatmulConfig CFG_NORM = GetMMConfig(funcParamsChannelSplit); +template +class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @param isTransA: Whether A matrix is transposed. + * @param isTransB: Whether B matrix is transposed. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, + bool isTransA, bool isTransB); + /** + * @brief Process matrix calculation. + * @retval None + */ + __aicore__ inline void Process(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
+ * @retval None + */ + __aicore__ inline void CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; + int32_t mCoreIndex; + int32_t nCoreIndex; + bool isTransA{false}; + bool isTransB{false}; +}; +} // namespace MatmulChannelSplitCustom + +#endif // EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_KERNEL_MATMUL_CHANNELSPLIT_CUSTOM_KERNEL_H diff --git a/examples/matrix/matmul_channelsplit/run.sh b/examples/matrix/matmul_channelsplit/run.sh new file mode 100644 index 00000000..1153cb01 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/run.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! 
"${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_matmul_channelsplit_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/matmul_channelsplit/scripts/exec_test.py b/examples/matrix/matmul_channelsplit/scripts/exec_test.py new file mode 100644 index 00000000..1ab5d18d --- /dev/null +++ b/examples/matrix/matmul_channelsplit/scripts/exec_test.py @@ -0,0 +1,130 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. 
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess +import shlex + +import numpy as np + +sys.path.append("../..") +from common_scripts.gen_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache + +IS_BIAS = True +IS_TRANS_A = False +IS_TRANS_B = False +IS_CHANNEL_SPLIT = True +# float16 in float32 out +DATA_TYPE_STR = "float16_float32" +A_FORMAT = "ND" +B_FORMAT = "ND" +C_FORMAT = "NZ" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + clear_file_cache(file_work_dir) + + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR,\ + A_FORMAT, B_FORMAT, C_FORMAT, IS_CHANNEL_SPLIT) + 
params_str = f"{m} {n} {k}" + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + cmd = f"msprof op --application=\"./ascendc_matmul_channelsplit_bbit {params_str}\" --output=./prof_out" + elif run_mode == "sim": + matmul_gen_data.gen_golden_data(file_work_dir) + cmd = f"msprof op simulator --application=\"./ascendc_matmul_channelsplit_bbit {params_str}\"\ + --output=./sim_out" + else: + matmul_gen_data.gen_golden_data(file_work_dir) + cmd = f"./ascendc_matmul_channelsplit_bbit {params_str}" + subprocess.run(shlex.split(cmd)) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(b * m * n) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (b * m * n) > 0.001: + res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" 
% (file_work_dir)) + return -1 + + is_perf = False + if sys.argv[2] == "perf": + is_perf = True + + case_list = get_case_list() + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/matmul_channelsplit/testcase/case.csv b/examples/matrix/matmul_channelsplit/testcase/case.csv new file mode 100644 index 00000000..8fb2bb95 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 64, 7680, 64 \ No newline at end of file diff --git a/examples/matrix/matmul_constant/README.md b/examples/matrix/matmul_constant/README.md index 9ff8bc85..30106c66 100644 --- a/examples/matrix/matmul_constant/README.md +++ b/examples/matrix/matmul_constant/README.md @@ -119,4 +119,4 @@ MatmulConstantCustom单算子,对输入的A、B矩阵做矩阵乘和加bias偏 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` bash run.sh -r cpu -v Ascendxxxyy -p 0 - ``` + ``` \ No newline at end of file diff --git a/examples/matrix/matmul_constant/scripts/exec_test.py b/examples/matrix/matmul_constant/scripts/exec_test.py index 28299603..aacdcd2d 100644 --- a/examples/matrix/matmul_constant/scripts/exec_test.py +++ b/examples/matrix/matmul_constant/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np 
sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_l0c_extend/README.md b/examples/matrix/matmul_l0c_extend/README.md index e8aadba3..fd221900 100644 --- a/examples/matrix/matmul_l0c_extend/README.md +++ b/examples/matrix/matmul_l0c_extend/README.md @@ -85,7 +85,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff 
--git a/examples/matrix/matmul_l0c_extend/scripts/exec_test.py b/examples/matrix/matmul_l0c_extend/scripts/exec_test.py index 8087a5e0..a684ea7f 100644 --- a/examples/matrix/matmul_l0c_extend/scripts/exec_test.py +++ b/examples/matrix/matmul_l0c_extend/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = True IS_TRANS_A = True @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_l0c_extend_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_l0c_extend_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_l0c_extend_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_l0c_extend_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_l0cache/README.md b/examples/matrix/matmul_l0cache/README.md index aa224a3b..cd51ddae 100644 --- 
a/examples/matrix/matmul_l0cache/README.md +++ b/examples/matrix/matmul_l0cache/README.md @@ -85,7 +85,7 @@ L0缓存没有对外开关,由Matmul API内部根据用户配置的shape信息 ``` - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_l2cache/README.md b/examples/matrix/matmul_l2cache/README.md index 03d5b2ff..6ccb24ab 100644 --- a/examples/matrix/matmul_l2cache/README.md +++ b/examples/matrix/matmul_l2cache/README.md @@ -15,7 +15,7 @@ | [cmake](cmake) | 编译工程文件 | | [op_host](op_host) | 本样例tiling代码实现 | | [op_kernel](op_kernel) | 本样例kernel代码实现 | -| [scripts](scripts) | 执行文件 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | | [testcase](testcase) | 用例文件,配置用例的计算shape信息 | | [CMakeLists.txt](CMakeLists.txt) | 编译工程文件 | | [main.cpp](main.cpp) | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | @@ -75,7 +75,7 @@ - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息。根据数据的切分方式,设置数据切分后每次计算的Shape和数据切分前的完整OrgShape。 diff --git a/examples/matrix/matmul_l2cache/main.cpp b/examples/matrix/matmul_l2cache/main.cpp index 6e54a5e4..608da572 100644 --- a/examples/matrix/matmul_l2cache/main.cpp +++ b/examples/matrix/matmul_l2cache/main.cpp @@ -189,7 +189,8 @@ int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3], problem[4]}; + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias, problem[4]}; auto ascendcPlatform = 
platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG MatmulHost::TestMatmulCpu(caseParams); diff --git a/examples/matrix/matmul_mixdualmaster/README.md b/examples/matrix/matmul_mixdualmaster/README.md index 80c93599..4465a7c5 100644 --- a/examples/matrix/matmul_mixdualmaster/README.md +++ b/examples/matrix/matmul_mixdualmaster/README.md @@ -64,7 +64,7 @@ - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_mixdualmaster/run.sh b/examples/matrix/matmul_mixdualmaster/run.sh index 3933ebf5..78e91376 100644 --- a/examples/matrix/matmul_mixdualmaster/run.sh +++ b/examples/matrix/matmul_mixdualmaster/run.sh @@ -61,8 +61,6 @@ rm -rf build mkdir build cd build -export TF_CPP_MIN_LOG_LEVEL=3 - source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail @@ -82,6 +80,8 @@ mkdir -p bin cd bin cp ../build/ascendc_matmul_mixdualmaster_bbit ./ +export TF_CPP_MIN_LOG_LEVEL=3 + if [ "${RUN_MODE}" = "npu" ]; then if [ "${IS_PERF}" = "1" ]; then export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} diff --git a/examples/matrix/matmul_mixdualmaster/scripts/exec_test.py b/examples/matrix/matmul_mixdualmaster/scripts/exec_test.py index 532b9f3f..befb8094 100644 --- a/examples/matrix/matmul_mixdualmaster/scripts/exec_test.py +++ b/examples/matrix/matmul_mixdualmaster/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import 
get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_mixdualmaster_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_mixdualmaster_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_mixdualmaster_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_mixdualmaster_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_mndb/README.md b/examples/matrix/matmul_mndb/README.md index 236a69b5..a52a0d3a 100644 --- a/examples/matrix/matmul_mndb/README.md +++ b/examples/matrix/matmul_mndb/README.md @@ -66,7 +66,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul Kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul Kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_mndb/scripts/exec_test.py b/examples/matrix/matmul_mndb/scripts/exec_test.py index 8429d6e1..cc700ad5 
100644 --- a/examples/matrix/matmul_mndb/scripts/exec_test.py +++ b/examples/matrix/matmul_mndb/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_mndb_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_mndb_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_mndb_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_mndb_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_nbuffer33/README.md b/examples/matrix/matmul_nbuffer33/README.md index 4b6ced60..4d60f7ba 100644 --- a/examples/matrix/matmul_nbuffer33/README.md +++ b/examples/matrix/matmul_nbuffer33/README.md @@ -69,10 +69,16 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul 
kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等,开启NBuffer33模式。 + ``` + matmul_tiling::MatmulConfigParams matmulConfigParams(1, false, + matmul_tiling::ScheduleType::N_BUFFER_33, /* NBuffer33模式 */ + matmul_tiling::MatrixTraverse::NOSET, false); + cubeTiling.SetMatmulConfigParams(matmulConfigParams); + ``` - 调用GetTiling接口,获取Tiling信息。 ## 编译运行样例 diff --git a/examples/matrix/matmul_nbuffer33/main.cpp b/examples/matrix/matmul_nbuffer33/main.cpp index 21bd7b4f..6208faf0 100644 --- a/examples/matrix/matmul_nbuffer33/main.cpp +++ b/examples/matrix/matmul_nbuffer33/main.cpp @@ -190,8 +190,9 @@ void TestMatmul(const MatmulHost::MatmulCaseParams &testCaseParams) int32_t main(int32_t argc, const char *args[]) { - int64_t problem[12] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - for (int32_t i = 1; i < argc && i < 13; ++i) { // 13 + int64_t problem[3] = {1, 1, 1}; + int32_t size = 4; + for (int32_t i = 1; i < argc && i < size; ++i) { std::stringstream ss(args[i]); ss >> problem[i - 1]; } @@ -199,10 +200,7 @@ int32_t main(int32_t argc, const char *args[]) auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), static_cast(problem[0]), static_cast(problem[1]), static_cast(problem[2]), - IS_BIAS, IS_A_TRANS, IS_B_TRANS, - static_cast(problem[3]), static_cast(problem[4]), static_cast(problem[5]), - static_cast(problem[6]), static_cast(problem[7]), static_cast(problem[8]), - static_cast(problem[9]), static_cast(problem[10]), static_cast(problem[11])}; + IS_BIAS, IS_A_TRANS, IS_B_TRANS}; #ifdef ASCENDC_CPU_DEBUG MatmulHost::TestMatmulCpu(testCaseParams); #else diff --git 
a/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.cpp b/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.cpp index c71416ea..4c3bbf51 100644 --- a/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.cpp +++ b/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.cpp @@ -21,7 +21,7 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) uint32_t M = testCaseParams.m; uint32_t N = testCaseParams.n; uint32_t K = testCaseParams.k; - uint32_t blockDim = 1; + uint32_t blockDim = testCaseParams.usedCoreNum; bool isBias = testCaseParams.isBias; bool isAtrans = testCaseParams.isATrans; bool isBtrans = testCaseParams.isBTrans; @@ -39,19 +39,13 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) cubeTiling.SetShape(M, N, K); cubeTiling.EnableBias(isBias); cubeTiling.SetBufferSpace(-1, -1, -1); + matmul_tiling::MatmulConfigParams matmulConfigParams(1, false, matmul_tiling::ScheduleType::N_BUFFER_33, + matmul_tiling::MatrixTraverse::NOSET, false); + cubeTiling.SetMatmulConfigParams(matmulConfigParams); if (cubeTiling.GetTiling(tilingData) == -1) { std::cout << "Generate tiling failed." 
<< std::endl; return {}; } - tilingData.baseM = testCaseParams.baseM; - tilingData.baseN = testCaseParams.baseN; - tilingData.baseK = testCaseParams.baseK; - tilingData.stepM = testCaseParams.stepM; - tilingData.stepN = testCaseParams.stepN; - tilingData.stepKa = testCaseParams.stepKa; - tilingData.stepKb = testCaseParams.stepKb; - tilingData.depthA1 = testCaseParams.depthA1; - tilingData.depthB1 = testCaseParams.depthB1; return tilingData; } diff --git a/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.h b/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.h index 2a2c1c7e..449fbf1e 100644 --- a/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.h +++ b/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.h @@ -24,15 +24,6 @@ struct MatmulCaseParams int32_t isBias; bool isATrans; bool isBTrans; - int32_t stepM; - int32_t stepN; - int32_t stepKa; - int32_t stepKb; - int32_t depthA1; - int32_t depthB1; - int32_t baseM; - int32_t baseN; - int32_t baseK; }; /** diff --git a/examples/matrix/matmul_nbuffer33/run.sh b/examples/matrix/matmul_nbuffer33/run.sh index f80b3277..5e271e9d 100644 --- a/examples/matrix/matmul_nbuffer33/run.sh +++ b/examples/matrix/matmul_nbuffer33/run.sh @@ -80,7 +80,7 @@ mkdir -p bin cd bin cp ../build/ascendc_matmul_nbuffer33_bbit ./ -export TF_CPP_MIN_LOG_LEVEL=3s +export TF_CPP_MIN_LOG_LEVEL=3 if [ "${RUN_MODE}" = "npu" ]; then if [ "${IS_PERF}" = "1" ]; then diff --git a/examples/matrix/matmul_nbuffer33/scripts/exec_test.py b/examples/matrix/matmul_nbuffer33/scripts/exec_test.py index dfba0c12..ce5cc63d 100644 --- a/examples/matrix/matmul_nbuffer33/scripts/exec_test.py +++ b/examples/matrix/matmul_nbuffer33/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data 
-from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -31,8 +33,7 @@ logging.basicConfig(level=logging.INFO) class ProcessParams: - def __init__(self, case_name, m, n, k, b, stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek,\ - is_perf, run_mode): + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): self.case_name = case_name self.m = m self.n = n @@ -40,42 +41,24 @@ class ProcessParams: self.b = b self.is_perf = is_perf self.run_mode = run_mode - self.stepm = stepm - self.stepn = stepn - self.stepka = stepka - self.stepkb = stepkb - self.deptha1 = deptha1 - self.depthb1 = depthb1 - self.basem = basem - self.basen = basen - self.basek = basek def process_case(file_work_dir, process_params): case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ process_params.k, process_params.b, process_params.is_perf, process_params.run_mode - stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek = process_params.stepm, process_params.stepn,\ - process_params.stepka, process_params.stepkb, process_params.deptha1, process_params.depthb1,\ - process_params.basem, process_params.basen, process_params.basek logging.info("[INFO] start process case [%s]" % (case_name)) logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_nbuffer33_bbit %s %s %s %s %s %s %s %s %s %s %s %s %s\"\ - --output=\"./prof_out\"" % (m, n, k, stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek,\ - 
b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_nbuffer33_bbit %s %s %s %s %s %s %s %s %s %s %s\ - %s %s\" --output=\"./sim_out\"" % (m, n, k, stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen,\ - basek, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_nbuffer33_bbit %s %s %s %s %s %s %s %s %s %s %s %s %s" % (m, n, k,\ - stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_nbuffer33_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: @@ -97,45 +80,6 @@ def process_case(file_work_dir, process_params): return res_data -def get_case_list(): - current_path = os.getcwd() - case_dir = os.path.join(os.path.dirname(current_path), "testcase") - if not os.path.exists(case_dir): - logging.info("[ERROR] file path %s not exist!" 
% (case_dir)) - return None - - case_list = [] - for file_name in os.listdir(case_dir): - if not file_name.endswith(".csv"): - continue - - abs_file_name = os.path.join(case_dir, file_name) - with open(abs_file_name, mode='r', encoding='utf-8') as file: - csv_reader = csv.reader(file) - for row in csv_reader: - item_list = [] - item_list.append(int(row[0].lstrip("\ufeff"))) - item_list.append(row[1]) - item_list.append(int(row[2])) - item_list.append(int(row[3])) - item_list.append(int(row[4])) - item_list.append(int(row[5])) - item_list.append(int(row[6])) - item_list.append(int(row[7])) - item_list.append(int(row[8])) - item_list.append(int(row[9])) - item_list.append(int(row[10])) - item_list.append(int(row[11])) - item_list.append(int(row[12])) - item_list.append(int(row[13])) - if len(row) > 14: - item_list.append(int(row[14])) - else: - item_list.append(1) - case_list.append(item_list) - return case_list - - def main(): args_len = len(sys.argv) - 1 if args_len != 2: @@ -154,11 +98,9 @@ def main(): case_list = get_case_list() res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] run_mode = sys.argv[1] - for is_process, case_name, m, n, k, stepm, stepn, stepka, stepkb, deptha1, depthb1,\ - basem, basen, basek, b in case_list: + for is_process, case_name, m, n, k, b in case_list: if is_process == 1: - process_params = ProcessParams(case_name, m, n, k, b, stepm, stepn, stepka, stepkb,\ - deptha1, depthb1, basem, basen, basek, is_perf, run_mode) + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) res_data = process_case(file_work_dir, process_params) res_list.append(res_data) diff --git a/examples/matrix/matmul_nbuffer33/testcase/case.csv b/examples/matrix/matmul_nbuffer33/testcase/case.csv index 0bc22a85..0a942cd1 100644 --- a/examples/matrix/matmul_nbuffer33/testcase/case.csv +++ b/examples/matrix/matmul_nbuffer33/testcase/case.csv @@ -1 +1 @@ -1, case001, 384, 1024, 384, 3, 1, 3, 3, 9, 6, 128, 128, 128 \ No 
newline at end of file +1, case001, 256, 512, 192 \ No newline at end of file diff --git a/examples/matrix/matmul_nz/README.md b/examples/matrix/matmul_nz/README.md index a17c4ef6..2ee8e62e 100644 --- a/examples/matrix/matmul_nz/README.md +++ b/examples/matrix/matmul_nz/README.md @@ -92,7 +92,7 @@ MatmulNzCustom算子首先使用DataCopyPad接口将非对齐的矩阵数据搬 ``` - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_nz/scripts/exec_test.py b/examples/matrix/matmul_nz/scripts/exec_test.py index 3065da6e..b10ebda5 100644 --- a/examples/matrix/matmul_nz/scripts/exec_test.py +++ b/examples/matrix/matmul_nz/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -52,17 +54,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR, A_FORMAT, B_FORMAT) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator 
--application=\"./ascendc_matmul_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_partial_output/README.md b/examples/matrix/matmul_partial_output/README.md index 38009236..94254930 100644 --- a/examples/matrix/matmul_partial_output/README.md +++ b/examples/matrix/matmul_partial_output/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现开启Partial Output功能的单算子。Partial Output功能的应用场景为矩阵乘结果不需要累加,只需要输出baseM\*baseK和baseK\*baseN的计算结果baseM\*baseN。 +本样例介绍了调用Matmul高阶API实现开启Partial Output功能的单算子。Partial Output功能的应用场景为矩阵乘结果不需要累加,只需要输出baseM\*baseK和baseK\*baseN的计算结果baseM\*baseN。关于Partial Output功能的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 单次矩阵乘局部输出”章节。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 @@ -73,7 +73,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 @@ -103,4 +103,4 @@ 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` bash run.sh -r cpu -v Ascendxxxyy -p 0 - ``` \ No newline at end of file + ``` diff --git a/examples/matrix/matmul_partial_output/op_kernel/matmul_partial_output_custom_kernel.cpp b/examples/matrix/matmul_partial_output/op_kernel/matmul_partial_output_custom_kernel.cpp index 7c48a8c9..89128d14 100644 --- 
a/examples/matrix/matmul_partial_output/op_kernel/matmul_partial_output_custom_kernel.cpp +++ b/examples/matrix/matmul_partial_output/op_kernel/matmul_partial_output_custom_kernel.cpp @@ -103,6 +103,7 @@ __aicore__ inline void MatmulKernel::Process(Asce Clear(bufferC); } bufferC = bufferC + bufferTmp; + AscendC::PipeBarrier(); outputOffset += tiling.baseM * tiling.baseN; } DataCopy(cGlobal[i * tiling.baseM * tiling.baseN], bufferC, tiling.baseM * tiling.baseN); diff --git a/examples/matrix/matmul_partial_output/scripts/exec_test.py b/examples/matrix/matmul_partial_output/scripts/exec_test.py index 367acae2..98c4338d 100644 --- a/examples/matrix/matmul_partial_output/scripts/exec_test.py +++ b/examples/matrix/matmul_partial_output/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -49,17 +51,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_partial_output_bbit %s %s %s %s\"\ - --output=\"./prof_out\"" % (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_partial_output_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: 
matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_partial_output_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_partial_output_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_preload/README.md b/examples/matrix/matmul_preload/README.md index c85a78a1..fec924ee 100644 --- a/examples/matrix/matmul_preload/README.md +++ b/examples/matrix/matmul_preload/README.md @@ -67,7 +67,7 @@ matmul单算子,对输入的A、B矩阵做矩阵乘和加bias偏置。在MTE2 - 结束矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_preload/main.cpp b/examples/matrix/matmul_preload/main.cpp index 9eeee4ff..3b4a0c4d 100644 --- a/examples/matrix/matmul_preload/main.cpp +++ b/examples/matrix/matmul_preload/main.cpp @@ -40,8 +40,11 @@ static size_t GetSysWorkSpaceSize() // CPU debug mode #ifdef ASCENDC_CPU_DEBUG -void TestMatmulCpu(int64_t m, int64_t n, int64_t k) +void TestMatmulCpu(const MatmulHost::MatmulCaseParams &testCaseParams) { + int64_t m = testCaseParams.m; + int64_t n = testCaseParams.n; + int64_t k = testCaseParams.k; size_t x1FileSize = m * k * sizeof(uint16_t); // uint16_t represent half size_t x2FileSize = k * n * sizeof(uint16_t); // uint16_t represent half size_t yFileSize = m * n * sizeof(float); @@ -62,9 +65,6 @@ void TestMatmulCpu(int64_t m, int64_t n, int64_t k) } size_t tilingFileSize = sizeof(TCubeTiling); uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); - auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); - MatmulHost::MatmulCaseParams 
testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), - static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; // Calculate Tiling const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); @@ -81,7 +81,7 @@ void TestMatmulCpu(int64_t m, int64_t n, int64_t k) } // NPU #else -void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, MatmulHost::MatmulCaseParams testCaseParams, void* stream = nullptr) { // Init args @@ -97,9 +97,6 @@ void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, in uint8_t* tilingHost; uint8_t* tilingDevice; size_t tilingFileSize = sizeof(TCubeTiling); - auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); - MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), - static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; // Calculate Tiling const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); @@ -129,12 +126,12 @@ void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId CHECK_ACL(aclFinalize()); } -void TestMatmul(int64_t m, int64_t n, int64_t k) +void TestMatmul(const MatmulHost::MatmulCaseParams &testCaseParams) { - size_t x1FileSize = static_cast(m * k) * sizeof(uint16_t); // uint16_t represent half - size_t x2FileSize = static_cast(k * n) * sizeof(uint16_t); // uint16_t represent half - size_t yFileSize = static_cast(m * n) * sizeof(float); - size_t biasFileSize = static_cast(1 * n) * sizeof(float); + size_t x1FileSize = static_cast(testCaseParams.m * testCaseParams.k) * sizeof(uint16_t); // uint16_t represent half + size_t x2FileSize = static_cast(testCaseParams.k * testCaseParams.n) * sizeof(uint16_t); // 
uint16_t represent half + size_t yFileSize = static_cast(testCaseParams.m * testCaseParams.n) * sizeof(float); + size_t biasFileSize = static_cast(1 * testCaseParams.n) * sizeof(float); aclrtContext context; aclrtStream stream = nullptr; @@ -168,7 +165,7 @@ void TestMatmul(int64_t m, int64_t n, int64_t k) CHECK_ACL(aclrtMallocHost((void **)(&yHost), yFileSize)); CHECK_ACL(aclrtMalloc((void **)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + MatmulOp(x1Device, x2Device, yDevice, biasDevice, testCaseParams, stream); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); @@ -191,16 +188,23 @@ void TestMatmul(int64_t m, int64_t n, int64_t k) int32_t main(int32_t argc, const char* args[]) { - int64_t problem[3] = {1, 1, 1}; - for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 + int64_t problem[12] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + int32_t size = 13; + for (int32_t i = 1; i < argc && i < size; ++i) { std::stringstream ss(args[i]); ss >> problem[i - 1]; } auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(problem[0]), static_cast(problem[1]), static_cast(problem[2]), + IS_BIAS, IS_A_TRANS, IS_B_TRANS, + static_cast(problem[3]), static_cast(problem[4]), static_cast(problem[5]), + static_cast(problem[6]), static_cast(problem[7]), static_cast(problem[8]), + static_cast(problem[9]), static_cast(problem[10]), static_cast(problem[11])}; #ifdef ASCENDC_CPU_DEBUG - MatmulHost::TestMatmulCpu(problem[0], problem[1], problem[2]); // 2 means problem shape k + MatmulHost::TestMatmulCpu(testCaseParams); // 2 means problem shape k #else - MatmulHost::TestMatmul(problem[0], problem[1], problem[2]); // 2 means problem shape k + MatmulHost::TestMatmul(testCaseParams); // 2 means problem 
shape k #endif return 0; } \ No newline at end of file diff --git a/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.cpp b/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.cpp index ffe6bb30..f10b125e 100644 --- a/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.cpp +++ b/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.cpp @@ -39,10 +39,25 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) tiling.SetShape(M, N, K); tiling.EnableBias(isBias); tiling.SetBufferSpace(-1, -1, -1); +#if defined(CUSTOM_PRELOAD_N) + tiling.SetTraverse(matmul_tiling::MatrixTraverse::FIRSTN); +#else + tiling.SetTraverse(matmul_tiling::MatrixTraverse::FIRSTM); +#endif + if (tiling.GetTiling(tilingData) == -1) { std::cout << "Generate tiling failed." << std::endl; return {}; } + tilingData.baseM = testCaseParams.baseM; + tilingData.baseN = testCaseParams.baseN; + tilingData.baseK = testCaseParams.baseK; + tilingData.stepM = testCaseParams.stepM; + tilingData.stepN = testCaseParams.stepN; + tilingData.stepKa = testCaseParams.stepKa; + tilingData.stepKb = testCaseParams.stepKb; + tilingData.depthA1 = testCaseParams.depthA1; + tilingData.depthB1 = testCaseParams.depthB1; return tilingData; } diff --git a/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.h b/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.h index bde9c15b..eed22057 100644 --- a/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.h +++ b/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.h @@ -24,6 +24,15 @@ struct MatmulCaseParams bool isBias; bool isATrans; bool isBTrans; + int32_t stepM; + int32_t stepN; + int32_t stepKa; + int32_t stepKb; + int32_t depthA1; + int32_t depthB1; + int32_t baseM; + int32_t baseN; + int32_t baseK; }; /** diff --git a/examples/matrix/matmul_preload/scripts/exec_test.py b/examples/matrix/matmul_preload/scripts/exec_test.py index 
6f38ff9f..2ae3674c 100644 --- a/examples/matrix/matmul_preload/scripts/exec_test.py +++ b/examples/matrix/matmul_preload/scripts/exec_test.py @@ -14,13 +14,14 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_perf_task_duration, clear_file_cache, get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -31,7 +32,8 @@ logging.basicConfig(level=logging.INFO) class ProcessParams: - def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + def __init__(self, case_name, m, n, k, b, stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek,\ + is_perf, run_mode): self.case_name = case_name self.m = m self.n = n @@ -39,27 +41,36 @@ class ProcessParams: self.b = b self.is_perf = is_perf self.run_mode = run_mode + self.stepm = stepm + self.stepn = stepn + self.stepka = stepka + self.stepkb = stepkb + self.deptha1 = deptha1 + self.depthb1 = depthb1 + self.basem = basem + self.basen = basen + self.basek = basek def process_case(file_work_dir, process_params): case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek = process_params.stepm, process_params.stepn,\ + process_params.stepka, process_params.stepkb, process_params.deptha1, process_params.depthb1,\ + process_params.basem, process_params.basen, process_params.basek logging.info("[INFO] start process case [%s]" % (case_name)) logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, 
IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_preload_bbit %s %s %s %s\"\ - --output=\"./prof_out\"" % (m, n, k, b)) - elif run_mode == "sim": # sim(is_perf = 0) - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_preload_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_preload_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {stepm} {stepn} {stepka} {stepkb} {deptha1} {depthb1} {basem} {basen} {basek} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_preload_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: @@ -100,12 +111,11 @@ def get_case_list(): item_list = [] item_list.append(int(row[0].lstrip("\ufeff"))) item_list.append(row[1]) - item_list.append(int(row[2])) - item_list.append(int(row[3])) - item_list.append(int(row[4])) - item_list.append(row[5].lstrip()) - if len(row) > 6: - item_list.append(int(row[6])) + for idx in range(2, 14): + item_list.append(int(row[idx])) + item_list.append(row[14].lstrip()) + if len(row) > 15: + item_list.append(int(row[15])) else: item_list.append(1) case_list.append(item_list) @@ -132,11 +142,13 @@ def main(): case_list = get_case_list() res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] run_mode = sys.argv[1] - for is_process, case_name, m, n, k, case_mode, b in case_list: + for is_process, case_name, m, n, k, stepm, stepn, stepka, stepkb, deptha1, depthb1,\ + basem, basen, basek, case_mode, b in case_list: is_run = is_process == 1 and\ ((preload_mode == "M" and case_mode == "M") or (preload_mode == "N" and case_mode == "N")) if is_run: - process_params = 
ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + process_params = ProcessParams(case_name, m, n, k, b, stepm, stepn, stepka, stepkb,\ + deptha1, depthb1, basem, basen, basek, is_perf, run_mode) res_data = process_case(file_work_dir, process_params) res_list.append(res_data) diff --git a/examples/matrix/matmul_preload/testcase/case.csv b/examples/matrix/matmul_preload/testcase/case.csv index f4d8f554..218ae898 100644 --- a/examples/matrix/matmul_preload/testcase/case.csv +++ b/examples/matrix/matmul_preload/testcase/case.csv @@ -1,2 +1,2 @@ -1, case001, 12288, 256, 128, M -1, case002, 134, 32112, 128, N \ No newline at end of file +1, case001, 12288, 256, 128, 5, 1, 2, 2, 10, 2, 128, 256, 64, M +1, case002, 128, 24576, 512, 1, 1, 8, 8, 8, 16, 128, 128, 64, N \ No newline at end of file diff --git a/examples/matrix/matmul_quant/CMakeLists.txt b/examples/matrix/matmul_quant/CMakeLists.txt new file mode 100644 index 00000000..68890e5e --- /dev/null +++ b/examples/matrix/matmul_quant/CMakeLists.txt @@ -0,0 +1,73 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if (${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_quant_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_quant_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_quant_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_quant_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_quant_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_matmul_quant_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_quant_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/matmul_quant/README.md b/examples/matrix/matmul_quant/README.md new file mode 100644 
index 00000000..7b6479ad --- /dev/null +++ b/examples/matrix/matmul_quant/README.md @@ -0,0 +1,120 @@ + +## 概述 + +本样例使用Matmul API实现了int8类型输入、half类型输出的Matmul反量化场景的算子,支持同一系数的反量化模式和向量的反量化模式。关于Matmul量化/反量化的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 矩阵乘输出的量化/反量化”章节。 + +本样例以直调的方式调用算子核函数。 +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + MatmulQuantCustom单算子调用Matmul高阶API实现Matmul计算时int8_t类型输入,计算结果以half类型反量化输出,同时支持同一系数的反量化模式与向量的反量化模式。该场景下将C矩阵数据从CO1搬出到Global Memory时,会执行反量化操作,对输出矩阵的所有值采用同一系数或向量进行反量化。 + +- 算子规格 + + + + + + + + + + + + + +
算子类型(OpType)MatmulQuantCustom
算子输入nameshapedata typeformatisTrans
a-int8_tNDfalse
b-int8_tNDtrue
bias-int32_tND-
quantVector-uint64_t--
算子输出c-halfND-
核函数名matmul_quant_custom
+ +## 算子实现介绍 +- 算子Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[N, K]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + - Matmul初始化。 + 基于testcase目录下case.csv的配置,初始化Matmul的反量化模式。若测试用例配置反量化模式值为1,则算子的输入quantVector为空,初始化Matmul的反量化模式quantMode为同一系数的反量化模式。反之,若测试用例配置反量化模式值为2,则算子的输入quantVector非空,初始化Matmul的反量化模式quantMode为向量反量化模式。 + ``` + MatmulQuantCustom::MatmulKernel matmulKernel; + ... ... + if (quantVector) { + matmulKernel.Init(a, b, bias, quantVector, c, tiling, false, true, 2); // 向量反量化模式(PER_CHANNEL) + } else { + matmulKernel.Init(a, b, bias, quantVector, c, tiling, false, true, 1); // 同一系数的反量化模式(PER_TENSOR) + } + ``` + - 设置左矩阵A、右矩阵B、偏置矩阵Bias。 + - 设置反量化参数。 + 根据Matmul初始化的反量化模式quantMode,对应设置反量化参数。 + ``` + if (this->quantMode == 1) { + float tmp = 0.1f; + uint64_t s = static_cast(*reinterpret_cast(&tmp)); // 转换为uint64_t + matmulObj.SetQuantScalar(s); // 同一系数的反量化模式(PER_TENSOR),设置反量化的Scalar系数 + } else if (this->quantMode == 2) { + matmulObj.SetQuantVector(quantGlobal); // 向量反量化模式(PER_CHANNEL),设置反量化的系数向量 + } + ``` + - 完成矩阵乘操作。 + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 设置Matmul反量化模式。 + ``` + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + + if (quantMode == 1) { + cubeTiling.SetDequantType(matmul_tiling::DequantType::SCALAR); // 设置同一系数的反量化模式(PER_TENSOR) + } else if (quantMode == 2) { + cubeTiling.SetDequantType(matmul_tiling::DequantType::TENSOR); // 设置向量反量化模式(PER_CHANNEL) + } + ``` + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export 
ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No newline at end of file diff --git a/examples/matrix/matmul_quant/cmake/cpu_lib.cmake b/examples/matrix/matmul_quant/cmake/cpu_lib.cmake new file mode 100644 index 00000000..487a91e0 --- /dev/null +++ b/examples/matrix/matmul_quant/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_quant/cmake/npu_lib.cmake b/examples/matrix/matmul_quant/cmake/npu_lib.cmake new file mode 100644 index 00000000..bc803099 --- /dev/null +++ b/examples/matrix/matmul_quant/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_quant/main.cpp b/examples/matrix/matmul_quant/main.cpp new file mode 100644 index 00000000..07011c35 --- /dev/null +++ b/examples/matrix/matmul_quant/main.cpp @@ -0,0 +1,258 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/matmul_quant_custom_tiling.h" + +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" + +extern void matmul_quant_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* quantVector, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#else +#include "tikicpulib.h" +extern "C" void matmul_quant_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* quantVector, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#endif + +namespace { +constexpr bool IS_BIAS = true; +constexpr bool IS_A_TRANS = false; +constexpr bool IS_B_TRANS = true; +// 1: scalar quant mode, 2: vector quant mode +constexpr int64_t VECTOR_QUANT_MODE = 2; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(int64_t m, int64_t n, int64_t k, int64_t quantMode, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias = nullptr; + uint8_t* quantVector = nullptr; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, 
x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + if (quantMode == VECTOR_QUANT_MODE){ + size_t quantVectorSize = 1 * n * sizeof(uint64_t); + quantVector = (uint8_t*)AscendC::GmAlloc(quantVectorSize); + ReadFile("../input/quant_vector_gm.bin", quantVectorSize, quantVector, quantVectorSize); + } + + // Calculate Tiling + size_t tilingFileSize = sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), static_cast(quantMode), + IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + + // Kernel Launch + ICPU_RUN_KF(matmul_quant_custom, tilingData.usedCoreNum, x1, x2, bias, quantVector, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } + if (quantVector) { + AscendC::GmFree((void*)quantVector); + } +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, uint8_t* quantVector, + int64_t m, int64_t n, int64_t k, int64_t quantMode, void* stream = nullptr) +{ + // Init args + uint8_t* workspaceDevice = nullptr; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void**)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost = nullptr; + uint8_t* tilingDevice = nullptr; + 
size_t tilingFileSize = sizeof(TCubeTiling); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), static_cast(quantMode), + IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + matmul_quant_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, quantVector, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(int64_t m, int64_t n, int64_t k, int64_t quantMode, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + size_t quantVectorSize = static_cast(1 * n) * sizeof(uint64_t); + + aclrtContext context; + aclrtStream stream = nullptr; 
+ int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t* x1Host = nullptr; + uint8_t* x1Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host = nullptr; + uint8_t* x2Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + + uint8_t* quantVectorHost = nullptr; + uint8_t* quantVectorDevice = nullptr; + if (quantMode == VECTOR_QUANT_MODE) { + CHECK_ACL(aclrtMallocHost((void**)(&quantVectorHost), quantVectorSize)); + CHECK_ACL(aclrtMalloc((void**)&quantVectorDevice, quantVectorSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/quant_vector_gm.bin", quantVectorSize, quantVectorHost, quantVectorSize); + CHECK_ACL(aclrtMemcpy(quantVectorDevice, quantVectorSize, quantVectorHost, quantVectorSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, 
quantVectorDevice, m, n, k, quantMode, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + if (quantMode == VECTOR_QUANT_MODE) { + CHECK_ACL(aclrtFree(quantVectorDevice)); + CHECK_ACL(aclrtFreeHost(quantVectorHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char* args[]) +{ + int64_t inputParams[argc] = {1}; // M, N, K, QuantMode + for (int32_t i = 1; i < argc; ++i) { + std::stringstream ss(args[i]); + ss >> inputParams[i - 1]; + } + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + int64_t M = inputParams[0]; + int64_t N = inputParams[1]; + int64_t K = inputParams[2]; + int64_t quantMode = inputParams[3]; + MatrixFileSize matrixFileSize; + // uint16_t represent half + matrixFileSize.x1FileSize = static_cast(M * K) * sizeof(int8_t); + matrixFileSize.x2FileSize = static_cast(K * N) * sizeof(int8_t); + matrixFileSize.yFileSize = static_cast(M * N) * sizeof(uint16_t); + matrixFileSize.biasFileSize = static_cast(1 * N) * sizeof(int32_t); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(M, N, K, quantMode, matrixFileSize); +#else + MatmulHost::TestMatmul(M, N, K, quantMode, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.cpp b/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.cpp new file mode 100644 index 00000000..6e9294c5 --- /dev/null +++ b/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.cpp @@ 
-0,0 +1,57 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "matmul_quant_custom_tiling.h" +#include + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t M = testCaseParams.m; + uint32_t N = testCaseParams.n; + uint32_t K = testCaseParams.k; + uint32_t blockDim = testCaseParams.usedCoreNum; + uint32_t quantMode = testCaseParams.quantMode; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_INT8, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_INT8, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_INT32); + + if (quantMode == 1) { + cubeTiling.SetDequantType(matmul_tiling::DequantType::SCALAR); // set PRE_TENSOR quant mode + } else if (quantMode == 2) { + 
cubeTiling.SetDequantType(matmul_tiling::DequantType::TENSOR); // set PRE_CHANNEL quant mode + } + std::cout << "[INFO] Set Quant Mode: " << quantMode << std::endl; + + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed." << std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.h b/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.h new file mode 100644 index 00000000..a41b2584 --- /dev/null +++ b/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.h @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_QUANT_OP_HOST_MATMUL_QUANT_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_QUANT_OP_HOST_MATMUL_QUANT_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + int32_t quantMode; + bool isBias; + bool isATrans; + bool isBTrans; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. 
+ */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_MATMUL_QUANT_OP_HOST_MATMUL_QUANT_CUSTOM_TILING_H diff --git a/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.cpp b/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.cpp new file mode 100644 index 00000000..8f440983 --- /dev/null +++ b/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.cpp @@ -0,0 +1,156 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_quant_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. 
+ * @retval None + */ +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); ++i, ++ptr) { + *ptr = *(tiling32 + i); + } + return; +} +} + +namespace MatmulQuantCustom { +template +__aicore__ inline void MatmulKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR quantVector, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB, int64_t quantMode) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(quantVector), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + this->isTransA = isTransA; + this->isTransB = isTransB; + this->quantMode = quantMode; + CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + quantGlobal = quantGlobal[offsetBias]; + + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void MatmulKernel::Process() +{ + if (AscendC::GetBlockIdx() >= this->tiling.usedCoreNum) { + return; + } + + if (this->quantMode == 1) { + float tmp = 0.1f; // set quant scale value + uint64_t s = static_cast(*reinterpret_cast(&tmp)); //float transport to uint64_t type + matmulObj.SetQuantScalar(s); // set PRE_TENSOR quant mode + } else if(this->quantMode == 2){ + matmulObj.SetQuantVector(quantGlobal); // set PRE_CHANNEL quant mode + } + + // process 
with tail block + int tailN = tiling.N - nCoreIndex * tiling.singleCoreN; + tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; + int tailM = tiling.M - mCoreIndex * tiling.singleCoreM; + tailM = tailM < tiling.singleCoreM ? tailM : tiling.singleCoreM; + if (tailM < tiling.singleCoreM || tailN < tiling.singleCoreN) { + matmulObj.SetTail(tailM, tailN); + } + + matmulObj.SetTensorA(aGlobal, isTransA); + matmulObj.SetTensorB(bGlobal, isTransB); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +template +__aicore__ inline void MatmulKernel::CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) +{ + const TCubeTiling& tiling = this->tiling; + const auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores + mCoreIndex = blockIdx % mSingleBlocks; + nCoreIndex = blockIdx / mSingleBlocks; + + offsetA = isTransA ? (mCoreIndex * tiling.singleCoreM) : + mCoreIndex * tiling.Ka * tiling.singleCoreM; + + offsetB = isTransB ? (nCoreIndex * tiling.Kb * tiling.singleCoreN) : + nCoreIndex * tiling.singleCoreN; + offsetC = mCoreIndex * tiling.N * tiling.singleCoreM + nCoreIndex * tiling.singleCoreN; + offsetBias = nCoreIndex * tiling.singleCoreN; +} +} // namespace MatmulQuantCustom + +/** + * @brief matmul kernel function. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param quantVector: Quant Vector gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tilingGm: Tiling data addr. 
+ * @retval None + */ +extern "C" __global__ __aicore__ void matmul_quant_custom( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR quantVector, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + if (g_coreType == AscendC::AIV) { + return; + } + // prepare tiling + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + // define matmul kernel + MatmulQuantCustom::MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel, isTransA=false, isTransB=true, quantMode = {1: Scaler, 2: Vector} + if (quantVector) { + matmulKernel.Init(a, b, bias, quantVector, c, tiling, false, true, 2); + } else { + matmulKernel.Init(a, b, bias, quantVector, c, tiling, false, true, 1); + } + // matmul kernel process + matmulKernel.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_quant_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR quantVector, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_quant_custom<<>>(a, b, bias, quantVector, c, workspace, tilingGm); +} +#endif diff --git a/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.h b/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.h new file mode 100644 index 00000000..f6e3be78 --- /dev/null +++ b/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.h @@ -0,0 +1,76 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_QUANT_OP_KERNEL_MATMUL_QUANT_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_QUANT_OP_KERNEL_MATMUL_QUANT_CUSTOM_KERNEL_H +#include "kernel_operator.h" +// Cube Only +#define ASCENDC_CUBE_ONLY +#include "lib/matmul_intf.h" + +namespace MatmulQuantCustom { +template +class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param quantVector: Quant vector gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @param isTransA: Whether A matrix is transposed. + * @param isTransB: Whether B matrix is transposed. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR quantVector, GM_ADDR c, + const TCubeTiling& tiling, bool isTransA, bool isTransB, int64_t quantMode); + /** + * @brief Process matrix calculation. + * @retval None + */ + __aicore__ inline void Process(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
+ * @retval None + */ + __aicore__ inline void CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + AscendC::GlobalTensor quantGlobal; + TCubeTiling tiling; + int32_t mCoreIndex; + int32_t nCoreIndex; + bool isTransA{false}; + bool isTransB{false}; + int64_t quantMode{1}; +}; +} // namespace MatmulQuantCustom + +#endif // EXAMPLES_MATRIX_MATMUL_QUANT_OP_KERNEL_MATMUL_QUANT_CUSTOM_KERNEL_H diff --git a/examples/matrix/matmul_quant/run.sh b/examples/matrix/matmul_quant/run.sh new file mode 100644 index 00000000..34ed330f --- /dev/null +++ b/examples/matrix/matmul_quant/run.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! 
"${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_matmul_quant_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/matmul_quant/scripts/exec_test.py b/examples/matrix/matmul_quant/scripts/exec_test.py new file mode 100644 index 00000000..db26fa8a --- /dev/null +++ b/examples/matrix/matmul_quant/scripts/exec_test.py @@ -0,0 +1,124 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. 
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess +import shlex + +import numpy as np + +sys.path.append("../..") +from gen_quant_data import QuantMatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, clear_file_cache, get_perf_task_duration + +IS_BIAS = True +IS_TRANS_A = False +IS_TRANS_B = True +# int8_float16_dequant(int8 in float16 out for quant matmul) +DATA_TYPE_STR = "int8_float16_dequant" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, quant_mode, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.quant_mode = quant_mode + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, quant_mode, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.quant_mode, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + clear_file_cache(file_work_dir) + + matmul_gen_data = QuantMatmulGenData(m, n, k, quant_mode, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + params_str = f"{m} {n} {k} 
{quant_mode}" + matmul_gen_data.gen_golden_data(file_work_dir) + if is_perf: + cmd = f"msprof op --application=\"./ascendc_matmul_quant_bbit {params_str}\" --output=./prof_out" + elif run_mode == "sim": + cmd = f"msprof op simulator --application=\"./ascendc_matmul_quant_bbit {params_str}\" --output=./sim_out" + else: + cmd = f"./ascendc_matmul_quant_bbit {params_str}" + subprocess.run(shlex.split(cmd)) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(m * n) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (m * n) > 0.001: + res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" 
% (file_work_dir)) + return -1 + + is_perf = False + title_list = ["case_name", "wrong_num", "total_num", "result"] + if sys.argv[2] == "perf": + is_perf = True + title_list.append("task_duration") + + case_list = get_case_list() + res_list = [title_list] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, quant_mode in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, quant_mode, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/matmul_quant/scripts/gen_quant_data.py b/examples/matrix/matmul_quant/scripts/gen_quant_data.py new file mode 100644 index 00000000..4e5e2db9 --- /dev/null +++ b/examples/matrix/matmul_quant/scripts/gen_quant_data.py @@ -0,0 +1,97 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import sys +import logging + +import numpy as np + +IS_OUTPUT_TXT = False + + +class QuantMatmulGenData: + def __init__(self, m, n, k, quant_mode, is_trans_a, is_trans_b, is_bias, data_type_str): + self.m = m + self.n = n + self.k = k + self.quant_mode = quant_mode + self.is_bias = is_bias + self.is_trans_a = is_trans_a + self.is_trans_b = is_trans_b + self.data_type_str = data_type_str + + + def gen_golden_data_int8_float16_dequant(self, work_dir): + src_type = np.int8 + dst_type = np.float16 + data_type = np.int32 + quant_type = np.uint64 + temp_type = np.float32 + + # generate input x1, x2 + x1_gm = np.random.randint(-10, 10, [self.m, self.k]).astype(src_type) + x2_gm = np.random.randint(-10, 10, [self.k, self.n]).astype(src_type) + if self.is_bias: + bias_gm = np.random.randint(-2, 2, [1, self.n]).astype(data_type) + if self.quant_mode == 2: # 1: scalar quant mode, 2: vector quant mode + quant_vector = np.random.uniform(0.1, 2.0, [1, self.n]).astype(temp_type) + quant_vector_gm = np.frombuffer(quant_vector, data_type) + quant_vector_gm = quant_vector_gm.astype(quant_type) + + y_gm_int32 = np.matmul(x1_gm.astype(data_type), x2_gm.astype(data_type)) + if self.is_bias: + y_gm_int32 = y_gm_int32 + bias_gm + + golden = y_gm_int32.astype(dst_type) + if self.quant_mode == 1: + golden = golden * 0.1 + elif self.quant_mode == 2: + quant_vector = quant_vector.view("uint32") + for index, data in enumerate(quant_vector): + # 1 sign bit, 8 exponent bits and 10 mantissa bits + quant_vector[index] = np.bitwise_and(data, 0xFFFFE000) + quant_vector = quant_vector.view("float32") + for i in range(self.m): + golden[i, :] = golden[i, :] * quant_vector + else: + logging.info("[ERROR] can't support quant mode %s" % (self.quant_mode)) + + if self.is_trans_a: + x1_gm = x1_gm.transpose() + if self.is_trans_b: + x2_gm = x2_gm.transpose() + 
x1_gm.tofile(work_dir + "/input/x1_gm.bin") + x2_gm.tofile(work_dir + "/input/x2_gm.bin") + if self.is_bias: + bias_gm.tofile(work_dir + "/input/bias_gm.bin") + if self.quant_mode == 2: + quant_vector_gm.tofile(work_dir + "/input/quant_vector_gm.bin") + golden.tofile(work_dir + "/output/golden.bin") + + if IS_OUTPUT_TXT: + np.savetxt(work_dir + "/input/x1_gm.txt", x1_gm.flatten(), fmt='%d', newline='\n') + np.savetxt(work_dir + "/input/x2_gm.txt", x2_gm.flatten(), fmt='%d', newline='\n') + np.savetxt(work_dir + "/output/golden.txt", golden.flatten(), fmt='%f', newline='\n') + if self.is_bias: + np.savetxt(work_dir + "/input/bias_gm.txt", bias_gm.flatten(), fmt='%d', newline='\n') + if self.quant_mode == 2: + np.savetxt(work_dir + "/output/quant_vector_gm.txt", quant_vector_gm.flatten(), fmt='%d', newline='\n') + return 0 + + + def gen_golden_data(self, work_dir): + if self.data_type_str == "int8_float16_dequant": + self.gen_golden_data_int8_float16_dequant(work_dir) + else: + logging.info("[ERROR] can't support data type %s" % (self.data_type_str)) + return -1 + return 0 diff --git a/examples/matrix/matmul_quant/testcase/case.csv b/examples/matrix/matmul_quant/testcase/case.csv new file mode 100644 index 00000000..161b8812 --- /dev/null +++ b/examples/matrix/matmul_quant/testcase/case.csv @@ -0,0 +1,4 @@ +1, ScalarQuantCase001, 1024, 1024, 1024, 1 +1, ScalarQuantCase002, 32, 32, 64, 1 +1, VectorQuantCase001, 1024, 1024, 1024, 2 +1, VectorQuantCase002, 256, 512, 256, 2 \ No newline at end of file diff --git a/examples/matrix/matmul_sparse/README.md b/examples/matrix/matmul_sparse/README.md index 1124d0d8..d90f9175 100644 --- a/examples/matrix/matmul_sparse/README.md +++ b/examples/matrix/matmul_sparse/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例使用Matmul API实现了Sparse Matmul场景的算子,即左矩阵A为稀疏矩阵,右矩阵B为4:2稠密化后的矩阵的Matmul计算。 +本样例使用Matmul API实现了Sparse Matmul场景的算子,即左矩阵A为稀疏矩阵,右矩阵B为4:2稠密化后的矩阵的Matmul计算。关于Sparse Matmul的具体内容请参考《[Ascend 
C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 4:2稀疏矩阵乘”章节。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 @@ -15,7 +15,7 @@ | [cmake](cmake) | 编译工程文件 | | [op_host](op_host) | 本样例的tiling代码实现 | | [op_kernel](op_kernel) | 本样例的kernel代码实现 | -| [scripts](scripts) | 执行文件 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | | [testcase](testcase) | 用例文件,配置用例的计算shape信息 | | CMakeLists.txt | 编译工程文件 | | main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | @@ -68,7 +68,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 使能Sparse Matmul稀疏矩阵计算场景。 @@ -104,4 +104,4 @@ 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` bash run.sh -r cpu -v Ascendxxxyy -p 0 - ``` \ No newline at end of file + ``` diff --git a/examples/matrix/matmul_sparse/scripts/exec_test.py b/examples/matrix/matmul_sparse/scripts/exec_test.py index 1874639f..a44b20e2 100644 --- a/examples/matrix/matmul_sparse/scripts/exec_test.py +++ b/examples/matrix/matmul_sparse/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from gen_sparse_data import SparseMatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = SparseMatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, 
DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_sparse_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_sparse_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_sparse_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_sparse_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_sparse/scripts/gen_sparse_data.py b/examples/matrix/matmul_sparse/scripts/gen_sparse_data.py index d6082ad3..758d7a68 100644 --- a/examples/matrix/matmul_sparse/scripts/gen_sparse_data.py +++ b/examples/matrix/matmul_sparse/scripts/gen_sparse_data.py @@ -197,11 +197,11 @@ class SparseMatmulGenData: with open(work_dir + "/input/x1_gm.bin", 'wb') as file: file.truncate(file_byte) - file_byte = self.k / 2 * self.n * data_type_bytes_ab + file_byte = int(self.k / 2 * self.n * data_type_bytes_ab) with open(work_dir + "/input/x2_gm.bin", 'wb') as file: file.truncate(file_byte) - file_byte = self.k / 8 * self.n * data_type_bytes_ab + file_byte = int(self.k / 8 * self.n * data_type_bytes_ab) with open(work_dir + "/input/index_gm.bin", 'wb') as file: file.truncate(file_byte) diff --git a/examples/matrix/matmul_splitk/README.md b/examples/matrix/matmul_splitk/README.md index dcb7c851..43ee9419 100644 --- a/examples/matrix/matmul_splitk/README.md +++ b/examples/matrix/matmul_splitk/README.md @@ -69,7 +69,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul 
kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,并且使能多核切K,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,并且使能多核切K,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等,调用EnableMultiCoreSplitK使能多核切K。 diff --git a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom.cpp b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom.cpp index 629977b1..e07b783d 100644 --- a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom.cpp +++ b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom.cpp @@ -28,11 +28,13 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) extern "C" __global__ __aicore__ void matmul_splitk_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + if ASCEND_IS_AIV { + return; + } TCubeTiling tiling; MatmulSplitKCustom::CopyTiling(&tiling, tilingGm); MatmulSplitKCustom::MatmulSplitkKernel matmulKernel; AscendC::TPipe pipe; - REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); matmulKernel.Init(a, b, bias, c, workspace, tiling); matmulKernel.Process(&pipe); } diff --git a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h index 503eb3bf..26a36bb2 100644 --- a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h +++ b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h @@ -11,6 +11,7 @@ #ifndef EXAMPLES_MATRIX_MATMUL_SPLITK_OP_KERNEL_MATMUL_SPLITK_CUSTOM_IMPL_H #define EXAMPLES_MATRIX_MATMUL_SPLITK_OP_KERNEL_MATMUL_SPLITK_CUSTOM_IMPL_H #include "kernel_operator.h" +#define ASCENDC_CUBE_ONLY #include "lib/matmul_intf.h" namespace MatmulSplitKCustom { @@ -22,7 +23,7 @@ class MatmulSplitkKernel { __aicore__ inline MatmulSplitkKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, 
GM_ADDR workspace, const TCubeTiling& tiling); __aicore__ inline void Process(AscendC::TPipe* pipe); - AscendC::Matmul< + AscendC::MatmulImpl< AscendC::MatmulType, AscendC::MatmulType, AscendC::MatmulType, @@ -69,9 +70,8 @@ __aicore__ inline void MatmulSplitkKernel::Init( template __aicore__ inline void MatmulSplitkKernel::Process(AscendC::TPipe* pipe) { - if (matmul::GetBlockIdx() >= tiling.usedCoreNum) { - return; - } + matmulObj.Init(&(this->tiling), pipe); + matmulObj.SetSubBlockIdx(0); matmulObj.SetTensorA(aGlobal); matmulObj.SetTensorB(bGlobal); if (tiling.isBias) { diff --git a/examples/matrix/matmul_splitk/scripts/exec_test.py b/examples/matrix/matmul_splitk/scripts/exec_test.py index f2057c01..2f2b876d 100644 --- a/examples/matrix/matmul_splitk/scripts/exec_test.py +++ b/examples/matrix/matmul_splitk/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -49,17 +51,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_splitk_bbit %s %s %s %s\"\ - --output=\"./prof_out\"" % (m, n, k, b)) - elif run_mode == "sim": # sim(is_perf = 0) - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_splitk_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % 
(m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_splitk_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_splitk_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_splitk/testcase/case.csv b/examples/matrix/matmul_splitk/testcase/case.csv index 0036707a..5b39cef8 100644 --- a/examples/matrix/matmul_splitk/testcase/case.csv +++ b/examples/matrix/matmul_splitk/testcase/case.csv @@ -1 +1,2 @@ -1, case001, 16, 16, 1024 \ No newline at end of file +1, case001, 16, 16, 1024 +0, case002, 256, 512, 1536 \ No newline at end of file diff --git a/examples/matrix/matmul_triangle/README.md b/examples/matrix/matmul_triangle/README.md index 31ef4972..2da1418a 100644 --- a/examples/matrix/matmul_triangle/README.md +++ b/examples/matrix/matmul_triangle/README.md @@ -15,7 +15,7 @@ | [cmake](cmake) | 编译工程文件 | | [op_host](op_host) | 本样例tiling代码实现 | | [op_kernel](op_kernel) | 本样例kernel代码实现 | -| [scripts](scripts) | 执行文件 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | | [testcase](testcase) | 用例文件,配置用例的计算shape信息 | | [CMakeLists.txt](CMakeLists.txt) | 编译工程文件 | | [main.cpp](main.cpp) | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | @@ -101,7 +101,7 @@ - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息,以及SingleShape和baseM、baseN、baseK信息。 diff --git a/examples/matrix/matmul_triangle/main.cpp b/examples/matrix/matmul_triangle/main.cpp index b96f0726..346a9905 100644 --- a/examples/matrix/matmul_triangle/main.cpp +++ b/examples/matrix/matmul_triangle/main.cpp @@ -186,7 +186,8 @@ 
int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3], problem[4], problem[5], + bool isBias = problem[4]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3], isBias, problem[5], problem[6], problem[7], problem[8], problem[9], problem[10]}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG diff --git a/examples/matrix/matmul_tscm/README.md b/examples/matrix/matmul_tscm/README.md index 454f5a9b..1fa3a717 100644 --- a/examples/matrix/matmul_tscm/README.md +++ b/examples/matrix/matmul_tscm/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例介绍了Matmul API中用户自定义TSCM输入的使用方式。TSCM即Temp Swap Cache Memory,用于临时把数据交换到额外空间,需开发者自行管理以高效利用硬件资源。该场景由用户自行管理L1 Buffer,再将对应数据地址作为Matmul的输入。 +本样例介绍了Matmul API中用户自定义TSCM输入的使用方式。TSCM即Temp Swap Cache Memory,用于临时把数据交换到额外空间,需开发者自行管理以高效利用硬件资源。该场景由用户自行管理L1 Buffer,再将对应数据地址作为Matmul的输入。关于TSCM输入的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > TSCM输入的矩阵乘”章节。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 @@ -71,7 +71,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 @@ -101,4 +101,4 @@ 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` bash run.sh -r cpu -v Ascendxxxyy -p 0 - ``` \ No newline at end of file + ``` diff --git a/examples/matrix/matmul_tscm/scripts/exec_test.py b/examples/matrix/matmul_tscm/scripts/exec_test.py index a7a5461c..de439b5f 100644 --- a/examples/matrix/matmul_tscm/scripts/exec_test.py +++ 
b/examples/matrix/matmul_tscm/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = True IS_TRANS_A = False @@ -51,17 +53,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR, A_FORMAT, B_FORMAT) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_tscm_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_tscm_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_tscm_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_tscm_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_unaligned/README.md b/examples/matrix/matmul_unaligned/README.md index 69dac450..78b02892 100644 --- a/examples/matrix/matmul_unaligned/README.md +++ b/examples/matrix/matmul_unaligned/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现多核非对齐切分的单算子。多核非对齐的切分为M,N,K无法整除singleCoreM, singleCoreN, 
singleCoreK时,需要在不改变原有Tiling的情况下,在Kernel侧调用SetTail接口重新设置本次计算的singleCoreM/singleCoreN/singleCoreK。 +本样例介绍了调用Matmul高阶API实现多核非对齐切分的单算子。多核非对齐的切分为M,N,K无法整除singleCoreM, singleCoreN, singleCoreK时,需要在不改变原有Tiling的情况下,在Kernel侧调用SetTail接口重新设置本次计算的singleCoreM/singleCoreN/singleCoreK。关于多核非对齐切分的具体内容请参考《[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 多核非对齐切分”章节。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方式。 @@ -78,7 +78,7 @@ - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_unaligned/main.cpp b/examples/matrix/matmul_unaligned/main.cpp index 9e7f1270..505344e1 100644 --- a/examples/matrix/matmul_unaligned/main.cpp +++ b/examples/matrix/matmul_unaligned/main.cpp @@ -184,7 +184,8 @@ int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3]}; + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG diff --git a/examples/matrix/matmul_unitflag/CMakeLists.txt b/examples/matrix/matmul_unitflag/CMakeLists.txt new file mode 100644 index 00000000..dc2dc3ff --- /dev/null +++ b/examples/matrix/matmul_unitflag/CMakeLists.txt @@ -0,0 +1,74 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if (${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_unitflag_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_unitflag_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_unitflag_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_unitflag_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_unitflag_bbit PRIVATE + $<$:ENABLE_UNITFLAG_FEATURE> + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_matmul_unitflag_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + 
$:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_unitflag_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/matmul_unitflag/README.md b/examples/matrix/matmul_unitflag/README.md new file mode 100644 index 00000000..d10ff5c3 --- /dev/null +++ b/examples/matrix/matmul_unitflag/README.md @@ -0,0 +1,112 @@ + +## 概述 + +本样例介绍了调用Matmul API实现MDL模板开启UnitFlag功能的单算子。使能UnitFlag功能,可以使算子中的CUBE计算流水与FIXPIPE数据搬出流水并行,提升算子性能。Norm模板、IBShare模板默认使能UnitFlag功能,MDL模板默认不使能UnitFlag功能。 + +本样例以直调的方式调用算子核函数。 +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 执行脚本文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + MatmulUnitFlagCustom算子调用Matmul API计算时,通过配置MatmulConfig中的enUnitFlag参数为true,使能MDL模板开启UnitFlag功能,对输入的A、B矩阵做矩阵乘和加bias偏置。算子使能UnitFlag功能后,在Matmul API内部实现MMAD指令和FIXPIPE指令的细粒度同步,从而使计算流水与数据搬出流水并行,提升算子性能。 + +- 算子规格 + + + + + + + + + + + + +
算子类型(OpType)MatmulUnitFlagCustom
算子输入nameshapedata typeformatisTrans
a-float16NDfalse
b-float16NDfalse
bias-floatND-
算子输出c-floatND-
核函数名matmul_unitflag_custom
+ +## 算子实现介绍 +- 约束条件 + - UnitFlag功能只支持Norm、IBshare、MDL三个模板。 + - 使能UnitFlag功能时,不支持算子内同时存在L0C搬出到Global Memory和L1搬出到Global Memory的两种流水。 + - 使能UnitFlag功能时,若同时使能L0C累加功能,不支持多次Iterate计算,一次GetTensorC输出。 + +- 算子Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + 创建Matmul对象时,自定义MatmulConfig参数,将其中的enUnitFlag参数设置为true,使能UnitFlag功能,获得自定义的使用MDL模板的Matmul对象。 + ``` + __aicore__ inline constexpr MatmulConfig GetUnitFlagCfg() + { + auto mmCfg = CFG_MDL; + #ifdef ENABLE_UNITFLAG_FEATURE + // enable UnitFlag feature + mmCfg.enUnitFlag = true; + #endif + return mmCfg; + } + constexpr static MatmulConfig CFG_MDL_UNITFLAG = GetUnitFlagCfg(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; + ``` + - 初始化操作。 + - 设置左矩阵A、右矩阵B。 + - 完成矩阵乘操作。 + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -e [ENABLE_FEATURE] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - 
IS_PERF :是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能UnitFlag功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 -e 1 + ``` \ No newline at end of file diff --git a/examples/matrix/matmul_unitflag/cmake/cpu_lib.cmake b/examples/matrix/matmul_unitflag/cmake/cpu_lib.cmake new file mode 100644 index 00000000..1cc21b1d --- /dev/null +++ b/examples/matrix/matmul_unitflag/cmake/cpu_lib.cmake @@ -0,0 +1,35 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_UNITFLAG_FEATURE> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_unitflag/cmake/npu_lib.cmake b/examples/matrix/matmul_unitflag/cmake/npu_lib.cmake new file mode 100644 index 00000000..b3b8f341 --- /dev/null +++ b/examples/matrix/matmul_unitflag/cmake/npu_lib.cmake @@ -0,0 +1,28 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_UNITFLAG_FEATURE> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_unitflag/main.cpp b/examples/matrix/matmul_unitflag/main.cpp new file mode 100644 index 00000000..2be142a0 --- /dev/null +++ b/examples/matrix/matmul_unitflag/main.cpp @@ -0,0 +1,225 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/matmul_unitflag_custom_tiling.h" + +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void matmul_unitflag_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#else +#include "tikicpulib.h" +extern "C" void matmul_unitflag_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#endif + +namespace { +constexpr bool IS_BIAS = true; +constexpr bool IS_A_TRANS = false; +constexpr bool IS_B_TRANS = false; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias = nullptr; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + size_t tilingFileSize 
= sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + ICPU_RUN_KF(matmul_unitflag_custom, tilingData.usedCoreNum, x1, x2, bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, + void* stream = nullptr) +{ + // Init args + uint8_t* workspaceDevice = nullptr; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void**)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost = nullptr; + uint8_t* tilingDevice = nullptr; + size_t tilingFileSize = sizeof(TCubeTiling); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + 
CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + matmul_unitflag_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t* x1Host = nullptr; + uint8_t* x1Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host = nullptr; + uint8_t* x2Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", 
x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char* args[]) +{ + int64_t inputParams[3] = {1, 1, 1}; + for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 used for inputParams loop + std::stringstream ss(args[i]); + ss >> inputParams[i - 1]; + } + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + int64_t M = inputParams[0]; + int64_t N = inputParams[1]; + int64_t K = inputParams[2]; + MatrixFileSize matrixFileSize; + // uint16_t represent half + matrixFileSize.x1FileSize = static_cast(M * K) * sizeof(uint16_t); + matrixFileSize.x2FileSize = static_cast(K * N) * 
sizeof(uint16_t); + matrixFileSize.yFileSize = static_cast(M * N) * sizeof(float); + matrixFileSize.biasFileSize = static_cast(1 * N) * sizeof(float); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(M, N, K, matrixFileSize); +#else + MatmulHost::TestMatmul(M, N, K, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.cpp b/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.cpp new file mode 100644 index 00000000..d86f2e3f --- /dev/null +++ b/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.cpp @@ -0,0 +1,49 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "matmul_unitflag_custom_tiling.h" +#include + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t M = testCaseParams.m; + uint32_t N = testCaseParams.n; + uint32_t K = testCaseParams.k; + uint32_t blockDim = testCaseParams.usedCoreNum; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + cubeTiling.SetBufferSpace(-1, -1, -1); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed." << std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.h b/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.h new file mode 100644 index 00000000..ce78fcd6 --- /dev/null +++ b/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.h @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_HOST_MATMUL_UNITFLAG_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_HOST_MATMUL_UNITFLAG_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. + */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_HOST_MATMUL_UNITFLAG_CUSTOM_TILING_H diff --git a/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.cpp b/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.cpp new file mode 100644 index 00000000..a72175db --- /dev/null +++ b/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.cpp @@ -0,0 +1,146 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_unitflag_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. + * @retval None + */ +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} +} + +namespace MatmulUnitFlagCustom { +template +__aicore__ inline void MatmulKernel::Init( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + this->isTransA = isTransA; + this->isTransB = isTransB; + CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void MatmulKernel::Process() +{ + if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) { + return; + } + + // process with tail block + int tailM = tiling.M - mCoreIndex * tiling.singleCoreM; + tailM = tailM < tiling.singleCoreM ? 
tailM : tiling.singleCoreM; + int tailN = tiling.N - nCoreIndex * tiling.singleCoreN; + tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; + if (tailM < tiling.singleCoreM || tailN < tiling.singleCoreN) { + matmulObj.SetTail(tailM, tailN); + } + + matmulObj.SetTensorA(aGlobal, isTransA); + matmulObj.SetTensorB(bGlobal, isTransB); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +template +__aicore__ inline void MatmulKernel::CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) +{ + const TCubeTiling& tiling = this->tiling; + auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores + mCoreIndex = blockIdx % mSingleBlocks; + nCoreIndex = blockIdx / mSingleBlocks; + + if (isTransA) { + offsetA = mCoreIndex * tiling.singleCoreM; + } else { + offsetA = mCoreIndex * tiling.Ka * tiling.singleCoreM; + } + if (isTransB) { + offsetB = nCoreIndex * tiling.Kb * tiling.singleCoreN; + } else { + offsetB = nCoreIndex * tiling.singleCoreN; + } + offsetC = mCoreIndex * tiling.N * tiling.singleCoreM + nCoreIndex * tiling.singleCoreN; + offsetBias = nCoreIndex * tiling.singleCoreN; +} +} // namespace MatmulUnitFlagCustom + +/** + * @brief matmul kernel function. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tilingGm: Tiling data addr. 
+ * @retval None + */ +extern "C" __global__ __aicore__ void matmul_unitflag_custom(GM_ADDR a, GM_ADDR b, + GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ +#if defined(ASCENDC_CPU_DEBUG) + if (g_coreType == AscendC::AIV) { + return; + } +#endif + // prepare tiling + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + // define matmul kernel + MatmulUnitFlagCustom::MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel, isTransA=false, isTransB=false + matmulKernel.Init(a, b, bias, c, tiling, false, false); + // matmul kernel process + matmulKernel.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_unitflag_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_unitflag_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif diff --git a/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.h b/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.h new file mode 100644 index 00000000..bc2a2810 --- /dev/null +++ b/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.h @@ -0,0 +1,85 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_KERNEL_MATMUL_UNITFLAG_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_KERNEL_MATMUL_UNITFLAG_CUSTOM_KERNEL_H +#include "kernel_operator.h" +// Cube Only +#define ASCENDC_CUBE_ONLY +#include "lib/matmul_intf.h" + +namespace MatmulUnitFlagCustom { + +__aicore__ inline constexpr MatmulConfig GetUnitFlagCfg() +{ + auto mmCfg = CFG_MDL; +#ifdef ENABLE_UNITFLAG_FEATURE + // enable UnitFlag feature + mmCfg.enUnitFlag = true; +#endif + return mmCfg; +} +constexpr static MatmulConfig CFG_MDL_UNITFLAG = GetUnitFlagCfg(); + +template +class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @param isTransA: Whether A matrix is transposed. + * @param isTransB: Whether B matrix is transposed. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, + bool isTransA, bool isTransB); + /** + * @brief Process matrix calculation. + * @retval None + */ + __aicore__ inline void Process(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
+ * @retval None + */ + __aicore__ inline void CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; + int32_t mCoreIndex; + int32_t nCoreIndex; + bool isTransA{false}; + bool isTransB{false}; +}; +} // namespace MatmulUnitFlagCustom + +#endif // EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_KERNEL_MATMUL_UNITFLAG_CUSTOM_KERNEL_H diff --git a/examples/matrix/matmul_unitflag/run.sh b/examples/matrix/matmul_unitflag/run.sh new file mode 100644 index 00000000..c53da86e --- /dev/null +++ b/examples/matrix/matmul_unitflag/run.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +export IS_PERF="0" +export ENABLE_FEATURE="1" + +SHORT=r:,v:,p:,e:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! "${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: ${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_matmul_unitflag_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/matmul_unitflag/scripts/exec_test.py b/examples/matrix/matmul_unitflag/scripts/exec_test.py new file mode 100644 index 00000000..eb918ff7 --- /dev/null +++ b/examples/matrix/matmul_unitflag/scripts/exec_test.py @@ -0,0 +1,124 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess +import shlex + +import numpy as np + +sys.path.append("../..") +from common_scripts.gen_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, clear_file_cache, get_perf_task_duration + +IS_BIAS = True +IS_TRANS_A = False +IS_TRANS_B = False +# float16 in float32 out +DATA_TYPE_STR = "float16_float32" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + clear_file_cache(file_work_dir) + + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + params_str = f"{m} {n} {k}" + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + cmd = f"msprof op --application=\"./ascendc_matmul_unitflag_bbit {params_str}\" --output=./prof_out" + elif run_mode == "sim": + matmul_gen_data.gen_golden_data(file_work_dir) + cmd = f"msprof op simulator --application=\"./ascendc_matmul_unitflag_bbit {params_str}\" --output=./sim_out" + else: + matmul_gen_data.gen_golden_data(file_work_dir) + cmd = f"./ascendc_matmul_unitflag_bbit {params_str}" + subprocess.run(shlex.split(cmd)) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % 
(case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(b * m * n) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (b * m * n) > 0.001: + res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" % (file_work_dir)) + return -1 + + is_perf = False + if sys.argv[2] == "perf": + is_perf = True + + case_list = get_case_list() + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/matmul_unitflag/testcase/case.csv b/examples/matrix/matmul_unitflag/testcase/case.csv new file mode 100644 index 00000000..e8f8491a --- /dev/null +++ b/examples/matrix/matmul_unitflag/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 1024, 4096, 1024 \ No newline at end of file diff --git a/examples/readme.md b/examples/readme.md index 8a0e1fc8..e64a3b46 100644 --- 
a/examples/readme.md +++ b/examples/readme.md @@ -36,7 +36,7 @@ 对输入tensor按行做如下公式的计算:zi = (xi - ∑(xi * yi)) * yi,其中∑为按行reduce求和。 - matrix + matrix basic_block_matmul 实现无尾块且tiling的base块大小固定的场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -56,10 +56,6 @@ matmul_async_iterate_all 调用IterateAll实现异步场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - - matmul_constant - 实现MDL模板下使能Tiling常量化的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_ibshareAB 实现A矩阵或B矩阵GM地址相同,A、B矩阵共享L1 Buffer场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -76,6 +72,10 @@ matmul_mndb 实现M或N轴方向流水并行场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + matmul_constant + 实现MDL模板下使能Tiling常量化的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_gemv 实现NORM模板下实现矩阵向量乘的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -96,10 +96,13 @@ matmul_l0c_extend 基于自主管理CO1的Iterate接口实现Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + matmul_l2cache + 实现NORM模板下支持L2 Cache切分的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_mixdualmaster 实现NORM模板下使能双主模式的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_splitk 实现多核切K场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -113,12 +116,8 @@ 实现NORM模板下使能输出矩阵N方向对齐的矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_l2cache - 实现NORM模板下支持L2 Cache切分的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - - - matmul_nz - 输入矩阵内轴非256B对齐场景下,在AIV核上使用DataCopyPad实现ND转换NZ格式的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_sparse + 实现MDL模板下稀疏左矩阵A与4:2稠密化后的右矩阵B的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 matmul_l0cache @@ -128,10 +127,6 @@ matmul_tscm 基于TSCM输入实现Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - - matmul_sparse - 实现MDL模板下稀疏左矩阵A与4:2稠密化后的右矩阵B的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_triangle 实现NORM模板下使能上下三角模板策略的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -140,10 +135,26 @@ matmul_partial_output 实现MDL模板下使能Partial Output功能的Matmul矩阵乘法,计算公式为:C = A * B。 + + matmul_nz + 输入矩阵内轴非256B对齐场景下,在AIV核上使用DataCopyPad实现ND转换NZ格式的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_unaligned 实现NORM模板下多核非对齐切分的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + matmul_unitflag + 实现MDL模板下使能UnitFlag功能的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + + matmul_channelsplit + 
实现NORM模板下使能Channel拆分功能的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + + matmul_quant + 实现MDL模板下反量化功能的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + normalization layernorm -- Gitee