From 8465216551caf85dd04171142db5e6fd65a6564d Mon Sep 17 00:00:00 2001 From: huawei-zhangjunbo Date: Thu, 28 Mar 2024 19:03:21 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E5=A2=9E=E5=8A=A0mskpp=20sample=E6=A0=B7?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/README.md | 26 +++++ sample/mskpp_sample/fa_grad.py | 186 +++++++++++++++++++++++++++++++++ sample/mskpp_sample/softmax.py | 53 ++++++++++ 3 files changed, 265 insertions(+) create mode 100644 sample/mskpp_sample/fa_grad.py create mode 100644 sample/mskpp_sample/softmax.py diff --git a/sample/README.md b/sample/README.md index 6bd55a2f834..b52a7bdd94d 100644 --- a/sample/README.md +++ b/sample/README.md @@ -133,3 +133,29 @@ mssanitizer ./*.fatbin # 默认进行memcheck检查 └── trace.json # 算子所有核的流水图 ``` 4. 更多指标信息请参考算子开发工具使用手册。 + +### 性能建模 +用户可参照mskpp_sample中的参考案例,根据实际用户算子的数学逻辑,编写算子dsl语言的Python脚本, +即可对昇腾算子进行理论性能建模,获得算子的极限性能参考 + +1. 编写算子dsl语言的Python脚本,并执行,以sample中的softmax为例 + ``` + python3 ./mskpp_sample/softmax.py + ``` +2. 执行完成后,在当前目录下生成如下生成件: + ``` + [WorkSpace] + ├── Pipe_Statistic.csv # 以Pipe维度统计cycle耗时 + └── Instruction_Statistic.csv # 以指令维度统计cycle耗时 + ├── trace.json # 理想流水图 + └── instruction_cycle_consumption.html # 当安装plotly三方库后,可生成该文件,输出各指令、Pipe的耗时比例 + ``` +3. 以softmax为例,mskpp建模数据与实际算子性能数据比较: + +| 测试用例名 | 类型 | 端到端时间 (us) | 数据量 (KB) | aiv_vec_cycles | aiv_MTE2_cycles | aiv_MTE3_cycles | +|----|----|----|----|----|----|----| +| 实测softmax | FP16 | 94.98 | 51200 | 7835157 | 1256661 | 1062680 | +| mskpp建模softmax | FP16 | 95.85 | 51200 | 5916350 | 1118750 | 1595580 | + +近期还会新增对GM相关的核外内存搬运指令地精细化建模,进一步提高GM核外搬运Pipe的理论耗时精准度,并对外展示L2Cache命中率的情况, +敬请期待~ \ No newline at end of file diff --git a/sample/mskpp_sample/fa_grad.py b/sample/mskpp_sample/fa_grad.py new file mode 100644 index 00000000000..07a06815d3f --- /dev/null +++ b/sample/mskpp_sample/fa_grad.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +from mskpp import mmad, Tensor, Chip, Core, vadd, vadds, vsub, vexp, vmul, vmuls + +seq_split_out = 64 +seq_split_in = 256 +batch_size, head_num, seq_size, head_dim = 1, 1, 2048, 127 +scale_value = 1 / 128 +keep_prob = 1.0 + + +def fa_forward_vec_part(gm_temp, gm_atten_mask_split, gm_softmax_log_max_sum_split, shape_info): + m, n = shape_info + ub_bmm1_res_loop = Tensor("UB", "FP16", [m, n], format="NZ") + + ub_bmm1_res_loop.load(gm_temp, expect_value=1) + vadds(ub_bmm1_res_loop, scale_value, ub_bmm1_res_loop)() + + ub_atten_mask_loop = Tensor("UB", "FP16", [m, n], format="NZ") # [64, 256] + + ub_atten_mask_loop.load(gm_atten_mask_split) # [64, 256] + vadds(ub_atten_mask_loop, -10000, ub_atten_mask_loop)() + vadd(ub_bmm1_res_loop, ub_atten_mask_loop, ub_bmm1_res_loop)() + ub_softmax_log_max_sum_loop = Tensor("UB", "FP16", [m, 1], format="NZ") # [64, 1] + + ub_softmax_log_max_sum_loop.load(gm_softmax_log_max_sum_split) # [64, 1] + vsub(ub_bmm1_res_loop, ub_softmax_log_max_sum_loop, ub_bmm1_res_loop)() + + ub_bmm1_res_loop_last = Tensor("UB", "FP16", [m, n], format="NZ") + # gm_temp初始化为invalid,gm_temp完成后生成一个值,会通知ub_bmm1_res_loop_last是否为等待值,如果是就设置为就绪状态 + # 这里可以将gm_temp + vexp(ub_bmm1_res_loop, ub_bmm1_res_loop_last)() + return ub_bmm1_res_loop_last + + +def softmax_out_grad_vec_part(gm_temp2, gm_softmax_out_sum_split, ub_bmm1_res_loop_last, shape_info): + """ + 这是一个对UB和VEC的行为的包裹,其输入是GM,可以看做一个小型计算核 + """ + m, n = shape_info + ub_softmax_out_drop_grad_loop = Tensor("UB", "FP16", [m, n], format="NZ") + ub_softmax_out_drop_grad_loop.load(gm_temp2, expect_value=2) # [64, 256] + ub_softmax_out_sum_loop = Tensor("UB", "FP16", [m, 1], format="NZ") # [64, 1] + ub_softmax_out_sum_loop.load(gm_softmax_out_sum_split) # [64, 1] + vsub(ub_softmax_out_drop_grad_loop, ub_softmax_out_sum_loop, ub_softmax_out_drop_grad_loop)() + vmul(ub_softmax_out_drop_grad_loop, ub_bmm1_res_loop_last, ub_softmax_out_drop_grad_loop)() + ub_softmax_out_drop_grad_loop_last = Tensor("UB", "FP16", [m, n], format="NZ") + vmuls(ub_softmax_out_drop_grad_loop, scale_value, ub_softmax_out_drop_grad_loop_last)() + gm_temp2.load(ub_softmax_out_drop_grad_loop_last, set_value=4) + return gm_temp2 + + +def batch_dot(out, a, b, shape_info, tensor_value): + """ + 这其实是一个对L0,MMAD的行为包裹,其输入是L1的内容,可以作为一个小型计算核 + """ + m, k, n = shape_info + l0a_n = Tensor("L0A", "FP16", [m, k], format="NZ") # [64, 256] + l0b_n = Tensor("L0B", "FP16", [k, n], format="NZ") # [256, 128] + l0c_n = Tensor("L0C", "FP16", [m, n], format="NZ") # [64, 128] + l0a_n.load(a) + l0b_n.load(b) + mmad_out = mmad(l0a_n, l0b_n, l0c_n, True)() + out.load(mmad_out[0], set_value=tensor_value) + # 是在这一次的调度时,需要标记一个值 + # 某块内存在被某次调度时,设置特定的值 + # out调度时,将任务id传入,标识是这一次的任务执行 + # out自己也记录这一次的任务ID和需要写入的值 + # 这里out计算完成后,需要将自己的值设置为一个数(不需要给所有在等待的tensor发消息,因为通过调度系统确保刚好调度到依赖项目) + return out + + +def flash_attention_grad(gm_q, gm_k, gm_v, gm_atten_mask, gm_attention_out_grad, gm_softmax_out_sum, gm_softmax_log_max_sum, gm_q_grad, gm_k_grad, gm_v_grad): + seq_out_loop_times = seq_size // seq_split_out # 2048 // 64 = 32 + seq_in_loop_times = seq_size // seq_split_in # 2048 // 256 = 8 + + for batch_index in range(batch_size): + for head_index in range(head_num): + for seq_out_index in range(seq_out_loop_times): + seq_out_start_index = seq_out_index * seq_split_out + seq_out_end_index = seq_out_start_index + seq_split_out + for seq_in_index in range(seq_in_loop_times): + seq_in_start_index = seq_in_index * seq_split_in + seq_in_end_index = seq_in_start_index + seq_split_in + 
"""对一次处理的内存切块,满足L1或者UB的要求,此处不属于Core的范围,所以处理GM的内存。""" + gm_q_split = gm_q[batch_index, head_index, seq_out_start_index: seq_out_end_index, :] + gm_attention_out_grad_split = gm_attention_out_grad[batch_index, head_index, + seq_out_start_index:seq_out_end_index, :] + gm_k_split = gm_k[batch_index, head_index, seq_in_start_index:seq_in_end_index, :] + gm_v_split = gm_v[batch_index, head_index, seq_in_start_index:seq_in_end_index, :] + gm_atten_mask_split = gm_atten_mask[seq_out_start_index: seq_out_end_index, + seq_in_start_index:seq_in_end_index] + gm_softmax_log_max_sum_split = gm_softmax_log_max_sum[batch_index, head_index, + seq_out_start_index:seq_out_end_index, :] + gm_softmax_out_sum_split = gm_softmax_out_sum[batch_index, head_index, + seq_out_start_index:seq_out_end_index, :] + + """对于需要用于核间同步的GM内存最好单独管理,在这个作用域创建,保证不同循环间的gm是不一样的,这样相互不影响""" + gm_temp = Tensor("GM", "FP16", [seq_split_out, seq_split_in], format="NZ") + gm_temp2 = Tensor("GM", "FP16", [seq_split_out, seq_split_in], format="NZ") + + with Core("AIC0") as aic: + """在这个作用域下,仅处理: + 1. L1的创建和对(CUBE,L0)联合对象函数的处理,L1的新建请尽量靠近计算本身,减少理解难度 + 2. 与AIV的同步 + """ + # task 1.1: 前向计算 AIC 部分 + l1_q_loop = Tensor("L1", "FP16", [seq_split_out, head_dim], format="NZ") # [64, 128] + l1_k_loop = Tensor("L1", "FP16", [seq_split_in, head_dim], format="NZ") # [256, 128] + l1_q_loop.load(gm_q_split) # [64, 128] + l1_k_loop.load(gm_k_split) # [256, 128] + gm_temp = batch_dot(gm_temp, l1_q_loop, l1_k_loop, (seq_split_out, head_dim, seq_split_in), 1) + + # task 2.1: softmax_out_grad计算 AIC 部分 + l1_v_loop = Tensor("L1", "FP16", [seq_split_in, head_dim], format="NZ") # [256, 128] + l1_attention_out_loop = Tensor("L1", "FP16", [seq_split_out, head_dim], format="NZ") + l1_v_loop.load(gm_v_split) # [256, 128] + l1_attention_out_loop.load(gm_attention_out_grad_split) # [64, 128] + gm_temp2 = batch_dot(gm_temp2, l1_attention_out_loop, l1_v_loop, + (seq_split_out, head_dim, seq_split_in), 2) + # 这里会第一次写入gm_temp2,写入后立即触发task2.2 + + # task 3:反向计算 AIC, 依赖task2.2 + l1_softmax_out_loop = Tensor("L1", "FP16", [seq_split_out, seq_split_in], format = "NZ") + l1_softmax_out_loop.load(gm_temp2, expect_value=4) + gm_q_grad = batch_dot(gm_q_grad, l1_attention_out_loop, l1_softmax_out_loop, + (seq_split_out, head_dim, seq_split_in), -1) + + # gm_temp.load(ub_bmm1_res_loop_last) + gm_k_grad = batch_dot(gm_k_grad, l1_q_loop, l1_softmax_out_loop, + (seq_split_out, head_dim, seq_split_in), -1) + + l1_softmax_out_loop.load(gm_temp, expect_value=3) + gm_v_grad = batch_dot(gm_v_grad, l1_softmax_out_loop, l1_k_loop, + (seq_split_out, seq_split_in, head_dim), -1) + + with Core("AIV0") as aiv: + """在这个作用域下,仅处理:与AIC的同步""" + # task 1.2: 前向计算 AIC 部分, 依赖task1.1 + ub_bmm1_res_loop_last = fa_forward_vec_part(gm_temp, gm_atten_mask_split, + gm_softmax_log_max_sum_split, + (seq_split_out, seq_split_in)) + gm_temp.load(ub_bmm1_res_loop_last, set_value=3) + + # task 2.2: softmax_out_grad计算 AIV 部分, 依赖 task2.1(重点关注) 和 task1.2(不用管) + gm_temp2 = softmax_out_grad_vec_part(gm_temp2, gm_softmax_out_sum_split, ub_bmm1_res_loop_last, + (seq_split_out, seq_split_in)) + + +if __name__ == "__main__": + """在main部分主要定义全局的输入输出 + """ + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + chip.enable_cache_detail() + # init input gm. 
+ gm_q = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_k = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_v = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_atten_mask = Tensor("GM", "FP16", [seq_size, seq_size], format="NZ") + gm_attention_out_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_softmax_out_sum = Tensor("GM", "FP16", [batch_size, head_num, seq_size], format="NZ") + gm_softmax_log_max_sum = Tensor("GM", "FP16", [batch_size, head_num, seq_size], format="NZ") + + # init output gm. + gm_q_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_k_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_v_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + + flash_attention_grad(gm_q, gm_k, gm_v, gm_atten_mask, gm_attention_out_grad, gm_softmax_out_sum, + gm_softmax_log_max_sum, gm_q_grad, gm_k_grad, gm_v_grad) \ No newline at end of file diff --git a/sample/mskpp_sample/softmax.py b/sample/mskpp_sample/softmax.py new file mode 100644 index 00000000000..a9c5b36854a --- /dev/null +++ b/sample/mskpp_sample/softmax.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +from mskpp import * + + +def softmax_func(gm_input): + ub_input = Tensor("UB", dtype="FP16") + ub_temp = Tensor("UB", dtype="FP16") + ub_cast = Tensor("UB", dtype="FP32") + ub_input.load(gm_input) + vmax_output = vmax(ub_input, ub_input, ub_temp)() + vsub_output = vsub(ub_input, vmax_output[0], ub_temp)() + vexp_output = vexp(vsub_output[0], ub_temp)() + vconv_output = vconv(vexp_output[0], ub_cast, "FP32")() + vadd_output = vadd(vconv_output[0], vconv_output[0], ub_cast)() + vdiv_output = vdiv(vconv_output[0],vadd_output[0], ub_cast)() + softmax_output = Tensor("UB") + softmax_output = vdiv(vdiv_output[0], vadd_output[0], softmax_output)() + return softmax_output[0] + + +def ascend910b1_test(input_shape): + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + with Core("AIV0") as aiv0: + gm_input = Tensor("GM", dtype="FP16", size=input_shape, format="ND") + gm_output = Tensor("GM", dtype="FP32", size=input_shape, format="ND") + ub_output = softmax_func(gm_input) + gm_output.load(ub_output) + with Core("AIV1") as aiv1: + gm_input_1 = Tensor("GM", dtype="FP16", size=input_shape, format="ND") + gm_output_1 = Tensor("GM", dtype="FP32", size=input_shape, format="ND") + ub_output_1 = softmax_func(gm_input_1) + gm_output_1.load(ub_output_1) + + +if __name__ == "__main__": + ascend910b1_test([1, 64, 64, 128]) \ No newline at end of file -- Gitee From 1c053303ddd806c9dccb690e582d1c278cffed4e Mon Sep 17 00:00:00 2001 From: huawei-zhangjunbo Date: Thu, 28 Mar 2024 21:15:30 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E5=A2=9E=E5=8A=A0mskpp=20sample=E6=A0=B7?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/README.md | 8 ++------ sample/mskpp_sample/README.md | 10 ++++++++++ sample/mskpp_sample/softmax.py | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) create mode 100644 sample/mskpp_sample/README.md diff --git a/sample/README.md b/sample/README.md index b52a7bdd94d..80d29b5e2ac 100644 --- a/sample/README.md +++ b/sample/README.md @@ -140,7 +140,8 @@ mssanitizer ./*.fatbin # 默认进行memcheck检查 1. 编写算子dsl语言的Python脚本,并执行,以sample中的softmax为例 ``` - python3 ./mskpp_sample/softmax.py + cd ./sample/mskpp_sample/ + python3 softmax.py ``` 2. 执行完成后,在当前目录下生成如下生成件: ``` @@ -150,12 +151,7 @@ mssanitizer ./*.fatbin # 默认进行memcheck检查 ├── trace.json # 理想流水图 └── instruction_cycle_consumption.html # 当安装plotly三方库后,可生成该文件,输出各指令、Pipe的耗时比例 ``` -3. 
以softmax为例,mskpp建模数据与实际算子性能数据比较: -| 测试用例名 | 类型 | 端到端时间 (us) | 数据量 (KB) | aiv_vec_cycles | aiv_MTE2_cycles | aiv_MTE3_cycles | -|----|----|----|----|----|----|----| -| 实测softmax | FP16 | 94.98 | 51200 | 7835157 | 1256661 | 1062680 | -| mskpp建模softmax | FP16 | 95.85 | 51200 | 5916350 | 1118750 | 1595580 | 近期还会新增对GM相关的核外内存搬运指令地精细化建模,进一步提高GM核外搬运Pipe的理论耗时精准度,并对外展示L2Cache命中率的情况, 敬请期待~ \ No newline at end of file diff --git a/sample/mskpp_sample/README.md b/sample/mskpp_sample/README.md new file mode 100644 index 00000000000..b53cd2c1be5 --- /dev/null +++ b/sample/mskpp_sample/README.md @@ -0,0 +1,10 @@ +# mskpp结果说明补充 + +## softmax案例与实测结果对比 + +以softmax为例,mskpp建模数据与实际算子性能数据比较: + +| 测试用例名 | 类型 | 端到端时间 (us) | 数据量 (KB) | aiv_vec_cycles | aiv_MTE2_cycles | aiv_MTE3_cycles | +|----|----|------------|----|-----------------|----|----| +| 实测softmax | FP16 | 94.98 | 51200 | 7835157 | 1256661 | 1062680 | +| mskpp建模softmax | FP16 | 77.68 | 51200 | 4277000 | 1118750 | 1595580 | \ No newline at end of file diff --git a/sample/mskpp_sample/softmax.py b/sample/mskpp_sample/softmax.py index a9c5b36854a..32df4acb88d 100644 --- a/sample/mskpp_sample/softmax.py +++ b/sample/mskpp_sample/softmax.py @@ -18,22 +18,25 @@ from mskpp import * def softmax_func(gm_input): + # 创建输入输出的临时变量 ub_input = Tensor("UB", dtype="FP16") ub_temp = Tensor("UB", dtype="FP16") ub_cast = Tensor("UB", dtype="FP32") ub_input.load(gm_input) + # softmax numpy 实现方式:softmax = np.exp(x) / np.sum(np.exp(x), axis=0) + # 下面是以mskpp语言实现的softmax算子伪代码 vmax_output = vmax(ub_input, ub_input, ub_temp)() vsub_output = vsub(ub_input, vmax_output[0], ub_temp)() vexp_output = vexp(vsub_output[0], ub_temp)() + # 昇腾softmax算子中,在exp指令计算之前,是以fp16计算,后转成fp32计算,因此除公式所需步骤外,插入vconv的指令 vconv_output = vconv(vexp_output[0], ub_cast, "FP32")() vadd_output = vadd(vconv_output[0], vconv_output[0], ub_cast)() vdiv_output = vdiv(vconv_output[0],vadd_output[0], ub_cast)() - softmax_output = Tensor("UB") - softmax_output = vdiv(vdiv_output[0], vadd_output[0], softmax_output)() - return softmax_output[0] + return vdiv_output[0] def ascend910b1_test(input_shape): + # Ascend910B1分两个vector核,通过with语句将其分开,并分别定义全局的输入输出 with Chip("Ascend910B1") as chip: chip.enable_trace() chip.enable_metrics() -- Gitee From 22307aba3e39316f9792c63525898cff9e34514e Mon Sep 17 00:00:00 2001 From: huawei-zhangjunbo Date: Thu, 28 Mar 2024 22:29:41 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E5=A2=9E=E5=8A=A0mskpp=20sample=E6=A0=B7?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/README.md | 6 +- sample/mskpp_sample/README.md | 15 +- sample/mskpp_sample/cube_only/matmul.cpp.py | 64 ++++++ sample/mskpp_sample/fa_grad.py | 186 ------------------ .../mix/matmul_leakyrelu_kernel.py | 74 +++++++ sample/mskpp_sample/softmax.py | 56 ------ sample/mskpp_sample/vec_only/add_kernel.py | 51 +++++ 7 files changed, 201 insertions(+), 251 deletions(-) create mode 100644 sample/mskpp_sample/cube_only/matmul.cpp.py delete mode 100644 sample/mskpp_sample/fa_grad.py create mode 100644 sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py delete mode 100644 sample/mskpp_sample/softmax.py create mode 100644 sample/mskpp_sample/vec_only/add_kernel.py diff --git a/sample/README.md b/sample/README.md index 80d29b5e2ac..d20f261325f 100644 --- a/sample/README.md +++ b/sample/README.md @@ -135,15 +135,15 @@ mssanitizer ./*.fatbin # 默认进行memcheck检查 4. 
更多指标信息请参考算子开发工具使用手册。 ### 性能建模 -用户可参照mskpp_sample中的参考案例,根据实际用户算子的数学逻辑,编写算子dsl语言的Python脚本, +用户可参照sample/mskpp_sample中的参考案例,根据实际用户算子的数学逻辑,编写算子dsl语言的Python脚本, 即可对昇腾算子进行理论性能建模,获得算子的极限性能参考 -1. 编写算子dsl语言的Python脚本,并执行,以sample中的softmax为例 +1. 编写算子dsl语言的Python脚本,并执行,以sample/mskpp_sample中的vec_softmax为例 ``` cd ./sample/mskpp_sample/ python3 softmax.py ``` -2. 执行完成后,在当前目录下生成如下生成件: +2. 执行完成后,在sample/mskpp_sample目录下生成如下生成件: ``` [WorkSpace] ├── Pipe_Statistic.csv # 以Pipe维度统计cycle耗时 diff --git a/sample/mskpp_sample/README.md b/sample/mskpp_sample/README.md index b53cd2c1be5..ab4a8e3e3b4 100644 --- a/sample/mskpp_sample/README.md +++ b/sample/mskpp_sample/README.md @@ -1,10 +1,13 @@ # mskpp结果说明补充 -## softmax案例与实测结果对比 +## vec案例与实测结果对比 -以softmax为例,mskpp建模数据与实际算子性能数据比较: +以add_kernel.cpp为例,mskpp建模数据与实际算子性能数据比较: -| 测试用例名 | 类型 | 端到端时间 (us) | 数据量 (KB) | aiv_vec_cycles | aiv_MTE2_cycles | aiv_MTE3_cycles | -|----|----|------------|----|-----------------|----|----| -| 实测softmax | FP16 | 94.98 | 51200 | 7835157 | 1256661 | 1062680 | -| mskpp建模softmax | FP16 | 77.68 | 51200 | 4277000 | 1118750 | 1595580 | \ No newline at end of file +其中实测数据来源于sample/normal_sample/vec_only的上板调优结果 + + +| 测试用例名 | 类型 | 端到端时间 (us) | shape | +|-------------------|----|------------|-------------| +| 实测add_kernel | FP16 | 5.9 | 8,2048 | +| mskpp建模add_kernel | FP16 | 3.5424 | 8,2048 | \ No newline at end of file diff --git a/sample/mskpp_sample/cube_only/matmul.cpp.py b/sample/mskpp_sample/cube_only/matmul.cpp.py new file mode 100644 index 00000000000..ffc5b980ae8 --- /dev/null +++ b/sample/mskpp_sample/cube_only/matmul.cpp.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""
+from mskpp import mmad, Tensor, Chip
+
+
+def my_mmad(gm_x, gm_y, gm_z):
+    # 矩阵乘的基本数据通路
+    # 左矩阵A:GM-L1-L0A
+    # 右矩阵B:GM-L1-L0B
+    # 结果矩阵C:L0C(初始化)-GM
+
+    # 定义和分配L1上的变量
+    l1_x = Tensor("L1")
+    l1_y = Tensor("L1")
+
+    # 定义和分配L0A和L0B上的变量
+    x = Tensor("L0A")
+    y = Tensor("L0B")
+
+    # 定义和分配在L0C上的运算结果变量
+    z = Tensor("L0C", "FP32", [32, 16], format="NC1HWC0")
+
+    # 将GM上的数据移动到L1对应内存空间上
+    l1_x.load(gm_x)
+    l1_y.load(gm_y)
+
+    # 将L1上的左右矩阵移动到L0A和L0B上
+    x.load(l1_x)
+    y.load(l1_y)
+
+    # 当前数据已加载到L0A和L0B上,调用指令进行计算,结果保存在L0C上
+    out = mmad(x, y, z, True)()
+
+    # 将L0C上的数据移动到GM变量gm_z的地址空间上
+    gm_z.load(out[0])
+    return z
+
+
+if __name__ == '__main__':
+    with Chip("Ascend910B1") as chip:
+        chip.enable_trace()
+        chip.enable_metrics()
+
+        # 模拟一个大矩阵被切分成5个小矩阵进行计算
+        for _ in range(5):
+            # 应用算子进行AICORE计算
+            in_x = Tensor("GM", "FP16", [32, 48], format="ND")
+            in_y = Tensor("GM", "FP16", [48, 16], format="ND")
+            in_z = Tensor("GM", "FP32", [32, 16], format="NC1HWC0")
+            my_mmad(in_x, in_y, in_z)
diff --git a/sample/mskpp_sample/fa_grad.py b/sample/mskpp_sample/fa_grad.py
deleted file mode 100644
index 07a06815d3f..00000000000
--- a/sample/mskpp_sample/fa_grad.py
+++ /dev/null
@@ -1,186 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. 
-""" -from mskpp import mmad, Tensor, Chip, Core, vadd, vadds, vsub, vexp, vmul, vmuls - -seq_split_out = 64 -seq_split_in = 256 -batch_size, head_num, seq_size, head_dim = 1, 1, 2048, 127 -scale_value = 1 / 128 -keep_prob = 1.0 - - -def fa_forward_vec_part(gm_temp, gm_atten_mask_split, gm_softmax_log_max_sum_split, shape_info): - m, n = shape_info - ub_bmm1_res_loop = Tensor("UB", "FP16", [m, n], format="NZ") - - ub_bmm1_res_loop.load(gm_temp, expect_value=1) - vadds(ub_bmm1_res_loop, scale_value, ub_bmm1_res_loop)() - - ub_atten_mask_loop = Tensor("UB", "FP16", [m, n], format="NZ") # [64, 256] - - ub_atten_mask_loop.load(gm_atten_mask_split) # [64, 256] - vadds(ub_atten_mask_loop, -10000, ub_atten_mask_loop)() - vadd(ub_bmm1_res_loop, ub_atten_mask_loop, ub_bmm1_res_loop)() - ub_softmax_log_max_sum_loop = Tensor("UB", "FP16", [m, 1], format="NZ") # [64, 1] - - ub_softmax_log_max_sum_loop.load(gm_softmax_log_max_sum_split) # [64, 1] - vsub(ub_bmm1_res_loop, ub_softmax_log_max_sum_loop, ub_bmm1_res_loop)() - - ub_bmm1_res_loop_last = Tensor("UB", "FP16", [m, n], format="NZ") - # gm_temp初始化为invalid,gm_temp完成后生成一个值,会通知ub_bmm1_res_loop_last是否为等待值,如果是就设置为就绪状态 - # 这里可以将gm_temp - vexp(ub_bmm1_res_loop, ub_bmm1_res_loop_last)() - return ub_bmm1_res_loop_last - - -def softmax_out_grad_vec_part(gm_temp2, gm_softmax_out_sum_split, ub_bmm1_res_loop_last, shape_info): - """ - 这是一个对UB和VEC的行为的包裹,其输入是GM,可以看做一个小型计算核 - """ - m, n = shape_info - ub_softmax_out_drop_grad_loop = Tensor("UB", "FP16", [m, n], format="NZ") - ub_softmax_out_drop_grad_loop.load(gm_temp2, expect_value=2) # [64, 256] - ub_softmax_out_sum_loop = Tensor("UB", "FP16", [m, 1], format="NZ") # [64, 1] - ub_softmax_out_sum_loop.load(gm_softmax_out_sum_split) # [64, 1] - vsub(ub_softmax_out_drop_grad_loop, ub_softmax_out_sum_loop, ub_softmax_out_drop_grad_loop)() - vmul(ub_softmax_out_drop_grad_loop, ub_bmm1_res_loop_last, ub_softmax_out_drop_grad_loop)() - ub_softmax_out_drop_grad_loop_last = Tensor("UB", "FP16", [m, n], format="NZ") - vmuls(ub_softmax_out_drop_grad_loop, scale_value, ub_softmax_out_drop_grad_loop_last)() - gm_temp2.load(ub_softmax_out_drop_grad_loop_last, set_value=4) - return gm_temp2 - - -def batch_dot(out, a, b, shape_info, tensor_value): - """ - 这其实是一个对L0,MMAD的行为包裹,其输入是L1的内容,可以作为一个小型计算核 - """ - m, k, n = shape_info - l0a_n = Tensor("L0A", "FP16", [m, k], format="NZ") # [64, 256] - l0b_n = Tensor("L0B", "FP16", [k, n], format="NZ") # [256, 128] - l0c_n = Tensor("L0C", "FP16", [m, n], format="NZ") # [64, 128] - l0a_n.load(a) - l0b_n.load(b) - mmad_out = mmad(l0a_n, l0b_n, l0c_n, True)() - out.load(mmad_out[0], set_value=tensor_value) - # 是在这一次的调度时,需要标记一个值 - # 某块内存在被某次调度时,设置特定的值 - # out调度时,将任务id传入,标识是这一次的任务执行 - # out自己也记录这一次的任务ID和需要写入的值 - # 这里out计算完成后,需要将自己的值设置为一个数(不需要给所有在等待的tensor发消息,因为通过调度系统确保刚好调度到依赖项目) - return out - - -def flash_attention_grad(gm_q, gm_k, gm_v, gm_atten_mask, gm_attention_out_grad, gm_softmax_out_sum, gm_softmax_log_max_sum, gm_q_grad, gm_k_grad, gm_v_grad): - seq_out_loop_times = seq_size // seq_split_out # 2048 // 64 = 32 - seq_in_loop_times = seq_size // seq_split_in # 2048 // 256 = 8 - - for batch_index in range(batch_size): - for head_index in range(head_num): - for seq_out_index in range(seq_out_loop_times): - seq_out_start_index = seq_out_index * seq_split_out - seq_out_end_index = seq_out_start_index + seq_split_out - for seq_in_index in range(seq_in_loop_times): - seq_in_start_index = seq_in_index * seq_split_in - seq_in_end_index = seq_in_start_index + seq_split_in - 
"""对一次处理的内存切块,满足L1或者UB的要求,此处不属于Core的范围,所以处理GM的内存。""" - gm_q_split = gm_q[batch_index, head_index, seq_out_start_index: seq_out_end_index, :] - gm_attention_out_grad_split = gm_attention_out_grad[batch_index, head_index, - seq_out_start_index:seq_out_end_index, :] - gm_k_split = gm_k[batch_index, head_index, seq_in_start_index:seq_in_end_index, :] - gm_v_split = gm_v[batch_index, head_index, seq_in_start_index:seq_in_end_index, :] - gm_atten_mask_split = gm_atten_mask[seq_out_start_index: seq_out_end_index, - seq_in_start_index:seq_in_end_index] - gm_softmax_log_max_sum_split = gm_softmax_log_max_sum[batch_index, head_index, - seq_out_start_index:seq_out_end_index, :] - gm_softmax_out_sum_split = gm_softmax_out_sum[batch_index, head_index, - seq_out_start_index:seq_out_end_index, :] - - """对于需要用于核间同步的GM内存最好单独管理,在这个作用域创建,保证不同循环间的gm是不一样的,这样相互不影响""" - gm_temp = Tensor("GM", "FP16", [seq_split_out, seq_split_in], format="NZ") - gm_temp2 = Tensor("GM", "FP16", [seq_split_out, seq_split_in], format="NZ") - - with Core("AIC0") as aic: - """在这个作用域下,仅处理: - 1. L1的创建和对(CUBE,L0)联合对象函数的处理,L1的新建请尽量靠近计算本身,减少理解难度 - 2. 与AIV的同步 - """ - # task 1.1: 前向计算 AIC 部分 - l1_q_loop = Tensor("L1", "FP16", [seq_split_out, head_dim], format="NZ") # [64, 128] - l1_k_loop = Tensor("L1", "FP16", [seq_split_in, head_dim], format="NZ") # [256, 128] - l1_q_loop.load(gm_q_split) # [64, 128] - l1_k_loop.load(gm_k_split) # [256, 128] - gm_temp = batch_dot(gm_temp, l1_q_loop, l1_k_loop, (seq_split_out, head_dim, seq_split_in), 1) - - # task 2.1: softmax_out_grad计算 AIC 部分 - l1_v_loop = Tensor("L1", "FP16", [seq_split_in, head_dim], format="NZ") # [256, 128] - l1_attention_out_loop = Tensor("L1", "FP16", [seq_split_out, head_dim], format="NZ") - l1_v_loop.load(gm_v_split) # [256, 128] - l1_attention_out_loop.load(gm_attention_out_grad_split) # [64, 128] - gm_temp2 = batch_dot(gm_temp2, l1_attention_out_loop, l1_v_loop, - (seq_split_out, head_dim, seq_split_in), 2) - # 这里会第一次写入gm_temp2,写入后立即触发task2.2 - - # task 3:反向计算 AIC, 依赖task2.2 - l1_softmax_out_loop = Tensor("L1", "FP16", [seq_split_out, seq_split_in], format = "NZ") - l1_softmax_out_loop.load(gm_temp2, expect_value=4) - gm_q_grad = batch_dot(gm_q_grad, l1_attention_out_loop, l1_softmax_out_loop, - (seq_split_out, head_dim, seq_split_in), -1) - - # gm_temp.load(ub_bmm1_res_loop_last) - gm_k_grad = batch_dot(gm_k_grad, l1_q_loop, l1_softmax_out_loop, - (seq_split_out, head_dim, seq_split_in), -1) - - l1_softmax_out_loop.load(gm_temp, expect_value=3) - gm_v_grad = batch_dot(gm_v_grad, l1_softmax_out_loop, l1_k_loop, - (seq_split_out, seq_split_in, head_dim), -1) - - with Core("AIV0") as aiv: - """在这个作用域下,仅处理:与AIC的同步""" - # task 1.2: 前向计算 AIC 部分, 依赖task1.1 - ub_bmm1_res_loop_last = fa_forward_vec_part(gm_temp, gm_atten_mask_split, - gm_softmax_log_max_sum_split, - (seq_split_out, seq_split_in)) - gm_temp.load(ub_bmm1_res_loop_last, set_value=3) - - # task 2.2: softmax_out_grad计算 AIV 部分, 依赖 task2.1(重点关注) 和 task1.2(不用管) - gm_temp2 = softmax_out_grad_vec_part(gm_temp2, gm_softmax_out_sum_split, ub_bmm1_res_loop_last, - (seq_split_out, seq_split_in)) - - -if __name__ == "__main__": - """在main部分主要定义全局的输入输出 - """ - with Chip("Ascend910B1") as chip: - chip.enable_trace() - chip.enable_metrics() - chip.enable_cache_detail() - # init input gm. 
- gm_q = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_k = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_v = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_atten_mask = Tensor("GM", "FP16", [seq_size, seq_size], format="NZ") - gm_attention_out_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_softmax_out_sum = Tensor("GM", "FP16", [batch_size, head_num, seq_size], format="NZ") - gm_softmax_log_max_sum = Tensor("GM", "FP16", [batch_size, head_num, seq_size], format="NZ") - - # init output gm. - gm_q_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_k_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_v_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - - flash_attention_grad(gm_q, gm_k, gm_v, gm_atten_mask, gm_attention_out_grad, gm_softmax_out_sum, - gm_softmax_log_max_sum, gm_q_grad, gm_k_grad, gm_v_grad) \ No newline at end of file diff --git a/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py b/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py new file mode 100644 index 00000000000..7df09265df6 --- /dev/null +++ b/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""
+from mskpp import mmad, vrelu, Tensor, Chip, Core
+
+
+def my_mmad(gm_x, gm_y, gm_z):
+    # 矩阵乘的基本数据通路:
+    # 左矩阵A:GM-L1-L0A
+    # 右矩阵B:GM-L1-L0B
+    # 结果矩阵C: L0C(初始化)-GM
+
+    # 定义和分配L1上的变量
+    l1_x = Tensor("L1")
+    l1_y = Tensor("L1")
+    # 定义和分配L0A和L0B上的变量
+    x = Tensor("L0A")
+    y = Tensor("L0B")
+    # 定义和分配在L0C上的运算结果变量
+    z = Tensor("L0C", "FP32", [256, 128], format="ND")
+
+    # 将GM上的数据移动到L1对应内存空间上
+    l1_x.load(gm_x)
+    l1_y.load(gm_y)
+
+    # 将L1上的左右矩阵移动到L0A和L0B上
+    x.load(l1_x)
+    y.load(l1_y)
+
+    # 当前数据已加载到L0A和L0B上,调用指令进行计算,结果保存在L0C上
+    out = mmad(x, y, z, True)()
+    gm_z.load(out[0], set_value=1) # set_value 表示设置同步事件1
+
+
+def my_vrelu(gm_x, gm_y):
+    # 定义和分配UB上的变量
+    x = Tensor("UB")
+    y = Tensor("UB")
+
+    x.load(gm_x, expect_value=1) # expect_value 表示等待同步事件1完成后,该load才可执行
+    out = vrelu(x, y)()
+    gm_y.load(out[0])
+
+
+def Ascend910B1_Test():
+    with Chip("Ascend910B1") as chip:
+        chip.enable_trace()
+        chip.enable_metrics()
+        # 应用算子进行AICORE计算
+        in_x = Tensor("GM", "FP16", [1024, 640], format="ND")
+        in_y = Tensor("GM", "FP16", [640, 256], format="ND")
+        in_z = Tensor("GM", "FP32", [1024, 256], format="ND")
+        relu_out = Tensor("GM", "FP32", [1024, 256], format="ND")
+        with Core("AIC0") as aic0:
+            my_mmad(in_x, in_y, in_z)
+        with Core("AIV0") as aiv0:
+            my_vrelu(in_z, relu_out)
+
+
+if __name__ == "__main__":
+    Ascend910B1_Test()
diff --git a/sample/mskpp_sample/softmax.py b/sample/mskpp_sample/softmax.py
deleted file mode 100644
index 32df4acb88d..00000000000
--- a/sample/mskpp_sample/softmax.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. 
-""" -from mskpp import * - - -def softmax_func(gm_input): - # 创建输入输出的临时变量 - ub_input = Tensor("UB", dtype="FP16") - ub_temp = Tensor("UB", dtype="FP16") - ub_cast = Tensor("UB", dtype="FP32") - ub_input.load(gm_input) - # softmax numpy 实现方式:softmax = np.exp(x) / np.sum(np.exp(x), axis=0) - # 下面是以mskpp语言实现的softmax算子伪代码 - vmax_output = vmax(ub_input, ub_input, ub_temp)() - vsub_output = vsub(ub_input, vmax_output[0], ub_temp)() - vexp_output = vexp(vsub_output[0], ub_temp)() - # 昇腾softmax算子中,在exp指令计算之前,是以fp16计算,后转成fp32计算,因此除公式所需步骤外,插入vconv的指令 - vconv_output = vconv(vexp_output[0], ub_cast, "FP32")() - vadd_output = vadd(vconv_output[0], vconv_output[0], ub_cast)() - vdiv_output = vdiv(vconv_output[0],vadd_output[0], ub_cast)() - return vdiv_output[0] - - -def ascend910b1_test(input_shape): - # Ascend910B1分两个vector核,通过with语句将其分开,并分别定义全局的输入输出 - with Chip("Ascend910B1") as chip: - chip.enable_trace() - chip.enable_metrics() - with Core("AIV0") as aiv0: - gm_input = Tensor("GM", dtype="FP16", size=input_shape, format="ND") - gm_output = Tensor("GM", dtype="FP32", size=input_shape, format="ND") - ub_output = softmax_func(gm_input) - gm_output.load(ub_output) - with Core("AIV1") as aiv1: - gm_input_1 = Tensor("GM", dtype="FP16", size=input_shape, format="ND") - gm_output_1 = Tensor("GM", dtype="FP32", size=input_shape, format="ND") - ub_output_1 = softmax_func(gm_input_1) - gm_output_1.load(ub_output_1) - - -if __name__ == "__main__": - ascend910b1_test([1, 64, 64, 128]) \ No newline at end of file diff --git a/sample/mskpp_sample/vec_only/add_kernel.py b/sample/mskpp_sample/vec_only/add_kernel.py new file mode 100644 index 00000000000..1941d10a21c --- /dev/null +++ b/sample/mskpp_sample/vec_only/add_kernel.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""
+from mskpp import vadd, Tensor, Chip
+
+
+def my_vadd(gm_x, gm_y, gm_z):
+    # 向量Add的基本数据通路
+    # 被加数x:GM-UB
+    # 加数y:GM-UB
+    # 结果向量z:UB-GM
+
+    # 定义和分配UB上的变量
+    x = Tensor("UB")
+    y = Tensor("UB")
+    z = Tensor("UB")
+
+    # 将GM上的数据移动到UB对应内存空间上
+    x.load(gm_x)
+    y.load(gm_y)
+
+    # 当前数据已加载到UB上,调用指令进行计算,结果保存在UB上
+    out = vadd(x, y, z)()
+
+    # 将UB上的数据移动到GM变量gm_z的地址空间上
+    gm_z.load(out[0])
+
+
+if __name__ == '__main__':
+    with Chip("Ascend910B1") as chip:
+        chip.enable_trace()
+        chip.enable_metrics()
+
+        # 应用算子进行AICORE计算(向量加法要求两个输入与输出shape一致)
+        in_x = Tensor("GM", "FP16", [32, 48], format="ND")
+        in_y = Tensor("GM", "FP16", [32, 48], format="ND")
+        in_z = Tensor("GM", "FP16", [32, 48], format="ND")
+        my_vadd(in_x, in_y, in_z)
-- 
Gitee


From 3d3826e2038ad267a99a87dec8e85dadf5d29c40 Mon Sep 17 00:00:00 2001
From: huawei-zhangjunbo
Date: Fri, 29 Mar 2024 10:00:27 +0800
Subject: [PATCH 4/4] =?UTF-8?q?=E5=A2=9E=E5=8A=A0mskpp=20sample=E6=A0=B7?=
 =?UTF-8?q?=E4=BE=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sample/mskpp_sample/README.md | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/sample/mskpp_sample/README.md b/sample/mskpp_sample/README.md
index ab4a8e3e3b4..34dfb15775f 100644
--- a/sample/mskpp_sample/README.md
+++ b/sample/mskpp_sample/README.md
@@ -2,12 +2,36 @@
 
 ## vec案例与实测结果对比
 
-以add_kernel.cpp为例,mskpp建模数据与实际算子性能数据比较:
+以add_kernel.py为例,mskpp建模数据与实际算子性能数据比较:
 
 其中实测数据来源于sample/normal_sample/vec_only的上板调优结果
 
 
-| 测试用例名 | 类型 | 端到端时间 (us) | shape |
-|-------------------|----|------------|-------------|
-| 实测add_kernel | FP16 | 5.9 | 8,2048 |
-| mskpp建模add_kernel | FP16 | 3.5424 | 8,2048 |
\ No newline at end of file
+| 测试用例名 | 类型 | 端到端时间 (us) | shape |
+|-------------------|----|------------|----------|
+| 实测add_kernel | FP16 | 5.9 | [8,2048] |
+| mskpp建模add_kernel | FP16 | 3.5424 | [8,2048] |
+
+## mix案例与实测结果对比
+
+以matmul_leakyrelu_kernel.py为例,mskpp建模数据与实际算子性能数据比较:
+
+其中实测数据来源于sample/normal_sample/mix的上板调优结果
+
+
+| 测试用例名 | 类型 | 端到端时间 (us) | shape |
+|-------------------|----|------------|----------------|
+| 实测matmul_leakyrelu_kernel | FP16 | 287.42 | [1024,640,256] |
+| mskpp建模matmul_leakyrelu_kernel | FP16 | 60.1356 | [1024,640,256] |
+
+## cube案例与实测结果对比
+
+以cube_only中的matmul.cpp.py为例,mskpp建模数据与实际算子性能数据比较:
+
+其中实测数据来源于sample/normal_sample/cube_only的上板调优结果
+
+
+| 测试用例名 | 类型 | 端到端时间 (us) | shape |
+|-------------------|----|------------|----------------|
+| 实测matmul_kernel | FP16 | 9.86 | [512,1024,512] |
+| mskpp建模matmul_kernel | FP16 | 5.9089 | [512,1024,512] |
\ No newline at end of file
-- 
Gitee