diff --git a/sample/README.md b/sample/README.md
index 6bd55a2f83422b2f0c8424c9687a38f1698aa6fb..d20f261325fa2bc8023dc08bf0e25d9d2a19fee1 100644
--- a/sample/README.md
+++ b/sample/README.md
@@ -133,3 +133,25 @@ mssanitizer ./*.fatbin # memcheck is run by default
 └── trace.json # pipeline trace of all cores of the operator
 ```
 4. For more metric details, see the operator development tool user guide.
+
+### Performance Modeling
+Following the reference cases in sample/mskpp_sample, you can write a Python script in the operator DSL
+that mirrors the math logic of your own operator, build a theoretical performance model of the Ascend
+operator, and obtain a reference for its performance upper bound.
+
+1. Write the operator DSL Python script and run it, taking vec_softmax in sample/mskpp_sample as an example:
+   ```
+   cd ./sample/mskpp_sample/
+   python3 softmax.py
+   ```
+2. After the run completes, the following artifacts are generated in the sample/mskpp_sample directory:
+   ```
+   [WorkSpace]
+   ├── Pipe_Statistic.csv                 # cycle counts aggregated per pipe
+   ├── Instruction_Statistic.csv          # cycle counts aggregated per instruction
+   ├── trace.json                         # ideal pipeline trace
+   └── instruction_cycle_consumption.html # generated when the plotly package is installed; shows the share of cycles per instruction and per pipe
+   ```
+
+Fine-grained modeling of the GM-related off-core memory-move instructions will be added soon, further
+improving the accuracy of the theoretical time of the GM off-core move pipe and exposing the L2 cache hit rate. Stay tuned!
\ No newline at end of file
diff --git a/sample/mskpp_sample/README.md b/sample/mskpp_sample/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..34dfb15775fa19417bbcdc9b0d9a3dc06d7fb806
--- /dev/null
+++ b/sample/mskpp_sample/README.md
@@ -0,0 +1,37 @@
+# Supplementary notes on mskpp results
+
+## Vec case vs. measured results
+
+Taking add_kernel.py as an example, mskpp modeling data compared with measured operator performance.
+
+The measured data comes from on-board tuning results of sample/normal_sample/vec_only.
+
+| Test case                 | Dtype | End-to-end time (us) | Shape    |
+|---------------------------|-------|----------------------|----------|
+| add_kernel (measured)     | FP16  | 5.9                  | [8,2048] |
+| add_kernel (mskpp model)  | FP16  | 3.5424               | [8,2048] |
+
+## Mix case vs. measured results
+
+Taking matmul_leakyrelu_kernel.py as an example, mskpp modeling data compared with measured operator performance.
+
+The measured data comes from on-board tuning results of sample/normal_sample/mix.
+
+| Test case                              | Dtype | End-to-end time (us) | Shape          |
+|----------------------------------------|-------|----------------------|----------------|
+| matmul_leakyrelu_kernel (measured)     | FP16  | 287.42               | [1024,640,256] |
+| matmul_leakyrelu_kernel (mskpp model)  | FP16  | 60.1356              | [1024,640,256] |
+
+## Cube case vs. measured results
+
+Taking cube_only/matmul.cpp.py as an example, mskpp modeling data compared with measured operator performance.
+
+The measured data comes from on-board tuning results of sample/normal_sample/cube_only.
+
+| Test case                    | Dtype | End-to-end time (us) | Shape          |
+|------------------------------|-------|----------------------|----------------|
+| matmul_kernel (measured)     | FP16  | 9.86                 | [512,1024,512] |
+| matmul_kernel (mskpp model)  | FP16  | 5.9089               | [512,1024,512] |
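+
+## Minimal modeling script sketch
+
+For quick orientation, the sketch below shows the structure the samples in this directory share,
+mirroring vec_only/add_kernel.py; the [8,2048] shape is chosen to match the vec comparison table
+above. It is a sketch of the DSL usage seen in these samples, not a separately validated sample.
+
+```python
+from mskpp import vadd, Tensor, Chip
+
+with Chip("Ascend910B1") as chip:
+    chip.enable_trace()    # emit trace.json (ideal pipeline trace)
+    chip.enable_metrics()  # emit the *_Statistic.csv cycle reports
+
+    # Operands live in GM and are staged through UB for the vector pipe.
+    gm_x = Tensor("GM", "FP16", [8, 2048], format="ND")
+    gm_y = Tensor("GM", "FP16", [8, 2048], format="ND")
+    gm_z = Tensor("GM", "FP16", [8, 2048], format="ND")
+    x, y, z = Tensor("UB"), Tensor("UB"), Tensor("UB")
+    x.load(gm_x)           # GM -> UB
+    y.load(gm_y)           # GM -> UB
+    out = vadd(x, y, z)()  # issue the vector add; the result lands on UB
+    gm_z.load(out[0])      # UB -> GM
+```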
+""" +from mskpp import mamd, Tensor, Chip + + +def my_mmad(gm_x, gm_y, gm_z): + # 矩阵乘的基本数据通路 + # 左矩阵A:GM-L1-L0A + # 右矩阵A:GM-L1-L0B + # 结果矩阵C:L0C(初始化)-GM + + # 定义和分配L1上的变量 + l1_x = Tensor("L1") + l1_y = Tensor("L1") + + # 定义和分配L0A和L0B上的变量 + x = Tensor("L0A") + y = Tensor("L0B") + + # 定义和分配在L0C上的运算结果变量 + z = Tensor("L0C", "FP32", [32, 16], format="NC1HWC0") + + # 将GM上的数据移动到L1对应内存空间上 + l1_x.load(gm_x) + l1_y.load(gm_y) + + # 将L1上的左右矩阵移动到L0A和L0B上 + x.load(l1_x) + y.load(l1_y) + + # 当前数据已加载到L0A和L0B上,调用指令进行计算,结果保存在L0C上 + out = mmad(x, y, z, True)() + + # 将L0C上的数据移动到GM变量gm_z的地址空间上 + gm_z.load(out[0]) + return z + + +if __name__ == '__main__': + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + + # 模拟一个大矩阵被切分成5个小矩阵进行计算 + for _ in range(5): + # 应用算子进行AICORE计算 + in_x = Tensor("GM", "FP16", [32, 48], format="ND") + in_y = Tensor("GM", "FP16", [48, 16], format="ND") + in_z = Tensor("GM", "FP32", [32, 16], format="NC1HWC0") + my_mmad(in_x, in_y, in_z) diff --git a/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py b/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..7df09265df6af5ea34b89b2b37b220011292a6fe --- /dev/null +++ b/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +from mskpp import mamd, Tensor, Chip + + +def my_mmad(gm_x, gm_y, gm_z): + # 矩阵乘的基本数据通路: + # 左矩阵A:GM-L1-L0A + # 右矩阵B:GM-L1-L0B + # 结果矩阵C: L0C(初始化)-GM + + # 定义和分配L1上的变量 + l1_x = Tensor("L1") + l1_y = Tensor("L1") + # 定义和分配L0A和L0B上的变量 + x = Tensor("L0A") + y = Tensor("L0B") + # 定义和分配在L0C上的运算结果变量 + z = Tensor("L0C", "FP32", [256, 128], format="ND") + + # 将GM上的数据移动到L1对应内存空间上 + l1_x.load(gm_x) + l1_y.load(gm_y) + + # 将L1上的左右矩阵移动到L0A和L0B上 + x.load(l1_x) + y.load(l1_y) + + # 当前数据已加载到L0A和L0B上,调用指令进行计算,结果保存在L0C上 + out = mmad(x, y, z, True)() + gm_z.load(out[0], set_value=1) # set_value 表示设置同步事件1 + + +def my_vrelu(gm_x, gm_y): + # 定义和分配UB上的变量 + x = Tensor("UB") + y = Tensor("UB") + + x.load(gm_x, expect_value=1) # expect_value 表示等待同步事件1完成后,该load才可执行 + out = vrelu(x, y)() + gm_y.load(out[0]) + + +def Ascend910B1_Test(): + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + # 应用算子进行AICORE计算 + in_x = Tensor("GM", "FP16", [1024, 640], format="ND") + in_y = Tensor("GM", "FP16", [640, 256], format="ND") + in_z = Tensor("GM", "FP32", [1024, 256], format="ND") + relu_out = Tensor("GM", "FP32", [1024, 256], format="ND") + with Core("AIC0") as aiv0: + my_mmad(in_x, in_y, in_z) + with Core("AIV0") as aiv1: + my_vrelu(in_z, relu_out) + + +if __name__ == "__main__": + Ascend910B1_Test() diff --git a/sample/mskpp_sample/vec_only/add_kernel.py b/sample/mskpp_sample/vec_only/add_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..1941d10a21cb8256fe21bc8af57a0ccba1920b6d --- /dev/null +++ b/sample/mskpp_sample/vec_only/add_kernel.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +from mskpp import mamd, Tensor, Chip + + +def my_vadd(gm_x, gm_y, gm_z): + # 向量Add的基本数据通路 + # 被加数x:GM-UB + # 加数y:GM-UB + # 结果向量z:UB-GM + + # 定义和分配UB上的变量 + x = Tensor("UB") + y = Tensor("UB") + z = Tensor("UB") + + # 将GM上的数据移动到UB对应内存空间上 + x.load(gm_x) + y.load(gm_y) + + # 当前数据已加载到UB上,调用指令进行计算,结果保存在UB上 + out = vadd(x, y, z)() + + # 将UB上的数据移动到GM变量gm_z的地址空间上 + gm_z.load(out[0]) + + +if __name__ == '__main__': + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + + # 应用算子进行AICORE计算 + in_x = Tensor("GM", "FP16", [32, 48], format="ND") + in_y = Tensor("GM", "FP16", [48, 16], format="ND") + in_z = Tensor("GM", "FP16", [32, 48], format="ND") + my_vadd(in_x, in_y, in_z)