From 8465216551caf85dd04171142db5e6fd65a6564d Mon Sep 17 00:00:00 2001 From: huawei-zhangjunbo Date: Thu, 28 Mar 2024 19:03:21 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E5=A2=9E=E5=8A=A0mskpp=20sample=E6=A0=B7?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/README.md | 26 +++++ sample/mskpp_sample/fa_grad.py | 186 +++++++++++++++++++++++++++++++++ sample/mskpp_sample/softmax.py | 53 ++++++++++ 3 files changed, 265 insertions(+) create mode 100644 sample/mskpp_sample/fa_grad.py create mode 100644 sample/mskpp_sample/softmax.py diff --git a/sample/README.md b/sample/README.md index 6bd55a2f834..b52a7bdd94d 100644 --- a/sample/README.md +++ b/sample/README.md @@ -133,3 +133,29 @@ mssanitizer ./*.fatbin # 默认进行memcheck检查 └── trace.json # 算子所有核的流水图 ``` 4. 更多指标信息请参考算子开发工具使用手册。 + +### 性能建模 +用户可参照mskpp_sample中的参考案例,根据实际用户算子的数学逻辑,编写算子dsl语言的Python脚本, +即可对昇腾算子进行理论性能建模,获得算子的极限性能参考 + +1. 编写算子dsl语言的Python脚本,并执行,以sample中的softmax为例 + ``` + python3 ./mskpp_sample/softmax.py + ``` +2. 执行完成后,在当前目录下生成如下生成件: + ``` + [WorkSpace] + ├── Pipe_Statistic.csv # 以Pipe维度统计cycle耗时 + └── Instruction_Statistic.csv # 以指令维度统计cycle耗时 + ├── trace.json # 理想流水图 + └── instruction_cycle_consumption.html # 当安装plotly三方库后,可生成该文件,输出各指令、Pipe的耗时比例 + ``` +3. 以softmax为例,mskpp建模数据与实际算子性能数据比较: + +| 测试用例名 | 类型 | 端到端时间 (us) | 数据量 (KB) | aiv_vec_cycles | aiv_MTE2_cycles | aiv_MTE3_cycles | +|----|----|----|----|----|----|----| +| 实测softmax | FP16 | 94.98 | 51200 | 7835157 | 1256661 | 1062680 | +| mskpp建模softmax | FP16 | 95.85 | 51200 | 5916350 | 1118750 | 1595580 | + +近期还会新增对GM相关的核外内存搬运指令地精细化建模,进一步提高GM核外搬运Pipe的理论耗时精准度,并对外展示L2Cache命中率的情况, +敬请期待~ \ No newline at end of file diff --git a/sample/mskpp_sample/fa_grad.py b/sample/mskpp_sample/fa_grad.py new file mode 100644 index 00000000000..07a06815d3f --- /dev/null +++ b/sample/mskpp_sample/fa_grad.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +from mskpp import mmad, Tensor, Chip, Core, vadd, vadds, vsub, vexp, vmul, vmuls + +seq_split_out = 64 +seq_split_in = 256 +batch_size, head_num, seq_size, head_dim = 1, 1, 2048, 127 +scale_value = 1 / 128 +keep_prob = 1.0 + + +def fa_forward_vec_part(gm_temp, gm_atten_mask_split, gm_softmax_log_max_sum_split, shape_info): + m, n = shape_info + ub_bmm1_res_loop = Tensor("UB", "FP16", [m, n], format="NZ") + + ub_bmm1_res_loop.load(gm_temp, expect_value=1) + vadds(ub_bmm1_res_loop, scale_value, ub_bmm1_res_loop)() + + ub_atten_mask_loop = Tensor("UB", "FP16", [m, n], format="NZ") # [64, 256] + + ub_atten_mask_loop.load(gm_atten_mask_split) # [64, 256] + vadds(ub_atten_mask_loop, -10000, ub_atten_mask_loop)() + vadd(ub_bmm1_res_loop, ub_atten_mask_loop, ub_bmm1_res_loop)() + ub_softmax_log_max_sum_loop = Tensor("UB", "FP16", [m, 1], format="NZ") # [64, 1] + + ub_softmax_log_max_sum_loop.load(gm_softmax_log_max_sum_split) # [64, 1] + vsub(ub_bmm1_res_loop, ub_softmax_log_max_sum_loop, ub_bmm1_res_loop)() + + ub_bmm1_res_loop_last = Tensor("UB", "FP16", [m, n], format="NZ") + # gm_temp初始化为invalid,gm_temp完成后生成一个值,会通知ub_bmm1_res_loop_last是否为等待值,如果是就设置为就绪状态 + # 这里可以将gm_temp + vexp(ub_bmm1_res_loop, ub_bmm1_res_loop_last)() + return ub_bmm1_res_loop_last + + +def softmax_out_grad_vec_part(gm_temp2, gm_softmax_out_sum_split, ub_bmm1_res_loop_last, shape_info): + """ + 这是一个对UB和VEC的行为的包裹,其输入是GM,可以看做一个小型计算核 + """ + m, n = shape_info + ub_softmax_out_drop_grad_loop = Tensor("UB", "FP16", [m, n], format="NZ") + ub_softmax_out_drop_grad_loop.load(gm_temp2, expect_value=2) # [64, 256] + ub_softmax_out_sum_loop = Tensor("UB", "FP16", [m, 1], format="NZ") # [64, 1] + ub_softmax_out_sum_loop.load(gm_softmax_out_sum_split) # [64, 1] + vsub(ub_softmax_out_drop_grad_loop, ub_softmax_out_sum_loop, ub_softmax_out_drop_grad_loop)() + vmul(ub_softmax_out_drop_grad_loop, ub_bmm1_res_loop_last, ub_softmax_out_drop_grad_loop)() + ub_softmax_out_drop_grad_loop_last = Tensor("UB", "FP16", [m, n], format="NZ") + vmuls(ub_softmax_out_drop_grad_loop, scale_value, ub_softmax_out_drop_grad_loop_last)() + gm_temp2.load(ub_softmax_out_drop_grad_loop_last, set_value=4) + return gm_temp2 + + +def batch_dot(out, a, b, shape_info, tensor_value): + """ + 这其实是一个对L0,MMAD的行为包裹,其输入是L1的内容,可以作为一个小型计算核 + """ + m, k, n = shape_info + l0a_n = Tensor("L0A", "FP16", [m, k], format="NZ") # [64, 256] + l0b_n = Tensor("L0B", "FP16", [k, n], format="NZ") # [256, 128] + l0c_n = Tensor("L0C", "FP16", [m, n], format="NZ") # [64, 128] + l0a_n.load(a) + l0b_n.load(b) + mmad_out = mmad(l0a_n, l0b_n, l0c_n, True)() + out.load(mmad_out[0], set_value=tensor_value) + # 是在这一次的调度时,需要标记一个值 + # 某块内存在被某次调度时,设置特定的值 + # out调度时,将任务id传入,标识是这一次的任务执行 + # out自己也记录这一次的任务ID和需要写入的值 + # 这里out计算完成后,需要将自己的值设置为一个数(不需要给所有在等待的tensor发消息,因为通过调度系统确保刚好调度到依赖项目) + return out + + +def flash_attention_grad(gm_q, gm_k, gm_v, gm_atten_mask, gm_attention_out_grad, gm_softmax_out_sum, gm_softmax_log_max_sum, gm_q_grad, gm_k_grad, gm_v_grad): + seq_out_loop_times = seq_size // seq_split_out # 2048 // 64 = 32 + seq_in_loop_times = seq_size // seq_split_in # 2048 // 256 = 8 + + for batch_index in range(batch_size): + for head_index in range(head_num): + for seq_out_index in range(seq_out_loop_times): + seq_out_start_index = seq_out_index * seq_split_out + seq_out_end_index = seq_out_start_index + seq_split_out + for seq_in_index in range(seq_in_loop_times): + seq_in_start_index = seq_in_index * seq_split_in + seq_in_end_index = seq_in_start_index + seq_split_in + 
"""对一次处理的内存切块,满足L1或者UB的要求,此处不属于Core的范围,所以处理GM的内存。""" + gm_q_split = gm_q[batch_index, head_index, seq_out_start_index: seq_out_end_index, :] + gm_attention_out_grad_split = gm_attention_out_grad[batch_index, head_index, + seq_out_start_index:seq_out_end_index, :] + gm_k_split = gm_k[batch_index, head_index, seq_in_start_index:seq_in_end_index, :] + gm_v_split = gm_v[batch_index, head_index, seq_in_start_index:seq_in_end_index, :] + gm_atten_mask_split = gm_atten_mask[seq_out_start_index: seq_out_end_index, + seq_in_start_index:seq_in_end_index] + gm_softmax_log_max_sum_split = gm_softmax_log_max_sum[batch_index, head_index, + seq_out_start_index:seq_out_end_index, :] + gm_softmax_out_sum_split = gm_softmax_out_sum[batch_index, head_index, + seq_out_start_index:seq_out_end_index, :] + + """对于需要用于核间同步的GM内存最好单独管理,在这个作用域创建,保证不同循环间的gm是不一样的,这样相互不影响""" + gm_temp = Tensor("GM", "FP16", [seq_split_out, seq_split_in], format="NZ") + gm_temp2 = Tensor("GM", "FP16", [seq_split_out, seq_split_in], format="NZ") + + with Core("AIC0") as aic: + """在这个作用域下,仅处理: + 1. L1的创建和对(CUBE,L0)联合对象函数的处理,L1的新建请尽量靠近计算本身,减少理解难度 + 2. 与AIV的同步 + """ + # task 1.1: 前向计算 AIC 部分 + l1_q_loop = Tensor("L1", "FP16", [seq_split_out, head_dim], format="NZ") # [64, 128] + l1_k_loop = Tensor("L1", "FP16", [seq_split_in, head_dim], format="NZ") # [256, 128] + l1_q_loop.load(gm_q_split) # [64, 128] + l1_k_loop.load(gm_k_split) # [256, 128] + gm_temp = batch_dot(gm_temp, l1_q_loop, l1_k_loop, (seq_split_out, head_dim, seq_split_in), 1) + + # task 2.1: softmax_out_grad计算 AIC 部分 + l1_v_loop = Tensor("L1", "FP16", [seq_split_in, head_dim], format="NZ") # [256, 128] + l1_attention_out_loop = Tensor("L1", "FP16", [seq_split_out, head_dim], format="NZ") + l1_v_loop.load(gm_v_split) # [256, 128] + l1_attention_out_loop.load(gm_attention_out_grad_split) # [64, 128] + gm_temp2 = batch_dot(gm_temp2, l1_attention_out_loop, l1_v_loop, + (seq_split_out, head_dim, seq_split_in), 2) + # 这里会第一次写入gm_temp2,写入后立即触发task2.2 + + # task 3:反向计算 AIC, 依赖task2.2 + l1_softmax_out_loop = Tensor("L1", "FP16", [seq_split_out, seq_split_in], format = "NZ") + l1_softmax_out_loop.load(gm_temp2, expect_value=4) + gm_q_grad = batch_dot(gm_q_grad, l1_attention_out_loop, l1_softmax_out_loop, + (seq_split_out, head_dim, seq_split_in), -1) + + # gm_temp.load(ub_bmm1_res_loop_last) + gm_k_grad = batch_dot(gm_k_grad, l1_q_loop, l1_softmax_out_loop, + (seq_split_out, head_dim, seq_split_in), -1) + + l1_softmax_out_loop.load(gm_temp, expect_value=3) + gm_v_grad = batch_dot(gm_v_grad, l1_softmax_out_loop, l1_k_loop, + (seq_split_out, seq_split_in, head_dim), -1) + + with Core("AIV0") as aiv: + """在这个作用域下,仅处理:与AIC的同步""" + # task 1.2: 前向计算 AIC 部分, 依赖task1.1 + ub_bmm1_res_loop_last = fa_forward_vec_part(gm_temp, gm_atten_mask_split, + gm_softmax_log_max_sum_split, + (seq_split_out, seq_split_in)) + gm_temp.load(ub_bmm1_res_loop_last, set_value=3) + + # task 2.2: softmax_out_grad计算 AIV 部分, 依赖 task2.1(重点关注) 和 task1.2(不用管) + gm_temp2 = softmax_out_grad_vec_part(gm_temp2, gm_softmax_out_sum_split, ub_bmm1_res_loop_last, + (seq_split_out, seq_split_in)) + + +if __name__ == "__main__": + """在main部分主要定义全局的输入输出 + """ + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + chip.enable_cache_detail() + # init input gm. 
+ gm_q = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_k = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_v = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_atten_mask = Tensor("GM", "FP16", [seq_size, seq_size], format="NZ") + gm_attention_out_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_softmax_out_sum = Tensor("GM", "FP16", [batch_size, head_num, seq_size], format="NZ") + gm_softmax_log_max_sum = Tensor("GM", "FP16", [batch_size, head_num, seq_size], format="NZ") + + # init output gm. + gm_q_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_k_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + gm_v_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") + + flash_attention_grad(gm_q, gm_k, gm_v, gm_atten_mask, gm_attention_out_grad, gm_softmax_out_sum, + gm_softmax_log_max_sum, gm_q_grad, gm_k_grad, gm_v_grad) \ No newline at end of file diff --git a/sample/mskpp_sample/softmax.py b/sample/mskpp_sample/softmax.py new file mode 100644 index 00000000000..a9c5b36854a --- /dev/null +++ b/sample/mskpp_sample/softmax.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +from mskpp import * + + +def softmax_func(gm_input): + ub_input = Tensor("UB", dtype="FP16") + ub_temp = Tensor("UB", dtype="FP16") + ub_cast = Tensor("UB", dtype="FP32") + ub_input.load(gm_input) + vmax_output = vmax(ub_input, ub_input, ub_temp)() + vsub_output = vsub(ub_input, vmax_output[0], ub_temp)() + vexp_output = vexp(vsub_output[0], ub_temp)() + vconv_output = vconv(vexp_output[0], ub_cast, "FP32")() + vadd_output = vadd(vconv_output[0], vconv_output[0], ub_cast)() + vdiv_output = vdiv(vconv_output[0],vadd_output[0], ub_cast)() + softmax_output = Tensor("UB") + softmax_output = vdiv(vdiv_output[0], vadd_output[0], softmax_output)() + return softmax_output[0] + + +def ascend910b1_test(input_shape): + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + with Core("AIV0") as aiv0: + gm_input = Tensor("GM", dtype="FP16", size=input_shape, format="ND") + gm_output = Tensor("GM", dtype="FP32", size=input_shape, format="ND") + ub_output = softmax_func(gm_input) + gm_output.load(ub_output) + with Core("AIV1") as aiv1: + gm_input_1 = Tensor("GM", dtype="FP16", size=input_shape, format="ND") + gm_output_1 = Tensor("GM", dtype="FP32", size=input_shape, format="ND") + ub_output_1 = softmax_func(gm_input_1) + gm_output_1.load(ub_output_1) + + +if __name__ == "__main__": + ascend910b1_test([1, 64, 64, 128]) \ No newline at end of file -- Gitee From 1c053303ddd806c9dccb690e582d1c278cffed4e Mon Sep 17 00:00:00 2001 From: huawei-zhangjunbo Date: Thu, 28 Mar 2024 21:15:30 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E5=A2=9E=E5=8A=A0mskpp=20sample=E6=A0=B7?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/README.md | 8 ++------ sample/mskpp_sample/README.md | 10 ++++++++++ sample/mskpp_sample/softmax.py | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) create mode 100644 sample/mskpp_sample/README.md diff --git a/sample/README.md b/sample/README.md index b52a7bdd94d..80d29b5e2ac 100644 --- a/sample/README.md +++ b/sample/README.md @@ -140,7 +140,8 @@ mssanitizer ./*.fatbin # 默认进行memcheck检查 1. 编写算子dsl语言的Python脚本,并执行,以sample中的softmax为例 ``` - python3 ./mskpp_sample/softmax.py + cd ./sample/mskpp_sample/ + python3 softmax.py ``` 2. 执行完成后,在当前目录下生成如下生成件: ``` @@ -150,12 +151,7 @@ mssanitizer ./*.fatbin # 默认进行memcheck检查 ├── trace.json # 理想流水图 └── instruction_cycle_consumption.html # 当安装plotly三方库后,可生成该文件,输出各指令、Pipe的耗时比例 ``` -3. 
以softmax为例,mskpp建模数据与实际算子性能数据比较: -| 测试用例名 | 类型 | 端到端时间 (us) | 数据量 (KB) | aiv_vec_cycles | aiv_MTE2_cycles | aiv_MTE3_cycles | -|----|----|----|----|----|----|----| -| 实测softmax | FP16 | 94.98 | 51200 | 7835157 | 1256661 | 1062680 | -| mskpp建模softmax | FP16 | 95.85 | 51200 | 5916350 | 1118750 | 1595580 | 近期还会新增对GM相关的核外内存搬运指令地精细化建模,进一步提高GM核外搬运Pipe的理论耗时精准度,并对外展示L2Cache命中率的情况, 敬请期待~ \ No newline at end of file diff --git a/sample/mskpp_sample/README.md b/sample/mskpp_sample/README.md new file mode 100644 index 00000000000..b53cd2c1be5 --- /dev/null +++ b/sample/mskpp_sample/README.md @@ -0,0 +1,10 @@ +# mskpp结果说明补充 + +## softmax案例与实测结果对比 + +以softmax为例,mskpp建模数据与实际算子性能数据比较: + +| 测试用例名 | 类型 | 端到端时间 (us) | 数据量 (KB) | aiv_vec_cycles | aiv_MTE2_cycles | aiv_MTE3_cycles | +|----|----|------------|----|-----------------|----|----| +| 实测softmax | FP16 | 94.98 | 51200 | 7835157 | 1256661 | 1062680 | +| mskpp建模softmax | FP16 | 77.68 | 51200 | 4277000 | 1118750 | 1595580 | \ No newline at end of file diff --git a/sample/mskpp_sample/softmax.py b/sample/mskpp_sample/softmax.py index a9c5b36854a..32df4acb88d 100644 --- a/sample/mskpp_sample/softmax.py +++ b/sample/mskpp_sample/softmax.py @@ -18,22 +18,25 @@ from mskpp import * def softmax_func(gm_input): + # 创建输入输出的临时变量 ub_input = Tensor("UB", dtype="FP16") ub_temp = Tensor("UB", dtype="FP16") ub_cast = Tensor("UB", dtype="FP32") ub_input.load(gm_input) + # softmax numpy 实现方式:softmax = np.exp(x) / np.sum(np.exp(x), axis=0) + # 下面是以mskpp语言实现的softmax算子伪代码 vmax_output = vmax(ub_input, ub_input, ub_temp)() vsub_output = vsub(ub_input, vmax_output[0], ub_temp)() vexp_output = vexp(vsub_output[0], ub_temp)() + # 昇腾softmax算子中,在exp指令计算之前,是以fp16计算,后转成fp32计算,因此除公式所需步骤外,插入vconv的指令 vconv_output = vconv(vexp_output[0], ub_cast, "FP32")() vadd_output = vadd(vconv_output[0], vconv_output[0], ub_cast)() vdiv_output = vdiv(vconv_output[0],vadd_output[0], ub_cast)() - softmax_output = Tensor("UB") - softmax_output = vdiv(vdiv_output[0], vadd_output[0], softmax_output)() - return softmax_output[0] + return vdiv_output[0] def ascend910b1_test(input_shape): + # Ascend910B1分两个vector核,通过with语句将其分开,并分别定义全局的输入输出 with Chip("Ascend910B1") as chip: chip.enable_trace() chip.enable_metrics() -- Gitee From 22307aba3e39316f9792c63525898cff9e34514e Mon Sep 17 00:00:00 2001 From: huawei-zhangjunbo Date: Thu, 28 Mar 2024 22:29:41 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E5=A2=9E=E5=8A=A0mskpp=20sample=E6=A0=B7?= =?UTF-8?q?=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/README.md | 6 +- sample/mskpp_sample/README.md | 15 +- sample/mskpp_sample/cube_only/matmul.cpp.py | 64 ++++++ sample/mskpp_sample/fa_grad.py | 186 ------------------ .../mix/matmul_leakyrelu_kernel.py | 74 +++++++ sample/mskpp_sample/softmax.py | 56 ------ sample/mskpp_sample/vec_only/add_kernel.py | 51 +++++ 7 files changed, 201 insertions(+), 251 deletions(-) create mode 100644 sample/mskpp_sample/cube_only/matmul.cpp.py delete mode 100644 sample/mskpp_sample/fa_grad.py create mode 100644 sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py delete mode 100644 sample/mskpp_sample/softmax.py create mode 100644 sample/mskpp_sample/vec_only/add_kernel.py diff --git a/sample/README.md b/sample/README.md index 80d29b5e2ac..d20f261325f 100644 --- a/sample/README.md +++ b/sample/README.md @@ -135,15 +135,15 @@ mssanitizer ./*.fatbin # 默认进行memcheck检查 4. 
更多指标信息请参考算子开发工具使用手册。 ### 性能建模 -用户可参照mskpp_sample中的参考案例,根据实际用户算子的数学逻辑,编写算子dsl语言的Python脚本, +用户可参照sample/mskpp_sample中的参考案例,根据实际用户算子的数学逻辑,编写算子dsl语言的Python脚本, 即可对昇腾算子进行理论性能建模,获得算子的极限性能参考 -1. 编写算子dsl语言的Python脚本,并执行,以sample中的softmax为例 +1. 编写算子dsl语言的Python脚本,并执行,以sample/mskpp_sample中的vec_softmax为例 ``` cd ./sample/mskpp_sample/ python3 softmax.py ``` -2. 执行完成后,在当前目录下生成如下生成件: +2. 执行完成后,在sample/mskpp_sample目录下生成如下生成件: ``` [WorkSpace] ├── Pipe_Statistic.csv # 以Pipe维度统计cycle耗时 diff --git a/sample/mskpp_sample/README.md b/sample/mskpp_sample/README.md index b53cd2c1be5..ab4a8e3e3b4 100644 --- a/sample/mskpp_sample/README.md +++ b/sample/mskpp_sample/README.md @@ -1,10 +1,13 @@ # mskpp结果说明补充 -## softmax案例与实测结果对比 +## vec案例与实测结果对比 -以softmax为例,mskpp建模数据与实际算子性能数据比较: +以add_kernel.cpp为例,mskpp建模数据与实际算子性能数据比较: -| 测试用例名 | 类型 | 端到端时间 (us) | 数据量 (KB) | aiv_vec_cycles | aiv_MTE2_cycles | aiv_MTE3_cycles | -|----|----|------------|----|-----------------|----|----| -| 实测softmax | FP16 | 94.98 | 51200 | 7835157 | 1256661 | 1062680 | -| mskpp建模softmax | FP16 | 77.68 | 51200 | 4277000 | 1118750 | 1595580 | \ No newline at end of file +其中实测数据来源于sample/normal_sample/vec_only的上板调优结果 + + +| 测试用例名 | 类型 | 端到端时间 (us) | shape | +|-------------------|----|------------|-------------| +| 实测add_kernel | FP16 | 5.9 | 8,2048 | +| mskpp建模add_kernel | FP16 | 3.5424 | 8,2048 | \ No newline at end of file diff --git a/sample/mskpp_sample/cube_only/matmul.cpp.py b/sample/mskpp_sample/cube_only/matmul.cpp.py new file mode 100644 index 00000000000..ffc5b980ae8 --- /dev/null +++ b/sample/mskpp_sample/cube_only/matmul.cpp.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""
+from mskpp import mmad, Tensor, Chip
+
+
+def my_mmad(gm_x, gm_y, gm_z):
+    # 矩阵乘的基本数据通路
+    # 左矩阵A:GM-L1-L0A
+    # 右矩阵B:GM-L1-L0B
+    # 结果矩阵C:L0C(初始化)-GM
+
+    # 定义和分配L1上的变量
+    l1_x = Tensor("L1")
+    l1_y = Tensor("L1")
+
+    # 定义和分配L0A和L0B上的变量
+    x = Tensor("L0A")
+    y = Tensor("L0B")
+
+    # 定义和分配在L0C上的运算结果变量
+    z = Tensor("L0C", "FP32", [32, 16], format="NC1HWC0")
+
+    # 将GM上的数据移动到L1对应内存空间上
+    l1_x.load(gm_x)
+    l1_y.load(gm_y)
+
+    # 将L1上的左右矩阵移动到L0A和L0B上
+    x.load(l1_x)
+    y.load(l1_y)
+
+    # 当前数据已加载到L0A和L0B上,调用指令进行计算,结果保存在L0C上
+    out = mmad(x, y, z, True)()
+
+    # 将L0C上的数据移动到GM变量gm_z的地址空间上
+    gm_z.load(out[0])
+    return z
+
+
+if __name__ == '__main__':
+    with Chip("Ascend910B1") as chip:
+        chip.enable_trace()
+        chip.enable_metrics()
+
+        # 模拟一个大矩阵被切分成5个小矩阵进行计算
+        for _ in range(5):
+            # 应用算子进行AICORE计算
+            in_x = Tensor("GM", "FP16", [32, 48], format="ND")
+            in_y = Tensor("GM", "FP16", [48, 16], format="ND")
+            in_z = Tensor("GM", "FP32", [32, 16], format="NC1HWC0")
+            my_mmad(in_x, in_y, in_z)
diff --git a/sample/mskpp_sample/fa_grad.py b/sample/mskpp_sample/fa_grad.py
deleted file mode 100644
index 07a06815d3f..00000000000
--- a/sample/mskpp_sample/fa_grad.py
+++ /dev/null
@@ -1,186 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. 
-""" -from mskpp import mmad, Tensor, Chip, Core, vadd, vadds, vsub, vexp, vmul, vmuls - -seq_split_out = 64 -seq_split_in = 256 -batch_size, head_num, seq_size, head_dim = 1, 1, 2048, 127 -scale_value = 1 / 128 -keep_prob = 1.0 - - -def fa_forward_vec_part(gm_temp, gm_atten_mask_split, gm_softmax_log_max_sum_split, shape_info): - m, n = shape_info - ub_bmm1_res_loop = Tensor("UB", "FP16", [m, n], format="NZ") - - ub_bmm1_res_loop.load(gm_temp, expect_value=1) - vadds(ub_bmm1_res_loop, scale_value, ub_bmm1_res_loop)() - - ub_atten_mask_loop = Tensor("UB", "FP16", [m, n], format="NZ") # [64, 256] - - ub_atten_mask_loop.load(gm_atten_mask_split) # [64, 256] - vadds(ub_atten_mask_loop, -10000, ub_atten_mask_loop)() - vadd(ub_bmm1_res_loop, ub_atten_mask_loop, ub_bmm1_res_loop)() - ub_softmax_log_max_sum_loop = Tensor("UB", "FP16", [m, 1], format="NZ") # [64, 1] - - ub_softmax_log_max_sum_loop.load(gm_softmax_log_max_sum_split) # [64, 1] - vsub(ub_bmm1_res_loop, ub_softmax_log_max_sum_loop, ub_bmm1_res_loop)() - - ub_bmm1_res_loop_last = Tensor("UB", "FP16", [m, n], format="NZ") - # gm_temp初始化为invalid,gm_temp完成后生成一个值,会通知ub_bmm1_res_loop_last是否为等待值,如果是就设置为就绪状态 - # 这里可以将gm_temp - vexp(ub_bmm1_res_loop, ub_bmm1_res_loop_last)() - return ub_bmm1_res_loop_last - - -def softmax_out_grad_vec_part(gm_temp2, gm_softmax_out_sum_split, ub_bmm1_res_loop_last, shape_info): - """ - 这是一个对UB和VEC的行为的包裹,其输入是GM,可以看做一个小型计算核 - """ - m, n = shape_info - ub_softmax_out_drop_grad_loop = Tensor("UB", "FP16", [m, n], format="NZ") - ub_softmax_out_drop_grad_loop.load(gm_temp2, expect_value=2) # [64, 256] - ub_softmax_out_sum_loop = Tensor("UB", "FP16", [m, 1], format="NZ") # [64, 1] - ub_softmax_out_sum_loop.load(gm_softmax_out_sum_split) # [64, 1] - vsub(ub_softmax_out_drop_grad_loop, ub_softmax_out_sum_loop, ub_softmax_out_drop_grad_loop)() - vmul(ub_softmax_out_drop_grad_loop, ub_bmm1_res_loop_last, ub_softmax_out_drop_grad_loop)() - ub_softmax_out_drop_grad_loop_last = Tensor("UB", "FP16", [m, n], format="NZ") - vmuls(ub_softmax_out_drop_grad_loop, scale_value, ub_softmax_out_drop_grad_loop_last)() - gm_temp2.load(ub_softmax_out_drop_grad_loop_last, set_value=4) - return gm_temp2 - - -def batch_dot(out, a, b, shape_info, tensor_value): - """ - 这其实是一个对L0,MMAD的行为包裹,其输入是L1的内容,可以作为一个小型计算核 - """ - m, k, n = shape_info - l0a_n = Tensor("L0A", "FP16", [m, k], format="NZ") # [64, 256] - l0b_n = Tensor("L0B", "FP16", [k, n], format="NZ") # [256, 128] - l0c_n = Tensor("L0C", "FP16", [m, n], format="NZ") # [64, 128] - l0a_n.load(a) - l0b_n.load(b) - mmad_out = mmad(l0a_n, l0b_n, l0c_n, True)() - out.load(mmad_out[0], set_value=tensor_value) - # 是在这一次的调度时,需要标记一个值 - # 某块内存在被某次调度时,设置特定的值 - # out调度时,将任务id传入,标识是这一次的任务执行 - # out自己也记录这一次的任务ID和需要写入的值 - # 这里out计算完成后,需要将自己的值设置为一个数(不需要给所有在等待的tensor发消息,因为通过调度系统确保刚好调度到依赖项目) - return out - - -def flash_attention_grad(gm_q, gm_k, gm_v, gm_atten_mask, gm_attention_out_grad, gm_softmax_out_sum, gm_softmax_log_max_sum, gm_q_grad, gm_k_grad, gm_v_grad): - seq_out_loop_times = seq_size // seq_split_out # 2048 // 64 = 32 - seq_in_loop_times = seq_size // seq_split_in # 2048 // 256 = 8 - - for batch_index in range(batch_size): - for head_index in range(head_num): - for seq_out_index in range(seq_out_loop_times): - seq_out_start_index = seq_out_index * seq_split_out - seq_out_end_index = seq_out_start_index + seq_split_out - for seq_in_index in range(seq_in_loop_times): - seq_in_start_index = seq_in_index * seq_split_in - seq_in_end_index = seq_in_start_index + seq_split_in - 
"""对一次处理的内存切块,满足L1或者UB的要求,此处不属于Core的范围,所以处理GM的内存。""" - gm_q_split = gm_q[batch_index, head_index, seq_out_start_index: seq_out_end_index, :] - gm_attention_out_grad_split = gm_attention_out_grad[batch_index, head_index, - seq_out_start_index:seq_out_end_index, :] - gm_k_split = gm_k[batch_index, head_index, seq_in_start_index:seq_in_end_index, :] - gm_v_split = gm_v[batch_index, head_index, seq_in_start_index:seq_in_end_index, :] - gm_atten_mask_split = gm_atten_mask[seq_out_start_index: seq_out_end_index, - seq_in_start_index:seq_in_end_index] - gm_softmax_log_max_sum_split = gm_softmax_log_max_sum[batch_index, head_index, - seq_out_start_index:seq_out_end_index, :] - gm_softmax_out_sum_split = gm_softmax_out_sum[batch_index, head_index, - seq_out_start_index:seq_out_end_index, :] - - """对于需要用于核间同步的GM内存最好单独管理,在这个作用域创建,保证不同循环间的gm是不一样的,这样相互不影响""" - gm_temp = Tensor("GM", "FP16", [seq_split_out, seq_split_in], format="NZ") - gm_temp2 = Tensor("GM", "FP16", [seq_split_out, seq_split_in], format="NZ") - - with Core("AIC0") as aic: - """在这个作用域下,仅处理: - 1. L1的创建和对(CUBE,L0)联合对象函数的处理,L1的新建请尽量靠近计算本身,减少理解难度 - 2. 与AIV的同步 - """ - # task 1.1: 前向计算 AIC 部分 - l1_q_loop = Tensor("L1", "FP16", [seq_split_out, head_dim], format="NZ") # [64, 128] - l1_k_loop = Tensor("L1", "FP16", [seq_split_in, head_dim], format="NZ") # [256, 128] - l1_q_loop.load(gm_q_split) # [64, 128] - l1_k_loop.load(gm_k_split) # [256, 128] - gm_temp = batch_dot(gm_temp, l1_q_loop, l1_k_loop, (seq_split_out, head_dim, seq_split_in), 1) - - # task 2.1: softmax_out_grad计算 AIC 部分 - l1_v_loop = Tensor("L1", "FP16", [seq_split_in, head_dim], format="NZ") # [256, 128] - l1_attention_out_loop = Tensor("L1", "FP16", [seq_split_out, head_dim], format="NZ") - l1_v_loop.load(gm_v_split) # [256, 128] - l1_attention_out_loop.load(gm_attention_out_grad_split) # [64, 128] - gm_temp2 = batch_dot(gm_temp2, l1_attention_out_loop, l1_v_loop, - (seq_split_out, head_dim, seq_split_in), 2) - # 这里会第一次写入gm_temp2,写入后立即触发task2.2 - - # task 3:反向计算 AIC, 依赖task2.2 - l1_softmax_out_loop = Tensor("L1", "FP16", [seq_split_out, seq_split_in], format = "NZ") - l1_softmax_out_loop.load(gm_temp2, expect_value=4) - gm_q_grad = batch_dot(gm_q_grad, l1_attention_out_loop, l1_softmax_out_loop, - (seq_split_out, head_dim, seq_split_in), -1) - - # gm_temp.load(ub_bmm1_res_loop_last) - gm_k_grad = batch_dot(gm_k_grad, l1_q_loop, l1_softmax_out_loop, - (seq_split_out, head_dim, seq_split_in), -1) - - l1_softmax_out_loop.load(gm_temp, expect_value=3) - gm_v_grad = batch_dot(gm_v_grad, l1_softmax_out_loop, l1_k_loop, - (seq_split_out, seq_split_in, head_dim), -1) - - with Core("AIV0") as aiv: - """在这个作用域下,仅处理:与AIC的同步""" - # task 1.2: 前向计算 AIC 部分, 依赖task1.1 - ub_bmm1_res_loop_last = fa_forward_vec_part(gm_temp, gm_atten_mask_split, - gm_softmax_log_max_sum_split, - (seq_split_out, seq_split_in)) - gm_temp.load(ub_bmm1_res_loop_last, set_value=3) - - # task 2.2: softmax_out_grad计算 AIV 部分, 依赖 task2.1(重点关注) 和 task1.2(不用管) - gm_temp2 = softmax_out_grad_vec_part(gm_temp2, gm_softmax_out_sum_split, ub_bmm1_res_loop_last, - (seq_split_out, seq_split_in)) - - -if __name__ == "__main__": - """在main部分主要定义全局的输入输出 - """ - with Chip("Ascend910B1") as chip: - chip.enable_trace() - chip.enable_metrics() - chip.enable_cache_detail() - # init input gm. 
- gm_q = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_k = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_v = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_atten_mask = Tensor("GM", "FP16", [seq_size, seq_size], format="NZ") - gm_attention_out_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_softmax_out_sum = Tensor("GM", "FP16", [batch_size, head_num, seq_size], format="NZ") - gm_softmax_log_max_sum = Tensor("GM", "FP16", [batch_size, head_num, seq_size], format="NZ") - - # init output gm. - gm_q_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_k_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - gm_v_grad = Tensor("GM", "FP16", [batch_size, head_num, seq_size, head_dim], format="NZ") - - flash_attention_grad(gm_q, gm_k, gm_v, gm_atten_mask, gm_attention_out_grad, gm_softmax_out_sum, - gm_softmax_log_max_sum, gm_q_grad, gm_k_grad, gm_v_grad) \ No newline at end of file diff --git a/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py b/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py new file mode 100644 index 00000000000..7df09265df6 --- /dev/null +++ b/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""
+from mskpp import mmad, vrelu, Tensor, Chip, Core
+
+
+def my_mmad(gm_x, gm_y, gm_z):
+    # 矩阵乘的基本数据通路:
+    # 左矩阵A:GM-L1-L0A
+    # 右矩阵B:GM-L1-L0B
+    # 结果矩阵C: L0C(初始化)-GM
+
+    # 定义和分配L1上的变量
+    l1_x = Tensor("L1")
+    l1_y = Tensor("L1")
+    # 定义和分配L0A和L0B上的变量
+    x = Tensor("L0A")
+    y = Tensor("L0B")
+    # 定义和分配在L0C上的运算结果变量
+    z = Tensor("L0C", "FP32", [256, 128], format="ND")
+
+    # 将GM上的数据移动到L1对应内存空间上
+    l1_x.load(gm_x)
+    l1_y.load(gm_y)
+
+    # 将L1上的左右矩阵移动到L0A和L0B上
+    x.load(l1_x)
+    y.load(l1_y)
+
+    # 当前数据已加载到L0A和L0B上,调用指令进行计算,结果保存在L0C上
+    out = mmad(x, y, z, True)()
+    gm_z.load(out[0], set_value=1) # set_value 表示设置同步事件1
+
+
+def my_vrelu(gm_x, gm_y):
+    # 定义和分配UB上的变量
+    x = Tensor("UB")
+    y = Tensor("UB")
+
+    x.load(gm_x, expect_value=1) # expect_value 表示等待同步事件1完成后,该load才可执行
+    out = vrelu(x, y)()
+    gm_y.load(out[0])
+
+
+def Ascend910B1_Test():
+    with Chip("Ascend910B1") as chip:
+        chip.enable_trace()
+        chip.enable_metrics()
+        # 应用算子进行AICORE计算
+        in_x = Tensor("GM", "FP16", [1024, 640], format="ND")
+        in_y = Tensor("GM", "FP16", [640, 256], format="ND")
+        in_z = Tensor("GM", "FP32", [1024, 256], format="ND")
+        relu_out = Tensor("GM", "FP32", [1024, 256], format="ND")
+        with Core("AIC0") as aic0:
+            my_mmad(in_x, in_y, in_z)
+        with Core("AIV0") as aiv0:
+            my_vrelu(in_z, relu_out)
+
+
+if __name__ == "__main__":
+    Ascend910B1_Test()
diff --git a/sample/mskpp_sample/softmax.py b/sample/mskpp_sample/softmax.py
deleted file mode 100644
index 32df4acb88d..00000000000
--- a/sample/mskpp_sample/softmax.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. 
-""" -from mskpp import * - - -def softmax_func(gm_input): - # 创建输入输出的临时变量 - ub_input = Tensor("UB", dtype="FP16") - ub_temp = Tensor("UB", dtype="FP16") - ub_cast = Tensor("UB", dtype="FP32") - ub_input.load(gm_input) - # softmax numpy 实现方式:softmax = np.exp(x) / np.sum(np.exp(x), axis=0) - # 下面是以mskpp语言实现的softmax算子伪代码 - vmax_output = vmax(ub_input, ub_input, ub_temp)() - vsub_output = vsub(ub_input, vmax_output[0], ub_temp)() - vexp_output = vexp(vsub_output[0], ub_temp)() - # 昇腾softmax算子中,在exp指令计算之前,是以fp16计算,后转成fp32计算,因此除公式所需步骤外,插入vconv的指令 - vconv_output = vconv(vexp_output[0], ub_cast, "FP32")() - vadd_output = vadd(vconv_output[0], vconv_output[0], ub_cast)() - vdiv_output = vdiv(vconv_output[0],vadd_output[0], ub_cast)() - return vdiv_output[0] - - -def ascend910b1_test(input_shape): - # Ascend910B1分两个vector核,通过with语句将其分开,并分别定义全局的输入输出 - with Chip("Ascend910B1") as chip: - chip.enable_trace() - chip.enable_metrics() - with Core("AIV0") as aiv0: - gm_input = Tensor("GM", dtype="FP16", size=input_shape, format="ND") - gm_output = Tensor("GM", dtype="FP32", size=input_shape, format="ND") - ub_output = softmax_func(gm_input) - gm_output.load(ub_output) - with Core("AIV1") as aiv1: - gm_input_1 = Tensor("GM", dtype="FP16", size=input_shape, format="ND") - gm_output_1 = Tensor("GM", dtype="FP32", size=input_shape, format="ND") - ub_output_1 = softmax_func(gm_input_1) - gm_output_1.load(ub_output_1) - - -if __name__ == "__main__": - ascend910b1_test([1, 64, 64, 128]) \ No newline at end of file diff --git a/sample/mskpp_sample/vec_only/add_kernel.py b/sample/mskpp_sample/vec_only/add_kernel.py new file mode 100644 index 00000000000..1941d10a21c --- /dev/null +++ b/sample/mskpp_sample/vec_only/add_kernel.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""
+from mskpp import vadd, Tensor, Chip
+
+
+def my_vadd(gm_x, gm_y, gm_z):
+    # 向量Add的基本数据通路
+    # 被加数x:GM-UB
+    # 加数y:GM-UB
+    # 结果向量z:UB-GM
+
+    # 定义和分配UB上的变量
+    x = Tensor("UB")
+    y = Tensor("UB")
+    z = Tensor("UB")
+
+    # 将GM上的数据移动到UB对应内存空间上
+    x.load(gm_x)
+    y.load(gm_y)
+
+    # 当前数据已加载到UB上,调用指令进行计算,结果保存在UB上
+    out = vadd(x, y, z)()
+
+    # 将UB上的数据移动到GM变量gm_z的地址空间上
+    gm_z.load(out[0])
+
+
+if __name__ == '__main__':
+    with Chip("Ascend910B1") as chip:
+        chip.enable_trace()
+        chip.enable_metrics()
+
+        # 应用算子进行AICORE计算(向量加法要求两个输入与输出shape一致)
+        in_x = Tensor("GM", "FP16", [32, 48], format="ND")
+        in_y = Tensor("GM", "FP16", [32, 48], format="ND")
+        in_z = Tensor("GM", "FP16", [32, 48], format="ND")
+        my_vadd(in_x, in_y, in_z)
-- 
Gitee


From 3d3826e2038ad267a99a87dec8e85dadf5d29c40 Mon Sep 17 00:00:00 2001
From: huawei-zhangjunbo
Date: Fri, 29 Mar 2024 10:00:27 +0800
Subject: [PATCH 4/4] =?UTF-8?q?=E5=A2=9E=E5=8A=A0mskpp=20sample=E6=A0=B7?=
 =?UTF-8?q?=E4=BE=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sample/mskpp_sample/README.md | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/sample/mskpp_sample/README.md b/sample/mskpp_sample/README.md
index ab4a8e3e3b4..34dfb15775f 100644
--- a/sample/mskpp_sample/README.md
+++ b/sample/mskpp_sample/README.md
@@ -2,12 +2,36 @@
 
 ## vec案例与实测结果对比
 
-以add_kernel.cpp为例,mskpp建模数据与实际算子性能数据比较:
+以add_kernel.py为例,mskpp建模数据与实际算子性能数据比较:
 
 其中实测数据来源于sample/normal_sample/vec_only的上板调优结果
 
 
-| 测试用例名 | 类型 | 端到端时间 (us) | shape |
-|-------------------|----|------------|-------------|
-| 实测add_kernel | FP16 | 5.9 | 8,2048 |
-| mskpp建模add_kernel | FP16 | 3.5424 | 8,2048 |
\ No newline at end of file
+| 测试用例名 | 类型 | 端到端时间 (us) | shape |
+|-------------------|----|------------|----------|
+| 实测add_kernel | FP16 | 5.9 | [8,2048] |
+| mskpp建模add_kernel | FP16 | 3.5424 | [8,2048] |
+
+## mix案例与实测结果对比
+
+以matmul_leakyrelu_kernel.py为例,mskpp建模数据与实际算子性能数据比较:
+
+其中实测数据来源于sample/normal_sample/mix的上板调优结果
+
+
+| 测试用例名 | 类型 | 端到端时间 (us) | shape |
+|-------------------|----|------------|----------------|
+| 实测matmul_leakyrelu_kernel | FP16 | 287.42 | [1024,640,256] |
+| mskpp建模matmul_leakyrelu_kernel | FP16 | 60.1356 | [1024,640,256] |
+
+## cube案例与实测结果对比
+
+以cube_only中的matmul.cpp.py为例,mskpp建模数据与实际算子性能数据比较:
+
+其中实测数据来源于sample/normal_sample/cube_only的上板调优结果
+
+
+| 测试用例名 | 类型 | 端到端时间 (us) | shape |
+|-------------------|----|------------|----------------|
+| 实测matmul_kernel | FP16 | 9.86 | [512,1024,512] |
+| mskpp建模matmul_kernel | FP16 | 5.9089 | [512,1024,512] |
\ No newline at end of file
-- 
Gitee