diff --git a/sample/README.md b/sample/README.md
index 6bd55a2f83422b2f0c8424c9687a38f1698aa6fb..d20f261325fa2bc8023dc08bf0e25d9d2a19fee1 100644
--- a/sample/README.md
+++ b/sample/README.md
@@ -133,3 +133,25 @@ mssanitizer ./*.fatbin # memcheck is run by default
 └── trace.json # pipeline trace of all cores of the operator
 ```
 4. For more metric details, see the operator development tool user guide.
+
+### Performance Modeling
+Following the reference cases in sample/mskpp_sample, you can write a Python script in the operator DSL
+that mirrors the math logic of your own operator, build a theoretical performance model of the Ascend
+operator, and obtain a reference for its performance upper bound.
+
+1. Write the operator DSL Python script and run it, taking vec_softmax in sample/mskpp_sample as an example:
+   ```
+   cd ./sample/mskpp_sample/
+   python3 softmax.py
+   ```
+2. After the run completes, the following artifacts are generated in the sample/mskpp_sample directory:
+   ```
+   [WorkSpace]
+   ├── Pipe_Statistic.csv                 # cycle counts aggregated per pipe
+   ├── Instruction_Statistic.csv          # cycle counts aggregated per instruction
+   ├── trace.json                         # ideal pipeline trace
+   └── instruction_cycle_consumption.html # generated when the plotly package is installed; shows the share of cycles per instruction and per pipe
+   ```
+
+Fine-grained modeling of the GM-related off-core memory-move instructions will be added soon, further
+improving the accuracy of the theoretical time of the GM off-core move pipe and exposing the L2 cache hit rate. Stay tuned!
\ No newline at end of file
diff --git a/sample/mskpp_sample/README.md b/sample/mskpp_sample/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..34dfb15775fa19417bbcdc9b0d9a3dc06d7fb806
--- /dev/null
+++ b/sample/mskpp_sample/README.md
@@ -0,0 +1,37 @@
+# Supplementary notes on mskpp results
+
+## Vec case vs. measured results
+
+Taking add_kernel.py as an example, mskpp modeling data compared with measured operator performance.
+
+The measured data comes from on-board tuning results of sample/normal_sample/vec_only.
+
+| Test case                 | Dtype | End-to-end time (us) | Shape    |
+|---------------------------|-------|----------------------|----------|
+| add_kernel (measured)     | FP16  | 5.9                  | [8,2048] |
+| add_kernel (mskpp model)  | FP16  | 3.5424               | [8,2048] |
+
+## Mix case vs. measured results
+
+Taking matmul_leakyrelu_kernel.py as an example, mskpp modeling data compared with measured operator performance.
+
+The measured data comes from on-board tuning results of sample/normal_sample/mix.
+
+| Test case                              | Dtype | End-to-end time (us) | Shape          |
+|----------------------------------------|-------|----------------------|----------------|
+| matmul_leakyrelu_kernel (measured)     | FP16  | 287.42               | [1024,640,256] |
+| matmul_leakyrelu_kernel (mskpp model)  | FP16  | 60.1356              | [1024,640,256] |
+
+## Cube case vs. measured results
+
+Taking cube_only/matmul.cpp.py as an example, mskpp modeling data compared with measured operator performance.
+
+The measured data comes from on-board tuning results of sample/normal_sample/cube_only.
+
+| Test case                    | Dtype | End-to-end time (us) | Shape          |
+|------------------------------|-------|----------------------|----------------|
+| matmul_kernel (measured)     | FP16  | 9.86                 | [512,1024,512] |
+| matmul_kernel (mskpp model)  | FP16  | 5.9089               | [512,1024,512] |
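+
+## Minimal modeling script sketch
+
+For quick orientation, the sketch below shows the structure the samples in this directory share,
+mirroring vec_only/add_kernel.py; the [8,2048] shape is chosen to match the vec comparison table
+above. It is a sketch of the DSL usage seen in these samples, not a separately validated sample.
+
+```python
+from mskpp import vadd, Tensor, Chip
+
+with Chip("Ascend910B1") as chip:
+    chip.enable_trace()    # emit trace.json (ideal pipeline trace)
+    chip.enable_metrics()  # emit the *_Statistic.csv cycle reports
+
+    # Operands live in GM and are staged through UB for the vector pipe.
+    gm_x = Tensor("GM", "FP16", [8, 2048], format="ND")
+    gm_y = Tensor("GM", "FP16", [8, 2048], format="ND")
+    gm_z = Tensor("GM", "FP16", [8, 2048], format="ND")
+    x, y, z = Tensor("UB"), Tensor("UB"), Tensor("UB")
+    x.load(gm_x)           # GM -> UB
+    y.load(gm_y)           # GM -> UB
+    out = vadd(x, y, z)()  # issue the vector add; the result lands on UB
+    gm_z.load(out[0])      # UB -> GM
+```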
+""" +from mskpp import mamd, Tensor, Chip + + +def my_mmad(gm_x, gm_y, gm_z): + # 矩阵乘的基本数据通路 + # 左矩阵A:GM-L1-L0A + # 右矩阵A:GM-L1-L0B + # 结果矩阵C:L0C(初始化)-GM + + # 定义和分配L1上的变量 + l1_x = Tensor("L1") + l1_y = Tensor("L1") + + # 定义和分配L0A和L0B上的变量 + x = Tensor("L0A") + y = Tensor("L0B") + + # 定义和分配在L0C上的运算结果变量 + z = Tensor("L0C", "FP32", [32, 16], format="NC1HWC0") + + # 将GM上的数据移动到L1对应内存空间上 + l1_x.load(gm_x) + l1_y.load(gm_y) + + # 将L1上的左右矩阵移动到L0A和L0B上 + x.load(l1_x) + y.load(l1_y) + + # 当前数据已加载到L0A和L0B上,调用指令进行计算,结果保存在L0C上 + out = mmad(x, y, z, True)() + + # 将L0C上的数据移动到GM变量gm_z的地址空间上 + gm_z.load(out[0]) + return z + + +if __name__ == '__main__': + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + + # 模拟一个大矩阵被切分成5个小矩阵进行计算 + for _ in range(5): + # 应用算子进行AICORE计算 + in_x = Tensor("GM", "FP16", [32, 48], format="ND") + in_y = Tensor("GM", "FP16", [48, 16], format="ND") + in_z = Tensor("GM", "FP32", [32, 16], format="NC1HWC0") + my_mmad(in_x, in_y, in_z) diff --git a/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py b/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..7df09265df6af5ea34b89b2b37b220011292a6fe --- /dev/null +++ b/sample/mskpp_sample/mix/matmul_leakyrelu_kernel.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +from mskpp import mamd, Tensor, Chip + + +def my_mmad(gm_x, gm_y, gm_z): + # 矩阵乘的基本数据通路: + # 左矩阵A:GM-L1-L0A + # 右矩阵B:GM-L1-L0B + # 结果矩阵C: L0C(初始化)-GM + + # 定义和分配L1上的变量 + l1_x = Tensor("L1") + l1_y = Tensor("L1") + # 定义和分配L0A和L0B上的变量 + x = Tensor("L0A") + y = Tensor("L0B") + # 定义和分配在L0C上的运算结果变量 + z = Tensor("L0C", "FP32", [256, 128], format="ND") + + # 将GM上的数据移动到L1对应内存空间上 + l1_x.load(gm_x) + l1_y.load(gm_y) + + # 将L1上的左右矩阵移动到L0A和L0B上 + x.load(l1_x) + y.load(l1_y) + + # 当前数据已加载到L0A和L0B上,调用指令进行计算,结果保存在L0C上 + out = mmad(x, y, z, True)() + gm_z.load(out[0], set_value=1) # set_value 表示设置同步事件1 + + +def my_vrelu(gm_x, gm_y): + # 定义和分配UB上的变量 + x = Tensor("UB") + y = Tensor("UB") + + x.load(gm_x, expect_value=1) # expect_value 表示等待同步事件1完成后,该load才可执行 + out = vrelu(x, y)() + gm_y.load(out[0]) + + +def Ascend910B1_Test(): + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + # 应用算子进行AICORE计算 + in_x = Tensor("GM", "FP16", [1024, 640], format="ND") + in_y = Tensor("GM", "FP16", [640, 256], format="ND") + in_z = Tensor("GM", "FP32", [1024, 256], format="ND") + relu_out = Tensor("GM", "FP32", [1024, 256], format="ND") + with Core("AIC0") as aiv0: + my_mmad(in_x, in_y, in_z) + with Core("AIV0") as aiv1: + my_vrelu(in_z, relu_out) + + +if __name__ == "__main__": + Ascend910B1_Test() diff --git a/sample/mskpp_sample/vec_only/add_kernel.py b/sample/mskpp_sample/vec_only/add_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..1941d10a21cb8256fe21bc8af57a0ccba1920b6d --- /dev/null +++ b/sample/mskpp_sample/vec_only/add_kernel.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +from mskpp import mamd, Tensor, Chip + + +def my_vadd(gm_x, gm_y, gm_z): + # 向量Add的基本数据通路 + # 被加数x:GM-UB + # 加数y:GM-UB + # 结果向量z:UB-GM + + # 定义和分配UB上的变量 + x = Tensor("UB") + y = Tensor("UB") + z = Tensor("UB") + + # 将GM上的数据移动到UB对应内存空间上 + x.load(gm_x) + y.load(gm_y) + + # 当前数据已加载到UB上,调用指令进行计算,结果保存在UB上 + out = vadd(x, y, z)() + + # 将UB上的数据移动到GM变量gm_z的地址空间上 + gm_z.load(out[0]) + + +if __name__ == '__main__': + with Chip("Ascend910B1") as chip: + chip.enable_trace() + chip.enable_metrics() + + # 应用算子进行AICORE计算 + in_x = Tensor("GM", "FP16", [32, 48], format="ND") + in_y = Tensor("GM", "FP16", [48, 16], format="ND") + in_z = Tensor("GM", "FP16", [32, 48], format="ND") + my_vadd(in_x, in_y, in_z)