diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
index 4250464b6833898c46aacebad584db19cd1094d2..f732a38513e26d9f9de404d9243fe4b2274c41c4 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
@@ -756,9 +756,11 @@ def check_file_valid(file_path):
 
 
 def get_md5_for_tensor(x):
+    if x.dtype == torch.bfloat16:
+        x = x.float()
     tensor_bytes = x.cpu().detach().numpy().tobytes()
-    crc_hash = zlib.crc32(tensor_bytes)
-    return crc_hash
+    crc32_hash = zlib.crc32(tensor_bytes)
+    return f"{crc32_hash:08x}"
 
 
 def check_path_before_create(path):
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 91cacafd0e88770abdd25bee014bb0c4e6027983..a6b769ff2a41955aa6363f08d086a091335bf5f1 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -96,6 +96,8 @@ class DataInfo(object):
 
 
 def get_not_float_tensor_info(data):
+    if DumpUtil.summary_mode == "md5":
+        return DataInfo([], [], str(data.dtype), tuple(data.shape), get_md5_for_tensor(data))
     if data.numel() == 0 or data.dtype == torch.bool:
         tensor_max = []
         tensor_min = []
diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py
index 18ef57b9c1219ea591c4e473577f08ecf5bebd18..9ae980102121314205446bcd4e4d80fadbd74dad 100644
--- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py
@@ -41,4 +41,4 @@ class TestUtilsMethods(unittest.TestCase):
         data = [[1, 2], [3, 4]]
         x_data = torch.tensor(data)
         md5_value = get_md5_for_tensor(x_data)
-        self.assertEqual(md5_value, 2624136704)
+        self.assertEqual(md5_value, '9c692600')
diff --git a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py
index bfc631c66c86f061b10445e117e9f947d7ebdbc5..7ad66c0fa7d8f42c6078bd1d04f2164ebda0e9d8 100644
--- a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py
+++ b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py
@@ -37,9 +37,12 @@ class OverallPerformanceComparator(BaseComparator):
             self._headers.append('Mem Usage')
             base_col.append(f'{base_profiling_info.memory_used:.2f}G')
             comp_col.append(f'{comp_profiling_info.memory_used:.2f}G')
-        self._headers.extend(['Uncovered Communication Time'])
-        base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s'])
-        comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s'])
+        self._headers.extend(['Uncovered Communication Time(Wait Time)'])
+        if base_profiling_info.wait_time:
+            base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s({base_profiling_info.wait_time:.3f}s'])
+        else:
+            base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s( / )'])
+        comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s({comp_profiling_info.wait_time:.3f}s)'])
         if base_profiling_info.sdma_time or comp_profiling_info.sdma_time:
             self._headers.append('SDMA Time(Num)')
             base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})')
diff --git a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py
index 9184c790b7ea59246b602442a13e7e533d921bc8..b100e7ba9877d6bf4c316b590afc3a37a9346070 100644
--- a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py
+++ b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py
@@ -18,6 +18,7 @@ class ProfilingInfo:
         self.fa_num_bwd = 0
         self.compute_time = 0.0
         self.communication_not_overlapped = 0.0
+        self.wait_time = 0.0
         self.memory_used = 0.0
         self.e2e_time = 0.0
         self.sdma_time = 0.0
@@ -33,6 +34,7 @@ class ProfilingInfo:
         self.vec_time = self.vec_time / 10 ** 6
         self.compute_time = self.compute_time / 10 ** 6
         self.communication_not_overlapped = self.communication_not_overlapped / 10 ** 6
+        self.wait_time = self.wait_time / 10 ** 6
         self.e2e_time = self.e2e_time / 10 ** 6
         self.sdma_time = self.sdma_time / 10 ** 6
         self.scheduling_time = self.scheduling_time / 10 ** 6
@@ -84,6 +86,9 @@ class ProfilingInfo:
     def update_comm_not_overlap(self, time: float):
         self.communication_not_overlapped += time
 
+    def update_comm_not_overlap_wait_time(self, time: float):
+        self.wait_time = time
+
     def set_memory_used(self, memory: float):
         self.memory_used = memory
 
diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py
index f872e52a5314a40dbc2e0d4ff7868e875986b809..dfc7d7d4363d1feee7221a930ce6e08350ddb47b 100644
--- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py
+++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py
@@ -48,6 +48,7 @@ class NPUProfilingParser(BaseProfilingParser):
         if self._enable_profiling_compare:
             func_list.add(self._picking_overlap_analysis_data)
             func_list.add(self._picking_kernel_event)
+            func_list.add(self._picking_hccl_event)
         return list(func_list)
 
     def _update_memory_list(self):
@@ -98,10 +99,65 @@ class NPUProfilingParser(BaseProfilingParser):
         self.__parse_kernel_csv()
         self.__add_sdma_time()
         self.__add_overlap_analysis_time()
+        self._picking_notify_wait_event_and_not_overlap_event()
+        self.__add_overlap_wait_time()
         self._result_data.overall_metrics.calculate_other_time()
         self._result_data.overall_metrics.calculate_schedule_time()
         self._result_data.overall_metrics.trans_time_to_s()
 
+    def _picking_notify_wait_event_and_not_overlap_event(self):
+        self.notify_event_cache = []
+        self._not_overlaped_commu_event = []
+        for event in self._commu_task_list:
+            if event.name == 'Notify_Wait' and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_CHECK':
+                self.notify_event_cache.append(event)
+        for event in self._overlap_analysis:
+            if event.is_comm_not_overlap():
+                self._not_overlaped_commu_event.append(event)
+
+    def __add_overlap_wait_time(self):
+        notify_wait_event_dict = dict()
+        for notify_event in self.notify_event_cache:
+            if notify_event.tid in notify_wait_event_dict:
+                notify_wait_event_dict[notify_event.tid].append(notify_event)
+            else:
+                notify_wait_event_dict[notify_event.tid] = [notify_event]
+        total_time = 0
+        for commu_event in self._not_overlaped_commu_event:
+            commu_event_start_time = float(commu_event.start_time)
+            commu_event_end_time = float(commu_event.end_time)
+            wait_time_list = []
+
+            for plane_id, events in notify_wait_event_dict.items():
+                wait_time = 0
+                idx = 0
+                for notify_event in events:
+                    notify_event_start_time = float(notify_event.start_time)
+                    notify_event_end_time = float(notify_event.end_time)
+                    if notify_event_start_time < commu_event_start_time and notify_event_end_time > commu_event_end_time:
+                        wait_time = commu_event_end_time - commu_event_start_time
+                        break
+                    elif notify_event_start_time < commu_event_start_time and  commu_event_start_time <= notify_event_end_time <= commu_event_end_time:
+                        wait_time += notify_event_end_time - commu_event_start_time
+                        idx += 1
+                    elif commu_event_start_time <= notify_event_start_time <= commu_event_end_time and notify_event_end_time > commu_event_end_time:
+                        wait_time += commu_event_end_time - commu_event_start_time
+                        break
+                    elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= commu_event_end_time:
+                        wait_time += notify_event_end_time - notify_event_start_time
+                    elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= commu_event_end_time:
+                        wait_time += notify_event_end_time - notify_event_start_time
+                        idx += 1
+                    elif notify_event_end_time < commu_event_start_time:
+                        idx += 1
+                    else:
+                        break
+
+                wait_time_list.append(wait_time)
+                notify_wait_event_dict[plane_id] = notify_wait_event_dict[plane_id][idx:]
+            total_time += max(wait_time_list)
+        self._result_data.overall_metrics.update_comm_not_overlap(total_time)
+
     def _picking_hccl_event(self, event: TraceEventBean):
         if event.pid != self._hccl_pid or not event.is_x_mode():
             return False
diff --git a/sample/README.md b/sample/README.md
index 167b1a01cbd87c75eb6a6479a39fc198360a402f..6bd55a2f83422b2f0c8424c9687a38f1698aa6fb 100644
--- a/sample/README.md
+++ b/sample/README.md
@@ -5,12 +5,61 @@
 
 如果考虑商用集成，推荐使用CANN软件包中的AscendC样例工程，比如：ascendc_kernel_cmake目录。本项目中的工程就是基于其进行简化仅用于快速验证。
 
+说明：该sample目录中，每个最小目录就是一个完整的样例工程。这些样例工程本身可能以为依赖的不同存在差异。
+
 ## 依赖说明
 安装CANN包，并使能环境变量，并确保```ASCEND_HOME_PATH```生效，可以在CANN包安装目录下使能：
 ```
 source set_env.sh
 ```
 
+## 目录介绍
+整体目录结构如下：
+```
+- sample
+  |- build              # 编译并运行所有样例内容（建议按需使用，此处命令可以参考
+  |- normal_sample      # 纯C/C++的AscendC单算子极简工程，可配合msdebug和msprof工具
+    |- cube_only        # 仅含aic的AscendC单算子极简工程
+    |- mix              # mix算子的AscendC单算子极简工程
+    |- vec_only         # 仅含aiv的AscendC单算子极简工程
+  |- pytorch_adapter    # 适配pytorch的AscendC单算子极简工程，可配合msdebug和msprof工具
+    |- jit_compile      # jit模式，运行时编译使用
+    |- with_setuptools  # 编译成wheel包安装使用
+  |- sanitizer_sample   # 异常样例，用于配合mssanitizer工具
+    |- racecheck        # 含竞争问题的样例
+    |- xx               # 其他异常样例 
+```
+
+如果你关注自定义算子的pytorch框架适配，详见[此处](./pytorch_adapter/README.md)
+
+
+## 算子调试 msdebug
+若使用msdebug进行上板调试，还需要额外调整，具体如下：
+1. 编译阶段：在```sample\normal_sample\vec_only```相对路径下的```Makefile```文件中修改如下内容：
+    + 调试信息增强，并扩大栈空间：
+    ```
+    COMPILER_FLAG		:= -xcce -O2 -std=c++17
+    修改为：
+    COMPILER_FLAG		:= -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true
+    ```
+
+2. 运行阶段：
+```
+msdebug ./*.fatbin
+```
+
+## 内存检测 sanitizer
+1. 编译阶段：在编译过程中添加```--cce-enable-sanitizer -g```参数, 在链接过程中添加```--cce-enable-sanitizer```参数。（现样例中已在Makefile中添加），执行如下命令：
+```
+make
+```
+
+2. 运行阶段：
+```
+mssanitizer ./*.fatbin  # 默认进行memcheck检查
+```
+
+
 ## 算子调优
 算子调优工具可以支持上板和仿真算子的调优，下面将以vec_only中的算子为例，进行工具使用的实战命令讲解
 
@@ -84,30 +133,3 @@ source set_env.sh
         └── trace.json                          # 算子所有核的流水图
     ```
 4. 更多指标信息请参考算子开发工具使用手册。
-
-## 算子调试msdebug
-若使用msdebug进行上板调试，还需要额外调整，具体如下：
-1. 编译阶段：在```sample\normal_sample\vec_only```相对路径下的```Makefile```文件中修改如下内容：
-    + 调试信息增强，并扩大栈空间：
-    ```
-    COMPILER_FLAG		:= -xcce -O2 -std=c++17
-    修改为：
-    COMPILER_FLAG		:= -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true
-
-## 内存检测 sanitizer
-### sanitizer_sample目录介绍
-
-此目录下为sanitizer对应的样例库，包含竞争检测和内存检测相关的样例。
-
-#### Racecheck目录介绍
-
-Racecheck为竞争检测相关的样例。
-
-raw_error_kernel.cpp文件为UB上先读后写竞争和GM上先写后读竞争问题的样例。
-
-
-运行阶段：
-
-```
-/usr/local/Ascend/ascend-toolkit/latest/tools/mssanitizer/bin/mssanitizer --tool=racecheck ./raw_error.fatbin
-```
\ No newline at end of file
diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a2b1ba63570058ac954a121f4b14b396f5dace81
--- /dev/null
+++ b/sample/pytorch_adapter/README.md
@@ -0,0 +1,53 @@
+# 自定义算子的pytorch框架适配说明
+
+## 简介
+昇腾提供丰富的算子接入框架的方式，此处将介绍最简单的一种，每个目录中都是一个独立的可使用的工程
+
+## 依赖
+与业内pytorch的算子介入方式相同，算子接入框架需要保障设备上有正确的pytorch版本（我们还依赖torch_npu版本）
+
+pytorch版本可由pip安装，torch_npu版本详见[此处](https://gitee.com/ascend/pytorch/releases)，请选择与pytorch适配的torch_npu版本。
+
+## 工程介绍
+整体工程目录如下：
+```
+- pytorch_adapter
+  |- jit_compile         # 实时编译的接入方式
+    |- add_adapter.cpp   # 使用算子动态库接口完成算子在pytorch框架的适配
+    |- add_kernel.cpp    # 昇腾算子实现，并提供host侧的动态库接口
+    |- main.py           # python的入口，实现整体集成
+    |- Makefile          # 用以生成昇腾算子的host侧动态库的编译脚本
+  |- with_setuptools     # wheel包的接入方式
+    |- add_adapter.cpp
+    |- add_kernel.cpp
+    |- Makefile
+    |- setup.py          # setuptools的入口，支持编译并打包生成wheel包
+    |- test.py           # 测试wheel包功能的入口
+```
+
+## 工程使用
+
+### jit_compile工程
+执行如下命令，就会在运行过程中，现场生成python模块并使用：
+```
+python main.py
+```
+
+### setuptools工程
+针对with_setuptools工程，可以编译出可安装的wheel包，便于多机部署使用。
+
+
+1. 执行如下命令可以编译出软件包(setuptools可以支持多种方式，比如：build,install等，此处不一一展示)：
+```
+pytorch setup.py bdist_wheel    # 编译出wheel包，在dist目录下
+```
+
+2. 到```dist```目录下用pip命令安装对应软件包。
+
+3. 执行测试脚本
+```
+python test.py
+```
+
+## 其他
+1. 此处样例使用的是静态tiling，如果使用动态tiling，则可以在adapter.cpp中对Tensor的shape进行分析，选择合适tiling。（这部分是流程中必须的，只是可能在不同位置，比如aclnn中，这部分在接口实现；此处，我们本身也可以对add_custom_do进行封装，将tiling内置。）
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/Makefile b/sample/pytorch_adapter/jit_compile/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ec9115f377a578677470b89f365583dfcf246515
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/Makefile
@@ -0,0 +1,20 @@
+# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化
+ASCEND_HOME_PATH			?= /usr/local/Ascend/ascend-toolkit/latest
+
+COMPILER					:= $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec  # 参考device_config.cmake中CMAKE_C_COMPILER配置
+COMPILER_FLAG				:= -xcce -O2 -std=c++17
+DYNAMIC_LIB_FLAG			:= -fPIC -shared
+DAV_FLAG                    := --cce-aicore-arch=dav-c220-vec
+ASCENDC_INC_FLAG 			:= -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include  # 参考device_intf.cmake的配置简化
+
+all: build
+
+build: libcustom_kernels.so
+
+# 后续如果要扩展，把多个kernel的cpp都加到后面
+libcustom_kernels.so: add_kernel.cpp
+	$(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^
+
+.PHONY: clean
+clean:
+	rm *.so
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/add_adapter.cpp b/sample/pytorch_adapter/jit_compile/add_adapter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c65e60ec596fe8b5627e06f678549b5f2f05660
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/add_adapter.cpp
@@ -0,0 +1,128 @@
+#include <torch/extension.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
+
+// 为NPU设备注册前向实现
+at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
+{
+    // 创建输出内存
+    at::Tensor result = at::Tensor(self);
+    // 将pytorch中的结构翻译成为CANN认识的数据类型和结构
+    // 1. (重要)通过对tensor的shape分析，选择合适的tiling（该算子为了简化，固定了tiling，只有特定shape下计算才正确）
+    // 2. 对数据类型和格式转换  -- 此处无需数据格式处理，直接使用
+    auto stream = c10_npu::getCurrentNPUStream().stream(false);
+    auto x = self.storage().data();
+    auto y = other.storage().data();
+    auto z = result.storage().data();
+
+    uint32_t blockDim = 8;
+    auto callback = [stream, blockDim, x, y, z]() -> int {
+        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
+        return 0;  // 此处可以通过某种方式获取算子执行结果，还未实现
+    };
+    // 下发算子
+    at_npu::native::OpCommand cmd;
+    cmd.Name("my_add").SetCustomHandler(callback).Run();
+    return result;
+}
+
+// 为NPU设备注册反向实现
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
+{
+    at::Tensor result = at::Tensor(self);  // 创建输出内存
+
+    return {result, result};
+}
+
+// 为Meta设备注册前向实现
+at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
+{
+    return empty_like(self);
+}
+
+// 为Meta设备注册反向实现
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
+{
+    auto result = empty_like(self);
+    return std::make_tuple(result, result);
+}
+
+// 寻找注册在该op上的不同设备的实现
+at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
+{
+    static auto op =
+        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
+    return op.call(self, other);
+}
+// 寻找注册在该op上的不同设备的实现
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
+{
+    static auto op = torch::Dispatcher::singleton()
+                         .findSchemaOrThrow("myaten::my_add_backward", "")
+                         .typed<decltype(my_add_backward_impl)>();
+    return op.call(self);
+}
+
+// 在myaten命名空间里注册my_add和my_add_backward两个schema
+TORCH_LIBRARY(myaten, m)
+{
+    m.def("my_add(Tensor self, Tensor other) -> Tensor");
+    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
+}
+
+// 通过继承torch::autograd::Function类实现前反向绑定
+class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
+public:
+    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        return my_add_impl(self, other);
+    }
+
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
+    {
+        auto grad_output = grad_outputs[0];
+        auto result = my_add_backward_impl(grad_output);
+        return {std::get<0>(result), std::get<1>(result)};
+    }
+};
+
+at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
+{
+    return MyAddFunction::apply(self, other);
+}
+
+// 给op绑定NPU的自动求导实现
+// 如果是pytorch 2.1以下的版本，AutogradPrivateUse1需要改成AutogradXLA
+TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_autograd);
+}
+
+// 为NPU设备注册前反向实现
+// NPU设备在pytorch 2.1及以上版本使用的设备名称是PrivateUse1，在2.1以下版本用的是XLA，如果是2.1以下版本PrivateUse1需要改成XLA
+TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_npu);
+    m.impl("my_add_backward", &my_add_backward_impl_npu);
+}
+
+// 为Meta设备注册前反向实现
+TORCH_LIBRARY_IMPL(myaten, Meta, m)
+{
+    m.impl("my_add", &my_add_impl_meta);
+    m.impl("my_add_backward", &my_add_backward_impl_meta);
+}
+
+// 通过pybind将c++接口和python接口绑定
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("my_add", &my_add_impl_autograd, "x + y");
+}
diff --git a/sample/pytorch_adapter/jit_compile/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ *
+ * Function : z = x + y
+ * This sample is a very basic sample that implements vector add on Ascend plaform.
+ * In this sample:
+ * Length of x / y / z is 8*2048.
+ * Num of vector core used in sample is 8.
+ * Length for each core to compute is 2048.
+ * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop.
+ *
+ */
+#include "kernel_operator.h"
+using namespace AscendC;
+constexpr int32_t TOTAL_LENGTH = 8 * 2048;                             // total length of data
+constexpr int32_t USE_CORE_NUM = 8;                                    // num of core used
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;          // length computed of each core
+constexpr int32_t TILE_NUM = 8;                                        // split data into 8 tiles for each core
+constexpr int32_t BUFFER_NUM = 2;                                      // tensor num for each queue
+constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM;  // seperate to 2 parts, due to double buffer
+
+class KernelAdd {
+public:
+    __aicore__ inline KernelAdd()
+    {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+    {
+        // get start index for current core, core parallel
+        xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        // pipe alloc memory to queue, the unit is Bytes
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+    }
+    __aicore__ inline void Process()
+    {
+        // loop count need to be doubled, due to double buffer
+        constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
+        // tiling strategy, pipeline parallel
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        // alloc tensor from queue memory
+        LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+        LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+        // copy progress_th tile from global tensor to local tensor
+        DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH);
+        DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
+        // enque input tensors to VECIN queue
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    {
+        // deque input tensors from VECIN queue
+        LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+        LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+        LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+        // call Add instr for computation
+        Add(zLocal, xLocal, yLocal, TILE_LENGTH);
+        // enque the output tensor to VECOUT queue
+        outQueueZ.EnQue<half>(zLocal);
+        // free input tensors for reuse
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        // deque output tensor from VECOUT queue
+        LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+        // copy progress_th tile from local tensor to global tensor
+        DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH);
+        // free output tensor for reuse
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    TPipe pipe;
+    // create queues for input, in this case depth is equal to buffer num
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
+    // create queue for output, in this case depth is equal to buffer num
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
+    GlobalTensor<half> xGm, yGm, zGm;
+};
+// implementation of kernel function
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+{
+    KernelAdd op;
+    op.Init(x, y, z);
+    op.Process();
+}
+
+// 包裹核函数，使得普通编译器能认识这个符号
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z)
+{
+    add_custom<<<blockDim, nullptr, stream>>>(x, y, z);
+}
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..847a51f1c4787dcf353759d1115f352c1c760353
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/main.py
@@ -0,0 +1,70 @@
+import os
+import subprocess
+import torch
+import torch_npu
+import torch.utils.cpp_extension
+from torch_npu.testing.testcase import TestCase, run_tests
+
+PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
+CUR_PATH = os.path.abspath(os.path.dirname(__file__))
+
+
+def compile_kernels():
+    # 由于pytorch中没有昇腾device编译的扩展，所以此处人工加make
+    subprocess.run("make")
+
+
+def compile_host():
+    extra_ldflags = []
+    extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib")
+    extra_ldflags.append("-ltorch_npu")
+    extra_ldflags.append(f"-L{CUR_PATH}/")
+    extra_ldflags.append("-lcustom_kernels")
+    extra_include_paths = []
+    extra_include_paths.append("./")
+    extra_include_paths.append(os.path.join(
+        PYTORCH_NPU_INSTALL_PATH, "include"))
+    extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join(
+        PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc"))
+
+    module = torch.utils.cpp_extension.load(
+        name="jit_extension",
+        sources=[
+            "add_adapter.cpp"
+        ],
+        extra_include_paths=extra_include_paths,
+        extra_ldflags=extra_ldflags,
+        verbose=True)
+    return module
+
+
+class TestCustomAdd(TestCase):
+    def test_add(self):
+        module = compile_host()
+        # 由于kernel现在是静态tiling，所以此处尺寸需要匹配
+        # 因为add是elementwise的，现有算子支持8*2048(详见kernel实现)，所以，小于这个应该都可以
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float16)
+        y = torch.rand(length, device='cpu', dtype=torch.float16)
+
+        x_npu = x.npu()
+        y_npu = y.npu()
+        x_npu.requires_grad = True
+        y_npu.requires_grad = True
+        output = module.my_add(x_npu, y_npu)
+        # 反向能力验证
+        output.backward(output)
+
+        x.requires_grad = True
+        y.requires_grad = True
+        cpuout = torch.add(x, y)
+        cpuout.backward(cpuout)
+
+        self.assertRtolEqual(output, cpuout)
+        self.assertRtolEqual(x_npu.grad, x.grad)
+        self.assertRtolEqual(y_npu.grad, y.grad)
+
+
+if __name__ == '__main__':
+    compile_kernels()
+    run_tests()
diff --git a/sample/pytorch_adapter/with_setuptools/Makefile b/sample/pytorch_adapter/with_setuptools/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ec9115f377a578677470b89f365583dfcf246515
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/Makefile
@@ -0,0 +1,20 @@
+# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化
+ASCEND_HOME_PATH			?= /usr/local/Ascend/ascend-toolkit/latest
+
+COMPILER					:= $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec  # 参考device_config.cmake中CMAKE_C_COMPILER配置
+COMPILER_FLAG				:= -xcce -O2 -std=c++17
+DYNAMIC_LIB_FLAG			:= -fPIC -shared
+DAV_FLAG                    := --cce-aicore-arch=dav-c220-vec
+ASCENDC_INC_FLAG 			:= -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include  # 参考device_intf.cmake的配置简化
+
+all: build
+
+build: libcustom_kernels.so
+
+# 后续如果要扩展，把多个kernel的cpp都加到后面
+libcustom_kernels.so: add_kernel.cpp
+	$(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^
+
+.PHONY: clean
+clean:
+	rm *.so
\ No newline at end of file
diff --git a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c65e60ec596fe8b5627e06f678549b5f2f05660
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
@@ -0,0 +1,128 @@
+#include <torch/extension.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
+
+// 为NPU设备注册前向实现
+at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
+{
+    // 创建输出内存
+    at::Tensor result = at::Tensor(self);
+    // 将pytorch中的结构翻译成为CANN认识的数据类型和结构
+    // 1. (重要)通过对tensor的shape分析，选择合适的tiling（该算子为了简化，固定了tiling，只有特定shape下计算才正确）
+    // 2. 对数据类型和格式转换  -- 此处无需数据格式处理，直接使用
+    auto stream = c10_npu::getCurrentNPUStream().stream(false);
+    auto x = self.storage().data();
+    auto y = other.storage().data();
+    auto z = result.storage().data();
+
+    uint32_t blockDim = 8;
+    auto callback = [stream, blockDim, x, y, z]() -> int {
+        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
+        return 0;  // 此处可以通过某种方式获取算子执行结果，还未实现
+    };
+    // 下发算子
+    at_npu::native::OpCommand cmd;
+    cmd.Name("my_add").SetCustomHandler(callback).Run();
+    return result;
+}
+
+// 为NPU设备注册反向实现
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
+{
+    at::Tensor result = at::Tensor(self);  // 创建输出内存
+
+    return {result, result};
+}
+
+// 为Meta设备注册前向实现
+at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
+{
+    return empty_like(self);
+}
+
+// 为Meta设备注册反向实现
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
+{
+    auto result = empty_like(self);
+    return std::make_tuple(result, result);
+}
+
+// 寻找注册在该op上的不同设备的实现
+at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
+{
+    static auto op =
+        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
+    return op.call(self, other);
+}
+// 寻找注册在该op上的不同设备的实现
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
+{
+    static auto op = torch::Dispatcher::singleton()
+                         .findSchemaOrThrow("myaten::my_add_backward", "")
+                         .typed<decltype(my_add_backward_impl)>();
+    return op.call(self);
+}
+
+// 在myaten命名空间里注册my_add和my_add_backward两个schema
+TORCH_LIBRARY(myaten, m)
+{
+    m.def("my_add(Tensor self, Tensor other) -> Tensor");
+    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
+}
+
+// 通过继承torch::autograd::Function类实现前反向绑定
+class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
+public:
+    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        return my_add_impl(self, other);
+    }
+
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
+    {
+        auto grad_output = grad_outputs[0];
+        auto result = my_add_backward_impl(grad_output);
+        return {std::get<0>(result), std::get<1>(result)};
+    }
+};
+
+at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
+{
+    return MyAddFunction::apply(self, other);
+}
+
+// 给op绑定NPU的自动求导实现
+// 如果是pytorch 2.1以下的版本，AutogradPrivateUse1需要改成AutogradXLA
+TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_autograd);
+}
+
+// 为NPU设备注册前反向实现
+// NPU设备在pytorch 2.1及以上版本使用的设备名称是PrivateUse1，在2.1以下版本用的是XLA，如果是2.1以下版本PrivateUse1需要改成XLA
+TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_npu);
+    m.impl("my_add_backward", &my_add_backward_impl_npu);
+}
+
+// 为Meta设备注册前反向实现
+TORCH_LIBRARY_IMPL(myaten, Meta, m)
+{
+    m.impl("my_add", &my_add_impl_meta);
+    m.impl("my_add_backward", &my_add_backward_impl_meta);
+}
+
+// 通过pybind将c++接口和python接口绑定
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("my_add", &my_add_impl_autograd, "x + y");
+}
diff --git a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ *
+ * Function : z = x + y
+ * This sample is a very basic sample that implements vector add on Ascend plaform.
+ * In this sample:
+ * Length of x / y / z is 8*2048.
+ * Num of vector core used in sample is 8.
+ * Length for each core to compute is 2048.
+ * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop.
+ *
+ */
+#include "kernel_operator.h"
+using namespace AscendC;
+constexpr int32_t TOTAL_LENGTH = 8 * 2048;                             // total length of data
+constexpr int32_t USE_CORE_NUM = 8;                                    // num of core used
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;          // length computed of each core
+constexpr int32_t TILE_NUM = 8;                                        // split data into 8 tiles for each core
+constexpr int32_t BUFFER_NUM = 2;                                      // tensor num for each queue
+constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM;  // seperate to 2 parts, due to double buffer
+
+class KernelAdd {
+public:
+    __aicore__ inline KernelAdd()
+    {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+    {
+        // get start index for current core, core parallel
+        xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        // pipe alloc memory to queue, the unit is Bytes
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+    }
+    __aicore__ inline void Process()
+    {
+        // loop count need to be doubled, due to double buffer
+        constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
+        // tiling strategy, pipeline parallel
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        // alloc tensor from queue memory
+        LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+        LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+        // copy progress_th tile from global tensor to local tensor
+        DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH);
+        DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
+        // enque input tensors to VECIN queue
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    {
+        // deque input tensors from VECIN queue
+        LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+        LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+        LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+        // call Add instr for computation
+        Add(zLocal, xLocal, yLocal, TILE_LENGTH);
+        // enque the output tensor to VECOUT queue
+        outQueueZ.EnQue<half>(zLocal);
+        // free input tensors for reuse
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        // deque output tensor from VECOUT queue
+        LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+        // copy progress_th tile from local tensor to global tensor
+        DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH);
+        // free output tensor for reuse
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    TPipe pipe;
+    // create queues for input, in this case depth is equal to buffer num
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
+    // create queue for output, in this case depth is equal to buffer num
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
+    GlobalTensor<half> xGm, yGm, zGm;
+};
+// implementation of kernel function
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+{
+    KernelAdd op;
+    op.Init(x, y, z);
+    op.Process();
+}
+
+// 包裹核函数，使得普通编译器能认识这个符号
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z)
+{
+    add_custom<<<blockDim, nullptr, stream>>>(x, y, z);
+}
\ No newline at end of file
diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ab1d3c78c7866b4bd53d9531bf0674c8b2987e
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/setup.py
@@ -0,0 +1,51 @@
+import os
+import subprocess
+import torch
+import torch_npu
+from setuptools import setup, find_packages
+from torch.utils.cpp_extension import BuildExtension
+from torch_npu.utils.cpp_extension import NpuExtension
+
+PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
+CUR_PATH = os.path.abspath(os.path.dirname(__file__))
+
+
+def compile_kernels():
+    # 由于pytorch中没有昇腾device编译的扩展，所以此处人工加make
+    subprocess.run("make")
+    return "libcustom_kernels.so"  # 这个make出来的库名字
+
+
+def compile_adapter():
+    ext = NpuExtension(
+        name="ascend_custom_kernels_lib",  # import的库的名字
+        # 如果还有其他cpp文件参与编译，需要在这里添加
+        sources=[f"{CUR_PATH}/add_adapter.cpp"],
+        extra_compile_args=[
+            '-I' + os.path.join(os.path.join(os.path.join(os.path.join(
+                PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc"),
+        ],
+        library_dirs=[f"{CUR_PATH}"],  # 编译时需要依赖的库文件的路径，相当于g++编译时的-L选项
+        libraries=["custom_kernels"],  # 编译时依赖的库文件，相当于-l选项
+    )
+    return [ext]
+
+
+if __name__ == "__main__":
+    # 编译出含有算子的库，并以so的方式提供
+    kernel_so = compile_kernels()
+
+    # 编译出pytorch适配层的库，支持被框架集成
+    exts = compile_adapter()
+
+    # 将整体打包成wheel包
+    setup(
+        name="ascend_custom_kernels",  # package的名字
+        version='1.0',
+        keywords='ascend_custom_kernels',
+        ext_modules=exts,
+        packages=find_packages(),
+        cmdclass={"build_ext": BuildExtension},
+        data_files=[(".", [kernel_so])],
+        include_package_data=True,
+    )
diff --git a/sample/pytorch_adapter/with_setuptools/test.py b/sample/pytorch_adapter/with_setuptools/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..896eef2c0fbb1a113377fb7dc770f45fd99832f4
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/test.py
@@ -0,0 +1,34 @@
+import torch
+import torch_npu
+import ascend_custom_kernels_lib
+from torch_npu.testing.testcase import TestCase, run_tests
+
+
+class TestCustomAdd(TestCase):
+    def test_add(self):
+        # 由于kernel现在是静态tiling，所以此处尺寸需要匹配
+        # 因为add是elementwise的，现有算子支持8*2048(详见kernel实现)，所以，小于这个应该都可以
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float16)
+        y = torch.rand(length, device='cpu', dtype=torch.float16)
+
+        x_npu = x.npu()
+        y_npu = y.npu()
+        x_npu.requires_grad = True
+        y_npu.requires_grad = True
+        output = ascend_custom_kernels_lib.my_add(x_npu, y_npu)
+        # 反向能力验证
+        output.backward(output)
+
+        x.requires_grad = True
+        y.requires_grad = True
+        cpuout = torch.add(x, y)
+        cpuout.backward(cpuout)
+
+        self.assertRtolEqual(output, cpuout)
+        self.assertRtolEqual(x_npu.grad, x.grad)
+        self.assertRtolEqual(y_npu.grad, y.grad)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/sample/third_party/lib/libruntime.so.aarch64 b/sample/third_party/lib/libruntime.so.aarch64
deleted file mode 100644
index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime.so.aarch64 and /dev/null differ
diff --git a/sample/third_party/lib/libruntime.so.x86 b/sample/third_party/lib/libruntime.so.x86
deleted file mode 100644
index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime.so.x86 and /dev/null differ
diff --git a/sample/third_party/lib/libruntime_camodel.so.aarch64 b/sample/third_party/lib/libruntime_camodel.so.aarch64
deleted file mode 100644
index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime_camodel.so.aarch64 and /dev/null differ
diff --git a/sample/third_party/lib/libruntime_camodel.so.x86 b/sample/third_party/lib/libruntime_camodel.so.x86
deleted file mode 100644
index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime_camodel.so.x86 and /dev/null differ