diff --git a/debug/accuracy_tools/api_accuracy_checker/common/config.py b/debug/accuracy_tools/api_accuracy_checker/common/config.py index 55b25d25c6d3b0b673596fc22358e573a4e889d0..f53a2f97c7abefd057cc5293a1988e75ea18c56a 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/config.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/config.py @@ -56,12 +56,13 @@ class Config: def __str__(self): return '\n'.join(f"{key}={value}" for key, value in self.config.items()) - def update_config(self, dump_path=None, real_data=False, target_iter=None, white_list=None): + def update_config(self, dump_path=None, real_data=False, target_iter=None, white_list=None, enable_dataloader=False): args = { "dump_path": dump_path if dump_path else self.config.get("dump_path", './'), "real_data": real_data, "target_iter": target_iter if target_iter else self.config.get("target_iter", [1]), - "white_list": white_list if white_list else self.config.get("white_list", []) + "white_list": white_list if white_list else self.config.get("white_list", []), + "enable_dataloader": enable_dataloader } for key, value in args.items(): if key in self.config: diff --git a/debug/accuracy_tools/api_accuracy_checker/config.yaml b/debug/accuracy_tools/api_accuracy_checker/config.yaml index ff3b91de7e07ce0055bf14ee3265f49dde2cc4df..a6e70c57ebaec9141434499cfebe2aed6c21a7be 100644 --- a/debug/accuracy_tools/api_accuracy_checker/config.yaml +++ b/debug/accuracy_tools/api_accuracy_checker/config.yaml @@ -1,6 +1,6 @@ dump_path: './' real_data: False -enable_dataloader: True +enable_dataloader: False target_iter: [1] white_list: [] error_data_path: './' diff --git a/sample/README.md b/sample/README.md index 167b1a01cbd87c75eb6a6479a39fc198360a402f..6bd55a2f83422b2f0c8424c9687a38f1698aa6fb 100644 --- a/sample/README.md +++ b/sample/README.md @@ -5,12 +5,61 @@ 如果考虑商用集成,推荐使用CANN软件包中的AscendC样例工程,比如:ascendc_kernel_cmake目录。本项目中的工程就是基于其进行简化仅用于快速验证。 +说明:该sample目录中,每个最小目录就是一个完整的样例工程。这些样例工程本身可能以为依赖的不同存在差异。 + ## 依赖说明 安装CANN包,并使能环境变量,并确保```ASCEND_HOME_PATH```生效,可以在CANN包安装目录下使能: ``` source set_env.sh ``` +## 目录介绍 +整体目录结构如下: +``` +- sample + |- build # 编译并运行所有样例内容(建议按需使用,此处命令可以参考 + |- normal_sample # 纯C/C++的AscendC单算子极简工程,可配合msdebug和msprof工具 + |- cube_only # 仅含aic的AscendC单算子极简工程 + |- mix # mix算子的AscendC单算子极简工程 + |- vec_only # 仅含aiv的AscendC单算子极简工程 + |- pytorch_adapter # 适配pytorch的AscendC单算子极简工程,可配合msdebug和msprof工具 + |- jit_compile # jit模式,运行时编译使用 + |- with_setuptools # 编译成wheel包安装使用 + |- sanitizer_sample # 异常样例,用于配合mssanitizer工具 + |- racecheck # 含竞争问题的样例 + |- xx # 其他异常样例 +``` + +如果你关注自定义算子的pytorch框架适配,详见[此处](./pytorch_adapter/README.md) + + +## 算子调试 msdebug +若使用msdebug进行上板调试,还需要额外调整,具体如下: +1. 编译阶段:在```sample\normal_sample\vec_only```相对路径下的```Makefile```文件中修改如下内容: + + 调试信息增强,并扩大栈空间: + ``` + COMPILER_FLAG := -xcce -O2 -std=c++17 + 修改为: + COMPILER_FLAG := -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true + ``` + +2. 运行阶段: +``` +msdebug ./*.fatbin +``` + +## 内存检测 sanitizer +1. 编译阶段:在编译过程中添加```--cce-enable-sanitizer -g```参数, 在链接过程中添加```--cce-enable-sanitizer```参数。(现样例中已在Makefile中添加),执行如下命令: +``` +make +``` + +2. 运行阶段: +``` +mssanitizer ./*.fatbin # 默认进行memcheck检查 +``` + + ## 算子调优 算子调优工具可以支持上板和仿真算子的调优,下面将以vec_only中的算子为例,进行工具使用的实战命令讲解 @@ -84,30 +133,3 @@ source set_env.sh └── trace.json # 算子所有核的流水图 ``` 4. 更多指标信息请参考算子开发工具使用手册。 - -## 算子调试msdebug -若使用msdebug进行上板调试,还需要额外调整,具体如下: -1. 
编译阶段:在```sample\normal_sample\vec_only```相对路径下的```Makefile```文件中修改如下内容: - + 调试信息增强,并扩大栈空间: - ``` - COMPILER_FLAG := -xcce -O2 -std=c++17 - 修改为: - COMPILER_FLAG := -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true - -## 内存检测 sanitizer -### sanitizer_sample目录介绍 - -此目录下为sanitizer对应的样例库,包含竞争检测和内存检测相关的样例。 - -#### Racecheck目录介绍 - -Racecheck为竞争检测相关的样例。 - -raw_error_kernel.cpp文件为UB上先读后写竞争和GM上先写后读竞争问题的样例。 - - -运行阶段: - -``` -/usr/local/Ascend/ascend-toolkit/latest/tools/mssanitizer/bin/mssanitizer --tool=racecheck ./raw_error.fatbin -``` \ No newline at end of file diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a2b1ba63570058ac954a121f4b14b396f5dace81 --- /dev/null +++ b/sample/pytorch_adapter/README.md @@ -0,0 +1,53 @@ +# 自定义算子的pytorch框架适配说明 + +## 简介 +昇腾提供丰富的算子接入框架的方式,此处将介绍最简单的一种,每个目录中都是一个独立的可使用的工程 + +## 依赖 +与业内pytorch的算子介入方式相同,算子接入框架需要保障设备上有正确的pytorch版本(我们还依赖torch_npu版本) + +pytorch版本可由pip安装,torch_npu版本详见[此处](https://gitee.com/ascend/pytorch/releases),请选择与pytorch适配的torch_npu版本。 + +## 工程介绍 +整体工程目录如下: +``` +- pytorch_adapter + |- jit_compile # 实时编译的接入方式 + |- add_adapter.cpp # 使用算子动态库接口完成算子在pytorch框架的适配 + |- add_kernel.cpp # 昇腾算子实现,并提供host侧的动态库接口 + |- main.py # python的入口,实现整体集成 + |- Makefile # 用以生成昇腾算子的host侧动态库的编译脚本 + |- with_setuptools # wheel包的接入方式 + |- add_adapter.cpp + |- add_kernel.cpp + |- Makefile + |- setup.py # setuptools的入口,支持编译并打包生成wheel包 + |- test.py # 测试wheel包功能的入口 +``` + +## 工程使用 + +### jit_compile工程 +执行如下命令,就会在运行过程中,现场生成python模块并使用: +``` +python main.py +``` + +### setuptools工程 +针对with_setuptools工程,可以编译出可安装的wheel包,便于多机部署使用。 + + +1. 执行如下命令可以编译出软件包(setuptools可以支持多种方式,比如:build,install等,此处不一一展示): +``` +pytorch setup.py bdist_wheel # 编译出wheel包,在dist目录下 +``` + +2. 到```dist```目录下用pip命令安装对应软件包。 + +3. 执行测试脚本 +``` +python test.py +``` + +## 其他 +1. 
此处样例使用的是静态tiling,如果使用动态tiling,则可以在adapter.cpp中对Tensor的shape进行分析,选择合适tiling。(这部分是流程中必须的,只是可能在不同位置,比如aclnn中,这部分在接口实现;此处,我们本身也可以对add_custom_do进行封装,将tiling内置。) \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_compile/Makefile b/sample/pytorch_adapter/jit_compile/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ec9115f377a578677470b89f365583dfcf246515 --- /dev/null +++ b/sample/pytorch_adapter/jit_compile/Makefile @@ -0,0 +1,20 @@ +# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 +ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest + +COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 +COMPILER_FLAG := -xcce -O2 -std=c++17 +DYNAMIC_LIB_FLAG := -fPIC -shared +DAV_FLAG := --cce-aicore-arch=dav-c220-vec +ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 + +all: build + +build: libcustom_kernels.so + +# 后续如果要扩展,把多个kernel的cpp都加到后面 +libcustom_kernels.so: add_kernel.cpp + $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ + +.PHONY: clean +clean: + rm *.so \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_compile/add_adapter.cpp b/sample/pytorch_adapter/jit_compile/add_adapter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c65e60ec596fe8b5627e06f678549b5f2f05660 --- /dev/null +++ b/sample/pytorch_adapter/jit_compile/add_adapter.cpp @@ -0,0 +1,128 @@ +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/framework/OpCommand.h" + +using torch::autograd::AutogradContext; +using torch::autograd::Function; +using tensor_list = std::vector; +using namespace at; + +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); + +// 为NPU设备注册前向实现 +at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other) +{ + // 创建输出内存 + at::Tensor result = at::Tensor(self); + // 将pytorch中的结构翻译成为CANN认识的数据类型和结构 + // 1. (重要)通过对tensor的shape分析,选择合适的tiling(该算子为了简化,固定了tiling,只有特定shape下计算才正确) + // 2. 
对数据类型和格式转换 -- 此处无需数据格式处理,直接使用 + auto stream = c10_npu::getCurrentNPUStream().stream(false); + auto x = self.storage().data(); + auto y = other.storage().data(); + auto z = result.storage().data(); + + uint32_t blockDim = 8; + auto callback = [stream, blockDim, x, y, z]() -> int { + add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z); + return 0; // 此处可以通过某种方式获取算子执行结果,还未实现 + }; + // 下发算子 + at_npu::native::OpCommand cmd; + cmd.Name("my_add").SetCustomHandler(callback).Run(); + return result; +} + +// 为NPU设备注册反向实现 +std::tuple my_add_backward_impl_npu(const at::Tensor &self) +{ + at::Tensor result = at::Tensor(self); // 创建输出内存 + + return {result, result}; +} + +// 为Meta设备注册前向实现 +at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other) +{ + return empty_like(self); +} + +// 为Meta设备注册反向实现 +std::tuple my_add_backward_impl_meta(const at::Tensor &self) +{ + auto result = empty_like(self); + return std::make_tuple(result, result); +} + +// 寻找注册在该op上的不同设备的实现 +at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other) +{ + static auto op = + torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed(); + return op.call(self, other); +} +// 寻找注册在该op上的不同设备的实现 +std::tuple my_add_backward_impl(const at::Tensor &self) +{ + static auto op = torch::Dispatcher::singleton() + .findSchemaOrThrow("myaten::my_add_backward", "") + .typed(); + return op.call(self); +} + +// 在myaten命名空间里注册my_add和my_add_backward两个schema +TORCH_LIBRARY(myaten, m) +{ + m.def("my_add(Tensor self, Tensor other) -> Tensor"); + m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)"); +} + +// 通过继承torch::autograd::Function类实现前反向绑定 +class MyAddFunction : public torch::autograd::Function { +public: + static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other) + { + at::AutoDispatchBelowADInplaceOrView guard; + return my_add_impl(self, other); + } + + static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) + { + auto grad_output = grad_outputs[0]; + auto result = my_add_backward_impl(grad_output); + return {std::get<0>(result), std::get<1>(result)}; + } +}; + +at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other) +{ + return MyAddFunction::apply(self, other); +} + +// 给op绑定NPU的自动求导实现 +// 如果是pytorch 2.1以下的版本,AutogradPrivateUse1需要改成AutogradXLA +TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_autograd); +} + +// 为NPU设备注册前反向实现 +// NPU设备在pytorch 2.1及以上版本使用的设备名称是PrivateUse1,在2.1以下版本用的是XLA,如果是2.1以下版本PrivateUse1需要改成XLA +TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_npu); + m.impl("my_add_backward", &my_add_backward_impl_npu); +} + +// 为Meta设备注册前反向实现 +TORCH_LIBRARY_IMPL(myaten, Meta, m) +{ + m.impl("my_add", &my_add_impl_meta); + m.impl("my_add_backward", &my_add_backward_impl_meta); +} + +// 通过pybind将c++接口和python接口绑定 +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("my_add", &my_add_impl_autograd, "x + y"); +} diff --git a/sample/pytorch_adapter/jit_compile/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9 --- /dev/null +++ b/sample/pytorch_adapter/jit_compile/add_kernel.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + * Function : z = x + y + * This sample is a very basic sample that implements vector add on Ascend plaform. 
+ * In this sample: + * Length of x / y / z is 8*2048. + * Num of vector core used in sample is 8. + * Length for each core to compute is 2048. + * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. + * + */ +#include "kernel_operator.h" +using namespace AscendC; +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() + {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + // get start index for current core, core parallel + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + // pipe alloc memory to queue, the unit is Bytes + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + } + __aicore__ inline void Process() + { + // loop count need to be doubled, due to double buffer + constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; + // tiling strategy, pipeline parallel + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + // alloc tensor from queue memory + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); + // copy progress_th tile from global tensor to local tensor + DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + // enque input tensors to VECIN queue + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + // deque input tensors from VECIN queue + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); + // call Add instr for computation + Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // enque the output tensor to VECOUT queue + outQueueZ.EnQue(zLocal); + // free input tensors for reuse + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + // deque output tensor from VECOUT queue + LocalTensor zLocal = outQueueZ.DeQue(); + // copy progress_th tile from local tensor to global tensor + DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); + // free output tensor for reuse + outQueueZ.FreeTensor(zLocal); + } + +private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueueX, inQueueY; + // create queue for output, in this case depth is equal to buffer num + TQue outQueueZ; + GlobalTensor xGm, yGm, zGm; +}; +// implementation of kernel function +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} + +// 包裹核函数,使得普通编译器能认识这个符号 +extern "C" void add_custom_do(uint32_t blockDim, 
void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom<<>>(x, y, z); +} \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py new file mode 100644 index 0000000000000000000000000000000000000000..847a51f1c4787dcf353759d1115f352c1c760353 --- /dev/null +++ b/sample/pytorch_adapter/jit_compile/main.py @@ -0,0 +1,70 @@ +import os +import subprocess +import torch +import torch_npu +import torch.utils.cpp_extension +from torch_npu.testing.testcase import TestCase, run_tests + +PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) +CUR_PATH = os.path.abspath(os.path.dirname(__file__)) + + +def compile_kernels(): + # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make + subprocess.run("make") + + +def compile_host(): + extra_ldflags = [] + extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib") + extra_ldflags.append("-ltorch_npu") + extra_ldflags.append(f"-L{CUR_PATH}/") + extra_ldflags.append("-lcustom_kernels") + extra_include_paths = [] + extra_include_paths.append("./") + extra_include_paths.append(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include")) + extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc")) + + module = torch.utils.cpp_extension.load( + name="jit_extension", + sources=[ + "add_adapter.cpp" + ], + extra_include_paths=extra_include_paths, + extra_ldflags=extra_ldflags, + verbose=True) + return module + + +class TestCustomAdd(TestCase): + def test_add(self): + module = compile_host() + # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 + # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float16) + y = torch.rand(length, device='cpu', dtype=torch.float16) + + x_npu = x.npu() + y_npu = y.npu() + x_npu.requires_grad = True + y_npu.requires_grad = True + output = module.my_add(x_npu, y_npu) + # 反向能力验证 + output.backward(output) + + x.requires_grad = True + y.requires_grad = True + cpuout = torch.add(x, y) + cpuout.backward(cpuout) + + self.assertRtolEqual(output, cpuout) + self.assertRtolEqual(x_npu.grad, x.grad) + self.assertRtolEqual(y_npu.grad, y.grad) + + +if __name__ == '__main__': + compile_kernels() + run_tests() diff --git a/sample/pytorch_adapter/with_setuptools/Makefile b/sample/pytorch_adapter/with_setuptools/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ec9115f377a578677470b89f365583dfcf246515 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/Makefile @@ -0,0 +1,20 @@ +# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 +ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest + +COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 +COMPILER_FLAG := -xcce -O2 -std=c++17 +DYNAMIC_LIB_FLAG := -fPIC -shared +DAV_FLAG := --cce-aicore-arch=dav-c220-vec +ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 + +all: build + +build: libcustom_kernels.so + +# 后续如果要扩展,把多个kernel的cpp都加到后面 +libcustom_kernels.so: add_kernel.cpp + $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ + +.PHONY: clean +clean: + rm *.so \ No newline at end of file diff --git 
a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c65e60ec596fe8b5627e06f678549b5f2f05660 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp @@ -0,0 +1,128 @@ +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/framework/OpCommand.h" + +using torch::autograd::AutogradContext; +using torch::autograd::Function; +using tensor_list = std::vector; +using namespace at; + +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); + +// 为NPU设备注册前向实现 +at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other) +{ + // 创建输出内存 + at::Tensor result = at::Tensor(self); + // 将pytorch中的结构翻译成为CANN认识的数据类型和结构 + // 1. (重要)通过对tensor的shape分析,选择合适的tiling(该算子为了简化,固定了tiling,只有特定shape下计算才正确) + // 2. 对数据类型和格式转换 -- 此处无需数据格式处理,直接使用 + auto stream = c10_npu::getCurrentNPUStream().stream(false); + auto x = self.storage().data(); + auto y = other.storage().data(); + auto z = result.storage().data(); + + uint32_t blockDim = 8; + auto callback = [stream, blockDim, x, y, z]() -> int { + add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z); + return 0; // 此处可以通过某种方式获取算子执行结果,还未实现 + }; + // 下发算子 + at_npu::native::OpCommand cmd; + cmd.Name("my_add").SetCustomHandler(callback).Run(); + return result; +} + +// 为NPU设备注册反向实现 +std::tuple my_add_backward_impl_npu(const at::Tensor &self) +{ + at::Tensor result = at::Tensor(self); // 创建输出内存 + + return {result, result}; +} + +// 为Meta设备注册前向实现 +at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other) +{ + return empty_like(self); +} + +// 为Meta设备注册反向实现 +std::tuple my_add_backward_impl_meta(const at::Tensor &self) +{ + auto result = empty_like(self); + return std::make_tuple(result, result); +} + +// 寻找注册在该op上的不同设备的实现 +at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other) +{ + static auto op = + torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed(); + return op.call(self, other); +} +// 寻找注册在该op上的不同设备的实现 +std::tuple my_add_backward_impl(const at::Tensor &self) +{ + static auto op = torch::Dispatcher::singleton() + .findSchemaOrThrow("myaten::my_add_backward", "") + .typed(); + return op.call(self); +} + +// 在myaten命名空间里注册my_add和my_add_backward两个schema +TORCH_LIBRARY(myaten, m) +{ + m.def("my_add(Tensor self, Tensor other) -> Tensor"); + m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)"); +} + +// 通过继承torch::autograd::Function类实现前反向绑定 +class MyAddFunction : public torch::autograd::Function { +public: + static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other) + { + at::AutoDispatchBelowADInplaceOrView guard; + return my_add_impl(self, other); + } + + static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) + { + auto grad_output = grad_outputs[0]; + auto result = my_add_backward_impl(grad_output); + return {std::get<0>(result), std::get<1>(result)}; + } +}; + +at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other) +{ + return MyAddFunction::apply(self, other); +} + +// 给op绑定NPU的自动求导实现 +// 如果是pytorch 2.1以下的版本,AutogradPrivateUse1需要改成AutogradXLA +TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_autograd); +} + +// 为NPU设备注册前反向实现 +// NPU设备在pytorch 2.1及以上版本使用的设备名称是PrivateUse1,在2.1以下版本用的是XLA,如果是2.1以下版本PrivateUse1需要改成XLA +TORCH_LIBRARY_IMPL(myaten, PrivateUse1, 
m) +{ + m.impl("my_add", &my_add_impl_npu); + m.impl("my_add_backward", &my_add_backward_impl_npu); +} + +// 为Meta设备注册前反向实现 +TORCH_LIBRARY_IMPL(myaten, Meta, m) +{ + m.impl("my_add", &my_add_impl_meta); + m.impl("my_add_backward", &my_add_backward_impl_meta); +} + +// 通过pybind将c++接口和python接口绑定 +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("my_add", &my_add_impl_autograd, "x + y"); +} diff --git a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + * Function : z = x + y + * This sample is a very basic sample that implements vector add on Ascend plaform. + * In this sample: + * Length of x / y / z is 8*2048. + * Num of vector core used in sample is 8. + * Length for each core to compute is 2048. + * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. + * + */ +#include "kernel_operator.h" +using namespace AscendC; +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() + {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + // get start index for current core, core parallel + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + // pipe alloc memory to queue, the unit is Bytes + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + } + __aicore__ inline void Process() + { + // loop count need to be doubled, due to double buffer + constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; + // tiling strategy, pipeline parallel + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + // alloc tensor from queue memory + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); + // copy progress_th tile from global tensor to local tensor + DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + // enque input tensors to VECIN queue + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + // deque input tensors from VECIN queue + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); + // call Add instr for computation + Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // enque the output tensor to VECOUT queue + outQueueZ.EnQue(zLocal); + // free input tensors 
for reuse + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + // deque output tensor from VECOUT queue + LocalTensor zLocal = outQueueZ.DeQue(); + // copy progress_th tile from local tensor to global tensor + DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); + // free output tensor for reuse + outQueueZ.FreeTensor(zLocal); + } + +private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueueX, inQueueY; + // create queue for output, in this case depth is equal to buffer num + TQue outQueueZ; + GlobalTensor xGm, yGm, zGm; +}; +// implementation of kernel function +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} + +// 包裹核函数,使得普通编译器能认识这个符号 +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom<<>>(x, y, z); +} \ No newline at end of file diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..92ab1d3c78c7866b4bd53d9531bf0674c8b2987e --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/setup.py @@ -0,0 +1,51 @@ +import os +import subprocess +import torch +import torch_npu +from setuptools import setup, find_packages +from torch.utils.cpp_extension import BuildExtension +from torch_npu.utils.cpp_extension import NpuExtension + +PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) +CUR_PATH = os.path.abspath(os.path.dirname(__file__)) + + +def compile_kernels(): + # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make + subprocess.run("make") + return "libcustom_kernels.so" # 这个make出来的库名字 + + +def compile_adapter(): + ext = NpuExtension( + name="ascend_custom_kernels_lib", # import的库的名字 + # 如果还有其他cpp文件参与编译,需要在这里添加 + sources=[f"{CUR_PATH}/add_adapter.cpp"], + extra_compile_args=[ + '-I' + os.path.join(os.path.join(os.path.join(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc"), + ], + library_dirs=[f"{CUR_PATH}"], # 编译时需要依赖的库文件的路径,相当于g++编译时的-L选项 + libraries=["custom_kernels"], # 编译时依赖的库文件,相当于-l选项 + ) + return [ext] + + +if __name__ == "__main__": + # 编译出含有算子的库,并以so的方式提供 + kernel_so = compile_kernels() + + # 编译出pytorch适配层的库,支持被框架集成 + exts = compile_adapter() + + # 将整体打包成wheel包 + setup( + name="ascend_custom_kernels", # package的名字 + version='1.0', + keywords='ascend_custom_kernels', + ext_modules=exts, + packages=find_packages(), + cmdclass={"build_ext": BuildExtension}, + data_files=[(".", [kernel_so])], + include_package_data=True, + ) diff --git a/sample/pytorch_adapter/with_setuptools/test.py b/sample/pytorch_adapter/with_setuptools/test.py new file mode 100644 index 0000000000000000000000000000000000000000..896eef2c0fbb1a113377fb7dc770f45fd99832f4 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/test.py @@ -0,0 +1,34 @@ +import torch +import torch_npu +import ascend_custom_kernels_lib +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestCustomAdd(TestCase): + def test_add(self): + # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 + # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float16) + y = torch.rand(length, device='cpu', dtype=torch.float16) + + x_npu = x.npu() + y_npu = y.npu() + x_npu.requires_grad = True + y_npu.requires_grad = True + output 
= ascend_custom_kernels_lib.my_add(x_npu, y_npu) + # 反向能力验证 + output.backward(output) + + x.requires_grad = True + y.requires_grad = True + cpuout = torch.add(x, y) + cpuout.backward(cpuout) + + self.assertRtolEqual(output, cpuout) + self.assertRtolEqual(x_npu.grad, x.grad) + self.assertRtolEqual(y_npu.grad, y.grad) + + +if __name__ == "__main__": + run_tests() diff --git a/sample/third_party/lib/libruntime.so.aarch64 b/sample/third_party/lib/libruntime.so.aarch64 deleted file mode 100644 index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000 Binary files a/sample/third_party/lib/libruntime.so.aarch64 and /dev/null differ diff --git a/sample/third_party/lib/libruntime.so.x86 b/sample/third_party/lib/libruntime.so.x86 deleted file mode 100644 index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000 Binary files a/sample/third_party/lib/libruntime.so.x86 and /dev/null differ diff --git a/sample/third_party/lib/libruntime_camodel.so.aarch64 b/sample/third_party/lib/libruntime_camodel.so.aarch64 deleted file mode 100644 index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000 Binary files a/sample/third_party/lib/libruntime_camodel.so.aarch64 and /dev/null differ diff --git a/sample/third_party/lib/libruntime_camodel.so.x86 b/sample/third_party/lib/libruntime_camodel.so.x86 deleted file mode 100644 index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000 Binary files a/sample/third_party/lib/libruntime_camodel.so.x86 and /dev/null differ
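
补充说明（针对上文 `sample/pytorch_adapter/README.md` "其他"一节提到的动态tiling）：下面给出一个示意性的草图，演示若kernel侧提供了一个接受tiling参数的入口（此处假设名为 `add_custom_do_tiling`，并非样例中已有的接口），adapter侧可以如何根据Tensor的shape选择blockDim。样例工程本身使用固定tiling（总长8 * 2048、8核），该草图仅用于说明流程，不是样例的实际实现。

```
// 仅为示意草图：add_custom_do_tiling 为假设的kernel入口，样例中的 add_custom_do 使用固定tiling
#include <cstdint>

extern "C" void add_custom_do_tiling(uint32_t blockDim, void *stream,
                                     uint8_t *x, uint8_t *y, uint8_t *z,
                                     uint32_t totalLength); // 假设的入口声明

namespace {
constexpr uint32_t kMaxVectorCore = 8; // 假设可用的vector核数

// 简单策略：取能整除totalLength的最大核数，保证每个核处理的长度一致
uint32_t ChooseBlockDim(uint32_t totalLength)
{
    uint32_t blockDim = kMaxVectorCore;
    while (blockDim > 1 && totalLength % blockDim != 0) {
        blockDim >>= 1;
    }
    return blockDim;
}
} // namespace

// 在 my_add_impl_npu 中的用法（示意）：
// uint32_t totalLength = static_cast<uint32_t>(self.numel());
// uint32_t blockDim = ChooseBlockDim(totalLength);
// add_custom_do_tiling(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z, totalLength);
```

与aclnn类算子一样，shape分析与tiling选择是流程中必须的一步，只是位置可以不同：既可以放在adapter中，也可以像README所述那样对 `add_custom_do` 进行封装、把tiling内置到kernel侧入口中。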