From a5799567715f799fa8fa79476aa98acb40221e1e Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:42:59 +0000 Subject: [PATCH 01/26] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20pytorch=5Fadapter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/pytorch_adapter/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sample/pytorch_adapter/.keep diff --git a/sample/pytorch_adapter/.keep b/sample/pytorch_adapter/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 3bdbd9d3eaabfb59c09390146c275600da035327 Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:43:17 +0000 Subject: [PATCH 02/26] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20jit=5Fmode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/pytorch_adapter/jit_mode/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sample/pytorch_adapter/jit_mode/.keep diff --git a/sample/pytorch_adapter/jit_mode/.keep b/sample/pytorch_adapter/jit_mode/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From c5bfd34c2527b4ab23aeb57c1d75375bad587509 Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:43:47 +0000 Subject: [PATCH 03/26] rename sample/pytorch_adapter/jit_mode/.keep to sample/pytorch_adapter/jit_mode/add_adapter.cpp. Signed-off-by: lian --- sample/pytorch_adapter/jit_mode/.keep | 0 .../pytorch_adapter/jit_mode/add_adapter.cpp | 128 ++++++++++++++++++ 2 files changed, 128 insertions(+) delete mode 100644 sample/pytorch_adapter/jit_mode/.keep create mode 100644 sample/pytorch_adapter/jit_mode/add_adapter.cpp diff --git a/sample/pytorch_adapter/jit_mode/.keep b/sample/pytorch_adapter/jit_mode/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/sample/pytorch_adapter/jit_mode/add_adapter.cpp b/sample/pytorch_adapter/jit_mode/add_adapter.cpp new file mode 100644 index 000000000..6c65e60ec --- /dev/null +++ b/sample/pytorch_adapter/jit_mode/add_adapter.cpp @@ -0,0 +1,128 @@ +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/framework/OpCommand.h" + +using torch::autograd::AutogradContext; +using torch::autograd::Function; +using tensor_list = std::vector; +using namespace at; + +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); + +// 为NPU设备注册前向实现 +at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other) +{ + // 创建输出内存 + at::Tensor result = at::Tensor(self); + // 将pytorch中的结构翻译成为CANN认识的数据类型和结构 + // 1. (重要)通过对tensor的shape分析,选择合适的tiling(该算子为了简化,固定了tiling,只有特定shape下计算才正确) + // 2. 
对数据类型和格式转换 -- 此处无需数据格式处理,直接使用 + auto stream = c10_npu::getCurrentNPUStream().stream(false); + auto x = self.storage().data(); + auto y = other.storage().data(); + auto z = result.storage().data(); + + uint32_t blockDim = 8; + auto callback = [stream, blockDim, x, y, z]() -> int { + add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z); + return 0; // 此处可以通过某种方式获取算子执行结果,还未实现 + }; + // 下发算子 + at_npu::native::OpCommand cmd; + cmd.Name("my_add").SetCustomHandler(callback).Run(); + return result; +} + +// 为NPU设备注册反向实现 +std::tuple my_add_backward_impl_npu(const at::Tensor &self) +{ + at::Tensor result = at::Tensor(self); // 创建输出内存 + + return {result, result}; +} + +// 为Meta设备注册前向实现 +at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other) +{ + return empty_like(self); +} + +// 为Meta设备注册反向实现 +std::tuple my_add_backward_impl_meta(const at::Tensor &self) +{ + auto result = empty_like(self); + return std::make_tuple(result, result); +} + +// 寻找注册在该op上的不同设备的实现 +at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other) +{ + static auto op = + torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed(); + return op.call(self, other); +} +// 寻找注册在该op上的不同设备的实现 +std::tuple my_add_backward_impl(const at::Tensor &self) +{ + static auto op = torch::Dispatcher::singleton() + .findSchemaOrThrow("myaten::my_add_backward", "") + .typed(); + return op.call(self); +} + +// 在myaten命名空间里注册my_add和my_add_backward两个schema +TORCH_LIBRARY(myaten, m) +{ + m.def("my_add(Tensor self, Tensor other) -> Tensor"); + m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)"); +} + +// 通过继承torch::autograd::Function类实现前反向绑定 +class MyAddFunction : public torch::autograd::Function { +public: + static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other) + { + at::AutoDispatchBelowADInplaceOrView guard; + return my_add_impl(self, other); + } + + static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) + { + auto grad_output = grad_outputs[0]; + auto result = my_add_backward_impl(grad_output); + return {std::get<0>(result), std::get<1>(result)}; + } +}; + +at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other) +{ + return MyAddFunction::apply(self, other); +} + +// 给op绑定NPU的自动求导实现 +// 如果是pytorch 2.1以下的版本,AutogradPrivateUse1需要改成AutogradXLA +TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_autograd); +} + +// 为NPU设备注册前反向实现 +// NPU设备在pytorch 2.1及以上版本使用的设备名称是PrivateUse1,在2.1以下版本用的是XLA,如果是2.1以下版本PrivateUse1需要改成XLA +TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_npu); + m.impl("my_add_backward", &my_add_backward_impl_npu); +} + +// 为Meta设备注册前反向实现 +TORCH_LIBRARY_IMPL(myaten, Meta, m) +{ + m.impl("my_add", &my_add_impl_meta); + m.impl("my_add_backward", &my_add_backward_impl_meta); +} + +// 通过pybind将c++接口和python接口绑定 +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("my_add", &my_add_impl_autograd, "x + y"); +} -- Gitee From 93525f90747a91f6ad22b4179ea3a02dd7510164 Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:44:06 +0000 Subject: [PATCH 04/26] add sample/pytorch_adapter/jit_mode/add_kernel.cpp. 
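
The adapter registered above exposes the operator in two ways: through the pybind entry point `my_add`, and through the dispatcher schema `myaten::my_add` defined with TORCH_LIBRARY. A minimal usage sketch, assuming a built extension and an NPU environment; the 8*2048 float16 shape is required by the kernel's fixed tiling:

```
import torch
import torch_npu  # brings up the NPU (PrivateUse1) backend
import torch.utils.cpp_extension

# Simplified load call; actually linking this adapter needs the
# extra_include_paths/extra_ldflags setup shown in main.py later in this series.
module = torch.utils.cpp_extension.load(name="jit_extension", sources=["add_adapter.cpp"])

x = torch.rand([8, 2048], dtype=torch.float16).npu()
y = torch.rand([8, 2048], dtype=torch.float16).npu()

z1 = module.my_add(x, y)            # pybind route
z2 = torch.ops.myaten.my_add(x, y)  # dispatcher route, resolved to the PrivateUse1 impl
```
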
Signed-off-by: lian --- .../pytorch_adapter/jit_mode/add_kernel.cpp | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 sample/pytorch_adapter/jit_mode/add_kernel.cpp diff --git a/sample/pytorch_adapter/jit_mode/add_kernel.cpp b/sample/pytorch_adapter/jit_mode/add_kernel.cpp new file mode 100644 index 000000000..9164764ef --- /dev/null +++ b/sample/pytorch_adapter/jit_mode/add_kernel.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + * Function : z = x + y + * This sample is a very basic sample that implements vector add on Ascend plaform. + * In this sample: + * Length of x / y / z is 8*2048. + * Num of vector core used in sample is 8. + * Length for each core to compute is 2048. + * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. + * + */ +#include "kernel_operator.h" +using namespace AscendC; +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() + {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + // get start index for current core, core parallel + xGm.SetGlobalBuffer((__gm__ int16_t *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ int16_t *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ int16_t *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + // pipe alloc memory to queue, the unit is Bytes + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); + } + __aicore__ inline void Process() + { + // loop count need to be doubled, due to double buffer + constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; + // tiling strategy, pipeline parallel + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + // alloc tensor from queue memory + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); + // copy progress_th tile from global tensor to local tensor + DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + // enque input tensors to VECIN queue + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + // deque input tensors from VECIN queue + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); + // call Add instr for computation + Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // enque the output tensor to VECOUT queue + outQueueZ.EnQue(zLocal); + // free input tensors for reuse + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + // deque output tensor from VECOUT queue + LocalTensor zLocal = outQueueZ.DeQue(); + // copy progress_th tile from local tensor to 
global tensor + DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); + // free output tensor for reuse + outQueueZ.FreeTensor(zLocal); + } + +private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueueX, inQueueY; + // create queue for output, in this case depth is equal to buffer num + TQue outQueueZ; + GlobalTensor xGm, yGm, zGm; +}; +// implementation of kernel function +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} + +// 包裹核函数,使得普通编译器能认识这个符号 +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom<<>>(x, y, z); +} \ No newline at end of file -- Gitee From c7fd1621add7d64b6c85e552f9b78073cb7be985 Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:44:24 +0000 Subject: [PATCH 05/26] add sample/pytorch_adapter/jit_mode/main.py. Signed-off-by: lian --- sample/pytorch_adapter/jit_mode/main.py | 50 +++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 sample/pytorch_adapter/jit_mode/main.py diff --git a/sample/pytorch_adapter/jit_mode/main.py b/sample/pytorch_adapter/jit_mode/main.py new file mode 100644 index 000000000..47cf34f48 --- /dev/null +++ b/sample/pytorch_adapter/jit_mode/main.py @@ -0,0 +1,50 @@ +import os +import torch +import torch.utils.cpp_extension +import torch_npu + +PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) +CUR_PATH = os.getcwd() + + +def compile_kernels(): + os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make + + +def compile_host(): + extra_ldflags = [] + extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib") + extra_ldflags.append("-ltorch_npu") + extra_ldflags.append(f"-L{CUR_PATH}/") + extra_ldflags.append("-lcustom_kernels") + extra_include_paths = [] + extra_include_paths.append("./") + extra_include_paths.append(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include")) + extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc")) + + module = torch.utils.cpp_extension.load( + name="jit_extension", + sources=[ + "add_adapter.cpp" + ], + extra_include_paths=extra_include_paths, + extra_ldflags=extra_ldflags, + verbose=True) + return module + + +def test_add(module): + # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 + # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 + x = torch.arange(0, 100).short() + y = torch.arange(0, 100).short() + z = module.my_add(x.npu(), y.npu()) + print(z) + + +if __name__ == '__main__': + compile_kernels() + module = compile_host() + test_add(module) -- Gitee From fe0423aadae0c310b3fda6b24eda2b588d7f8d6d Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:44:46 +0000 Subject: [PATCH 06/26] add sample/pytorch_adapter/jit_mode/Makefile. 
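
The shape restriction that main.py mentions comes from the kernel's hard-coded tiling. A quick arithmetic check, mirroring the constants in add_kernel.cpp:

```
TOTAL_LENGTH = 8 * 2048                               # total element count
USE_CORE_NUM = 8                                      # vector cores used
BLOCK_LENGTH = TOTAL_LENGTH // USE_CORE_NUM           # 2048 elements per core
TILE_NUM = 8                                          # tiles per core
BUFFER_NUM = 2                                        # double buffering
TILE_LENGTH = BLOCK_LENGTH // TILE_NUM // BUFFER_NUM  # elements per DataCopy

assert TILE_LENGTH == 128
assert TILE_NUM * BUFFER_NUM == 16  # CopyIn/Compute/CopyOut iterations per core
# with 2-byte elements, each DataCopy moves 128 * 2 = 256 bytes
```

Anything other than exactly 8*2048 elements falls outside this fixed scheme, which is why the sample pins the shape.
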
Signed-off-by: lian --- sample/pytorch_adapter/jit_mode/Makefile | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 sample/pytorch_adapter/jit_mode/Makefile diff --git a/sample/pytorch_adapter/jit_mode/Makefile b/sample/pytorch_adapter/jit_mode/Makefile new file mode 100644 index 000000000..ad17f3a51 --- /dev/null +++ b/sample/pytorch_adapter/jit_mode/Makefile @@ -0,0 +1,20 @@ +# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 +ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest + +COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 +COMPILER_FLAG := -xcce -O2 -std=c++17 +DYNAMIC_LIB_FLAG := -fPIC -shared +DAV_FLAG := --cce-aicore-arch=dav-c220-vec +ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 + +all: build + +build: libcustom_kernels.so + +# 后续如果要扩展,把多个kernel的cpp都加到后面 +libcustom_kernels.so: add_kernel.cpp + $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ + +.PHONY: clean +clean: + rm *.so \ No newline at end of file -- Gitee From 257c68e44943a904e3204f8c1cf31ac0c234dfb8 Mon Sep 17 00:00:00 2001 From: lian Date: Sat, 2 Mar 2024 05:26:43 +0000 Subject: [PATCH 07/26] rename sample/pytorch_adapter/.keep to sample/pytorch_adapter/README.md. Signed-off-by: lian --- sample/pytorch_adapter/{.keep => README.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sample/pytorch_adapter/{.keep => README.md} (100%) diff --git a/sample/pytorch_adapter/.keep b/sample/pytorch_adapter/README.md similarity index 100% rename from sample/pytorch_adapter/.keep rename to sample/pytorch_adapter/README.md -- Gitee From a4fd47638bcd4ea5469ee07f80b895a1c25fe9b3 Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:50:51 +0000 Subject: [PATCH 08/26] update sample/pytorch_adapter/jit_mode/add_kernel.cpp. 
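
The update that follows switches the kernel's element type from int16_t to half. Both are 2 bytes, so the tiling and buffer sizes are untouched; what changes is which PyTorch dtype the kernel can meaningfully add, as this small sketch illustrates:

```
import torch

# int16_t matched the earlier torch.arange(...).short() inputs;
# half matches torch.float16, which the reworked test uses.
assert torch.tensor(0, dtype=torch.int16).element_size() == 2
assert torch.tensor(0, dtype=torch.float16).element_size() == 2
```
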
Signed-off-by: lian --- .../pytorch_adapter/jit_mode/add_kernel.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sample/pytorch_adapter/jit_mode/add_kernel.cpp b/sample/pytorch_adapter/jit_mode/add_kernel.cpp index 9164764ef..7e82e8882 100644 --- a/sample/pytorch_adapter/jit_mode/add_kernel.cpp +++ b/sample/pytorch_adapter/jit_mode/add_kernel.cpp @@ -26,13 +26,13 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) { // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ int16_t *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ int16_t *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ int16_t *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); } __aicore__ inline void Process() { @@ -50,8 +50,8 @@ private: __aicore__ inline void CopyIn(int32_t progress) { // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); // copy progress_th tile from global tensor to local tensor DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); @@ -62,13 +62,13 @@ private: __aicore__ inline void Compute(int32_t progress) { // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); // call Add instr for computation Add(zLocal, xLocal, yLocal, TILE_LENGTH); // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); + outQueueZ.EnQue(zLocal); // free input tensors for reuse inQueueX.FreeTensor(xLocal); inQueueY.FreeTensor(yLocal); @@ -76,7 +76,7 @@ private: __aicore__ inline void CopyOut(int32_t progress) { // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); + LocalTensor zLocal = outQueueZ.DeQue(); // copy progress_th tile from local tensor to global tensor DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); // free output tensor for reuse @@ -89,7 +89,7 @@ private: TQue inQueueX, inQueueY; // create queue for output, in this case depth is equal to buffer num TQue outQueueZ; - GlobalTensor xGm, yGm, zGm; + GlobalTensor xGm, yGm, zGm; }; // implementation of kernel function extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) -- Gitee From c4174d15564910b728fd3d6f41ea0a89130ff8c2 Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:51:16 +0000 Subject: [PATCH 09/26] update 
sample/pytorch_adapter/jit_mode/main.py. Signed-off-by: lian --- sample/pytorch_adapter/jit_mode/main.py | 36 ++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/sample/pytorch_adapter/jit_mode/main.py b/sample/pytorch_adapter/jit_mode/main.py index 47cf34f48..388e0799f 100644 --- a/sample/pytorch_adapter/jit_mode/main.py +++ b/sample/pytorch_adapter/jit_mode/main.py @@ -2,6 +2,7 @@ import os import torch import torch.utils.cpp_extension import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) CUR_PATH = os.getcwd() @@ -35,16 +36,33 @@ def compile_host(): return module -def test_add(module): - # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 - # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 - x = torch.arange(0, 100).short() - y = torch.arange(0, 100).short() - z = module.my_add(x.npu(), y.npu()) - print(z) +class TestCustomAdd(TestCase): + def test_add(self): + module = compile_host() + # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 + # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float16) + y = torch.rand(length, device='cpu', dtype=torch.float16) + + x_npu = x.npu() + y_npu = y.npu() + x_npu.requires_grad = True + y_npu.requires_grad = True + output = module.my_add(x_npu, y_npu) + # 反向能力验证 + output.backward(output) + + x.requires_grad = True + y.requires_grad = True + cpuout = torch.add(x, y) + cpuout.backward(cpuout) + + self.assertRtolEqual(output, cpuout) + self.assertRtolEqual(x_npu.grad, x.grad) + self.assertRtolEqual(y_npu.grad, y.grad) if __name__ == '__main__': compile_kernels() - module = compile_host() - test_add(module) + run_tests() -- Gitee From 57a297853cd3cd3dae79fee8a9d711d7701a0432 Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:51:39 +0000 Subject: [PATCH 10/26] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20with=5Fsetuptools?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/pytorch_adapter/with_setuptools/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sample/pytorch_adapter/with_setuptools/.keep diff --git a/sample/pytorch_adapter/with_setuptools/.keep b/sample/pytorch_adapter/with_setuptools/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 6f0ddce1f5ee1639e7fa1de801b51dc912cbe48e Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:52:12 +0000 Subject: [PATCH 11/26] rename setup.py. 
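
The reworked test above drives the backward pass with output.backward(output). That works because for z = x + y the gradient with respect to each input is exactly the incoming grad_output, which is also what the adapter's my_add_backward returns. The identity being verified can be checked on CPU alone:

```
import torch

x = torch.rand(8, 2048, dtype=torch.float16, requires_grad=True)
y = torch.rand(8, 2048, dtype=torch.float16, requires_grad=True)

z = torch.add(x, y)
z.backward(z)  # same pattern as the test: reuse the output as grad_output

# d(x + y)/dx = d(x + y)/dy = 1, so both grads equal the grad_output (z here)
assert torch.equal(x.grad, z.detach())
assert torch.equal(y.grad, z.detach())
```
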
Signed-off-by: lian --- sample/pytorch_adapter/with_setuptools/.keep | 0 .../pytorch_adapter/with_setuptools/setup.py | 51 +++++++++++++++++++ 2 files changed, 51 insertions(+) delete mode 100644 sample/pytorch_adapter/with_setuptools/.keep create mode 100644 sample/pytorch_adapter/with_setuptools/setup.py diff --git a/sample/pytorch_adapter/with_setuptools/.keep b/sample/pytorch_adapter/with_setuptools/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py new file mode 100644 index 000000000..8f5ad6d9f --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/setup.py @@ -0,0 +1,51 @@ +import os +import torch +import torch_npu +from setuptools import setup, find_packages +from torch.utils.cpp_extension import BuildExtension +from torch_npu.utils.cpp_extension import NpuExtension + +PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) +CUR_PATH = os.getcwd() + + +def compile_kernels(): + os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make + return "libcustom_kernels.so" # 这个make出来的库名字 + + +def compile_adapter(): + exts = [] + ext1 = NpuExtension( + name="ascend_custom_kernels_lib", # import的库的名字 + # 如果还有其他cpp文件参与编译,需要在这里添加 + sources=["./add_adapter.cpp"], + extra_compile_args=[ + '-I' + os.path.join(os.path.join(os.path.join(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc"), + ], + library_dirs=[f"{CUR_PATH}"], # 编译时需要依赖的库文件的路径,相当于g++编译时的-L选项 + libraries=["custom_kernels"], # 编译时依赖的库文件,相当于-l选项 + ) + exts.append(ext1) + return exts + + +if __name__ == "__main__": + # 编译出含有算子的库,并以so的方式提供 + kernel_so = compile_kernels() + + # 编译出pytorch适配层的库,支持被框架集成 + exts = compile_adapter() + + # 将整体打包成wheel包 + setup( + name="ascend_custom_kernels", # package的名字 + version='1.0', + keywords='ascend_custom_kernels', + ext_modules=exts, + packages=find_packages(), + cmdclass={"build_ext": BuildExtension}, + data_files=[(".", [kernel_so])], + include_package_data=True, + ) -- Gitee From 25745342bb2515faf2a30829f92a36807172e57f Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:52:30 +0000 Subject: [PATCH 12/26] add sample/pytorch_adapter/with_setuptools/test.py. 
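
setup.py above first shells out to make for the device library, then builds the host adapter as an NpuExtension and ships both in one wheel. A sketch of the intended flow; the exact wheel filename is an assumption here, since it depends on the local Python and platform tags:

```
# Assumed build-and-install sequence (not spelled out in the patch itself):
#   python3 setup.py bdist_wheel
#   pip3 install dist/ascend_custom_kernels-1.0-*.whl
# Afterwards the extension is importable directly, as test.py below does:
import torch
import torch_npu
import ascend_custom_kernels_lib

x = torch.rand([8, 2048], dtype=torch.float16).npu()
y = torch.rand([8, 2048], dtype=torch.float16).npu()
print(ascend_custom_kernels_lib.my_add(x, y))
```
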
Signed-off-by: lian --- .../pytorch_adapter/with_setuptools/test.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 sample/pytorch_adapter/with_setuptools/test.py diff --git a/sample/pytorch_adapter/with_setuptools/test.py b/sample/pytorch_adapter/with_setuptools/test.py new file mode 100644 index 000000000..896eef2c0 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/test.py @@ -0,0 +1,34 @@ +import torch +import torch_npu +import ascend_custom_kernels_lib +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestCustomAdd(TestCase): + def test_add(self): + # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 + # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float16) + y = torch.rand(length, device='cpu', dtype=torch.float16) + + x_npu = x.npu() + y_npu = y.npu() + x_npu.requires_grad = True + y_npu.requires_grad = True + output = ascend_custom_kernels_lib.my_add(x_npu, y_npu) + # 反向能力验证 + output.backward(output) + + x.requires_grad = True + y.requires_grad = True + cpuout = torch.add(x, y) + cpuout.backward(cpuout) + + self.assertRtolEqual(output, cpuout) + self.assertRtolEqual(x_npu.grad, x.grad) + self.assertRtolEqual(y_npu.grad, y.grad) + + +if __name__ == "__main__": + run_tests() -- Gitee From 4a8047d27e5e443cbc73bf2e46715d3456c3b868 Mon Sep 17 00:00:00 2001 From: binlien Date: Mon, 4 Mar 2024 23:29:03 +0800 Subject: [PATCH 13/26] =?UTF-8?q?=E6=95=B4=E6=94=B9=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{jit_mode => jit_compile}/Makefile | 38 ++-- .../{jit_mode => jit_compile}/add_adapter.cpp | 0 .../{jit_mode => jit_compile}/add_kernel.cpp | 210 +++++++++--------- .../{jit_mode => jit_compile}/main.py | 136 ++++++------ sample/third_party/lib/libruntime.so.aarch64 | Bin 8584 -> 0 bytes sample/third_party/lib/libruntime.so.x86 | Bin 16096 -> 0 bytes .../lib/libruntime_camodel.so.aarch64 | Bin 8584 -> 0 bytes .../third_party/lib/libruntime_camodel.so.x86 | Bin 16096 -> 0 bytes 8 files changed, 192 insertions(+), 192 deletions(-) rename sample/pytorch_adapter/{jit_mode => jit_compile}/Makefile (98%) rename sample/pytorch_adapter/{jit_mode => jit_compile}/add_adapter.cpp (100%) rename sample/pytorch_adapter/{jit_mode => jit_compile}/add_kernel.cpp (97%) rename sample/pytorch_adapter/{jit_mode => jit_compile}/main.py (96%) delete mode 100644 sample/third_party/lib/libruntime.so.aarch64 delete mode 100644 sample/third_party/lib/libruntime.so.x86 delete mode 100644 sample/third_party/lib/libruntime_camodel.so.aarch64 delete mode 100644 sample/third_party/lib/libruntime_camodel.so.x86 diff --git a/sample/pytorch_adapter/jit_mode/Makefile b/sample/pytorch_adapter/jit_compile/Makefile similarity index 98% rename from sample/pytorch_adapter/jit_mode/Makefile rename to sample/pytorch_adapter/jit_compile/Makefile index ad17f3a51..ec9115f37 100644 --- a/sample/pytorch_adapter/jit_mode/Makefile +++ b/sample/pytorch_adapter/jit_compile/Makefile @@ -1,20 +1,20 @@ -# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 -ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest - -COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 -COMPILER_FLAG := -xcce -O2 -std=c++17 -DYNAMIC_LIB_FLAG := -fPIC -shared -DAV_FLAG := --cce-aicore-arch=dav-c220-vec -ASCENDC_INC_FLAG := 
-I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 - -all: build - -build: libcustom_kernels.so - -# 后续如果要扩展,把多个kernel的cpp都加到后面 -libcustom_kernels.so: add_kernel.cpp - $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ - -.PHONY: clean -clean: +# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 +ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest + +COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 +COMPILER_FLAG := -xcce -O2 -std=c++17 +DYNAMIC_LIB_FLAG := -fPIC -shared +DAV_FLAG := --cce-aicore-arch=dav-c220-vec +ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 + +all: build + +build: libcustom_kernels.so + +# 后续如果要扩展,把多个kernel的cpp都加到后面 +libcustom_kernels.so: add_kernel.cpp + $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ + +.PHONY: clean +clean: rm *.so \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_mode/add_adapter.cpp b/sample/pytorch_adapter/jit_compile/add_adapter.cpp similarity index 100% rename from sample/pytorch_adapter/jit_mode/add_adapter.cpp rename to sample/pytorch_adapter/jit_compile/add_adapter.cpp diff --git a/sample/pytorch_adapter/jit_mode/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp similarity index 97% rename from sample/pytorch_adapter/jit_mode/add_kernel.cpp rename to sample/pytorch_adapter/jit_compile/add_kernel.cpp index 7e82e8882..9aa62e093 100644 --- a/sample/pytorch_adapter/jit_mode/add_kernel.cpp +++ b/sample/pytorch_adapter/jit_compile/add_kernel.cpp @@ -1,106 +1,106 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. - * - * Function : z = x + y - * This sample is a very basic sample that implements vector add on Ascend plaform. - * In this sample: - * Length of x / y / z is 8*2048. - * Num of vector core used in sample is 8. - * Length for each core to compute is 2048. - * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. 
- * - */ -#include "kernel_operator.h" -using namespace AscendC; -constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data -constexpr int32_t USE_CORE_NUM = 8; // num of core used -constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core -constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue -constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer - -class KernelAdd { -public: - __aicore__ inline KernelAdd() - {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) - { - // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - } - __aicore__ inline void Process() - { - // loop count need to be doubled, due to double buffer - constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; - // tiling strategy, pipeline parallel - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); - } - } - -private: - __aicore__ inline void CopyIn(int32_t progress) - { - // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); - // copy progress_th tile from global tensor to local tensor - DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); - // enque input tensors to VECIN queue - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ inline void Compute(int32_t progress) - { - // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); - // call Add instr for computation - Add(zLocal, xLocal, yLocal, TILE_LENGTH); - // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); - // free input tensors for reuse - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); - // copy progress_th tile from local tensor to global tensor - DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); - // free output tensor for reuse - outQueueZ.FreeTensor(zLocal); - } - -private: - TPipe pipe; - // create queues for input, in this case depth is equal to buffer num - TQue inQueueX, inQueueY; - // create queue for output, in this case depth is equal to buffer num - TQue outQueueZ; - GlobalTensor xGm, yGm, zGm; -}; -// implementation of kernel function -extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) -{ - KernelAdd op; - op.Init(x, y, z); - op.Process(); -} - -// 包裹核函数,使得普通编译器能认识这个符号 -extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) -{ - add_custom<<>>(x, y, z); +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. 
+ * + * Function : z = x + y + * This sample is a very basic sample that implements vector add on Ascend plaform. + * In this sample: + * Length of x / y / z is 8*2048. + * Num of vector core used in sample is 8. + * Length for each core to compute is 2048. + * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. + * + */ +#include "kernel_operator.h" +using namespace AscendC; +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() + {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + // get start index for current core, core parallel + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + // pipe alloc memory to queue, the unit is Bytes + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + } + __aicore__ inline void Process() + { + // loop count need to be doubled, due to double buffer + constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; + // tiling strategy, pipeline parallel + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + // alloc tensor from queue memory + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); + // copy progress_th tile from global tensor to local tensor + DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + // enque input tensors to VECIN queue + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + // deque input tensors from VECIN queue + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); + // call Add instr for computation + Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // enque the output tensor to VECOUT queue + outQueueZ.EnQue(zLocal); + // free input tensors for reuse + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + // deque output tensor from VECOUT queue + LocalTensor zLocal = outQueueZ.DeQue(); + // copy progress_th tile from local tensor to global tensor + DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); + // free output tensor for reuse + outQueueZ.FreeTensor(zLocal); + } + +private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueueX, inQueueY; + // create queue for output, in this case depth is equal to buffer num + TQue outQueueZ; + GlobalTensor xGm, yGm, zGm; +}; +// implementation of kernel function +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + 
op.Init(x, y, z); + op.Process(); +} + +// 包裹核函数,使得普通编译器能认识这个符号 +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom<<>>(x, y, z); } \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_mode/main.py b/sample/pytorch_adapter/jit_compile/main.py similarity index 96% rename from sample/pytorch_adapter/jit_mode/main.py rename to sample/pytorch_adapter/jit_compile/main.py index 388e0799f..11f92600d 100644 --- a/sample/pytorch_adapter/jit_mode/main.py +++ b/sample/pytorch_adapter/jit_compile/main.py @@ -1,68 +1,68 @@ -import os -import torch -import torch.utils.cpp_extension -import torch_npu -from torch_npu.testing.testcase import TestCase, run_tests - -PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) -CUR_PATH = os.getcwd() - - -def compile_kernels(): - os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make - - -def compile_host(): - extra_ldflags = [] - extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib") - extra_ldflags.append("-ltorch_npu") - extra_ldflags.append(f"-L{CUR_PATH}/") - extra_ldflags.append("-lcustom_kernels") - extra_include_paths = [] - extra_include_paths.append("./") - extra_include_paths.append(os.path.join( - PYTORCH_NPU_INSTALL_PATH, "include")) - extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join( - PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc")) - - module = torch.utils.cpp_extension.load( - name="jit_extension", - sources=[ - "add_adapter.cpp" - ], - extra_include_paths=extra_include_paths, - extra_ldflags=extra_ldflags, - verbose=True) - return module - - -class TestCustomAdd(TestCase): - def test_add(self): - module = compile_host() - # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 - # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 - length = [8, 2048] - x = torch.rand(length, device='cpu', dtype=torch.float16) - y = torch.rand(length, device='cpu', dtype=torch.float16) - - x_npu = x.npu() - y_npu = y.npu() - x_npu.requires_grad = True - y_npu.requires_grad = True - output = module.my_add(x_npu, y_npu) - # 反向能力验证 - output.backward(output) - - x.requires_grad = True - y.requires_grad = True - cpuout = torch.add(x, y) - cpuout.backward(cpuout) - - self.assertRtolEqual(output, cpuout) - self.assertRtolEqual(x_npu.grad, x.grad) - self.assertRtolEqual(y_npu.grad, y.grad) - - -if __name__ == '__main__': - compile_kernels() - run_tests() +import os +import torch +import torch.utils.cpp_extension +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + +PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) +CUR_PATH = os.getcwd() + + +def compile_kernels(): + os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make + + +def compile_host(): + extra_ldflags = [] + extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib") + extra_ldflags.append("-ltorch_npu") + extra_ldflags.append(f"-L{CUR_PATH}/") + extra_ldflags.append("-lcustom_kernels") + extra_include_paths = [] + extra_include_paths.append("./") + extra_include_paths.append(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include")) + extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc")) + + module = torch.utils.cpp_extension.load( + name="jit_extension", + sources=[ + "add_adapter.cpp" + ], + extra_include_paths=extra_include_paths, + extra_ldflags=extra_ldflags, + verbose=True) + return module + + +class 
TestCustomAdd(TestCase):
+    def test_add(self):
+        module = compile_host()
+        # 由于kernel现在是静态tiling,所以此处尺寸需要匹配
+        # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float16)
+        y = torch.rand(length, device='cpu', dtype=torch.float16)
+
+        x_npu = x.npu()
+        y_npu = y.npu()
+        x_npu.requires_grad = True
+        y_npu.requires_grad = True
+        output = module.my_add(x_npu, y_npu)
+        # 反向能力验证
+        output.backward(output)
+
+        x.requires_grad = True
+        y.requires_grad = True
+        cpuout = torch.add(x, y)
+        cpuout.backward(cpuout)
+
+        self.assertRtolEqual(output, cpuout)
+        self.assertRtolEqual(x_npu.grad, x.grad)
+        self.assertRtolEqual(y_npu.grad, y.grad)
+
+
+if __name__ == '__main__':
+    compile_kernels()
+    run_tests()
diff --git a/sample/third_party/lib/libruntime.so.aarch64 b/sample/third_party/lib/libruntime.so.aarch64
deleted file mode 100644
index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000
GIT binary patch
[GIT binary patch data omitted]
diff --git a/sample/third_party/lib/libruntime.so.x86 b/sample/third_party/lib/libruntime.so.x86
deleted file mode 100644
index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000
GIT binary patch
[GIT binary patch data omitted]
diff --git a/sample/third_party/lib/libruntime_camodel.so.aarch64 b/sample/third_party/lib/libruntime_camodel.so.aarch64
deleted file mode 100644
index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000
GIT binary patch
[GIT binary patch data omitted]
diff --git a/sample/third_party/lib/libruntime_camodel.so.x86 b/sample/third_party/lib/libruntime_camodel.so.x86
deleted file mode 100644
index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000
GIT binary patch
[GIT binary patch data omitted]
-- Gitee

From dd923e70eedff70a8443a8df3990dd9ff7be0b71 Mon Sep 17 00:00:00 2001
From: binlien
Date: Mon, 4 Mar 2024 23:31:23 +0800
Subject: [PATCH 14/26] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=BA=90=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../pytorch_adapter/with_setuptools/Makefile  |  20 +++
 .../with_setuptools/add_adapter.cpp           | 128 ++++++++++++++++++
 .../with_setuptools/add_kernel.cpp            | 106 +++++++++++++++
 3 files changed, 254 insertions(+)
 create mode 100644 sample/pytorch_adapter/with_setuptools/Makefile
 create mode 100644 sample/pytorch_adapter/with_setuptools/add_adapter.cpp
 create mode 100644 sample/pytorch_adapter/with_setuptools/add_kernel.cpp

diff --git a/sample/pytorch_adapter/with_setuptools/Makefile b/sample/pytorch_adapter/with_setuptools/Makefile
new file mode 100644
index 000000000..ec9115f37
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/Makefile
@@ -0,0 +1,20 @@
+# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化
+ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
+
+COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置
+COMPILER_FLAG := -xcce -O2 -std=c++17
+DYNAMIC_LIB_FLAG := -fPIC -shared
+DAV_FLAG := --cce-aicore-arch=dav-c220-vec
+ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化
+
+all: build
+
+build: libcustom_kernels.so
+
+# 后续如果要扩展,把多个kernel的cpp都加到后面
+libcustom_kernels.so: add_kernel.cpp
+	$(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^
+
+.PHONY: clean
+clean:
+	rm *.so
\ No newline at end of file
diff --git a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp new file mode 100644 index 000000000..6c65e60ec --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp @@ -0,0 +1,128 @@ +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/framework/OpCommand.h" + +using torch::autograd::AutogradContext; +using torch::autograd::Function; +using tensor_list = std::vector; +using namespace at; + +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); + +// 为NPU设备注册前向实现 +at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other) +{ + // 创建输出内存 + at::Tensor result = at::Tensor(self); + // 将pytorch中的结构翻译成为CANN认识的数据类型和结构 + // 1. (重要)通过对tensor的shape分析,选择合适的tiling(该算子为了简化,固定了tiling,只有特定shape下计算才正确) + // 2. 对数据类型和格式转换 -- 此处无需数据格式处理,直接使用 + auto stream = c10_npu::getCurrentNPUStream().stream(false); + auto x = self.storage().data(); + auto y = other.storage().data(); + auto z = result.storage().data(); + + uint32_t blockDim = 8; + auto callback = [stream, blockDim, x, y, z]() -> int { + add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z); + return 0; // 此处可以通过某种方式获取算子执行结果,还未实现 + }; + // 下发算子 + at_npu::native::OpCommand cmd; + cmd.Name("my_add").SetCustomHandler(callback).Run(); + return result; +} + +// 为NPU设备注册反向实现 +std::tuple my_add_backward_impl_npu(const at::Tensor &self) +{ + at::Tensor result = at::Tensor(self); // 创建输出内存 + + return {result, result}; +} + +// 为Meta设备注册前向实现 +at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other) +{ + return empty_like(self); +} + +// 为Meta设备注册反向实现 +std::tuple my_add_backward_impl_meta(const at::Tensor &self) +{ + auto result = empty_like(self); + return std::make_tuple(result, result); +} + +// 寻找注册在该op上的不同设备的实现 +at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other) +{ + static auto op = + torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed(); + return op.call(self, other); +} +// 寻找注册在该op上的不同设备的实现 +std::tuple my_add_backward_impl(const at::Tensor &self) +{ + static auto op = torch::Dispatcher::singleton() + .findSchemaOrThrow("myaten::my_add_backward", "") + .typed(); + return op.call(self); +} + +// 在myaten命名空间里注册my_add和my_add_backward两个schema +TORCH_LIBRARY(myaten, m) +{ + m.def("my_add(Tensor self, Tensor other) -> Tensor"); + m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)"); +} + +// 通过继承torch::autograd::Function类实现前反向绑定 +class MyAddFunction : public torch::autograd::Function { +public: + static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other) + { + at::AutoDispatchBelowADInplaceOrView guard; + return my_add_impl(self, other); + } + + static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) + { + auto grad_output = grad_outputs[0]; + auto result = my_add_backward_impl(grad_output); + return {std::get<0>(result), std::get<1>(result)}; + } +}; + +at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other) +{ + return MyAddFunction::apply(self, other); +} + +// 给op绑定NPU的自动求导实现 +// 如果是pytorch 2.1以下的版本,AutogradPrivateUse1需要改成AutogradXLA +TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_autograd); +} + +// 为NPU设备注册前反向实现 +// NPU设备在pytorch 2.1及以上版本使用的设备名称是PrivateUse1,在2.1以下版本用的是XLA,如果是2.1以下版本PrivateUse1需要改成XLA +TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_npu); + m.impl("my_add_backward", &my_add_backward_impl_npu); +} + +// 
为Meta设备注册前反向实现 +TORCH_LIBRARY_IMPL(myaten, Meta, m) +{ + m.impl("my_add", &my_add_impl_meta); + m.impl("my_add_backward", &my_add_backward_impl_meta); +} + +// 通过pybind将c++接口和python接口绑定 +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("my_add", &my_add_impl_autograd, "x + y"); +} diff --git a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp new file mode 100644 index 000000000..9aa62e093 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + * Function : z = x + y + * This sample is a very basic sample that implements vector add on Ascend plaform. + * In this sample: + * Length of x / y / z is 8*2048. + * Num of vector core used in sample is 8. + * Length for each core to compute is 2048. + * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. + * + */ +#include "kernel_operator.h" +using namespace AscendC; +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() + {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + // get start index for current core, core parallel + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + // pipe alloc memory to queue, the unit is Bytes + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + } + __aicore__ inline void Process() + { + // loop count need to be doubled, due to double buffer + constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; + // tiling strategy, pipeline parallel + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + // alloc tensor from queue memory + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); + // copy progress_th tile from global tensor to local tensor + DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + // enque input tensors to VECIN queue + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + // deque input tensors from VECIN queue + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); + // call Add instr for computation + Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // enque the output tensor to VECOUT queue + outQueueZ.EnQue(zLocal); + // free input tensors for reuse + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + // deque output tensor from VECOUT 
From f9b845d746c524da42c90635a6cee1f557cfc095 Mon Sep 17 00:00:00 2001
From: sunyiming
Date: Sat, 2 Mar 2024 01:05:02 +0000
Subject: [PATCH 15/26] update
 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py.
Signed-off-by: sunyiming
---
 .../ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 91cacafd0..a6b769ff2 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -96,6 +96,8 @@ class DataInfo(object):
 
 
 def get_not_float_tensor_info(data):
+    if DumpUtil.summary_mode == "md5":
+        return DataInfo([], [], str(data.dtype), tuple(data.shape), get_md5_for_tensor(data))
     if data.numel() == 0 or data.dtype == torch.bool:
         tensor_max = []
         tensor_min = []
-- Gitee

From 642a95c0ceb30aa9cfbe4a548ec6f8ccf3392f79 Mon Sep 17 00:00:00 2001
From: sunyiming
Date: Sat, 2 Mar 2024 01:06:05 +0000
Subject: [PATCH 16/26] update
 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py.
Signed-off-by: sunyiming
---
 .../ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
index 4250464b6..f732a3851 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
@@ -756,9 +756,11 @@ def check_file_valid(file_path):
 
 
 def get_md5_for_tensor(x):
+    if x.dtype == torch.bfloat16:
+        x = x.float()
     tensor_bytes = x.cpu().detach().numpy().tobytes()
-    crc_hash = zlib.crc32(tensor_bytes)
-    return crc_hash
+    crc32_hash = zlib.crc32(tensor_bytes)
+    return f"{crc32_hash:08x}"
-- Gitee
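Taken together, the two patches above make the md5 summary mode usable: bfloat16 has no numpy counterpart, hence the upcast to float32 before serialization, and the raw `zlib.crc32` integer is now rendered as a fixed-width hex string. A standalone sketch of the same logic in isolation:

```python
import zlib
import torch

def tensor_checksum(x: torch.Tensor) -> str:
    # numpy cannot represent bfloat16, so upcast before serializing
    if x.dtype == torch.bfloat16:
        x = x.float()
    tensor_bytes = x.cpu().detach().numpy().tobytes()
    return f"{zlib.crc32(tensor_bytes):08x}"  # zero-padded, lowercase hex

# '9c692600' per the unit test updated in the next patch
print(tensor_checksum(torch.tensor([[1, 2], [3, 4]])))
```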
From f853c5ef34148ead0e0b9d4f810e09672860676d Mon Sep 17 00:00:00 2001
From: sunyiming
Date: Sat, 2 Mar 2024 01:20:34 +0000
Subject: [PATCH 17/26] update
 debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py.
Signed-off-by: sunyiming
---
 debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py
index 18ef57b9c..9ae980102 100644
--- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py
@@ -41,4 +41,4 @@ class TestUtilsMethods(unittest.TestCase):
         data = [[1, 2], [3, 4]]
         x_data = torch.tensor(data)
         md5_value = get_md5_for_tensor(x_data)
-        self.assertEqual(md5_value, 2624136704)
+        self.assertEqual(md5_value, '9c692600')
-- Gitee
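The updated expectation is the same checksum re-encoded rather than a new hash: 2624136704 is exactly 0x9c692600, so only the representation changed, not the value. A one-line check:

```python
assert f"{2624136704:08x}" == "9c692600" and int("9c692600", 16) == 2624136704
```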
From ab1317a86f1d009c24af96423716f1d164c4b345 Mon Sep 17 00:00:00 2001
From: binlien
Date: Tue, 5 Mar 2024 00:10:11 +0800
Subject: [PATCH 18/26]
 =?UTF-8?q?=E5=8A=A0=E5=BC=BAREADME=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=EF=BC=8C=E6=8F=90=E4=BE=9B=E6=80=BB=E4=BD=93=E7=BB=93?=
 =?UTF-8?q?=E6=9E=84=E7=9A=84=E8=B0=83=E6=95=B4=E5=92=8C=E9=92=88=E5=AF=B9?=
 =?UTF-8?q?pytorch=20adapter=E7=9A=84=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sample/README.md                 | 76 ++++++++++++++++++++------------
 sample/pytorch_adapter/README.md | 33 ++++++++++++++
 2 files changed, 82 insertions(+), 27 deletions(-)

diff --git a/sample/README.md b/sample/README.md
index 167b1a01c..1f02b0609 100644
--- a/sample/README.md
+++ b/sample/README.md
@@ -5,12 +5,61 @@
 If you are considering commercial integration, the AscendC sample projects inside the CANN package are recommended, e.g. the ascendc_kernel_cmake directory. The projects here are simplified from those and are intended for quick verification only.
 
+Note: in this sample directory, every leaf directory is a complete, standalone sample project. The projects may differ from one another because their dependencies differ.
+
 ## Dependencies
 Install the CANN package and enable its environment variables, making sure ```ASCEND_HOME_PATH``` takes effect; this can be done in the CANN installation directory:
 ```
 source set_env.sh
 ```
 
+## Directory overview
+The overall directory structure is as follows:
+```
+- sample
+  |- build              # Builds and runs all samples (use as needed; the commands here are for reference)
+  |- normal_sample      # Minimal pure C/C++ AscendC single-operator projects, usable with msdebug and msprof
+    |- cube_only        # Minimal AscendC single-operator project containing aic only
+    |- mix              # Minimal AscendC single-operator project for a mix operator
+    |- vec_only         # Minimal AscendC single-operator project containing aiv only
+  |- pytorch_adapter    # Minimal AscendC single-operator projects adapted to pytorch, usable with msdebug and msprof
+    |- jit_compile      # jit mode, compiled at runtime
+    |- with_setuptools  # Built into a wheel package for installation
+  |- sanitizer_sample   # Faulty samples, for use with the mssanitizer tool
+    |- racecheck        # Samples containing race problems
+    |- xx               # Other faulty samples
+```
+
+If you care about adapting operators to the framework, see [here](./pytorch_adapter/README.md)
+
+
+## Operator debugging msdebug
+To debug on device with msdebug, some extra adjustments are needed, as follows:
+1. Build stage: modify the ```Makefile``` under the relative path ```sample\normal_sample\vec_only``` as follows:
+
+   Enhance debug information and enlarge the stack space:
+   ```
+   COMPILER_FLAG := -xcce -O2 -std=c++17
+   change to:
+   COMPILER_FLAG := -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true
+   ```
+
+2. Run stage:
+```
+msdebug ./*.fatbin
+```
+
+## Memory checking sanitizer
+1. Build stage: add the ```--cce-enable-sanitizer -g``` options during compilation and the ```--cce-enable-sanitizer``` option during linking (already added to the Makefile in the current samples), then run:
+```
+make
+```
+
+2. Run stage:
+```
+mssanitizer ./*.fatbin  # memcheck runs by default
+```
+
+
 ## Operator tuning
 The operator tuning tool supports tuning operators both on device and in simulation; below, the operator in vec_only is used as an example to walk through the tool commands
@@ -84,30 +133,3 @@ source set_env.sh
 └── trace.json          # Pipeline view across all cores of the operator
 ```
 4. For more metrics, see the operator development tool user guide.
-
-## Operator debugging msdebug
-To debug on device with msdebug, some extra adjustments are needed, as follows:
-1. Build stage: modify the ```Makefile``` under the relative path ```sample\normal_sample\vec_only``` as follows:
-   + Enhance debug information and enlarge the stack space:
-   ```
-   COMPILER_FLAG := -xcce -O2 -std=c++17
-   change to:
-   COMPILER_FLAG := -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true
-
-## Memory checking sanitizer
-### The sanitizer_sample directory
-
-This directory holds the sample library for the sanitizer, containing samples for race checking and memory checking.
-
-#### The Racecheck directory
-
-Racecheck holds the race-checking samples.
-
-The raw_error_kernel.cpp file is a sample with a read-then-write race on UB and a write-then-read race on GM.
-
-
-Run stage:
-
-```
-/usr/local/Ascend/ascend-toolkit/latest/tools/mssanitizer/bin/mssanitizer --tool=racecheck ./raw_error.fatbin
-```
\ No newline at end of file
diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md
index e69de29bb..588401bf3 100644
--- a/sample/pytorch_adapter/README.md
+++ b/sample/pytorch_adapter/README.md
@@ -0,0 +1,33 @@
+# Operator framework adaptation notes
+
+## Introduction
+Ascend provides a rich set of ways to hook operators into frameworks; the simplest one is introduced here, and every directory is an independent, ready-to-use project
+
+## Dependencies
+As with the usual pytorch operator integration approach in the industry, hooking an operator into the framework requires a correct pytorch version on the device (we additionally depend on a torch_npu version)
+
+pytorch can be installed via pip; for torch_npu see [here](https://gitee.com/ascend/pytorch/releases) and choose the torch_npu version that matches your pytorch.
+
+## Using the projects
+
+### The jit_compile project
+Run the following command, and the python module is generated and used on the fly at runtime:
+```
+python main.py
+```
+
+### The setuptools project
+The with_setuptools project can be built into an installable wheel package, convenient for deployment across machines.
+
+
+1. Run the following command to build the package (setuptools supports several modes such as build, install, etc., not all shown here):
+```
+python setup.py bdist_wheel  # builds the wheel package into the dist directory
+```
+
+2. Go to the ```dist``` directory and install the package with pip.
+
+3. Run the test script
+```
+python test.py
+```
-- Gitee

From b516e3e632701c9d72eaac729559448dc31e1397 Mon Sep 17 00:00:00 2001
From: binlien
Date: Tue, 5 Mar 2024 00:12:04 +0800
Subject: [PATCH 19/26] .

---
 sample/README.md                 | 2 +-
 sample/pytorch_adapter/README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sample/README.md b/sample/README.md
index 1f02b0609..6bd55a2f8 100644
--- a/sample/README.md
+++ b/sample/README.md
@@ -30,7 +30,7 @@ source set_env.sh
   |- xx               # Other faulty samples
 ```
 
-If you care about adapting operators to the framework, see [here](./pytorch_adapter/README.md)
+If you care about adapting custom operators to the pytorch framework, see [here](./pytorch_adapter/README.md)
 
 
 ## Operator debugging msdebug
diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md
index 588401bf3..4233303b6 100644
--- a/sample/pytorch_adapter/README.md
+++ b/sample/pytorch_adapter/README.md
@@ -1,4 +1,4 @@
-# Operator framework adaptation notes
+# Notes on adapting custom operators to the pytorch framework
 
 ## Introduction
 Ascend provides a rich set of ways to hook operators into frameworks; the simplest one is introduced here, and every directory is an independent, ready-to-use project
-- Gitee
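The jit_compile project described in the README above drives everything from main.py: it first shells out to make for the device-side kernel library, then JIT-builds the adapter. A hedged sketch of what that flow looks like with torch's C++ extension loader; the module name `jit_ext` and the exact linker flags are illustrative assumptions, not taken from the repository:

```python
import os
import subprocess
import torch
import torch_npu
import torch.utils.cpp_extension

CUR_PATH = os.path.abspath(os.path.dirname(__file__))

# Step 1: build the AscendC kernel into a host-loadable shared library.
subprocess.run("make", check=True, cwd=CUR_PATH)

# Step 2: JIT-compile the adapter and link it against the kernel library.
ext = torch.utils.cpp_extension.load(
    name="jit_ext",  # illustrative module name
    sources=[os.path.join(CUR_PATH, "add_adapter.cpp")],
    extra_ldflags=[f"-L{CUR_PATH}", "-lcustom_kernels"],  # assumes libcustom_kernels.so from make
    verbose=True,
)

x = torch.rand(8, 2048, dtype=torch.float16).npu()
y = torch.rand(8, 2048, dtype=torch.float16).npu()
print(ext.my_add(x, y))
```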
From b88c33972dd9d8766c8ec3e0d291222dee07cdde Mon Sep 17 00:00:00 2001
From: lian
Date: Tue, 5 Mar 2024 01:31:32 +0000
Subject: [PATCH 20/26] update sample/pytorch_adapter/jit_compile/main.py.
Signed-off-by: lian
---
 sample/pytorch_adapter/jit_compile/main.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py
index 11f92600d..a0e17fb07 100644
--- a/sample/pytorch_adapter/jit_compile/main.py
+++ b/sample/pytorch_adapter/jit_compile/main.py
@@ -1,15 +1,16 @@
 import os
 import torch
-import torch.utils.cpp_extension
 import torch_npu
+import torch.utils.cpp_extension
+import subprocess
 from torch_npu.testing.testcase import TestCase, run_tests
 
 PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
 CUR_PATH = os.getcwd()
 
 
 def compile_kernels():
-    os.system("make")  # pytorch has no extension for device-side compilation, so make is invoked manually here
+    subprocess.run("make", shell=True)  # pytorch has no extension for Ascend device-side compilation, so make is invoked manually here
 
 
 def compile_host():
-- Gitee

From a64aefb65c188fe92d9254f50033ad203f14e2fe Mon Sep 17 00:00:00 2001
From: lian
Date: Tue, 5 Mar 2024 01:32:08 +0000
Subject: [PATCH 21/26] update sample/pytorch_adapter/with_setuptools/setup.py.
Signed-off-by: lian
---
 sample/pytorch_adapter/with_setuptools/setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py
index 8f5ad6d9f..2c75808fa 100644
--- a/sample/pytorch_adapter/with_setuptools/setup.py
+++ b/sample/pytorch_adapter/with_setuptools/setup.py
@@ -1,14 +1,15 @@
 import os
 import torch
 import torch_npu
+import subprocess
 from setuptools import setup, find_packages
 from torch.utils.cpp_extension import BuildExtension
 from torch_npu.utils.cpp_extension import NpuExtension
 
 PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
 CUR_PATH = os.getcwd()
 
 
 def compile_kernels():
-    os.system("make")  # pytorch has no extension for device-side compilation, so make is invoked manually here
+    subprocess.run("make", shell=True)  # pytorch has no extension for Ascend device-side compilation, so make is invoked manually here
     return "libcustom_kernels.so"  # the name of the library produced by make
-- Gitee

From db49ecd261feb7bc78f24b418a3d88c043607e6e Mon Sep 17 00:00:00 2001
From: lian
Date: Tue, 5 Mar 2024 01:50:47 +0000
Subject: [PATCH 22/26] update sample/pytorch_adapter/README.md.
Signed-off-by: lian
---
 sample/pytorch_adapter/README.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md
index 4233303b6..76a1c55e9 100644
--- a/sample/pytorch_adapter/README.md
+++ b/sample/pytorch_adapter/README.md
@@ -8,6 +8,23 @@
 
 pytorch can be installed via pip; for torch_npu see [here](https://gitee.com/ascend/pytorch/releases) and choose the torch_npu version that matches your pytorch.
 
+## Project layout
+The overall project directory is as follows:
+```
+- pytorch_adapter
+  |- jit_compile        # Just-in-time compiled integration
+    |- add_adapter.cpp  # Adapts the operator to the pytorch framework through the operator's dynamic-library interface
+    |- add_kernel.cpp   # The Ascend operator implementation, exposing a host-side dynamic-library interface
+    |- main.py          # The python entry point that ties the pieces together
+    |- Makefile         # Build script producing the host-side dynamic library of the Ascend operator
+  |- with_setuptools    # Integration via a wheel package
+    |- add_adapter.cpp
+    |- add_kernel.cpp
+    |- Makefile
+    |- setup.py         # setuptools entry point; builds and packages the wheel
+    |- test.py          # Entry point for testing the wheel's functionality
+```
+
 ## Using the projects
 
 ### The jit_compile project
-- Gitee
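Both projects now shell out with `subprocess.run` instead of `os.system`. One thing the patches leave implicit: `subprocess.run` does not raise on a non-zero exit status unless asked to, so a failing make would otherwise go unnoticed. A defensive variant, offered as a suggestion rather than what the repository does:

```python
import subprocess

def run_make(cwd: str) -> None:
    # check=True turns a failing make into a CalledProcessError instead of
    # silently continuing with a stale or missing libcustom_kernels.so.
    result = subprocess.run(["make"], cwd=cwd, capture_output=True, text=True, check=True)
    print(result.stdout)
```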
From fc343410ae0e55ac0fd081617dfac36b6f7576b7 Mon Sep 17 00:00:00 2001
From: lian
Date: Tue, 5 Mar 2024 01:56:00 +0000
Subject: [PATCH 23/26] update sample/pytorch_adapter/README.md.
Signed-off-by: lian
---
 sample/pytorch_adapter/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md
index 76a1c55e9..a2b1ba635 100644
--- a/sample/pytorch_adapter/README.md
+++ b/sample/pytorch_adapter/README.md
@@ -48,3 +48,6 @@
 ```
 python test.py
 ```
+
+## Miscellaneous
+1. The samples here use a static tiling. For dynamic tiling, the Tensor shapes can be analyzed in adapter.cpp to choose a suitable tiling. (This step is mandatory somewhere in the flow, it just may live in different places; in aclnn, for example, it is handled inside the interface implementation. Here, likewise, add_custom_do could be wrapped so that the tiling is built in.)
\ No newline at end of file
-- Gitee

From 90b9a525323ae47e0879c45b354c24d6c514c6db Mon Sep 17 00:00:00 2001
From: lian
Date: Tue, 5 Mar 2024 03:57:26 +0000
Subject: [PATCH 24/26] update sample/pytorch_adapter/with_setuptools/setup.py.
Signed-off-by: lian
---
 sample/pytorch_adapter/with_setuptools/setup.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py
index 2c75808fa..92ab1d3c7 100644
--- a/sample/pytorch_adapter/with_setuptools/setup.py
+++ b/sample/pytorch_adapter/with_setuptools/setup.py
@@ -1,26 +1,26 @@
 import os
+import subprocess
 import torch
 import torch_npu
-import subprocess
 from setuptools import setup, find_packages
 from torch.utils.cpp_extension import BuildExtension
 from torch_npu.utils.cpp_extension import NpuExtension
 
 PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
-CUR_PATH = os.getcwd()
+CUR_PATH = os.path.abspath(os.path.dirname(__file__))
 
 
 def compile_kernels():
-    subprocess.run("make", shell=True)  # pytorch has no extension for Ascend device-side compilation, so make is invoked manually here
+    # pytorch has no extension for Ascend device-side compilation, so make is invoked manually here
+    subprocess.run("make")
     return "libcustom_kernels.so"  # the name of the library produced by make
 
 
 def compile_adapter():
-    exts = []
-    ext1 = NpuExtension(
+    ext = NpuExtension(
         name="ascend_custom_kernels_lib",  # the name of the library to import
         # if other cpp files take part in the build, add them here
-        sources=["./add_adapter.cpp"],
+        sources=[f"{CUR_PATH}/add_adapter.cpp"],
         extra_compile_args=[
             '-I' + os.path.join(os.path.join(os.path.join(os.path.join(
                 PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc"),
         ],
         library_dirs=[f"{CUR_PATH}"],  # paths of libraries the build depends on, like g++'s -L option
         libraries=["custom_kernels"],  # libraries the build depends on, like the -l option
     )
-    exts.append(ext1)
-    return exts
+    return [ext]
 
 
 if __name__ == "__main__":
-- Gitee
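The diff shows only the helpers; for completeness, here is a hedged sketch of how `compile_kernels()` and `compile_adapter()` would plausibly be wired into the `setup()` call that follows in the full file. The package name, version, and `data_files` layout are placeholders, not taken from the repository:

```python
if __name__ == "__main__":
    kernel_so = compile_kernels()  # produces libcustom_kernels.so via make
    setup(
        name="ascend_custom_kernels",           # placeholder package name
        version="0.1",                          # placeholder version
        ext_modules=compile_adapter(),          # the NpuExtension built from add_adapter.cpp
        packages=find_packages(),
        data_files=[(".", [kernel_so])],        # ship the kernel library next to the extension
        cmdclass={"build_ext": BuildExtension}, # torch's builder supplies the extension flags
    )
```

Note also the design choice visible in the diff: anchoring `CUR_PATH` to the file's own directory instead of `os.getcwd()` makes the build independent of where pip or python was invoked from.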
From 509cfdba00bbafbe1b8bee9ff2fff6101bd9ae68 Mon Sep 17 00:00:00 2001
From: lian
Date: Tue, 5 Mar 2024 03:57:44 +0000
Subject: [PATCH 25/26] update sample/pytorch_adapter/jit_compile/main.py.
Signed-off-by: lian
---
 sample/pytorch_adapter/jit_compile/main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py
index a0e17fb07..847a51f1c 100644
--- a/sample/pytorch_adapter/jit_compile/main.py
+++ b/sample/pytorch_adapter/jit_compile/main.py
@@ -1,16 +1,17 @@
 import os
+import subprocess
 import torch
 import torch_npu
 import torch.utils.cpp_extension
-import subprocess
 from torch_npu.testing.testcase import TestCase, run_tests
 
 PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
-CUR_PATH = os.getcwd()
+CUR_PATH = os.path.abspath(os.path.dirname(__file__))
 
 
 def compile_kernels():
-    subprocess.run("make", shell=True)  # pytorch has no extension for Ascend device-side compilation, so make is invoked manually here
+    # pytorch has no extension for Ascend device-side compilation, so make is invoked manually here
+    subprocess.run("make")
 
 
 def compile_host():
-- Gitee

From 4dc673f9acd61a72b26c98cd6d0bf7f5036029ab Mon Sep 17 00:00:00 2001
From: hid86196561
Date: Tue, 5 Mar 2024 15:46:28 +0800
Subject: [PATCH 26/26] wait_time_added

---
 .../comparator/overall_performance_comparator.py |  9 ++-
 .../compare_bean/profiling_info.py               |  5 ++
 .../profiling_parser/npu_profiling_parser.py     | 54 +++++++++++++++++++
 3 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py
index bfc631c66..7ad66c0fa 100644
--- a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py
+++ b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py
@@ -37,9 +37,12 @@ class OverallPerformanceComparator(BaseComparator):
             self._headers.append('Mem Usage')
             base_col.append(f'{base_profiling_info.memory_used:.2f}G')
             comp_col.append(f'{comp_profiling_info.memory_used:.2f}G')
-        self._headers.extend(['Uncovered Communication Time'])
-        base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s'])
-        comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s'])
+        self._headers.extend(['Uncovered Communication Time(Wait Time)'])
+        if base_profiling_info.wait_time:
+            base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s({base_profiling_info.wait_time:.3f}s)'])
+        else:
+            base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s( / )'])
+        comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s({comp_profiling_info.wait_time:.3f}s)'])
         if base_profiling_info.sdma_time or comp_profiling_info.sdma_time:
             self._headers.append('SDMA Time(Num)')
             base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})')
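With this change, the overall table folds the wait component into the uncovered-communication cell, falling back to ` / ` when the baseline profile predates wait-time collection. A tiny sketch of the formatting rule in isolation:

```python
from typing import Optional

def format_comm_cell(not_overlapped_s: float, wait_s: Optional[float]) -> str:
    # Mirrors the comparator's cell layout: '<uncovered>s(<wait>s)' or '<uncovered>s( / )'
    if wait_s:
        return f'{not_overlapped_s: .3f}s({wait_s:.3f}s)'
    return f'{not_overlapped_s: .3f}s( / )'

print(format_comm_cell(1.234, 0.567))  # ' 1.234s(0.567s)'
print(format_comm_cell(1.234, None))   # ' 1.234s( / )'
```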
diff --git a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py
index 9184c790b..b100e7ba9 100644
--- a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py
+++ b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py
@@ -18,6 +18,7 @@ class ProfilingInfo:
         self.fa_num_bwd = 0
         self.compute_time = 0.0
         self.communication_not_overlapped = 0.0
+        self.wait_time = 0.0
         self.memory_used = 0.0
         self.e2e_time = 0.0
         self.sdma_time = 0.0
@@ -33,6 +34,7 @@ class ProfilingInfo:
         self.vec_time = self.vec_time / 10 ** 6
         self.compute_time = self.compute_time / 10 ** 6
         self.communication_not_overlapped = self.communication_not_overlapped / 10 ** 6
+        self.wait_time = self.wait_time / 10 ** 6
         self.e2e_time = self.e2e_time / 10 ** 6
         self.sdma_time = self.sdma_time / 10 ** 6
         self.scheduling_time = self.scheduling_time / 10 ** 6
@@ -84,6 +86,9 @@ class ProfilingInfo:
     def update_comm_not_overlap(self, time: float):
         self.communication_not_overlapped += time
 
+    def update_comm_not_overlap_wait_time(self, time: float):
+        self.wait_time = time
+
     def set_memory_used(self, memory: float):
         self.memory_used = memory
 
diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py
index f872e52a5..dfc7d7d43 100644
--- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py
+++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py
@@ -48,6 +48,7 @@ class NPUProfilingParser(BaseProfilingParser):
         if self._enable_profiling_compare:
             func_list.add(self._picking_overlap_analysis_data)
             func_list.add(self._picking_kernel_event)
+            func_list.add(self._picking_hccl_event)
         return list(func_list)
 
     def _update_memory_list(self):
@@ -98,10 +99,63 @@
         self.__parse_kernel_csv()
         self.__add_sdma_time()
         self.__add_overlap_analysis_time()
+        self._picking_notify_wait_event_and_not_overlap_event()
+        self.__add_overlap_wait_time()
         self._result_data.overall_metrics.calculate_other_time()
         self._result_data.overall_metrics.calculate_schedule_time()
         self._result_data.overall_metrics.trans_time_to_s()
 
+    def _picking_notify_wait_event_and_not_overlap_event(self):
+        self.notify_event_cache = []
+        self._not_overlaped_commu_event = []
+        for event in self._commu_task_list:
+            if event.name == 'Notify_Wait' and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_CHECK':
+                self.notify_event_cache.append(event)
+        for event in self._overlap_analysis:
+            if event.is_comm_not_overlap():
+                self._not_overlaped_commu_event.append(event)
+
+    def __add_overlap_wait_time(self):
+        notify_wait_event_dict = dict()
+        for notify_event in self.notify_event_cache:
+            if notify_event.tid in notify_wait_event_dict:
+                notify_wait_event_dict[notify_event.tid].append(notify_event)
+            else:
+                notify_wait_event_dict[notify_event.tid] = [notify_event]
+        total_time = 0
+        for commu_event in self._not_overlaped_commu_event:
+            commu_event_start_time = float(commu_event.start_time)
+            commu_event_end_time = float(commu_event.end_time)
+            wait_time_list = []
+
+            for plane_id, events in notify_wait_event_dict.items():
+                wait_time = 0
+                idx = 0
+                for notify_event in events:
+                    notify_event_start_time = float(notify_event.start_time)
+                    notify_event_end_time = float(notify_event.end_time)
+                    if notify_event_start_time < commu_event_start_time and notify_event_end_time > commu_event_end_time:
+                        wait_time = commu_event_end_time - commu_event_start_time
+                        break
+                    elif notify_event_start_time < commu_event_start_time and commu_event_start_time <= notify_event_end_time <= commu_event_end_time:
+                        wait_time += notify_event_end_time - commu_event_start_time
+                        idx += 1
+                    elif commu_event_start_time <= notify_event_start_time <= commu_event_end_time and notify_event_end_time > commu_event_end_time:
+                        wait_time += commu_event_end_time - notify_event_start_time
+                        break
+                    elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= commu_event_end_time:
+                        wait_time += notify_event_end_time - notify_event_start_time
+                        idx += 1
+                    elif notify_event_end_time < commu_event_start_time:
+                        idx += 1
+                    else:
+                        break
+
+                wait_time_list.append(wait_time)
+                notify_wait_event_dict[plane_id] = notify_wait_event_dict[plane_id][idx:]
+            total_time += max(wait_time_list) if wait_time_list else 0
+        self._result_data.overall_metrics.update_comm_not_overlap_wait_time(total_time)
+
     def _picking_hccl_event(self, event: TraceEventBean):
         if event.pid != self._hccl_pid or not event.is_x_mode():
             return False
-- Gitee
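The branch ladder in `__add_overlap_wait_time` enumerates the ways a Notify_Wait interval can sit relative to an uncovered-communication interval; apart from the `idx` pruning and the early `break`s, the accumulated time is just the length of the intersection of the two intervals, with the largest per-plane total taken via `max(wait_time_list)` for each communication event. A condensed equivalent of that core computation, useful as a mental model rather than a drop-in replacement:

```python
def interval_overlap(notify_start: float, notify_end: float,
                     commu_start: float, commu_end: float) -> float:
    # Length of the intersection of [notify_start, notify_end] and
    # [commu_start, commu_end]; 0.0 when the intervals do not overlap.
    return max(0.0, min(notify_end, commu_end) - max(notify_start, commu_start))

assert interval_overlap(0, 10, 2, 5) == 3  # notify spans the whole commu window
assert interval_overlap(3, 4, 2, 5) == 1   # notify fully inside the commu window
assert interval_overlap(0, 1, 2, 5) == 0   # disjoint intervals contribute nothing
```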