diff --git a/CMakeLists.txt b/CMakeLists.txt
index f58f410ede3f17598fc63c0f05d612c9983b33ae..367e0be68283fb4b8f0393cd223980f371557fc2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,19 +5,6 @@ include(cmake/config.cmake)
 include(cmake/func.cmake)
 include(cmake/intf.cmake)
 
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/bind)
-set(MX_DRIVING_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mx_driving)
-add_subdirectory(${MX_DRIVING_DIR}/common)
-add_subdirectory(${MX_DRIVING_DIR}/preprocess)
-add_subdirectory(${MX_DRIVING_DIR}/fused)
-add_subdirectory(${MX_DRIVING_DIR}/point)
-add_subdirectory(${MX_DRIVING_DIR}/detection)
-add_subdirectory(${MX_DRIVING_DIR}/spconv)
-
-if(BUILD_STAGE EQUAL 0)
-  include(cmake/stage_0.cmake)
-elseif(BUILD_STAGE EQUAL 1)
-  include(cmake/stage_1.cmake)
-elseif(BUILD_STAGE EQUAL 2)
-  include(cmake/stage_2.cmake)
-endif()
+add_subdirectory(${PROJECT_SOURCE_DIR}/kernels)
+add_subdirectory(${PROJECT_SOURCE_DIR}/onnx_plugin)
+add_subdirectory(${PROJECT_SOURCE_DIR}/mx_driving/csrc)
diff --git a/Third_Party_Open_Source__Software_Notice b/Third_Party_Open_Source_Software_Notice
similarity index 100%
rename from Third_Party_Open_Source__Software_Notice
rename to Third_Party_Open_Source_Software_Notice
diff --git a/bind/CMakeLists.txt b/bind/CMakeLists.txt
deleted file mode 100644
index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000
--- a/bind/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
-     ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
-set(ASCEND_CSRC_SRC
-    ${ASCEND_CSRC_SRC} ${CSRC_SRC}
-    CACHE INTERNAL "")
diff --git a/bind/pybind.cpp b/bind/pybind.cpp
deleted file mode 100644
index a227bee8f5c9b147bf7c4bdcfd3c335367f36509..0000000000000000000000000000000000000000
--- a/bind/pybind.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "csrc/pybind.h"
-#include <torch/extension.h>
-
-#include <mutex>
-#include <string>
-
-std::string g_opApiSoPath;
-std::once_flag init_flag; // Flag for one-time initialization
-
-void init_op_api_so_path(const std::string& path)
-{
-    std::call_once(init_flag, [&]() { g_opApiSoPath = path; });
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("_init_op_api_so_path", &init_op_api_so_path);
-    init_common(m);
-    init_fused(m);
-    init_point(m);
-    init_preprocess(m);
-    init_detection(m);
-    init_spconv(m);
-}
diff --git a/cmake/config.cmake b/cmake/config.cmake
index cfb478e0713248f27c00e6c5f440ae2e4a90df9b..1c3690727932ee4b6b3323971448e1f7f402b7db 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -58,40 +58,8 @@ execute_process(
 set(ASCEND_TENSOR_COMPILER_PATH ${ASCEND_CANN_PACKAGE_PATH}/compiler)
 set(ASCEND_CCEC_COMPILER_PATH ${ASCEND_TENSOR_COMPILER_PATH}/ccec_compiler/bin)
 set(ASCEND_AUTOGEN_PATH ${CMAKE_BINARY_DIR}/autogen)
-set(ASCEND_KERNEL_PATH ${CMAKE_BINARY_DIR}/kernels)
-
-unset(ASCEND_CSRC_SRC CACHE)
-unset(ASCEND_HOST_SRC CACHE)
-unset(ASCEND_KERNEL_SRC CACHE)
-unset(ACLNN_SRC_CUSTOM CACHE)
-unset(ACLNN_INC_CUSTOM CACHE)
-unset(aclop_exclude CACHE)
-unset(ASCEND_ONNX_SRC CACHE)
-set(ASCEND_CSRC_SRC
-  ""
-  CACHE STRING "csrc source files")
-set(ASCEND_HOST_SRC
-  ""
-  CACHE STRING "host source files")
-set(ASCEND_KERNEL_SRC
-  ""
-  CACHE STRING "kernel source files")
-set(ACLNN_SRC_CUSTOM
-  ""
-  CACHE STRING "aclnn source files")
-set(ACLNN_INC_CUSTOM
-  ""
-  CACHE STRING "aclnn include files")
-set(aclop_exclude
-  ""
-  CACHE STRING "aclop exclude files")
-set(ASCEND_ONNX_SRC
-  ""
-  CACHE STRING "onnx source files")
-
 set(ASCEND_FRAMEWORK_TYPE tensorflow)
 file(MAKE_DIRECTORY ${ASCEND_AUTOGEN_PATH})
-file(MAKE_DIRECTORY ${ASCEND_KERNEL_PATH})
 set(CUSTOM_COMPILE_OPTIONS "custom_compile_options.ini")
 execute_process(COMMAND rm -rf ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS}
                 COMMAND touch ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS})
diff --git a/cmake/stage_0.cmake b/cmake/stage_0.cmake
deleted file mode 100644
index edac7db17d85196fef6851eca2e120e161e8b0eb..0000000000000000000000000000000000000000
--- a/cmake/stage_0.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-add_library(ascend_all_ops SHARED ${ASCEND_HOST_SRC})
-target_compile_options(ascend_all_ops PRIVATE -g -fPIC -std=c++11
-                       -D_GLIBCXX_USE_CXX11_ABI=0)
-target_include_directories(ascend_all_ops PRIVATE ${CANN_INCLUDE_PATH})
-target_link_libraries(ascend_all_ops PRIVATE intf_pub exe_graph register
-                      tiling_api ascendcl)
-add_custom_command(
-  TARGET ascend_all_ops
-  POST_BUILD
-  COMMAND ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build
-          $<TARGET_FILE:ascend_all_ops> ${ASCEND_AUTOGEN_PATH})
diff --git a/cmake/stage_1.cmake b/cmake/stage_1.cmake
deleted file mode 100644
index 502263afe6c29b90cde3bc42c2a988bdc6a866d8..0000000000000000000000000000000000000000
--- a/cmake/stage_1.cmake
+++ /dev/null
@@ -1,214 +0,0 @@
-# ===================Build proto ===================
-add_library(cust_op_proto SHARED ${ASCEND_AUTOGEN_PATH}/op_proto.cc)
-target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB)
-target_compile_options(cust_op_proto PRIVATE -fvisibility=hidden)
-target_link_libraries(
-  cust_op_proto
-  PRIVATE intf_pub
-          exe_graph
-          register
-          tiling_api
-          ascendcl
-          -Wl,--whole-archive
-          rt2_registry
-          -Wl,--no-whole-archive)
-set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME cust_opsproto_rt2.0)
-install_target(
-  TRG cust_op_proto DST
-  packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR})
-install_file(TRG cust_op_proto SRC ${ASCEND_AUTOGEN_PATH}/op_proto.h DST
-             packages/vendors/${vendor_name}/op_proto/inc)
-
-add_library(cust_optiling SHARED ${ASCEND_HOST_SRC})
-target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB)
-target_compile_options(cust_optiling PRIVATE -fvisibility=hidden)
-target_link_libraries(
-  cust_optiling
-  PRIVATE intf_pub
-          exe_graph
-          register
-          tiling_api
-          ascendcl
-          -Wl,--whole-archive
-          rt2_registry
-          -Wl,--no-whole-archive)
-set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME cust_opmaster_rt2.0)
-install_target(
-  TRG
-  cust_optiling
-  DST
-  packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}
-)
-# create liboptiling.so link
-add_custom_command(
-  TARGET cust_optiling
-  POST_BUILD
-  COMMAND
-    ${CMAKE_COMMAND} -E chdir
-    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling
-    ${CMAKE_COMMAND} -E create_symlink
-    lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$<TARGET_FILE_NAME:cust_optiling>
-    liboptiling.so)
-install(
-  FILES
-    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/liboptiling.so
-  DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling)
-
-if(${ENABLE_ONNX})
-  if(CANN_PATHS)
-    if(${ARCH} STREQUAL "aarch64")
-      protobuf_generate(
-        PROTO_FILE ${CANN_PATHS}/aarch64-linux/include/proto/ge_onnx.proto
-        OUT_DIR ${ASCEND_AUTOGEN_PATH})
-    else()
-      protobuf_generate(
-        PROTO_FILE ${CANN_PATHS}/x86_64-linux/include/proto/ge_onnx.proto
-        OUT_DIR ${ASCEND_AUTOGEN_PATH})
-    endif()
-  else()
-    protobuf_generate(
-      PROTO_FILE ${ASCEND_CANN_PACKAGE_PATH}/include/proto/ge_onnx.proto
-      OUT_DIR ${ASCEND_AUTOGEN_PATH})
-  endif()
-
-  add_library(cust_onnx_parsers SHARED ${ASCEND_ONNX_SRC})
-  target_compile_options(
-    cust_onnx_parsers
-    PRIVATE -O2 -Werror -Wno-deprecated-declarations -Dgoogle=ascend_private
-    "-fno-common" "-fno-strict-aliasing")
-  target_link_libraries(cust_onnx_parsers PRIVATE intf_pub)
-  target_include_directories(
-    cust_onnx_parsers PRIVATE ${PROJECT_SOURCE_DIR}/include
-    ${ASCEND_AUTOGEN_PATH})
-
-  install_target(TRG cust_onnx_parsers DST
-                 packages/vendors/${vendor_name}/framework/onnx/)
-endif()
-
-# ===================Build ACLNN===================
-file(GLOB ACLNN_SRC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp)
-file(GLOB ACLNN_INC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.h)
-set(ACLNN_SRC ${ACLNN_SRC_GEN} ${ACLNN_SRC_CUSTOM})
-set(ACLNN_INC ${ACLNN_INC_GEN} ${ACLNN_INC_CUSTOM})
-add_library(cust_opapi SHARED ${ACLNN_SRC})
-target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase opapi)
-install_target(TRG cust_opapi DST packages/vendors/${vendor_name}/op_api/lib)
-install_file(TRG cust_opapi SRC ${ACLNN_INC} DST
-             packages/vendors/${vendor_name}/op_api/include)
-
-# ===================Build Kernel===================
-# set custom compile options
-if("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
-  add_ops_compile_options(ALL OPTIONS -g -O0)
-endif()
-
-file(COPY ${ASCEND_KERNEL_SRC} DESTINATION ${ASCEND_KERNEL_PATH})
-
-foreach(compute_unit ${ASCEND_COMPUTE_UNIT})
-  if(EXISTS ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini)
-    # generate aic-${compute_unit}-ops-info.json
-    add_ops_info_target(
-      TARGET
-      ops_info_gen_${compute_unit}
-      OUTPUT
-      ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}/aic-${compute_unit}-ops-info.json
-      OPS_INFO
-      ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
-      INSTALL_DIR
-      packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}
-    )
-
-    # generate ascendc impl py once
-    if(NOT TARGET ascendc_impl_gen)
-      add_ops_impl_target(
-        TARGET
-        ascendc_impl_gen
-        OPS_INFO
-        ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
-        IMPL_DIR
-        ${ASCEND_KERNEL_PATH}
-        OUT_DIR
-        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl
-      )
-      install_file(
-        TRG
-        ascendc_impl_gen
-        SRC
-        ${ASCEND_KERNEL_SRC}
-        DST
-        packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic
-      )
-    endif()
-
-    # dynamic shape binary compile
-    if(${ENABLE_BINARY_PACKAGE})
-      add_bin_compile_target(
-        TARGET
-        ascendc_bin_${compute_unit}
-        OPS_INFO
-        ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
-        IMPL_DIR
-        ${ASCEND_KERNEL_PATH}
-        ADP_DIR
-        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic
-        OUT_DIR
-        ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}
-        KERNEL_DIR
-        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
-        INSTALL_DIR
-        packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
-        COMPUTE_UNIT
-        ${compute_unit})
-      add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen
-                       cust_optiling)
-    endif()
-  endif()
-endforeach()
-
-# generate npu_supported_ops.json
-add_npu_support_target(
-  TARGET
-  npu_supported_ops
-  OPS_INFO_DIR
-  ${ASCEND_AUTOGEN_PATH}
-  OUT_DIR
-  ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_info_cfg/ai_core
-  INSTALL_DIR
-  packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE})
-
-# ===================Build test===================
-# WARN: WIP
-if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases)
-  add_subdirectory(testcases)
-endif()
-
-get_system_info(SYSTEM_INFO)
-
-# gen version.info
-add_custom_target(
-  gen_version_info ALL
-  COMMAND
-    bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh
-    ${ASCEND_CANN_PACKAGE_PATH}
-    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name})
-
-install(FILES ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/version.info
-        DESTINATION packages/vendors/${vendor_name})
-
-if(COMPILE_OPP_PACKAGE)
-  # CPack config
-  set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME})
-  set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION})
-  set(CPACK_PACKAGE_DESCRIPTION "CPack opp project")
-  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project")
-  set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX})
-  set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run")
-  set(CPACK_GENERATOR External)
-  set(CPACK_CMAKE_GENERATOR "Unix Makefiles")
-  set(CPACK_EXTERNAL_ENABLE_STAGING TRUE)
-  set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake)
-  set(CPACK_EXTERNAL_BUILT_PACKAGES
-      ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}
-  )
-  include(CPack)
-endif()
diff --git a/cmake/stage_2.cmake b/cmake/stage_2.cmake
deleted file mode 100644
index 8d6da51b64b235ad2b129e3d5a37c9d16704ffd1..0000000000000000000000000000000000000000
--- a/cmake/stage_2.cmake
+++ /dev/null
@@ -1,59 +0,0 @@
-set(Python3_USE_STATIC_LIBS FALSE)
-find_package(Python3 COMPONENTS Interpreter Development)
-
-execute_process(
-  COMMAND ${Python3_EXECUTABLE} -c
-          "import os; import torch; print(os.path.dirname(torch.__file__))"
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE TORCH_PATH)
-execute_process(
-  COMMAND
-    ${Python3_EXECUTABLE} -c
-    "import os; import site; print(site.getsitepackages()[0] + '/torch_npu')"
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE TORCH_NPU_PATH)
-message("TORCH_PATH is ${TORCH_PATH}")
-message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
-
-set(EXT_CXX_FLAGS "${EXT_CXX_FLAGS}")
-separate_arguments(EXT_CXX_FLAGS)
-add_library(_C SHARED ${ASCEND_CSRC_SRC})
-set_target_properties(
-  _C
-  PROPERTIES OUTPUT_NAME "${MX_DRIVING_PATH}/_C.${Python3_SOABI}"
-             PREFIX ""
-             SUFFIX ".so")
-
-if(${COMPILE_WITH_XLA})
-  target_compile_definitions(_C PRIVATE COMPILE_WITH_XLA)
-endif()
-target_include_directories(
-  _C
-  PRIVATE ${Python3_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/include
-          ${TORCH_NPU_PATH}/include ${TORCH_PATH}/include
-          ${TORCH_PATH}/include/torch/csrc/api/include)
-target_compile_options(
-  _C
-  PRIVATE -fprofile-arcs
-          -ftest-coverage
-          -fPIC
-          $<$<CONFIG:Release>:-O3>
-          $<$<CONFIG:Debug>:-O0
-          -g>
-          -fstack-protector-all
-          -DTORCH_API_INCLUDE_EXTENSION_H
-          -DTORCH_EXTENSION_NAME=_C
-          -D_GLIBCXX_USE_CXX11_ABI=0
-          -D__FILENAME__=__FILE__
-          ${EXT_CXX_FLAGS})
-
-target_link_directories(_C PRIVATE ${TORCH_PATH}/lib ${TORCH_NPU_PATH}/lib)
-target_link_libraries(_C PRIVATE gcov c10 torch torch_python torch_npu)
-target_link_options(
-  _C
-  PRIVATE
-  $<$<STREQUAL:$<TARGET_PROPERTY:TYPE>,EXECUTABLE>:-pie>
-  $<$<CONFIG:Release>:-s>
-  -Wl,-z,relro
-  -Wl,-z,now
-  -Wl,-z,noexecstack)
diff --git a/docs/api/README.md b/docs/api/README.md
index 2986bfbec667653aac35e60d9a0cc9aaa6fc8dd2..50478d6208642c5d9767b985f42f04c78cd312e7 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -1,1455 +1,10 @@
-> Note: 以prototype标注的接口,表示该接口为预发布接口,可能会有变动,不建议在生产环境中使用。
-# 通用算子
-## scatter_max
-### 接口原型
-```python
-mx_driving.common.scatter_max(Tensor updates, Tensor indices, Tensor out=None) -> (Tensor out, Tensor argmax)
-```
-### 功能描述
-在第0维上,将输入张量`updates`中的元素按照`indices`中的索引进行分散,然后在第0维上取最大值,返回最大值和对应的索引。对于1维张量,公式如下:
-$$out_i = 
max(out_i, max_j(updates_j))$$ -$$argmax_i = argmax_j(updates_j)$$ -这里,$i = indices_j$。 -### 参数说明 -- `updates`:更新源张量,数据类型为`float32`,且 - - `updates`的第0维外其余轴合轴后必须32字节对齐。 -- `indices`:索引张量,数据类型为`int32`,且 - - `indices`的维度必须为`1`, - - `indices`第0维的长度必须与`updates`第0维的长度相同。 - - `indices`的最大值必须小于`491520`。 - - `indices`的取值必须为非负的有效索引值。 -- `out`:被更新张量,数据类型为`float32`,默认为`None`,且 - - `out`的维度必须与`updates`的维度相同。 - - `out`除第0维外其余维的长度必须与`updates`相同。 -### 返回值 -- `out`:更新后的张量,数据类型为`float32`。 -- `argmax`:最大值对应的索引张量,数据类型为`int32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.common import scatter_max -updates = torch.tensor([[2, 0, 1, 3, 1, 0, 0, 4], [0, 2, 1, 3, 0, 3, 4, 2], [1, 2, 3, 4, 4, 3, 2, 1]], dtype=torch.float32).npu() -indices = torch.tensor([0, 2, 0], dtype=torch.int32).npu() -out = updates.new_zeros((3, 8)) -out, argmax = scatter_max(updates, indices, out) -``` -## knn -### 接口原型 -```python -mx_driving.common.knn(int k, Tensor xyz, Tensor center_xyz, bool Transposed) -> Tensor -``` -### 功能描述 -对center_xyz中的每个点找到xyz中对应batch中的距离最近的k个点,并且返回此k个点的索引值。 -### 参数说明 -- `xyz(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32`。shape为`[B, N, 3]`(当Transposed=False)或`[B, 3, N]`(当Transposed=True)。其中`B`为batch size,`N`为点的数量。 -- `center_xyz(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32`。shape为`[B, npoint, 3]`(当Transposed=False)或`[B, 3, npoint]`(当Transposed=True)。其中`B`为batch size,`npoint`为点的数量。 -- `k(int)`:采样点的数量。 -- `Transposed(bool)`: 输入是否需要进行转置 -### 返回值 -- `idx(Tensor)`:采样后的索引数据,数据类型为`int32`。shape为`[B, k, npoint]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.common import knn -xyz = torch.tensor([[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]], dtype=torch.float32).npu() -center_xyz = torch.tensor([[[1, 2, 3]], [[1, 2, 3]]], dtype=torch.float32).npu() -idx = knn(2, xyz, center_xyz, False) -``` -### 算子约束 -1. k必须>0且<100。 -2. xyz中的每个batch中的任意一个点到center_xyz对应batch中的任意一个点的距离必须在1e10f以内。 -3. xyz和center_xyz的shape必须是3维,当Transposed=True时,xyz和center_xyz的shape的dim的第1维必须是3;当Transposed=False时,xyz和center_xyz的shape的dim的第2维必须是3。 -4. 
由于距离相同时排序为不稳定排序,存在距离精度通过但索引精度错误问题,与竞品无法完全对齐。 - -## scatter_mean -### 接口原型 -```python -mx_driving.common.scatter_mean(Tensor src, Tensor indices, int dim=0, Tensor out=None, int dim_size=None) -> Tensor -``` -### 功能描述 -将输入张量`src`中的元素按照`indices`中的索引在指定的`dim`维进行分组,并计算每组的平均值,返回平均值。 -### 参数说明 -- `src`:源张量,数据类型为`float32`。 -- `indices`:索引张量,数据类型为`int32`。 -- `out`:被更新张量,数据类型为`float32`,可选入参,默认为`None`,输入`out`不为`None`时,`out`中的元素参与平均值的计算。 -- `dim`:指定的维度,表示按照哪个维度进行分组平均计算,数据类型为`int32`,可选入参,默认取值为`0`。 -- `dim_size`:输出张量在`dim`维的长度,数据类型为`int32`,可选入参,默认为`None`,该参数仅在输入`out`为`None`时生效。 -### 返回值 -- `out`:求平均后的张量,数据类型为`float32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 - -```python -import torch, torch_npu -from mx_driving.common import scatter_mean -src = torch.randn(4, 5, 6).to(torch.float) -indices = torch.randint(5, (4, 5)).to(torch.int32) -dim = 0 -src.requires_grad = True -out = scatter_mean(src.npu(), indices.npu(), None, dim) -grad_out_tensor = torch.ones_like(out) -out.backward(grad_out_tensor) -``` -### 算子约束 -- `indices`的维度必须小于等于`src`的维度,且每一维的长度均必须与`src`长度相同。 -- `indices`的取值必须为非负的有效索引值,参数`out`或`data_size`不为`None`时,`indices`的取值应该为输出张量在`dim`维的有效索引值。 -- `out`的维度必须与`src`的维度相同,且除第`dim`维外其余维的长度必须与`src`相同。 -- `dim`取值不能超过`indices`的维度。 -- `dim_size`的取值必须为非负的有效长度值。 -- `src`和`out`不支持`inf`、`-inf`和`nan`。 -### 其他说明 -- 该算子对尾块较大的场景较为亲和,对尾块很小的场景不亲和,其中,尾块表示`src`后`N`维的大小,`N = src.dim() - indices.dim()`。 - -## three_interpolate -### 接口原型 -```python -mx_driving.common.three_interpolate(features: torch.Tensor, indices: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: -``` -### 功能描述 -对三维数据进行加权最近邻线性插值处理 -### 参数说明 -- `features`:需要被插值的特征,数据类型为`float32|float16`,维度为(B, C, M)。 -- `indices`:获取目标特征计算的索引,数据类型为`int32`,维度为(B, N, 3), - - `indices`的元素值需小于`features`的第三维度,即值在[0, M)。 -- `weight`:获取目标特征计算的权重,数据类型为`float32|float16`,维度为(B, N, 3)。 - - `weight`数据类型与`features`须一致。 -- `features`,`indices`,`weights`三个参数的每个维度须小于10000。 -- `features`,`indices`,`weights`三个参数的大小请勿超过2^24。 -### 返回值 -- `output`:目标特征张量,数据类型为`float32|float16`,维度为(B, C, N)。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -from mx_driving.common import three_interpolate - - -features = torch.tensor( - [[[2.4350, 4.7516, 4.4995, 2.4350, 2.4350, 2.4350], - [3.1236, 2.6278, 3.0447, 3.1236, 3.1236, 3.1236], - [2.6732, 2.8677, 2.6436, 2.6732, 2.6732, 2.6732], - [0.0124, 7.0150, 7.0199, 0.0124, 0.0124, 0.0124], - [0.3207, 0.0000, 0.3411, 0.3207, 0.3207, 0.3207]], - [[0.0000, 0.9544, 2.4532, 0.0000, 0.0000, 0.0000], - [0.5346, 1.9176, 1.4715, 0.5346, 0.5346, 0.5346], - [0.0000, 0.2744, 2.0842, 0.0000, 0.0000, 0.0000], - [0.3414, 1.5063, 1.6209, 0.3414, 0.3414, 0.3414], - [0.5814, 0.0103, 0.0000, 0.5814, 0.5814, 0.5814]]], - ).npu() -idx = torch.tensor( - [[[0, 1, 2], [2, 3, 4], [2, 3, 4], [0, 1, 2], [0, 1, 2], [0, 1, 3]], - [[0, 2, 3], [1, 3, 4], [2, 1, 4], [0, 2, 4], [0, 2, 4], [0, 1, 2]]], - ).int().npu() -weight = torch.tensor( - [[[3.3333e-01, 3.3333e-01, 3.3333e-01], - [1.0000e+00, 5.8155e-08, 2.2373e-08], - [1.0000e+00, 1.7737e-08, 1.7356e-08], - [3.3333e-01, 3.3333e-01, 3.3333e-01], - [3.3333e-01, 3.3333e-01, 3.3333e-01], - [3.3333e-01, 3.3333e-01, 3.3333e-01]], - [[3.3333e-01, 3.3333e-01, 3.3333e-01], - [1.0000e+00, 1.3651e-08, 7.7312e-09], - [1.0000e+00, 1.7148e-08, 1.4070e-08], - [3.3333e-01, 3.3333e-01, 3.3333e-01], - [3.3333e-01, 3.3333e-01, 3.3333e-01], - [3.3333e-01, 3.3333e-01, 3.3333e-01]]], - ).npu() -output = three_interpolate(features, idx, weight) -``` - - -## three_nn -### 接口原型 -```python -mx_driving.common.three_nn(Tensor target, Tensor 
source) -> (Tensor dist, Tensor idx) -``` -### 功能描述 -对target中的每个点找到source中对应batch中的距离最近的3个点,并且返回此3个点的距离和索引值。 -### 参数说明 -- `target(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32/float16`。shape为`[B, npoint, 3]`。其中`B`为batch size,`npoint`为点的数量。 -- `source(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32/float16`。shape为`[B, N, 3]`。其中`B`为batch size,`N`为点的数量。 -### 返回值 -- `dist(Tensor)`:采样后的索引数据,数据类型为`float32/float16`。shape为`[B, npoint, 3]`。 -- `idx(Tensor)`:采样后的索引数据,数据类型为`int32/int32`。shape为`[B, npoint, 3]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.common import three_nn -source = torch.tensor([[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]], dtype=torch.float32).npu() -target = torch.tensor([[[1, 2, 3]], [[1, 2, 3]]], dtype=torch.float32).npu() -dist, idx = three_nn(target, source) -``` -### 算子约束 -1. source和target的shape必须是3维,且source和target的shape的dim的第2维必须是3。 -2. 距离相同时排序为不稳定排序,存在距离精度通过但索引精度错误问题,与竞品无法完全对齐。 - - -## hypot -### 接口原型 -```python -mx_driving.common.hypot(Tensor input, Tensor other) -> Tensor -``` -### 功能描述 -给出直角三角形的两边,返回它的斜边。 -### 参数说明 -- `input(Tensor)`:代表直角三角形第一条直角边的输入张量,数据类型为`float32`。 -- `other(Tensor)`:代表直角三角形第二条直角边的输入张量,数据类型为`float32`。 -### 返回值 -- `Tensor`:经过计算后的直角三角形斜边,数据类型为`float32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.common import hypot -input = torch.tensor([3,3,3], dtype=torch.float32).npu() -other = torch.tensor([4,4,4], dtype=torch.float32).npu() -out = hypot(input, other) # tensor([5.,5.,5.]) -``` -### 算子约束 -1. input和other的shape必须是可广播的。 - - -## assign_score_withk -### 接口原型 -```python -mx_driving.common.assign_score_withk(Tensor scores, Tensor point_features, Tensor center_features, Tensor knn_idx, str aggregate='sum') -> Tensor -``` -### 功能描述 -根据`knn_idx`得到采样点及其邻居点的索引,计算`point_features`和`center_features`的差,并与`scores`相乘后在特征维度进行聚合,返回采样点的特征。 -### 参数说明 -- `scores(Tensor)`:权重矩阵的重要系数,数据类型为`float32`。Shape为`[B, npoint, K, M]`,其中`B`为batch size,`npoint`为采样点的数量,`K`为一个样本点及其邻居点的数量之和,`M`为权重矩阵集合的规模。 -- `point_features(Tensor)`:所有点的特征,数据类型为`float32`。Shape为`[B, N, M, O]`,其中`N`为所有点的数量,`O`为特征数量。 -- `center_features(Tensor)`:所有点的中心特征,数据类型为`float32`。Shape为`[B, N, M, O]`。 -- `knn_idx[Tensor]`:采样点及其邻居点的索引,数据类型为`int64`。Shape为`[B, npoint, K]`。 -- `aggregate`:聚合方式,默认为`sum`,数据类型为`str`。 -### 返回值 -- `output`:聚合后采样点的特征,数据类型为`float32`。Shape为`[B, O, npoint, K]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 - -```python -import torch, torch_npu -from mx_driving.common import assign_score_withk -points = np.random.rand(4, 100, 8, 16).astype(np.float32) -centers = np.random.rand(4, 100, 8, 16).astype(np.float32) -scores = np.random.rand(4, 64, 10, 8).astype(np.float32) -knn_idx = np.random.randint(0, N, size=(4, 64, 10)).astype(np.int64) -output = assign_score_withk(torch.from_numpy(scores).npu(), - torch.from_numpy(points).npu(), - torch.from_numpy(centers).npu(), - torch.from_numpy(knn_idx).npu(), - "sum") -``` -### 算子约束 -- `npoint`和`K`都不大于`N`。 - - -# 数据预处理算子 -## npu_points_in_box -### 接口原型 -```python -mx_driving.preprocess.npu_points_in_box(Tensor boxes, Tensor points) -> Tensor -``` -### 功能描述 -判断点是否在框内。 -### 参数说明 -- `boxes(Tensor)`:框张量,数据类型为`float32`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -- `points(Tensor)`:点张量,数据类型为`float32`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 -### 返回值 -- `boxes_idx_of_points(Tensor)`:点在框内的索引张量,数据类型为`int32`。shape 为`[B, N]`。 -### 约束说明 -- `boxes`和`points`的`B`必须相同,且只能为`1`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, 
torch_npu -from mx_driving.preprocess import npu_points_in_box -boxes = torch.tensor([[[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]]], dtype=torch.float32).npu() -points = torch.tensor([[[1, 2, 3], [3, 4, 5]]], dtype=torch.float32).npu() -out = npu_points_in_box(boxes, points) -``` - -## npu_points_in_box_all -Note: 该接口命名将于2025年改为`points_in_boxes_all`。 -### 接口原型 -```python -mx_driving.preprocess.npu_points_in_box_all(Tensor boxes, Tensor points) -> Tensor -``` -### 功能描述 -判断点是否在框内。 -### 参数说明 -- `boxes(Tensor)`:框张量,数据类型为`float32`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -- `points(Tensor)`:点张量,数据类型为`float32`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 -### 返回值 -- `boxes_idx_of_points(Tensor)`:同一`batch`下,各点是否在各框内的张量,数据类型为`int32`。shape 为`[B, N, M]`。 -### 约束说明 -- `boxes`和`points`的`B`必须相同。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.preprocess import npu_points_in_box_all -boxes = torch.tensor([[[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]]], dtype=torch.float32).npu() -points = torch.tensor([[[1, 2, 5], [3, 4, 8]]], dtype=torch.float32).npu() -out = npu_points_in_box_all(boxes, points) -``` - -## RoipointPool3d -### 接口原型 -```python -mx_driving.preprocess.RoipointPool3d(int num_sampled_points, Tensor points, Tensor point_features, Tensor boxes3d) -> (Tensor pooled_features, Tensor pooled_empty_flag) -``` -### 功能描述 -对每个3D方案的几何特定特征进行编码。 -### 参数说明 -- `num_sampled_points(int)`:特征点的数量,正整数。 -- `points(Tensor)`:点张量,数据类型为`float32, float16`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 -- `point_features(Tensor)`:点特征张量,数据类型为`float32, float16`。shape 为`[B, N, C]`。`C`分别代表`x, y, z`。 -- `boxes3d(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -### 返回值 -- `pooled_features(Tensor)`:点在框内的特征张量,数据类型为`float32, float16`。shape 为`[B, M, num, 3+C]`。 -- `pooled_empty_flag(Tensor)`:所有点不在框内的空标记张量,数据类型为`int32`。shape 为`[B, M]`。 -### 约束说明 -- `points`、`point_features`和`boxes3d`的数据类型必须相同,以及`B`也必须相同。 -- `num_sampled_points`必须小于等于`N`。 -- 数据类型为`float32`时,建议`B`小于100、`N`小于等于2640、`M`小于等于48、`num_sampled_points`小于等于48,个别shape值略微超过建议值无影响,但所有shape值均大于建议值时,算子执行会发生错误。 -- 数据类型为`float16`时,建议`B`小于100、`N`小于等于3360、`M`小于等于60、`num_sampled_points`小于等于60,个别shape值略微超过建议值无影响,但所有shape值均大于建议值时,算子执行会发生错误。 -- `N`/`M`的值越大,性能劣化越严重,建议`N`小于`M`的六百倍,否则性能可能会低于0.1x A100。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.preprocess import RoIPointPool3d -num_sampled_points = 1 -points = torch.tensor([[[1, 2, 3]]], dtype=torch.float).npu() -point_features = points.clone() -boxes3d = torch.tensor([[[1, 2, 3, 4, 5, 6, 1]]], dtype=torch.float).npu() -roipoint_pool3d = RoIPointPool3d(num_sampled_points) -pooled_features, pooled_empty_flag = roipoint_pool3d(points, point_features, boxes3d) -``` - - -# 目标检测算子 -## npu_boxes_overlap_bev -Note: 该接口命名将于2025年改为`boxes_overlap_bev`。 -### 接口原型 -```python -mx_driving.detection.npu_boxes_overlap_bev(Tensor boxes_a, Tensor boxes_b) -> Tensor -``` -### 功能描述 -计算bev视角下中两个边界框的重叠面积。 -### 参数说明 -- `boxes_a (Tensor)`:第一组bounding boxes,数据类型为`float32`。shape为`[M, 5]`。其中`5`分别代表`x1, y1, x2, y2, angle`, `x1, y1, x2, y2`代表box四个顶点的横纵坐标,`angle`代表box的弧度制旋转角。 -- `boxes_b (Tensor)`:第二组bounding boxes,数据类型为`float32`。shape为`[N, 5]`。其中`5`分别代表`x1, y1, x2, y2, angle`, `x1, y1, x2, y2`代表box四个顶点的横纵坐标,`angle`代表box的弧度制旋转角。 -### 返回值 -- `area_overlap(Tensor)`:包含两组bounding boxes交叠面积的张量,数据类型为`float32`。shape为`[M, N]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from 
mx_driving.detection import npu_boxes_overlap_bev -boxes_a = torch.tensor([[0, 0, 2, 2, 0]], dtype=torch.float32).npu() -boxes_b = torch.tensor([[1, 1, 3, 3, 0]], dtype=torch.float32).npu() -area_overlap = npu_boxes_overlap_bev(boxes_a, boxes_b) -``` -## box_iou_quadri -### 接口原型 -```python -mx_driving.detection.box_iou_quadri(Tensor boxes_a, Tensor boxes_b, str mode='iou', bool aligned=False) -> Tensor -``` -### 功能描述 -计算两个边界框的IoU。 -### 参数说明 -- `boxes_a (Tensor)`:第一组bounding boxes,数据类型为`float32`。shape为`[M, 8]`。其中`8`分别代表`x1, y1, x2, y2, x3, y3, x4, y4`, 表示box四个顶点的横纵坐标。 -- `boxes_b (Tensor)`:第二组bounding boxes,数据类型为`float32`。shape为`[N, 8]`。其中`8`分别代表`x1, y1, x2, y2, x3, y3, x4, y4`, 表示box四个顶点的横纵坐标。 -- `mode (str)`:取值为`"iou"`时,计算IoU(intersection over union);取值为`"iof"`时,计算IoF(intersection over foregroud)。 -- `aligned (bool)`:取值为`True`时,只计算配对的box之间的结果;取值为`False`时,计算每对box之间的结果。 -### 返回值 -- `ious(Tensor)`:包含两组bounding boxes的IoU(`mode="iou"`)或IoF(`mode="iof"`)的张量,数据类型为`float32`。shape为`[M]`(`aligned=True`)或`[M, N]`(`aligned=False`)。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.detection import box_iou_quadri -boxes_a = torch.tensor([[7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0]], dtype=torch.float32).npu() -boxes_b = torch.tensor([[7.0, 6.0, 7.0, 8.0, 9.0, 8.0, 9.0, 6.0]], dtype=torch.float32).npu() -ious = box_iou_quadri(boxes_a, boxes_b, mode="iou", aligned=False) -``` -## npu_nms3d -### 接口原型 -```python -mx_driving.detection.npu_nms3d(Tensor boxes, Tensor scores, float: iou_threshold) -> Tensor -``` -### 功能描述 -3D非极大值抑制,在bev视角下剔除多个3d box交并比大于阈值的box。 -### 参数说明 -- `boxes(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[N, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -- `scores(Tensor)`:评分张量,数据类型为`float32, float16`。shape 为`[N]`。 -- `iou_threshold(float)`:IoU阈值。 -### 返回值 -- `Tensor`:NMS后的框张量,数据类型为`int32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.detection import npu_nms3d -boxes = torch.tensor([[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]], dtype=torch.float32).npu() -scores = torch.tensor([1, 2], dtype=torch.float32).npu() -out = npu_nms3d(boxes, scores, 0.5) -``` -## npu_nms3d_normal -### 接口原型 -```python -mx_driving.detection.npu_nms3d_normal(Tensor boxes, Tensor scores, float: iou_threshold) -> Tensor -``` -### 功能描述 -3D非极大值抑制。 -### 参数说明 -- `boxes(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[N, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -- `scores(Tensor)`:评分张量,数据类型为`float32, float16`。shape 为`[N]`。 -- `iou_threshold(float)`:IoU阈值。 -### 返回值 -- `Tensor`:NMS后的框张量,数据类型为`int32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.detection import npu_nms3d_normal -boxes = torch.tensor([[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]], dtype=torch.float32).npu() -scores = torch.tensor([1, 2], dtype=torch.float32).npu() -out = npu_nms3d_normal(boxes, scores, 0.5) -``` -## npu_rotated_iou -### 接口原型 -```python -mx_driving.detection.npu_rotated_iou(Tensor self, Tensor query_boxes, bool trans=False, int mode=0, bool is_cross=True, float v_threshold=0.0, float e_threshold=0.0) -> Tensor -``` -### 功能描述 -计算旋转框的IoU。 -### 参数说明 -- `self(Tensor)`:边界框张量,数据类型为`float32, float16`,形状为`[B, N, 5]`。 -- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, M, 5]`。 -- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`,其中`t`为角度制。 -- `is_cross(bool)`:值为`True`时,则对两组边界框中每个边界框之间进行计算。值为`False`时,只对对齐的边界框之间进行计算。 -- 
`mode(int)`:计算IoU的模式。默认值为`0`。值为`0`时,表示计算`IoU`,值为`1`时,表示计算`IoF`。 -- `v_threshold(float)`:顶点判断的容忍阈值。 -- `e_threshold(float)`:边相交判断的容忍阈值。 -### 返回值 -- `Tensor`:IoU张量,数据类型为`float32, float16`,`is_cross`为`True`时形状为`[B, N, M],反之则为`[B, N]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -import numpy as np -from mx_driving.detection import npu_rotated_iou -a = np.random.uniform(0, 1, (2, 2, 5)).astype(np.float16) -b = np.random.uniform(0, 1, (2, 3, 5)).astype(np.float16) -box1 = torch.from_numpy(a).npu() -box2 = torch.from_numpy(b).npu() -iou = npu_rotated_iou(box1, box2, False, 0, True, 1e-5, 1e-5) -``` -## npu_rotated_overlaps -### 接口原型 -```python -mx_driving.detection.npu_rotated_overlaps(Tensor self, Tensor query_boxes, bool trans=False) -> Tensor -``` -### 功能描述 -计算旋转框的重叠面积。 -### 参数说明 -- `self(Tensor)`:边界框张量,数据类型为`float32, float16`,形状为`[B, N, 5]`。 -- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, M, 5]`。 -- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`。 -### 返回值 -- `Tensor`:重叠面积张量,数据类型为`float32, float16`,形状为`[B, N, M]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -import numpy as np -from mx_driving.detection import npu_rotated_overlaps -a = np.random.uniform(0, 1, (1, 3, 5)).astype(np.float16) -b = np.random.uniform(0, 1, (1, 2, 5)).astype(np.float16) -box1 = torch.from_numpy(a).npu() -box2 = torch.from_numpy(b).npu() -output = npu_rotated_overlaps(box1, box2, True) -``` -## roi_align_rotated[beta] -### 接口原型 -```python -mx_driving.detection.roi_align_rotated(Tensor feature_map, Tensor rois, float: spatial_scale, - int: sampling_ratio, int: pooled_height, int: pooled_width, bool: aligned, bool: clockwise) -> Tensor -``` -### 功能描述 -计算旋转候选框的RoI Align池化特征图。 -### 参数说明 -- `feature map(Tensor)`:特征图张量,数据类型为`float32`,形状为`[B, C, H, W]`。 -- `rois(Tensor)`:感兴趣区域张量,数据类型为`float32`,形状为`[n, 6]`。 -- `spatial_scale(float)`:感兴趣区域边界框的缩放率,数据类型为`float32`。 -- `sampling_ratio(int)`:采样率,数据类型为`int`。取值范围为非负整数。 -- `pooled_height(int)`:池化特征图高度,数据类型为`int`。 -- `pooled_width(int)`:池化特征图宽度,数据类型为`int`。 -- `aligned(bool)`:是否对齐,数据类型为`bool`。值为`True`时,表示对齐, 值为`False`时,表示不对齐。 -- `clockwise(bool)`:旋转候选框的旋转方向,数据类型为`bool`。值为`True`时,表示逆时针旋转,值为`False`时,表示顺时针旋转。 -### 返回值 -- `Tensor`:池化特征图张量,数据类型为`float32`,形状为`[n, C, pooled_height, pooled_width]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import math -import torch, torch_npu -import numpy as np -from mx_driving.detection import roi_align_rotated - -feature_map = torch.rand([1, 3, 16, 16]) -feature_map.requires_grad = True -rois = torch.Tensor(6, 8) -rois[0] = torch.randint(0, 1, (8,)) -rois[1].uniform_(0, 16) -rois[2].uniform_(0, 16) -rois[3].uniform_(0, 16) -rois[4].uniform_(0, 16) -rois[5].uniform_(0, math.pi) - -output = roi_align_rotated(feature_map.npu(), rois.npu(), 1, 1, 7, 7, True, True) -output.backward(torch.ones_like(output)) -``` -### 其他说明 -在双线性插值采样过程中,当采样点`x`接近`-1`或`W`位置,`y`接近`-1`或`H`位置时,由于平台差异和计算误差,可能导致该采样点的精度无法与竞品精度完全对齐。 - -## roiaware_pool3d -### 接口原型 -```python -mx_driving.detection.roiaware_pool3d(Tensor rois, Tensor pts, Tensor pts_feature, - Union[int, tuple] out_size, int max_pts_per_voxel, int mode) -> Tensor -``` -### 功能描述 -将输入的点云特征在ROI框内进行池化 -### 参数说明 -- `rois (Tensor)`:输入的RoI框坐标与尺寸,数据类型为`float32/float16`,shape为`[Roi_num, 7]`。 -- `pts (Tensor)`:输入的点云坐标,数据类型为`float32/float16`,shape为`[Pts_num, 3]`。 -- `pts_feature (Tensor)`:输入的点的特征向量,数据类型为`float32/float16`,shape为`[Pts_num, Channels]`。 -- `out_size 
(Union)`:输出的RoI框内voxel的尺寸,数据类型为`int`或者`tuple`,shape为`[out_x, out_y, out_z]`。 -- `max_pts_per_voxel (int)`:每个voxel内最大的点的个数,数据类型为`int`。 -- `mode (int)`:池化的方式,0为maxpool, 1为avgpool,数据类型为`int`。 -### 返回值 -- `pooled_features (Tensor)`:池化得到的RoI框特征,数据类型为`float32/float16`,shape为`[Roi_num, out_x, out_y, out_z, Channels]`。 -### 约束说明 -- Roi_num <= 100 -- Pts_num <= 1000 -- Channels <= 1024 -- 1 <= max_pts_per_voxel <=256,max_pts_per_voxel <= Pts_num -- 反向具有相同约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import math -import torch_npu -import mx_driving.detection - -out_size = (5, 5, 5) -max_pts_per_voxel = 128 -mode = 1 - -N = 40 -npoints = 1000 -channels = 1024 - -xyz_coor = np.random.uniform(-1, 1, size = (N, 3)).astype(np.float32) -xyz_size_num = np.random.uniform(5, 50, size = (1, 3)) -xyz_size = (xyz_size_num * np.ones((N, 3))).astype(np.float32) -angle = np.radians(np.random.randint(0, 360, size = (N , 1))).astype(np.float32) - -rois = np.concatenate((xyz_coor, xyz_size), axis=1) -rois = np.concatenate((rois, angle), axis=1) - -pts = np.random.uniform(-5, 5, size = (npoints, 3)).astype(np.float32) -pts_feature = np.random.uniform(-1, 1, size=(npoints, channels)).astype(np.float32) - -pooled_features_npu = mx_driving.detection.roiaware_pool3d(torch.tensor(rois).npu(), torch.tensor(pts).npu(), - torch.tensor(pts_feature).npu(), out_size, max_pts_per_voxel, mode) -``` - -## border_align -### 接口原型 -```python -mx_driving.detection.border_align(Tensor feature_map, Tensor rois, int pooled_size) -> Tensor -``` -### 功能描述 -对输入的RoI框进行边缘特征提取。 -### 参数说明 -- `feature_map (Tensor)`:输入的特征图,数据类型为`float32`,shape为`[Batch_size, Channels, Height, Width]`。 -- `rois (Tensor)`:输入的RoI框坐标,数据类型为`int32`,shape为`[Batch_size, Height * Width, 4]`。 -- `pooled_size (int)`:在每条边上的采样点数,数据类型为`int`。 -### 返回值 -- `out_features (Tensor)`:提取到的RoI框特征,数据类型为`float32`,shape为`[Batch_size, Channels / 4, Height * Width, 4]`。 -### 约束说明 -- Batch_size <= 128 -- Channels <= 8192, Channels % 4 == 0 -- Height <= 256, Width <= 256 -- 2 <= pooled_size <= 20 -- 反向具有相同约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import torch_npu -import numpy as np -from mx_driving.detection import border_align - -def generate_features(feature_shape): - features = torch.rand(feature_shape) - return features - -def generate_rois(inputs): - num_boxes = inputs.shape[0] * inputs.shape[2] * inputs.shape[3] - xyxy = torch.rand(num_boxes, 4) - xyxy[:, 0::2] = xyxy[:, 0::2] * inputs.size(3) - xyxy[:, 1::2] = xyxy[:, 1::2] * inputs.size(2) - xyxy[:, 2:] = xyxy[:, 0:2] + xyxy[:, 2:] - rois = xyxy.view(inputs.shape[0], -1, 4).contiguous() - return rois - -batch_size = 2 -input_channels = 16 -input_height = 8 -input_width = 8 -pooled_size = 3 -features = generate_features([batch_size, input_channels, input_height, input_width]) -rois = generate_rois(features) -output = border_align(features.npu(), rois.npu(), pooled_size) -``` - -## pixel_group -### 接口原型 -```python -mx_driving.detection.pixel_group(Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, Tensor kernel_contour, int kernel_region_num, float distance_threshold) -> List[List] -``` -### 功能描述 -根据像素之间的嵌入向量和距离,将未被分组的像素分组。 -### 参数说明 -- `score (Tensor)`:前景得分矩阵,数据类型为`float32`,shape为`[Height, Width]`。 -- `mask (Tensor)`:前景掩码矩阵,数据类型为`bool`,shape为`[Height, Width]`。 -- `embedding (Tensor)`:特征向量,数据类型为`float32`,shape为`[Height, Width, Embedding_dim]`。 -- `kernel_label (Tensor)`:像素的实例标签,数据类型为`int32`,shape为`[Height, Width]`。 -- `kernel_contour 
(Tensor)`:内核的边界像素,数据类型为`uint8`,shape为`[Height, Width]`。 -- `kernel_region_num`:不同内核(分组)的数量,数据类型为`int`。 -- `distance_threshold`:嵌入向量的距离阈值,数据类型为`float`。 -### 返回值 -- `pixel_assignment (List)`:像素的分组信息,数据类型为`float32`,length为入参`kernel_region_num`。 -### 约束说明 -- mask = score > 0.5 -- `score`的取值范围在`[0, 1]`之间 -- `kernel_label`的最大值为`kernel_region_num`-1 -- `kernel_contour`的取值非0即1 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -import numpy as np -from mx_driving.detection import pixel_group -H, W, dim, num = 10, 10, 8, 3 -score = np.random.uniform(0, 1, [H, W]).astype(np.float32) -score = torch.from_numpy(score).npu() -mask = (score) > 0.5 -embedding = np.random.uniform(0, 10, [H, W, dim]).astype(np.float32) -embedding = torch.from_numpy(embedding).npu() -kernel_label = np.random.uniform(0, num, [H, W]).astype(np.int32) -kernel_label = torch.from_numpy(kernel_label).npu() -kernel_contour = np.random.uniform(0, 1, [H, W]).astype(np.uint8) -kernel_contour = torch.from_numpy(kernel_contour).npu() -kernel_region_num = num -distance_threshold = float(0.8) - -output = pixel_group(score, mask, embedding, kernel_label, kernel_contour, kernel_region_num, distance_threshold) -``` - -# 融合算子 -## multi_scale_deformable_attn(MultiScaleDeformableAttnFunction.Apply) -### 接口原型 -```python -mx_driving.fused.multi_scale_deformable_attn(Tensor value, Tensor value_spatial_shapes, Tensor value_level_start_index, Tensor sampling_locations, Tensor attention_weights) -> Tensor -``` -### 功能描述 -多尺度可变形注意力机制, 将多个视角的特征图进行融合。 -### 参数说明 -- `value(Tensor)`:特征张量,数据类型为`float32, float16`。shape为`[bs, num_keys, num_heads, embed_dims]`。其中`bs`为batch size,`num_keys`为特征图的大小,`num_heads`为头的数量,`embed_dims`为特征图的维度,其中`embed_dims`需要为8的倍数。 -- `value_spatial_shapes(Tensor)`:特征图的形状,数据类型为`int32, int64`。shape为`[num_levels, 2]`。其中`num_levels`为特征图的数量,`2`分别代表`H, W`。 -- `value_level_start_index(Tensor)`:偏移量张量,数据类型为`int32, int64`。shape为`[num_levels]`。 -- `sampling_locations(Tensor)`:位置张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads, num_levels, num_points, 2]`。其中`bs`为batch size,`num_queries`为查询的数量,`num_heads`为头的数量,`num_levels`为特征图的数量,`num_points`为采样点的数量,`2`分别代表`y, x`。 -- `attention_weights(Tensor)`:权重张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads, num_levels, num_points]`。其中`bs`为batch size,`num_queries`为查询的数量,`num_heads`为头的数量,`num_levels`为特征图的数量,`num_points`为采样点的数量。 -### 返回值 -- `output(Tensor)`:融合后的特征张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads*embed_dims]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `locations`的值在`[0, 1]`之间。 -- 当前版本只支持`num_keys` ≤ 8,`num_heads` ≤ 8,`embed_dims` == 16或32,`num_points` = 1或偶数。 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.fused import multi_scale_deformable_attn -bs, num_levels, num_heads, num_points, num_queries, embed_dims = 1, 1, 4, 8, 16, 32 - -shapes = torch.as_tensor([(100, 100)], dtype=torch.long) -num_keys = sum((H * W).item() for H, W in shapes) - -value = torch.rand(bs, num_keys, num_heads, embed_dims) * 0.01 -sampling_locations = torch.ones(bs, num_queries, num_heads, num_levels, num_points, 2) * 0.005 -attention_weights = torch.rand(bs, num_queries, num_heads, num_levels, num_points) + 1e-5 -level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) - -out = multi_scale_deformable_attn(value.npu(), shapes.npu(), level_start_index.npu(), sampling_locations.npu(), attention_weights.npu()) -``` - -## npu_max_pool2d -### 接口原型 -```python -mx_driving.fused.npu_max_pool2d(Tensor x, int kernel_size, int 
stride, int padding) -> Tensor -``` -### 功能描述 -对输入进行最大池化,并输出最大池化值。 -### 参数说明 -- `x (Tensor)`:一组待池化对象,数据类型为`float32`,format为NCHW,输入数据量不超过10亿。 -### 返回值 -- `y (Tensor)`:池化后的最大值,数据类型为`float32`,format为NCHW。 -### 约束说明 -kernel_size仅支持3,stride仅支持2,padding仅支持1,且输入C轴数据量要求为8的倍数,H和W需要大于100。 -性能在C值较大的场景下较优,建议使用规格为C>=64。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.fused import npu_max_pool2d -kernel_size = 3 -stride = 2 -padding = 1 -x = torch.randn(18, 64, 464, 800).npu() -res = npu_max_pool2d(x, kernel_size, stride, padding) -``` - -## npu_deformable_aggregation -### 接口原型 -```python -mx_driving.fused.npu_deformable_aggregation(Tensor feature_maps, Tensor spatial_shape, Tensor scale_start_index, Tensor sample_locations, Tensor weight) -> Tensor -``` -### 功能描述 -可变形聚合,对于每个锚点实例,对多个关键点的多时间戳、视图、缩放特征进行稀疏采样后分层融合为实例特征,实现精确的锚点细化。 -### 参数说明 -- `feature_maps(Tensor)`:特征张量,数据类型为`float32`。shape为`[bs, num_feat, c]`。其中`bs`为batch size,`num_feat`为特征图的大小,`c`为特征图的维度。 -- `spatial_shape(Tensor)`:特征图的形状,数据类型为`int32`。shape为`[cam, scale, 2]`。其中`cam`为相机数量,其中`scale`为每个相机的特征图数量,`2`分别代表H, W。 -- `scale_start_index(Tensor)`:每个特征图的偏移位置张量,数据类型为`int32`。shape为`[cam, scale]`,其中`cam`为相机数量,其中`scale`每个相机的特征图数量。 -- `sample_locations(Tensor)`:位置张量,数据类型为`float32`。shape为`[bs, anchor, pts, cam, 2]`。其中`bs`为batch size,`anchor`为锚点数量,`pts`为采样点的数量,`cam`为相机的数量,`2`分别代表y, x。 -- `weight(Tensor)`:权重张量,数据类型为`float32`。shape为`[bs, anchor, pts, cam, scale, group]`。其中`bs`为batch size,`anchor`为锚点数量,`pts`为采样点的数量,`cam`为相机的数量,`scale`每个相机的特征图数量,`group`为分组数。 -### 返回值 -- `output(Tensor)`:输出结果张量,数据类型为`float32`。shape为`[bs, anchor, c]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- bs <= 128 -- num_feat的值为spatial_shape中每幅图的特征数量之和 -- c <= 256,且为group的整数倍 -- cam <= 6 -- scale <= 4 -- anchor <= 2048 -- pts <= 2048 -- group <= 32,且为2的指数倍 -- sample_locations的值在[0, 1]之间。 -- 每个输入tensor的数据量不超过1.5亿。 -- 反向具有相同约束。 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.fused import npu_deformable_aggregation - -bs, num_feat, c, cam, anchor, pts, scale, group = 1, 2816, 256, 1, 10, 2000, 1, 8 - -feature_maps = torch.ones_like(torch.randn(bs,num_feat ,c)).to(torch.float16) -spatial_shape = torch.tensor([[[32, 88]]]) -scale_start_index = torch.tensor([[0]]) -sampling_location = torch.rand(bs, anchor, pts, cam, 2) -weights = torch.randn(bs, anchor, pts, cam, scale, group) - -out = npu_deformable_aggregation(feature_maps.npu(), spatial_shape.npu(), scale_start_index.npu(), sampling_location.npu(), weights.npu()) -``` - -## deform_conv2d(DeformConv2dFunction.apply) -### 接口原型 -```python -mx_driving.fused.deform_conv2d(Tensor x, Tensor offset, Tensor weight, Union[int, Tuple[int, ...]] stride, Union[int, Tuple[int, ...]] padding, Union[int, Tuple[int, ...]] dilation, int groups, int deformable_groups) -> Tensor -``` -### 功能描述 -可变形卷积。 -### 参数说明 -- `x(Tensor)`:输入特征,数据类型为`float32`,shape为`(n, c_in, h_in, w_in)`,其中`n`为 batch size,`c_in`为输入特征的通道数量,`h_in`为输入特征图的高,`w_in`为输入特征图的宽。 -- `offset(Tensor)`:偏移量,数据类型为`float32`,shape 为`(n, 2 * k * k, h_out, w_out)`,其中`n`为 batch size,`k` 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 -- `weight(Tensor)`:卷积核权重,数据类型为`float32`,shape 为 `(c_out, c_in, k, k)`,其中 `c_out` 为输出的通道数,`c_in` 为输入的通道数,`k` 为卷积核大小。 -- `stride(Union)`:卷积步长。 -- `padding(Union)`:卷积的填充大小。 -- `dilation(Union)`:空洞卷积大小。 -- `groups(int)`:分组卷积大小,当前只支持1。 -- `deformable_groups(int)`:将通道分成几组计算offsets,当前只支持1。 -### 返回值 -- `output(Tensor)`:输出张量,数据类型为`float32`,shape 为 `(n, c_out, h_out, w_out)`,其中`n`为 batch size,`c_out`为输出通道,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 
-### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -1. `deformable_groups`和`groups`当前只支持1。 -2. `h_in`,`w_in`,`h_out`,`w_out`需满足 -$$ -w_{out}=(w_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 \\ -h_{out}=(h_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 -$$ -3. `c_in`需要为64的倍数。 -### 调用示例 -```python -import torch -import torch_npu -from mx_driving.fused import deform_conv2d, DeformConv2dFunction - -n, c_in, h_in, w_in = 16, 64, 100, 200 -c_out, k, h_out, w_out = 64, 3, 50, 100 - -x = torch.randn((n, c_in, h_in, w_in)).npu() -offset = torch.randn((n, 2 * k * k, h_out, w_out)).npu() -weight = torch.randn((c_out, c_in, k, k)).npu() -stride = 1 -padding = 1 -dilation = 1 -groups = 1 -deformable_groups = 1 - -output = deform_conv2d(x, offset, weight, stride, padding, dilation, groups, deformable_groups) -output = DeformConv2dFunction.apply(x, offset, weight, stride, padding, dilation, groups, deformable_groups) -``` -## modulated_deform_conv2d(ModulatedDeformConv2dFunction.apply) -### 接口原型 -```python -mx_driving.fused.modulated_deform_conv2d(Tensor x, Tensor offset, Tensor mask, Tensor weight, Tensor bias, Union[int, Tuple[int, ...]] stride, Union[int, Tuple[int, ...]] padding, Union[int, Tuple[int, ...]] dilation, int groups, int deformable_groups) -> Tensor -``` -### 功能描述 -在可变形卷积的基础之上加上了 modulation 机制,通过调控输出特征的幅度,提升可变形卷积的聚焦相关区域的能力。 -### 参数说明 -- `x(Tensor)`:输入特征,数据类型为`float32`,shape为`(n, c_in, h_in, w_in)`,其中`n`为 batch size,`c_in`为输入特征的通道数量,`h_in`为输入特征图的高,`w_in`为输入特征图的宽。 -- `offset(Tensor)`:偏移量,数据类型为`float32`,shape 为`(n, 2 * k * k, h_out, w_out)`,其中`n`为 batch size,`k` 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 -- `mask(Tensor)`:掩码,用于调控输出特征的幅度,数据类型为`float32`,shape 为`(n, k * k, h_out, w_out)`,其中`n`为 batch size,k 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 -- `weight(Tensor)`:卷积核权重,数据类型为`float32`,shape 为 `(c_out, c_in, k, k)`,其中 `c_out` 为输出的通道数,`c_in` 为输入的通道数,`k` 为卷积核大小。 -- `bias(Tensor)`:偏置,暂不支持bias,传入 `None` 即可。 -- `stride(Union)`:卷积步长。 -- `padding(Union)`:卷积的填充大小。 -- `dilation(Union)`:空洞卷积大小。 -- `groups(int)`:分组卷积大小,当前只支持1。 -- `deformable_groups(int)`:将通道分成几组计算offsets,当前只支持1。 -### 返回值 -- `output(Tensor)`:输出张量,数据类型为`float32`,shape 为 `(n, c_out, h_out, w_out)`,其中`n`为 batch size,`c_out`为输出通道,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -1. `deformable_groups`和`groups`当前只支持1。 -2. `h_in`,`w_in`,`h_out`,`w_out`需满足 -$$ -w_{out}=(w_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 \\ -h_{out}=(h_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 -$$ -3. 
`c_in`需要为64的倍数。 -### 调用示例 -```python -import torch -import torch_npu -from mx_driving.fused import modulated_deform_conv2d, ModulatedDeformConv2dFunction - -n, c_in, h_in, w_in = 16, 64, 100, 200 -c_out, k, h_out, w_out = 64, 3, 50, 100 - -x = torch.randn((n, c_in, h_in, w_in)).npu() -offset = torch.randn((n, 2 * k * k, h_out, w_out)).npu() -mask = torch.randn((n, k * k, h_out, w_out)).npu() -weight = torch.randn((c_out, c_in, k, k)).npu() -bias = None -stride = 1 -padding = 1 -dilation = 1 -groups = 1 -deformable_groups = 1 - -output = modulated_deform_conv2d(x, offset, mask, weight, bias, - stride, padding, dilation, groups, deformable_groups) -output = ModulatedDeformConv2dFunction.apply(x, offset, mask, weight, bias, - stride, padding, dilation, groups, deformable_groups) -``` - -# 点云算子 -## bev_pool -### 接口原型 -```python -mx_driving.point.bev_pool(Tensor feat, Tensor geom_feat, int B, int D, int H, int W) -> Tensor -``` -### 功能描述 -BEV池化。可参考论文`BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation` -### 参数说明 -- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[N, C]`。其中`N`为原特征张量拉伸后的数量,`C`为特征的维度。 -- `geom_feat(Tensor)`:输出坐标张量,数据类型为`int32`。shape为`[N, 4]`。其中`4`分别代表`h, w, b, d`。 -- `B(int)`:batch size。 -- `D(int)`:输出池化深度。 -- `H(int)`:输出池化高度。 -- `W(int)`:输出池化宽度。 -### 返回值 -- `bev_pooled_feat(Tensor)`:采样后的点云数据,数据类型为`float32`。shape为`[B, D, H, W, C]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `geom_feat`的4个对应的值必须在`[0, H-1]`, `[0, W-1]`, `[0, B-1]`, `[0, D-1]`之间。 -- `geom_feat`和`feat`的第0维长度必须相同。 -- C <= 1024 -- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 -- 对于反向也是同样的约束。 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import bev_pool -feat = torch.rand(4, 256).npu() -feat.requires_grad_() -geom_feat = torch.tensor([[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 2], [0, 0, 0, 3]], dtype=torch.int32).npu() -bev_pooled_feat = bev_pool(feat, geom_feat, 4, 1, 256, 256) -loss = bev_pooled_feat.sum() -loss.backward() -``` -## bev_pool_v2 -### 接口原型 -```python -mx_driving.point.bev_pool_v2(Tensor depth, Tensor feat, Tensor ranks_depth, Tensor ranks_feat, Tensor ranks_bev, - List[int] bev_feat_shape, Tensor interval_starts, Tensor interval_lengths) -> Tensor -``` -### 功能描述 -BEV池化优化。可参考论文`BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View`。 -### 参数说明 -- `depth(Tensor)`:深度张量,数据类型为`float32`。shape为`[B, N, D, H, W]`。其中`B`为batch size,`N`为特征的数量,`D, H, W`分别代表深度、高度、宽度。 -- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[B, N, H, W, C]`。其中`B`为batch size,`N`为特征的数量,`H, W, C`分别代表高度、宽度、通道数。 -- `ranks_depth(Tensor)`:深度排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `ranks_feat(Tensor)`:特征排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `ranks_bev(Tensor)`:BEV排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `bev_feat_shape(List[int])`:BEV特征形状,数据类型为`int32`。长度为`5`, 分别代表`B, D, H, W, C`。 -- `interval_starts(Tensor)`:间隔开始张量,数据类型为`int32`。shape为`[N_INTERVALS]`。 -- `interval_lengths(Tensor)`:间隔长度张量,数据类型为`int32`。shape为`[N_INTERVALS]`。 -### 返回值 -- `bev_pooled_feat(Tensor)`:BEV池化后的特征张量,数据类型为`float32`。shape为`[B, D, H, W, C]`。 -### 约束说明 -- `ranks_depth`的值必须在`[0, B*B*D*H*W]`之间。 -- `ranks_feat`的值必须在`[0, B*N*H*W]`之间。 -- `ranks_bev`的值必须在`[0, B*D*H*W]`之间。 -- C <= 1024 -- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 -- N_RANKS <= 2^21 -- 对于反向也是同样的约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import bev_pool_v2 -depth = torch.rand(2, 1, 8, 256, 256).npu() -feat = torch.rand(2, 1, 256, 256, 64).npu() -feat.requires_grad_() -ranks_depth = 
torch.tensor([0, 1], dtype=torch.int32).npu() -ranks_feat = torch.tensor([0, 1], dtype=torch.int32).npu() -ranks_bev = torch.tensor([0, 1], dtype=torch.int32).npu() -bev_feat_shape = [2, 8, 256, 256, 64] -interval_starts = torch.tensor([0], dtype=torch.int32).npu() -interval_lengths = torch.tensor([2], dtype=torch.int32).npu() -bev_pooled_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) -loss = bev_pooled_feat.sum() -loss.backward() -``` -## bev_pool_v3 -### 接口原型 -```python -mx_driving.point.bev_pool_v3(Tensor depth, Tensor feat, Tensor ranks_depth, Tensor ranks_feat, Tensor ranks_bev, - List[int] bev_feat_shape) -> Tensor -``` -### 功能描述 -BEV池化优化。`bev_pool_v2`的NPU亲和版本,优先推荐使用。 -### 参数说明 -- `depth(Tensor)`:深度张量,数据类型为`float32`。shape为`[B, N, D, H, W]`。其中`B`为batch size,`N`为特征的数量,`D, H, W`分别代表深度、高度、宽度。 -- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[B, N, H, W, C]`。其中`B`为batch size,`N`为特征的数量,`H, W, C`分别代表高度、宽度、通道数。 -- `ranks_depth(Tensor)`:深度排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `ranks_feat(Tensor)`:特征排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `ranks_bev(Tensor)`:BEV排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `bev_feat_shape(List[int])`:BEV特征形状,数据类型为`int32`。长度为`5`, 分别代表`B, D, H, W, C`。 -### 返回值 -- `bev_pooled_feat(Tensor)`:BEV池化后的特征张量,数据类型为`float32`。shape为`[B, D, H, W, C]`。 -### 约束说明 -- `ranks_depth`的值必须在`[0, B*B*D*H*W]`之间。 -- `ranks_feat`的值必须在`[0, B*N*H*W]`之间。 -- `ranks_bev`的值必须在`[0, B*D*H*W]`之间。 -- C 必须为8的倍数。 -- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 -- 对于反向也是同样的约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import bev_pool_v3 -depth = torch.rand(2, 1, 8, 256, 256).npu() -feat = torch.rand(2, 1, 256, 256, 64).npu() -feat.requires_grad_() -ranks_depth = torch.tensor([0, 1], dtype=torch.int32).npu() -ranks_feat = torch.tensor([0, 1], dtype=torch.int32).npu() -ranks_bev = torch.tensor([0, 1], dtype=torch.int32).npu() -bev_feat_shape = [2, 8, 256, 256, 64] -bev_pooled_feat = bev_pool_v3(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape) -loss = bev_pooled_feat.sum() -loss.backward() -``` -## furthest_point_sample_with_dist -### 接口原型 -```python -mx_driving.point.furthest_point_sample_with_dist(Tensor points, int num_points) -> Tensor -``` -### 功能描述 -与`npu_furthest_point_sampling`功能相同,但输入略有不同。 -### 参数说明 -- `points(Tensor)`:点云数据,表示各点间的距离,数据类型为`float32`。shape为`[B, N, N]`。其中`B`为batch size,`N`为点的数量。 -- `num_points(int)`:采样点的数量。 -### 返回值 -- `Tensor`:采样后的点云数据,数据类型为`float32`。shape为`[B, num_points]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import furthest_point_sample_with_dist -points = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() -out = furthest_point_sample_with_dist(points, 2) -``` -## npu_furthest_point_sampling -### 接口原型 -```python -mx_driving.point.npu_furthest_point_sampling(Tensor points, int num_points) -> Tensor -``` -### 功能描述 -点云数据的最远点采样。 -### 参数说明 -- `points(Tensor)`:点云数据,数据类型为`float32`。shape为`[B, N, 3]`。其中`B`为batch size,`N`为点的数量,`3`分别代表`x, y, z`。 -- `num_points(int)`:采样点的数量。 -### 返回值 -- `Tensor`:采样后的点云数据,数据类型为`float32`。shape为`[B, num_points]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import npu_furthest_point_sampling -points = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() -out = npu_furthest_point_sampling(points, 2) -``` - -### 算子约束 -1. 
points输入shape[B, N, 3]的总大小(B x N x 3)不应该超过383166 -## npu_group_points -Note:该接口命名将于2025年改为'group_points'。 -### 接口原型 -```python -mx_driving.point.npu_group_points(Tensor features, Tensor indices) -> Tensor -``` -### 功能描述 -点云数据按照索引重新分组。 -### 参数说明 -- `features`:需要被插值的特征,数据类型为`float32`,维度为(B, C, N)。 -- `indices`:获取目标特征计算的索引,数据类型为`int32`,维度为(B, npoints, nsample)。 -### 返回值 -- `output(Tensor)`:分组后的点云数据,数据类型为`float32`。shape为`[B, C, npoints, nsample]`。 -### 约束说明 -- `indices`的元素值需小于`features`的第三维度,即值在[0, N)。 -- C <= 1024 -- 反向具有相同约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import torch_npu -from mx_driving.point import npu_group_points - - -indices = torch.tensor([[[0, 2, 5, 5], [1, 0, 5, 0], [2, 1, 4, 4]]]).int().npu() -features = torch.tensor([[[0.9178, -0.7250, -1.6587, 0.0715, -0.2252, 0.4994], - [0.6190, 0.1755, -1.7902, -0.5852, -0.3311, 1.9764], - [1.7567, 0.0740, -1.1414, 0.4705, -0.3197, 1.1944], - [-0.2343, 0.1194, 0.4306, 1.3780, -1.4282, -0.6377], - [0.7239, 0.2321, -0.6578, -1.1395, -2.3874, 1.1281]]], - dtype=torch.float32).npu() -output = npu_group_points(features, indices) -``` - -## npu_add_relu -### 接口原型 -```python -mx_driving.fused.npu_add_relu(Tensor x, Tensor y) -> Tensor -``` -### 功能描述 -与`relu(x + y)`功能相同。 -### 参数说明 -- `x(Tensor)`:输入数据,数据类型为`float32`,shape无限制。 -- `y(Tensor)`:输入数据,数据类型为`float32`,shape需要和x一致。 -### 返回值 -- `Tensor`:输出数据,数据类型为`float32`,shape和x一致。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.fused import npu_add_relu -x = torch.tensor([[[1, 2, 3], [-1, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() -y = torch.tensor([[[1, 2, 3], [-1, -2, 6], [7, 8, 9]]], dtype=torch.float32).npu() -out = npu_add_relu(x, y) -``` -### 算子约束 -- 输入`x`与输入`y`的shape和dtype需要保持一致,不支持广播。 -- 仅在x的元素个数超过2000000时,相较于`relu(x + y)`有性能提升。 - -## voxelization -### 接口原型 -```python -mx_driving.point.voxelization(Tensor points, List[float] voxel_size, List[float] coors_range, int max_points=-1, int max_voxels=-1, bool deterministic=True) -> Tensor -``` -### 功能描述 -将点云数据进行体素化。 -### 参数说明 -- `points(Tensor)`:点云数据,数据类型为`float32`。shape为`[N, F]`。其中`N`为点的数量,`F`分别代表每个点的特征维度,其中`N > 0, F >= 3`。 -- `voxel_size(List[float])`:体素大小,数据类型为`float32`。shape为`[3]`。其中`3`分别代表`x, y, z`。 -- `coors_range(List[float])`:体素范围,数据类型为`float32`。shape为`[6]`。其中`6`分别代表`x_min, y_min, z_min, x_max, y_max, z_max`。 -- `max_points(int)`:每个体素的最大点数。默认值为`-1`。 -- `max_voxels(int)`:最大体素数。默认值为`-1`。 -- `deterministic(bool)`:是否确定性。默认值为`True`。 -### 返回值 -- `coors(Tensor)`:每个点所属的体素坐标,数据类型为`int32`。shape为`[N, 3]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import Voxelization -points = torch.randint(-20, 100, [16, 3], dtype=torch.float32).npu() -coors_range = [0, -40, -3, 70.4, 40, 1] -max_points = -1 -voxel_size = [0.5, 0.5, 0.5] -dynamic_voxelization = Voxelization(voxel_size, coors_range, max_points) -coors = dynamic_voxelization.forward(points) -``` -## npu_dynamic_scatter -### 接口原型 -```python -mx_driving.point.npu_dynamic_scatter(Tensor feats, Tensor coors, str reduce_type = 'max') -> Tuple[torch.Tensor, torch.Tensor] -``` -### 功能描述 -将点云特征点在对应体素中进行特征压缩。 -### 参数说明 -- `feats(Tensor)`:点云特征张量[N, C],仅支持两维,数据类型为`float32`,特征向量`C`长度上限为2048。 -- `coors(Tensor)`:体素坐标映射张量[N, 3],仅支持两维,数据类型为`int32`,此处以x, y, z指代体素三维坐标,其取值范围为`0 <= x, y < 2048`, `0 <= z < 256`。 -- `reduce_type(str)`:压缩类型。可选值为`'max'`, `'mean'`, `'sum'`。默认值为`'max'` -### 返回值 -- `voxel_feats(Tensor)`:压缩后的体素特征张量,仅支持两维,数据类型为`float32`。 -- 
`voxel_coors(Tensor)`:去重后的体素坐标,仅支持两维,数据类型为`int32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import npu_dynamic_scatter - -feats = torch.tensor([[1, 2, 3], [3, 2, 1], [7, 8, 9], [9, 8, 7]], dtype=torch.float32).npu() -coors = torch.tensor([[1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]], dtype=torch.int32).npu() -voxel_feats, voxel_coors = npu_dynamic_scatter(feats, coors, 'max') - -``` -## unique_voxel -### 接口原型 -```python -mx_driving._C.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor -``` -### 功能描述 -对输入的点云数据进行去重处理。 -### 参数说明 -- `voxels (Tensor)`:数据语义为索引,数据类型为`int32`,shape为`[N]`。 -### 返回值 -- `num_voxels(int)`, 体素数量。 -- `uni_voxels(Tensor)`,去重后的体素数据,数据类型为`int32`,shape为`[num_voxels]`。 -- `uni_indices(Tensor)`, 去重后的索引数据,数据类型为`int32`,shape为`[num_voxels]`。 -- `argsort_indices(Tensor)`, 排序后的索引数据,数据类型为`int32`,shape为`[N]`。 -- `uni_argsort_indices(Tensor)`, 去重后的排序后的索引数据,数据类型为`int32`,shape为`[num_voxels]`。 -### 约束说明 -N的大小受限于内存大小,建议N小于等于2^32。 - -受限于芯片指令,输入的数据类型只能是int32,且>=0,<2^30。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import torch_npu -import numpy as np -from mx_driving._C import unique_voxel -voxels = np.random.randint(0, 1024, (100000,)).astype(np.int32) -voxels_npu = torch.from_numpy(voxels).npu() -num_voxels, uni_voxels, uni_indices, argsort_indices, uni_argsort_indices = unique_voxel(voxels_npu) - -``` - - -## voxel_pooling_train -### 接口原型 -```python -mx_driving.point.npu_voxel_pooling_train(Tensor geom_xyz, Tensor input_features, List[int] voxel_num) -> Tensor -``` -### 功能描述 -点云数据体素化。 -### 参数说明 -- `geom_xyz`:体素坐标,数据类型为`int32`,维度为(B, N, 3), 3表示x, y, z。 -- `input_features`:点云数据,数据类型为`float32|float16`,维度为(B, N, C)。 -- `voxel_num`:体素格子长宽高,数据类型为`int32`,维度为(3),3表示体素格子的长宽高。 -### 返回值 -- `output(Tensor)`:输出结果,数据类型为`float32|float16`。shape为`[B, num_voxel_y, num_voxel_x, C]`。 -### 约束说明 -- B <= 128 -- N <= 100000 -- C <= 256 -- num_voxel_x <= 1000 -- num_voxel_y <= 1000 -- num_voxel_z <= 10 -- B * num_voxel_y * num_voxel_x * C <= 100000000 -- B * N * C <= 100000000 -- 反向具有相同约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import torch_npu -import mx_driving.point - -def gen_data(geom_shape, feature_shape, coeff, batch_size, num_channels, dtype): - geom_xyz = torch.rand(geom_shape) * coeff - geom_xyz = geom_xyz.reshape(batch_size, -1, 3) - geom_xyz[:, :, 2] /= 100 - geom_xyz_cpu = geom_xyz.int() - features = torch.rand(feature_shape, dtype=dtype) - 0.5 - features_cpu = features.reshape(batch_size, -1, num_channels) - - return geom_xyz_cpu, features_cpu - -dtype = torch.float32 -coeff = 90 -voxel_num = [128, 128, 1] -batch_size = 2 -num_points = 40 -num_channel = 80 -xyz = 3 - -geom_shape = [batch_size, num_points, xyz] -feature_shape = [batch_size, num_points, num_channel] - -geom_cpu, feature_cpu = gen_data(geom_shape, feature_shape, coeff, batch_size, num_channel, dtype) - -geom_npu = geom_cpu.npu() -feature_npu = feature_cpu.npu() - -result_npu = mx_driving.point.npu_voxel_pooling_train(geom_npu, feature_npu, voxel_num) -``` -# 稀疏卷积算子(beta) -## SparseConv3d(beta) -### 接口原型 -```python -mx_driving.spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor -``` -### 功能描述 -稀疏卷积 -### 参数说明 -- `in_channels(int)`:输入数据的通道数 -- `out_channels(int)`:输出通道数 -- `kernel_size(List(int)/Tuple(int)/int)`:卷积神经网络中卷积核的大小 -- `stride(List(int)/Tuple(int)/int)`:卷积核在输入数据上滑动时的步长 -- 
`dilation(List(int)/Tuple(int)/int)`:空洞卷积大小 -- `groups(int)`:分组卷积 -- `bias(bool)`:偏置项 -- `indice_key(String)`:该输入用于复用之前计算的索引信息 -- `mode(String)`:区分了`mmcv`和`spconv`两种不同框架下的稀疏卷积 -### 返回值 -- `SparseConvTensor`:存储了输出的特征值`out_feature`,对应索引位置`out_indices`和对应的spatital_shape。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `kernel_size`当前支持数据类型为三维List/Tuple或Int,值域为`[1, 3]` -- `stride`当前支持数据类型为三维List/Tuple或Int -- `dilation`,`groups`当前仅支持值为1 -- 对于反向也是同样的约束。 -### 调用示例 -```python -import torch,torch_npu -import numpy as np -from mx_driving.spconv import SparseConv3d, SparseConvTensor - -def generate_indice(batch, height, width, depth, actual_num): - base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num] - base_indices = np.sort(base_indices) - b_indice = base_indices // (height * width * depth) - base_indices = base_indices % (height * width * depth) - h_indice = base_indices // (width * depth) - base_indices = base_indices // (width * depth) - w_indice = base_indices // depth - d_indice = base_indices % depth - indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num) - return indices - -actual_num = 20 -batch = 4 -spatial_shape = [9, 9, 9] -indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu() -feature = tensor_uniform = torch.rand(actual_num, 16).npu() -feature.requires_grad = True -x = SparseConvTensor(feature, indices, spatial_shape, batch) -net = SparseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu() -out = net(x) -dout = torch.ones_like(out.features).float().npu() -out.features.backward(dout) -``` - - -## SparseInverseConv3d(beta) -### 接口原型 -```python -mx_driving.spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, output_padding=0,bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor -``` -### 功能描述 -稀疏逆卷积 -### 参数说明 -- `in_channels(int)`:输入数据的通道数 -- `out_channels(int)`:输出通道数 -- `kernel_size(List(int)/Tuple(int)/int)`:卷积神经网络中卷积核的大小 -- `stride(List(int)/Tuple(int)/int)`:卷积核在输入数据上滑动时的步长 -- `dilation(List(int)/Tuple(int)/int)`:空洞卷积大小 -- `groups(int)`:分组卷积 -- `bias(bool)`:偏置项 -- `indice_key(String)`:该输入用于复用之前计算的索引信息 -- `mode(String)`:区分了`mmcv`和`spconv`两种不同框架下的稀疏卷积 -### 返回值 -- `SparseConvTensor`:存储了输出的特征值`out_feature`,对应索引位置`out_indices`和对应的spatital_shape。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `kernel_size`当前支持数据类型为三维List/Tuple或Int,值域为`[1, 3]` -- `stride`当前支持数据类型为三维List/Tuple或Int -- `dilation`,`groups`当前仅支持值为1 -- 对于反向也是同样的约束。 -### 调用示例 -```python -import torch,torch_npu -import numpy as np -from mx_driving.spconv import SparseInverseConv3d, SparseConvTensor - -def generate_indice(batch, height, width, depth, actual_num): - base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num] - base_indices = np.sort(base_indices) - b_indice = base_indices // (height * width * depth) - base_indices = base_indices % (height * width * depth) - h_indice = base_indices // (width * depth) - base_indices = base_indices // (width * depth) - w_indice = base_indices // depth - d_indice = base_indices % depth - indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num) - return indices - -actual_num = 20 -batch = 4 -spatial_shape = [9, 9, 9] -indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu() -feature = 
tensor_uniform = torch.rand(actual_num, 16).npu() -feature.requires_grad = True -x = SparseConvTensor(feature, indices, spatial_shape, batch) -net = SparseInverseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu() -out = net(x) -dout = torch.ones_like(out.features).float().npu() -out.features.backward(dout) -``` - - -## SubMConv3d(beta) -### 接口原型 -```python -mx_driving.spconv.SubMConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor -``` -### 功能描述 -稀疏卷积,只有当卷积核中心参与计算时,才会影响输出 -### 参数说明 -- `in_channels(int)`:输入数据的通道数 -- `out_channels(int)`:输出通道数 -- `kernel_size(List(int)/Tuple(int)/int)`:卷积神经网络中卷积核的大小 -- `stride(List(int)/Tuple(int)/int)`:卷积核在输入数据上滑动时的步长 -- `dilation(List(int)/Tuple(int)/int)`:空洞卷积大小 -- `groups(int)`:分组卷积 -- `bias(bool)`:偏置项 -- `indice_key(String)`:该输入用于复用之前计算的索引信息 -- `mode(String)`:区分了`mmcv`和`spconv`两种不同框架下的稀疏卷积 -### 返回值 -- `SparseConvTensor`:存储了输出的特征值`out_feature`,对应索引位置`out_indices`和对应的spatital_shape。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `kernel_size`当前支持数据类型为三维List/Tuple或Int,当前值仅支持1、3 -- `stride`当前支持数据类型为三维List/Tuple或Int,当前仅支持值为1 -- `dilation`,`groups`当前仅支持值为1 -- 对于反向也是同样的约束。 -### 调用示例 -```python -import torch,torch_npu -import numpy as np -from mx_driving.spconv import SubMConv3d, SparseConvTensor - -def generate_indice(batch, height, width, depth, actual_num): - base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num] - base_indices = np.sort(base_indices) - b_indice = base_indices // (height * width * depth) - base_indices = base_indices % (height * width * depth) - h_indice = base_indices // (width * depth) - base_indices = base_indices // (width * depth) - w_indice = base_indices // depth - d_indice = base_indices % depth - indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num) - return indices - -actual_num = 20 -batch = 4 -spatial_shape = [9, 9, 9] -indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu() -feature = tensor_uniform = torch.rand(actual_num, 16).npu() -feature.requires_grad = True -x = SparseConvTensor(feature, indices, spatial_shape, batch) -net = SubMConv3d(in_channels=16, out_channels=32, kernel_size=3).npu() -out = net(x) -dout = torch.ones_like(out.features).float().npu() -out.features.backward(dout) -``` \ No newline at end of file +# mxDriving API 汇总 + + +- 通用 +- 感知 +- 点云 +- 预处理 +- 稀疏 +- 融合 + - [multi_scale_deformable_attn](./context/multi_scale_deformable_attn.md) \ No newline at end of file diff --git a/docs/api/context/multi_scale_deformable_attn.md b/docs/api/context/multi_scale_deformable_attn.md new file mode 100644 index 0000000000000000000000000000000000000000..c1fc37a220f12a40d841175f1f6c9956d78121e5 --- /dev/null +++ b/docs/api/context/multi_scale_deformable_attn.md @@ -0,0 +1,36 @@ +## multi_scale_deformable_attn(MultiScaleDeformableAttnFunction.Apply) +### 接口原型 +```python +mx_driving.fused.multi_scale_deformable_attn(Tensor value, Tensor value_spatial_shapes, Tensor value_level_start_index, Tensor sampling_locations, Tensor attention_weights) -> Tensor +``` +### 功能描述 +多尺度可变形注意力机制, 将多个视角的特征图进行融合。 +### 参数说明 +- `value(Tensor)`:特征张量,数据类型为`float32, float16`。shape为`[bs, num_keys, num_heads, embed_dims]`。其中`bs`为batch size,`num_keys`为特征图的大小,`num_heads`为头的数量,`embed_dims`为特征图的维度,其中`embed_dims`需要为8的倍数。 +- `value_spatial_shapes(Tensor)`:特征图的形状,数据类型为`int32, 
int64`。shape为`[num_levels, 2]`。其中`num_levels`为特征图的数量,`2`分别代表`H, W`。 +- `value_level_start_index(Tensor)`:偏移量张量,数据类型为`int32, int64`。shape为`[num_levels]`。 +- `sampling_locations(Tensor)`:位置张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads, num_levels, num_points, 2]`。其中`bs`为batch size,`num_queries`为查询的数量,`num_heads`为头的数量,`num_levels`为特征图的数量,`num_points`为采样点的数量,`2`分别代表`y, x`。 +- `attention_weights(Tensor)`:权重张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads, num_levels, num_points]`。其中`bs`为batch size,`num_queries`为查询的数量,`num_heads`为头的数量,`num_levels`为特征图的数量,`num_points`为采样点的数量。 +### 返回值 +- `output(Tensor)`:融合后的特征张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads*embed_dims]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +- `locations`的值在`[0, 1]`之间。 +- 当前版本只支持`num_keys` ≤ 8,`num_heads` ≤ 8,`embed_dims` == 16或32,`num_points` = 1或偶数。 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.fused import multi_scale_deformable_attn +bs, num_levels, num_heads, num_points, num_queries, embed_dims = 1, 1, 4, 8, 16, 32 + +shapes = torch.as_tensor([(100, 100)], dtype=torch.long) +num_keys = sum((H * W).item() for H, W in shapes) + +value = torch.rand(bs, num_keys, num_heads, embed_dims) * 0.01 +sampling_locations = torch.ones(bs, num_queries, num_heads, num_levels, num_points, 2) * 0.005 +attention_weights = torch.rand(bs, num_queries, num_heads, num_levels, num_points) + 1e-5 +level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) + +out = multi_scale_deformable_attn(value.npu(), shapes.npu(), level_start_index.npu(), sampling_locations.npu(), attention_weights.npu()) +``` \ No newline at end of file diff --git a/docs/api/mxDriving.md b/docs/api/mxDriving.md new file mode 100644 index 0000000000000000000000000000000000000000..a0042ce32e4f0881d4edc67d40e6fe5713b6755a --- /dev/null +++ b/docs/api/mxDriving.md @@ -0,0 +1,1420 @@ +> Note: 以prototype标注的接口,表示该接口为预发布接口,可能会有变动,不建议在生产环境中使用。 +# 通用算子 +## scatter_max +### 接口原型 +```python +mx_driving.common.scatter_max(Tensor updates, Tensor indices, Tensor out=None) -> (Tensor out, Tensor argmax) +``` +### 功能描述 +在第0维上,将输入张量`updates`中的元素按照`indices`中的索引进行分散,然后在第0维上取最大值,返回最大值和对应的索引。对于1维张量,公式如下: +$$out_i = max(out_i, max_j(updates_j))$$ +$$argmax_i = argmax_j(updates_j)$$ +这里,$i = indices_j$。 +### 参数说明 +- `updates`:更新源张量,数据类型为`float32`,且 + - `updates`的第0维外其余轴合轴后必须32字节对齐。 +- `indices`:索引张量,数据类型为`int32`,且 + - `indices`的维度必须为`1`, + - `indices`第0维的长度必须与`updates`第0维的长度相同。 + - `indices`的最大值必须小于`491520`。 + - `indices`的取值必须为非负的有效索引值。 +- `out`:被更新张量,数据类型为`float32`,默认为`None`,且 + - `out`的维度必须与`updates`的维度相同。 + - `out`除第0维外其余维的长度必须与`updates`相同。 +### 返回值 +- `out`:更新后的张量,数据类型为`float32`。 +- `argmax`:最大值对应的索引张量,数据类型为`int32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.common import scatter_max +updates = torch.tensor([[2, 0, 1, 3, 1, 0, 0, 4], [0, 2, 1, 3, 0, 3, 4, 2], [1, 2, 3, 4, 4, 3, 2, 1]], dtype=torch.float32).npu() +indices = torch.tensor([0, 2, 0], dtype=torch.int32).npu() +out = updates.new_zeros((3, 8)) +out, argmax = scatter_max(updates, indices, out) +``` +## knn +### 接口原型 +```python +mx_driving.common.knn(int k, Tensor xyz, Tensor center_xyz, bool Transposed) -> Tensor +``` +### 功能描述 +对center_xyz中的每个点找到xyz中对应batch中的距离最近的k个点,并且返回此k个点的索引值。 +### 参数说明 +- `xyz(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32`。shape为`[B, N, 3]`(当Transposed=False)或`[B, 3, N]`(当Transposed=True)。其中`B`为batch size,`N`为点的数量。 +- `center_xyz(Tensor)`:点数据,表示(x, y, 
z)三维坐标,数据类型为`float32`。shape为`[B, npoint, 3]`(当Transposed=False)或`[B, 3, npoint]`(当Transposed=True)。其中`B`为batch size,`npoint`为点的数量。 +- `k(int)`:采样点的数量。 +- `Transposed(bool)`: 输入是否需要进行转置 +### 返回值 +- `idx(Tensor)`:采样后的索引数据,数据类型为`int32`。shape为`[B, k, npoint]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.common import knn +xyz = torch.tensor([[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]], dtype=torch.float32).npu() +center_xyz = torch.tensor([[[1, 2, 3]], [[1, 2, 3]]], dtype=torch.float32).npu() +idx = knn(2, xyz, center_xyz, False) +``` +### 算子约束 +1. k必须>0且<100。 +2. xyz中的每个batch中的任意一个点到center_xyz对应batch中的任意一个点的距离必须在1e10f以内。 +3. xyz和center_xyz的shape必须是3维,当Transposed=True时,xyz和center_xyz的shape的dim的第1维必须是3;当Transposed=False时,xyz和center_xyz的shape的dim的第2维必须是3。 +4. 由于距离相同时排序为不稳定排序,存在距离精度通过但索引精度错误问题,与竞品无法完全对齐。 + +## scatter_mean +### 接口原型 +```python +mx_driving.common.scatter_mean(Tensor src, Tensor indices, int dim=0, Tensor out=None, int dim_size=None) -> Tensor +``` +### 功能描述 +将输入张量`src`中的元素按照`indices`中的索引在指定的`dim`维进行分组,并计算每组的平均值,返回平均值。 +### 参数说明 +- `src`:源张量,数据类型为`float32`。 +- `indices`:索引张量,数据类型为`int32`。 +- `out`:被更新张量,数据类型为`float32`,可选入参,默认为`None`,输入`out`不为`None`时,`out`中的元素参与平均值的计算。 +- `dim`:指定的维度,表示按照哪个维度进行分组平均计算,数据类型为`int32`,可选入参,默认取值为`0`。 +- `dim_size`:输出张量在`dim`维的长度,数据类型为`int32`,可选入参,默认为`None`,该参数仅在输入`out`为`None`时生效。 +### 返回值 +- `out`:求平均后的张量,数据类型为`float32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 + +```python +import torch, torch_npu +from mx_driving.common import scatter_mean +src = torch.randn(4, 5, 6).to(torch.float) +indices = torch.randint(5, (4, 5)).to(torch.int32) +dim = 0 +src.requires_grad = True +out = scatter_mean(src.npu(), indices.npu(), None, dim) +grad_out_tensor = torch.ones_like(out) +out.backward(grad_out_tensor) +``` +### 算子约束 +- `indices`的维度必须小于等于`src`的维度,且每一维的长度均必须与`src`长度相同。 +- `indices`的取值必须为非负的有效索引值,参数`out`或`data_size`不为`None`时,`indices`的取值应该为输出张量在`dim`维的有效索引值。 +- `out`的维度必须与`src`的维度相同,且除第`dim`维外其余维的长度必须与`src`相同。 +- `dim`取值不能超过`indices`的维度。 +- `dim_size`的取值必须为非负的有效长度值。 +- `src`和`out`不支持`inf`、`-inf`和`nan`。 +### 其他说明 +- 该算子对尾块较大的场景较为亲和,对尾块很小的场景不亲和,其中,尾块表示`src`后`N`维的大小,`N = src.dim() - indices.dim()`。 + +## three_interpolate +### 接口原型 +```python +mx_driving.common.three_interpolate(features: torch.Tensor, indices: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: +``` +### 功能描述 +对三维数据进行加权最近邻线性插值处理 +### 参数说明 +- `features`:需要被插值的特征,数据类型为`float32|float16`,维度为(B, C, M)。 +- `indices`:获取目标特征计算的索引,数据类型为`int32`,维度为(B, N, 3), + - `indices`的元素值需小于`features`的第三维度,即值在[0, M)。 +- `weight`:获取目标特征计算的权重,数据类型为`float32|float16`,维度为(B, N, 3)。 + - `weight`数据类型与`features`须一致。 +- `features`,`indices`,`weights`三个参数的每个维度须小于10000。 +- `features`,`indices`,`weights`三个参数的大小请勿超过2^24。 +### 返回值 +- `output`:目标特征张量,数据类型为`float32|float16`,维度为(B, C, N)。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +from mx_driving.common import three_interpolate + + +features = torch.tensor( + [[[2.4350, 4.7516, 4.4995, 2.4350, 2.4350, 2.4350], + [3.1236, 2.6278, 3.0447, 3.1236, 3.1236, 3.1236], + [2.6732, 2.8677, 2.6436, 2.6732, 2.6732, 2.6732], + [0.0124, 7.0150, 7.0199, 0.0124, 0.0124, 0.0124], + [0.3207, 0.0000, 0.3411, 0.3207, 0.3207, 0.3207]], + [[0.0000, 0.9544, 2.4532, 0.0000, 0.0000, 0.0000], + [0.5346, 1.9176, 1.4715, 0.5346, 0.5346, 0.5346], + [0.0000, 0.2744, 2.0842, 0.0000, 0.0000, 0.0000], + [0.3414, 1.5063, 1.6209, 0.3414, 0.3414, 0.3414], + [0.5814, 0.0103, 0.0000, 0.5814, 0.5814, 0.5814]]], + ).npu() +idx = 
torch.tensor( + [[[0, 1, 2], [2, 3, 4], [2, 3, 4], [0, 1, 2], [0, 1, 2], [0, 1, 3]], + [[0, 2, 3], [1, 3, 4], [2, 1, 4], [0, 2, 4], [0, 2, 4], [0, 1, 2]]], + ).int().npu() +weight = torch.tensor( + [[[3.3333e-01, 3.3333e-01, 3.3333e-01], + [1.0000e+00, 5.8155e-08, 2.2373e-08], + [1.0000e+00, 1.7737e-08, 1.7356e-08], + [3.3333e-01, 3.3333e-01, 3.3333e-01], + [3.3333e-01, 3.3333e-01, 3.3333e-01], + [3.3333e-01, 3.3333e-01, 3.3333e-01]], + [[3.3333e-01, 3.3333e-01, 3.3333e-01], + [1.0000e+00, 1.3651e-08, 7.7312e-09], + [1.0000e+00, 1.7148e-08, 1.4070e-08], + [3.3333e-01, 3.3333e-01, 3.3333e-01], + [3.3333e-01, 3.3333e-01, 3.3333e-01], + [3.3333e-01, 3.3333e-01, 3.3333e-01]]], + ).npu() +output = three_interpolate(features, idx, weight) + + +## three_nn +### 接口原型 +```python +mx_driving.common.three_nn(Tensor target, Tensor source) -> (Tensor dist, Tensor idx) +``` +### 功能描述 +对target中的每个点找到source中对应batch中的距离最近的3个点,并且返回此3个点的距离和索引值。 +### 参数说明 +- `target(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32/float16`。shape为`[B, npoint, 3]`。其中`B`为batch size,`npoint`为点的数量。 +- `source(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32/float16`。shape为`[B, N, 3]`。其中`B`为batch size,`N`为点的数量。 +### 返回值 +- `dist(Tensor)`:最近3个点的距离数据,数据类型为`float32/float16`。shape为`[B, npoint, 3]`。 +- `idx(Tensor)`:最近3个点的索引数据,数据类型为`int32`。shape为`[B, npoint, 3]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.common import three_nn +source = torch.tensor([[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]], dtype=torch.float32).npu() +target = torch.tensor([[[1, 2, 3]], [[1, 2, 3]]], dtype=torch.float32).npu() +dist, idx = three_nn(target, source) +``` +### 算子约束 +1. source和target的shape必须是3维,且source和target的shape的dim的第2维必须是3。 +2. 距离相同时排序为不稳定排序,存在距离精度通过但索引精度错误问题,与竞品无法完全对齐。 + + +## hypot +### 接口原型 +```python +mx_driving.common.hypot(Tensor input, Tensor other) -> Tensor +``` +### 功能描述 +给出直角三角形的两边,返回它的斜边。 +### 参数说明 +- `input(Tensor)`:代表直角三角形第一条直角边的输入张量,数据类型为`float32`。 +- `other(Tensor)`:代表直角三角形第二条直角边的输入张量,数据类型为`float32`。 +### 返回值 +- `Tensor`:经过计算后的直角三角形斜边,数据类型为`float32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.common import hypot +input = torch.tensor([3,3,3], dtype=torch.float32).npu() +other = torch.tensor([4,4,4], dtype=torch.float32).npu() +out = hypot(input, other) # tensor([5.,5.,5.]) +``` +### 算子约束 +1. input和other的shape必须是可广播的,广播用法示意见下。
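+下面补充一个广播用法的最小示意(假设其广播语义与`torch.hypot`一致,形状`[2, 1]`与`[3]`的输入广播为`[2, 3]`的输出): +```python +import torch, torch_npu +from mx_driving.common import hypot +# a的shape为[2, 1],b的shape为[3],广播后逐元素计算斜边,输出shape为[2, 3] +a = torch.tensor([[3.0], [6.0]], dtype=torch.float32).npu() +b = torch.tensor([4.0, 8.0, 12.0], dtype=torch.float32).npu() +out = hypot(a, b) +```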
+ + +## assign_score_withk +### 接口原型 +```python +mx_driving.common.assign_score_withk(Tensor scores, Tensor point_features, Tensor center_features, Tensor knn_idx, str aggregate='sum') -> Tensor +``` +### 功能描述 +根据`knn_idx`得到采样点及其邻居点的索引,计算`point_features`和`center_features`的差,并与`scores`相乘后在特征维度进行聚合,返回采样点的特征。 +### 参数说明 +- `scores(Tensor)`:权重矩阵的重要系数,数据类型为`float32`。Shape为`[B, npoint, K, M]`,其中`B`为batch size,`npoint`为采样点的数量,`K`为一个样本点及其邻居点的数量之和,`M`为权重矩阵集合的规模。 +- `point_features(Tensor)`:所有点的特征,数据类型为`float32`。Shape为`[B, N, M, O]`,其中`N`为所有点的数量,`O`为特征数量。 +- `center_features(Tensor)`:所有点的中心特征,数据类型为`float32`。Shape为`[B, N, M, O]`。 +- `knn_idx(Tensor)`:采样点及其邻居点的索引,数据类型为`int64`。Shape为`[B, npoint, K]`。 +- `aggregate`:聚合方式,默认为`sum`,数据类型为`str`。 +### 返回值 +- `output`:聚合后采样点的特征,数据类型为`float32`。Shape为`[B, O, npoint, K]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 + +```python +import torch, torch_npu +import numpy as np +from mx_driving.common import assign_score_withk +N = 100  # 所有点的数量,knn_idx的索引取值范围为[0, N) +points = np.random.rand(4, 100, 8, 16).astype(np.float32) +centers = np.random.rand(4, 100, 8, 16).astype(np.float32) +scores = np.random.rand(4, 64, 10, 8).astype(np.float32) +knn_idx = np.random.randint(0, N, size=(4, 64, 10)).astype(np.int64) +output = assign_score_withk(torch.from_numpy(scores).npu(), + torch.from_numpy(points).npu(), + torch.from_numpy(centers).npu(), + torch.from_numpy(knn_idx).npu(), + "sum") +``` +### 算子约束 +- `npoint`和`K`都不大于`N`。 + + +# 数据预处理算子 +## npu_points_in_box +### 接口原型 +```python +mx_driving.preprocess.npu_points_in_box(Tensor boxes, Tensor points) -> Tensor +``` +### 功能描述 +判断点是否在框内。 +### 参数说明 +- `boxes(Tensor)`:框张量,数据类型为`float32`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +- `points(Tensor)`:点张量,数据类型为`float32`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 +### 返回值 +- `boxes_idx_of_points(Tensor)`:点在框内的索引张量,数据类型为`int32`。shape 为`[B, N]`。 +### 约束说明 +- `boxes`和`points`的`B`必须相同,且只能为`1`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.preprocess import npu_points_in_box +boxes = torch.tensor([[[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]]], dtype=torch.float32).npu() +points = torch.tensor([[[1, 2, 3], [3, 4, 5]]], dtype=torch.float32).npu() +out = npu_points_in_box(boxes, points) +``` + +## npu_points_in_box_all +Note: 该接口命名将于2025年改为`points_in_boxes_all`。 +### 接口原型 +```python +mx_driving.preprocess.npu_points_in_box_all(Tensor boxes, Tensor points) -> Tensor +``` +### 功能描述 +判断点是否在框内。 +### 参数说明 +- `boxes(Tensor)`:框张量,数据类型为`float32`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +- `points(Tensor)`:点张量,数据类型为`float32`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 +### 返回值 +- `boxes_idx_of_points(Tensor)`:同一`batch`下,各点是否在各框内的张量,数据类型为`int32`。shape 为`[B, N, M]`。 +### 约束说明 +- `boxes`和`points`的`B`必须相同。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.preprocess import npu_points_in_box_all +boxes = torch.tensor([[[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]]], dtype=torch.float32).npu() +points = torch.tensor([[[1, 2, 5], [3, 4, 8]]], dtype=torch.float32).npu() +out = npu_points_in_box_all(boxes, points) +``` + +## RoipointPool3d +### 接口原型 +```python +mx_driving.preprocess.RoipointPool3d(int num_sampled_points, Tensor points, Tensor point_features, Tensor boxes3d) -> (Tensor pooled_features, Tensor pooled_empty_flag) +``` +### 功能描述 +对每个3D候选框(proposal)的几何特定特征进行编码。 +### 参数说明 +- `num_sampled_points(int)`:特征点的数量,正整数。 +- `points(Tensor)`:点张量,数据类型为`float32, float16`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 +- 
`point_features(Tensor)`:点特征张量,数据类型为`float32, float16`。shape 为`[B, N, C]`。`C`分别代表`x, y, z`。 +- `boxes3d(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +### 返回值 +- `pooled_features(Tensor)`:点在框内的特征张量,数据类型为`float32, float16`。shape 为`[B, M, num, 3+C]`。 +- `pooled_empty_flag(Tensor)`:所有点不在框内的空标记张量,数据类型为`int32`。shape 为`[B, M]`。 +### 约束说明 +- `points`、`point_features`和`boxes3d`的数据类型必须相同,以及`B`也必须相同。 +- `num_sampled_points`必须小于等于`N`。 +- 数据类型为`float32`时,建议`B`小于100、`N`小于等于2640、`M`小于等于48、`num_sampled_points`小于等于48,个别shape值略微超过建议值无影响,但所有shape值均大于建议值时,算子执行会发生错误。 +- 数据类型为`float16`时,建议`B`小于100、`N`小于等于3360、`M`小于等于60、`num_sampled_points`小于等于60,个别shape值略微超过建议值无影响,但所有shape值均大于建议值时,算子执行会发生错误。 +- `N`/`M`的值越大,性能劣化越严重,建议`N`小于`M`的六百倍,否则性能可能会低于0.1x A100。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.preprocess import RoIPointPool3d +num_sampled_points = 1 +points = torch.tensor([[[1, 2, 3]]], dtype=torch.float).npu() +point_features = points.clone() +boxes3d = torch.tensor([[[1, 2, 3, 4, 5, 6, 1]]], dtype=torch.float).npu() +roipoint_pool3d = RoIPointPool3d(num_sampled_points) +pooled_features, pooled_empty_flag = roipoint_pool3d(points, point_features, boxes3d) +``` + + +# 目标检测算子 +## npu_boxes_overlap_bev +Note: 该接口命名将于2025年改为`boxes_overlap_bev`。 +### 接口原型 +```python +mx_driving.detection.npu_boxes_overlap_bev(Tensor boxes_a, Tensor boxes_b) -> Tensor +``` +### 功能描述 +计算bev视角下中两个边界框的重叠面积。 +### 参数说明 +- `boxes_a (Tensor)`:第一组bounding boxes,数据类型为`float32`。shape为`[M, 5]`。其中`5`分别代表`x1, y1, x2, y2, angle`, `x1, y1, x2, y2`代表box四个顶点的横纵坐标,`angle`代表box的弧度制旋转角。 +- `boxes_b (Tensor)`:第二组bounding boxes,数据类型为`float32`。shape为`[N, 5]`。其中`5`分别代表`x1, y1, x2, y2, angle`, `x1, y1, x2, y2`代表box四个顶点的横纵坐标,`angle`代表box的弧度制旋转角。 +### 返回值 +- `area_overlap(Tensor)`:包含两组bounding boxes交叠面积的张量,数据类型为`float32`。shape为`[M, N]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.detection import npu_boxes_overlap_bev +boxes_a = torch.tensor([[0, 0, 2, 2, 0]], dtype=torch.float32).npu() +boxes_b = torch.tensor([[1, 1, 3, 3, 0]], dtype=torch.float32).npu() +area_overlap = npu_boxes_overlap_bev(boxes_a, boxes_b) +``` +## box_iou_quadri +### 接口原型 +```python +mx_driving.detection.box_iou_quadri(Tensor boxes_a, Tensor boxes_b, str mode='iou', bool aligned=False) -> Tensor +``` +### 功能描述 +计算两个边界框的IoU。 +### 参数说明 +- `boxes_a (Tensor)`:第一组bounding boxes,数据类型为`float32`。shape为`[M, 8]`。其中`8`分别代表`x1, y1, x2, y2, x3, y3, x4, y4`, 表示box四个顶点的横纵坐标。 +- `boxes_b (Tensor)`:第二组bounding boxes,数据类型为`float32`。shape为`[N, 8]`。其中`8`分别代表`x1, y1, x2, y2, x3, y3, x4, y4`, 表示box四个顶点的横纵坐标。 +- `mode (str)`:取值为`"iou"`时,计算IoU(intersection over union);取值为`"iof"`时,计算IoF(intersection over foregroud)。 +- `aligned (bool)`:取值为`True`时,只计算配对的box之间的结果;取值为`False`时,计算每对box之间的结果。 +### 返回值 +- `ious(Tensor)`:包含两组bounding boxes的IoU(`mode="iou"`)或IoF(`mode="iof"`)的张量,数据类型为`float32`。shape为`[M]`(`aligned=True`)或`[M, N]`(`aligned=False`)。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.detection import box_iou_quadri +boxes_a = torch.tensor([[7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0]], dtype=torch.float32).npu() +boxes_b = torch.tensor([[7.0, 6.0, 7.0, 8.0, 9.0, 8.0, 9.0, 6.0]], dtype=torch.float32).npu() +ious = box_iou_quadri(boxes_a, boxes_b, mode="iou", aligned=False) +``` +## npu_nms3d +### 接口原型 +```python +mx_driving.detection.npu_nms3d(Tensor boxes, Tensor scores, float: iou_threshold) -> Tensor +``` +### 功能描述 
+3D非极大值抑制,在bev视角下剔除多个3d box交并比大于阈值的box。 +### 参数说明 +- `boxes(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[N, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +- `scores(Tensor)`:评分张量,数据类型为`float32, float16`。shape 为`[N]`。 +- `iou_threshold(float)`:IoU阈值。 +### 返回值 +- `Tensor`:NMS后的框张量,数据类型为`int32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.detection import npu_nms3d +boxes = torch.tensor([[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]], dtype=torch.float32).npu() +scores = torch.tensor([1, 2], dtype=torch.float32).npu() +out = npu_nms3d(boxes, scores, 0.5) +``` +## npu_nms3d_normal +### 接口原型 +```python +mx_driving.detection.npu_nms3d_normal(Tensor boxes, Tensor scores, float: iou_threshold) -> Tensor +``` +### 功能描述 +3D非极大值抑制。 +### 参数说明 +- `boxes(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[N, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +- `scores(Tensor)`:评分张量,数据类型为`float32, float16`。shape 为`[N]`。 +- `iou_threshold(float)`:IoU阈值。 +### 返回值 +- `Tensor`:NMS后的框张量,数据类型为`int32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.detection import npu_nms3d_normal +boxes = torch.tensor([[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]], dtype=torch.float32).npu() +scores = torch.tensor([1, 2], dtype=torch.float32).npu() +out = npu_nms3d_normal(boxes, scores, 0.5) +``` +## npu_rotated_iou +### 接口原型 +```python +mx_driving.detection.npu_rotated_iou(Tensor self, Tensor query_boxes, bool trans=False, int mode=0, bool is_cross=True, float v_threshold=0.0, float e_threshold=0.0) -> Tensor +``` +### 功能描述 +计算旋转框的IoU。 +### 参数说明 +- `self(Tensor)`:边界框张量,数据类型为`float32, float16`,形状为`[B, N, 5]`。 +- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, M, 5]`。 +- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`,其中`t`为角度制。 +- `is_cross(bool)`:值为`True`时,则对两组边界框中每个边界框之间进行计算。值为`False`时,只对对齐的边界框之间进行计算。 +- `mode(int)`:计算IoU的模式。默认值为`0`。值为`0`时,表示计算`IoU`,值为`1`时,表示计算`IoF`。 +- `v_threshold(float)`:顶点判断的容忍阈值。 +- `e_threshold(float)`:边相交判断的容忍阈值。 +### 返回值 +- `Tensor`:IoU张量,数据类型为`float32, float16`,`is_cross`为`True`时形状为`[B, N, M],反之则为`[B, N]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +import numpy as np +from mx_driving.detection import npu_rotated_iou +a = np.random.uniform(0, 1, (2, 2, 5)).astype(np.float16) +b = np.random.uniform(0, 1, (2, 3, 5)).astype(np.float16) +box1 = torch.from_numpy(a).npu() +box2 = torch.from_numpy(b).npu() +iou = npu_rotated_iou(box1, box2, False, 0, True, 1e-5, 1e-5) +``` +## npu_rotated_overlaps +### 接口原型 +```python +mx_driving.detection.npu_rotated_overlaps(Tensor self, Tensor query_boxes, bool trans=False) -> Tensor +``` +### 功能描述 +计算旋转框的重叠面积。 +### 参数说明 +- `self(Tensor)`:边界框张量,数据类型为`float32, float16`,形状为`[B, N, 5]`。 +- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, M, 5]`。 +- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`。 +### 返回值 +- `Tensor`:重叠面积张量,数据类型为`float32, float16`,形状为`[B, N, M]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +import numpy as np +from mx_driving.detection import npu_rotated_overlaps +a = np.random.uniform(0, 1, (1, 3, 5)).astype(np.float16) +b = np.random.uniform(0, 1, (1, 2, 5)).astype(np.float16) +box1 = torch.from_numpy(a).npu() +box2 = torch.from_numpy(b).npu() +output = npu_rotated_overlaps(box1, box2, True) +``` +## roi_align_rotated[beta] +### 接口原型 +```python +mx_driving.detection.roi_align_rotated(Tensor feature_map, Tensor rois, 
float: spatial_scale, + int: sampling_ratio, int: pooled_height, int: pooled_width, bool: aligned, bool: clockwise) -> Tensor +``` +### 功能描述 +计算旋转候选框的RoI Align池化特征图。 +### 参数说明 +- `feature map(Tensor)`:特征图张量,数据类型为`float32`,形状为`[B, C, H, W]`。 +- `rois(Tensor)`:感兴趣区域张量,数据类型为`float32`,形状为`[n, 6]`。 +- `spatial_scale(float)`:感兴趣区域边界框的缩放率,数据类型为`float32`。 +- `sampling_ratio(int)`:采样率,数据类型为`int`。取值范围为非负整数。 +- `pooled_height(int)`:池化特征图高度,数据类型为`int`。 +- `pooled_width(int)`:池化特征图宽度,数据类型为`int`。 +- `aligned(bool)`:是否对齐,数据类型为`bool`。值为`True`时,表示对齐, 值为`False`时,表示不对齐。 +- `clockwise(bool)`:旋转候选框的旋转方向,数据类型为`bool`。值为`True`时,表示逆时针旋转,值为`False`时,表示顺时针旋转。 +### 返回值 +- `Tensor`:池化特征图张量,数据类型为`float32`,形状为`[n, C, pooled_height, pooled_width]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import math +import torch, torch_npu +import numpy as np +from mx_driving.detection import roi_align_rotated + +feature_map = torch.rand([1, 3, 16, 16]) +feature_map.requires_grad = True +rois = torch.Tensor(6, 8) +rois[0] = torch.randint(0, 1, (8,)) +rois[1].uniform_(0, 16) +rois[2].uniform_(0, 16) +rois[3].uniform_(0, 16) +rois[4].uniform_(0, 16) +rois[5].uniform_(0, math.pi) + +output = roi_align_rotated(feature_map.npu(), rois.npu(), 1, 1, 7, 7, True, True) +output.backward(torch.ones_like(output)) +``` +### 其他说明 +在双线性插值采样过程中,当采样点`x`接近`-1`或`W`位置,`y`接近`-1`或`H`位置时,由于平台差异和计算误差,可能导致该采样点的精度无法与竞品精度完全对齐。 + +## roiaware_pool3d +### 接口原型 +```python +mx_driving.detection.roiaware_pool3d(Tensor rois, Tensor pts, Tensor pts_feature, + Union[int, tuple] out_size, int max_pts_per_voxel, int mode) -> Tensor +``` +### 功能描述 +将输入的点云特征在ROI框内进行池化 +### 参数说明 +- `rois (Tensor)`:输入的RoI框坐标与尺寸,数据类型为`float32/float16`,shape为`[Roi_num, 7]`。 +- `pts (Tensor)`:输入的点云坐标,数据类型为`float32/float16`,shape为`[Pts_num, 3]`。 +- `pts_feature (Tensor)`:输入的点的特征向量,数据类型为`float32/float16`,shape为`[Pts_num, Channels]`。 +- `out_size (Union)`:输出的RoI框内voxel的尺寸,数据类型为`int`或者`tuple`,shape为`[out_x, out_y, out_z]`。 +- `max_pts_per_voxel (int)`:每个voxel内最大的点的个数,数据类型为`int`。 +- `mode (int)`:池化的方式,0为maxpool, 1为avgpool,数据类型为`int`。 +### 返回值 +- `pooled_features (Tensor)`:池化得到的RoI框特征,数据类型为`float32/float16`,shape为`[Roi_num, out_x, out_y, out_z, Channels]`。 +### 约束说明 +- Roi_num <= 100 +- Pts_num <= 1000 +- Channels <= 1024 +- 1 <= max_pts_per_voxel <=256,max_pts_per_voxel <= Pts_num +- 反向具有相同约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import math +import torch_npu +import mx_driving.detection + +out_size = (5, 5, 5) +max_pts_per_voxel = 128 +mode = 1 + +N = 40 +npoints = 1000 +channels = 1024 + +xyz_coor = np.random.uniform(-1, 1, size = (N, 3)).astype(np.float32) +xyz_size_num = np.random.uniform(5, 50, size = (1, 3)) +xyz_size = (xyz_size_num * np.ones((N, 3))).astype(np.float32) +angle = np.radians(np.random.randint(0, 360, size = (N , 1))).astype(np.float32) + +rois = np.concatenate((xyz_coor, xyz_size), axis=1) +rois = np.concatenate((rois, angle), axis=1) + +pts = np.random.uniform(-5, 5, size = (npoints, 3)).astype(np.float32) +pts_feature = np.random.uniform(-1, 1, size=(npoints, channels)).astype(np.float32) + +pooled_features_npu = mx_driving.detection.roiaware_pool3d(torch.tensor(rois).npu(), torch.tensor(pts).npu(), + torch.tensor(pts_feature).npu(), out_size, max_pts_per_voxel, mode) +``` + +## border_align +### 接口原型 +```python +mx_driving.detection.border_align(Tensor feature_map, Tensor rois, int pooled_size) -> Tensor +``` +### 功能描述 +对输入的RoI框进行边缘特征提取。 +### 参数说明 +- `feature_map (Tensor)`:输入的特征图,数据类型为`float32`,shape为`[Batch_size, Channels, Height, 
Width]`。 +- `rois (Tensor)`:输入的RoI框坐标,数据类型为`int32`,shape为`[Batch_size, Height * Width, 4]`。 +- `pooled_size (int)`:在每条边上的采样点数,数据类型为`int`。 +### 返回值 +- `out_features (Tensor)`:提取到的RoI框特征,数据类型为`float32`,shape为`[Batch_size, Channels / 4, Height * Width, 4]`。 +### 约束说明 +- Batch_size <= 128 +- Channels <= 8192, Channels % 4 == 0 +- Height <= 256, Width <= 256 +- 2 <= pooled_size <= 20 +- 反向具有相同约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import torch_npu +import numpy as np +from mx_driving.detection import border_align + +def generate_features(feature_shape): + features = torch.rand(feature_shape) + return features + +def generate_rois(inputs): + num_boxes = inputs.shape[0] * inputs.shape[2] * inputs.shape[3] + xyxy = torch.rand(num_boxes, 4) + xyxy[:, 0::2] = xyxy[:, 0::2] * inputs.size(3) + xyxy[:, 1::2] = xyxy[:, 1::2] * inputs.size(2) + xyxy[:, 2:] = xyxy[:, 0:2] + xyxy[:, 2:] + rois = xyxy.view(inputs.shape[0], -1, 4).contiguous() + return rois + +batch_size = 2 +input_channels = 16 +input_height = 8 +input_width = 8 +pooled_size = 3 +features = generate_features([batch_size, input_channels, input_height, input_width]) +rois = generate_rois(features) +output = border_align(features.npu(), rois.npu(), pooled_size) +``` + +## pixel_group +### 接口原型 +```python +mx_driving.detection.pixel_group(Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, Tensor kernel_contour, int kernel_region_num, float distance_threshold) -> List[List] +``` +### 功能描述 +根据像素之间的嵌入向量和距离,将未被分组的像素分组。 +### 参数说明 +- `score (Tensor)`:前景得分矩阵,数据类型为`float32`,shape为`[Height, Width]`。 +- `mask (Tensor)`:前景掩码矩阵,数据类型为`bool`,shape为`[Height, Width]`。 +- `embedding (Tensor)`:特征向量,数据类型为`float32`,shape为`[Height, Width, Embedding_dim]`。 +- `kernel_label (Tensor)`:像素的实例标签,数据类型为`int32`,shape为`[Height, Width]`。 +- `kernel_contour (Tensor)`:内核的边界像素,数据类型为`uint8`,shape为`[Height, Width]`。 +- `kernel_region_num`:不同内核(分组)的数量,数据类型为`int`。 +- `distance_threshold`:嵌入向量的距离阈值,数据类型为`float`。 +### 返回值 +- `pixel_assignment (List)`:像素的分组信息,数据类型为`float32`,length为入参`kernel_region_num`。 +### 约束说明 +- mask = score > 0.5 +- `score`的取值范围在`[0, 1]`之间 +- `kernel_label`的最大值为`kernel_region_num`-1 +- `kernel_contour`的取值非0即1 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +import numpy as np +from mx_driving.detection import pixel_group +H, W, dim, num = 10, 10, 8, 3 +score = np.random.uniform(0, 1, [H, W]).astype(np.float32) +score = torch.from_numpy(score).npu() +mask = (score) > 0.5 +embedding = np.random.uniform(0, 10, [H, W, dim]).astype(np.float32) +embedding = torch.from_numpy(embedding).npu() +kernel_label = np.random.uniform(0, num, [H, W]).astype(np.int32) +kernel_label = torch.from_numpy(kernel_label).npu() +kernel_contour = np.random.uniform(0, 1, [H, W]).astype(np.uint8) +kernel_contour = torch.from_numpy(kernel_contour).npu() +kernel_region_num = num +distance_threshold = float(0.8) + +output = pixel_group(score, mask, embedding, kernel_label, kernel_contour, kernel_region_num, distance_threshold) +``` + +# 融合算子 + + +## npu_max_pool2d +### 接口原型 +```python +mx_driving.fused.npu_max_pool2d(Tensor x, int kernel_size, int stride, int padding) -> Tensor +``` +### 功能描述 +对输入进行最大池化,并输出最大池化值。 +### 参数说明 +- `x (Tensor)`:一组待池化对象,数据类型为`float32`,format为NCHW,输入数据量不超过10亿。 +### 返回值 +- `y (Tensor)`:池化后的最大值,数据类型为`float32`,format为NCHW。 +### 约束说明 +kernel_size仅支持3,stride仅支持2,padding仅支持1,且输入C轴数据量要求为8的倍数,H和W需要大于100。 +性能在C值较大的场景下较优,建议使用规格为C>=64。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, 
torch_npu +from mx_driving.fused import npu_max_pool2d +kernel_size = 3 +stride = 2 +padding = 1 +x = torch.randn(18, 64, 464, 800).npu() +res = npu_max_pool2d(x, kernel_size, stride, padding) +``` + +## npu_deformable_aggregation +### 接口原型 +```python +mx_driving.fused.npu_deformable_aggregation(Tensor feature_maps, Tensor spatial_shape, Tensor scale_start_index, Tensor sample_locations, Tensor weight) -> Tensor +``` +### 功能描述 +可变形聚合,对于每个锚点实例,对多个关键点的多时间戳、视图、缩放特征进行稀疏采样后分层融合为实例特征,实现精确的锚点细化。 +### 参数说明 +- `feature_maps(Tensor)`:特征张量,数据类型为`float32`。shape为`[bs, num_feat, c]`。其中`bs`为batch size,`num_feat`为特征图的大小,`c`为特征图的维度。 +- `spatial_shape(Tensor)`:特征图的形状,数据类型为`int32`。shape为`[cam, scale, 2]`。其中`cam`为相机数量,其中`scale`为每个相机的特征图数量,`2`分别代表H, W。 +- `scale_start_index(Tensor)`:每个特征图的偏移位置张量,数据类型为`int32`。shape为`[cam, scale]`,其中`cam`为相机数量,其中`scale`每个相机的特征图数量。 +- `sample_locations(Tensor)`:位置张量,数据类型为`float32`。shape为`[bs, anchor, pts, cam, 2]`。其中`bs`为batch size,`anchor`为锚点数量,`pts`为采样点的数量,`cam`为相机的数量,`2`分别代表y, x。 +- `weight(Tensor)`:权重张量,数据类型为`float32`。shape为`[bs, anchor, pts, cam, scale, group]`。其中`bs`为batch size,`anchor`为锚点数量,`pts`为采样点的数量,`cam`为相机的数量,`scale`每个相机的特征图数量,`group`为分组数。 +### 返回值 +- `output(Tensor)`:输出结果张量,数据类型为`float32`。shape为`[bs, anchor, c]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +- bs <= 128 +- num_feat的值为spatial_shape中每幅图的特征数量之和 +- c <= 256,且为group的整数倍 +- cam <= 6 +- scale <= 4 +- anchor <= 2048 +- pts <= 2048 +- group <= 32,且为2的指数倍 +- sample_locations的值在[0, 1]之间。 +- 每个输入tensor的数据量不超过1.5亿。 +- 反向具有相同约束。 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.fused import npu_deformable_aggregation + +bs, num_feat, c, cam, anchor, pts, scale, group = 1, 2816, 256, 1, 10, 2000, 1, 8 + +feature_maps = torch.ones_like(torch.randn(bs,num_feat ,c)).to(torch.float16) +spatial_shape = torch.tensor([[[32, 88]]]) +scale_start_index = torch.tensor([[0]]) +sampling_location = torch.rand(bs, anchor, pts, cam, 2) +weights = torch.randn(bs, anchor, pts, cam, scale, group) + +out = npu_deformable_aggregation(feature_maps.npu(), spatial_shape.npu(), scale_start_index.npu(), sampling_location.npu(), weights.npu()) +``` + +## deform_conv2d(DeformConv2dFunction.apply) +### 接口原型 +```python +mx_driving.fused.deform_conv2d(Tensor x, Tensor offset, Tensor weight, Union[int, Tuple[int, ...]] stride, Union[int, Tuple[int, ...]] padding, Union[int, Tuple[int, ...]] dilation, int groups, int deformable_groups) -> Tensor +``` +### 功能描述 +可变形卷积。 +### 参数说明 +- `x(Tensor)`:输入特征,数据类型为`float32`,shape为`(n, c_in, h_in, w_in)`,其中`n`为 batch size,`c_in`为输入特征的通道数量,`h_in`为输入特征图的高,`w_in`为输入特征图的宽。 +- `offset(Tensor)`:偏移量,数据类型为`float32`,shape 为`(n, 2 * k * k, h_out, w_out)`,其中`n`为 batch size,`k` 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +- `weight(Tensor)`:卷积核权重,数据类型为`float32`,shape 为 `(c_out, c_in, k, k)`,其中 `c_out` 为输出的通道数,`c_in` 为输入的通道数,`k` 为卷积核大小。 +- `stride(Union)`:卷积步长。 +- `padding(Union)`:卷积的填充大小。 +- `dilation(Union)`:空洞卷积大小。 +- `groups(int)`:分组卷积大小,当前只支持1。 +- `deformable_groups(int)`:将通道分成几组计算offsets,当前只支持1。 +### 返回值 +- `output(Tensor)`:输出张量,数据类型为`float32`,shape 为 `(n, c_out, h_out, w_out)`,其中`n`为 batch size,`c_out`为输出通道,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +1. `deformable_groups`和`groups`当前只支持1。 +2. `h_in`,`w_in`,`h_out`,`w_out`需满足 +$$ +w_{out}=(w_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 \\ +h_{out}=(h_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 +$$ +3. 
`c_in`需要为64的倍数。 +### 调用示例 +```python +import torch +import torch_npu +from mx_driving.fused import deform_conv2d, DeformConv2dFunction + +n, c_in, h_in, w_in = 16, 64, 100, 200 +c_out, k, h_out, w_out = 64, 3, 50, 100 + +x = torch.randn((n, c_in, h_in, w_in)).npu() +offset = torch.randn((n, 2 * k * k, h_out, w_out)).npu() +weight = torch.randn((c_out, c_in, k, k)).npu() +stride = 1 +padding = 1 +dilation = 1 +groups = 1 +deformable_groups = 1 + +output = deform_conv2d(x, offset, weight, stride, padding, dilation, groups, deformable_groups) +output = DeformConv2dFunction.apply(x, offset, weight, stride, padding, dilation, groups, deformable_groups) +``` +## modulated_deform_conv2d(ModulatedDeformConv2dFunction.apply) +### 接口原型 +```python +mx_driving.fused.modulated_deform_conv2d(Tensor x, Tensor offset, Tensor mask, Tensor weight, Tensor bias, Union[int, Tuple[int, ...]] stride, Union[int, Tuple[int, ...]] padding, Union[int, Tuple[int, ...]] dilation, int groups, int deformable_groups) -> Tensor +``` +### 功能描述 +在可变形卷积的基础之上加上了 modulation 机制,通过调控输出特征的幅度,提升可变形卷积的聚焦相关区域的能力。 +### 参数说明 +- `x(Tensor)`:输入特征,数据类型为`float32`,shape为`(n, c_in, h_in, w_in)`,其中`n`为 batch size,`c_in`为输入特征的通道数量,`h_in`为输入特征图的高,`w_in`为输入特征图的宽。 +- `offset(Tensor)`:偏移量,数据类型为`float32`,shape 为`(n, 2 * k * k, h_out, w_out)`,其中`n`为 batch size,`k` 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +- `mask(Tensor)`:掩码,用于调控输出特征的幅度,数据类型为`float32`,shape 为`(n, k * k, h_out, w_out)`,其中`n`为 batch size,k 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +- `weight(Tensor)`:卷积核权重,数据类型为`float32`,shape 为 `(c_out, c_in, k, k)`,其中 `c_out` 为输出的通道数,`c_in` 为输入的通道数,`k` 为卷积核大小。 +- `bias(Tensor)`:偏置,暂不支持bias,传入 `None` 即可。 +- `stride(Union)`:卷积步长。 +- `padding(Union)`:卷积的填充大小。 +- `dilation(Union)`:空洞卷积大小。 +- `groups(int)`:分组卷积大小,当前只支持1。 +- `deformable_groups(int)`:将通道分成几组计算offsets,当前只支持1。 +### 返回值 +- `output(Tensor)`:输出张量,数据类型为`float32`,shape 为 `(n, c_out, h_out, w_out)`,其中`n`为 batch size,`c_out`为输出通道,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +1. `deformable_groups`和`groups`当前只支持1。 +2. `h_in`,`w_in`,`h_out`,`w_out`需满足 +$$ +w_{out}=(w_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 \\ +h_{out}=(h_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 +$$ +3. 
`c_in`需要为64的倍数。 +### 调用示例 +```python +import torch +import torch_npu +from mx_driving.fused import modulated_deform_conv2d, ModulatedDeformConv2dFunction + +n, c_in, h_in, w_in = 16, 64, 100, 200 +c_out, k, h_out, w_out = 64, 3, 50, 100 + +x = torch.randn((n, c_in, h_in, w_in)).npu() +offset = torch.randn((n, 2 * k * k, h_out, w_out)).npu() +mask = torch.randn((n, k * k, h_out, w_out)).npu() +weight = torch.randn((c_out, c_in, k, k)).npu() +bias = None +stride = 1 +padding = 1 +dilation = 1 +groups = 1 +deformable_groups = 1 + +output = modulated_deform_conv2d(x, offset, mask, weight, bias, + stride, padding, dilation, groups, deformable_groups) +output = ModulatedDeformConv2dFunction.apply(x, offset, mask, weight, bias, + stride, padding, dilation, groups, deformable_groups) +``` + +# 点云算子 +## bev_pool +### 接口原型 +```python +mx_driving.point.bev_pool(Tensor feat, Tensor geom_feat, int B, int D, int H, int W) -> Tensor +``` +### 功能描述 +BEV池化。可参考论文`BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation` +### 参数说明 +- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[N, C]`。其中`N`为原特征张量拉伸后的数量,`C`为特征的维度。 +- `geom_feat(Tensor)`:输出坐标张量,数据类型为`int32`。shape为`[N, 4]`。其中`4`分别代表`h, w, b, d`。 +- `B(int)`:batch size。 +- `D(int)`:输出池化深度。 +- `H(int)`:输出池化高度。 +- `W(int)`:输出池化宽度。 +### 返回值 +- `bev_pooled_feat(Tensor)`:采样后的点云数据,数据类型为`float32`。shape为`[B, D, H, W, C]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +- `geom_feat`的4个对应的值必须在`[0, H-1]`, `[0, W-1]`, `[0, B-1]`, `[0, D-1]`之间。 +- `geom_feat`和`feat`的第0维长度必须相同。 +- C <= 1024 +- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 +- 对于反向也是同样的约束。 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import bev_pool +feat = torch.rand(4, 256).npu() +feat.requires_grad_() +geom_feat = torch.tensor([[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 2], [0, 0, 0, 3]], dtype=torch.int32).npu() +bev_pooled_feat = bev_pool(feat, geom_feat, 4, 1, 256, 256) +loss = bev_pooled_feat.sum() +loss.backward() +``` +## bev_pool_v2 +### 接口原型 +```python +mx_driving.point.bev_pool_v2(Tensor depth, Tensor feat, Tensor ranks_depth, Tensor ranks_feat, Tensor ranks_bev, + List[int] bev_feat_shape, Tensor interval_starts, Tensor interval_lengths) -> Tensor +``` +### 功能描述 +BEV池化优化。可参考论文`BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View`。 +### 参数说明 +- `depth(Tensor)`:深度张量,数据类型为`float32`。shape为`[B, N, D, H, W]`。其中`B`为batch size,`N`为特征的数量,`D, H, W`分别代表深度、高度、宽度。 +- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[B, N, H, W, C]`。其中`B`为batch size,`N`为特征的数量,`H, W, C`分别代表高度、宽度、通道数。 +- `ranks_depth(Tensor)`:深度排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `ranks_feat(Tensor)`:特征排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `ranks_bev(Tensor)`:BEV排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `bev_feat_shape(List[int])`:BEV特征形状,数据类型为`int32`。长度为`5`, 分别代表`B, D, H, W, C`。 +- `interval_starts(Tensor)`:间隔开始张量,数据类型为`int32`。shape为`[N_INTERVALS]`。 +- `interval_lengths(Tensor)`:间隔长度张量,数据类型为`int32`。shape为`[N_INTERVALS]`。 +### 返回值 +- `bev_pooled_feat(Tensor)`:BEV池化后的特征张量,数据类型为`float32`。shape为`[B, D, H, W, C]`。 +### 约束说明 +- `ranks_depth`的值必须在`[0, B*B*D*H*W]`之间。 +- `ranks_feat`的值必须在`[0, B*N*H*W]`之间。 +- `ranks_bev`的值必须在`[0, B*D*H*W]`之间。 +- C <= 1024 +- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 +- N_RANKS <= 2^21 +- 对于反向也是同样的约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import bev_pool_v2 +depth = torch.rand(2, 1, 8, 256, 256).npu() +feat = torch.rand(2, 1, 256, 256, 64).npu() +feat.requires_grad_() +ranks_depth = 
torch.tensor([0, 1], dtype=torch.int32).npu() +ranks_feat = torch.tensor([0, 1], dtype=torch.int32).npu() +ranks_bev = torch.tensor([0, 1], dtype=torch.int32).npu() +bev_feat_shape = [2, 8, 256, 256, 64] +interval_starts = torch.tensor([0], dtype=torch.int32).npu() +interval_lengths = torch.tensor([2], dtype=torch.int32).npu() +bev_pooled_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) +loss = bev_pooled_feat.sum() +loss.backward() +``` +## bev_pool_v3 +### 接口原型 +```python +mx_driving.point.bev_pool_v3(Tensor depth, Tensor feat, Tensor ranks_depth, Tensor ranks_feat, Tensor ranks_bev, + List[int] bev_feat_shape) -> Tensor +``` +### 功能描述 +BEV池化优化。`bev_pool_v2`的NPU亲和版本,优先推荐使用。 +### 参数说明 +- `depth(Tensor)`:深度张量,数据类型为`float32`。shape为`[B, N, D, H, W]`。其中`B`为batch size,`N`为特征的数量,`D, H, W`分别代表深度、高度、宽度。 +- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[B, N, H, W, C]`。其中`B`为batch size,`N`为特征的数量,`H, W, C`分别代表高度、宽度、通道数。 +- `ranks_depth(Tensor)`:深度排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `ranks_feat(Tensor)`:特征排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `ranks_bev(Tensor)`:BEV排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `bev_feat_shape(List[int])`:BEV特征形状,数据类型为`int32`。长度为`5`, 分别代表`B, D, H, W, C`。 +### 返回值 +- `bev_pooled_feat(Tensor)`:BEV池化后的特征张量,数据类型为`float32`。shape为`[B, D, H, W, C]`。 +### 约束说明 +- `ranks_depth`的值必须在`[0, B*B*D*H*W]`之间。 +- `ranks_feat`的值必须在`[0, B*N*H*W]`之间。 +- `ranks_bev`的值必须在`[0, B*D*H*W]`之间。 +- C 必须为8的倍数。 +- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 +- 对于反向也是同样的约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import bev_pool_v3 +depth = torch.rand(2, 1, 8, 256, 256).npu() +feat = torch.rand(2, 1, 256, 256, 64).npu() +feat.requires_grad_() +ranks_depth = torch.tensor([0, 1], dtype=torch.int32).npu() +ranks_feat = torch.tensor([0, 1], dtype=torch.int32).npu() +ranks_bev = torch.tensor([0, 1], dtype=torch.int32).npu() +bev_feat_shape = [2, 8, 256, 256, 64] +bev_pooled_feat = bev_pool_v3(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape) +loss = bev_pooled_feat.sum() +loss.backward() +``` +## furthest_point_sample_with_dist +### 接口原型 +```python +mx_driving.point.furthest_point_sample_with_dist(Tensor points, int num_points) -> Tensor +``` +### 功能描述 +与`npu_furthest_point_sampling`功能相同,但输入略有不同。 +### 参数说明 +- `points(Tensor)`:点云数据,表示各点间的距离,数据类型为`float32`。shape为`[B, N, N]`。其中`B`为batch size,`N`为点的数量。 +- `num_points(int)`:采样点的数量。 +### 返回值 +- `Tensor`:采样后的点云数据,数据类型为`float32`。shape为`[B, num_points]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import furthest_point_sample_with_dist +points = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() +out = furthest_point_sample_with_dist(points, 2) +``` +## npu_furthest_point_sampling +### 接口原型 +```python +mx_driving.point.npu_furthest_point_sampling(Tensor points, int num_points) -> Tensor +``` +### 功能描述 +点云数据的最远点采样。 +### 参数说明 +- `points(Tensor)`:点云数据,数据类型为`float32`。shape为`[B, N, 3]`。其中`B`为batch size,`N`为点的数量,`3`分别代表`x, y, z`。 +- `num_points(int)`:采样点的数量。 +### 返回值 +- `Tensor`:采样后的点云数据,数据类型为`float32`。shape为`[B, num_points]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import npu_furthest_point_sampling +points = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() +out = npu_furthest_point_sampling(points, 2) +``` + +### 算子约束 +1. 
points输入shape[B, N, 3]的总大小(B x N x 3)不应该超过383166 +## npu_group_points +Note:该接口命名将于2025年改为'group_points'。 +### 接口原型 +```python +mx_driving.point.npu_group_points(Tensor features, Tensor indices) -> Tensor +``` +### 功能描述 +点云数据按照索引重新分组。 +### 参数说明 +- `features`:需要被插值的特征,数据类型为`float32`,维度为(B, C, N)。 +- `indices`:获取目标特征计算的索引,数据类型为`int32`,维度为(B, npoints, nsample)。 +### 返回值 +- `output(Tensor)`:分组后的点云数据,数据类型为`float32`。shape为`[B, C, npoints, nsample]`。 +### 约束说明 +- `indices`的元素值需小于`features`的第三维度,即值在[0, N)。 +- C <= 1024 +- 反向具有相同约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import torch_npu +from mx_driving.point import npu_group_points + + +indices = torch.tensor([[[0, 2, 5, 5], [1, 0, 5, 0], [2, 1, 4, 4]]]).int().npu() +features = torch.tensor([[[0.9178, -0.7250, -1.6587, 0.0715, -0.2252, 0.4994], + [0.6190, 0.1755, -1.7902, -0.5852, -0.3311, 1.9764], + [1.7567, 0.0740, -1.1414, 0.4705, -0.3197, 1.1944], + [-0.2343, 0.1194, 0.4306, 1.3780, -1.4282, -0.6377], + [0.7239, 0.2321, -0.6578, -1.1395, -2.3874, 1.1281]]], + dtype=torch.float32).npu() +output = npu_group_points(features, indices) +``` + +## npu_add_relu +### 接口原型 +```python +mx_driving.fused.npu_add_relu(Tensor x, Tensor y) -> Tensor +``` +### 功能描述 +与`relu(x + y)`功能相同。 +### 参数说明 +- `x(Tensor)`:输入数据,数据类型为`float32`,shape无限制。 +- `y(Tensor)`:输入数据,数据类型为`float32`,shape需要和x一致。 +### 返回值 +- `Tensor`:输出数据,数据类型为`float32`,shape和x一致。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.fused import npu_add_relu +x = torch.tensor([[[1, 2, 3], [-1, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() +y = torch.tensor([[[1, 2, 3], [-1, -2, 6], [7, 8, 9]]], dtype=torch.float32).npu() +out = npu_add_relu(x, y) +``` +### 算子约束 +- 输入`x`与输入`y`的shape和dtype需要保持一致,不支持广播。 +- 仅在x的元素个数超过2000000时,相较于`relu(x + y)`有性能提升。 + +## voxelization +### 接口原型 +```python +mx_driving.point.voxelization(Tensor points, List[float] voxel_size, List[float] coors_range, int max_points=-1, int max_voxels=-1, bool deterministic=True) -> Tensor +``` +### 功能描述 +将点云数据进行体素化。 +### 参数说明 +- `points(Tensor)`:点云数据,数据类型为`float32`。shape为`[N, F]`。其中`N`为点的数量,`F`分别代表每个点的特征维度,其中`N > 0, F >= 3`。 +- `voxel_size(List[float])`:体素大小,数据类型为`float32`。shape为`[3]`。其中`3`分别代表`x, y, z`。 +- `coors_range(List[float])`:体素范围,数据类型为`float32`。shape为`[6]`。其中`6`分别代表`x_min, y_min, z_min, x_max, y_max, z_max`。 +- `max_points(int)`:每个体素的最大点数。默认值为`-1`。 +- `max_voxels(int)`:最大体素数。默认值为`-1`。 +- `deterministic(bool)`:是否确定性。默认值为`True`。 +### 返回值 +- `coors(Tensor)`:每个点所属的体素坐标,数据类型为`int32`。shape为`[N, 3]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import Voxelization +points = torch.randint(-20, 100, [16, 3], dtype=torch.float32).npu() +coors_range = [0, -40, -3, 70.4, 40, 1] +max_points = -1 +voxel_size = [0.5, 0.5, 0.5] +dynamic_voxelization = Voxelization(voxel_size, coors_range, max_points) +coors = dynamic_voxelization.forward(points) +``` +## npu_dynamic_scatter +### 接口原型 +```python +mx_driving.point.npu_dynamic_scatter(Tensor feats, Tensor coors, str reduce_type = 'max') -> Tuple[torch.Tensor, torch.Tensor] +``` +### 功能描述 +将点云特征点在对应体素中进行特征压缩。 +### 参数说明 +- `feats(Tensor)`:点云特征张量[N, C],仅支持两维,数据类型为`float32`,特征向量`C`长度上限为2048。 +- `coors(Tensor)`:体素坐标映射张量[N, 3],仅支持两维,数据类型为`int32`,此处以x, y, z指代体素三维坐标,其取值范围为`0 <= x, y < 2048`, `0 <= z < 256`。 +- `reduce_type(str)`:压缩类型。可选值为`'max'`, `'mean'`, `'sum'`。默认值为`'max'` +### 返回值 +- `voxel_feats(Tensor)`:压缩后的体素特征张量,仅支持两维,数据类型为`float32`。 +- 
`voxel_coors(Tensor)`:去重后的体素坐标,仅支持两维,数据类型为`int32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import npu_dynamic_scatter + +feats = torch.tensor([[1, 2, 3], [3, 2, 1], [7, 8, 9], [9, 8, 7]], dtype=torch.float32).npu() +coors = torch.tensor([[1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]], dtype=torch.int32).npu() +voxel_feats, voxel_coors = npu_dynamic_scatter(feats, coors, 'max') + +``` +## unique_voxel +### 接口原型 +```python +mx_driving._C.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor +``` +### 功能描述 +对输入的点云数据进行去重处理。 +### 参数说明 +- `voxels (Tensor)`:数据语义为索引,数据类型为`int32`,shape为`[N]`。 +### 返回值 +- `num_voxels(int)`, 体素数量。 +- `uni_voxels(Tensor)`,去重后的体素数据,数据类型为`int32`,shape为`[num_voxels]`。 +- `uni_indices(Tensor)`, 去重后的索引数据,数据类型为`int32`,shape为`[num_voxels]`。 +- `argsort_indices(Tensor)`, 排序后的索引数据,数据类型为`int32`,shape为`[N]`。 +- `uni_argsort_indices(Tensor)`, 去重后的排序后的索引数据,数据类型为`int32`,shape为`[num_voxels]`。 +### 约束说明 +N的大小受限于内存大小,建议N小于等于2^32。 + +受限于芯片指令,输入的数据类型只能是int32,且>=0,<2^30。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import torch_npu +import numpy as np +from mx_driving._C import unique_voxel +voxels = np.random.randint(0, 1024, (100000,)).astype(np.int32) +voxels_npu = torch.from_numpy(voxels).npu() +num_voxels, uni_voxels, uni_indices, argsort_indices, uni_argsort_indices = unique_voxel(voxels_npu) + +``` + + +## voxel_pooling_train +### 接口原型 +```python +mx_driving.point.npu_voxel_pooling_train(Tensor geom_xyz, Tensor input_features, List[int] voxel_num) -> Tensor +``` +### 功能描述 +点云数据体素化。 +### 参数说明 +- `geom_xyz`:体素坐标,数据类型为`int32`,维度为(B, N, 3), 3表示x, y, z。 +- `input_features`:点云数据,数据类型为`float32|float16`,维度为(B, N, C)。 +- `voxel_num`:体素格子长宽高,数据类型为`int32`,维度为(3),3表示体素格子的长宽高。 +### 返回值 +- `output(Tensor)`:输出结果,数据类型为`float32|float16`。shape为`[B, num_voxel_y, num_voxel_x, C]`。 +### 约束说明 +- B <= 128 +- N <= 100000 +- C <= 256 +- num_voxel_x <= 1000 +- num_voxel_y <= 1000 +- num_voxel_z <= 10 +- B * num_voxel_y * num_voxel_x * C <= 100000000 +- B * N * C <= 100000000 +- 反向具有相同约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import torch_npu +import mx_driving.point + +def gen_data(geom_shape, feature_shape, coeff, batch_size, num_channels, dtype): + geom_xyz = torch.rand(geom_shape) * coeff + geom_xyz = geom_xyz.reshape(batch_size, -1, 3) + geom_xyz[:, :, 2] /= 100 + geom_xyz_cpu = geom_xyz.int() + features = torch.rand(feature_shape, dtype=dtype) - 0.5 + features_cpu = features.reshape(batch_size, -1, num_channels) + + return geom_xyz_cpu, features_cpu + +dtype = torch.float32 +coeff = 90 +voxel_num = [128, 128, 1] +batch_size = 2 +num_points = 40 +num_channel = 80 +xyz = 3 + +geom_shape = [batch_size, num_points, xyz] +feature_shape = [batch_size, num_points, num_channel] + +geom_cpu, feature_cpu = gen_data(geom_shape, feature_shape, coeff, batch_size, num_channel, dtype) + +geom_npu = geom_cpu.npu() +feature_npu = feature_cpu.npu() + +result_npu = mx_driving.point.npu_voxel_pooling_train(geom_npu, feature_npu, voxel_num) +``` +# 稀疏卷积算子(beta) +## SparseConv3d(beta) +### 接口原型 +```python +mx_driving.spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor +``` +### 功能描述 +稀疏卷积 +### 参数说明 +- `in_channels(int)`:输入数据的通道数 +- `out_channels(int)`:输出通道数 +- `kernel_size(List(int)/Tuple(int)/int)`:卷积神经网络中卷积核的大小 +- `stride(List(int)/Tuple(int)/int)`:卷积核在输入数据上滑动时的步长 +- 
+## unique_voxel
+### Interface Prototype
+```python
+mx_driving._C.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor
+```
+### Function Description
+Deduplicates the input voxel indices.
+### Parameters
+- `voxels (Tensor)`: semantically an index tensor, of type `int32`, with shape `[N]`.
+### Returns
+- `num_voxels(int)`: the number of voxels.
+- `uni_voxels(Tensor)`: the deduplicated voxel data, of type `int32`, with shape `[num_voxels]`.
+- `uni_indices(Tensor)`: the deduplicated index data, of type `int32`, with shape `[num_voxels]`.
+- `argsort_indices(Tensor)`: the sorted index data, of type `int32`, with shape `[N]`.
+- `uni_argsort_indices(Tensor)`: the deduplicated sorted index data, of type `int32`, with shape `[num_voxels]`.
+### Constraints
+N is limited by the available memory; N <= 2^32 is recommended.
+
+Due to chip instruction limitations, the input dtype must be int32 and all values must be >= 0 and < 2^30.
+### Supported Products
+- Atlas A2 Training Series Products
+### Example
+```python
+import torch
+import torch_npu
+import numpy as np
+from mx_driving._C import unique_voxel
+voxels = np.random.randint(0, 1024, (100000,)).astype(np.int32)
+voxels_npu = torch.from_numpy(voxels).npu()
+num_voxels, uni_voxels, uni_indices, argsort_indices, uni_argsort_indices = unique_voxel(voxels_npu)
+
+```
+
+
+## voxel_pooling_train
+### Interface Prototype
+```python
+mx_driving.point.npu_voxel_pooling_train(Tensor geom_xyz, Tensor input_features, List[int] voxel_num) -> Tensor
+```
+### Function Description
+Voxelizes point-cloud data.
+### Parameters
+- `geom_xyz`: voxel coordinates, of type `int32`, with shape (B, N, 3), where 3 stands for x, y, z.
+- `input_features`: point-cloud data, of type `float32|float16`, with shape (B, N, C).
+- `voxel_num`: the number of voxel cells along each dimension (length, width, height), of type `int32`, with shape (3).
+### Returns
+- `output(Tensor)`: the output, of type `float32|float16`, with shape `[B, num_voxel_y, num_voxel_x, C]`.
+### Constraints
+- B <= 128
+- N <= 100000
+- C <= 256
+- num_voxel_x <= 1000
+- num_voxel_y <= 1000
+- num_voxel_z <= 10
+- B * num_voxel_y * num_voxel_x * C <= 100000000
+- B * N * C <= 100000000
+- The backward pass has the same constraints.
+### Supported Products
+- Atlas A2 Training Series Products
+### Example
+```python
+import torch
+import torch_npu
+import mx_driving.point
+
+def gen_data(geom_shape, feature_shape, coeff, batch_size, num_channels, dtype):
+    geom_xyz = torch.rand(geom_shape) * coeff
+    geom_xyz = geom_xyz.reshape(batch_size, -1, 3)
+    geom_xyz[:, :, 2] /= 100
+    geom_xyz_cpu = geom_xyz.int()
+    features = torch.rand(feature_shape, dtype=dtype) - 0.5
+    features_cpu = features.reshape(batch_size, -1, num_channels)
+
+    return geom_xyz_cpu, features_cpu
+
+dtype = torch.float32
+coeff = 90
+voxel_num = [128, 128, 1]
+batch_size = 2
+num_points = 40
+num_channel = 80
+xyz = 3
+
+geom_shape = [batch_size, num_points, xyz]
+feature_shape = [batch_size, num_points, num_channel]
+
+geom_cpu, feature_cpu = gen_data(geom_shape, feature_shape, coeff, batch_size, num_channel, dtype)
+
+geom_npu = geom_cpu.npu()
+feature_npu = feature_cpu.npu()
+
+result_npu = mx_driving.point.npu_voxel_pooling_train(geom_npu, feature_npu, voxel_num)
+```
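The size limits above lend themselves to a quick up-front check before dispatching to the NPU. The helper below is only an illustrative sketch derived from the constraints in this section; `check_voxel_pooling_limits` is not part of the mx_driving API:

```python
def check_voxel_pooling_limits(geom_xyz, input_features, voxel_num):
    # geom_xyz: (B, N, 3), input_features: (B, N, C), voxel_num: [num_voxel_x, num_voxel_y, num_voxel_z]
    b, n, _ = geom_xyz.shape
    c = input_features.shape[-1]
    num_x, num_y, num_z = voxel_num
    assert b <= 128 and n <= 100000 and c <= 256
    assert num_x <= 1000 and num_y <= 1000 and num_z <= 10
    assert b * num_y * num_x * c <= 100000000
    assert b * n * c <= 100000000

check_voxel_pooling_limits(geom_npu, feature_npu, voxel_num)
```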
+# Sparse Convolution Operators (beta)
+## SparseConv3d(beta)
+### Interface Prototype
+```python
+mx_driving.spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor
+```
+### Function Description
+Sparse convolution.
+### Parameters
+- `in_channels(int)`: number of input channels.
+- `out_channels(int)`: number of output channels.
+- `kernel_size(List(int)/Tuple(int)/int)`: size of the convolution kernel.
+- `stride(List(int)/Tuple(int)/int)`: stride with which the kernel slides over the input.
+- `dilation(List(int)/Tuple(int)/int)`: dilation of the convolution.
+- `groups(int)`: number of convolution groups.
+- `bias(bool)`: whether to use a bias term.
+- `indice_key(String)`: key used to reuse previously computed index information.
+- `mode(String)`: selects between the `mmcv` and `spconv` flavours of sparse convolution.
+### Returns
+- `SparseConvTensor`: holds the output features `out_feature`, the corresponding indices `out_indices`, and the corresponding spatial_shape.
+### Supported Products
+- Atlas A2 Training Series Products
+### Constraints
+- `kernel_size` currently supports a 3-element List/Tuple or an int, with values in `[1, 3]`.
+- `stride` currently supports a 3-element List/Tuple or an int.
+- `dilation` and `groups` currently only support the value 1.
+- The same constraints apply to the backward pass.
+### Example
+```python
+import torch, torch_npu
+import numpy as np
+from mx_driving.spconv import SparseConv3d, SparseConvTensor
+
+def generate_indice(batch, height, width, depth, actual_num):
+    base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num]
+    base_indices = np.sort(base_indices)
+    b_indice = base_indices // (height * width * depth)
+    base_indices = base_indices % (height * width * depth)
+    h_indice = base_indices // (width * depth)
+    base_indices = base_indices % (width * depth)
+    w_indice = base_indices // depth
+    d_indice = base_indices % depth
+    indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num)
+    return indices
+
+actual_num = 20
+batch = 4
+spatial_shape = [9, 9, 9]
+indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu()
+feature = torch.rand(actual_num, 16).npu()
+feature.requires_grad = True
+x = SparseConvTensor(feature, indices, spatial_shape, batch)
+net = SparseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu()
+out = net(x)
+dout = torch.ones_like(out.features).float().npu()
+out.features.backward(dout)
+```
+
+
+## SparseInverseConv3d(beta)
+### Interface Prototype
+```python
+mx_driving.spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, output_padding=0, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor
+```
+### Function Description
+Sparse inverse convolution.
+### Parameters
+- `in_channels(int)`: number of input channels.
+- `out_channels(int)`: number of output channels.
+- `kernel_size(List(int)/Tuple(int)/int)`: size of the convolution kernel.
+- `stride(List(int)/Tuple(int)/int)`: stride with which the kernel slides over the input.
+- `dilation(List(int)/Tuple(int)/int)`: dilation of the convolution.
+- `groups(int)`: number of convolution groups.
+- `bias(bool)`: whether to use a bias term.
+- `indice_key(String)`: key used to reuse previously computed index information.
+- `mode(String)`: selects between the `mmcv` and `spconv` flavours of sparse convolution.
+### Returns
+- `SparseConvTensor`: holds the output features `out_feature`, the corresponding indices `out_indices`, and the corresponding spatial_shape.
+### Supported Products
+- Atlas A2 Training Series Products
+### Constraints
+- `kernel_size` currently supports a 3-element List/Tuple or an int, with values in `[1, 3]`.
+- `stride` currently supports a 3-element List/Tuple or an int.
+- `dilation` and `groups` currently only support the value 1.
+- The same constraints apply to the backward pass.
+### Example
+```python
+import torch, torch_npu
+import numpy as np
+from mx_driving.spconv import SparseInverseConv3d, SparseConvTensor
+
+def generate_indice(batch, height, width, depth, actual_num):
+    base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num]
+    base_indices = np.sort(base_indices)
+    b_indice = base_indices // (height * width * depth)
+    base_indices = base_indices % (height * width * depth)
+    h_indice = base_indices // (width * depth)
+    base_indices = base_indices % (width * depth)
+    w_indice = base_indices // depth
+    d_indice = base_indices % depth
+    indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num)
+    return indices
+
+actual_num = 20
+batch = 4
+spatial_shape = [9, 9, 9]
+indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu()
+feature = torch.rand(actual_num, 16).npu()
+feature.requires_grad = True
+x = SparseConvTensor(feature, indices, spatial_shape, batch)
+net = SparseInverseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu()
+out = net(x)
+dout = torch.ones_like(out.features).float().npu()
+out.features.backward(dout)
+```
+
+
+## SubMConv3d(beta)
+### Interface Prototype
+```python
+mx_driving.spconv.SubMConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor
+```
+### Function Description
+Submanifold sparse convolution: an output position is affected only when the kernel center takes part in the computation.
+### Parameters
+- `in_channels(int)`: number of input channels.
+- `out_channels(int)`: number of output channels.
+- `kernel_size(List(int)/Tuple(int)/int)`: size of the convolution kernel.
+- `stride(List(int)/Tuple(int)/int)`: stride with which the kernel slides over the input.
+- `dilation(List(int)/Tuple(int)/int)`: dilation of the convolution.
+- `groups(int)`: number of convolution groups.
+- `bias(bool)`: whether to use a bias term.
+- `indice_key(String)`: key used to reuse previously computed index information.
+- `mode(String)`: selects between the `mmcv` and `spconv` flavours of sparse convolution.
+### Returns
+- `SparseConvTensor`: holds the output features `out_feature`, the corresponding indices `out_indices`, and the corresponding spatial_shape.
+### Supported Products
+- Atlas A2 Training Series Products
+### Constraints
+- `kernel_size` currently supports a 3-element List/Tuple or an int; only the values 1 and 3 are supported.
+- `stride` currently supports a 3-element List/Tuple or an int; only the value 1 is supported.
+- `dilation` and `groups` currently only support the value 1.
+- The same constraints apply to the backward pass.
+### Example
+```python
+import torch, torch_npu
+import numpy as np
+from mx_driving.spconv import SubMConv3d, SparseConvTensor
+
+def generate_indice(batch, height, width, depth, actual_num):
+    base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num]
+    base_indices = np.sort(base_indices)
+    b_indice = base_indices // (height * width * depth)
+    base_indices = base_indices % (height * width * depth)
+    h_indice = base_indices // (width * depth)
+    base_indices = base_indices % (width * depth)
+    w_indice = base_indices // depth
+    d_indice = base_indices % depth
+    indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num)
+    return indices
+
+actual_num = 20
+batch = 4
+spatial_shape = [9, 9, 9]
+indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu()
+feature = torch.rand(actual_num, 16).npu()
+feature.requires_grad = True
+x = SparseConvTensor(feature, indices, spatial_shape, batch)
+net = SubMConv3d(in_channels=16, out_channels=32, kernel_size=3).npu()
+out = net(x)
+dout = torch.ones_like(out.features).float().npu()
+out.features.backward(dout)
+```
\ No newline at end of file
diff --git a/mx_driving/point/ops/csrc/functions.h b/include/csrc/functions.h
similarity index 31%
rename from mx_driving/point/ops/csrc/functions.h
rename to include/csrc/functions.h
index 806ddd9504e6ec4a9839ad780cd4f8fd1359b5ba..98a171b0fa016366add9180eecf05197f0ddc3aa 100644
--- a/mx_driving/point/ops/csrc/functions.h
+++ b/include/csrc/functions.h
@@ -11,13 +11,90 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#ifndef CSRC_FUNCTIONS_H_ +#define CSRC_FUNCTIONS_H_ -#ifndef PERCEPTION_POINT_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_POINT_OPS_CSRC_FUNCTIONS_H_ -#include -#include +#include -#include +std::tuple knn( + const at::Tensor& xyz, const at::Tensor& center_xyz, int32_t k, bool is_from_knn); + +at::Tensor npu_three_interpolate( + int b, int c, int m, int n, const at::Tensor& points, const at::Tensor& idx, const at::Tensor& weight); + +at::Tensor npu_three_interpolate_backward( + int b, int c, int n, int m, const at::Tensor& grad_out, const at::Tensor& idx, const at::Tensor& weight); + +std::tuple scatter_max_with_argmax_v2( + const at::Tensor& updates, const at::Tensor& indices, c10::optional out); + +at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments); + +at::Tensor npu_scatter(const at::Tensor& self, const at::Tensor& indices, const at::Tensor& updates, int64_t dim); + +at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Tensor& count, int32_t dim); + +std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& index, c10::optional out, + c10::optional dim, c10::optional dim_size); +std::tuple npu_sort_pairs( + const at::Tensor& keys_in, const at::Tensor& values_in, int64_t dim, bool descending); + +at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y); + +std::tuple npu_hypot_grad( + const at::Tensor& x, const at::Tensor& y, const at::Tensor& out, const at::Tensor& out_grad); + +void assign_score_withk(const at::Tensor& points, const at::Tensor& centers, const at::Tensor& scores, + const at::Tensor& knn_idx, at::Tensor& output, int32_t B, int32_t N, int32_t npoint, int32_t M, int32_t K, + int32_t out_dim, int32_t aggregate); +at::Tensor npu_max_pool2d(const at::Tensor& x, int kernel_size, int stride, int padding); + +at::Tensor multi_scale_deformable_attn(const at::Tensor& value, const at::Tensor& value_spatial_shapes, + const at::Tensor& value_level_start_index, const at::Tensor& sampling_locations, + const at::Tensor& attention_weights); + +std::tuple multi_scale_deformable_attn_backward(const at::Tensor& value, + const at::Tensor& value_spatial_shapes, const at::Tensor& value_level_start_index, + const at::Tensor& sampling_locations, const at::Tensor& attention_weights, const at::Tensor& grad_output); + +std::tuple multi_scale_deformable_attn_grad_v2(const at::Tensor& value, + const at::Tensor& shape, const at::Tensor& level_start_index, const at::Tensor& location_trans, + const at::Tensor& attn_weight_trans, const at::Tensor& grad_output); + +at::Tensor npu_add_relu(at::Tensor& x, const at::Tensor& y); + +at::Tensor npu_add_relu_grad(at::Tensor& self, at::Tensor& grad_output); +std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& index, c10::optional out, + c10::optional dim, c10::optional dim_size); + +at::Tensor fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias, double negative_slop, double scale); + +at::Tensor deformable_aggregation(const at::Tensor& mc_ms_feat, const at::Tensor& spatial_shape, + const at::Tensor& scale_start_index, const at::Tensor& sampling_location, const at::Tensor& weights); +std::tuple deformable_aggregation_backward(const at::Tensor& mc_ms_feat, + const at::Tensor& spatial_shape, const at::Tensor& scale_start_index, const at::Tensor& sampling_location, + const at::Tensor& weights, const at::Tensor& grad_output, const at::Tensor& grad_mc_ms_feat, + const at::Tensor& grad_sampling_location, const at::Tensor& grad_weights); + +std::tuple 
deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, + const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, + at::IntArrayRef dilation, int64_t groups, int64_t deformable_groups); + +std::tuple modulated_deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, + const at::Tensor& mask, const at::Tensor& weight, const c10::optional& bias_opt, + at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, + int64_t groups, int64_t deformable_groups, int64_t with_bias); + +std::tuple deformable_conv2d_backward(const at::Tensor& input, + const at::Tensor& weight, const at::Tensor& offset, const at::Tensor& offset_output, const at::Tensor& grad_y, + at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, + int64_t groups, int64_t deformable_groups); + +std::tuple modulated_deformable_conv2d_backward( + const at::Tensor& input, const at::Tensor& offset, const at::Tensor& mask, const at::Tensor& weight, + const c10::optional& bias_opt, const at::Tensor& offset_output, const at::Tensor& grad_y, + at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, + int64_t groups, int64_t deformable_groups, int64_t with_bias); at::Tensor group_points( const at::Tensor& points, const at::Tensor& idx, int64_t b, int64_t c, int64_t n, int64_t npoints, int64_t nsample); @@ -78,4 +155,82 @@ at::Tensor npu_bev_pool_v3(const at::Tensor& depth, const at::Tensor& feat, cons const at::Tensor& ranks_feat, const at::Tensor& ranks_bev, int64_t b, int64_t d, int64_t h, int64_t w); std::tuple npu_bev_pool_v3_backward(const at::Tensor& grad_out, const at::Tensor& depth, const at::Tensor& feat, const at::Tensor& ranks_depth, const at::Tensor& ranks_feat, const at::Tensor& ranks_bev); -#endif // PERCEPTION_POINT_OPS_CSRC_FUNCTIONS_H_ +std::tuple npu_subm_sparse_conv3d(const at::Tensor& feature, + const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, int out_channel, + at::IntArrayRef outSpatialShape, int batch_size, const at::Tensor& temp); + +std::tuple multi_to_sparse(const at::Tensor& out_features, + const at::Tensor& unique_indices_offset, const at::Tensor& sorted_idx_to_former_indices, + const at::Tensor& outidx_pair); + +std::tuple multi_to_sparse_v2(const at::Tensor& features, const at::Tensor& weight, + const at::Tensor& unique_indices_offset, const at::Tensor& sorted_idx_to_former_indices, + const at::Tensor& outidx_pair); + +std::tuple npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size, + at::IntArrayRef stride, at::IntArrayRef padding, int out_channel, at::IntArrayRef outSpatialShape, int batch_size); + +std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, + const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, + at::IntArrayRef padding, at::IntArrayRef dilation, at::IntArrayRef output_padding, int out_channel, + at::IntArrayRef outSpatialShape, int batch_size); + +std::tuple npu_sparse_conv3d_grad(const at::Tensor& indices_offset, + const at::Tensor& former_sorted_indices, const at::Tensor& feature, const at::Tensor& weight, + const at::Tensor& grad); + +std::tuple npu_prepare_subm_conv3d( + const at::Tensor& flattenIndices, at::IntArrayRef outSpatialShape, int batch_size); + +std::tuple nms3d_normal(const at::Tensor& boxes, double nms_overlap_thresh); + +std::tuple nms3d(const at::Tensor& 
boxes, double threshold); + +at::Tensor npu_rotated_overlaps(const at::Tensor& self, const at::Tensor& query_boxes, bool trans); + +at::Tensor npu_rotated_iou(const at::Tensor& boxes, const at::Tensor& query_boxes, bool trans, int64_t mode, + bool is_cross, double v_threshold, double e_threshold); + +at::Tensor npu_boxes_overlap_bev(const at::Tensor& boxes_a, const at::Tensor& boxes_b); + +void roi_align_rotated_v2_forward_npu(const at::Tensor& input, const at::Tensor& rois_map, at::Tensor& output, + double spatial_scale, int32_t sampling_ratio, int32_t pooled_height, int32_t pooled_width, bool aligned, + bool clockwise); +at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, const at::Tensor& rois, const at::Tensor& grad_output, + int32_t pooled_height, int32_t pooled_width, double spatial_scale, int32_t sampling_ratio, bool aligned, + bool clockwise); + +at::Tensor npu_box_iou_quadri( + const at::Tensor& boxes_a, const at::Tensor& boxes_b, const int64_t mode_flag, const bool aligned); + +at::Tensor npu_box_iou_rotated( + const at::Tensor& boxes_a, const at::Tensor& boxes_b, const int64_t mode_flag, const bool aligned); + +void border_align(const at::Tensor& input, const at::Tensor& rois, at::Tensor& output, int32_t pooled_size); + +at::Tensor border_align_backward(const at::Tensor& grad_out, const at::Tensor& boxes, const at::Tensor& argmax_idx, + int32_t pool_size, int32_t height, int32_t width); + +void npu_roiaware_pool3d_forward(const at::Tensor& rois, const at::Tensor& pts, const at::Tensor& pts_feature, + at::Tensor& argmax, at::Tensor& pts_idx_of_voxels, at::Tensor& pooled_features, int32_t mode); +at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::Tensor& argmax, + const at::Tensor& grad_out, int32_t npoints, int64_t pool_method); + +std::vector> pixel_group(const at::Tensor& score, const at::Tensor& mask, + const at::Tensor& embedding, const at::Tensor& kernel_label, const at::Tensor& kernel_contour, + int kernel_region_num, double distance_threshold); + +at::Tensor npu_points_in_box(const at::Tensor& boxes, const at::Tensor& pts); + +at::Tensor npu_points_in_box_all(const at::Tensor& boxes, const at::Tensor& pts); + +std::tuple npu_roipoint_pool3d_forward(const int32_t num_sampled_points, + const at::Tensor& points, const at::Tensor& point_features, const at::Tensor& boxes3d); + +at::Tensor npu_geometric_kernel_attention(const at::Tensor& value, const at::Tensor& spatial_shapes, + const at::Tensor& level_start_index, const at::Tensor& sampling_locations, const at::Tensor& attn_weights); + +std::tuple npu_geometric_kernel_attention_backward(const at::Tensor& value, + const at::Tensor& spatial_shapes, const at::Tensor& level_start_index, const at::Tensor& sampling_locations, + const at::Tensor& attn_weights, const at::Tensor& grad_output); +#endif // CSRC_FUNCTIONS_H_ diff --git a/include/csrc/utils.h b/include/csrc/utils.h index 9ffbf71a525941cc3927678bd1af02a6615289e3..8c89c4b9174c3fd33faba9c58bafe850d4e57fd9 100644 --- a/include/csrc/utils.h +++ b/include/csrc/utils.h @@ -17,10 +17,9 @@ #ifndef CSRC_UTILS_H_ #define CSRC_UTILS_H_ -#include #include -template +template inline T1 Ceil(const T1& x, const T2& y) { if (y == 0) { @@ -29,7 +28,7 @@ inline T1 Ceil(const T1& x, const T2& y) return (x + y - 1) / y; } -template +template inline T1 AlignUp(const T1& x, const T2& y) { if (y == 0) { @@ -38,12 +37,12 @@ inline T1 AlignUp(const T1& x, const T2& y) return ((x + y - 1) / y) * y; } -template +template inline T1 Tail(const T1& x, const T2& 
y) { if (x == 0 || y == 0) { return 0; } - return (x - 1) % y + 1; + return (x - 1) % y + 1; } #endif // CSRC_UTILS_H_ \ No newline at end of file diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a3e5516a2089b1f447b2f32cefa4e3e96ddc96f --- /dev/null +++ b/kernels/CMakeLists.txt @@ -0,0 +1,173 @@ +file(GLOB ASCEND_HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/op_host/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/*.h) +file(GLOB ASCEND_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/*.h) +if(BUILD_STAGE EQUAL 0) + add_library(ascend_all_ops SHARED ${ASCEND_HOST_SRC}) + target_compile_options(ascend_all_ops PRIVATE -g -fPIC -std=c++11 + -D_GLIBCXX_USE_CXX11_ABI=0) + target_include_directories(ascend_all_ops PRIVATE ${CANN_INCLUDE_PATH}) + target_link_libraries(ascend_all_ops PRIVATE intf_pub exe_graph register + tiling_api ascendcl) + add_custom_command( + TARGET ascend_all_ops + POST_BUILD + COMMAND ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + $ ${ASCEND_AUTOGEN_PATH}) +elseif(BUILD_STAGE EQUAL 1) + # ===================Build proto =================== + add_library(cust_op_proto SHARED ${ASCEND_AUTOGEN_PATH}/op_proto.cc) + target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) + target_compile_options(cust_op_proto PRIVATE -fvisibility=hidden) + target_link_libraries( + cust_op_proto + PRIVATE intf_pub + exe_graph + register + tiling_api + ascendcl + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive) + set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0) + install_target( + TRG cust_op_proto DST + packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR} + ) + install_file(TRG cust_op_proto SRC ${ASCEND_AUTOGEN_PATH}/op_proto.h DST + packages/vendors/${vendor_name}/op_proto/inc) + + add_library(cust_optiling SHARED ${ASCEND_HOST_SRC}) + target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) + target_compile_options(cust_optiling PRIVATE -fvisibility=hidden) + target_link_libraries( + cust_optiling + PRIVATE intf_pub + exe_graph + register + tiling_api + ascendcl + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive) + set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0) + install_target( + TRG + cust_optiling + DST + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR} + ) + # create liboptiling.so link + add_custom_command( + TARGET cust_optiling + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E chdir + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling + ${CMAKE_COMMAND} -E create_symlink + lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + liboptiling.so) + install( + FILES + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) + + # ===================Build ACLNN=================== + file(GLOB ACLNN_SRC ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) + file(GLOB ACLNN_INC ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) + add_library(cust_opapi SHARED ${ACLNN_SRC}) + target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase opapi) + install_target(TRG cust_opapi DST packages/vendors/${vendor_name}/op_api/lib) + install_file(TRG cust_opapi SRC ${ACLNN_INC} DST + packages/vendors/${vendor_name}/op_api/include) + if("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL 
OPTIONS -g -O0) + endif() + + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + if(EXISTS ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini) + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target( + TARGET + ops_info_gen_${compute_unit} + OUTPUT + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO + ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # generate ascendc impl py once + if(NOT TARGET ascendc_impl_gen) + add_ops_impl_target( + TARGET + ascendc_impl_gen + OPS_INFO + ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel + OUT_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl + ) + install_file( + TRG + ascendc_impl_gen + SRC + ${ASCEND_KERNEL_SRC} + DST + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) + endif() + + # dynamic shape binary compile + if(${ENABLE_BINARY_PACKAGE}) + add_bin_compile_target( + TARGET + ascendc_bin_${compute_unit} + OPS_INFO + ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel + ADP_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + OUT_DIR + ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + KERNEL_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + INSTALL_DIR + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + COMPUTE_UNIT + ${compute_unit}) + add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen + cust_optiling) + endif() + endif() + endforeach() + + # generate npu_supported_ops.json + add_npu_support_target( + TARGET + npu_supported_ops + OPS_INFO_DIR + ${ASCEND_AUTOGEN_PATH} + OUT_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_info_cfg/ai_core + INSTALL_DIR + packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE}) + + get_system_info(SYSTEM_INFO) + + # gen version.info + add_custom_target( + gen_version_info ALL + COMMAND + bash ${PROJECT_SOURCE_DIR}/cmake/util/gen_version_info.sh + ${ASCEND_CANN_PACKAGE_PATH} + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}) + + install(FILES ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/version.info + DESTINATION packages/vendors/${vendor_name}) +endif() diff --git a/mx_driving/common/ops/csrc/OWNERS b/kernels/op_host/OWNERS similarity index 100% rename from mx_driving/common/ops/csrc/OWNERS rename to kernels/op_host/OWNERS diff --git a/mx_driving/fused/ops/kernels/op_host/add_relu.cpp b/kernels/op_host/add_relu.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/add_relu.cpp rename to kernels/op_host/add_relu.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/add_relu_tiling.h b/kernels/op_host/add_relu_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/add_relu_tiling.h rename to kernels/op_host/add_relu_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/assign_score_withk.cpp b/kernels/op_host/assign_score_withk.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/assign_score_withk.cpp rename to kernels/op_host/assign_score_withk.cpp diff --git a/mx_driving/common/ops/kernels/op_host/assign_score_withk_tiling.h b/kernels/op_host/assign_score_withk_tiling.h similarity 
index 100% rename from mx_driving/common/ops/kernels/op_host/assign_score_withk_tiling.h rename to kernels/op_host/assign_score_withk_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/bev_pool.cpp b/kernels/op_host/bev_pool.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/bev_pool.cpp rename to kernels/op_host/bev_pool.cpp diff --git a/mx_driving/point/ops/kernels/op_host/bev_pool_tiling.h b/kernels/op_host/bev_pool_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/bev_pool_tiling.h rename to kernels/op_host/bev_pool_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/bev_pool_v3.cpp b/kernels/op_host/bev_pool_v3.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/bev_pool_v3.cpp rename to kernels/op_host/bev_pool_v3.cpp diff --git a/mx_driving/point/ops/kernels/op_host/bev_pool_v3_tiling.h b/kernels/op_host/bev_pool_v3_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/bev_pool_v3_tiling.h rename to kernels/op_host/bev_pool_v3_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/border_align.cpp b/kernels/op_host/border_align.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/border_align.cpp rename to kernels/op_host/border_align.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/border_align_grad.cpp b/kernels/op_host/border_align_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/border_align_grad.cpp rename to kernels/op_host/border_align_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/border_align_grad_tiling.h b/kernels/op_host/border_align_grad_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/border_align_grad_tiling.h rename to kernels/op_host/border_align_grad_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/border_align_tiling.h b/kernels/op_host/border_align_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/border_align_tiling.h rename to kernels/op_host/border_align_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/box_iou.cpp b/kernels/op_host/box_iou.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/box_iou.cpp rename to kernels/op_host/box_iou.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/box_iou_tiling.h b/kernels/op_host/box_iou_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/box_iou_tiling.h rename to kernels/op_host/box_iou_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/boxes_overlap_bev.cpp b/kernels/op_host/boxes_overlap_bev.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/boxes_overlap_bev.cpp rename to kernels/op_host/boxes_overlap_bev.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/boxes_overlap_bev_tiling.h b/kernels/op_host/boxes_overlap_bev_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/boxes_overlap_bev_tiling.h rename to kernels/op_host/boxes_overlap_bev_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/common.h b/kernels/op_host/common.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/common.h rename to kernels/op_host/common.h diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_aggregation.cpp b/kernels/op_host/deformable_aggregation.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_aggregation.cpp rename 
to kernels/op_host/deformable_aggregation.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_aggregation_grad.cpp b/kernels/op_host/deformable_aggregation_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_aggregation_grad.cpp rename to kernels/op_host/deformable_aggregation_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_aggregation_grad_tiling.h b/kernels/op_host/deformable_aggregation_grad_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_aggregation_grad_tiling.h rename to kernels/op_host/deformable_aggregation_grad_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_aggregation_tiling.h b/kernels/op_host/deformable_aggregation_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_aggregation_tiling.h rename to kernels/op_host/deformable_aggregation_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_conv2d.cpp b/kernels/op_host/deformable_conv2d.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_conv2d.cpp rename to kernels/op_host/deformable_conv2d.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_conv2d_grad.cpp b/kernels/op_host/deformable_conv2d_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_conv2d_grad.cpp rename to kernels/op_host/deformable_conv2d_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_conv2d_grad_tiling.h b/kernels/op_host/deformable_conv2d_grad_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_conv2d_grad_tiling.h rename to kernels/op_host/deformable_conv2d_grad_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_conv2d_tiling.h b/kernels/op_host/deformable_conv2d_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_conv2d_tiling.h rename to kernels/op_host/deformable_conv2d_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_scatter.cpp b/kernels/op_host/dynamic_scatter.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_scatter.cpp rename to kernels/op_host/dynamic_scatter.cpp diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_scatter_grad.cpp b/kernels/op_host/dynamic_scatter_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_scatter_grad.cpp rename to kernels/op_host/dynamic_scatter_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_scatter_grad_tiling.h b/kernels/op_host/dynamic_scatter_grad_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_scatter_grad_tiling.h rename to kernels/op_host/dynamic_scatter_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_scatter_tiling.h b/kernels/op_host/dynamic_scatter_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_scatter_tiling.h rename to kernels/op_host/dynamic_scatter_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_voxelization.cpp b/kernels/op_host/dynamic_voxelization.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_voxelization.cpp rename to kernels/op_host/dynamic_voxelization.cpp diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_voxelization_tiling.h b/kernels/op_host/dynamic_voxelization_tiling.h similarity index 100% rename from 
mx_driving/point/ops/kernels/op_host/dynamic_voxelization_tiling.h rename to kernels/op_host/dynamic_voxelization_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/furthest_point_sampling.cpp b/kernels/op_host/furthest_point_sampling.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/furthest_point_sampling.cpp rename to kernels/op_host/furthest_point_sampling.cpp diff --git a/mx_driving/point/ops/kernels/op_host/furthest_point_sampling_tiling.h b/kernels/op_host/furthest_point_sampling_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/furthest_point_sampling_tiling.h rename to kernels/op_host/furthest_point_sampling_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/furthest_point_sampling_with_dist.cpp b/kernels/op_host/furthest_point_sampling_with_dist.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/furthest_point_sampling_with_dist.cpp rename to kernels/op_host/furthest_point_sampling_with_dist.cpp diff --git a/mx_driving/point/ops/kernels/op_host/furthest_point_sampling_with_dist_tiling.h b/kernels/op_host/furthest_point_sampling_with_dist_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/furthest_point_sampling_with_dist_tiling.h rename to kernels/op_host/furthest_point_sampling_with_dist_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/fused_bias_leaky_relu_v2.cpp b/kernels/op_host/fused_bias_leaky_relu_v2.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/fused_bias_leaky_relu_v2.cpp rename to kernels/op_host/fused_bias_leaky_relu_v2.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/fused_bias_leaky_relu_v2_tiling.h b/kernels/op_host/fused_bias_leaky_relu_v2_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/fused_bias_leaky_relu_v2_tiling.h rename to kernels/op_host/fused_bias_leaky_relu_v2_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/gather_nms3d_mask_tiling.cpp b/kernels/op_host/gather_nms3d_mask_tiling.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/gather_nms3d_mask_tiling.cpp rename to kernels/op_host/gather_nms3d_mask_tiling.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/gather_nms3d_mask_tiling.h b/kernels/op_host/gather_nms3d_mask_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/gather_nms3d_mask_tiling.h rename to kernels/op_host/gather_nms3d_mask_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/geometric_kernel_attn_grad.cpp b/kernels/op_host/geometric_kernel_attn_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/geometric_kernel_attn_grad.cpp rename to kernels/op_host/geometric_kernel_attn_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/geometric_kernel_attn_grad_tiling.h b/kernels/op_host/geometric_kernel_attn_grad_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/geometric_kernel_attn_grad_tiling.h rename to kernels/op_host/geometric_kernel_attn_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/group_points.cpp b/kernels/op_host/group_points.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/group_points.cpp rename to kernels/op_host/group_points.cpp diff --git a/mx_driving/point/ops/kernels/op_host/group_points_grad.cpp b/kernels/op_host/group_points_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/group_points_grad.cpp rename to 
kernels/op_host/group_points_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_host/group_points_grad_tiling.h b/kernels/op_host/group_points_grad_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/group_points_grad_tiling.h rename to kernels/op_host/group_points_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/group_points_tiling.h b/kernels/op_host/group_points_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/group_points_tiling.h rename to kernels/op_host/group_points_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/hard_voxelize.cpp b/kernels/op_host/hard_voxelize.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/hard_voxelize.cpp rename to kernels/op_host/hard_voxelize.cpp diff --git a/mx_driving/point/ops/kernels/op_host/hard_voxelize_tiling.h b/kernels/op_host/hard_voxelize_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/hard_voxelize_tiling.h rename to kernels/op_host/hard_voxelize_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/hypot.cpp b/kernels/op_host/hypot.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/hypot.cpp rename to kernels/op_host/hypot.cpp diff --git a/mx_driving/common/ops/kernels/op_host/hypot_grad.cpp b/kernels/op_host/hypot_grad.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/hypot_grad.cpp rename to kernels/op_host/hypot_grad.cpp diff --git a/mx_driving/common/ops/kernels/op_host/hypot_tiling.h b/kernels/op_host/hypot_tiling.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/hypot_tiling.h rename to kernels/op_host/hypot_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/knn.cpp b/kernels/op_host/knn.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/knn.cpp rename to kernels/op_host/knn.cpp diff --git a/mx_driving/common/ops/kernels/op_host/knn_tiling.h b/kernels/op_host/knn_tiling.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/knn_tiling.h rename to kernels/op_host/knn_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/max_pool2d.cpp b/kernels/op_host/max_pool2d.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/max_pool2d.cpp rename to kernels/op_host/max_pool2d.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/max_pool2d.h b/kernels/op_host/max_pool2d.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/max_pool2d.h rename to kernels/op_host/max_pool2d.h diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn.cpp b/kernels/op_host/multi_scale_deformable_attn.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn.cpp rename to kernels/op_host/multi_scale_deformable_attn.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad.cpp b/kernels/op_host/multi_scale_deformable_attn_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad.cpp rename to kernels/op_host/multi_scale_deformable_attn_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_tiling.h b/kernels/op_host/multi_scale_deformable_attn_grad_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_tiling.h rename to kernels/op_host/multi_scale_deformable_attn_grad_tiling.h diff --git 
a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_tiling_v2.h b/kernels/op_host/multi_scale_deformable_attn_grad_tiling_v2.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_tiling_v2.h rename to kernels/op_host/multi_scale_deformable_attn_grad_tiling_v2.h diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_v2.cpp b/kernels/op_host/multi_scale_deformable_attn_grad_v2.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_v2.cpp rename to kernels/op_host/multi_scale_deformable_attn_grad_v2.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_tiling.h b/kernels/op_host/multi_scale_deformable_attn_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_tiling.h rename to kernels/op_host/multi_scale_deformable_attn_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/nms3d.cpp b/kernels/op_host/nms3d.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/nms3d.cpp rename to kernels/op_host/nms3d.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.cpp b/kernels/op_host/nms3d_normal_tiling.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.cpp rename to kernels/op_host/nms3d_normal_tiling.cpp diff --git a/mx_driving/common/ops/kernels/op_host/nms3d_normal_tiling.h b/kernels/op_host/nms3d_normal_tiling.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/nms3d_normal_tiling.h rename to kernels/op_host/nms3d_normal_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/nms3d_tiling.h b/kernels/op_host/nms3d_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/nms3d_tiling.h rename to kernels/op_host/nms3d_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/pixel_group.cpp b/kernels/op_host/pixel_group.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/pixel_group.cpp rename to kernels/op_host/pixel_group.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/pixel_group_tiling.h b/kernels/op_host/pixel_group_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/pixel_group_tiling.h rename to kernels/op_host/pixel_group_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/point_to_voxel.cpp b/kernels/op_host/point_to_voxel.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/point_to_voxel.cpp rename to kernels/op_host/point_to_voxel.cpp diff --git a/mx_driving/point/ops/kernels/op_host/point_to_voxel_tiling.h b/kernels/op_host/point_to_voxel_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/point_to_voxel_tiling.h rename to kernels/op_host/point_to_voxel_tiling.h diff --git a/mx_driving/preprocess/ops/kernels/op_host/points_in_box.cpp b/kernels/op_host/points_in_box.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/points_in_box.cpp rename to kernels/op_host/points_in_box.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_host/points_in_box_all.cpp b/kernels/op_host/points_in_box_all.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/points_in_box_all.cpp rename to kernels/op_host/points_in_box_all.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_host/points_in_box_all_tiling.h 
b/kernels/op_host/points_in_box_all_tiling.h similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/points_in_box_all_tiling.h rename to kernels/op_host/points_in_box_all_tiling.h diff --git a/mx_driving/preprocess/ops/kernels/op_host/points_in_box_tiling.h b/kernels/op_host/points_in_box_tiling.h similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/points_in_box_tiling.h rename to kernels/op_host/points_in_box_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roi_align_rotated_grad_v2.cpp b/kernels/op_host/roi_align_rotated_grad_v2.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roi_align_rotated_grad_v2.cpp rename to kernels/op_host/roi_align_rotated_grad_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roi_align_rotated_grad_v2_tiling.h b/kernels/op_host/roi_align_rotated_grad_v2_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roi_align_rotated_grad_v2_tiling.h rename to kernels/op_host/roi_align_rotated_grad_v2_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roi_align_rotated_v2.cpp b/kernels/op_host/roi_align_rotated_v2.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roi_align_rotated_v2.cpp rename to kernels/op_host/roi_align_rotated_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roi_align_rotated_v2_tiling.h b/kernels/op_host/roi_align_rotated_v2_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roi_align_rotated_v2_tiling.h rename to kernels/op_host/roi_align_rotated_v2_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_avgpool3d_grad.cpp b/kernels/op_host/roiaware_avgpool3d_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_avgpool3d_grad.cpp rename to kernels/op_host/roiaware_avgpool3d_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_avgpool3d_grad_tiling.h b/kernels/op_host/roiaware_avgpool3d_grad_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_avgpool3d_grad_tiling.h rename to kernels/op_host/roiaware_avgpool3d_grad_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_maxpool3d_grad.cpp b/kernels/op_host/roiaware_maxpool3d_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_maxpool3d_grad.cpp rename to kernels/op_host/roiaware_maxpool3d_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_maxpool3d_grad_tiling.h b/kernels/op_host/roiaware_maxpool3d_grad_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_maxpool3d_grad_tiling.h rename to kernels/op_host/roiaware_maxpool3d_grad_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_pool3d.cpp b/kernels/op_host/roiaware_pool3d.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_pool3d.cpp rename to kernels/op_host/roiaware_pool3d.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_pool3d_tiling.h b/kernels/op_host/roiaware_pool3d_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_pool3d_tiling.h rename to kernels/op_host/roiaware_pool3d_tiling.h diff --git a/mx_driving/preprocess/ops/kernels/op_host/roipoint_pool3d_forward.cpp b/kernels/op_host/roipoint_pool3d_forward.cpp similarity index 100% rename from 
mx_driving/preprocess/ops/kernels/op_host/roipoint_pool3d_forward.cpp rename to kernels/op_host/roipoint_pool3d_forward.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_host/roipoint_pool3d_forward_tiling.h b/kernels/op_host/roipoint_pool3d_forward_tiling.h similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/roipoint_pool3d_forward_tiling.h rename to kernels/op_host/roipoint_pool3d_forward_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/scatter_max_with_argmax_v2.cpp b/kernels/op_host/scatter_max_with_argmax_v2.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_max_with_argmax_v2.cpp rename to kernels/op_host/scatter_max_with_argmax_v2.cpp diff --git a/mx_driving/common/ops/kernels/op_host/scatter_max_with_argmax_v2.h b/kernels/op_host/scatter_max_with_argmax_v2.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_max_with_argmax_v2.h rename to kernels/op_host/scatter_max_with_argmax_v2.h diff --git a/mx_driving/common/ops/kernels/op_host/scatter_mean.cpp b/kernels/op_host/scatter_mean.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_mean.cpp rename to kernels/op_host/scatter_mean.cpp diff --git a/mx_driving/common/ops/kernels/op_host/scatter_mean.h b/kernels/op_host/scatter_mean.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_mean.h rename to kernels/op_host/scatter_mean.h diff --git a/mx_driving/common/ops/kernels/op_host/scatter_mean_grad.cpp b/kernels/op_host/scatter_mean_grad.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_mean_grad.cpp rename to kernels/op_host/scatter_mean_grad.cpp diff --git a/mx_driving/common/ops/kernels/op_host/scatter_mean_grad_tiling.h b/kernels/op_host/scatter_mean_grad_tiling.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_mean_grad_tiling.h rename to kernels/op_host/scatter_mean_grad_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_conv3d.cpp b/kernels/op_host/sparse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_conv3d.cpp rename to kernels/op_host/sparse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_grad_v2.cpp b/kernels/op_host/sparse_conv3d_grad_v2.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_grad_v2.cpp rename to kernels/op_host/sparse_conv3d_grad_v2.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_grad_v2_tiling.h b/kernels/op_host/sparse_conv3d_grad_v2_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_grad_v2_tiling.h rename to kernels/op_host/sparse_conv3d_grad_v2_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_tiling.h b/kernels/op_host/sparse_conv3d_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_tiling.h rename to kernels/op_host/sparse_conv3d_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_inverse_conv3d.cpp b/kernels/op_host/sparse_inverse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_inverse_conv3d.cpp rename to kernels/op_host/sparse_inverse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_inverse_conv3d_tiling.h b/kernels/op_host/sparse_inverse_conv3d_tiling.h similarity index 100% rename from 
mx_driving/spconv/ops/kernels/op_host/sparse_inverse_conv3d_tiling.h rename to kernels/op_host/sparse_inverse_conv3d_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/subm_sparse_conv3d_tiling.cpp b/kernels/op_host/subm_sparse_conv3d_tiling.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/subm_sparse_conv3d_tiling.cpp rename to kernels/op_host/subm_sparse_conv3d_tiling.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/subm_sparse_conv3d_tiling.h b/kernels/op_host/subm_sparse_conv3d_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/subm_sparse_conv3d_tiling.h rename to kernels/op_host/subm_sparse_conv3d_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/to_sparse.cpp b/kernels/op_host/to_sparse.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/to_sparse.cpp rename to kernels/op_host/to_sparse.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/to_sparse_tiling.h b/kernels/op_host/to_sparse_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/to_sparse_tiling.h rename to kernels/op_host/to_sparse_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/to_sparse_v3.cpp b/kernels/op_host/to_sparse_v3.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/to_sparse_v3.cpp rename to kernels/op_host/to_sparse_v3.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/to_sparse_v3_tiling.h b/kernels/op_host/to_sparse_v3_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/to_sparse_v3_tiling.h rename to kernels/op_host/to_sparse_v3_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/unique_voxel.cpp b/kernels/op_host/unique_voxel.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/unique_voxel.cpp rename to kernels/op_host/unique_voxel.cpp diff --git a/mx_driving/point/ops/kernels/op_host/unique_voxel_tiling.h b/kernels/op_host/unique_voxel_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/unique_voxel_tiling.h rename to kernels/op_host/unique_voxel_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/vec_pool_grad.cpp b/kernels/op_host/vec_pool_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/vec_pool_grad.cpp rename to kernels/op_host/vec_pool_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_host/vec_pool_grad_tiling.h b/kernels/op_host/vec_pool_grad_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/vec_pool_grad_tiling.h rename to kernels/op_host/vec_pool_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/voxel_pooling_train.cpp b/kernels/op_host/voxel_pooling_train.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/voxel_pooling_train.cpp rename to kernels/op_host/voxel_pooling_train.cpp diff --git a/mx_driving/point/ops/kernels/op_host/voxel_pooling_train_grad.cpp b/kernels/op_host/voxel_pooling_train_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/voxel_pooling_train_grad.cpp rename to kernels/op_host/voxel_pooling_train_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_host/voxel_pooling_train_grad_tiling.h b/kernels/op_host/voxel_pooling_train_grad_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/voxel_pooling_train_grad_tiling.h rename to kernels/op_host/voxel_pooling_train_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/voxel_pooling_train_tiling.h 
b/kernels/op_host/voxel_pooling_train_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/voxel_pooling_train_tiling.h rename to kernels/op_host/voxel_pooling_train_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/add_relu.cpp b/kernels/op_kernel/add_relu.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/add_relu.cpp rename to kernels/op_kernel/add_relu.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/assign_score_withk.cpp b/kernels/op_kernel/assign_score_withk.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/assign_score_withk.cpp rename to kernels/op_kernel/assign_score_withk.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool.cpp b/kernels/op_kernel/bev_pool.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool.cpp rename to kernels/op_kernel/bev_pool.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool.h b/kernels/op_kernel/bev_pool.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool.h rename to kernels/op_kernel/bev_pool.h diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_grad.cpp b/kernels/op_kernel/bev_pool_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_grad.cpp rename to kernels/op_kernel/bev_pool_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v2.cpp b/kernels/op_kernel/bev_pool_v2.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v2.cpp rename to kernels/op_kernel/bev_pool_v2.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v2.h b/kernels/op_kernel/bev_pool_v2.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v2.h rename to kernels/op_kernel/bev_pool_v2.h diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v2_grad.cpp b/kernels/op_kernel/bev_pool_v2_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v2_grad.cpp rename to kernels/op_kernel/bev_pool_v2_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v3.cpp b/kernels/op_kernel/bev_pool_v3.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v3.cpp rename to kernels/op_kernel/bev_pool_v3.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v3_grad.cpp b/kernels/op_kernel/bev_pool_v3_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v3_grad.cpp rename to kernels/op_kernel/bev_pool_v3_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/border_align.cpp b/kernels/op_kernel/border_align.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/border_align.cpp rename to kernels/op_kernel/border_align.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/border_align_grad.cpp b/kernels/op_kernel/border_align_grad.cpp similarity index 96% rename from mx_driving/detection/ops/kernels/op_kernel/border_align_grad.cpp rename to kernels/op_kernel/border_align_grad.cpp index 5c05866288d97d4332c32a52c8225b3ce5f21c76..efb3a71adf8417bf869578767504ca1aa3952652 100644 --- a/mx_driving/detection/ops/kernels/op_kernel/border_align_grad.cpp +++ b/kernels/op_kernel/border_align_grad.cpp @@ -117,6 +117,8 @@ public: xStride = 0; yStride = -stride; break; + default: + break; } x = boxesLocal.GetValue((i / 2 * 2)); @@ -131,22 +133,26 @@ public: continue; } - if (y <= 0.0f) y = 0; - if (x <= 0.0f) x = 0; + if (y 
<= 0.0f) { + y = 0; + } + if (x <= 0.0f) { + x = 0; + } yLow = AscendC::ScalarCast(y); xLow = AscendC::ScalarCast(x); if (yLow >= height - 1) { yHigh = yLow = height - 1; - y = (float)yLow; + y = static_cast(yLow); } else { yHigh = yLow + 1; } if (xLow >= width - 1) { xHigh = xLow = width - 1; - x = (float)xLow; + x = static_cast(xLow); } else { xHigh = xLow + 1; } diff --git a/mx_driving/detection/ops/kernels/op_kernel/box_iou.cpp b/kernels/op_kernel/box_iou.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/box_iou.cpp rename to kernels/op_kernel/box_iou.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/boxes_overlap_bev.cpp b/kernels/op_kernel/boxes_overlap_bev.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/boxes_overlap_bev.cpp rename to kernels/op_kernel/boxes_overlap_bev.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/common.h b/kernels/op_kernel/common.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/common.h rename to kernels/op_kernel/common.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/deformable_aggregation.cpp b/kernels/op_kernel/deformable_aggregation.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/deformable_aggregation.cpp rename to kernels/op_kernel/deformable_aggregation.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/deformable_aggregation_grad.cpp b/kernels/op_kernel/deformable_aggregation_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/deformable_aggregation_grad.cpp rename to kernels/op_kernel/deformable_aggregation_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/deformable_conv2d.cpp b/kernels/op_kernel/deformable_conv2d.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/deformable_conv2d.cpp rename to kernels/op_kernel/deformable_conv2d.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/deformable_conv2d_grad.cpp b/kernels/op_kernel/deformable_conv2d_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/deformable_conv2d_grad.cpp rename to kernels/op_kernel/deformable_conv2d_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter.cpp b/kernels/op_kernel/dynamic_scatter.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter.cpp rename to kernels/op_kernel/dynamic_scatter.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_base.h b/kernels/op_kernel/dynamic_scatter_base.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_base.h rename to kernels/op_kernel/dynamic_scatter_base.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad.cpp b/kernels/op_kernel/dynamic_scatter_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad.cpp rename to kernels/op_kernel/dynamic_scatter_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_base.h b/kernels/op_kernel/dynamic_scatter_grad_base.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_base.h rename to kernels/op_kernel/dynamic_scatter_grad_base.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_max.h b/kernels/op_kernel/dynamic_scatter_grad_max.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_max.h rename to 
kernels/op_kernel/dynamic_scatter_grad_max.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_mean.h b/kernels/op_kernel/dynamic_scatter_grad_mean.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_mean.h rename to kernels/op_kernel/dynamic_scatter_grad_mean.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_sum.h b/kernels/op_kernel/dynamic_scatter_grad_sum.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_sum.h rename to kernels/op_kernel/dynamic_scatter_grad_sum.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_max.h b/kernels/op_kernel/dynamic_scatter_max.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_max.h rename to kernels/op_kernel/dynamic_scatter_max.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_mean.h b/kernels/op_kernel/dynamic_scatter_mean.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_mean.h rename to kernels/op_kernel/dynamic_scatter_mean.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_sum.h b/kernels/op_kernel/dynamic_scatter_sum.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_sum.h rename to kernels/op_kernel/dynamic_scatter_sum.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_voxelization.cpp b/kernels/op_kernel/dynamic_voxelization.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_voxelization.cpp rename to kernels/op_kernel/dynamic_voxelization.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling.cpp b/kernels/op_kernel/furthest_point_sampling.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling.cpp rename to kernels/op_kernel/furthest_point_sampling.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling.h b/kernels/op_kernel/furthest_point_sampling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling.h rename to kernels/op_kernel/furthest_point_sampling.h diff --git a/mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling_with_dist.cpp b/kernels/op_kernel/furthest_point_sampling_with_dist.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling_with_dist.cpp rename to kernels/op_kernel/furthest_point_sampling_with_dist.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/fused_bias_leaky_relu_v2.cpp b/kernels/op_kernel/fused_bias_leaky_relu_v2.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/fused_bias_leaky_relu_v2.cpp rename to kernels/op_kernel/fused_bias_leaky_relu_v2.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/fused_bias_leaky_relu_v2.h b/kernels/op_kernel/fused_bias_leaky_relu_v2.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/fused_bias_leaky_relu_v2.h rename to kernels/op_kernel/fused_bias_leaky_relu_v2.h diff --git a/mx_driving/detection/ops/kernels/op_kernel/gather_nms3d_mask.cpp b/kernels/op_kernel/gather_nms3d_mask.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/gather_nms3d_mask.cpp rename to kernels/op_kernel/gather_nms3d_mask.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/geometric_kernel_attn_grad.cpp b/kernels/op_kernel/geometric_kernel_attn_grad.cpp similarity index 100% 
rename from mx_driving/fused/ops/kernels/op_kernel/geometric_kernel_attn_grad.cpp rename to kernels/op_kernel/geometric_kernel_attn_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/group_points.cpp b/kernels/op_kernel/group_points.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/group_points.cpp rename to kernels/op_kernel/group_points.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/group_points_grad.cpp b/kernels/op_kernel/group_points_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/group_points_grad.cpp rename to kernels/op_kernel/group_points_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/hard_voxelize.cpp b/kernels/op_kernel/hard_voxelize.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/hard_voxelize.cpp rename to kernels/op_kernel/hard_voxelize.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/hypot.cpp b/kernels/op_kernel/hypot.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/hypot.cpp rename to kernels/op_kernel/hypot.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/hypot_grad.cpp b/kernels/op_kernel/hypot_grad.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/hypot_grad.cpp rename to kernels/op_kernel/hypot_grad.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/knn.cpp b/kernels/op_kernel/knn.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/knn.cpp rename to kernels/op_kernel/knn.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/knn.h b/kernels/op_kernel/knn.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/knn.h rename to kernels/op_kernel/knn.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/max_pool2d.cpp b/kernels/op_kernel/max_pool2d.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/max_pool2d.cpp rename to kernels/op_kernel/max_pool2d.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_generic.h b/kernels/op_kernel/ms_deform_attn_generic.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_generic.h rename to kernels/op_kernel/ms_deform_attn_generic.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_generic.h b/kernels/op_kernel/ms_deform_attn_grad_generic.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_generic.h rename to kernels/op_kernel/ms_deform_attn_grad_generic.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_generic_v2.h b/kernels/op_kernel/ms_deform_attn_grad_generic_v2.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_generic_v2.h rename to kernels/op_kernel/ms_deform_attn_grad_generic_v2.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf.h b/kernels/op_kernel/ms_deform_attn_grad_high_perf.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf.h rename to kernels/op_kernel/ms_deform_attn_grad_high_perf.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf_v2.h b/kernels/op_kernel/ms_deform_attn_grad_high_perf_v2.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf_v2.h rename to kernels/op_kernel/ms_deform_attn_grad_high_perf_v2.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_high_perf.h 
b/kernels/op_kernel/ms_deform_attn_high_perf.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_high_perf.h rename to kernels/op_kernel/ms_deform_attn_high_perf.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn.cpp b/kernels/op_kernel/multi_scale_deformable_attn.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn.cpp rename to kernels/op_kernel/multi_scale_deformable_attn.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn_grad.cpp b/kernels/op_kernel/multi_scale_deformable_attn_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn_grad.cpp rename to kernels/op_kernel/multi_scale_deformable_attn_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn_grad_v2.cpp b/kernels/op_kernel/multi_scale_deformable_attn_grad_v2.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn_grad_v2.cpp rename to kernels/op_kernel/multi_scale_deformable_attn_grad_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/nms3d.cpp b/kernels/op_kernel/nms3d.cpp similarity index 72% rename from mx_driving/detection/ops/kernels/op_kernel/nms3d.cpp rename to kernels/op_kernel/nms3d.cpp index 224a56e1c48e1a576865266ca819c9a16e827ffb..ac47bd22701f8cdf5c98802e57dfa27684357409 100644 --- a/mx_driving/detection/ops/kernels/op_kernel/nms3d.cpp +++ b/kernels/op_kernel/nms3d.cpp @@ -9,8 +9,7 @@ #define M_PI 3.14159265358979323846 /* pi */ using namespace AscendC; -constexpr int32_t -BUFFER_NUM = 2; +constexpr int32_t BUFFER_NUM = 2; constexpr float EPS = 1e-8; constexpr float ATAN2_DEFAULT_VALUE = 1000.0; @@ -32,16 +31,12 @@ struct Point { y = _y; } - __aicore__ Point - - operator+(const Point &b) const + __aicore__ Point operator+(const Point& b) const { return Point(x + b.x, y + b.y); } - __aicore__ Point - - operator-(const Point &b) const + __aicore__ Point operator-(const Point& b) const { return Point(x - b.x, y - b.y); } @@ -52,8 +47,7 @@ class KernelNms3d { public: __aicore__ inline KernelNms3d() {} - __aicore__ inline void Init(GM_ADDR boxes, GM_ADDR mask, - const Nms3dTilingData *__restrict tiling_data) + __aicore__ inline void Init(GM_ADDR boxes, GM_ADDR mask, const Nms3dTilingData* __restrict tiling_data) { ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); usedCoreNum = tiling_data->usedCoreNum; @@ -68,9 +62,8 @@ public: uint32_t core_id = GetBlockIdx(); isLastCore = (core_id == (tiling_data->usedCoreNum - 1)); - boxGm.SetGlobalBuffer(reinterpret_cast<__gm__ T * > (boxes), boxNum * 7); - maskGm.SetGlobalBuffer(reinterpret_cast<__gm__ int16_t * > (mask), - maskNum * boxNum); + boxGm.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(boxes), boxNum * 7); + maskGm.SetGlobalBuffer(reinterpret_cast<__gm__ int16_t*>(mask), maskNum * boxNum); pipe.InitBuffer(inQueueCur, BUFFER_NUM, dataAlign * sizeof(T)); pipe.InitBuffer(inQueueBox, BUFFER_NUM, dataAlign * 7 * sizeof(T)); @@ -90,7 +83,7 @@ public: pipe.InitBuffer(min2Buf, dataAlign * sizeof(T)); pipe.InitBuffer(max1Buf, dataAlign * sizeof(T)); pipe.InitBuffer(max2Buf, dataAlign * sizeof(T)); - if constexpr(sizeof(T) == sizeof(half)) { + if constexpr (sizeof(T) == sizeof(half)) { pipe.InitBuffer(calcBuf, dataAlign * 2 * 7 * sizeof(float)); curTemp = calcBuf.Get(dataAlign * 2 * 7); boxTemp = curTemp[8]; @@ -100,7 +93,7 @@ public: __aicore__ inline void Process() { uint32_t 
core_id = GetBlockIdx(); - LocalTensor oneLocal = oneMask.AllocTensor(); + LocalTensor oneLocal = oneMask.AllocTensor(); Duplicate(oneLocal, static_cast(1), dataAlign); for (size_t i = 0; i < boxNum; ++i) { for (size_t j = 0; j < loopTime; ++j) { @@ -119,24 +112,22 @@ public: } private: - __aicore__ inline void CopyIn(int32_t cur_box, int32_t com_box, - bool is_last) + __aicore__ inline void CopyIn(int32_t cur_box, int32_t com_box, bool is_last) { - LocalTensor curLocal = inQueueCur.AllocTensor(); - LocalTensor boxLocal = inQueueBox.AllocTensor(); + LocalTensor curLocal = inQueueCur.AllocTensor(); + LocalTensor boxLocal = inQueueBox.AllocTensor(); DataCopy(curLocal, boxGm[cur_box * 7], dataAlign); DataCopy(boxLocal, boxGm[com_box * 7], dataAlign * 7); inQueueCur.EnQue(curLocal); inQueueBox.EnQue(boxLocal); } - __aicore__ inline void Compute(int32_t cur_box, int32_t com_box, - bool is_last) + __aicore__ inline void Compute(int32_t cur_box, int32_t com_box, bool is_last) { uint32_t cmpNum = is_last ? tailNum : dataAlign; - if constexpr(sizeof(T) == sizeof(half)) { - LocalTensor curLocal = inQueueCur.DeQue(); - LocalTensor boxLocal = inQueueBox.DeQue(); + if constexpr (sizeof(T) == sizeof(half)) { + LocalTensor curLocal = inQueueCur.DeQue(); + LocalTensor boxLocal = inQueueBox.DeQue(); Cast(curTemp, curLocal, RoundMode::CAST_NONE, dataAlign); Cast(boxTemp, boxLocal, RoundMode::CAST_NONE, 7 * dataAlign); inQueueCur.FreeTensor(curLocal); @@ -147,7 +138,7 @@ private: } PipeBarrier(); - LocalTensor outLocal = outQueueMask.AllocTensor(); + LocalTensor outLocal = outQueueMask.AllocTensor(); for (size_t i = 0; i < cmpNum; i++) { if (cur_box >= com_box + i) { outLocal.SetValue(i, 1); @@ -166,7 +157,7 @@ private: } PipeBarrier(); outQueueMask.EnQue(outLocal); - if constexpr(sizeof(T) != sizeof(half)) { + if constexpr (sizeof(T) != sizeof(half)) { inQueueCur.FreeTensor(curTemp); inQueueBox.FreeTensor(boxTemp); } @@ -174,35 +165,30 @@ private: __aicore__ inline void CopyOut(int32_t cur_box, int32_t com_box) { - LocalTensor outLocal = outQueueMask.DeQue(); + LocalTensor outLocal = outQueueMask.DeQue(); DataCopy(maskGm[cur_box * maskNum + com_box], outLocal, dataAlign); outQueueMask.FreeTensor(outLocal); } private: - __aicore__ inline float cross(const Point &a, const Point &b) + __aicore__ inline float cross(const Point& a, const Point& b) { return a.x * b.y - a.y * b.x; } - __aicore__ inline float cross(const Point &p1, const Point &p2, - const Point &p0) + __aicore__ inline float cross(const Point& p1, const Point& p2, const Point& p0) { return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); } - __aicore__ int check_rect_cross(const Point &p1, const Point &p2, - const Point &q1, const Point &q2) + __aicore__ int check_rect_cross(const Point& p1, const Point& p2, const Point& q1, const Point& q2) { - int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && - min(q1.x, q2.x) <= max(p1.x, p2.x) && - min(p1.y, p2.y) <= max(q1.y, q2.y) && - min(q1.y, q2.y) <= max(p1.y, p2.y); + int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && min(q1.x, q2.x) <= max(p1.x, p2.x) && + min(p1.y, p2.y) <= max(q1.y, q2.y) && min(q1.y, q2.y) <= max(p1.y, p2.y); return ret; } - __aicore__ inline int check_in_box2d(const LocalTensor &box, - const Point &p) + __aicore__ inline int check_in_box2d(const LocalTensor& box, const Point& p) { const float MARGIN = 1e-2; float center_x = box.GetValue(0); @@ -215,17 +201,14 @@ private: Cos(cosLocal, angleLocal); float angle_cos = cosLocal.GetValue(0); float angle_sin = 
sinLocal.GetValue(0); - float rot_x = - (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; - return (abs(rot_x) < box.GetValue(3) / 2 + MARGIN && - abs(rot_y) < box.GetValue(4) / 2 + MARGIN); + return (abs(rot_x) < box.GetValue(3) / 2 + MARGIN && abs(rot_y) < box.GetValue(4) / 2 + MARGIN); } - __aicore__ inline int intersection(const Point &p1, const Point &p0, - const Point &q1, const Point &q0, - Point &ans_point) + __aicore__ inline int intersection( + const Point& p1, const Point& p0, const Point& q1, const Point& q0, Point& ans_point) { if (check_rect_cross(p0, p1, q0, q1) == 0) { return 0; @@ -234,8 +217,7 @@ private: float s2 = cross(p1, q1, p0); float s3 = cross(p0, q1, q0); float s4 = cross(q1, p1, q0); - if (!(s1 * s2 > static_cast(0.0) && - s3 * s4 > static_cast(0.0))) { + if (!(s1 * s2 > static_cast(0.0) && s3 * s4 > static_cast(0.0))) { return 0; } float s5 = cross(q1, p1, p0); @@ -259,22 +241,17 @@ private: return 1; } - __aicore__ inline void rotate_around_center(const Point ¢er, - const float angle_cos, - const float angle_sin, Point &p) + __aicore__ inline void rotate_around_center( + const Point& center, const float angle_cos, const float angle_sin, Point& p) { - float new_x = - (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; - float new_y = - (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + float new_x = (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; + float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; p.set(new_x, new_y); } - __aicore__ inline int point_cmp(const Point &a, const Point &b, - const Point ¢er) + __aicore__ inline int point_cmp(const Point& a, const Point& b, const Point& center) { - return math_atan2(a.y - center.y, a.x - center.x) > - math_atan2(b.y - center.y, b.x - center.x); + return math_atan2(a.y - center.y, a.x - center.x) > math_atan2(b.y - center.y, b.x - center.x); } __aicore__ inline float math_atan2(float a, float b) @@ -305,8 +282,7 @@ private: return atanLocal.GetValue(0); } - __aicore__ inline float box_overlap(const LocalTensor &boxATensor, - const LocalTensor &boxBTensor) + __aicore__ inline float box_overlap(const LocalTensor& boxATensor, const LocalTensor& boxBTensor) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] @@ -329,16 +305,8 @@ private: Point center_a(boxATensor.GetValue(0), boxATensor.GetValue(1)); Point center_b(boxBTensor.GetValue(0), boxBTensor.GetValue(1)); - Point box_a_corners[5] = {{a_x1, a_y1}, - {a_x2, a_y1}, - {a_x2, a_y2}, - {a_x1, a_y2}, - {a_x1, a_y1}}; - Point box_b_corners[5] = {{b_x1, b_y1}, - {b_x2, b_y1}, - {b_x2, b_y2}, - {b_x1, b_y2}, - {b_x1, b_y1}}; + Point box_a_corners[5] = {{a_x1, a_y1}, {a_x2, a_y1}, {a_x2, a_y2}, {a_x1, a_y2}, {a_x1, a_y1}}; + Point box_b_corners[5] = {{b_x1, b_y1}, {b_x2, b_y1}, {b_x2, b_y2}, {b_x1, b_y2}, {b_x1, b_y1}}; // get oriented corners LocalTensor angleLocal = angleBuf.Get(); @@ -354,10 +322,8 @@ private: float b_angle_sin = sinLocal.GetValue(1); for (int k = 0; k < 4; k++) { - rotate_around_center(center_a, a_angle_cos, a_angle_sin, - box_a_corners[k]); - rotate_around_center(center_b, b_angle_cos, b_angle_sin, - box_b_corners[k]); + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, 
b_angle_sin, box_b_corners[k]); } box_a_corners[4] = box_a_corners[0]; @@ -372,9 +338,8 @@ private: poly_center.set(0, 0); for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { - flag = intersection(box_a_corners[i + 1], box_a_corners[i], - box_b_corners[j + 1], box_b_corners[j], - cross_points[count]); + flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], + cross_points[count]); if (flag) { poly_center = poly_center + cross_points[count]; count++; @@ -412,15 +377,13 @@ private: float cross_area = 0; for (int k = 0; k < count - 1; k++) { - cross_area += cross(cross_points[k] - cross_points[0], - cross_points[k + 1] - cross_points[0]); + cross_area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); } return abs(cross_area) / static_cast(2.0); } - __aicore__ inline float iou_bev(const LocalTensor &boxATensor, - const LocalTensor &boxBTensor) + __aicore__ inline float iou_bev(const LocalTensor& boxATensor, const LocalTensor& boxBTensor) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] @@ -432,17 +395,17 @@ private: private: TPipe pipe; - TQue inQueueCur, inQueueBox; - TQue outQueueMask, oneMask; - TBuf calcBuf; - TBuf comBuf; + TQue inQueueCur, inQueueBox; + TQue outQueueMask, oneMask; + TBuf calcBuf; + TBuf comBuf; - TBuf p1Buf, p2Buf, q1Buf, q2Buf; - TBuf angleBuf, sinBuf, cosBuf, pointBuf; - TBuf min1Buf, min2Buf, max1Buf, max2Buf; + TBuf p1Buf, p2Buf, q1Buf, q2Buf; + TBuf angleBuf, sinBuf, cosBuf, pointBuf; + TBuf min1Buf, min2Buf, max1Buf, max2Buf; - GlobalTensor boxGm; - GlobalTensor maskGm; + GlobalTensor boxGm; + GlobalTensor maskGm; LocalTensor curTemp, boxTemp; uint32_t usedCoreNum; uint32_t loopTime; @@ -456,13 +419,10 @@ private: bool isLastCore; }; -extern "C" __global__ __aicore__ - -void nms3d(GM_ADDR boxes, GM_ADDR mask, - GM_ADDR workspace, GM_ADDR tiling) +extern "C" __global__ __aicore__ void nms3d(GM_ADDR boxes, GM_ADDR mask, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); - const Nms3dTilingData *__restrict tilingDevice = &tilingData; + const Nms3dTilingData* __restrict tilingDevice = &tilingData; if (TILING_KEY_IS(1)) { KernelNms3d op; op.Init(boxes, mask, tilingDevice); diff --git a/mx_driving/detection/ops/kernels/op_kernel/nms3d_normal.cpp b/kernels/op_kernel/nms3d_normal.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/nms3d_normal.cpp rename to kernels/op_kernel/nms3d_normal.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/pixel_group.cpp b/kernels/op_kernel/pixel_group.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/pixel_group.cpp rename to kernels/op_kernel/pixel_group.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/point_to_voxel.cpp b/kernels/op_kernel/point_to_voxel.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/point_to_voxel.cpp rename to kernels/op_kernel/point_to_voxel.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_kernel/points_in_box.cpp b/kernels/op_kernel/points_in_box.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_kernel/points_in_box.cpp rename to kernels/op_kernel/points_in_box.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_kernel/points_in_box_all.cpp b/kernels/op_kernel/points_in_box_all.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_kernel/points_in_box_all.cpp rename to 
kernels/op_kernel/points_in_box_all.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roi_align_rotated_grad_v2.cpp b/kernels/op_kernel/roi_align_rotated_grad_v2.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roi_align_rotated_grad_v2.cpp rename to kernels/op_kernel/roi_align_rotated_grad_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roi_align_rotated_v2.cpp b/kernels/op_kernel/roi_align_rotated_v2.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roi_align_rotated_v2.cpp rename to kernels/op_kernel/roi_align_rotated_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roiaware_avgpool3d_grad.cpp b/kernels/op_kernel/roiaware_avgpool3d_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roiaware_avgpool3d_grad.cpp rename to kernels/op_kernel/roiaware_avgpool3d_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roiaware_maxpool3d_grad.cpp b/kernels/op_kernel/roiaware_maxpool3d_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roiaware_maxpool3d_grad.cpp rename to kernels/op_kernel/roiaware_maxpool3d_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roiaware_pool3d.cpp b/kernels/op_kernel/roiaware_pool3d.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roiaware_pool3d.cpp rename to kernels/op_kernel/roiaware_pool3d.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_kernel/roipoint_pool3d_forward.cpp b/kernels/op_kernel/roipoint_pool3d_forward.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_kernel/roipoint_pool3d_forward.cpp rename to kernels/op_kernel/roipoint_pool3d_forward.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_max_with_argmax_v2.cpp b/kernels/op_kernel/scatter_max_with_argmax_v2.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_max_with_argmax_v2.cpp rename to kernels/op_kernel/scatter_max_with_argmax_v2.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean.cpp b/kernels/op_kernel/scatter_mean.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean.cpp rename to kernels/op_kernel/scatter_mean.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_div.cpp b/kernels/op_kernel/scatter_mean_div.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_div.cpp rename to kernels/op_kernel/scatter_mean_div.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad.cpp b/kernels/op_kernel/scatter_mean_grad.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad.cpp rename to kernels/op_kernel/scatter_mean_grad.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad.h b/kernels/op_kernel/scatter_mean_grad.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad.h rename to kernels/op_kernel/scatter_mean_grad.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_base.h b/kernels/op_kernel/scatter_mean_grad_base.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_base.h rename to kernels/op_kernel/scatter_mean_grad_base.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_large.h b/kernels/op_kernel/scatter_mean_grad_large.h similarity index 100% rename from 
mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_large.h rename to kernels/op_kernel/scatter_mean_grad_large.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_line.h b/kernels/op_kernel/scatter_mean_grad_line.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_line.h rename to kernels/op_kernel/scatter_mean_grad_line.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_normal.h b/kernels/op_kernel/scatter_mean_normal.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_normal.h rename to kernels/op_kernel/scatter_mean_normal.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_notail.h b/kernels/op_kernel/scatter_mean_notail.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_notail.h rename to kernels/op_kernel/scatter_mean_notail.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_notail_bighead.h b/kernels/op_kernel/scatter_mean_notail_bighead.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_notail_bighead.h rename to kernels/op_kernel/scatter_mean_notail_bighead.h diff --git a/mx_driving/spconv/ops/kernels/op_kernel/sparse_conv3d.cpp b/kernels/op_kernel/sparse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/sparse_conv3d.cpp rename to kernels/op_kernel/sparse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/sparse_conv3d_grad_v2.cpp b/kernels/op_kernel/sparse_conv3d_grad_v2.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/sparse_conv3d_grad_v2.cpp rename to kernels/op_kernel/sparse_conv3d_grad_v2.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/sparse_inverse_conv3d.cpp b/kernels/op_kernel/sparse_inverse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/sparse_inverse_conv3d.cpp rename to kernels/op_kernel/sparse_inverse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/subm_sparse_conv3d.cpp b/kernels/op_kernel/subm_sparse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/subm_sparse_conv3d.cpp rename to kernels/op_kernel/subm_sparse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/to_sparse.cpp b/kernels/op_kernel/to_sparse.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/to_sparse.cpp rename to kernels/op_kernel/to_sparse.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/to_sparse_v3.cpp b/kernels/op_kernel/to_sparse_v3.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/to_sparse_v3.cpp rename to kernels/op_kernel/to_sparse_v3.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/unique_voxel.cpp b/kernels/op_kernel/unique_voxel.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/unique_voxel.cpp rename to kernels/op_kernel/unique_voxel.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/vec_pool_grad.cpp b/kernels/op_kernel/vec_pool_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/vec_pool_grad.cpp rename to kernels/op_kernel/vec_pool_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/voxel_pooling_train.cpp b/kernels/op_kernel/voxel_pooling_train.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/voxel_pooling_train.cpp rename to kernels/op_kernel/voxel_pooling_train.cpp diff --git 
a/mx_driving/point/ops/kernels/op_kernel/voxel_pooling_train_grad.cpp b/kernels/op_kernel/voxel_pooling_train_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/voxel_pooling_train_grad.cpp rename to kernels/op_kernel/voxel_pooling_train_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/voxel_to_point.cpp b/kernels/op_kernel/voxel_to_point.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/voxel_to_point.cpp rename to kernels/op_kernel/voxel_to_point.cpp diff --git a/mx_driving/_C/__init__.pyi b/mx_driving/_C/__init__.pyi index 54fb2639b54657363024bb25f5bbf78cabdd033d..9a811c3a1ea61a2c21d3c43b5f5a8534ab8a61d1 100644 --- a/mx_driving/_C/__init__.pyi +++ b/mx_driving/_C/__init__.pyi @@ -76,7 +76,7 @@ def deformable_aggregation( sampling_location: torch.Tensor, weights: torch.Tensor, ) -> torch.Tensor: ... -def deformable_aggregation_grad( +def deformable_aggregation_backward( mc_ms_feat: torch.Tensor, spatial_shape: torch.Tensor, scale_start_index: torch.Tensor, @@ -234,7 +234,7 @@ def npu_box_iou_quadri(boxes_a: torch.Tensor, boxes_b: torch.Tensor, mode_flag: def npu_box_iou_rotated( boxes_a: torch.Tensor, boxes_b: torch.Tensor, mode_flag: int, aligned: bool ) -> torch.Tensor: ... -def border_align_forward_npu( +def border_align( input: torch.Tensor, rois: torch.Tensor, output: torch.Tensor, pooled_size: int ) -> None: ... def border_align_backward( @@ -417,7 +417,7 @@ __all__ = [ "npu_add_relu_grad", "fused_bias_leaky_relu", "deformable_aggregation", - "deformable_aggregation_grad", + "deformable_aggregation_backward", "deformable_conv2d", "modulated_deformable_conv2d", "deformable_conv2d_backward", diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py index 3ab0077d7278f0bcd0909676de66fa07328ac347..566532626d85b1a624a831aeb16fe9a7d5782596 100644 --- a/mx_driving/__init__.py +++ b/mx_driving/__init__.py @@ -1,9 +1,102 @@ import os -import torch -import torch_npu import mx_driving._C +from .modules.roi_point_pool_3d import RoIPointPool3d +from .modules.sparse_conv import SparseConv3d, SparseInverseConv3d, SubMConv3d +from .modules.sparse_modules import SparseConvTensor, SparseModule, SparseSequential +from .modules.voxelization import Voxelization +from .ops.assign_score_withk import assign_score_withk +from .ops.bev_pool import bev_pool +from .ops.bev_pool_v2 import bev_pool_v2 +from .ops.bev_pool_v3 import bev_pool_v3 +from .ops.border_align import border_align +from .ops.box_iou import box_iou_quadri +from .ops.boxes_overlap_bev import boxes_overlap_bev, npu_boxes_overlap_bev +from .ops.deform_conv2d import DeformConv2dFunction, deform_conv2d +from .ops.furthest_point_sampling import npu_furthest_point_sampling +from .ops.furthest_point_sampling_with_dist import furthest_point_sample_with_dist +from .ops.fused_bias_leaky_relu import npu_fused_bias_leaky_relu +from .ops.group_points import group_points, npu_group_points +from .ops.hypot import hypot +from .ops.knn import knn +from .ops.modulated_deform_conv2d import ModulatedDeformConv2dFunction, modulated_deform_conv2d +from .ops.multi_scale_deformable_attn import ( + MultiScaleDeformableAttnFunction, + multi_scale_deformable_attn, + npu_multi_scale_deformable_attn_function, +) +from .ops.nms3d_normal import npu_nms3d_normal +from .ops.npu_add_relu import npu_add_relu +from .ops.npu_deformable_aggregation import npu_deformable_aggregation +from .ops.npu_dynamic_scatter import npu_dynamic_scatter +from .ops.npu_max_pool2d import npu_max_pool2d +from .ops.npu_nms3d 
import npu_nms3d +from .ops.npu_points_in_box import npu_points_in_box +from .ops.npu_points_in_box_all import npu_points_in_box_all, points_in_boxes_all +from .ops.pixel_group import pixel_group +from .ops.roi_align_rotated import roi_align_rotated +from .ops.roiaware_pool3d import roiaware_pool3d +from .ops.rotated_iou import npu_rotated_iou +from .ops.rotated_overlaps import npu_rotated_overlaps +from .ops.scatter_max import scatter_max +from .ops.scatter_mean import scatter_mean +from .ops.three_interpolate import three_interpolate +from .ops.three_nn import three_nn +from .ops.voxel_pooling_train import npu_voxel_pooling_train +from .ops.voxelization import voxelization +from .ops.npu_geometric_kernel_attention import npu_geometric_kernel_attention + +__all__ = [ + "RoIPointPool3d", + "SparseConv3d", + "SparseInverseConv3d", + "SubMConv3d", + "SparseConvTensor", + "SparseModule", + "SparseSequential", + "Voxelization", + "assign_score_withk", + "bev_pool", + "bev_pool_v2", + "bev_pool_v3", + "border_align", + "box_iou_quadri", + "boxes_overlap_bev", + "npu_boxes_overlap_bev", + "deform_conv2d", + "furthest_point_sample_with_dist", + "furthest_point_sample_with_dist", + "npu_fused_bias_leaky_relu", + "group_points", + "npu_group_points", + "hypot", + "knn", + "modulated_deform_conv2d", + "multi_scale_deformable_attn", + "npu_multi_scale_deformable_attn_function", + "npu_nms3d_normal", + "npu_add_relu", + "npu_deformable_aggregation", + "npu_dynamic_scatter", + "npu_max_pool2d", + "npu_nms3d", + "npu_points_in_box", + "npu_points_in_box_all", + "points_in_boxes_all", + "pixel_group", + "roi_align_rotated", + "roiaware_pool3d", + "npu_rotated_iou", + "npu_rotated_overlaps", + "scatter_max", + "scatter_mean", + "three_interpolate", + "three_nn", + "npu_voxel_pooling_train", + "voxelization", +] + def _set_env(): mx_driving_root = os.path.dirname(os.path.abspath(__file__)) diff --git a/mx_driving/common/__init__.py b/mx_driving/common.py similarity index 43% rename from mx_driving/common/__init__.py rename to mx_driving/common.py index beecdaa5c544d780249725270ac4b52055e65504..1eb379bfeff78122b146576145954ebbea755461 100644 --- a/mx_driving/common/__init__.py +++ b/mx_driving/common.py @@ -1,7 +1,12 @@ +import warnings + +warnings.warn( + "This package is deprecated and will be removed in future. 
Please use `mx_driving.api` instead.", DeprecationWarning +) from .ops.three_interpolate import three_interpolate from .ops.scatter_max import scatter_max from .ops.knn import knn -from .ops.threeNN import three_nn +from .ops.three_nn import three_nn from .ops.scatter_mean import scatter_mean from .ops.hypot import hypot -from .ops.assign_score_withk import assign_score_withk +from .ops.assign_score_withk import assign_score_withk \ No newline at end of file diff --git a/mx_driving/common/CMakeLists.txt b/mx_driving/common/CMakeLists.txt deleted file mode 100644 index 807aa0c667560bcf0d75c6c6a26369daa624e9de..0000000000000000000000000000000000000000 --- a/mx_driving/common/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/common/components/README.md b/mx_driving/common/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/common/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. \ No newline at end of file diff --git a/mx_driving/common/ops/csrc/CMakeLists.txt b/mx_driving/common/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/common/ops/csrc/Hypot.cpp b/mx_driving/common/ops/csrc/Hypot.cpp deleted file mode 100644 index b9a008e8ef16b100c276e7f32030ebf7d6c3a73a..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/Hypot.cpp +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2024 Huawei Technologies Co., Ltd -// All rights reserved. - -#include "csrc/OpApiCommon.h" -#include "functions.h" - -at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y) -{ - auto out = at::empty_like(x, x.options()); - EXEC_NPU_CMD(aclnnHypot, x, y, out); - return out; -} - -std::tuple npu_hypot_grad(const at::Tensor& x, const at::Tensor& y, const at::Tensor& out, const at::Tensor& out_grad) -{ - auto x_grad = at::empty_like(x, x.options()); - auto y_grad = at::empty_like(y, y.options()); - EXEC_NPU_CMD(aclnnHypotGrad, x, y, out, out_grad, x_grad, y_grad); - return std::make_tuple(x_grad, y_grad); -} diff --git a/mx_driving/common/ops/csrc/README.md b/mx_driving/common/ops/csrc/README.md deleted file mode 100644 index 8073915fabe1c484db0488c9abc5e09b858c52c8..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/README.md +++ /dev/null @@ -1,6 +0,0 @@ -## Description -The `csrc` lib implements python interface, which use `pybind11` to wrap the C++ code. -There are 3 files you need to focus: -1. `pybind.cpp`: Define the python interface. -2. `functions.cpp`: Define the C++ interface. -3. The file naming in `Pascal` style: The implementation of the C++ interface. 
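As an aside, a minimal sketch of the three-file pattern this deleted README describes, assembled from the Hypot.cpp and pybind.cpp fragments appearing elsewhere in this diff; it is illustrative only, not a complete translation unit, and it relies on the EXEC_NPU_CMD macro provided by csrc/OpApiCommon.h.

```cpp
// Sketch of the old csrc layout described above (assembled from fragments in this diff).

// Hypot.cpp -- Pascal-named file implementing the C++ interface declared in functions.h
#include "csrc/OpApiCommon.h"
#include "functions.h"

at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y)
{
    auto out = at::empty_like(x, x.options());
    EXEC_NPU_CMD(aclnnHypot, x, y, out);  // dispatch to the aclnnHypot custom op
    return out;
}

// pybind.cpp -- exposes the C++ interface to Python via pybind11
void init_common(pybind11::module& m)
{
    m.def("npu_hypot", &npu_hypot);
}
```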
\ No newline at end of file diff --git a/mx_driving/common/ops/csrc/functions.h b/mx_driving/common/ops/csrc/functions.h deleted file mode 100644 index f226d907436037ca0e5211f9e1a771670ae61e2b..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/functions.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef COMMON_OPS_CSRC_FUNCTIONS_H_ -#define COMMON_OPS_CSRC_FUNCTIONS_H_ - -#include - -std::tuple knn(const at::Tensor& xyz, const at::Tensor& center_xyz, int32_t k, bool is_from_knn); - -at::Tensor npu_three_interpolate( - int b, int c, int m, int n, const at::Tensor& points, const at::Tensor& idx, const at::Tensor& weight); - -at::Tensor npu_three_interpolate_backward( - int b, int c, int n, int m, const at::Tensor& grad_out, const at::Tensor& idx, const at::Tensor& weight); - -std::tuple scatter_max_with_argmax_v2( - const at::Tensor& updates, const at::Tensor& indices, c10::optional out); - -at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments); - -at::Tensor npu_scatter(const at::Tensor& self, const at::Tensor& indices, const at::Tensor& updates, int64_t dim); - -at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Tensor& count, int32_t dim); - -std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& index, - c10::optional out, c10::optional dim, - c10::optional dim_size); -std::tuple npu_sort_pairs(const at::Tensor &keys_in, const at::Tensor &values_in, int64_t dim, bool descending); - -at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y); - -std::tuple npu_hypot_grad(const at::Tensor& x, const at::Tensor& y, const at::Tensor& out, const at::Tensor& out_grad); - -void assign_score_withk(const at::Tensor& points, const at::Tensor& centers, const at::Tensor& scores, const at::Tensor& knn_idx, - at::Tensor& output, int32_t B, int32_t N, int32_t npoint, int32_t M, int32_t K, int32_t out_dim, int32_t aggregate); - -#endif // COMMON_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/common/ops/csrc/pybind.cpp b/mx_driving/common/ops/csrc/pybind.cpp deleted file mode 100644 index 5e483ee6c7c0197d885892c2e0a21179e3fa03a2..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/pybind.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2024 Huawei Technologies Co., Ltd -// Copyright (c) 2019, Facebook CORPORATION. -// All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "csrc/pybind.h" - -#include - -#include "functions.h" - -void init_common(pybind11::module& m) -{ - // knn - m.def("knn", &knn); - - // npu_scatter_mean_grad - m.def("npu_scatter_mean_grad", &npu_scatter_mean_grad); - - // three_interpolate - m.def("npu_three_interpolate", &npu_three_interpolate); - m.def("npu_three_interpolate_backward", &npu_three_interpolate_backward); - - // scatter_mean - m.def("npu_scatter_mean", &npu_scatter_mean, "npu_scatter_mean NPU version"); - - // scatter_max - m.def("scatter_max_with_argmax_v2", &scatter_max_with_argmax_v2); - m.def("npu_scatter_max_backward", &npu_scatter_max_backward); - - // npu_sort_pairs - m.def("npu_sort_pairs", &npu_sort_pairs, "sort_pairs NPU version"); - - // npu_hypot - m.def("npu_hypot", &npu_hypot); - m.def("npu_hypot_grad", &npu_hypot_grad); - - // assign_score_withk - m.def("assign_score_withk", &assign_score_withk); -} diff --git a/mx_driving/common/ops/kernels/CMakeLists.txt b/mx_driving/common/ops/kernels/CMakeLists.txt deleted file mode 100644 index 3b1f8543b1fd15189db362166f9afad80f056ffd..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() - diff --git a/mx_driving/common/ops/kernels/README.md b/mx_driving/common/ops/kernels/README.md deleted file mode 100644 index 1e6645553e8d86a84a9833a13610741b59930494..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/README.md +++ /dev/null @@ -1,13 +0,0 @@ -## 算子原型 - - - - - - - - - - - -
Operator type (OpType): Add
Operator inputs: name / shape / data type / format
  x / - / float / ND
  y / - / float / ND
Operator output: z / - / float / ND
Kernel function name: add_custom
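For context, a signature-level sketch of the kernel entry point implied by the add_custom row above, written in the extern "C" __global__ __aicore__ style that nms3d.cpp uses later in this diff; the parameter names and the elided body are illustrative assumptions, not taken from the source.

```cpp
// Hypothetical signature only: follows the AscendC kernel-entry convention seen in
// nms3d.cpp within this diff. x and y are the float/ND inputs and z the output from
// the table above; the body is intentionally elided.
extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z,
                                                 GM_ADDR workspace, GM_ADDR tiling)
{
    // GET_TILING_DATA(tilingData, tiling);
    // ... copy x and y tiles in from global memory, add them, copy the result to z ...
}
```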
\ No newline at end of file diff --git a/mx_driving/common/ops/kernels/inc/base.h b/mx_driving/common/ops/kernels/inc/base.h deleted file mode 100644 index a0b8bfae346605551e85d2fb8db80595e644de08..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/inc/base.h +++ /dev/null @@ -1,34 +0,0 @@ -// REG_OP(Add) -// .INPUT(x1, TensorType({DT_FLOAT})) -// .INPUT(x2, TensorType({DT_FLOAT})) -// .OUTPUT(y, TensorType({DT_FLOAT})) -// .OP_END_FACTORY_REG(Add) - -// REG_OP(FurthestPointSamplingWithDist) -// .INPUT(points_dist, TensorType({DT_FLOAT})) -// .INPUT(nearest_temp, TensorType({DT_FLOAT})) -// .OUTPUT(index, TensorType({DT_INT32})) -// .REQUIRED_ATTR(num_points, Int) -// .OP_END_FACTORY_REG(FurthestPointSamplingWithDist) - -// REG_OP(Nms3dNormal) -// .INPUT(boxes, TensorType({DT_FLOAT, DT_FLOAT16})) -// .OUTPUT(keep, TensorType({DT_INT16})) -// .REQUIRED_ATTR(nms_overlap_thresh, Float) -// .OP_END_FACTORY_REG(Nms3dNormal) - -// REG_OP(FurthestPointSampling) -// .INPUT(point_xyz, TensorType({DT_FLOAT})) -// .INPUT(nearest_temp, TensorType({DT_FLOAT})) -// .OUTPUT(index, TensorType({DT_INT32})) -// .REQUIRED_ATTR(num_points, Int) -// .OP_END_FACTORY_REG(FurthestPointSampling) - -// REG_OP(DynamicScatterGrad) -// .INPUT(grad_voxel_feats, TensorType({DT_FLOAT})) -// .INPUT(prefix_sum_point_per_voxel, TensorType({DT_INT32})) -// .INPUT(argsort_coor, TensorType({DT_INT32})) -// .INPUT(compare_mask, TensorType({DT_UINT16})) -// .OUTPUT(grad_point_feats, TensorType({DT_FLOAT})) -// .ATTR(reduce_type, String, "max") -// .OP_END_FACTORY_REG(DynamicScatterGrad) diff --git a/mx_driving/common/ops/kernels/op_host/CMakeLists.txt b/mx_driving/common/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index c44b2b0174f28f0144a7c03fc6c40cc5b389c14e..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/common/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/common/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/ops/csrc/AddRelu.cpp b/mx_driving/csrc/AddRelu.cpp similarity index 88% rename from mx_driving/fused/ops/csrc/AddRelu.cpp rename to mx_driving/csrc/AddRelu.cpp index 6fe35c20b9d04772de7d83b170de9f219f16ddff..896a20e80d39ae71da5d5a0d623c98556c436a0b 100644 --- a/mx_driving/fused/ops/csrc/AddRelu.cpp +++ b/mx_driving/csrc/AddRelu.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_add_relu(at::Tensor& x, const at::Tensor& y) { @@ -27,10 +27,6 @@ at::Tensor npu_add_relu_grad(at::Tensor& self, at::Tensor& grad_output) { auto result = at::empty_like(self, self.options()); at_npu::native::OpCommand cmd; - cmd.Name("ReluGrad") - .Input(grad_output) - .Input(self) - .Output(result) - .Run(); + cmd.Name("ReluGrad").Input(grad_output).Input(self).Output(result).Run(); return result; } diff --git a/mx_driving/common/ops/csrc/AssignScoreWithk.cpp b/mx_driving/csrc/AssignScoreWithk.cpp similarity index 81% rename from mx_driving/common/ops/csrc/AssignScoreWithk.cpp rename to mx_driving/csrc/AssignScoreWithk.cpp index a7cecce21eed3db78f111763a3b2cc11faae054a..4a028d92f5bead67c59624e2bc4634bd740f9a6d 100644 --- a/mx_driving/common/ops/csrc/AssignScoreWithk.cpp +++ b/mx_driving/csrc/AssignScoreWithk.cpp @@ -15,22 +15,11 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -void assign_score_withk( - const at::Tensor& points, - const at::Tensor& centers, - const at::Tensor& scores, - const at::Tensor& knn_idx, - at::Tensor & output, - int32_t B, - int32_t N, - int32_t npoint, - int32_t M, - int32_t K, - int32_t out_dim, - int32_t aggregate - ) +void assign_score_withk(const at::Tensor& points, const at::Tensor& centers, const at::Tensor& scores, + const at::Tensor& knn_idx, at::Tensor& output, int32_t B, int32_t N, int32_t npoint, int32_t M, int32_t K, + int32_t out_dim, int32_t aggregate) { TORCH_CHECK_NPU(points); TORCH_CHECK_NPU(centers); @@ -48,5 +37,6 @@ void assign_score_withk( at::Tensor points_trans = points.permute({0, 3, 1, 2}); at::Tensor centers_trans = centers.permute({0, 3, 1, 2}); - EXEC_NPU_CMD_SYNC(aclnnAssignScoreWithk, points_trans, centers_trans, scores, knn_idx, B, N, npoint, M, K, out_dim, aggregate, output); -} \ No newline at end of file + EXEC_NPU_CMD_SYNC(aclnnAssignScoreWithk, points_trans, centers_trans, scores, knn_idx, B, N, npoint, M, K, out_dim, + aggregate, output); +} diff --git a/mx_driving/point/ops/csrc/BEVPool.cpp b/mx_driving/csrc/BEVPool.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPool.cpp rename to mx_driving/csrc/BEVPool.cpp index 47332f0711aed7388ed81faf2c82c5a600f07ee6..56742eabb146498a15e41578271ed90696325b47 100644 --- a/mx_driving/point/ops/csrc/BEVPool.cpp +++ b/mx_driving/csrc/BEVPool.cpp @@ -14,9 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t N_IDX = 0; diff --git a/mx_driving/point/ops/csrc/BEVPoolBackward.cpp b/mx_driving/csrc/BEVPoolBackward.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPoolBackward.cpp rename to mx_driving/csrc/BEVPoolBackward.cpp index 262c4584a736d6aa63874faeba886d24d790756f..129b059a9f6654318bc988af880df5d817eba2fe 100644 --- a/mx_driving/point/ops/csrc/BEVPoolBackward.cpp +++ b/mx_driving/csrc/BEVPoolBackward.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t N_IDX = 0; diff --git a/mx_driving/point/ops/csrc/BEVPoolV2.cpp b/mx_driving/csrc/BEVPoolV2.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPoolV2.cpp rename to mx_driving/csrc/BEVPoolV2.cpp index b2268d7a371d6437a1417c3be3492ad70d7b01a0..3c4b013f98d93160d507058debb791745bcdecdf 100644 --- a/mx_driving/point/ops/csrc/BEVPoolV2.cpp +++ b/mx_driving/csrc/BEVPoolV2.cpp @@ -16,7 +16,7 @@ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t C_IDX = 4; diff --git a/mx_driving/point/ops/csrc/BEVPoolV2Backward.cpp b/mx_driving/csrc/BEVPoolV2Backward.cpp similarity index 99% rename from mx_driving/point/ops/csrc/BEVPoolV2Backward.cpp rename to mx_driving/csrc/BEVPoolV2Backward.cpp index ebeca36c0c52e16824b1007dc9f99ca6aeb5aa56..2330da05c2658f446829feeb76fa753781966264 100644 --- a/mx_driving/point/ops/csrc/BEVPoolV2Backward.cpp +++ b/mx_driving/csrc/BEVPoolV2Backward.cpp @@ -14,9 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t C_IDX = 4; diff --git a/mx_driving/point/ops/csrc/BEVPoolV3.cpp b/mx_driving/csrc/BEVPoolV3.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPoolV3.cpp rename to mx_driving/csrc/BEVPoolV3.cpp index 0680801182bbdbfc448f3db43af559c4854c8272..d6461246e788c67eb42d7ce37abf16ca554b59bd 100644 --- a/mx_driving/point/ops/csrc/BEVPoolV3.cpp +++ b/mx_driving/csrc/BEVPoolV3.cpp @@ -14,9 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t C_IDX = 4; diff --git a/mx_driving/point/ops/csrc/BEVPoolV3Backward.cpp b/mx_driving/csrc/BEVPoolV3Backward.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPoolV3Backward.cpp rename to mx_driving/csrc/BEVPoolV3Backward.cpp index 8566f7c3d0d302277e7a948d03f2a20683528f04..6916416c9da7b28e53f25a4602127cec6fc4d60d 100644 --- a/mx_driving/point/ops/csrc/BEVPoolV3Backward.cpp +++ b/mx_driving/csrc/BEVPoolV3Backward.cpp @@ -14,9 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t C_IDX = 4; @@ -44,4 +43,4 @@ std::tuple npu_bev_pool_v3_backward(const at::Tensor& gr EXEC_NPU_CMD(aclnnBEVPoolV3Grad, grad_out, depth, feat, ranks_depth, ranks_feat, ranks_bev, grad_depth, grad_feat); return std::make_tuple(grad_depth, grad_feat); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/BorderAlign.cpp b/mx_driving/csrc/BorderAlign.cpp similarity index 86% rename from mx_driving/detection/ops/csrc/BorderAlign.cpp rename to mx_driving/csrc/BorderAlign.cpp index 68e246ed5c8645de6e29584b2af877580e911c0e..72363550145582837992dd32d5a9614e54b4f489 100644 --- a/mx_driving/detection/ops/csrc/BorderAlign.cpp +++ b/mx_driving/csrc/BorderAlign.cpp @@ -13,13 +13,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -void border_align_forward_npu(const at::Tensor& input, const at::Tensor& rois, at::Tensor& output, const int32_t pooled_size) +void border_align(const at::Tensor& input, const at::Tensor& rois, at::Tensor& output, int32_t pooled_size) { TORCH_CHECK(input.size(1) % 4 == 0, "The number of channels must be divisible by 4."); at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous(); at::Tensor rois_map = rois.contiguous(); EXEC_NPU_CMD(aclnnBorderAlign, feature_map, rois_map, pooled_size, output); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/BorderAlignGrad.cpp b/mx_driving/csrc/BorderAlignBackward.cpp similarity index 89% rename from mx_driving/detection/ops/csrc/BorderAlignGrad.cpp rename to mx_driving/csrc/BorderAlignBackward.cpp index 9d5ce70ad219fef6997afbc45afd2236edb97f25..599fadb884fc184b19eb72ddf5445cc3cec4bc5e 100644 --- a/mx_driving/detection/ops/csrc/BorderAlignGrad.cpp +++ b/mx_driving/csrc/BorderAlignBackward.cpp @@ -15,10 +15,10 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor border_align_backward(const at::Tensor& grad_out, const at::Tensor& boxes, const at::Tensor& argmax_idx, - int32_t pool_size, int32_t height, int32_t width) + int32_t pool_size, int32_t height, int32_t width) { TORCH_CHECK_NPU(grad_out); TORCH_CHECK_NPU(boxes); @@ -33,7 +33,8 @@ at::Tensor border_align_backward(const at::Tensor& grad_out, const at::Tensor& b int32_t box_size = boxes.size(1); at::Tensor grad_input = at::zeros({batch_size, feat_channels, height, width}, grad_out.options()); - - EXEC_NPU_CMD(aclnnBorderAlignGrad, grad_out, boxes, argmax_idx, channels, box_size, height, width, pool_size, batch_size, grad_input); + + EXEC_NPU_CMD(aclnnBorderAlignGrad, grad_out, boxes, argmax_idx, channels, box_size, height, width, pool_size, + batch_size, grad_input); return grad_input; -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/BoxIou.cpp b/mx_driving/csrc/BoxIou.cpp similarity index 89% rename from mx_driving/detection/ops/csrc/BoxIou.cpp rename to mx_driving/csrc/BoxIou.cpp index 84b22dfa5f198d724e86aab8b0cb5852945d77a8..e0abc69d4ce5541dd0d137c502f3d13f9d8fdaaa 100644 --- a/mx_driving/detection/ops/csrc/BoxIou.cpp +++ b/mx_driving/csrc/BoxIou.cpp @@ -15,12 +15,12 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t N_IDX = 0; -void check_npu(const at::Tensor &boxes_a, const at::Tensor &boxes_b) +void check_npu(const at::Tensor& boxes_a, const at::Tensor& boxes_b) { TORCH_CHECK_NPU(boxes_a); TORCH_CHECK_NPU(boxes_b); @@ -35,8 +35,8 @@ void check_npu(const at::Tensor &boxes_a, const at::Tensor &boxes_b) * @param aligned: False-calculate between each box of boxes_a and boxes_b, True-calculate between each aligned pair of boxes_a and boxes_b * @return ious: iou of boxes */ -at::Tensor npu_box_iou_quadri(const at::Tensor &boxes_a, const at::Tensor &boxes_b, - const int64_t mode_flag, const bool aligned) +at::Tensor npu_box_iou_quadri( + const at::Tensor& boxes_a, const at::Tensor& boxes_b, const int64_t mode_flag, const bool aligned) { TORCH_CHECK(boxes_a.size(1) == 8, "boxes_a must be 2D tensor (N, 8)"); TORCH_CHECK(boxes_b.size(1) == 8, "boxes_b must be 2D tensor (N, 8)"); @@ -61,8 +61,8 @@ at::Tensor npu_box_iou_quadri(const at::Tensor &boxes_a, const at::Tensor &boxes * @param aligned: False-calculate between each box of boxes_a and boxes_b, True-calculate between each aligned pair of boxes_a and boxes_b * @return ious: iou of boxes */ -at::Tensor npu_box_iou_rotated(const at::Tensor &boxes_a, const at::Tensor &boxes_b, - const int64_t mode_flag, const bool aligned) +at::Tensor npu_box_iou_rotated( + const at::Tensor& boxes_a, const at::Tensor& boxes_b, const int64_t mode_flag, const bool aligned) { TORCH_CHECK(boxes_a.size(1) == 5, "boxes_a must be 2D tensor (N, 5)"); TORCH_CHECK(boxes_b.size(1) == 5, "boxes_b must be 2D tensor (N, 5)"); diff --git a/mx_driving/detection/ops/csrc/BoxesOverlapBev.cpp b/mx_driving/csrc/BoxesOverlapBev.cpp similarity index 90% rename from mx_driving/detection/ops/csrc/BoxesOverlapBev.cpp rename to mx_driving/csrc/BoxesOverlapBev.cpp index cc06412b02595a0103065a3249867ce95687d2c7..db85fec26990876be75a18b8b885fe3edd0ab418 100644 --- a/mx_driving/detection/ops/csrc/BoxesOverlapBev.cpp +++ b/mx_driving/csrc/BoxesOverlapBev.cpp @@ -15,12 +15,12 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t N_IDX = 0; -void check_npu(const at::Tensor &boxes_a, const at::Tensor &boxes_b) +void check_npu(const at::Tensor& boxes_a, const at::Tensor& boxes_b) { TORCH_CHECK_NPU(boxes_a); TORCH_CHECK_NPU(boxes_b); @@ -33,7 +33,7 @@ void check_npu(const at::Tensor &boxes_a, const at::Tensor &boxes_b) * @param boxes_b: input boxes, 2D tensor(N, 5) * @return area_overlap: overlap area of boxes */ -at::Tensor npu_boxes_overlap_bev(const at::Tensor &boxes_a, const at::Tensor &boxes_b) +at::Tensor npu_boxes_overlap_bev(const at::Tensor& boxes_a, const at::Tensor& boxes_b) { TORCH_CHECK(boxes_a.size(1) == 5, "boxes_a must be 2D tensor (N, 5)"); TORCH_CHECK(boxes_b.size(1) == 5, "boxes_b must be 2D tensor (N, 5)"); diff --git a/mx_driving/csrc/CMakeLists.txt b/mx_driving/csrc/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..10fcddf9d354f07983689f83f6b5e7d7959405db --- /dev/null +++ b/mx_driving/csrc/CMakeLists.txt @@ -0,0 +1,62 @@ +file(GLOB ASCEND_CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +if(BUILD_STAGE EQUAL 2) + set(Python3_USE_STATIC_LIBS FALSE) + find_package(Python3 COMPONENTS Interpreter Development) + + execute_process( + COMMAND ${Python3_EXECUTABLE} -c + "import os; import torch; print(os.path.dirname(torch.__file__))" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE TORCH_PATH) + execute_process( + COMMAND + ${Python3_EXECUTABLE} -c + "import os; import site; print(site.getsitepackages()[0] + '/torch_npu')" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE TORCH_NPU_PATH) + message("TORCH_PATH is ${TORCH_PATH}") + message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") + + set(EXT_CXX_FLAGS "${EXT_CXX_FLAGS}") + separate_arguments(EXT_CXX_FLAGS) + add_library(_C SHARED ${ASCEND_CSRC_SRC}) + set_target_properties( + _C + PROPERTIES OUTPUT_NAME "${MX_DRIVING_PATH}/_C.${Python3_SOABI}" + PREFIX "" + SUFFIX ".so") + + if(${COMPILE_WITH_XLA}) + target_compile_definitions(_C PRIVATE COMPILE_WITH_XLA) + endif() + target_include_directories( + _C + PRIVATE ${Python3_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/include + ${TORCH_NPU_PATH}/include ${TORCH_PATH}/include + ${TORCH_PATH}/include/torch/csrc/api/include) + target_compile_options( + _C + PRIVATE -fprofile-arcs + -ftest-coverage + -fPIC + $<$:-O3> + $<$:-O0 + -g> + -fstack-protector-all + -DTORCH_API_INCLUDE_EXTENSION_H + -DTORCH_EXTENSION_NAME=_C + -D_GLIBCXX_USE_CXX11_ABI=0 + -D__FILENAME__=__FILE__ + ${EXT_CXX_FLAGS}) + + target_link_directories(_C PRIVATE ${TORCH_PATH}/lib ${TORCH_NPU_PATH}/lib) + target_link_libraries(_C PRIVATE gcov c10 torch torch_python torch_npu) + target_link_options( + _C + PRIVATE + $<$,EXECUTABLE>:-pie> + $<$:-s> + -Wl,-z,relro + -Wl,-z,now + -Wl,-z,noexecstack) +endif() diff --git a/mx_driving/fused/ops/csrc/DeformableAggregation.cpp b/mx_driving/csrc/DeformableAggregation.cpp similarity index 65% rename from mx_driving/fused/ops/csrc/DeformableAggregation.cpp rename to mx_driving/csrc/DeformableAggregation.cpp index da4027c22f0cc246db88af6c1d98eefbd73adc88..c2a769d49ab5d781d274466348a9f9ae75b8df64 100644 --- a/mx_driving/fused/ops/csrc/DeformableAggregation.cpp +++ b/mx_driving/csrc/DeformableAggregation.cpp @@ -1,6 +1,21 @@ -#include "csrc/OpApiCommon.h" -#include "functions.h" +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" at::Tensor deformable_aggregation(const at::Tensor& mc_ms_feat, const at::Tensor& spatial_shape, const at::Tensor& scale_start_index, const at::Tensor& sampling_location, const at::Tensor& weights) @@ -32,21 +47,13 @@ at::Tensor deformable_aggregation(const at::Tensor& mc_ms_feat, const at::Tensor EXEC_NPU_CMD(aclnnDeformableAggregation, mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights, batch_size, num_feat, num_embeds, num_anchors, num_pts, num_cams, num_scale, num_groups, out); - return out; } -std::tuple deformable_aggregation_grad( - const at::Tensor& mc_ms_feat, - const at::Tensor& spatial_shape, - const at::Tensor& scale_start_index, - const at::Tensor& sampling_location, - const at::Tensor& weights, - const at::Tensor& grad_output, - const at::Tensor& grad_mc_ms_feat, - const at::Tensor& grad_sampling_location, - const at::Tensor& grad_weights - ) +std::tuple deformable_aggregation_backward(const at::Tensor& mc_ms_feat, + const at::Tensor& spatial_shape, const at::Tensor& scale_start_index, const at::Tensor& sampling_location, + const at::Tensor& weights, const at::Tensor& grad_output, const at::Tensor& grad_mc_ms_feat, + const at::Tensor& grad_sampling_location, const at::Tensor& grad_weights) { TORCH_CHECK_NPU(mc_ms_feat); TORCH_CHECK_NPU(spatial_shape); @@ -61,20 +68,7 @@ std::tuple deformable_aggregation_grad( TORCH_CHECK(sampling_location.dim() == 5, "sampling_location.dim() must be 5, but got: ", sampling_location.dim()); TORCH_CHECK(weights.dim() == 6, "weights.dim() must be 6, but got: ", weights.dim()); - EXEC_NPU_CMD( - aclnnDeformableAggregationGrad, - mc_ms_feat, - spatial_shape, - scale_start_index, - sampling_location, - weights, - grad_output, - grad_mc_ms_feat, - grad_sampling_location, - grad_weights); - return std::make_tuple( - grad_mc_ms_feat, - grad_sampling_location, - grad_weights); + EXEC_NPU_CMD(aclnnDeformableAggregationGrad, mc_ms_feat, spatial_shape, scale_start_index, sampling_location, + weights, grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights); + return std::make_tuple(grad_mc_ms_feat, grad_sampling_location, grad_weights); } - diff --git a/mx_driving/fused/ops/csrc/DeformableConv2d.cpp b/mx_driving/csrc/DeformableConv2d.cpp similarity index 98% rename from mx_driving/fused/ops/csrc/DeformableConv2d.cpp rename to mx_driving/csrc/DeformableConv2d.cpp index f847861aa8f45029f945d9e2e4182cfa28a70a82..3f797463b032b3af42b62be1620ff82b650028c5 100644 --- a/mx_driving/fused/ops/csrc/DeformableConv2d.cpp +++ b/mx_driving/csrc/DeformableConv2d.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
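The deformable_aggregation / deformable_aggregation_backward pair above fills caller-provided gradient buffers and returns them. A hedged sketch of the calling pattern, assuming both are declared in csrc/functions.h; pre-allocating with zeros_like is an assumption here, not something the hunk mandates.

#include <tuple>
#include "csrc/functions.h"  // assumed to declare deformable_aggregation_backward

// Hypothetical helper: allocates the three gradient buffers the backward
// wrapper writes into, then forwards everything to the aclnn kernel.
std::tuple<at::Tensor, at::Tensor, at::Tensor> dfa_backward(const at::Tensor& mc_ms_feat,
    const at::Tensor& spatial_shape, const at::Tensor& scale_start_index,
    const at::Tensor& sampling_location, const at::Tensor& weights, const at::Tensor& grad_output)
{
    at::Tensor grad_feat = at::zeros_like(mc_ms_feat);
    at::Tensor grad_loc = at::zeros_like(sampling_location);
    at::Tensor grad_w = at::zeros_like(weights);
    return deformable_aggregation_backward(mc_ms_feat, spatial_shape, scale_start_index,
        sampling_location, weights, grad_output, grad_feat, grad_loc, grad_w);
}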
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, diff --git a/mx_driving/fused/ops/csrc/DeformableConv2dBackward.cpp b/mx_driving/csrc/DeformableConv2dBackward.cpp similarity index 73% rename from mx_driving/fused/ops/csrc/DeformableConv2dBackward.cpp rename to mx_driving/csrc/DeformableConv2dBackward.cpp index 038bb3760014f1d05b0c9ddbc0c2f45bd5ba3585..dd6444a7d81f33646d28c77d69139a049442ae27 100644 --- a/mx_driving/fused/ops/csrc/DeformableConv2dBackward.cpp +++ b/mx_driving/csrc/DeformableConv2dBackward.cpp @@ -1,5 +1,21 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple deformable_conv2d_backward(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& offset, const at::Tensor& offset_output, const at::Tensor& grad_y, diff --git a/mx_driving/point/ops/csrc/DynamicScatter.cpp b/mx_driving/csrc/DynamicScatter.cpp similarity index 98% rename from mx_driving/point/ops/csrc/DynamicScatter.cpp rename to mx_driving/csrc/DynamicScatter.cpp index fb50a1431e3650c63ec2ce7dd22ad45bc0deaffa..48984495885728ac1dc725d63fdf1e696d5115e1 100644 --- a/mx_driving/point/ops/csrc/DynamicScatter.cpp +++ b/mx_driving/csrc/DynamicScatter.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr uint32_t BLOCK_NUM = 8; @@ -63,4 +63,4 @@ void npu_dynamic_scatter_grad(at::Tensor& grad_point_feats, const at::Tensor& gr EXEC_NPU_CMD(aclnnDynamicScatterGrad, grad_voxel_feats, prefix_sum_point_per_voxel, argsort_coor, compare_mask, reduce_type, grad_point_feats); } -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/DynamicVoxelization.cpp b/mx_driving/csrc/DynamicVoxelization.cpp similarity index 96% rename from mx_driving/point/ops/csrc/DynamicVoxelization.cpp rename to mx_driving/csrc/DynamicVoxelization.cpp index 7b3ae41b7b39c4b7420b99266a1b7ca7e807f4d5..7c049c6411f889458c1520302bf8bbcf964c2efd 100644 --- a/mx_driving/point/ops/csrc/DynamicVoxelization.cpp +++ b/mx_driving/csrc/DynamicVoxelization.cpp @@ -14,10 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor dynamic_voxelization(const at::Tensor& points, at::Tensor& coors, int grid_x, int grid_y, int grid_z, double voxel_x, double voxel_y, double voxel_z, double coors_min_x, double coors_min_y, double coorsMinZ) diff --git a/mx_driving/point/ops/csrc/FurthestPointSampling.cpp b/mx_driving/csrc/FurthestPointSampling.cpp similarity index 97% rename from mx_driving/point/ops/csrc/FurthestPointSampling.cpp rename to mx_driving/csrc/FurthestPointSampling.cpp index ab097e94e4871d14f5e7629a489393ea6fa8b378..0db88d3414a51dfc1af35b3196f69f846a51584e 100644 --- a/mx_driving/point/ops/csrc/FurthestPointSampling.cpp +++ b/mx_driving/csrc/FurthestPointSampling.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_furthest_point_sampling(const at::Tensor& point_xyz, const at::Tensor& nearset_temp, int32_t num_points) { @@ -23,4 +23,4 @@ at::Tensor npu_furthest_point_sampling(const at::Tensor& point_xyz, const at::Te nearset_temp.options().dtype(at::kInt)); EXEC_NPU_CMD(aclnnFurthestPointSampling, point_xyz, nearset_temp, num_points, output); return output; -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/FurthestPointSamplingWithDist.cpp b/mx_driving/csrc/FurthestPointSamplingWithDist.cpp similarity index 97% rename from mx_driving/point/ops/csrc/FurthestPointSamplingWithDist.cpp rename to mx_driving/csrc/FurthestPointSamplingWithDist.cpp index cfef3b81c6034f8338eebe7fc1ad271cc955bb0a..bb36399d89bfa994d0c508bfae1867f6fa980dd8 100644 --- a/mx_driving/point/ops/csrc/FurthestPointSamplingWithDist.cpp +++ b/mx_driving/csrc/FurthestPointSamplingWithDist.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor furthest_point_sampling_with_dist( const at::Tensor& points_dist, const at::Tensor& nearest_temp, int32_t num_points) diff --git a/mx_driving/fused/ops/csrc/FusedBiasLeakyRelu.cpp b/mx_driving/csrc/FusedBiasLeakyRelu.cpp similarity index 89% rename from mx_driving/fused/ops/csrc/FusedBiasLeakyRelu.cpp rename to mx_driving/csrc/FusedBiasLeakyRelu.cpp index 720d9a49703cdcccf4b0097460509d22c93f7908..006740badcf68e97e8c57b92c86ec603e5834c9a 100644 --- a/mx_driving/fused/ops/csrc/FusedBiasLeakyRelu.cpp +++ b/mx_driving/csrc/FusedBiasLeakyRelu.cpp @@ -14,12 +14,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -at::Tensor fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias, const double negative_slope, const double scale) +at::Tensor fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias, double negative_slope, double scale) { TORCH_CHECK_NPU(x); TORCH_CHECK_NPU(bias); @@ -28,4 +26,4 @@ at::Tensor fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias, co EXEC_NPU_CMD(aclnnFusedBiasLeakyReluV2, x, bias, negative_slope, scale, output); return output; -} \ No newline at end of file +} diff --git a/mx_driving/fused/ops/csrc/GeometricKernelAttentionFunc.cpp b/mx_driving/csrc/GeometricKernelAttentionFunc.cpp similarity index 98% rename from mx_driving/fused/ops/csrc/GeometricKernelAttentionFunc.cpp rename to mx_driving/csrc/GeometricKernelAttentionFunc.cpp index cfff39e99d4d0c45c2238ffc81a4dac2cb4c24a8..ebfc0947c6e1f668dcc4bcd381852f4c653a6e71 100644 --- a/mx_driving/fused/ops/csrc/GeometricKernelAttentionFunc.cpp +++ b/mx_driving/csrc/GeometricKernelAttentionFunc.cpp @@ -16,7 +16,7 @@ #include "csrc/OpApiCommon.h" #include "csrc/utils.h" -#include "functions.h" +#include "csrc/functions.h" constexpr size_t VALUE_BATCH_SIZE_DIM = 0; constexpr size_t VALUE_NUM_KEYS_DIM = 1; @@ -30,7 +30,7 @@ constexpr size_t ATTN_WEIGHTS_NUM_POINTS_DIM = 4; constexpr size_t FLOAT32_BYTES = 4; constexpr size_t BLOCK_BYTES = 32; -at::Tensor npu_geometric_kernel_attention_func(const at::Tensor& value, const at::Tensor& spatial_shapes, +at::Tensor npu_geometric_kernel_attention(const at::Tensor& value, const at::Tensor& spatial_shapes, const at::Tensor& level_start_index, const at::Tensor& sampling_locations, const at::Tensor& attn_weights) { TORCH_CHECK(value.scalar_type() == at::kHalf || value.scalar_type() == at::kFloat, diff --git a/mx_driving/point/ops/csrc/GroupPoints.cpp b/mx_driving/csrc/GroupPoints.cpp similarity index 91% rename from mx_driving/point/ops/csrc/GroupPoints.cpp rename to mx_driving/csrc/GroupPoints.cpp index 03e8b4797a97fea0fe82737a648a5e91a8f58be7..94f052eae48c48434cc350e7fee1347ebbd3cf26 100644 --- a/mx_driving/point/ops/csrc/GroupPoints.cpp +++ b/mx_driving/csrc/GroupPoints.cpp @@ -15,15 +15,15 @@ // limitations under the License. 
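A short usage sketch for the simplified fused_bias_leaky_relu signature above (assumed to be declared in csrc/functions.h); both tensors must already live on NPU, and the slope/scale literals are common defaults rather than values fixed by this patch.

#include "csrc/functions.h"  // assumed to declare fused_bias_leaky_relu

at::Tensor apply_fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias)
{
    const double negative_slope = 0.2;        // illustrative default
    const double scale = 1.4142135623730951;  // sqrt(2), illustrative
    return fused_bias_leaky_relu(x, bias, negative_slope, scale);
}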
#include "csrc/OpApiCommon.h" -#include "functions.h" - +#include "csrc/functions.h" at::Tensor group_points( const at::Tensor& points, const at::Tensor& idx, int64_t b, int64_t c, int64_t n, int64_t npoints, int64_t nsample) { TORCH_CHECK_NPU(points); TORCH_CHECK_NPU(idx); - TORCH_CHECK(points.scalar_type() == at::kHalf || points.scalar_type() == at::kFloat, "group_points only support float16 or float32 tensor.") + TORCH_CHECK(points.scalar_type() == at::kHalf || points.scalar_type() == at::kFloat, + "group_points only support float16 or float32 tensor.") TORCH_CHECK(points.dim() == 3, "points.dim() must be 3, but got: ", points.dim()); TORCH_CHECK(idx.dim() == 3, "idx.dim() must be 3, but got: ", idx.dim()); TORCH_CHECK(points.size(0) == idx.size(0), "the input first dimension must be the same.") @@ -43,8 +43,8 @@ at::Tensor group_points( } -at::Tensor group_points_backward(const at::Tensor& grad_out, const at::Tensor& idx, int64_t b, - int64_t c, int64_t n, int64_t npoints, int64_t nsample) +at::Tensor group_points_backward(const at::Tensor& grad_out, const at::Tensor& idx, int64_t b, int64_t c, int64_t n, + int64_t npoints, int64_t nsample) { TORCH_CHECK_NPU(grad_out); TORCH_CHECK_NPU(idx); @@ -61,4 +61,4 @@ at::Tensor group_points_backward(const at::Tensor& grad_out, const at::Tensor& i at::Tensor grad_points = out.transpose(1, 2); return grad_points; -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/HardVoxelize.cpp b/mx_driving/csrc/HardVoxelize.cpp similarity index 99% rename from mx_driving/point/ops/csrc/HardVoxelize.cpp rename to mx_driving/csrc/HardVoxelize.cpp index 25964218ac2f8a64d177c57bedf6c85c342fa721..27c779541f4fea9685a2adf070d8a9e5711feacb 100644 --- a/mx_driving/point/ops/csrc/HardVoxelize.cpp +++ b/mx_driving/csrc/HardVoxelize.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" constexpr size_t NUM_VOXELS_IDX = 0; constexpr size_t UNI_VOXELS_IDX = 1; diff --git a/include/csrc/pybind.h b/mx_driving/csrc/Hypot.cpp similarity index 52% rename from include/csrc/pybind.h rename to mx_driving/csrc/Hypot.cpp index 49ac2037b041ffbe1d634508b46d35ad8ebb72f5..24a0d4a77133de34edcd55220f0c3dd60398619c 100644 --- a/include/csrc/pybind.h +++ b/mx_driving/csrc/Hypot.cpp @@ -13,14 +13,23 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef CSRC_PYBIND_H_ -#define CSRC_PYBIND_H_ -#include -void init_common(pybind11::module& m); -void init_fused(pybind11::module& m); -void init_point(pybind11::module& m); -void init_preprocess(pybind11::module& m); -void init_detection(pybind11::module& m); -void init_spconv(pybind11::module& m); -#endif // CSRC_PYBIND_H_ + +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" + +at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y) +{ + auto out = at::empty_like(x, x.options()); + EXEC_NPU_CMD(aclnnHypot, x, y, out); + return out; +} + +std::tuple npu_hypot_grad( + const at::Tensor& x, const at::Tensor& y, const at::Tensor& out, const at::Tensor& out_grad) +{ + auto x_grad = at::empty_like(x, x.options()); + auto y_grad = at::empty_like(y, y.options()); + EXEC_NPU_CMD(aclnnHypotGrad, x, y, out, out_grad, x_grad, y_grad); + return std::make_tuple(x_grad, y_grad); +} diff --git a/mx_driving/common/ops/csrc/Knn.cpp b/mx_driving/csrc/Knn.cpp similarity index 97% rename from mx_driving/common/ops/csrc/Knn.cpp rename to mx_driving/csrc/Knn.cpp index e9c47973572ad7021acd3019fbd4567f004e2d14..12f03e1b3ac70504f1b51d70f81c778587c3c9bc 100644 --- a/mx_driving/common/ops/csrc/Knn.cpp +++ b/mx_driving/csrc/Knn.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple knn(const at::Tensor& xyz, const at::Tensor& center_xyz, int32_t k, bool is_from_knn) { @@ -28,4 +28,4 @@ std::tuple knn(const at::Tensor& xyz, const at::Tensor& EXEC_NPU_CMD_SYNC(aclnnKnn, xyz, center_xyz, is_from_knn, k, dist, idx); return std::tie(dist, idx); -} \ No newline at end of file +} diff --git a/mx_driving/fused/ops/csrc/MaxPool2d.cpp b/mx_driving/csrc/MaxPool2d.cpp similarity index 99% rename from mx_driving/fused/ops/csrc/MaxPool2d.cpp rename to mx_driving/csrc/MaxPool2d.cpp index 499c8e70462dd8a2daf3e6ea3ece6442b7476485..94e2e366bae227d8ebf2f8eacf4750d9eae9215d 100644 --- a/mx_driving/fused/ops/csrc/MaxPool2d.cpp +++ b/mx_driving/csrc/MaxPool2d.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_max_pool2d(const at::Tensor& x, int kernel_size, int stride, int padding) { diff --git a/mx_driving/fused/ops/csrc/ModulatedDeformableConv2d.cpp b/mx_driving/csrc/ModulatedDeformableConv2d.cpp similarity index 98% rename from mx_driving/fused/ops/csrc/ModulatedDeformableConv2d.cpp rename to mx_driving/csrc/ModulatedDeformableConv2d.cpp index 5fe0afa6d235868f5bacb006a46d2970cf68c2bc..c41d6c72a69ba596604b408bffb00265fed558a4 100644 --- a/mx_driving/fused/ops/csrc/ModulatedDeformableConv2d.cpp +++ b/mx_driving/csrc/ModulatedDeformableConv2d.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
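The new Hypot.cpp above defines both directions of the op. A minimal sketch of a combined forward/backward call, assuming the declarations live in csrc/functions.h:

#include <tuple>
#include "csrc/functions.h"  // assumed to declare npu_hypot and npu_hypot_grad

// out = hypot(x, y); the backward takes the saved forward output plus the
// incoming gradient and returns (x_grad, y_grad), matching the code above.
std::tuple<at::Tensor, at::Tensor> hypot_with_grad(
    const at::Tensor& x, const at::Tensor& y, const at::Tensor& out_grad)
{
    at::Tensor out = npu_hypot(x, y);
    return npu_hypot_grad(x, y, out, out_grad);
}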
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple modulated_deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, const at::Tensor& mask, const at::Tensor& weight, const c10::optional& bias_opt, diff --git a/mx_driving/fused/ops/csrc/ModulatedDeformableConv2dBackward.cpp b/mx_driving/csrc/ModulatedDeformableConv2dBackward.cpp similarity index 75% rename from mx_driving/fused/ops/csrc/ModulatedDeformableConv2dBackward.cpp rename to mx_driving/csrc/ModulatedDeformableConv2dBackward.cpp index 7b72604d206d9466f15c2b02addbacae32df10d5..94614e5531cadc7e02fd2dcf16fd6e439309f7e3 100644 --- a/mx_driving/fused/ops/csrc/ModulatedDeformableConv2dBackward.cpp +++ b/mx_driving/csrc/ModulatedDeformableConv2dBackward.cpp @@ -1,5 +1,21 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple modulated_deformable_conv2d_backward( const at::Tensor& input, const at::Tensor& offset, const at::Tensor& mask, const at::Tensor& weight, diff --git a/mx_driving/fused/ops/csrc/MultiScaleDeformableAttn.cpp b/mx_driving/csrc/MultiScaleDeformableAttn.cpp similarity index 99% rename from mx_driving/fused/ops/csrc/MultiScaleDeformableAttn.cpp rename to mx_driving/csrc/MultiScaleDeformableAttn.cpp index 2e81bdf243f05a2d1b4d154438f94231e41716c1..f0a0e7c7cc4f0bff337f7b14c5eb2cf99a91ec33 100644 --- a/mx_driving/fused/ops/csrc/MultiScaleDeformableAttn.cpp +++ b/mx_driving/csrc/MultiScaleDeformableAttn.cpp @@ -15,8 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" - +#include "csrc/functions.h" namespace { constexpr size_t BATCH_IDX = 0; constexpr size_t QUERY_IDX = 1; diff --git a/mx_driving/spconv/ops/csrc/MultiToSparse.cpp b/mx_driving/csrc/MultiToSparse.cpp similarity index 77% rename from mx_driving/spconv/ops/csrc/MultiToSparse.cpp rename to mx_driving/csrc/MultiToSparse.cpp index f6ba0d09f0a03ae681ada99e9a842ff11d82abaf..0424634e83bac86732701b443f04eb3f81bb04dc 100644 --- a/mx_driving/spconv/ops/csrc/MultiToSparse.cpp +++ b/mx_driving/csrc/MultiToSparse.cpp @@ -15,10 +15,11 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -std::tuple multi_to_sparse(const at::Tensor& out_features, const at::Tensor& unique_indices_offset, - const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair) +std::tuple multi_to_sparse(const at::Tensor& out_features, + const at::Tensor& unique_indices_offset, const at::Tensor& sorted_idx_to_former_indices, + const at::Tensor& outidx_pair) { TORCH_CHECK_NPU(out_features); TORCH_CHECK_NPU(unique_indices_offset); @@ -28,21 +29,21 @@ std::tuple multi_to_sparse(const at::Tensor& out_feature auto indices_size = unique_indices_offset.sizes(); auto features_size = out_features.sizes(); TORCH_CHECK(indices_size[0] > 1, - "indices_size zeros dim must be greater than 1 expected but got indices_size[0] value: ", - indices_size[0]); + "indices_size zeros dim must be greater than 1 expected but got indices_size[0] value: ", indices_size[0]); c10::SmallVector out_size = {indices_size[0] - 1, features_size[1]}; c10::SmallVector out_idx_size = {indices_size[0] - 1, 8}; at::Tensor sparse_value = at::empty(out_size, out_features.options()); at::Tensor sparse_indices = at::empty(out_idx_size, unique_indices_offset.options()); - EXEC_NPU_CMD(aclnnToSparse, unique_indices_offset, out_features, - sorted_idx_to_former_indices, outidx_pair, sparse_value, sparse_indices); + EXEC_NPU_CMD(aclnnToSparse, unique_indices_offset, out_features, sorted_idx_to_former_indices, outidx_pair, + sparse_value, sparse_indices); return std::tie(sparse_value, sparse_indices); } -std::tuple multi_to_sparse_v2(const at::Tensor& features, const at::Tensor& weight, const at::Tensor& unique_indices_offset, - const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair) +std::tuple multi_to_sparse_v2(const at::Tensor& features, const at::Tensor& weight, + const at::Tensor& unique_indices_offset, const at::Tensor& sorted_idx_to_former_indices, + const at::Tensor& outidx_pair) { TORCH_CHECK_NPU(features); TORCH_CHECK_NPU(weight); @@ -55,8 +56,7 @@ std::tuple multi_to_sparse_v2(const at::Tensor& features auto indices_size = unique_indices_offset.sizes(); TORCH_CHECK(indices_size[0] > 1, - "indices_size zeros dim must be greater than 1 expected but got indices_size[0] value: ", - indices_size[0]); + "indices_size zeros dim must be greater than 1 expected but got indices_size[0] value: ", indices_size[0]); c10::SmallVector out_size = {indices_size[0] - 1, weight_size[4]}; c10::SmallVector out_idx_size = {indices_size[0] - 1, 8}; @@ -64,7 +64,7 @@ std::tuple multi_to_sparse_v2(const at::Tensor& features at::Tensor sparse_value = at::empty(out_size, features.options()); at::Tensor sparse_indices = at::empty(out_idx_size, unique_indices_offset.options()); - EXEC_NPU_CMD(aclnnToSparseV3, features, weight, unique_indices_offset, - sorted_idx_to_former_indices, outidx_pair, sparse_value, sparse_indices); + EXEC_NPU_CMD(aclnnToSparseV3, features, weight, unique_indices_offset, sorted_idx_to_former_indices, outidx_pair, + sparse_value, sparse_indices); return std::tie(sparse_value, sparse_indices); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/Nms3d.cpp b/mx_driving/csrc/Nms3d.cpp similarity index 97% rename from mx_driving/detection/ops/csrc/Nms3d.cpp rename to mx_driving/csrc/Nms3d.cpp index c92cda16fa9ca924c10c193400a6176b774bfd27..587d8ea6b752fe60f6a82dc8a0e9018638485bfb 100644 --- a/mx_driving/detection/ops/csrc/Nms3d.cpp +++ b/mx_driving/csrc/Nms3d.cpp @@ -15,7 +15,7 @@ // limitations under 
the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple nms3d(const at::Tensor& boxes, double threshold) { diff --git a/mx_driving/detection/ops/csrc/Nms3dNormal.cpp b/mx_driving/csrc/Nms3dNormal.cpp similarity index 97% rename from mx_driving/detection/ops/csrc/Nms3dNormal.cpp rename to mx_driving/csrc/Nms3dNormal.cpp index 933dfd0792116fb5b6ea9bf3643c21de39cca680..387ee2bb9945625001374f6b11b0159758eb1236 100644 --- a/mx_driving/detection/ops/csrc/Nms3dNormal.cpp +++ b/mx_driving/csrc/Nms3dNormal.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple nms3d_normal(const at::Tensor& boxes, double nms_overlap_thresh) { diff --git a/mx_driving/common/ops/kernels/op_host/OWNERS b/mx_driving/csrc/OWNERS similarity index 100% rename from mx_driving/common/ops/kernels/op_host/OWNERS rename to mx_driving/csrc/OWNERS diff --git a/bind/OpApiCommon.cpp b/mx_driving/csrc/OpApiCommon.cpp similarity index 100% rename from bind/OpApiCommon.cpp rename to mx_driving/csrc/OpApiCommon.cpp diff --git a/mx_driving/detection/ops/csrc/PixelGroup.cpp b/mx_driving/csrc/PixelGroup.cpp similarity index 86% rename from mx_driving/detection/ops/csrc/PixelGroup.cpp rename to mx_driving/csrc/PixelGroup.cpp index ce16121c3a33fcfd5f4a222830699e700d1aea86..fc3e5175df43bd509194af786025197d462465de 100644 --- a/mx_driving/detection/ops/csrc/PixelGroup.cpp +++ b/mx_driving/csrc/PixelGroup.cpp @@ -15,11 +15,11 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -std::vector> pixel_group(const at::Tensor &score, const at::Tensor &mask, const at::Tensor &embedding, - const at::Tensor &kernel_label, const at::Tensor &kernel_contour, - int32_t kernel_region_num, double distance_threshold) +std::vector> pixel_group(const at::Tensor& score, const at::Tensor& mask, + const at::Tensor& embedding, const at::Tensor& kernel_label, const at::Tensor& kernel_contour, + int32_t kernel_region_num, double distance_threshold) { TORCH_CHECK_NPU(score); TORCH_CHECK_NPU(mask); @@ -42,8 +42,8 @@ std::vector> pixel_group(const at::Tensor &score, const at::T at::Tensor label_updated = at::empty(label_updated_size, kernel_label.options()); at::Tensor valid_mask = at::empty(label_updated_size, mask.options()); - EXEC_NPU_CMD(aclnnPixelGroup, score, mask, embedding, kernel_label, kernel_contour, - kernel_region_num, distance_threshold, point_vector, label_updated); + EXEC_NPU_CMD(aclnnPixelGroup, score, mask, embedding, kernel_label, kernel_contour, kernel_region_num, + distance_threshold, point_vector, label_updated); std::vector> pixel_assignment(kernel_region_num); at::Tensor point_vector_cpu = point_vector.to(at::kCPU); @@ -68,4 +68,4 @@ std::vector> pixel_group(const at::Tensor &score, const at::T } return pixel_assignment; -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/PointToVoxel.cpp b/mx_driving/csrc/PointToVoxel.cpp similarity index 98% rename from mx_driving/point/ops/csrc/PointToVoxel.cpp rename to mx_driving/csrc/PointToVoxel.cpp index 63214e75ceab1e328cae67f423feed0bc9f038a2..e1c0c270636ee6c1daee8e86197df0479a8ef49a 100644 --- a/mx_driving/point/ops/csrc/PointToVoxel.cpp +++ b/mx_driving/csrc/PointToVoxel.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
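Looking back at the MultiToSparse.cpp hunks a little earlier in the patch: both wrappers return a (values, indices) pair whose leading dimension is derived from unique_indices_offset. A hedged caller sketch, assuming the declaration sits in csrc/functions.h; the helper name is hypothetical.

#include <tuple>
#include "csrc/functions.h"  // assumed to declare multi_to_sparse

void gather_sparse_output(const at::Tensor& out_features, const at::Tensor& unique_indices_offset,
    const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair)
{
    at::Tensor sparse_value;
    at::Tensor sparse_indices;
    std::tie(sparse_value, sparse_indices) =
        multi_to_sparse(out_features, unique_indices_offset, sorted_idx_to_former_indices, outidx_pair);
    // sparse_value:   (unique_indices_offset.size(0) - 1, out_features.size(1))
    // sparse_indices: (unique_indices_offset.size(0) - 1, 8)
}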
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" constexpr float DEFAULT_VALUE = -1.0f; constexpr size_t VOXEL_SIZES_SIZE = 3; diff --git a/mx_driving/preprocess/ops/csrc/PointsInBox.cpp b/mx_driving/csrc/PointsInBox.cpp similarity index 97% rename from mx_driving/preprocess/ops/csrc/PointsInBox.cpp rename to mx_driving/csrc/PointsInBox.cpp index 92ad849ef5f8622eacdcf33310c492e711193a1d..c13a08f0329d6c2a8db8bb202a018fbfc37b3d31 100644 --- a/mx_driving/preprocess/ops/csrc/PointsInBox.cpp +++ b/mx_driving/csrc/PointsInBox.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_points_in_box(const at::Tensor& boxes, const at::Tensor& pts) { diff --git a/mx_driving/preprocess/ops/csrc/PointsInBoxAll.cpp b/mx_driving/csrc/PointsInBoxAll.cpp similarity index 98% rename from mx_driving/preprocess/ops/csrc/PointsInBoxAll.cpp rename to mx_driving/csrc/PointsInBoxAll.cpp index b6fb5e78ff323b66a7dbcd37837d9565f0db0f39..d2a8bd342723265aeb5b8f7d475b271386ec754b 100644 --- a/mx_driving/preprocess/ops/csrc/PointsInBoxAll.cpp +++ b/mx_driving/csrc/PointsInBoxAll.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_points_in_box_all(const at::Tensor& boxes, const at::Tensor& pts) { diff --git a/mx_driving/spconv/ops/csrc/PrepareSubmConv3d.cpp b/mx_driving/csrc/PrepareSubmConv3d.cpp similarity index 80% rename from mx_driving/spconv/ops/csrc/PrepareSubmConv3d.cpp rename to mx_driving/csrc/PrepareSubmConv3d.cpp index dd73a12dab58ff45cb1dda678741f33f67363fb9..c1017abc64903fd7b5c77eefd5993f49dd5fa16e 100644 --- a/mx_driving/spconv/ops/csrc/PrepareSubmConv3d.cpp +++ b/mx_driving/csrc/PrepareSubmConv3d.cpp @@ -15,14 +15,14 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -std::tuple npu_prepare_subm_conv3d(const at::Tensor& flattenIndices, - at::IntArrayRef outSpatialShape, int batch_size) +std::tuple npu_prepare_subm_conv3d( + const at::Tensor& flattenIndices, at::IntArrayRef outSpatialShape, int batch_size) { int64_t outputnum = 1; for (int32_t i = 0; i < outSpatialShape.size(); i++) { - outputnum *= outSpatialShape[i]; + outputnum *= outSpatialShape[i]; } c10::SmallVector output_size = {batch_size * outputnum}; auto temp = at::empty(output_size, flattenIndices.options().dtype(at::kFloat)).fill_(-1); diff --git a/mx_driving/detection/ops/csrc/RoiAlignRotatedGradV2.cpp b/mx_driving/csrc/RoiAlignRotatedGradV2.cpp similarity index 70% rename from mx_driving/detection/ops/csrc/RoiAlignRotatedGradV2.cpp rename to mx_driving/csrc/RoiAlignRotatedGradV2.cpp index d8736b9aa558cbacfff86e274dfa013cf98db9e4..d4b0439d19cd38e13bfbd3b59eaf4eac06f8bc1b 100644 --- a/mx_driving/detection/ops/csrc/RoiAlignRotatedGradV2.cpp +++ b/mx_driving/csrc/RoiAlignRotatedGradV2.cpp @@ -13,13 +13,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, - const at::Tensor& rois, const at::Tensor& grad_output, - int32_t pooled_height, int32_t pooled_width, double spatial_scale, - int32_t sampling_ratio, bool aligned, bool clockwise) +at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, const at::Tensor& rois, const at::Tensor& grad_output, + int32_t pooled_height, int32_t pooled_width, double spatial_scale, int32_t sampling_ratio, bool aligned, + bool clockwise) { auto ori_dtype = input.scalar_type(); @@ -27,9 +27,8 @@ at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, at::Tensor grad_input = at::zeros(grad_input_size, input.options()); - EXEC_NPU_CMD(aclnnRoiAlignRotatedGradV2, input, rois, grad_output, - pooled_height, pooled_width, spatial_scale, sampling_ratio, aligned, clockwise, - grad_input); + EXEC_NPU_CMD(aclnnRoiAlignRotatedGradV2, input, rois, grad_output, pooled_height, pooled_width, spatial_scale, + sampling_ratio, aligned, clockwise, grad_input); return grad_input.to(ori_dtype); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/RoiAlignRotatedV2.cpp b/mx_driving/csrc/RoiAlignRotatedV2.cpp similarity index 76% rename from mx_driving/detection/ops/csrc/RoiAlignRotatedV2.cpp rename to mx_driving/csrc/RoiAlignRotatedV2.cpp index dd7304fa7927e735b70673672043a069b49de0cc..bbf5017793e02b2bbb3bac16dafe0bcf7e30f1be 100644 --- a/mx_driving/detection/ops/csrc/RoiAlignRotatedV2.cpp +++ b/mx_driving/csrc/RoiAlignRotatedV2.cpp @@ -13,14 +13,16 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" void roi_align_rotated_v2_forward_npu(const at::Tensor& input, const at::Tensor& rois_map, at::Tensor& output, - double spatial_scale, int32_t sampling_ratio, int32_t pooled_height, int32_t pooled_width, - bool aligned, bool clockwise) + double spatial_scale, int32_t sampling_ratio, int32_t pooled_height, int32_t pooled_width, bool aligned, + bool clockwise) { at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous(); at::Tensor rois = rois_map.permute({1, 0}).contiguous(); - EXEC_NPU_CMD(aclnnRoiAlignRotatedV2, feature_map, rois, spatial_scale, sampling_ratio, pooled_height, pooled_width, aligned, clockwise, output); -} \ No newline at end of file + EXEC_NPU_CMD(aclnnRoiAlignRotatedV2, feature_map, rois, spatial_scale, sampling_ratio, pooled_height, pooled_width, + aligned, clockwise, output); +} diff --git a/mx_driving/detection/ops/csrc/RoiawarePool3d.cpp b/mx_driving/csrc/RoiawarePool3d.cpp similarity index 93% rename from mx_driving/detection/ops/csrc/RoiawarePool3d.cpp rename to mx_driving/csrc/RoiawarePool3d.cpp index e69adbd655b5b8c20830b9c97692d1c14255237a..faab5fb99eb6be1df75c980ba9aac856294ad71b 100644 --- a/mx_driving/detection/ops/csrc/RoiawarePool3d.cpp +++ b/mx_driving/csrc/RoiawarePool3d.cpp @@ -21,8 +21,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" void npu_roiaware_pool3d_forward(const at::Tensor& rois, const at::Tensor& pts, const at::Tensor& pts_feature, at::Tensor& argmax, at::Tensor& pts_idx_of_voxels, at::Tensor& pooled_features, int32_t mode) @@ -45,11 +46,12 @@ void npu_roiaware_pool3d_forward(const at::Tensor& rois, const at::Tensor& pts, uint32_t outy = pts_idx_of_voxels.size(2); uint32_t outz = pts_idx_of_voxels.size(3); - EXEC_NPU_CMD(aclnnRoiawarePool3d, rois_cast, pts_cast, pts_feature_cast, mode, max_pts_each_voxel, outx, outy, outz, argmax, pts_idx_of_voxels, pooled_features_cast); + EXEC_NPU_CMD(aclnnRoiawarePool3d, rois_cast, pts_cast, pts_feature_cast, mode, max_pts_each_voxel, outx, outy, outz, + argmax, pts_idx_of_voxels, pooled_features_cast); if (dtype == at::kHalf) { pooled_features_cast = pooled_features_cast.to(at::kHalf); } pooled_features.copy_(pooled_features_cast); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/RoiawarePool3dGrad.cpp b/mx_driving/csrc/RoiawarePool3dGrad.cpp similarity index 52% rename from mx_driving/detection/ops/csrc/RoiawarePool3dGrad.cpp rename to mx_driving/csrc/RoiawarePool3dGrad.cpp index dd4e316327170beec7a53c55414cca3e9d9d4770..a289519c0e9f3afcb572d6fe610feeef7da54f52 100644 --- a/mx_driving/detection/ops/csrc/RoiawarePool3dGrad.cpp +++ b/mx_driving/csrc/RoiawarePool3dGrad.cpp @@ -1,5 +1,29 @@ +// Copyright (c) 2023-2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Copyright (c) 2023-2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::Tensor& argmax, @@ -9,10 +33,11 @@ at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::T TORCH_CHECK_NPU(argmax); TORCH_CHECK_NPU(grad_out); - TORCH_CHECK(pts_idx_of_voxels.dim() == 5, "pts_idx_of_voxels must to be a 5D Tensor, but got: ", pts_idx_of_voxels.dim()); + TORCH_CHECK( + pts_idx_of_voxels.dim() == 5, "pts_idx_of_voxels must to be a 5D Tensor, but got: ", pts_idx_of_voxels.dim()); TORCH_CHECK(argmax.dim() == 5, "argmax as to be a 5D Tensor, but got: ", argmax.dim()); TORCH_CHECK(grad_out.dim() == 5, "grad_out has to be a 5D Tensor, but got: ", grad_out.dim()); - + int32_t boxes_num = grad_out.size(0); int32_t out_x = grad_out.size(1); int32_t out_y = grad_out.size(2); @@ -20,12 +45,13 @@ at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::T int32_t channels = grad_out.size(4); int32_t max_pts_per_voxel = pts_idx_of_voxels.size(4); - TORCH_CHECK((boxes_num != 0 && out_x != 0 && out_y != 0 && out_z != 0 && channels != 0 && npoints != 0), "Error, some dim equals zero!\n"); + TORCH_CHECK((boxes_num != 0 && out_x != 0 && out_y != 0 && out_z != 0 && channels != 0 && npoints != 0), + "Error, some dim equals zero!\n"); TORCH_CHECK((channels <= 2048), "channels must less equal than 2048, but got: ", channels); auto dtype = grad_out.dtype(); at::Tensor grad_out_cast = grad_out; - + at::Tensor grad_in = at::zeros({npoints, channels}, grad_out.options()); if (dtype == at::kHalf) { grad_out_cast = grad_out.to(at::kFloat); @@ -34,20 +60,21 @@ at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::T if (pool_method == 0) { // maxpool3d - EXEC_NPU_CMD(aclnnRoiawareMaxpool3dGrad, argmax, grad_out_cast, boxes_num, - out_x, out_y, out_z, channels, npoints, grad_in); + EXEC_NPU_CMD(aclnnRoiawareMaxpool3dGrad, argmax, grad_out_cast, boxes_num, out_x, out_y, out_z, channels, + npoints, grad_in); } else if (pool_method == 1) { // avgpool3d TORCH_CHECK(npoints >= max_pts_per_voxel, "npoints must greator than max_pts_per_voxel!"); TORCH_CHECK(max_pts_per_voxel != 0, "Error, some dim equals zero!"); - TORCH_CHECK((max_pts_per_voxel <= 2048), "max_pts_per_voxel must less equal than 2048, but got: ", max_pts_per_voxel); - - EXEC_NPU_CMD(aclnnRoiawareAvgpool3dGrad, pts_idx_of_voxels, grad_out_cast, boxes_num, - out_x, out_y, out_z, channels, npoints, max_pts_per_voxel, grad_in); + TORCH_CHECK( + (max_pts_per_voxel <= 2048), "max_pts_per_voxel must less equal than 2048, but got: ", max_pts_per_voxel); + + EXEC_NPU_CMD(aclnnRoiawareAvgpool3dGrad, pts_idx_of_voxels, grad_out_cast, boxes_num, out_x, out_y, out_z, + channels, npoints, max_pts_per_voxel, grad_in); } if (dtype == at::kHalf) { grad_in = grad_in.to(at::kHalf); } return grad_in; -} \ No newline at end of file +} diff --git a/mx_driving/preprocess/ops/csrc/RoipointPool3dForward.cpp b/mx_driving/csrc/RoipointPool3dForward.cpp similarity index 69% rename from mx_driving/preprocess/ops/csrc/RoipointPool3dForward.cpp rename to mx_driving/csrc/RoipointPool3dForward.cpp index aed16dd4290d40e48490494571a564dd06e75afe..b9ca5a59a904b5b5ece8181bbfb0f58e70da0418 100644 --- a/mx_driving/preprocess/ops/csrc/RoipointPool3dForward.cpp +++ b/mx_driving/csrc/RoipointPool3dForward.cpp @@ -13,9 +13,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. -#include + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" /* * points: (B, N, 3) @@ -25,19 +25,17 @@ * pooled_empty_flag: (B, M) */ std::tuple npu_roipoint_pool3d_forward(const int32_t num_sampled_points, - const at::Tensor &points, - const at::Tensor &point_features, - const at::Tensor &boxes3d) + const at::Tensor& points, const at::Tensor& point_features, const at::Tensor& boxes3d) { auto points_trans = points.transpose(1, 2).contiguous(); auto point_features_trans = point_features.transpose(1, 2).contiguous(); - c10::SmallVector features_trans_size = {points.size(0), boxes3d.size(1), - points.size(2) + point_features.size(2), num_sampled_points}; + c10::SmallVector features_trans_size = { + points.size(0), boxes3d.size(1), points.size(2) + point_features.size(2), num_sampled_points}; at::Tensor pooled_features_trans = at::empty(features_trans_size, points.options()); c10::SmallVector empty_flag_size = {boxes3d.size(0), boxes3d.size(1)}; at::Tensor pooled_empty_flag = at::empty(empty_flag_size, boxes3d.options().dtype(at::kInt)); - EXEC_NPU_CMD(aclnnRoipointPool3dForward, - points_trans, point_features_trans, boxes3d, num_sampled_points, pooled_features_trans, pooled_empty_flag); + EXEC_NPU_CMD(aclnnRoipointPool3dForward, points_trans, point_features_trans, boxes3d, num_sampled_points, + pooled_features_trans, pooled_empty_flag); auto pooled_features = pooled_features_trans.transpose(2, 3).contiguous(); return std::tie(pooled_features, pooled_empty_flag); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/RotatedIou.cpp b/mx_driving/csrc/RotatedIou.cpp similarity index 97% rename from mx_driving/detection/ops/csrc/RotatedIou.cpp rename to mx_driving/csrc/RotatedIou.cpp index fd39e98507a4494a4215669ae6a625e442a1c499..f25da9ed0bcb5a5aa8954c759b19b6466061792d 100644 --- a/mx_driving/detection/ops/csrc/RotatedIou.cpp +++ b/mx_driving/csrc/RotatedIou.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "functions.h" -#include "torch_npu/csrc/framework/OpCommand.h" +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" namespace { at::Tensor& rotated_iou_npu_nocheck(at::Tensor& iou, const at::Tensor& boxes, const at::Tensor& query_boxes, bool trans, diff --git a/mx_driving/detection/ops/csrc/RotatedOverlaps.cpp b/mx_driving/csrc/RotatedOverlaps.cpp similarity index 70% rename from mx_driving/detection/ops/csrc/RotatedOverlaps.cpp rename to mx_driving/csrc/RotatedOverlaps.cpp index 2f49abd989a06f0372028c39d2e75fba9c717a9c..ee1f952e138176f63637da3c77a9e71e8b23b437 100644 --- a/mx_driving/detection/ops/csrc/RotatedOverlaps.cpp +++ b/mx_driving/csrc/RotatedOverlaps.cpp @@ -14,35 +14,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "torch_npu/csrc/framework/OpCommand.h" -#include "functions.h" +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" namespace { -at::Tensor &rotated_overlaps_npu_nocheck( - at::Tensor &overlaps, - const at::Tensor &self, - const at::Tensor &query_boxes, - bool trans) +at::Tensor& rotated_overlaps_npu_nocheck( + at::Tensor& overlaps, const at::Tensor& self, const at::Tensor& query_boxes, bool trans) { at_npu::native::OpCommand cmd; - cmd.Name("RotatedOverlaps") - .Input(self) - .Input(query_boxes) - .Output(overlaps) - .Attr("trans", trans) - .Run(); + cmd.Name("RotatedOverlaps").Input(self).Input(query_boxes).Output(overlaps).Attr("trans", trans).Run(); return overlaps; } } // namespace -at::Tensor npu_rotated_overlaps( - const at::Tensor &self, - const at::Tensor &query_boxes, - bool trans) +at::Tensor npu_rotated_overlaps(const at::Tensor& self, const at::Tensor& query_boxes, bool trans) { TORCH_CHECK(self.ndimension() == 3 && query_boxes.ndimension() == 3, - "boxes' dim should be equal to query_boxes' ndimension() ", - "and equal to 3!"); + "boxes' dim should be equal to query_boxes' ndimension() ", "and equal to 3!"); auto origin_dtype = self.scalar_type(); // the Op only support fp32 currently! at::Tensor self_cp = self.to(at::kFloat).permute({0, 2, 1}); diff --git a/mx_driving/common/ops/csrc/ScatterMax.cpp b/mx_driving/csrc/ScatterMax.cpp similarity index 80% rename from mx_driving/common/ops/csrc/ScatterMax.cpp rename to mx_driving/csrc/ScatterMax.cpp index 37c06e1523fa098d214e40edaa6407b2c4c785cf..a1c4ee4102716878fa9f590b1f5d75e33ea600bd 100644 --- a/mx_driving/common/ops/csrc/ScatterMax.cpp +++ b/mx_driving/csrc/ScatterMax.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" using namespace std; @@ -23,7 +23,7 @@ namespace { constexpr uint32_t MAX_INDICES_VALUE = 120000; constexpr uint32_t SUPPORT_UPDATES = 32; constexpr uint32_t MAX_SUPPORT_UPDATES = 512; -} +} // namespace void npu_scatter_max_check(const at::Tensor& updates, const at::Tensor& indices, const at::Tensor& result) { @@ -42,12 +42,14 @@ void npu_scatter_max_check(const at::Tensor& updates, const at::Tensor& indices, for (size_t i = 1; i < result.dim(); i++) { TORCH_CHECK(updatesSizes[i] == resultSizes[i], "updates and out should have the same size except for dim 0."); } - TORCH_CHECK(indicesLength == 1, "all the dims's range except the first dim of input tensor [indices] should be equal to 1."); - TORCH_CHECK(indices.sizes()[0] == updates.sizes()[0], "input's updates size of dim 0 should be equal to indices's size."); + TORCH_CHECK(indicesLength == 1, + "all the dims's range except the first dim of input tensor [indices] should be equal to 1."); + TORCH_CHECK( + indices.sizes()[0] == updates.sizes()[0], "input's updates size of dim 0 should be equal to indices's size."); } -std::tuple scatter_max_with_argmax_v2(const at::Tensor& updates, const at::Tensor& indices, - c10::optional out) +std::tuple scatter_max_with_argmax_v2( + const at::Tensor& updates, const at::Tensor& indices, c10::optional out) { auto sizes = updates.sizes().vec(); auto indicesMax = indices.max().item().toLong(); @@ -63,8 +65,7 @@ std::tuple scatter_max_with_argmax_v2(const at::Tensor& return std::tie(result, argmax); } -at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, - const at::Tensor& num_segments) +at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const 
at::Tensor& num_segments) { c10::SmallVector output_size; @@ -78,6 +79,12 @@ at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segme at::Tensor out = at::empty(output_size, x.options()); at_npu::native::OpCommand cmd; - cmd.Name("UnsortedSegmentSum").Input(x).Input(segment_ids).Input(num_segments).Output(out).Attr("check_ids", true).Run(); + cmd.Name("UnsortedSegmentSum") + .Input(x) + .Input(segment_ids) + .Input(num_segments) + .Output(out) + .Attr("check_ids", true) + .Run(); return out; -} \ No newline at end of file +} diff --git a/mx_driving/common/ops/csrc/ScatterMeanGrad.cpp b/mx_driving/csrc/ScatterMeanGrad.cpp similarity index 91% rename from mx_driving/common/ops/csrc/ScatterMeanGrad.cpp rename to mx_driving/csrc/ScatterMeanGrad.cpp index d7ada84800518c07fa373a4a13a58207e3f81f85..1f714f89fb353957630cde9bbf235b712e978444 100644 --- a/mx_driving/common/ops/csrc/ScatterMeanGrad.cpp +++ b/mx_driving/csrc/ScatterMeanGrad.cpp @@ -13,9 +13,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Tensor& count, int32_t dim) { @@ -30,10 +30,8 @@ at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Te TORCH_CHECK(grad_out.scalar_type() == at::kFloat, "grad_out: float32 tensor expected but got a tensor with dtype: ", grad_out.scalar_type()); TORCH_CHECK(index.scalar_type() == at::kInt, - "index: int32 tensor expected but got a tensor with dtype: ", - index.scalar_type()); - TORCH_CHECK(grad_out_dims != 0 && index_dims != 0, - "grad_out and index should not be empty"); + "index: int32 tensor expected but got a tensor with dtype: ", index.scalar_type()); + TORCH_CHECK(grad_out_dims != 0 && index_dims != 0, "grad_out and index should not be empty"); c10::SmallVector grad_in_size; for (uint32_t i = 0; i < grad_out_dims; i++) { @@ -42,8 +40,7 @@ at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Te dim = (dim + index_dims) % index_dims; grad_in_size[dim] = index_size[dim]; for (uint32_t i = 0; i < grad_out_dims; i++) { - TORCH_CHECK(i >= index_dims || grad_in_size[i] == index_size[i], - "the shape except dim should be the same"); + TORCH_CHECK(i >= index_dims || grad_in_size[i] == index_size[i], "the shape except dim should be the same"); } uint64_t tail = 1; for (uint32_t i = index_dims; i < grad_out_dims; i++) { @@ -67,4 +64,4 @@ at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Te EXEC_NPU_CMD(aclnnScatterMeanGrad, grad_out, index, count, dim, result); } return result; -} \ No newline at end of file +} diff --git a/mx_driving/common/ops/csrc/SortPairs.cpp b/mx_driving/csrc/SortPairs.cpp similarity index 89% rename from mx_driving/common/ops/csrc/SortPairs.cpp rename to mx_driving/csrc/SortPairs.cpp index 9f60bcfce2efb8f5926879cbf43dafb16a711075..9ea09a4dfcd6127fed4890b8c6ad5dee5c857350 100644 --- a/mx_driving/common/ops/csrc/SortPairs.cpp +++ b/mx_driving/csrc/SortPairs.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
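Unlike most wrappers in this patch, npu_scatter_max_backward above goes through the graph-mode at_npu::native::OpCommand builder rather than EXEC_NPU_CMD. Pulled out as a standalone helper, the pattern reads as follows; it mirrors the UnsortedSegmentSum call above, and only the helper name is new.

#include "csrc/OpApiCommon.h"  // assumed to make at_npu::native::OpCommand visible

at::Tensor run_unsorted_segment_sum(
    const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments, at::Tensor& out)
{
    at_npu::native::OpCommand cmd;
    cmd.Name("UnsortedSegmentSum")
        .Input(x)
        .Input(segment_ids)
        .Input(num_segments)
        .Output(out)
        .Attr("check_ids", true)
        .Run();
    return out;
}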
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" /** * @brief sort key-value pairs @@ -25,8 +25,8 @@ * @param descending: true-descending, false-ascending * @return (keys_out, values_out): (sorted keys, corresponding values of sorted keys) */ -std::tuple npu_sort_pairs(const at::Tensor &keys_in, const at::Tensor &values_in, - int64_t dim, bool descending) +std::tuple npu_sort_pairs( + const at::Tensor& keys_in, const at::Tensor& values_in, int64_t dim, bool descending) { TORCH_CHECK_NPU(keys_in); TORCH_CHECK_NPU(values_in); diff --git a/mx_driving/spconv/ops/csrc/SparseConv3d.cpp b/mx_driving/csrc/SparseConv3d.cpp similarity index 68% rename from mx_driving/spconv/ops/csrc/SparseConv3d.cpp rename to mx_driving/csrc/SparseConv3d.cpp index beee90cb9b5efcf0f95faac009b4efcc36642206..e92532cf98875657398b4914d78242ffa85c78b8 100644 --- a/mx_driving/spconv/ops/csrc/SparseConv3d.cpp +++ b/mx_driving/csrc/SparseConv3d.cpp @@ -15,33 +15,31 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" -#include +#include "csrc/functions.h" -std::tuple npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - int out_channel, at::IntArrayRef outSpatialShape, int batch_size) +std::tuple npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size, + at::IntArrayRef stride, at::IntArrayRef padding, int out_channel, at::IntArrayRef outSpatialShape, int batch_size) { TORCH_CHECK_NPU(indices); - TORCH_CHECK(out_channel <= 128, "out_channel must less or equal than 128 expected but got out_channel: ", - out_channel); - TORCH_CHECK(out_channel % 8 == 0, "out_channel must be divisible by 8 but got out_channel: ", - out_channel); + TORCH_CHECK( + out_channel <= 128, "out_channel must less or equal than 128 expected but got out_channel: ", out_channel); + TORCH_CHECK(out_channel % 8 == 0, "out_channel must be divisible by 8 but got out_channel: ", out_channel); auto indices_size = indices.sizes(); int64_t kernelsum = 1; for (int32_t i = 0; i < kernel_size.size(); i++) { - kernelsum *= kernel_size[i]; + kernelsum *= kernel_size[i]; } int64_t outputsum = indices_size[0] * kernelsum; c10::SmallVector indices_out_size = {outputsum}; c10::SmallVector indices_pairs_size = {outputsum, indices_size[1]}; - c10::SmallVector spatial_size = {batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel}; - at::IntArrayRef outputShape = at::IntArrayRef(spatial_size); + c10::SmallVector spatial_size = { + batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel}; + at::IntArrayRef outputShape = at::IntArrayRef(spatial_size); at::Tensor indices_out = at::empty(indices_out_size, indices.options()).fill_(-1); at::Tensor indices_pairs = at::empty(indices_pairs_size, indices.options()).fill_(-1); - EXEC_NPU_CMD(aclnnSparseConv3d, indices, kernel_size, outputShape, - stride, padding, indices_out, indices_pairs); + EXEC_NPU_CMD(aclnnSparseConv3d, indices, kernel_size, outputShape, stride, padding, indices_out, indices_pairs); return std::tie(indices_pairs, indices_out); -} \ No newline at end of file +} diff --git a/mx_driving/spconv/ops/csrc/SparseConv3dGrad.cpp b/mx_driving/csrc/SparseConv3dGrad.cpp similarity index 81% rename from mx_driving/spconv/ops/csrc/SparseConv3dGrad.cpp rename to mx_driving/csrc/SparseConv3dGrad.cpp index 5192def8082e40f3ef5c80fe3035c51cde0d99e1..c5549b328bedd843eac30fbc9da36e65866cdfc6 100644 --- 
a/mx_driving/spconv/ops/csrc/SparseConv3dGrad.cpp +++ b/mx_driving/csrc/SparseConv3dGrad.cpp @@ -15,11 +15,11 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" -#include +#include "csrc/functions.h" -std::tuple npu_sparse_conv3d_grad(const at::Tensor& indices_offset, const at::Tensor& former_sorted_indices, - const at::Tensor& feature, const at::Tensor& weight, const at::Tensor& grad) +std::tuple npu_sparse_conv3d_grad(const at::Tensor& indices_offset, + const at::Tensor& former_sorted_indices, const at::Tensor& feature, const at::Tensor& weight, + const at::Tensor& grad) { TORCH_CHECK_NPU(indices_offset); TORCH_CHECK_NPU(former_sorted_indices); @@ -33,7 +33,7 @@ std::tuple npu_sparse_conv3d_grad(const at::Tensor& indi int64_t kernelsum = 1; for (int32_t i = 0; i < weight_size.size() - 2; i++) { - kernelsum *= weight_size[i]; + kernelsum *= weight_size[i]; } int64_t kernelIC = weight_size[3]; int64_t kernelOC = weight_size[4]; @@ -44,6 +44,7 @@ std::tuple npu_sparse_conv3d_grad(const at::Tensor& indi at::Tensor feature_grad = at::zeros(feature_grad_size, feature.options()); at::Tensor weight_grad = at::zeros(weight_size, feature.options()); - EXEC_NPU_CMD(aclnnSparseConv3dGradV2, indices_offset, former_sorted_indices, feature, weight_trans, grad, feature_grad, weight_grad); + EXEC_NPU_CMD(aclnnSparseConv3dGradV2, indices_offset, former_sorted_indices, feature, weight_trans, grad, + feature_grad, weight_grad); return std::tie(feature_grad, weight_grad); -} \ No newline at end of file +} diff --git a/mx_driving/spconv/ops/csrc/SparseInverseConv3d.cpp b/mx_driving/csrc/SparseInverseConv3d.cpp similarity index 67% rename from mx_driving/spconv/ops/csrc/SparseInverseConv3d.cpp rename to mx_driving/csrc/SparseInverseConv3d.cpp index b9f9fd6a1bc242731884308db443f1f86cf631f8..e4ad5023907a764a8f6f4748da807016303c2f7a 100644 --- a/mx_driving/spconv/ops/csrc/SparseInverseConv3d.cpp +++ b/mx_driving/csrc/SparseInverseConv3d.cpp @@ -15,13 +15,12 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" -#include +#include "csrc/functions.h" -std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, const at::Tensor& indices, const at::Tensor& weight, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - at::IntArrayRef dilation, at::IntArrayRef output_padding, - int out_channel, at::IntArrayRef outSpatialShape, int batch_size) +std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, + const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, + at::IntArrayRef padding, at::IntArrayRef dilation, at::IntArrayRef output_padding, int out_channel, + at::IntArrayRef outSpatialShape, int batch_size) { // check Tensor Device is NPU TORCH_CHECK_NPU(feature); @@ -31,7 +30,7 @@ std::tuple npu_sparse_inverse_conv3d(const a // Calculate kernelSize int64_t kernelsum = 1; for (int32_t i = 0; i < kernel_size.size(); i++) { - kernelsum *= kernel_size[i]; + kernelsum *= kernel_size[i]; } // to create memory of teh output auto indices_size = indices.sizes(); @@ -44,11 +43,12 @@ std::tuple npu_sparse_inverse_conv3d(const a at::Tensor indices_out = at::empty(indices_out_size, indices.options()).fill_(-1); at::Tensor indices_pairs = at::empty(indices_pairs_size, indices.options()).fill_(-1); - c10::SmallVector spatial_size = {batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel}; - at::IntArrayRef outputShape = at::IntArrayRef(spatial_size); + c10::SmallVector spatial_size = { + batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel}; + at::IntArrayRef outputShape = at::IntArrayRef(spatial_size); // weight [,,,in_channels, out_channels] -> [,,,out_channels, in_channels] at::Tensor weight_trans = weight.transpose(-1, -2).contiguous(); - EXEC_NPU_CMD(aclnnSparseInverseConv3d, feature, indices, weight_trans, outputShape, - stride, padding, dilation, output_padding, out, indices_out, indices_pairs); + EXEC_NPU_CMD(aclnnSparseInverseConv3d, feature, indices, weight_trans, outputShape, stride, padding, dilation, + output_padding, out, indices_out, indices_pairs); return std::tie(out, indices_pairs, indices_out); -} \ No newline at end of file +} diff --git a/mx_driving/spconv/ops/csrc/SubmSparseCov3d.cpp b/mx_driving/csrc/SubmSparseCov3d.cpp similarity index 72% rename from mx_driving/spconv/ops/csrc/SubmSparseCov3d.cpp rename to mx_driving/csrc/SubmSparseCov3d.cpp index e0502bde8c2257e7969d73d1c162f916da31b039..140179189a3df108618240f761c7174b70f8b788 100644 --- a/mx_driving/spconv/ops/csrc/SubmSparseCov3d.cpp +++ b/mx_driving/csrc/SubmSparseCov3d.cpp @@ -15,20 +15,18 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -std::tuple npu_subm_sparse_conv3d(const at::Tensor& feature, const at::Tensor& indices, - const at::Tensor& weight, - at::IntArrayRef kernel_size, int out_channel, - at::IntArrayRef outSpatialShape, int batch_size, - const at::Tensor& temp) +std::tuple npu_subm_sparse_conv3d(const at::Tensor& feature, + const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, int out_channel, + at::IntArrayRef outSpatialShape, int batch_size, const at::Tensor& temp) { auto indices_size = indices.sizes(); auto feature_size = feature.sizes(); auto weight_dim = weight.dim(); int64_t kernelsum = 1; for (int32_t i = 0; i < kernel_size.size(); i++) { - kernelsum *= kernel_size[0]; + kernelsum *= kernel_size[0]; } int64_t outputsum = indices_size[0] * kernelsum; c10::SmallVector output_size = {indices_size[0], kernelsum, feature_size[1]}; @@ -38,7 +36,7 @@ std::tuple npu_subm_sparse_conv3d(const at:: at::Tensor out = at::empty(output_size, feature.options()).fill_(0); at::Tensor indices_out = at::empty(indices_out_size, feature.options().dtype(at::kInt)).fill_(-1); at::Tensor indices_pairs = at::empty(indices_pairs_size, feature.options().dtype(at::kInt)); - EXEC_NPU_CMD(aclnnSubmSparseConv3d, feature, indices_trans, weight, temp, kernel_size, - out_channel, outSpatialShape, batch_size, out, indices_out, indices_pairs); + EXEC_NPU_CMD(aclnnSubmSparseConv3d, feature, indices_trans, weight, temp, kernel_size, out_channel, outSpatialShape, + batch_size, out, indices_out, indices_pairs); return std::tie(out, indices_pairs, indices_out); } diff --git a/mx_driving/common/ops/csrc/ThreeInterpolate.cpp b/mx_driving/csrc/ThreeInterpolate.cpp similarity index 72% rename from mx_driving/common/ops/csrc/ThreeInterpolate.cpp rename to mx_driving/csrc/ThreeInterpolate.cpp index f781887dec014c554528a88f54bb0b9eb28c15fa..a56646351f75ca86ee651e18d521522f42482863 100644 --- a/mx_driving/common/ops/csrc/ThreeInterpolate.cpp +++ b/mx_driving/csrc/ThreeInterpolate.cpp @@ -15,9 +15,10 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& points, const at::Tensor& idx, const at::Tensor& weight) +at::Tensor npu_three_interpolate( + int b, int c, int m, int n, const at::Tensor& points, const at::Tensor& idx, const at::Tensor& weight) { TORCH_CHECK_NPU(points); TORCH_CHECK_NPU(idx); @@ -27,8 +28,10 @@ at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& p auto idx_dtype = idx.scalar_type(); auto weight_dtype = weight.scalar_type(); - TORCH_CHECK((point_dtype == at::kFloat || point_dtype == at::kHalf), "three_interpolate_forward ascend only support fp32 and fp16."); - TORCH_CHECK((weight_dtype == at::kFloat || weight_dtype == at::kHalf), "three_interpolate_forward ascend only support fp32 and fp16."); + TORCH_CHECK((point_dtype == at::kFloat || point_dtype == at::kHalf), + "three_interpolate_forward ascend only support fp32 and fp16."); + TORCH_CHECK((weight_dtype == at::kFloat || weight_dtype == at::kHalf), + "three_interpolate_forward ascend only support fp32 and fp16."); TORCH_CHECK((point_dtype == weight_dtype), "input dtype is inconsistent."); TORCH_CHECK((idx_dtype == at::kInt), "indices: int32 tensor expected but got a tensor with dtype: ", idx_dtype); @@ -36,13 +39,15 @@ at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& p auto idx_size = idx.sizes(); auto weight_size = weight.sizes(); - TORCH_CHECK((point_size.size() == 3 && idx_size.size() == 3 && weight_size.size() == 3), "input dimension should be 3."); - TORCH_CHECK((point_size[0] == idx_size[0] && point_size[0] == weight_size[0] && idx_size[0] == weight_size[0]), "the first dimension of input should be the same."); + TORCH_CHECK( + (point_size.size() == 3 && idx_size.size() == 3 && weight_size.size() == 3), "input dimension should be 3."); + TORCH_CHECK((point_size[0] == idx_size[0] && point_size[0] == weight_size[0] && idx_size[0] == weight_size[0]), + "the first dimension of input should be the same."); TORCH_CHECK((idx_size[1] == weight_size[1]), "the second dimension of indices and weight should be the same."); TORCH_CHECK((idx_size[2] == 3 && weight_size[2] == 3), "the third dimension of indices and weight should be 3."); TORCH_CHECK((b < 10001 && c < 10001 && m < 10001 && n < 10001), "input dimension is too heavy."); - + auto point_c_trans = points.transpose(1, 2).to(at::kFloat); auto weight_cast = weight.to(at::kFloat); @@ -50,13 +55,8 @@ at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& p at::Tensor out_cast = at::zeros(output_size, points.options()).to(at::kFloat); at_npu::native::OpCommand cmd; - cmd.Name("ThreeInterpolate") - .Input(point_c_trans) - .Input(idx) - .Input(weight_cast) - .Output(out_cast) - .Run(); - + cmd.Name("ThreeInterpolate").Input(point_c_trans).Input(idx).Input(weight_cast).Output(out_cast).Run(); + auto out = out_cast; if (point_dtype == at::kHalf) { out = out_cast.to(at::kHalf); @@ -64,11 +64,12 @@ at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& p auto output = out_cast.view({b, n, c}).transpose(1, 2); auto res = output.contiguous(); out.copy_(res); - + return out; } -at::Tensor npu_three_interpolate_backward(int b, int c, int n, int m, const at::Tensor& grad_out, const at::Tensor& idx, const at::Tensor& weight) +at::Tensor npu_three_interpolate_backward( + int b, int c, int n, int m, const at::Tensor& grad_out, const at::Tensor& idx, const at::Tensor& 
weight) { TORCH_CHECK_NPU(grad_out); TORCH_CHECK_NPU(idx); @@ -78,8 +79,10 @@ at::Tensor npu_three_interpolate_backward(int b, int c, int n, int m, const at:: auto idx_dtype = idx.scalar_type(); auto weight_dtype = weight.scalar_type(); - TORCH_CHECK((grad_dtype == at::kFloat || grad_dtype == at::kHalf), "three_interpolate_forward ascend only support fp32 and fp16."); - TORCH_CHECK((weight_dtype == at::kFloat || weight_dtype == at::kHalf), "three_interpolate_forward ascend only support fp32 and fp16."); + TORCH_CHECK((grad_dtype == at::kFloat || grad_dtype == at::kHalf), + "three_interpolate_forward ascend only support fp32 and fp16."); + TORCH_CHECK((weight_dtype == at::kFloat || weight_dtype == at::kHalf), + "three_interpolate_forward ascend only support fp32 and fp16."); TORCH_CHECK((grad_dtype == weight_dtype), "input dtype is inconsistent."); TORCH_CHECK((idx_dtype == at::kInt), "indices: int32 tensor expected but got a tensor with dtype: ", idx_dtype); @@ -87,11 +90,14 @@ at::Tensor npu_three_interpolate_backward(int b, int c, int n, int m, const at:: auto idx_size = idx.sizes(); auto weight_size = weight.sizes(); - TORCH_CHECK((grad_size.size() == 3 && idx_size.size() == 3 && weight_size.size() == 3), "the input dimension should be 3."); - TORCH_CHECK((grad_size[0] == idx_size[0] && grad_size[0] == weight_size[0] && idx_size[0] == weight_size[0]), "the first dimension of input should be the same."); - TORCH_CHECK((grad_size[2] == idx_size[1] && grad_size[2] == weight_size[1] && idx_size[1] == weight_size[1]), "the second dimension of indices and weight should be the same."); + TORCH_CHECK( + (grad_size.size() == 3 && idx_size.size() == 3 && weight_size.size() == 3), "the input dimension should be 3."); + TORCH_CHECK((grad_size[0] == idx_size[0] && grad_size[0] == weight_size[0] && idx_size[0] == weight_size[0]), + "the first dimension of input should be the same."); + TORCH_CHECK((grad_size[2] == idx_size[1] && grad_size[2] == weight_size[1] && idx_size[1] == weight_size[1]), + "the second dimension of indices and weight should be the same."); TORCH_CHECK((idx_size[2] == 3 && weight_size[2] == 3), "the third dimension of indices and weight should be 3."); - + TORCH_CHECK((b < 10001 && c < 10001 && m < 10001 && n < 10001), "input dimension is too heavy."); at::Tensor grad_points = at::zeros({b, c, m}, grad_out.options()); diff --git a/mx_driving/point/ops/csrc/UniqueVoxel.cpp b/mx_driving/csrc/UniqueVoxel.cpp similarity index 94% rename from mx_driving/point/ops/csrc/UniqueVoxel.cpp rename to mx_driving/csrc/UniqueVoxel.cpp index 31ac61f10e9e51c14d11b31c1eef4014997b0449..eed06c2eccd65c81b5a1a9e790ac82b9af6fdd85 100644 --- a/mx_driving/point/ops/csrc/UniqueVoxel.cpp +++ b/mx_driving/csrc/UniqueVoxel.cpp @@ -14,16 +14,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include "csrc/OpApiCommon.h" -#include "functions.h" - +#include "csrc/functions.h" std::tuple unique_voxel(const at::Tensor& voxels) { TORCH_CHECK_NPU(voxels); TORCH_CHECK(voxels.dim() == 1, "voxels.dim() must be 1, but got: ", voxels.dim()); - TORCH_CHECK(voxels.dtype() == at::kFloat || voxels.dtype() == at::kInt, "voxels.dtype() must be float or int32, but got: ", voxels.dtype()); + TORCH_CHECK(voxels.dtype() == at::kFloat || voxels.dtype() == at::kInt, + "voxels.dtype() must be float or int32, but got: ", voxels.dtype()); size_t num_points = voxels.size(0); diff --git a/mx_driving/point/ops/csrc/VecPoolBackward.cpp b/mx_driving/csrc/VecPoolBackward.cpp similarity index 54% rename from mx_driving/point/ops/csrc/VecPoolBackward.cpp rename to mx_driving/csrc/VecPoolBackward.cpp index aaa3e1af547cc203b31babefcac6f0cef98b341f..1ad326b26a6caa4b9eae10dff88ab2c8c0ddab89 100644 --- a/mx_driving/point/ops/csrc/VecPoolBackward.cpp +++ b/mx_driving/csrc/VecPoolBackward.cpp @@ -15,24 +15,16 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -at::Tensor vec_pool_backward(const at::Tensor& grad_new_features, - const at::Tensor& point_cnt_of_grid, - const at::Tensor& grouped_idxs, - const int64_t n, - const int64_t num_c_in) +at::Tensor vec_pool_backward(const at::Tensor& grad_new_features, const at::Tensor& point_cnt_of_grid, + const at::Tensor& grouped_idxs, const int64_t n, const int64_t num_c_in) { - TORCH_CHECK(grad_new_features.dim() == 2, - "grad_new_features.dim() must be 2, but got: ", grad_new_features.dim()); - TORCH_CHECK(point_cnt_of_grid.dim() == 2, - "point_cnt_of_grid.dim() must be 2, but got: ", point_cnt_of_grid.dim()); - TORCH_CHECK(grouped_idxs.dim() == 2, - "grouped_idxs.dim() must be 2, but got: ", grouped_idxs.dim()); - TORCH_CHECK(point_cnt_of_grid.size(1) != 0, - "numTotalGrids can not be 0."); - TORCH_CHECK(grouped_idxs.size(1) == 3, - "grouped_idxs.shape[1] must be 3, but got: ", grouped_idxs.size(1)); + TORCH_CHECK(grad_new_features.dim() == 2, "grad_new_features.dim() must be 2, but got: ", grad_new_features.dim()); + TORCH_CHECK(point_cnt_of_grid.dim() == 2, "point_cnt_of_grid.dim() must be 2, but got: ", point_cnt_of_grid.dim()); + TORCH_CHECK(grouped_idxs.dim() == 2, "grouped_idxs.dim() must be 2, but got: ", grouped_idxs.dim()); + TORCH_CHECK(point_cnt_of_grid.size(1) != 0, "numTotalGrids can not be 0."); + TORCH_CHECK(grouped_idxs.size(1) == 3, "grouped_idxs.shape[1] must be 3, but got: ", grouped_idxs.size(1)); auto output_size = {n, num_c_in}; at::Tensor out = at::zeros(output_size, grad_new_features.options()); EXEC_NPU_CMD(aclnnVecPoolGrad, grad_new_features, point_cnt_of_grid, grouped_idxs, n, num_c_in, out); diff --git a/mx_driving/point/ops/csrc/VoxelPoolingTrain.cpp b/mx_driving/csrc/VoxelPoolingTrain.cpp similarity index 98% rename from mx_driving/point/ops/csrc/VoxelPoolingTrain.cpp rename to mx_driving/csrc/VoxelPoolingTrain.cpp index 5341a87cd7f9519b419cd9dcd7a740a8440f040d..7657918e5c4183088d1177b5f55843f140c3c76b 100644 --- a/mx_driving/point/ops/csrc/VoxelPoolingTrain.cpp +++ b/mx_driving/csrc/VoxelPoolingTrain.cpp @@ -14,10 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple voxel_pooling_train(const at::Tensor& inputFeatures, const at::Tensor& geom, at::Tensor& outputFeatures, at::Tensor& posMemo, int batchSize, int numPoints, int numChannels, int numVoxelX, @@ -70,4 +68,4 @@ at::Tensor voxel_pool_train_backward(const at::Tensor& gradOut, const at::Tensor out = out.to(at::kHalf); } return out; -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/VoxelToPoint.cpp b/mx_driving/csrc/VoxelToPoint.cpp similarity index 98% rename from mx_driving/point/ops/csrc/VoxelToPoint.cpp rename to mx_driving/csrc/VoxelToPoint.cpp index 3d1bc37fd4a3d34d251351fb05e7bf95d8f2b98f..fd6463333b6e6a040b5e41dd0b135243ba97f0d3 100644 --- a/mx_driving/point/ops/csrc/VoxelToPoint.cpp +++ b/mx_driving/csrc/VoxelToPoint.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" constexpr float DEFAULT_VALUE = -1.0f; diff --git a/mx_driving/csrc/pybind.cpp b/mx_driving/csrc/pybind.cpp new file mode 100644 index 0000000000000000000000000000000000000000..71cd0c44dcaf924c1e4ea5b16959c55dbfd47143 --- /dev/null +++ b/mx_driving/csrc/pybind.cpp @@ -0,0 +1,196 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "csrc/functions.h" +#include + +#include +#include + +std::string g_opApiSoPath; +std::once_flag init_flag; // Flag for one-time initialization + +void init_op_api_so_path(const std::string& path) +{ + std::call_once(init_flag, [&]() { g_opApiSoPath = path; }); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("_init_op_api_so_path", &init_op_api_so_path); + // knn + m.def("knn", &knn); + + // npu_scatter_mean_grad + m.def("npu_scatter_mean_grad", &npu_scatter_mean_grad); + + // three_interpolate + m.def("npu_three_interpolate", &npu_three_interpolate); + m.def("npu_three_interpolate_backward", &npu_three_interpolate_backward); + + // scatter_mean + m.def("npu_scatter_mean", &npu_scatter_mean, "npu_scatter_mean NPU version"); + + // scatter_max + m.def("scatter_max_with_argmax_v2", &scatter_max_with_argmax_v2); + m.def("npu_scatter_max_backward", &npu_scatter_max_backward); + + // npu_sort_pairs + m.def("npu_sort_pairs", &npu_sort_pairs, "sort_pairs NPU version"); + + // npu_hypot + m.def("npu_hypot", &npu_hypot); + m.def("npu_hypot_grad", &npu_hypot_grad); + + // assign_score_withk + m.def("assign_score_withk", &assign_score_withk); + // nms3d_normal + m.def("nms3d_normal", &nms3d_normal); + + // nms3d + m.def("nms3d", &nms3d); + + // roated overlap + m.def("npu_rotated_overlaps", &npu_rotated_overlaps, "npu_rotated_overlap NPU version"); + + // rotated iou + m.def("npu_rotated_iou", &npu_rotated_iou); + + // npu_boxes_overlap_bev + m.def("npu_boxes_overlap_bev", &npu_boxes_overlap_bev, "boxes_overlap_bev NPU version"); + + // roi_align_rotated_v2_forward_npu + m.def("roi_align_rotated_v2_forward_npu", &roi_align_rotated_v2_forward_npu); + + // npu_roi_align_rotated_grad_v2 + m.def("npu_roi_align_rotated_grad_v2", &npu_roi_align_rotated_grad_v2); + + // npu_box_iou_quadri + m.def("npu_box_iou_quadri", &npu_box_iou_quadri, "box_iou_quadri NPU version"); + + // npu_box_iou_rotated + m.def("npu_box_iou_rotated", &npu_box_iou_rotated, "box_iou_rotated NPU version"); + + // border_align_forward_npu + m.def("border_align", &border_align); + + // border_align_backward_npu + m.def("border_align_backward", &border_align_backward); + + // npu_roiaware_pool3d_forward + m.def("npu_roiaware_pool3d_forward", &npu_roiaware_pool3d_forward); + + // roiaware_pool3d_grad + m.def("roiaware_pool3d_grad", &roiaware_pool3d_grad, "roiaware_pool3d_grad NPU version"); + + // pixel_group + m.def("pixel_group", &pixel_group); + + // nnpu_max_pool2d + m.def("npu_max_pool2d", &npu_max_pool2d); + // mullti_scale_deformable_attn + m.def("multi_scale_deformable_attn", &multi_scale_deformable_attn); + m.def("multi_scale_deformable_attn_backward", &multi_scale_deformable_attn_backward); + + // npu_add_relu + m.def("npu_add_relu", &npu_add_relu); + m.def("npu_add_relu_grad", &npu_add_relu_grad); + + // fused_bias_leaky_relu + m.def("fused_bias_leaky_relu", &fused_bias_leaky_relu); + + // npu_deformable_aggregation + m.def("npu_deformable_aggregation", &deformable_aggregation); + m.def("npu_deformable_aggregation_backward", &deformable_aggregation_backward); + + // deformable_conv2d + m.def("deformable_conv2d", &deformable_conv2d); + m.def("modulated_deformable_conv2d", &modulated_deformable_conv2d); + m.def("deformable_conv2d_backward", &deformable_conv2d_backward); + m.def("modulated_deformable_conv2d_backward", &modulated_deformable_conv2d_backward); + + // npu_geometric_kernel_attention_func + m.def("npu_geometric_kernel_attention", &npu_geometric_kernel_attention); + 
m.def("npu_geometric_kernel_attention_backward", &npu_geometric_kernel_attention_backward); + + // group_points + m.def("group_points", &group_points); + m.def("group_points_backward", &group_points_backward); + + // vec_pool + m.def("vec_pool_backward", &vec_pool_backward); + + m.def("point_to_voxel", &point_to_voxel); + + m.def("voxel_to_point", &voxel_to_point); + + m.def("unique_voxel", &unique_voxel); + + m.def("hard_voxelize", &hard_voxelize); + + // bev_pool + m.def("npu_bev_pool", &npu_bev_pool, "npu_bev_pool NPU version"); + m.def("npu_bev_pool_backward", &npu_bev_pool_backward, "npu_bev_pool_backward NPU version"); + m.def("npu_bev_pool_v2", &npu_bev_pool_v2, "npu_bev_pool_v2 NPU version"); + m.def("npu_bev_pool_v2_backward", &npu_bev_pool_v2_backward, "npu_bev_pool_v2_backward NPU version"); + m.def("npu_bev_pool_v3", &npu_bev_pool_v3, "npu_bev_pool_v3 NPU version"); + m.def("npu_bev_pool_v3_backward", &npu_bev_pool_v3_backward, "npu_bev_pool_v3_backward NPU version"); + + // furthest_points_sampling_with_dist + m.def("furthest_point_sampling_with_dist", &furthest_point_sampling_with_dist); + + // npu_dynamic_scatter + m.def("npu_dynamic_scatter", &npu_dynamic_scatter); + m.def("npu_dynamic_scatter_grad", &npu_dynamic_scatter_grad); + + // dyn_voxelization + m.def("dynamic_voxelization", &dynamic_voxelization); + + // npu_furthest_point_sampling + m.def("npu_furthest_point_sampling", &npu_furthest_point_sampling); + + // voxel_pooling + m.def("voxel_pooling_train", &voxel_pooling_train); + m.def("voxel_pool_train_backward", &voxel_pool_train_backward); + + // npu_points_in_box + m.def("npu_points_in_box", &npu_points_in_box); + + // npu_points_in_box_all + m.def("npu_points_in_box_all", &npu_points_in_box_all); + + // npu_roipoint_pool3d_forward + m.def("npu_roipoint_pool3d_forward", &npu_roipoint_pool3d_forward); + + // npu_subm_sparse_conv3d + m.def("npu_subm_sparse_conv3d", &npu_subm_sparse_conv3d); + + // npu_sparse_conv3d + m.def("npu_sparse_conv3d", &npu_sparse_conv3d); + + // npu_sparse_inverse_conv3d + m.def("npu_sparse_inverse_conv3d", &npu_sparse_inverse_conv3d); + + // multi_to_sparse + m.def("multi_to_sparse", &multi_to_sparse); + + // multi_to_sparse_v2 + m.def("multi_to_sparse_v2", &multi_to_sparse_v2); + + // npu_sparse_conv3d_grad + m.def("npu_sparse_conv3d_grad", &npu_sparse_conv3d_grad); + + m.def("npu_prepare_subm_conv3d", &npu_prepare_subm_conv3d); +} diff --git a/mx_driving/common/ops/csrc/scatterMean.cpp b/mx_driving/csrc/scatterMean.cpp similarity index 88% rename from mx_driving/common/ops/csrc/scatterMean.cpp rename to mx_driving/csrc/scatterMean.cpp index ff188e4a7b14b20012020dee5d5a089abbaf6199..9df865e18e44dc6363b77c398862d2a7b55b0224 100644 --- a/mx_driving/common/ops/csrc/scatterMean.cpp +++ b/mx_driving/csrc/scatterMean.cpp @@ -14,20 +14,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include "csrc/OpApiCommon.h" -#include "functions.h" - +#include "csrc/functions.h" using namespace std; -static void npu_scatter_mean_shape_check(const at::Tensor& src, const at::Tensor& indices, const at::Tensor& out, int dim, int max_index) +static void npu_scatter_mean_shape_check( + const at::Tensor& src, const at::Tensor& indices, const at::Tensor& out, int dim, int max_index) { auto src_size = src.sizes(); auto out_size = out.sizes(); auto indices_size = indices.sizes(); auto indices_dim = indices.dim(); - TORCH_CHECK(dim < indices_dim, "Dimension out of range, dim expected to be in range of [", -indices_dim, ", ", indices_dim-1, "], but got ", dim); + TORCH_CHECK(dim < indices_dim, "Dimension out of range, dim expected to be in range of [", -indices_dim, ", ", + indices_dim - 1, "], but got ", dim); TORCH_CHECK(src.dim() == out.dim(), "out's dimension should be equal to src's dimension."); TORCH_CHECK(src.dim() >= indices.dim(), "indices's dimension should not larger than src's dimension."); // shape of out and src @@ -66,9 +66,8 @@ static int32_t get_available_dimnum(const at::Tensor& indices) return indices_dim - last_indices_dim; } -std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& indices, - c10::optional out, c10::optional dim, - c10::optional dim_size) +std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& indices, c10::optional out, + c10::optional dim, c10::optional dim_size) { TORCH_CHECK_NPU(src); TORCH_CHECK_NPU(indices); @@ -112,10 +111,7 @@ std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& at::Tensor count = at::zeros(out_trans.sizes(), src.options().dtype(at::kFloat)); EXEC_NPU_CMD(aclnnScatterMean, src, indices, out_trans, dim_input, out_trans, count); - count = at::where( - count == 0, - at::ones({}, count.options()), - count); + count = at::where(count == 0, at::ones({}, count.options()), count); out_trans = out_trans / count; out_trans = out_trans.transpose(true_dim, dim_input).contiguous(); @@ -130,4 +126,4 @@ std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& EXEC_NPU_CMD(aclnnScatterMeanDiv, true_out, count, true_out); return std::tie(true_out, count); } -} \ No newline at end of file +} diff --git a/mx_driving/detection/__init__.py b/mx_driving/detection.py similarity index 69% rename from mx_driving/detection/__init__.py rename to mx_driving/detection.py index f5605f4a8127548e01e39d14ce81bf89a349289b..12cac4f57087abe4cb2852ac321b9b76f1912517 100644 --- a/mx_driving/detection/__init__.py +++ b/mx_driving/detection.py @@ -1,3 +1,8 @@ +import warnings + +warnings.warn( + "This package is deprecated and will be removed in future. 
Please use `mx_driving.api` instead.", DeprecationWarning +) from .ops.boxes_overlap_bev import boxes_overlap_bev, npu_boxes_overlap_bev from .ops.nms3d_normal import npu_nms3d_normal from .ops.npu_nms3d import npu_nms3d @@ -7,4 +12,4 @@ from .ops.roi_align_rotated import roi_align_rotated from .ops.box_iou import box_iou_quadri from .ops.border_align import border_align from .ops.roiaware_pool3d import roiaware_pool3d -from .ops.pixel_group import pixel_group \ No newline at end of file +from .ops.pixel_group import pixel_group diff --git a/mx_driving/detection/CMakeLists.txt b/mx_driving/detection/CMakeLists.txt deleted file mode 100644 index 63ebf65165f490b26ec6fbb6cb034f1e8d947c59..0000000000000000000000000000000000000000 --- a/mx_driving/detection/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/detection/components/README.md b/mx_driving/detection/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/detection/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. \ No newline at end of file diff --git a/mx_driving/detection/ops/csrc/CMakeLists.txt b/mx_driving/detection/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/detection/ops/csrc/OWNERS b/mx_driving/detection/ops/csrc/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/detection/ops/csrc/README.md b/mx_driving/detection/ops/csrc/README.md deleted file mode 100644 index 0bbe4f394307b9d81004b5bd923e630eabd9a509..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some cpp source files, which provide code for adaptation of ascend kernels. It provide links for kernels and cpp interfaces. \ No newline at end of file diff --git a/mx_driving/detection/ops/csrc/functions.h b/mx_driving/detection/ops/csrc/functions.h deleted file mode 100644 index ba1d8a236b1e5fb718fcb69d1fb84de2ea7768c4..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/functions.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ - -#include -#include - -std::tuple nms3d_normal(const at::Tensor& boxes, double nms_overlap_thresh); - -std::tuple nms3d(const at::Tensor& boxes, double threshold); - -at::Tensor npu_rotated_overlaps(const at::Tensor& self, const at::Tensor& query_boxes, bool trans); - -at::Tensor npu_rotated_iou(const at::Tensor& boxes, const at::Tensor& query_boxes, bool trans, int64_t mode, - bool is_cross, double v_threshold, double e_threshold); - -at::Tensor npu_boxes_overlap_bev(const at::Tensor &boxes_a, const at::Tensor &boxes_b); - -void roi_align_rotated_v2_forward_npu(const at::Tensor& input, const at::Tensor& rois_map, at::Tensor& output, - double spatial_scale, int32_t sampling_ratio, int32_t pooled_height, int32_t pooled_width, - bool aligned, bool clockwise); -at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, - const at::Tensor& rois, const at::Tensor& grad_output, - int32_t pooled_height, int32_t pooled_width, double spatial_scale, - int32_t sampling_ratio, bool aligned, bool clockwise); - -at::Tensor npu_box_iou_quadri(const at::Tensor &boxes_a, const at::Tensor &boxes_b, - const int64_t mode_flag, const bool aligned); - -at::Tensor npu_box_iou_rotated(const at::Tensor &boxes_a, const at::Tensor &boxes_b, - const int64_t mode_flag, const bool aligned); - -void border_align_forward_npu(const at::Tensor& input, const at::Tensor& rois, at::Tensor& output, const int32_t pooled_size); - -at::Tensor border_align_backward(const at::Tensor& grad_out, const at::Tensor& boxes, const at::Tensor& argmax_idx, - int32_t pool_size, int32_t height, int32_t width); - -void npu_roiaware_pool3d_forward(const at::Tensor& rois, const at::Tensor& pts, const at::Tensor& pts_feature, - at::Tensor& argmax, at::Tensor& pts_idx_of_voxels, at::Tensor& pooled_features, int32_t mode); -at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::Tensor& argmax, - const at::Tensor& grad_out, int32_t npoints, int64_t pool_method); - -std::vector> pixel_group(const at::Tensor& score, const at::Tensor& mask, const at::Tensor& embedding, - const at::Tensor& kernel_label, const at::Tensor& kernel_contour, - int kernel_region_num, double distance_threshold); -#endif // PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/detection/ops/csrc/pybind.cpp b/mx_driving/detection/ops/csrc/pybind.cpp deleted file mode 100644 index 18f1e90b1984c19fc9694612f4d9bb01e90b4d9c..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/pybind.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include "csrc/pybind.h" -#include "functions.h" - -void init_detection(pybind11::module& m) -{ - // nms3d_normal - m.def("nms3d_normal", &nms3d_normal); - - // nms3d - m.def("nms3d", &nms3d); - - // roated overlap - m.def("npu_rotated_overlaps", &npu_rotated_overlaps, "npu_rotated_overlap NPU version"); - - // rotated iou - m.def("npu_rotated_iou", &npu_rotated_iou); - - // npu_boxes_overlap_bev - m.def("npu_boxes_overlap_bev", &npu_boxes_overlap_bev, "boxes_overlap_bev NPU 
version"); - - // roi_align_rotated_v2_forward_npu - m.def("roi_align_rotated_v2_forward_npu", &roi_align_rotated_v2_forward_npu); - - // npu_roi_align_rotated_grad_v2 - m.def("npu_roi_align_rotated_grad_v2", &npu_roi_align_rotated_grad_v2); - - // npu_box_iou_quadri - m.def("npu_box_iou_quadri", &npu_box_iou_quadri, "box_iou_quadri NPU version"); - - // npu_box_iou_rotated - m.def("npu_box_iou_rotated", &npu_box_iou_rotated, "box_iou_rotated NPU version"); - - // border_align_forward_npu - m.def("border_align_forward_npu", &border_align_forward_npu); - - // border_align_backward_npu - m.def("border_align_backward", &border_align_backward); - - // npu_roiaware_pool3d_forward - m.def("npu_roiaware_pool3d_forward", &npu_roiaware_pool3d_forward); - - // roiaware_pool3d_grad - m.def("roiaware_pool3d_grad", &roiaware_pool3d_grad, "roiaware_pool3d_grad NPU version"); - - // pixel_group - m.def("pixel_group", &pixel_group); -} diff --git a/mx_driving/detection/ops/kernels/CMakeLists.txt b/mx_driving/detection/ops/kernels/CMakeLists.txt deleted file mode 100644 index 179d9da23345abf75fb87954f266055922527742..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) - add_subdirectory(framework) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/detection/ops/kernels/README.md b/mx_driving/detection/ops/kernels/README.md deleted file mode 100644 index 214fb0a6d662e806bd7f6bdd1b8962bc1639026e..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some ascend-kernel source files, which are like cuda-kernels and supply some ops that can be run on ascend device. 
\ No newline at end of file diff --git a/mx_driving/detection/ops/kernels/op_host/CMakeLists.txt b/mx_driving/detection/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index 7e8c1aa351dc3e9bfa77dd39afa8885c55943c2b..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") - -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/detection/ops/kernels/op_host/OWNERS b/mx_driving/detection/ops/kernels/op_host/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.h b/mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.h deleted file mode 100644 index 9976c2486c9e45dc8d867da1f7ad8740f8dbc90b..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. - */ -#ifndef NMS3D_NORMAL_TILING_H -#define NMS3D_NORMAL_TILING_H - -#include "register/tilingdata_base.h" - -namespace optiling { -BEGIN_TILING_DATA_DEF(Nms3dNormalTilingData) - TILING_DATA_FIELD_DEF(uint32_t, usedCoreNum) // used cores - TILING_DATA_FIELD_DEF(uint32_t, boxNum) // count of boxes - TILING_DATA_FIELD_DEF(uint32_t, loopTime) // loop times - TILING_DATA_FIELD_DEF(uint32_t, eachSum) // count of each core, = loop_time * 8 - TILING_DATA_FIELD_DEF(uint32_t, tailSum) // count of tail core - TILING_DATA_FIELD_DEF(uint32_t, tailNum) // last time count of tail core - TILING_DATA_FIELD_DEF(uint32_t, maskNum) // mask align 32bit - TILING_DATA_FIELD_DEF(float, overlapThresh) -END_TILING_DATA_DEF; - -REGISTER_TILING_DATA_CLASS(Nms3dNormal, Nms3dNormalTilingData) -} // namespace optiling - -#endif // NMS3D_NORMAL_TILING_H diff --git a/mx_driving/detection/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/detection/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/detection/ops/onnx/__init__.py b/mx_driving/detection/ops/onnx/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/detection/ops/onnx/plugin/CMakeLists.txt b/mx_driving/detection/ops/onnx/plugin/CMakeLists.txt deleted file mode 100644 index cc6034bd1fe09a766aef52f69cf0bb348ceaf2b5..0000000000000000000000000000000000000000 --- 
a/mx_driving/detection/ops/onnx/plugin/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB ONNX_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_ONNX_SRC - ${ASCEND_ONNX_SRC} ${ONNX_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/__init__.py b/mx_driving/fused.py similarity index 71% rename from mx_driving/fused/__init__.py rename to mx_driving/fused.py index 80a2b2cf2531379a50c340c24fb8f9c2aa79f1cf..71d05810e1c58c23d07c83c7a52818c788cfd327 100644 --- a/mx_driving/fused/__init__.py +++ b/mx_driving/fused.py @@ -1,3 +1,8 @@ +import warnings + +warnings.warn( + "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.", DeprecationWarning +) from .ops.deform_conv2d import DeformConv2dFunction, deform_conv2d from .ops.fused_bias_leaky_relu import npu_fused_bias_leaky_relu from .ops.modulated_deform_conv2d import (ModulatedDeformConv2dFunction, @@ -8,4 +13,4 @@ from .ops.multi_scale_deformable_attn import ( from .ops.npu_add_relu import npu_add_relu from .ops.npu_deformable_aggregation import npu_deformable_aggregation from .ops.npu_max_pool2d import npu_max_pool2d -from .ops.npu_geometric_kernel_attention_func import npu_geometric_kernel_attention_func +from .ops.npu_geometric_kernel_attention import npu_geometric_kernel_attention diff --git a/mx_driving/fused/CMakeLists.txt b/mx_driving/fused/CMakeLists.txt deleted file mode 100644 index 807aa0c667560bcf0d75c6c6a26369daa624e9de..0000000000000000000000000000000000000000 --- a/mx_driving/fused/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/fused/components/README.md b/mx_driving/fused/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/fused/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. 
\ No newline at end of file diff --git a/mx_driving/fused/ops/__init__.py b/mx_driving/fused/ops/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/fused/ops/csrc/CMakeLists.txt b/mx_driving/fused/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/ops/csrc/OWNERS b/mx_driving/fused/ops/csrc/OWNERS deleted file mode 100644 index 6d60158d26b6a9b3c818a73e78f09a6aa3700cf7..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/OWNERS +++ /dev/null @@ -1,8 +0,0 @@ -approvers: -- wangxiaoxin-sherie -- liu_zhi_xu -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/fused/ops/csrc/README.md b/mx_driving/fused/ops/csrc/README.md deleted file mode 100644 index 0bbe4f394307b9d81004b5bd923e630eabd9a509..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some cpp source files, which provide code for adaptation of ascend kernels. It provide links for kernels and cpp interfaces. \ No newline at end of file diff --git a/mx_driving/fused/ops/csrc/functions.h b/mx_driving/fused/ops/csrc/functions.h deleted file mode 100644 index 54a7c11468408f6aa73e1da92b3fc2fc8e1500e6..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/functions.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef PERCEPTION_FUSED_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_FUSED_OPS_CSRC_FUNCTIONS_H_ -#include -#include - -at::Tensor npu_max_pool2d(const at::Tensor& x, int kernel_size, int stride, int padding); - -at::Tensor multi_scale_deformable_attn(const at::Tensor& value, const at::Tensor& value_spatial_shapes, - const at::Tensor& value_level_start_index, const at::Tensor& sampling_locations, - const at::Tensor& attention_weights); - -std::tuple multi_scale_deformable_attn_backward(const at::Tensor& value, - const at::Tensor& value_spatial_shapes, const at::Tensor& value_level_start_index, - const at::Tensor& sampling_locations, const at::Tensor& attention_weights, const at::Tensor& grad_output); - -std::tuple multi_scale_deformable_attn_grad_v2(const at::Tensor& value, - const at::Tensor& shape, const at::Tensor& level_start_index, const at::Tensor& location_trans, - const at::Tensor& attn_weight_trans, const at::Tensor& grad_output); - -at::Tensor npu_add_relu(at::Tensor& x, const at::Tensor& y); - -at::Tensor npu_add_relu_grad(at::Tensor& self, at::Tensor& grad_output); -std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& index, c10::optional out, - c10::optional dim, c10::optional dim_size); - -at::Tensor fused_bias_leaky_relu( - const at::Tensor& x, const at::Tensor& bias, const double negative_slop, const double scale); - -at::Tensor deformable_aggregation(const at::Tensor& mc_ms_feat, const at::Tensor& spatial_shape, - const at::Tensor& scale_start_index, const at::Tensor& sampling_location, const at::Tensor& weights); -std::tuple deformable_aggregation_grad(const at::Tensor& mc_ms_feat, - const at::Tensor& spatial_shape, const at::Tensor& scale_start_index, const at::Tensor& sampling_location, - const at::Tensor& weights, const at::Tensor& grad_output, const at::Tensor& grad_mc_ms_feat, - const at::Tensor& grad_sampling_location, const at::Tensor& grad_weights); - -std::tuple deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, - const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - at::IntArrayRef dilation, int64_t groups, int64_t deformable_groups); - -std::tuple modulated_deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, - const at::Tensor& mask, const at::Tensor& weight, const c10::optional& bias_opt, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, - int64_t groups, int64_t deformable_groups, int64_t with_bias); - -std::tuple deformable_conv2d_backward(const at::Tensor& input, - const at::Tensor& weight, const at::Tensor& offset, const at::Tensor& offset_output, const at::Tensor& grad_y, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, - int64_t groups, int64_t deformable_groups); - -std::tuple modulated_deformable_conv2d_backward( - const at::Tensor& input, const at::Tensor& offset, const at::Tensor& mask, const at::Tensor& weight, - const c10::optional& bias_opt, const at::Tensor& offset_output, const at::Tensor& grad_y, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, - int64_t groups, int64_t deformable_groups, int64_t with_bias); - -at::Tensor npu_geometric_kernel_attention_func(const at::Tensor& value, const at::Tensor& spatial_shapes, - const at::Tensor& level_start_index, const at::Tensor& sampling_locations, const at::Tensor& attn_weights); - -std::tuple npu_geometric_kernel_attention_backward(const 
at::Tensor& value, - const at::Tensor& spatial_shapes, const at::Tensor& level_start_index, const at::Tensor& sampling_locations, - const at::Tensor& attn_weights, const at::Tensor& grad_output); -#endif // PERCEPTION_FUSED_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/fused/ops/csrc/pybind.cpp b/mx_driving/fused/ops/csrc/pybind.cpp deleted file mode 100644 index acbdc51f3a881923a1aff4edecdd9fdd732f6a70..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/pybind.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include "csrc/pybind.h" - -#include - -#include "functions.h" -void init_fused(pybind11::module& m) -{ - // nnpu_max_pool2d - m.def("npu_max_pool2d", &npu_max_pool2d); - // mullti_scale_deformable_attn - m.def("multi_scale_deformable_attn", &multi_scale_deformable_attn); - m.def("multi_scale_deformable_attn_backward", &multi_scale_deformable_attn_backward); - - // npu_add_relu - m.def("npu_add_relu", &npu_add_relu); - m.def("npu_add_relu_grad", &npu_add_relu_grad); - - // fused_bias_leaky_relu - m.def("fused_bias_leaky_relu", &fused_bias_leaky_relu); - - // npu_deformable_aggregation - m.def("npu_deformable_aggregation", &deformable_aggregation); - m.def("npu_deformable_aggregation_grad", &deformable_aggregation_grad); - - // deformable_conv2d - m.def("deformable_conv2d", &deformable_conv2d); - m.def("modulated_deformable_conv2d", &modulated_deformable_conv2d); - m.def("deformable_conv2d_backward", &deformable_conv2d_backward); - m.def("modulated_deformable_conv2d_backward", &modulated_deformable_conv2d_backward); - - // npu_geometric_kernel_attention_func - m.def("npu_geometric_kernel_attention_func", &npu_geometric_kernel_attention_func); - m.def("npu_geometric_kernel_attention_backward", &npu_geometric_kernel_attention_backward); -} diff --git a/mx_driving/fused/ops/kernels/CMakeLists.txt b/mx_driving/fused/ops/kernels/CMakeLists.txt deleted file mode 100644 index b77ac594c4df44bf8700a3b2fa1867984111f27a..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/fused/ops/kernels/README.md b/mx_driving/fused/ops/kernels/README.md deleted file mode 100644 index 214fb0a6d662e806bd7f6bdd1b8962bc1639026e..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some ascend-kernel source files, which are like cuda-kernels and supply some ops that can be run on ascend device. 
\ No newline at end of file diff --git a/mx_driving/fused/ops/kernels/op_host/CMakeLists.txt b/mx_driving/fused/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index 7e8c1aa351dc3e9bfa77dd39afa8885c55943c2b..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") - -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/ops/kernels/op_host/OWNERS b/mx_driving/fused/ops/kernels/op_host/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/fused/ops/kernels/op_host/common.h b/mx_driving/fused/ops/kernels/op_host/common.h deleted file mode 100644 index 4580dff5fd0b206d1b94383f160932c22d1cb8a9..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/op_host/common.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. - */ -#ifndef COMMON_H -#define COMMON_H - -#include "register/op_def_registry.h" -#include "tiling/platform/platform_ascendc.h" -#include "tiling/tiling_api.h" -#include "register/tilingdata_base.h" - -inline uint32_t ceil_multiple(uint32_t num, uint32_t block) -{ - if (block == 0) { - return 0; - } - return (num + block - 1) / block; -} - -inline uint32_t ceil_value(uint32_t num, uint32_t block) -{ - if (block == 0) { - return 0; - } - return ((num + block - 1) / block) * block; -} - -#endif // COMMON_H diff --git a/mx_driving/fused/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/fused/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/ops/onnx/__init__.py b/mx_driving/fused/ops/onnx/__init__.py deleted file mode 100644 index 3989a46992bc48cb7e9e30ca3cfe092a90d60ff2..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/onnx/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .wrapper_onnx_ops import NPUMultiScaleDeformableAttnOP - -onnx_msda = NPUMultiScaleDeformableAttnOP.apply diff --git a/mx_driving/fused/ops/onnx/plugin/CMakeLists.txt b/mx_driving/fused/ops/onnx/plugin/CMakeLists.txt deleted file mode 100644 index cc6034bd1fe09a766aef52f69cf0bb348ceaf2b5..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/onnx/plugin/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB ONNX_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_ONNX_SRC - ${ASCEND_ONNX_SRC} ${ONNX_SRC} - CACHE INTERNAL "") 
diff --git a/mx_driving/fused/ops/onnx/wrapper_onnx_ops.py b/mx_driving/fused/ops/onnx/wrapper_onnx_ops.py deleted file mode 100644 index 12b6baa60408ca9b3b3c330e6de7b7d625d1de5e..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/onnx/wrapper_onnx_ops.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional, List -import torch -from torch import Tensor -import torch.onnx.symbolic_helper as sym_help -import mx_driving.fused - - -class NPUMultiScaleDeformableAttnOP(torch.autograd.Function): - @staticmethod - def forward(ctx, *args, **kwargs): - return mx_driving.fused.multi_scale_deformable_attn(*args, **kwargs) - - @staticmethod - # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def symbolic(g, value: Tensor, value_spatial_shapes: Tensor, value_level_start_index: Tensor, - sampling_locations: Tensor, attention_weights: Tensor): - return g.op("npu::MultiScaleDeformableAttn", - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights) diff --git a/mx_driving/common/ops/__init__.py b/mx_driving/modules/__init__.py similarity index 100% rename from mx_driving/common/ops/__init__.py rename to mx_driving/modules/__init__.py diff --git a/mx_driving/modules/roi_point_pool_3d.py b/mx_driving/modules/roi_point_pool_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..3efed8ed43fdb92cae7542f14c0b48bdf5092189 --- /dev/null +++ b/mx_driving/modules/roi_point_pool_3d.py @@ -0,0 +1,12 @@ +from torch.nn import Module + +from ..ops.npu_roipoint_pool3d import roipoint_pool3d + + +class RoIPointPool3d(Module): + def __init__(self, num_sampled_points: int = 512): + super().__init__() + self.num_sampled_points = num_sampled_points + + def forward(self, points, point_features, boxes3d): + return roipoint_pool3d(self.num_sampled_points, points, point_features, boxes3d) diff --git a/mx_driving/spconv/ops/sparse_conv.py b/mx_driving/modules/sparse_conv.py similarity index 53% rename from mx_driving/spconv/ops/sparse_conv.py rename to mx_driving/modules/sparse_conv.py index 8f1304d56856d027b20360bf4c50a49140729a0b..06576e113a3d6405ce4488ddaebd8416f5593d98 100644 --- a/mx_driving/spconv/ops/sparse_conv.py +++ b/mx_driving/modules/sparse_conv.py @@ -21,17 +21,58 @@ from torch.nn import init from torch.nn.init import calculate_gain from torch.nn.parameter import Parameter -from . import sparse_functional as Fsp -from . 
import sparse_ops as ops +from ..ops import sparse_functional as Fsp from .sparse_modules import SparseModule from .sparse_structure import SparseConvTensor +def get_conv_output_size(input_size, kernel_size, stride, padding, dilation): + ndim = len(input_size) + output_size = [] + for i in range(ndim): + size = (input_size[i] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) // stride[i] + 1 + if kernel_size[i] == -1: + output_size.append(1) + else: + output_size.append(size) + return output_size + + +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def get_inverse_conv_output_size(input_size, kernel_size, stride, padding, dilation, output_padding): + ndim = len(input_size) + output_size = [] + for i in range(ndim): + size = ( + (input_size[i] - 1) * stride[i] + - 2 * padding[i] + + dilation[i] * (kernel_size[i] - 1) + + output_padding[i] + + 1 + ) + if kernel_size[i] == -1: + output_size.append(1) + else: + output_size.append(size) + return output_size + + +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation, output_padding): + ndim = len(input_size) + output_size = [] + for i in range(ndim): + if kernel_size[i] == -1: + raise ValueError("deconv don't support kernel_size < 0") + size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[i] + output_padding[i] + output_size.append(size) + return output_size + + def _calculate_fan_in_and_fan_out_hwio(tensor): dimensions = tensor.ndimension() if dimensions < 2: - raise ValueError('fan in and fan out can not be computed for tensor' - 'with fewer than 2 dimensions') + raise ValueError("fan in and fan out can not be computed for tensor" "with fewer than 2 dimensions") if dimensions == 2: # Linear fan_in = tensor.size(-2) @@ -49,25 +90,26 @@ def _calculate_fan_in_and_fan_out_hwio(tensor): class SparseConvolution(SparseModule): - - # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def __init__(self, - ndim, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - subm=False, - output_padding=0, - transposed=False, - inverse=False, - indice_key=None, - fused_bn=False, - mode='mmcv'): + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def __init__( + self, + ndim, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + subm=False, + output_padding=0, + transposed=False, + inverse=False, + indice_key=None, + fused_bn=False, + mode="mmcv", + ): super().__init__() if groups != 1: raise RuntimeError("do not support group == 1") @@ -103,17 +145,16 @@ class SparseConvolution(SparseModule): self.fused_bn = fused_bn self.mode = mode - self.weight = Parameter( - torch.Tensor(*kernel_size, in_channels, out_channels)) + self.weight = Parameter(torch.Tensor(*kernel_size, in_channels, out_channels)) if bias: self.bias = Parameter(torch.Tensor(out_channels)) else: - self.register_parameter('bias', None) + self.register_parameter("bias", None) self.reset_parameters() def reset_parameters(self): fan_in, fan_out = _calculate_fan_in_and_fan_out_hwio(self.weight) - if self.mode == 'mmcv': + if self.mode == "mmcv": init.kaiming_uniform_(self.weight, a=math.sqrt(5)) else: self._custom_kaiming_uniform_(self.weight, a=math.sqrt(5), fan_in=fan_in, fan_out=fan_out) @@ -124,80 +165,111 @@ class SparseConvolution(SparseModule): bound = 1 / math.sqrt(fan_in) init.uniform_(self.bias, -bound, bound) - def 
_custom_kaiming_uniform_(self, - tensor, - a=0, - fan_in=0, - fan_out=0, - mode='fan_in', - nonlinearity='leaky_relu'): + def _custom_kaiming_uniform_(self, tensor, a=0, fan_in=0, fan_out=0, mode="fan_in", nonlinearity="leaky_relu"): fan = 0.0 - if mode == 'fan_in': + if mode == "fan_in": fan = float(fan_in) - elif mode == 'fan_out': + elif mode == "fan_out": fan = float(fan_out) gain = calculate_gain(nonlinearity, a) std = gain / math.sqrt(fan) bound = math.sqrt(3.0) * std with torch.no_grad(): tensor.uniform_(-bound, bound) - tensor.data = tensor.data.reshape(self.out_channels, np.prod(self.kernel_size) * self.in_channels).transpose(-1, -2).contiguous() + tensor.data = ( + tensor.data.reshape(self.out_channels, np.prod(self.kernel_size) * self.in_channels) + .transpose(-1, -2) + .contiguous() + ) tensor.data = tensor.data.reshape(*self.kernel_size, self.in_channels, self.out_channels) def forward(self, input): if not isinstance(input, SparseConvTensor): raise RuntimeError("input is not SparseConvTensor") if self.inverse: - out_spatial_shape = ops.get_inverse_conv_output_size( - input.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding) + out_spatial_shape = get_inverse_conv_output_size( + input.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding + ) out_spatial_shape = [int(i) for i in out_spatial_shape] if not isinstance(out_spatial_shape, list): out_spatial_shape = out_spatial_shape.tolist() - out_features, outidx = Fsp.indice_inverse_conv(input.features, input.indices, self.weight, out_spatial_shape, - self.out_channels, input.batch_size, - self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding, - self.groups, self.bias) + out_features, outidx = Fsp.indice_inverse_conv( + input.features, + input.indices, + self.weight, + out_spatial_shape, + self.out_channels, + input.batch_size, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.output_padding, + self.groups, + self.bias, + ) elif not self.subm: - out_spatial_shape = ops.get_conv_output_size( - input.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation) + out_spatial_shape = get_conv_output_size( + input.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation + ) out_spatial_shape = [int(i) for i in out_spatial_shape] if not isinstance(out_spatial_shape, list): out_spatial_shape = out_spatial_shape.tolist() - out_features, outidx = Fsp.indice_conv(input.features, input.indices, self.weight, out_spatial_shape, - self.out_channels, input.batch_size, - self.kernel_size, self.stride, self.padding, self.dilation, - self.groups, self.bias) + out_features, outidx = Fsp.indice_conv( + input.features, + input.indices, + self.weight, + out_spatial_shape, + self.out_channels, + input.batch_size, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.groups, + self.bias, + ) else: out_spatial_shape = input.spatial_shape out_spatial_shape = [int(i) for i in out_spatial_shape] if not isinstance(out_spatial_shape, list): out_spatial_shape = out_spatial_shape.tolist() - out_features, outidx = Fsp.indice_subm_conv(input.features, input.indices, self.weight, out_spatial_shape, - self.out_channels, input.batch_size, - self.kernel_size, self.stride, self.padding, self.dilation, - self.groups, self.bias) + out_features, outidx = Fsp.indice_subm_conv( + input.features, + input.indices, + self.weight, + out_spatial_shape, + self.out_channels, + input.batch_size, + 
self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.groups, + self.bias, + ) if self.bias is not None: out_features += self.bias - out_tensor = SparseConvTensor(out_features, outidx, out_spatial_shape, - input.batch_size) + out_tensor = SparseConvTensor(out_features, outidx, out_spatial_shape, input.batch_size) return out_tensor class SparseConv3d(SparseConvolution): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - indice_key=None, - mode='mmcv'): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None, + mode="mmcv", + ): super().__init__( 3, in_channels, @@ -209,22 +281,24 @@ class SparseConv3d(SparseConvolution): groups, bias, indice_key=indice_key, - mode=mode) + mode=mode, + ) class SubMConv3d(SparseConvolution): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - indice_key=None, - mode='mmcv'): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None, + mode="mmcv", + ): super().__init__( 3, in_channels, @@ -237,23 +311,25 @@ class SubMConv3d(SparseConvolution): bias, True, indice_key=indice_key, - mode=mode) + mode=mode, + ) class SparseInverseConv3d(SparseConvolution): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - inverse=True, - indice_key=None, - mode='mmcv'): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + inverse=True, + indice_key=None, + mode="mmcv", + ): super().__init__( 3, in_channels, @@ -267,4 +343,5 @@ class SparseInverseConv3d(SparseConvolution): subm=False, inverse=True, indice_key=indice_key, - mode=mode) \ No newline at end of file + mode=mode, + ) diff --git a/mx_driving/spconv/ops/sparse_modules.py b/mx_driving/modules/sparse_modules.py similarity index 100% rename from mx_driving/spconv/ops/sparse_modules.py rename to mx_driving/modules/sparse_modules.py diff --git a/mx_driving/spconv/ops/sparse_structure.py b/mx_driving/modules/sparse_structure.py similarity index 74% rename from mx_driving/spconv/ops/sparse_structure.py rename to mx_driving/modules/sparse_structure.py index 83907ab5563ff292e8c48715f5b1149a7d31f460..44ebf4627998638b2fbd404994277f129e9d1300 100644 --- a/mx_driving/spconv/ops/sparse_structure.py +++ b/mx_driving/modules/sparse_structure.py @@ -4,8 +4,7 @@ import numpy as np import torch -def scatter_nd(indices: torch.Tensor, updates: torch.Tensor, - shape: torch.Tensor) -> torch.Tensor: +def scatter_nd(indices: torch.Tensor, updates: torch.Tensor, shape: torch.Tensor) -> torch.Tensor: """pytorch edition of tensorflow scatter_nd. this function don't contain except handle code. 
so use this carefully when @@ -13,7 +12,7 @@ def scatter_nd(indices: torch.Tensor, updates: torch.Tensor, """ ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device) ndim = indices.shape[-1] - output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:] + output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1] :] flatted_indices = indices.view(-1, ndim) slices = [flatted_indices[:, i] for i in range(ndim)] slices += [Ellipsis] @@ -22,13 +21,14 @@ def scatter_nd(indices: torch.Tensor, updates: torch.Tensor, class SparseConvTensor: - - def __init__(self, - features: torch.Tensor, - indices: torch.Tensor, - spatial_shape: Union[List, Tuple], - batch_size: int, - grid: Optional[torch.Tensor] = None): + def __init__( + self, + features: torch.Tensor, + indices: torch.Tensor, + spatial_shape: Union[List, Tuple], + batch_size: int, + grid: Optional[torch.Tensor] = None, + ): self.features = features self.indices = indices if self.indices.dtype != torch.int32: @@ -50,8 +50,7 @@ class SparseConvTensor: return None def dense(self, channels_first: bool = True) -> torch.Tensor: - output_shape = [self.batch_size] + list( - self.spatial_shape) + [self.features.shape[1]] + output_shape = [self.batch_size] + list(self.spatial_shape) + [self.features.shape[1]] res = scatter_nd(self.indices.long(), self.features, output_shape) if not channels_first: return res @@ -62,5 +61,4 @@ class SparseConvTensor: @property def sparity(self): - return (self.indices.shape[0] / np.prod(self.spatial_shape) / - self.batch_size) + return self.indices.shape[0] / np.prod(self.spatial_shape) / self.batch_size diff --git a/mx_driving/modules/voxelization.py b/mx_driving/modules/voxelization.py new file mode 100644 index 0000000000000000000000000000000000000000..eaa128233792e3b43c617012bb8ed06053616adb --- /dev/null +++ b/mx_driving/modules/voxelization.py @@ -0,0 +1,23 @@ +import torch +from torch.nn import Module +from torch.nn.modules.utils import _pair + +from ..ops.voxelization import voxelization + + +class Voxelization(Module): + def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels=20000, deterministic=True): + super().__init__() + + self.voxel_size = voxel_size + self.point_cloud_range = point_cloud_range + self.max_num_points = max_num_points + self.max_voxels = max_voxels + self.max_voxels = max_voxels if isinstance(max_voxels, tuple) else _pair(max_voxels) + self.deterministic = deterministic + + def forward(self, points: torch.Tensor): + max_voxels = self.max_voxels[0] if self.training else self.max_voxels[1] + return voxelization( + points, self.voxel_size, self.point_cloud_range, self.max_num_points, max_voxels, self.deterministic + ) diff --git a/mx_driving/detection/ops/__init__.py b/mx_driving/ops/__init__.py similarity index 100% rename from mx_driving/detection/ops/__init__.py rename to mx_driving/ops/__init__.py diff --git a/mx_driving/common/ops/assign_score_withk.py b/mx_driving/ops/assign_score_withk.py similarity index 97% rename from mx_driving/common/ops/assign_score_withk.py rename to mx_driving/ops/assign_score_withk.py index f17773adb7b4d16da5e9ee8cafd0cc32618c9a82..7f05125fc3bc58cd187a36f0eec9086c8f0d20d4 100644 --- a/mx_driving/common/ops/assign_score_withk.py +++ b/mx_driving/ops/assign_score_withk.py @@ -10,7 +10,6 @@ Modification 1. 
Add support for Ascend NPU import torch import torch_npu from torch.autograd import Function -from torch.nn import Module import mx_driving._C diff --git a/mx_driving/point/ops/bev_pool.py b/mx_driving/ops/bev_pool.py similarity index 100% rename from mx_driving/point/ops/bev_pool.py rename to mx_driving/ops/bev_pool.py diff --git a/mx_driving/point/ops/bev_pool_v2.py b/mx_driving/ops/bev_pool_v2.py similarity index 100% rename from mx_driving/point/ops/bev_pool_v2.py rename to mx_driving/ops/bev_pool_v2.py diff --git a/mx_driving/point/ops/bev_pool_v3.py b/mx_driving/ops/bev_pool_v3.py similarity index 100% rename from mx_driving/point/ops/bev_pool_v3.py rename to mx_driving/ops/bev_pool_v3.py diff --git a/mx_driving/detection/ops/border_align.py b/mx_driving/ops/border_align.py similarity index 94% rename from mx_driving/detection/ops/border_align.py rename to mx_driving/ops/border_align.py index f91c9ded76e7e8b4a5138c2b84b0d7a15e5139e2..6ecbfc48094ef93ddca62be37b61caa27ebcea94 100644 --- a/mx_driving/detection/ops/border_align.py +++ b/mx_driving/ops/border_align.py @@ -22,7 +22,7 @@ class BorderAlignFunction(Function): feature_map.device ) - mx_driving._C.border_align_forward_npu(feature_map, rois, output, ctx.pooled_size) + mx_driving._C.border_align(feature_map, rois, output, ctx.pooled_size) npu_outputs, index = output.max(dim=-2) npu_outputs = ( diff --git a/mx_driving/detection/ops/box_iou.py b/mx_driving/ops/box_iou.py similarity index 100% rename from mx_driving/detection/ops/box_iou.py rename to mx_driving/ops/box_iou.py diff --git a/mx_driving/detection/ops/boxes_overlap_bev.py b/mx_driving/ops/boxes_overlap_bev.py similarity index 100% rename from mx_driving/detection/ops/boxes_overlap_bev.py rename to mx_driving/ops/boxes_overlap_bev.py diff --git a/mx_driving/fused/ops/deform_conv2d.py b/mx_driving/ops/deform_conv2d.py similarity index 100% rename from mx_driving/fused/ops/deform_conv2d.py rename to mx_driving/ops/deform_conv2d.py diff --git a/mx_driving/point/ops/furthest_point_sampling.py b/mx_driving/ops/furthest_point_sampling.py similarity index 85% rename from mx_driving/point/ops/furthest_point_sampling.py rename to mx_driving/ops/furthest_point_sampling.py index 708251dbf4e9e02ece44d1d8f7d73cfd16ce5ab9..59eb005f5c4a9c297dda837c88f00e5b3af8f571 100644 --- a/mx_driving/point/ops/furthest_point_sampling.py +++ b/mx_driving/ops/furthest_point_sampling.py @@ -6,12 +6,12 @@ Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU """ + import numpy as np import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -21,10 +21,10 @@ class AdsFurthestPointSampling(Function): B, N = point_xyz.size()[:2] point_xyz = point_xyz.permute(0, 2, 1).contiguous() - nearest_dist = torch.tensor(np.ones((B, N)) * 1e10, dtype=torch.float32, device='npu').contiguous() + nearest_dist = torch.tensor(np.ones((B, N)) * 1e10, dtype=torch.float32, device="npu").contiguous() output = mx_driving._C.npu_furthest_point_sampling(point_xyz, nearest_dist, num_points) return output -npu_furthest_point_sampling = AdsFurthestPointSampling.apply \ No newline at end of file +npu_furthest_point_sampling = AdsFurthestPointSampling.apply diff --git a/mx_driving/point/ops/furthest_point_sampling_with_dist.py b/mx_driving/ops/furthest_point_sampling_with_dist.py similarity index 90% rename from mx_driving/point/ops/furthest_point_sampling_with_dist.py rename to mx_driving/ops/furthest_point_sampling_with_dist.py index f56f104cf15c1836d11e3512d3f254a3015c7839..7c24bc970fee2d16bfa1e40ba3722f86afbd01e6 100644 --- a/mx_driving/point/ops/furthest_point_sampling_with_dist.py +++ b/mx_driving/ops/furthest_point_sampling_with_dist.py @@ -6,20 +6,21 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C class AdsFurthestPointSamplingWithDistFunction(Function): @staticmethod - def forward(ctx, points_dist, num_points): + def forward(ctx, points_dist, num_points): B, N = points_dist.size()[:2] nearest_temp = points_dist.new_zeros([B, N]).fill_(1e10) result = mx_driving._C.furthest_point_sampling_with_dist(points_dist, nearest_temp, num_points) return result + furthest_point_sample_with_dist = AdsFurthestPointSamplingWithDistFunction.apply diff --git a/mx_driving/fused/ops/fused_bias_leaky_relu.py b/mx_driving/ops/fused_bias_leaky_relu.py similarity index 65% rename from mx_driving/fused/ops/fused_bias_leaky_relu.py rename to mx_driving/ops/fused_bias_leaky_relu.py index 51a86fdf519d02bb6e42185370df0a701ee7cc82..03b88d3904be40cbd8d4bf228004a85e8abd6265 100644 --- a/mx_driving/fused/ops/fused_bias_leaky_relu.py +++ b/mx_driving/ops/fused_bias_leaky_relu.py @@ -5,22 +5,22 @@ Modification by: Huawei Developers Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU -""" +""" + import torch from torch.autograd import Function -import torch_npu import mx_driving._C -class FusedBiasLeakyRelu(Function): - +class FusedBiasLeakyReLU(Function): @staticmethod def forward(ctx, x, bias, negative_slope=0.2, scale=2**0.5): - bias = torch.broadcast_to(bias.to(x.dtype).reshape([-1 if i == 1 else 1 for i in range(x.ndim)]), - x.shape).contiguous() + bias = torch.broadcast_to( + bias.to(x.dtype).reshape([-1 if i == 1 else 1 for i in range(x.ndim)]), x.shape + ).contiguous() out = mx_driving._C.fused_bias_leaky_relu(x, bias, negative_slope, scale) return out -npu_fused_bias_leaky_relu = FusedBiasLeakyRelu.apply \ No newline at end of file +npu_fused_bias_leaky_relu = FusedBiasLeakyReLU.apply diff --git a/mx_driving/point/ops/group_points.py b/mx_driving/ops/group_points.py similarity index 75% rename from mx_driving/point/ops/group_points.py rename to mx_driving/ops/group_points.py index 523ef73a6b035557d304b0e8b62dec7cc25facee..d7122fec4336277f43c4128da5342cd48952c4e5 100644 --- a/mx_driving/point/ops/group_points.py +++ b/mx_driving/ops/group_points.py @@ -14,12 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import warnings -import torch + import numpy as np +import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -27,10 +27,7 @@ class AdsGroupPoints(Function): """Group feature with given index.""" @staticmethod - def forward( - ctx, - features: torch.Tensor, - indices: torch.Tensor): + def forward(ctx, features: torch.Tensor, indices: torch.Tensor): """ Args: features (Tensor): Tensor of features to group, input shape is (B, C, N). @@ -45,14 +42,7 @@ class AdsGroupPoints(Function): B, C, N = features.size() _, npoints, nsample = indices.size() - output = mx_driving._C.group_points( - features, - indices, - B, - C, - N, - npoints, - nsample) + output = mx_driving._C.group_points(features, indices, B, C, N, npoints, nsample) ctx.for_backwards = (indices, N) return output @@ -70,14 +60,7 @@ class AdsGroupPoints(Function): idx, N = ctx.for_backwards B, C, npoints, nsample = grad_out.size() - grad_features = mx_driving._C.group_points_backward( - grad_out, - idx, - B, - C, - N, - npoints, - nsample) + grad_features = mx_driving._C.group_points_backward(grad_out, idx, B, C, N, npoints, nsample) return grad_features, None @@ -86,5 +69,7 @@ def group_points(features: torch.Tensor, indices: torch.Tensor): def npu_group_points(features: torch.Tensor, indices: torch.Tensor): - warnings.warn("`npu_group_points` will be deprecated in future. Please use `group_points` instead.", DeprecationWarning) - return AdsGroupPoints.apply(features, indices) \ No newline at end of file + warnings.warn( + "`npu_group_points` will be deprecated in future. 
Please use `group_points` instead.", DeprecationWarning + ) + return AdsGroupPoints.apply(features, indices) diff --git a/mx_driving/common/ops/hypot.py b/mx_driving/ops/hypot.py similarity index 100% rename from mx_driving/common/ops/hypot.py rename to mx_driving/ops/hypot.py diff --git a/mx_driving/common/ops/knn.py b/mx_driving/ops/knn.py similarity index 64% rename from mx_driving/common/ops/knn.py rename to mx_driving/ops/knn.py index bbd9cd5ac84950d2427067747e8fd9bdf0676e6a..067c7b8f43193e40ebaf4535adde4bb03b969c8f 100644 --- a/mx_driving/common/ops/knn.py +++ b/mx_driving/ops/knn.py @@ -6,24 +6,23 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + from typing import Optional + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C -class AdsKnn(Function): +class Knn(Function): @staticmethod - def forward(ctx, - k: int, - xyz: torch.Tensor, - center_xyz: Optional[torch.Tensor] = None, - transposed: bool = False) -> torch.Tensor: + def forward( + ctx, k: int, xyz: torch.Tensor, center_xyz: Optional[torch.Tensor] = None, transposed: bool = False + ) -> torch.Tensor: if k <= 0 and k >= 100: - print('k should be in range (0, 100).') + print("k should be in range (0, 100).") return None if center_xyz is None: @@ -34,21 +33,21 @@ class AdsKnn(Function): else: xyz = xyz.transpose(2, 1).contiguous() - if not xyz.is_contiguous(): # [B, 3, N] + if not xyz.is_contiguous(): # [B, 3, N] return None - if not xyz.is_contiguous(): # [B, npoint, 3] + if not xyz.is_contiguous(): # [B, npoint, 3] return None if center_xyz.get_device() != xyz.get_device(): - print('center_xyz and xyz should be on the same device.') + print("center_xyz and xyz should be on the same device.") return None dist2, idx = mx_driving._C.knn(xyz, center_xyz, k, True) zeros_idx = torch.zeros(xyz.shape[0], center_xyz.shape[1], k, dtype=torch.int32).npu() idx.where(dist2 >= 1e10, zeros_idx) - idx = idx.transpose(2, 1).contiguous() # [B, k, npoint] + idx = idx.transpose(2, 1).contiguous() # [B, k, npoint] return idx.int() -knn = AdsKnn.apply +knn = Knn.apply diff --git a/mx_driving/fused/ops/modulated_deform_conv2d.py b/mx_driving/ops/modulated_deform_conv2d.py similarity index 100% rename from mx_driving/fused/ops/modulated_deform_conv2d.py rename to mx_driving/ops/modulated_deform_conv2d.py diff --git a/mx_driving/fused/ops/multi_scale_deformable_attn.py b/mx_driving/ops/multi_scale_deformable_attn.py similarity index 81% rename from mx_driving/fused/ops/multi_scale_deformable_attn.py rename to mx_driving/ops/multi_scale_deformable_attn.py index d2cdda814f272f7a5c04edbad2a79091ea9d7440..4c7ffef62799b07ac6ab7fb56f1904a5c6162a5e 100644 --- a/mx_driving/fused/ops/multi_scale_deformable_attn.py +++ b/mx_driving/ops/multi_scale_deformable_attn.py @@ -49,6 +49,25 @@ class MultiScaleDeformableAttnFunction(Function): ) return grad_value, None, None, grad_sampling_loc, grad_attn_weight + @staticmethod + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def symbolic( + g, + value: torch.Tensor, + value_spatial_shapes: torch.Tensor, + value_level_start_index: torch.Tensor, + sampling_locations: torch.Tensor, + attention_weights: torch.Tensor, + ): + return g.op( + "npu::MultiScaleDeformableAttn", + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) + multi_scale_deformable_attn = MultiScaleDeformableAttnFunction.apply diff --git 
a/mx_driving/detection/ops/nms3d_normal.py b/mx_driving/ops/nms3d_normal.py similarity index 90% rename from mx_driving/detection/ops/nms3d_normal.py rename to mx_driving/ops/nms3d_normal.py index c6b297cc9be74e518d0e057b7807b79e91dc6da5..d1751c07848f7faee693654eb28a29ccfda5cc56 100644 --- a/mx_driving/detection/ops/nms3d_normal.py +++ b/mx_driving/ops/nms3d_normal.py @@ -6,10 +6,11 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + import torch import torch_npu from torch.autograd import Function -from torch.nn import Module + import mx_driving._C @@ -17,11 +18,12 @@ class AdsNms3dNormalFunction(Function): @staticmethod def forward(ctx, boxes, scores, iou_threshold: float): if boxes.shape[1] != 7: - raise 'Input boxes shape should be (N, 7)' + raise "Input boxes shape should be (N, 7)" order = scores.sort(0, descending=True)[1] boxes = boxes[order].contiguous() keep, num_out = mx_driving._C.nms3d_normal(boxes, iou_threshold) return order[keep[:num_out].long()].contiguous() + npu_nms3d_normal = AdsNms3dNormalFunction.apply diff --git a/mx_driving/fused/ops/npu_add_relu.py b/mx_driving/ops/npu_add_relu.py similarity index 88% rename from mx_driving/fused/ops/npu_add_relu.py rename to mx_driving/ops/npu_add_relu.py index 62366a79b1b05917054bfeb2578798fb81729a7f..dbb66145ac93761dc7d7c51c7c5d5eabf68f9e29 100644 --- a/mx_driving/fused/ops/npu_add_relu.py +++ b/mx_driving/ops/npu_add_relu.py @@ -5,13 +5,13 @@ Modification by: Huawei Developers Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU -""" +""" + import torch +import torch.nn.functional as F +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu -import torch.nn.functional as F import mx_driving._C @@ -27,8 +27,9 @@ class AddReluFunction(Function): @staticmethod def backward(ctx, grad_output): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors result = mx_driving._C.npu_add_relu_grad(x, grad_output) return result, result -npu_add_relu = AddReluFunction.apply \ No newline at end of file + +npu_add_relu = AddReluFunction.apply diff --git a/mx_driving/fused/ops/npu_deformable_aggregation.py b/mx_driving/ops/npu_deformable_aggregation.py similarity index 87% rename from mx_driving/fused/ops/npu_deformable_aggregation.py rename to mx_driving/ops/npu_deformable_aggregation.py index d6076fb4876ff68403f36ec7bb9384b67d0b9bcc..46de6fb9832f782022c299f95cccba97487d510b 100644 --- a/mx_driving/fused/ops/npu_deformable_aggregation.py +++ b/mx_driving/ops/npu_deformable_aggregation.py @@ -1,9 +1,8 @@ -import torch import numpy as np +import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -12,13 +11,13 @@ class AdsDeformableAggregation(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def forward( - ctx, - mc_ms_feat: torch.Tensor, - spatial_shape: torch.Tensor, - scale_start_index: torch.Tensor, - sampling_location: torch.Tensor, - weights: torch.Tensor): - + ctx, + mc_ms_feat: torch.Tensor, + spatial_shape: torch.Tensor, + scale_start_index: torch.Tensor, + sampling_location: torch.Tensor, + weights: torch.Tensor, + ): mc_ms_feat = mc_ms_feat.contiguous().float() spatial_shape = spatial_shape.contiguous().int() @@ -41,8 +40,8 @@ class AdsDeformableAggregation(Function): weights, ) return output - - @staticmethod + + @staticmethod def backward(ctx, grad_output): ( mc_ms_feat, @@ 
-60,7 +59,7 @@ class AdsDeformableAggregation(Function): grad_mc_ms_feat = torch.zeros_like(mc_ms_feat) grad_sampling_location = torch.zeros_like(sampling_location) grad_weights = torch.zeros_like(weights) - grad_mc_ms_feat, grad_sampling_location, grad_weights = mx_driving._C.npu_deformable_aggregation_grad( + grad_mc_ms_feat, grad_sampling_location, grad_weights = mx_driving._C.npu_deformable_aggregation_backward( mc_ms_feat, spatial_shape, scale_start_index, @@ -80,4 +79,5 @@ class AdsDeformableAggregation(Function): grad_weights, ) + npu_deformable_aggregation = AdsDeformableAggregation.apply diff --git a/mx_driving/point/ops/npu_dynamic_scatter.py b/mx_driving/ops/npu_dynamic_scatter.py similarity index 67% rename from mx_driving/point/ops/npu_dynamic_scatter.py rename to mx_driving/ops/npu_dynamic_scatter.py index 81ae7ff6f6722becaa4e766752137eb322a54c70..76fddcfcaca1f6d2ca95a4369ea52d3970f1b517 100644 --- a/mx_driving/point/ops/npu_dynamic_scatter.py +++ b/mx_driving/ops/npu_dynamic_scatter.py @@ -6,29 +6,33 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + from typing import Any, Optional, Tuple import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C class DynamicScatterFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx: Any, feats: torch.Tensor, coors: torch.Tensor, - reduce_type: str = 'max') -> Tuple[torch.Tensor, torch.Tensor]: - if reduce_type not in ('max', 'sum', 'mean'): + def forward( + ctx: Any, feats: torch.Tensor, coors: torch.Tensor, reduce_type: str = "max" + ) -> Tuple[torch.Tensor, torch.Tensor]: + if reduce_type not in ("max", "sum", "mean"): raise ValueError("reduce_type should be 'max', 'sum' or 'mean', but now is %s." 
% reduce_type) voxel_idx = mx_driving._C.point_to_voxel(coors, [], [], "XYZ") - num_voxels, uniqued_voxel_idx, prefix_sum_point_per_voxel, argsort_coor, _ = mx_driving._C.unique_voxel(voxel_idx) + num_voxels, uniqued_voxel_idx, prefix_sum_point_per_voxel, argsort_coor, _ = mx_driving._C.unique_voxel( + voxel_idx + ) voxel_coors = mx_driving._C.voxel_to_point(uniqued_voxel_idx, [], [], "XYZ") - voxel_feats, compare_mask = mx_driving._C.npu_dynamic_scatter(feats, coors, prefix_sum_point_per_voxel, - argsort_coor, num_voxels, reduce_type) + voxel_feats, compare_mask = mx_driving._C.npu_dynamic_scatter( + feats, coors, prefix_sum_point_per_voxel, argsort_coor, num_voxels, reduce_type + ) ctx.reduce_type = reduce_type ctx.feats_shape = feats.shape @@ -39,13 +43,17 @@ class DynamicScatterFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments # 'pylint: disable=too-many-return-arguments,huawei-too-many-return-arguments - def backward(ctx: Any, - grad_voxel_feats: torch.Tensor, - grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple: + def backward(ctx: Any, grad_voxel_feats: torch.Tensor, grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple: (prefix_sum_point_per_voxel, argsort_coor, compare_mask) = ctx.saved_tensors grad_point_feats = torch.zeros(ctx.feats_shape, dtype=grad_voxel_feats.dtype, device=grad_voxel_feats.device) - mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum_point_per_voxel, - argsort_coor, compare_mask, ctx.reduce_type) + mx_driving._C.npu_dynamic_scatter_grad( + grad_point_feats, + grad_voxel_feats.contiguous(), + prefix_sum_point_per_voxel, + argsort_coor, + compare_mask, + ctx.reduce_type, + ) return grad_point_feats, None, None diff --git a/mx_driving/fused/ops/npu_geometric_kernel_attention_func.py b/mx_driving/ops/npu_geometric_kernel_attention.py similarity index 87% rename from mx_driving/fused/ops/npu_geometric_kernel_attention_func.py rename to mx_driving/ops/npu_geometric_kernel_attention.py index 1819053207777bd65ce56c4bc4f3480de3aacb45..82663f11aaba9ac4e94d5e51a4d3170848c59b0f 100644 --- a/mx_driving/fused/ops/npu_geometric_kernel_attention_func.py +++ b/mx_driving/ops/npu_geometric_kernel_attention.py @@ -9,7 +9,7 @@ class GeometricKernelAttentionFunc(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def forward(ctx, value, spatial_shapes, level_start_index, sampling_locations, attn_weights): - result = mx_driving._C.npu_geometric_kernel_attention_func( + result = mx_driving._C.npu_geometric_kernel_attention( value, spatial_shapes, level_start_index, sampling_locations, attn_weights ) ctx.save_for_backward(value, spatial_shapes, level_start_index, sampling_locations, attn_weights) @@ -24,4 +24,4 @@ class GeometricKernelAttentionFunc(Function): return grad_value, None, None, None, grad_attn_weights -npu_geometric_kernel_attention_func = GeometricKernelAttentionFunc.apply +npu_geometric_kernel_attention = GeometricKernelAttentionFunc.apply diff --git a/mx_driving/fused/ops/npu_max_pool2d.py b/mx_driving/ops/npu_max_pool2d.py similarity index 100% rename from mx_driving/fused/ops/npu_max_pool2d.py rename to mx_driving/ops/npu_max_pool2d.py diff --git a/mx_driving/detection/ops/npu_nms3d.py b/mx_driving/ops/npu_nms3d.py similarity index 100% rename from mx_driving/detection/ops/npu_nms3d.py rename to mx_driving/ops/npu_nms3d.py diff --git a/mx_driving/preprocess/ops/npu_points_in_box.py b/mx_driving/ops/npu_points_in_box.py 
similarity index 89% rename from mx_driving/preprocess/ops/npu_points_in_box.py rename to mx_driving/ops/npu_points_in_box.py index 056df051629a1a69daa0e3bbe31eefad04831f7c..540a5b77224ab79cd66cf911eac001e3335028bd 100644 --- a/mx_driving/preprocess/ops/npu_points_in_box.py +++ b/mx_driving/ops/npu_points_in_box.py @@ -6,11 +6,11 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -25,4 +25,5 @@ class PointsInBoxFunction(Function): def backward(ctx, grad_output): return None -npu_points_in_box = PointsInBoxFunction.apply \ No newline at end of file + +npu_points_in_box = PointsInBoxFunction.apply diff --git a/mx_driving/preprocess/ops/npu_points_in_box_all.py b/mx_driving/ops/npu_points_in_box_all.py similarity index 78% rename from mx_driving/preprocess/ops/npu_points_in_box_all.py rename to mx_driving/ops/npu_points_in_box_all.py index 8f31e175765c512dfd912f7f78efb8216cf152f9..d27ff9095883fa94a6457baeff2a3ba34c8c00a2 100644 --- a/mx_driving/preprocess/ops/npu_points_in_box_all.py +++ b/mx_driving/ops/npu_points_in_box_all.py @@ -6,12 +6,13 @@ Modification date: 2024-07-24 Modification Description: Modification 1. Add support for Ascend NPU """ + import warnings + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -32,5 +33,8 @@ def points_in_boxes_all(boxes, pts): def npu_points_in_box_all(boxes, pts): - warnings.warn("`npu_points_in_box_all` will be deprecated in future. Please use `points_in_boxes_all` instead.", DeprecationWarning) - return PointsInBoxAllFunction.apply(boxes, pts) \ No newline at end of file + warnings.warn( + "`npu_points_in_box_all` will be deprecated in future. Please use `points_in_boxes_all` instead.", + DeprecationWarning, + ) + return PointsInBoxAllFunction.apply(boxes, pts) diff --git a/mx_driving/preprocess/ops/npu_roipoint_pool3d.py b/mx_driving/ops/npu_roipoint_pool3d.py similarity index 59% rename from mx_driving/preprocess/ops/npu_roipoint_pool3d.py rename to mx_driving/ops/npu_roipoint_pool3d.py index eae3c1104c5348d06cb92d37c3af39b87aec7530..116a81979d29b1ae8d631f0324d3bc5059e84492 100644 --- a/mx_driving/preprocess/ops/npu_roipoint_pool3d.py +++ b/mx_driving/ops/npu_roipoint_pool3d.py @@ -6,10 +6,11 @@ Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU """ + import torch -from torch.autograd import Function -from torch.nn import Module import torch_npu +from torch.autograd import Function + import mx_driving._C @@ -17,19 +18,19 @@ class RoipointPool3dFunction(Function): @staticmethod def forward(ctx, num_sampled_points, points, point_features, boxes3d): if num_sampled_points <= 0: - raise Exception('Input num_sampled_points be more than 0') + raise Exception("Input num_sampled_points should be more than 0") if (points.size(0) != point_features.size(0)) or (points.size(0) != boxes3d.size(0)): - raise Exception('Input points/point_features/boxes3d shape should be (B, x, x)') + raise Exception("Input points/point_features/boxes3d shape should be (B, x, x)") if (len(points.shape) != 3) or (points.size(2) != 3): - raise Exception('Input points shape should be (B, N, 3)') + raise Exception("Input points shape should be (B, N, 3)") if (len(point_features.shape) != 3) or (points.size(1) != point_features.size(1)): - raise Exception('Input point_features shape should be (B, N, C)') + raise Exception("Input point_features shape should be (B, N, C)") if (len(boxes3d.shape) != 3) or (boxes3d.size(2) != 7): - raise Exception('Input boxes3d shape should be (B, M, 7)') + raise Exception("Input boxes3d shape should be (B, M, 7)") if (points.dtype != point_features.dtype) or (points.dtype != boxes3d.dtype): - raise Exception('Input points/point_features/boxes3d dtype should be the same.') - if (points.device.type != 'npu') or (point_features.device.type != 'npu') or (boxes3d.device.type != 'npu'): - raise ValueError('The device is not npu!') + raise Exception("Input points/point_features/boxes3d dtype should be the same.") + if (points.device.type != "npu") or (point_features.device.type != "npu") or (boxes3d.device.type != "npu"): + raise ValueError("The device is not npu!") # points: (B, N, 3) input points # point_features: (B, N, C) input point features # boxes3d: (B, M, 7) bounding boxes @@ -42,18 +43,10 @@ class RoipointPool3dFunction(Function): feature_len = point_features.size(2) # pooled_features = points.new_zeros((batch_size, boxes_num, num_sampled_points, 3 + feature_len)) # pooled_empty_flag = points.new_zeros((batch_size, boxes_num), dtype=torch.int) - pooled_features, pooled_empty_flag = \ - mx_driving._C.npu_roipoint_pool3d_forward(num_sampled_points, points, point_features, boxes3d) + pooled_features, pooled_empty_flag = mx_driving._C.npu_roipoint_pool3d_forward( + num_sampled_points, points, point_features, boxes3d + ) return pooled_features, pooled_empty_flag roipoint_pool3d = RoipointPool3dFunction.apply - - -class RoipointPool3d(Module): - def __init__(self, num_sampled_points: int = 512): - super().__init__() - self.num_sampled_points = num_sampled_points - - def forward(self, points, point_features, boxes3d): - return RoipointPool3dFunction.apply(self.num_sampled_points, points, point_features, boxes3d) diff --git a/mx_driving/detection/ops/pixel_group.py b/mx_driving/ops/pixel_group.py similarity index 100% rename from mx_driving/detection/ops/pixel_group.py rename to mx_driving/ops/pixel_group.py diff --git a/mx_driving/detection/ops/roi_align_rotated.py b/mx_driving/ops/roi_align_rotated.py similarity index 65% rename from mx_driving/detection/ops/roi_align_rotated.py rename to mx_driving/ops/roi_align_rotated.py index 3ed1c3eaf50419b5c57a0e6050b522e61624f8e4..e37aacc323d3646591d8a1e2d74883218d6b510f 100644 --- a/mx_driving/detection/ops/roi_align_rotated.py +++ b/mx_driving/ops/roi_align_rotated.py @@ -1,11 +1,12 @@ """ Copyright (c) 
OpenMMLab. All rights reserved. """ + from typing import Any, Optional, Tuple, Union import torch -import torch_npu import torch.nn as nn +import torch_npu from torch.autograd import Function import mx_driving._C @@ -13,8 +14,18 @@ import mx_driving._C class RoIAlignRotatedFunction(Function): @staticmethod - def forward(ctx: Any, feature_map: torch.Tensor, rois: torch.Tensor, spatial_scale: float, - sampling_ratio: int, pooled_height: int, pooled_width: int, aligned: bool = True, clockwise: bool = False) -> torch.Tensor: + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def forward( + ctx: Any, + feature_map: torch.Tensor, + rois: torch.Tensor, + spatial_scale: float, + sampling_ratio: int, + pooled_height: int, + pooled_width: int, + aligned: bool = True, + clockwise: bool = False, + ) -> torch.Tensor: ctx.pooled_height = pooled_height ctx.pooled_width = pooled_width ctx.spatial_scale = spatial_scale @@ -26,7 +37,9 @@ class RoIAlignRotatedFunction(Function): batch_size, num_channels, data_height, data_width = feature_map.size() num_rois = rois.size(0) - output = feature_map.new_zeros(num_rois, ctx.pooled_height, ctx.pooled_width, num_channels).to(feature_map.device) + output = feature_map.new_zeros(num_rois, ctx.pooled_height, ctx.pooled_width, num_channels).to( + feature_map.device + ) mx_driving._C.roi_align_rotated_v2_forward_npu( feature_map, @@ -37,21 +50,31 @@ class RoIAlignRotatedFunction(Function): ctx.pooled_height, ctx.pooled_width, ctx.aligned, - ctx.clockwise) + ctx.clockwise, + ) output = output.transpose(2, 3).transpose(1, 2).contiguous() return output - + @staticmethod + # pylint: disable=too-many-return-values def backward(ctx: Any, grad_output: torch.Tensor): feature_map, rois = ctx.saved_tensors rois_trans = torch.permute(rois, (1, 0)).contiguous() grad_output_trans = torch.permute(grad_output, (0, 2, 3, 1)).contiguous() grad_feature_map = mx_driving._C.npu_roi_align_rotated_grad_v2( - feature_map, rois_trans, grad_output_trans, - ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale, - ctx.sampling_ratio, ctx.aligned, ctx.clockwise) + feature_map, + rois_trans, + grad_output_trans, + ctx.pooled_height, + ctx.pooled_width, + ctx.spatial_scale, + ctx.sampling_ratio, + ctx.aligned, + ctx.clockwise, + ) grad_feature_map = grad_feature_map.permute(0, 3, 1, 2).contiguous() - + return grad_feature_map, None, None, None, None, None, None, None -roi_align_rotated = RoIAlignRotatedFunction.apply \ No newline at end of file + +roi_align_rotated = RoIAlignRotatedFunction.apply diff --git a/mx_driving/detection/ops/roiaware_pool3d.py b/mx_driving/ops/roiaware_pool3d.py similarity index 56% rename from mx_driving/detection/ops/roiaware_pool3d.py rename to mx_driving/ops/roiaware_pool3d.py index 7aa0279bc8f21547531166101de416795934b059..d355f4f13a1cc53c759d085a4fe97106325e858c 100644 --- a/mx_driving/detection/ops/roiaware_pool3d.py +++ b/mx_driving/ops/roiaware_pool3d.py @@ -6,11 +6,13 @@ Modification date: 2024-10-16 Modification Description: Modification 1. 
Add support for Ascend NPU """ + from typing import Any, Tuple, Union + import torch import torch_npu from torch.autograd import Function -from torch.nn import Module + import mx_driving._C @@ -23,48 +25,50 @@ def is_tuple_of(input_tuple, expected_type=int): class RoIAwarePool3dFunction(Function): @staticmethod - def forward(ctx: Any, rois: torch.Tensor, pts: torch.Tensor, pts_feature: torch.Tensor, - out_size: Union[int, tuple], max_pts_per_voxel: int, mode: int): + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def forward( + ctx: Any, + rois: torch.Tensor, + pts: torch.Tensor, + pts_feature: torch.Tensor, + out_size: Union[int, tuple], + max_pts_per_voxel: int, + mode: int, + ): if isinstance(out_size, int): out_x = out_y = out_z = out_size - elif (len(out_size) == 3 or is_tuple_of(out_size, int)): + elif len(out_size) == 3 or is_tuple_of(out_size, int): out_x, out_y, out_z = out_size else: raise Exception("outsize attr Error!\n") - + num_rois = rois.shape[0] num_channels = pts_feature.shape[-1] num_pts = pts.shape[0] - pooled_features = pts_feature.new_zeros( - (num_rois, out_x, out_y, out_z, num_channels)) - argmax = pts_feature.new_zeros( - (num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int32) - pts_idx_of_voxels = pts_feature.new_zeros( - (num_rois, out_x, out_y, out_z, max_pts_per_voxel), dtype=torch.int32) - + pooled_features = pts_feature.new_zeros((num_rois, out_x, out_y, out_z, num_channels)) + argmax = pts_feature.new_zeros((num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int32) + pts_idx_of_voxels = pts_feature.new_zeros((num_rois, out_x, out_y, out_z, max_pts_per_voxel), dtype=torch.int32) + mx_driving._C.npu_roiaware_pool3d_forward( - rois, - pts, - pts_feature, - argmax, - pts_idx_of_voxels, - pooled_features, - mode) - + rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features, mode + ) + ctx.save_for_backward(pts_idx_of_voxels, argmax, mode, num_pts, num_channels) - + return pooled_features - + @staticmethod def backward(ctx: Any, grad_out: torch.Tensor): ret = ctx.roiaware_pool3d_for_backward pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret # backward - grad_in = mx_driving._C.roiaware_pool3d_grad(pts_idx_of_voxels, argmax, - grad_out.contiguous(), num_pts, pool_method=mode) + grad_in = mx_driving._C.roiaware_pool3d_grad( + pts_idx_of_voxels, argmax, grad_out.contiguous(), num_pts, pool_method=mode + ) return None, None, grad_in, None, None, None - -roiaware_pool3d = RoIAwarePool3dFunction.apply \ No newline at end of file + + +roiaware_pool3d = RoIAwarePool3dFunction.apply diff --git a/mx_driving/detection/ops/rotated_iou.py b/mx_driving/ops/rotated_iou.py similarity index 100% rename from mx_driving/detection/ops/rotated_iou.py rename to mx_driving/ops/rotated_iou.py diff --git a/mx_driving/detection/ops/rotated_overlaps.py b/mx_driving/ops/rotated_overlaps.py similarity index 100% rename from mx_driving/detection/ops/rotated_overlaps.py rename to mx_driving/ops/rotated_overlaps.py diff --git a/mx_driving/common/ops/scatter_max.py b/mx_driving/ops/scatter_max.py similarity index 77% rename from mx_driving/common/ops/scatter_max.py rename to mx_driving/ops/scatter_max.py index b30c6139f61ef9edb952fde48fbacfd7c2088050..5a0e49eb0f7982011389a85d86af348c260c463c 100644 --- a/mx_driving/common/ops/scatter_max.py +++ b/mx_driving/ops/scatter_max.py @@ -6,11 +6,11 @@ Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU """ + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -28,13 +28,21 @@ class ScatterMaxFunction(Function): device = argmax.device grad_updates_index0 = argmax.unsqueeze(-1) - grad_updates_index1 = torch.tile(torch.arange(0, argmax.shape[1]), argmax.shape[0:1:1]).reshape(argmax.shape).unsqueeze(-1).to(device) + grad_updates_index1 = ( + torch.tile(torch.arange(0, argmax.shape[1]), argmax.shape[0:1:1]) + .reshape(argmax.shape) + .unsqueeze(-1) + .to(device) + ) grad_updates_indices = torch.concat((grad_updates_index0, grad_updates_index1), -1).to(device) - grad_updates_indices_uss = grad_updates_indices[..., 0] * grad_updates_indices.shape[1] + grad_updates_indices[..., 1] + grad_updates_indices_uss = ( + grad_updates_indices[..., 0] * grad_updates_indices.shape[1] + grad_updates_indices[..., 1] + ) num_segments = torch.tensor(updates.shape[0] * updates.shape[1]).to(device) grad = mx_driving._C.npu_scatter_max_backward(grad_output, grad_updates_indices_uss, num_segments) return grad.reshape(updates.shape), None, None + scatter_max = ScatterMaxFunction.apply diff --git a/mx_driving/common/ops/scatter_mean.py b/mx_driving/ops/scatter_mean.py similarity index 90% rename from mx_driving/common/ops/scatter_mean.py rename to mx_driving/ops/scatter_mean.py index 7e7e619b3283171c6169aa4ca01349fb55d59980..06e0c8a4fbda2394b0e53e1693447e8c89c98db1 100644 --- a/mx_driving/common/ops/scatter_mean.py +++ b/mx_driving/ops/scatter_mean.py @@ -1,8 +1,7 @@ import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -22,4 +21,5 @@ class ScatterMeanFunction(Function): result = mx_driving._C.npu_scatter_mean_grad(grad_out, index, count, dim) return result, None, None, None, None -scatter_mean = ScatterMeanFunction.apply \ No newline at end of file + +scatter_mean = ScatterMeanFunction.apply diff --git a/mx_driving/common/ops/sort_pairs.py b/mx_driving/ops/sort_pairs.py similarity index 100% rename from mx_driving/common/ops/sort_pairs.py rename to mx_driving/ops/sort_pairs.py diff --git a/mx_driving/spconv/ops/sparse_functional.py b/mx_driving/ops/sparse_functional.py similarity index 68% rename from mx_driving/spconv/ops/sparse_functional.py rename to mx_driving/ops/sparse_functional.py index ee4db416da3f1ddb4943ace4a023d21b3384c04b..a994ab4151ddd2173df784bff051953e7128c625 100644 --- a/mx_driving/spconv/ops/sparse_functional.py +++ b/mx_driving/ops/sparse_functional.py @@ -15,28 +15,39 @@ from typing import Any -import torch import numpy as np +import torch from torch.autograd import Function from torch.autograd.function import once_differentiable + import mx_driving._C -from . 
import sparse_ops as ops class SparseConvFunction(Function): - @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx: Any, features, indices, weight, out_spatial_shape, - out_channels, batch_size, - kernel_size, stride, padding, dilation, - groups, bias) -> torch.Tensor: + def forward( + ctx: Any, + features, + indices, + weight, + out_spatial_shape, + out_channels, + batch_size, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + ) -> torch.Tensor: device = features.device weight = weight.data # calculate the index pair - outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_conv3d(indices, kernel_size, stride, padding, - out_channels, out_spatial_shape, batch_size) + outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_conv3d( + indices, kernel_size, stride, padding, out_channels, out_spatial_shape, batch_size + ) # sort and nonezero to_insert = torch.tensor(-1).to(device) sorted_idx, sorted_idx_to_former_indices = torch.sort(ouidx_offset.view(torch.float32)) @@ -45,8 +56,9 @@ class SparseConvFunction(Function): sub_result = new_sorted_idx - new_sorted_idx_2 unique_indices_offset = torch.nonzero(sub_result != 0) # index_put and matmul - out_features, outidx = mx_driving._C.multi_to_sparse_v2(features, weight, unique_indices_offset.int(), - sorted_idx_to_former_indices.int(), outidx_pair.int()) + out_features, outidx = mx_driving._C.multi_to_sparse_v2( + features, weight, unique_indices_offset.int(), sorted_idx_to_former_indices.int(), outidx_pair.int() + ) outidx, outidx_ = torch.chunk(outidx, 2, dim=1) ctx.save_for_backward(features, weight, sorted_idx_to_former_indices.int(), unique_indices_offset.int()) @@ -55,12 +67,11 @@ class SparseConvFunction(Function): @staticmethod @once_differentiable # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple: + def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx=None) -> tuple: features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors - feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset, - sorted_idx_to_former_indices, - features, weight, grad_out_features) - + feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad( + unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features + ) return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None @@ -69,16 +80,38 @@ class SparseInverseConvFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx: Any, features, indices, weight, out_spatial_shape, - out_channels, batch_size, - kernel_size, stride, padding, dilation, output_padding, - groups, bias) -> torch.Tensor: + def forward( + ctx: Any, + features, + indices, + weight, + out_spatial_shape, + out_channels, + batch_size, + kernel_size, + stride, + padding, + dilation, + output_padding, + groups, + bias, + ) -> torch.Tensor: device = features.device weight = weight.data # calculate the index pair - out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_inverse_conv3d(features, indices, weight, - kernel_size, stride, padding, dilation, output_padding, - out_channels, out_spatial_shape, batch_size) + out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_inverse_conv3d( + features, + indices, + weight, + kernel_size, + stride, + padding, + dilation, + output_padding, + 
out_channels, + out_spatial_shape, + batch_size, + ) # sort and nonezero to_insert = torch.tensor(-1).to(device) sorted_idx, sorted_idx_to_former_indices = torch.sort(ouidx_offset.view(torch.float32)) @@ -87,8 +120,9 @@ class SparseInverseConvFunction(Function): sub_result = new_sorted_idx - new_sorted_idx_2 unique_indices_offset = torch.nonzero(sub_result != 0) # matmul - out_features, outidx = mx_driving._C.multi_to_sparse(out_features, unique_indices_offset.int(), - sorted_idx_to_former_indices.int(), outidx_pair.int()) + out_features, outidx = mx_driving._C.multi_to_sparse( + out_features, unique_indices_offset.int(), sorted_idx_to_former_indices.int(), outidx_pair.int() + ) outidx, outidx_ = torch.chunk(outidx, 2, dim=1) ctx.save_for_backward(features, weight, sorted_idx_to_former_indices.int(), unique_indices_offset.int()) @@ -97,11 +131,11 @@ class SparseInverseConvFunction(Function): @staticmethod @once_differentiable # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple: + def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx=None) -> tuple: features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors - feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset, - sorted_idx_to_former_indices, - features, weight, grad_out_features) + feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad( + unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features + ) return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None, None @@ -109,21 +143,36 @@ class SubMConvFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx: Any, features, indices, weight, out_spatial_shape, - out_channels, batch_size, - kernel_size, stride, padding, dilation, - groups, bias) -> torch.Tensor: + def forward( + ctx: Any, + features, + indices, + weight, + out_spatial_shape, + out_channels, + batch_size, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + ) -> torch.Tensor: device = features.device weight = weight.data # calculate the index pair indices_long = indices.long() - flatten_indices = indices_long[:, 0] * out_spatial_shape[0] * out_spatial_shape[1] * out_spatial_shape[2] + \ - indices_long[:, 1] * out_spatial_shape[1] * out_spatial_shape[2] + indices_long[:, 2] * out_spatial_shape[2] + indices_long[:, 3] + flatten_indices = ( + indices_long[:, 0] * out_spatial_shape[0] * out_spatial_shape[1] * out_spatial_shape[2] + + indices_long[:, 1] * out_spatial_shape[1] * out_spatial_shape[2] + + indices_long[:, 2] * out_spatial_shape[2] + + indices_long[:, 3] + ) temp, ordered_indices = mx_driving._C.npu_prepare_subm_conv3d(flatten_indices, out_spatial_shape, batch_size) temp[flatten_indices] = ordered_indices - output_iml2col, outidx_pair, ouidx_offset = mx_driving._C.npu_subm_sparse_conv3d(features, indices, weight, - kernel_size, out_channels, - out_spatial_shape, batch_size, temp) + output_iml2col, outidx_pair, ouidx_offset = mx_driving._C.npu_subm_sparse_conv3d( + features, indices, weight, kernel_size, out_channels, out_spatial_shape, batch_size, temp + ) weight_flatten = weight.view(kernel_size[0] * kernel_size[1] * kernel_size[2] * features.shape[1], out_channels) output_iml2col = output_iml2col.view(features.shape[0], -1) out_features = output_iml2col @ weight_flatten @@ -133,12 +182,14 @@ class 
SubMConvFunction(Function): @staticmethod @once_differentiable # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple: + def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx=None) -> tuple: features, weight, output_iml2col, ouidx_offset = ctx.saved_tensors weight_grad = output_iml2col.T @ grad_out_features weight_shape = weight.shape kernel_num = weight_shape[0] * weight_shape[1] * weight_shape[2] - weight_grad = weight_grad.view(weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3], weight_shape[4]) + weight_grad = weight_grad.view( + weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3], weight_shape[4] + ) weight = weight.view(kernel_num * weight_shape[3], weight_shape[4]) feature_grad_iml2col = grad_out_features @ (weight.T) feature_grad_iml2col = feature_grad_iml2col.view(features.shape[0], kernel_num, features.shape[1]) @@ -151,6 +202,7 @@ class SubMConvFunction(Function): feature_grad.index_put_((ouidx_offset,), feature_grad_iml2col, True) return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None + indice_conv = SparseConvFunction.apply indice_inverse_conv = SparseInverseConvFunction.apply -indice_subm_conv = SubMConvFunction.apply \ No newline at end of file +indice_subm_conv = SubMConvFunction.apply diff --git a/mx_driving/common/ops/three_interpolate.py b/mx_driving/ops/three_interpolate.py similarity index 92% rename from mx_driving/common/ops/three_interpolate.py rename to mx_driving/ops/three_interpolate.py index ed237c10166394d4ecd89d9695a07dd92ba62dd0..a0fdcdcb7536749bed1dfa1274108e7a12477094 100644 --- a/mx_driving/common/ops/three_interpolate.py +++ b/mx_driving/ops/three_interpolate.py @@ -10,19 +10,17 @@ Modification 1. Add support for Ascend NPU from typing import Any, Tuple import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C class ThreeInterpolateFunction(Function): @staticmethod - def forward(ctx: Any, features: torch.Tensor, indices: torch.Tensor, - weight: torch.Tensor) -> torch.Tensor: - + def forward(ctx: Any, features: torch.Tensor, indices: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + b, c, m = features.size() n = indices.size(1) ctx.three_interpolate_for_backward = (indices, weight, m) @@ -34,7 +32,7 @@ class ThreeInterpolateFunction(Function): @staticmethod def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - + b, c, n = grad_out.size() idx, weight, m = ctx.three_interpolate_for_backward @@ -46,7 +44,8 @@ class ThreeInterpolateFunction(Function): if grad_out_dtype == torch.half: grad_features = grad_features.to(torch.half) - + return grad_features, None, None + three_interpolate = ThreeInterpolateFunction.apply diff --git a/mx_driving/common/ops/threeNN.py b/mx_driving/ops/three_nn.py similarity index 92% rename from mx_driving/common/ops/threeNN.py rename to mx_driving/ops/three_nn.py index d259e5d07e76a310ffbea40db5ee90b08126a350..26cfafe52a31680ca086a7ed46f0e7ce721200de 100644 --- a/mx_driving/common/ops/threeNN.py +++ b/mx_driving/ops/three_nn.py @@ -6,16 +6,17 @@ Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU """ + from typing import Any, Tuple + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C -class AdsThreeNN(Function): +class ThreeNN(Function): @staticmethod def forward(ctx: Any, target: torch.Tensor, source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # target is center_xyz @@ -34,4 +35,4 @@ class AdsThreeNN(Function): return dist2, idx.int() -three_nn = AdsThreeNN.apply +three_nn = ThreeNN.apply diff --git a/mx_driving/point/ops/voxel_pooling_train.py b/mx_driving/ops/voxel_pooling_train.py similarity index 85% rename from mx_driving/point/ops/voxel_pooling_train.py rename to mx_driving/ops/voxel_pooling_train.py index 12c61c9d6ae876c4af3336a3de0f27b54574d914..41efc71a8b361ce2b14d9c95bd5669e55d4226bc 100644 --- a/mx_driving/point/ops/voxel_pooling_train.py +++ b/mx_driving/ops/voxel_pooling_train.py @@ -6,9 +6,10 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + import torch from torch.autograd import Function -from torch.nn import Module + import mx_driving._C @@ -18,12 +19,11 @@ class AdsVoxelPoolingFunction(Function): grad_input_features = torch.zeros_like(input_features) geom_xyz = geom_xyz.reshape(geom_xyz.shape[0], -1, geom_xyz.shape[-1]) input_features = input_features.reshape(geom_xyz.shape[0], -1, input_features.shape[-1]) - + batch_size = input_features.shape[0] num_points = input_features.shape[1] num_channels = input_features.shape[2] - output_features = input_features.new_zeros(batch_size, voxel_num[1], - voxel_num[0], num_channels) + output_features = input_features.new_zeros(batch_size, voxel_num[1], voxel_num[0], num_channels) pos_memo = geom_xyz.new_ones(batch_size, num_points, 3) * -1 pos, result = mx_driving._C.voxel_pooling_train( input_features, @@ -52,15 +52,10 @@ class AdsVoxelPoolingFunction(Function): W = grad_output_features.shape[3] result = mx_driving._C.voxel_pool_train_backward( - grad_output_features, - pos_memo, - batch_size, - num_points, - num_channels, - H, - W + grad_output_features, pos_memo, batch_size, num_points, num_channels, H, W ) grad_input_features = result.reshape(grad_input_features_shape) return None, grad_input_features, None -npu_voxel_pooling_train = AdsVoxelPoolingFunction.apply \ No newline at end of file + +npu_voxel_pooling_train = AdsVoxelPoolingFunction.apply diff --git a/mx_driving/point/ops/voxelization.py b/mx_driving/ops/voxelization.py similarity index 64% rename from mx_driving/point/ops/voxelization.py rename to mx_driving/ops/voxelization.py index 777133cb3ee8ad81b9a2a4604f879f5c42589c00..cd62fc782f472b614ce841420c8d5b0c67605783 100644 --- a/mx_driving/point/ops/voxelization.py +++ b/mx_driving/ops/voxelization.py @@ -4,8 +4,6 @@ Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. 
import torch from torch.autograd import Function -from torch.nn import Module -from torch.nn.modules.utils import _pair import mx_driving._C @@ -48,21 +46,3 @@ class _Voxelization(Function): voxelization = _Voxelization.apply - - -class Voxelization(torch.nn.Module): - def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels=20000, deterministic=True): - super().__init__() - - self.voxel_size = voxel_size - self.point_cloud_range = point_cloud_range - self.max_num_points = max_num_points - self.max_voxels = max_voxels - self.max_voxels = max_voxels if isinstance(max_voxels, tuple) else _pair(max_voxels) - self.deterministic = deterministic - - def forward(self, points: torch.Tensor): - max_voxels = self.max_voxels[0] if self.training else self.max_voxels[1] - return voxelization( - points, self.voxel_size, self.point_cloud_range, self.max_num_points, max_voxels, self.deterministic - ) diff --git a/mx_driving/point/__init__.py b/mx_driving/point.py similarity index 49% rename from mx_driving/point/__init__.py rename to mx_driving/point.py index 6aa7d099bac32bc1b6ca77cc39f73155ff5bcc31..3f79fd6526c834302d50565eb74157652cbbe88e 100644 --- a/mx_driving/point/__init__.py +++ b/mx_driving/point.py @@ -1,10 +1,16 @@ -from .ops.group_points import npu_group_points -from .ops.group_points import group_points +import warnings + + +warnings.warn( + "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.", DeprecationWarning +) +from .modules.voxelization import Voxelization from .ops.bev_pool import bev_pool from .ops.bev_pool_v2 import bev_pool_v2 -from .ops.furthest_point_sampling_with_dist import furthest_point_sample_with_dist +from .ops.bev_pool_v3 import bev_pool_v3 from .ops.furthest_point_sampling import npu_furthest_point_sampling +from .ops.furthest_point_sampling_with_dist import furthest_point_sample_with_dist +from .ops.group_points import group_points, npu_group_points from .ops.npu_dynamic_scatter import npu_dynamic_scatter -from .ops.voxelization import voxelization, Voxelization from .ops.voxel_pooling_train import npu_voxel_pooling_train -from .ops.bev_pool_v3 import bev_pool_v3 \ No newline at end of file +from .ops.voxelization import voxelization diff --git a/mx_driving/point/CMakeLists.txt b/mx_driving/point/CMakeLists.txt deleted file mode 100644 index 63ebf65165f490b26ec6fbb6cb034f1e8d947c59..0000000000000000000000000000000000000000 --- a/mx_driving/point/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/point/components/README.md b/mx_driving/point/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/point/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. 
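The relocated `three_nn` op above pairs each target (query) point with its three nearest source points, returning squared distances together with int32 indices. A plain PyTorch sketch of that contract, assuming the usual point-cloud layout of `(B, N, 3)` targets and `(B, M, 3)` sources (shapes are illustrative, not taken from this diff):

```python
import torch


def three_nn_reference(target: torch.Tensor, source: torch.Tensor):
    """CPU reference for the (dist2, idx.int()) pair returned by ThreeNN.apply."""
    dist2 = torch.cdist(target, source).pow(2)      # (B, N, M) squared distances
    d2, idx = dist2.topk(3, dim=-1, largest=False)  # three closest source points per target
    return d2, idx.int()


d2, idx = three_nn_reference(torch.rand(2, 5, 3), torch.rand(2, 8, 3))
assert d2.shape == (2, 5, 3) and idx.dtype == torch.int32
```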
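The new top-level `mx_driving/point.py` shown above keeps the old namespace importable: it emits a `DeprecationWarning` once at import time and then re-exports every op from its new flat location. A self-contained sketch of that warn-then-re-export pattern (the wrapper function is illustrative, not part of the diff):

```python
import warnings


def _load_deprecated_namespace() -> None:
    # Mirrors what mx_driving/point.py now does on import: warn first,
    # then re-export symbols from their new locations.
    warnings.warn(
        "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.",
        DeprecationWarning,
    )


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _load_deprecated_namespace()

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```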
\ No newline at end of file diff --git a/mx_driving/point/ops/__init__.py b/mx_driving/point/ops/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/point/ops/csrc/CMakeLists.txt b/mx_driving/point/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/point/ops/csrc/OWNERS b/mx_driving/point/ops/csrc/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/csrc/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/point/ops/csrc/README.md b/mx_driving/point/ops/csrc/README.md deleted file mode 100644 index 0bbe4f394307b9d81004b5bd923e630eabd9a509..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/csrc/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some cpp source files, which provide code for adaptation of ascend kernels. It provide links for kernels and cpp interfaces. \ No newline at end of file diff --git a/mx_driving/point/ops/csrc/pybind.cpp b/mx_driving/point/ops/csrc/pybind.cpp deleted file mode 100644 index e5b0c25d94faa44919839030d7e9b748eb6874a3..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/csrc/pybind.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include "csrc/pybind.h" - -#include - -#include "functions.h" - -void init_point(pybind11::module& m) -{ - // group_points - m.def("group_points", &group_points); - m.def("group_points_backward", &group_points_backward); - - // vec_pool - m.def("vec_pool_backward", &vec_pool_backward); - - m.def("point_to_voxel", &point_to_voxel); - - m.def("voxel_to_point", &voxel_to_point); - - m.def("unique_voxel", &unique_voxel); - - m.def("hard_voxelize", &hard_voxelize); - - // bev_pool - m.def("npu_bev_pool", &npu_bev_pool, "npu_bev_pool NPU version"); - m.def("npu_bev_pool_backward", &npu_bev_pool_backward, "npu_bev_pool_backward NPU version"); - m.def("npu_bev_pool_v2", &npu_bev_pool_v2, "npu_bev_pool_v2 NPU version"); - m.def("npu_bev_pool_v2_backward", &npu_bev_pool_v2_backward, "npu_bev_pool_v2_backward NPU version"); - m.def("npu_bev_pool_v3", &npu_bev_pool_v3, "npu_bev_pool_v3 NPU version"); - m.def("npu_bev_pool_v3_backward", &npu_bev_pool_v3_backward, "npu_bev_pool_v3_backward NPU version"); - - // furthest_points_sampling_with_dist - m.def("furthest_point_sampling_with_dist", &furthest_point_sampling_with_dist); - - // npu_dynamic_scatter - m.def("npu_dynamic_scatter", &npu_dynamic_scatter); - m.def("npu_dynamic_scatter_grad", &npu_dynamic_scatter_grad); - - // dyn_voxelization - m.def("dynamic_voxelization", &dynamic_voxelization); - - // npu_furthest_point_sampling - m.def("npu_furthest_point_sampling", &npu_furthest_point_sampling); - - // voxel_pooling - m.def("voxel_pooling_train", &voxel_pooling_train); - m.def("voxel_pool_train_backward", &voxel_pool_train_backward); -} diff --git a/mx_driving/point/ops/kernels/CMakeLists.txt b/mx_driving/point/ops/kernels/CMakeLists.txt deleted file mode 100644 index 
179d9da23345abf75fb87954f266055922527742..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) - add_subdirectory(framework) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/point/ops/kernels/README.md b/mx_driving/point/ops/kernels/README.md deleted file mode 100644 index 214fb0a6d662e806bd7f6bdd1b8962bc1639026e..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some ascend-kernel source files, which are like cuda-kernels and supply some ops that can be run on ascend device. \ No newline at end of file diff --git a/mx_driving/point/ops/kernels/op_host/CMakeLists.txt b/mx_driving/point/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index 7e8c1aa351dc3e9bfa77dd39afa8885c55943c2b..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") - -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/point/ops/kernels/op_host/OWNERS b/mx_driving/point/ops/kernels/op_host/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/point/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/point/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/point/ops/kernels/op_kernel/common.h b/mx_driving/point/ops/kernels/op_kernel/common.h deleted file mode 100644 index 2041af4985be2803dae6afeae4b2c56b59f1df1c..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/op_kernel/common.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef COMMON_H_ -#define COMMON_H_ - -#include "kernel_operator.h" - -constexpr int32_t TILING_ALIGN32B_FLAG = 1; -constexpr int32_t TILING_FP32_BIT = 1; -constexpr int32_t TILING_FP16_BIT = 2; -constexpr int32_t TILING_BF16_BIT = 3; - -class TaskIterator { -public: - __aicore__ inline TaskIterator( - int32_t blkIdx, int32_t blkDim, int32_t avgTaskNum, int32_t tailTaskNum, int32_t totalTaskNum) - : blkIdx_(blkIdx), blkDim_(blkDim), totalTaskNum_(totalTaskNum) - { - nextIdx_ = 
blkIdx * avgTaskNum + (blkIdx < tailTaskNum ? blkIdx : tailTaskNum); - endIdx_ = nextIdx_ + avgTaskNum + (blkIdx < tailTaskNum ? 1 : 0); - } - - __aicore__ inline bool HasNext() const - { - return nextIdx_ < endIdx_; - } - - __aicore__ inline int32_t Next() - { - return nextIdx_++; - } - - __aicore__ inline int32_t GetNext() const - { - return nextIdx_; - } - - __aicore__ inline int32_t GetTaskNum() const - { - return totalTaskNum_; - } - -private: - int32_t blkIdx_, blkDim_; - int32_t nextIdx_, endIdx_; - int32_t totalTaskNum_; -}; -#endif // COMMON_H_ \ No newline at end of file diff --git a/mx_driving/preprocess/__init__.py b/mx_driving/preprocess.py similarity index 45% rename from mx_driving/preprocess/__init__.py rename to mx_driving/preprocess.py index db975036570fc6b3f7fed6419cc4bf55701ce45f..520c307946ea5dbdb4417acb526c15b41ceb9e76 100644 --- a/mx_driving/preprocess/__init__.py +++ b/mx_driving/preprocess.py @@ -1,4 +1,9 @@ +import warnings + +warnings.warn( + "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.", DeprecationWarning +) from .ops.npu_points_in_box import npu_points_in_box from .ops.npu_points_in_box_all import npu_points_in_box_all from .ops.npu_points_in_box_all import points_in_boxes_all -from .ops.npu_roipoint_pool3d import RoipointPool3d as RoIPointPool3d \ No newline at end of file +from .modules.roi_point_pool_3d import RoIPointPool3d diff --git a/mx_driving/preprocess/CMakeLists.txt b/mx_driving/preprocess/CMakeLists.txt deleted file mode 100644 index 63ebf65165f490b26ec6fbb6cb034f1e8d947c59..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/preprocess/components/README.md b/mx_driving/preprocess/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. 
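The `TaskIterator` deleted from `op_kernel/common.h` above assigns each AI core a contiguous slice of the task range: every core receives `avgTaskNum` tasks and the first `tailTaskNum` cores take one extra. The same partitioning arithmetic as a small Python sketch:

```python
def task_range(blk_idx: int, avg: int, tail: int) -> range:
    # Matches TaskIterator's nextIdx_/endIdx_ computation: the first `tail`
    # blocks take avg + 1 tasks, the remaining blocks take avg.
    start = blk_idx * avg + min(blk_idx, tail)
    return range(start, start + avg + (1 if blk_idx < tail else 0))


# 10 tasks over 4 blocks -> [0, 1, 2], [3, 4, 5], [6, 7], [8, 9]
total, blocks = 10, 4
avg, tail = total // blocks, total % blocks
assert [list(task_range(i, avg, tail)) for i in range(blocks)] == [[0, 1, 2], [3, 4, 5], [6, 7], [8, 9]]
```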
\ No newline at end of file diff --git a/mx_driving/preprocess/ops/__init__.py b/mx_driving/preprocess/ops/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/preprocess/ops/csrc/CMakeLists.txt b/mx_driving/preprocess/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/preprocess/ops/csrc/OWNERS b/mx_driving/preprocess/ops/csrc/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/preprocess/ops/csrc/README.md b/mx_driving/preprocess/ops/csrc/README.md deleted file mode 100644 index 8073915fabe1c484db0488c9abc5e09b858c52c8..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/README.md +++ /dev/null @@ -1,6 +0,0 @@ -## Description -The `csrc` lib implements python interface, which use `pybind11` to wrap the C++ code. -There are 3 files you need to focus: -1. `pybind.cpp`: Define the python interface. -2. `functions.cpp`: Define the C++ interface. -3. The file naming in `Pascal` style: The implementation of the C++ interface. \ No newline at end of file diff --git a/mx_driving/preprocess/ops/csrc/functions.h b/mx_driving/preprocess/ops/csrc/functions.h deleted file mode 100644 index e509d755e0b806a5f6c9cbee2f15ee186f0e4d45..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/functions.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#ifndef PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ - -#include -#include - -at::Tensor npu_points_in_box(const at::Tensor& boxes, const at::Tensor& pts); - -at::Tensor npu_points_in_box_all(const at::Tensor& boxes, const at::Tensor& pts); - -std::tuple npu_roipoint_pool3d_forward(const int32_t num_sampled_points, - const at::Tensor& points, const at::Tensor& point_features, const at::Tensor& boxes3d); -#endif // PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/preprocess/ops/csrc/pybind.cpp b/mx_driving/preprocess/ops/csrc/pybind.cpp deleted file mode 100644 index f9bc7205886a1ca8a859c435359a37bde3f9b3bb..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/pybind.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include -#include "csrc/pybind.h" -#include "functions.h" - -void init_preprocess(pybind11::module& m) -{ - // npu_points_in_box - m.def("npu_points_in_box", &npu_points_in_box); - - // npu_points_in_box_all - m.def("npu_points_in_box_all", &npu_points_in_box_all); - - // npu_roipoint_pool3d_forward - m.def("npu_roipoint_pool3d_forward", &npu_roipoint_pool3d_forward); -} diff --git a/mx_driving/preprocess/ops/kernels/CMakeLists.txt b/mx_driving/preprocess/ops/kernels/CMakeLists.txt deleted file mode 100644 index 179d9da23345abf75fb87954f266055922527742..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) - add_subdirectory(framework) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/preprocess/ops/kernels/README.md b/mx_driving/preprocess/ops/kernels/README.md deleted file mode 100644 index 1e6645553e8d86a84a9833a13610741b59930494..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/README.md +++ /dev/null @@ -1,13 +0,0 @@ -## 算子原型 - - - - - - - - - - - -
-| 算子类型(OpType) | Add |
-| 算子输入 | name | shape | data type | format |
-|  | x | - | float | ND |
-|  | y | - | float | ND |
-| 算子输出 | z | - | float | ND |
-| 核函数名 | add_custom |
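The `ceil_multiple`/`ceil_value` helpers in the removed `op_host/common.h` further up are the usual round-up-division and round-up-to-multiple utilities (with a guard for a zero block size). A quick Python restatement for reference:

```python
def ceil_multiple(num: int, block: int) -> int:
    # Number of blocks needed to cover `num` elements; 0 if block == 0.
    return 0 if block == 0 else (num + block - 1) // block


def ceil_value(num: int, block: int) -> int:
    # `num` rounded up to the next multiple of `block`.
    return ceil_multiple(num, block) * block


assert ceil_multiple(10, 8) == 2
assert ceil_value(10, 8) == 16
assert ceil_value(16, 8) == 16
```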
\ No newline at end of file diff --git a/mx_driving/preprocess/ops/kernels/framework/CMakeLists.txt b/mx_driving/preprocess/ops/kernels/framework/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/preprocess/ops/kernels/op_host/CMakeLists.txt b/mx_driving/preprocess/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index c44b2b0174f28f0144a7c03fc6c40cc5b389c14e..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/preprocess/ops/kernels/op_host/OWNERS b/mx_driving/preprocess/ops/kernels/op_host/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/preprocess/ops/kernels/op_host/common.h b/mx_driving/preprocess/ops/kernels/op_host/common.h deleted file mode 100644 index 4580dff5fd0b206d1b94383f160932c22d1cb8a9..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/op_host/common.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. - */ -#ifndef COMMON_H -#define COMMON_H - -#include "register/op_def_registry.h" -#include "tiling/platform/platform_ascendc.h" -#include "tiling/tiling_api.h" -#include "register/tilingdata_base.h" - -inline uint32_t ceil_multiple(uint32_t num, uint32_t block) -{ - if (block == 0) { - return 0; - } - return (num + block - 1) / block; -} - -inline uint32_t ceil_value(uint32_t num, uint32_t block) -{ - if (block == 0) { - return 0; - } - return ((num + block - 1) / block) * block; -} - -#endif // COMMON_H diff --git a/mx_driving/preprocess/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/preprocess/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/spconv.py b/mx_driving/spconv.py new file mode 100644 index 0000000000000000000000000000000000000000..248713a204292255a5a0eac58deb596a45fe8f73 --- /dev/null +++ b/mx_driving/spconv.py @@ -0,0 +1,8 @@ +import warnings + + +warnings.warn( + "This package is deprecated and will be removed in future. 
Please use `mx_driving.api` instead.", DeprecationWarning +) +from .modules.sparse_conv import SparseConv3d, SparseInverseConv3d, SubMConv3d +from .modules.sparse_modules import SparseConvTensor, SparseModule, SparseSequential diff --git a/mx_driving/spconv/CMakeLists.txt b/mx_driving/spconv/CMakeLists.txt deleted file mode 100644 index 63ebf65165f490b26ec6fbb6cb034f1e8d947c59..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/spconv/__init__.py b/mx_driving/spconv/__init__.py deleted file mode 100644 index 7435203cfc1b1c542a7ebc1a7d5c3c7b9e3714c7..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .ops.sparse_conv import SubMConv3d -from .ops.sparse_conv import SparseConv3d -from .ops.sparse_conv import SparseInverseConv3d -from .ops.sparse_modules import SparseSequential -from .ops.sparse_modules import SparseConvTensor -from .ops.sparse_modules import SparseModule diff --git a/mx_driving/spconv/ops/__init__.py b/mx_driving/spconv/ops/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/spconv/ops/csrc/CMakeLists.txt b/mx_driving/spconv/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/spconv/ops/csrc/OWNERS b/mx_driving/spconv/ops/csrc/OWNERS deleted file mode 100644 index 6d60158d26b6a9b3c818a73e78f09a6aa3700cf7..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/OWNERS +++ /dev/null @@ -1,8 +0,0 @@ -approvers: -- wangxiaoxin-sherie -- liu_zhi_xu -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/spconv/ops/csrc/README.md b/mx_driving/spconv/ops/csrc/README.md deleted file mode 100644 index 0bbe4f394307b9d81004b5bd923e630eabd9a509..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some cpp source files, which provide code for adaptation of ascend kernels. It provide links for kernels and cpp interfaces. \ No newline at end of file diff --git a/mx_driving/spconv/ops/csrc/functions.h b/mx_driving/spconv/ops/csrc/functions.h deleted file mode 100644 index c1547c91050c634dc3c9bd3f397c6b0ccba7b492..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/functions.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ - -#include -#include - -std::tuple npu_subm_sparse_conv3d(const at::Tensor& feature, const at::Tensor& indices, - const at::Tensor& weight, - at::IntArrayRef kernel_size, int out_channel, - at::IntArrayRef outSpatialShape, int batch_size, - const at::Tensor& temp); - -std::tuple multi_to_sparse(const at::Tensor& out_features, const at::Tensor& unique_indices_offset, - const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair); - -std::tuple multi_to_sparse_v2(const at::Tensor& features, const at::Tensor& weight, const at::Tensor& unique_indices_offset, - const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair); - -std::tuple npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - int out_channel, at::IntArrayRef outSpatialShape, int batch_size); - -std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, const at::Tensor& indices, const at::Tensor& weight, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - at::IntArrayRef dilation, at::IntArrayRef output_padding, - int out_channel, at::IntArrayRef outSpatialShape, int batch_size); - -std::tuple npu_sparse_conv3d_grad(const at::Tensor& indices_offset, const at::Tensor& former_sorted_indices, - const at::Tensor& feature, const at::Tensor& weight, const at::Tensor& grad); - -std::tuple npu_prepare_subm_conv3d(const at::Tensor& flattenIndices, - at::IntArrayRef outSpatialShape, int batch_size); - -#endif // PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/spconv/ops/csrc/pybind.cpp b/mx_driving/spconv/ops/csrc/pybind.cpp deleted file mode 100644 index 26bfaf7a09a8ab588bb7ad66a551490c0d42bb88..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/pybind.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include -#include "csrc/pybind.h" -#include "functions.h" - -void init_spconv(pybind11::module &m) -{ - // npu_subm_sparse_conv3d - m.def("npu_subm_sparse_conv3d", &npu_subm_sparse_conv3d); - - // npu_sparse_conv3d - m.def("npu_sparse_conv3d", &npu_sparse_conv3d); - - // npu_sparse_inverse_conv3d - m.def("npu_sparse_inverse_conv3d", &npu_sparse_inverse_conv3d); - - // multi_to_sparse - m.def("multi_to_sparse", &multi_to_sparse); - - // multi_to_sparse_v2 - m.def("multi_to_sparse_v2", &multi_to_sparse_v2); - - // npu_sparse_conv3d_grad - m.def("npu_sparse_conv3d_grad", &npu_sparse_conv3d_grad); - - m.def("npu_prepare_subm_conv3d", &npu_prepare_subm_conv3d); -} diff --git a/mx_driving/spconv/ops/kernels/CMakeLists.txt b/mx_driving/spconv/ops/kernels/CMakeLists.txt deleted file mode 100644 index 179d9da23345abf75fb87954f266055922527742..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) - add_subdirectory(framework) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS 
${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/spconv/ops/kernels/README.md b/mx_driving/spconv/ops/kernels/README.md deleted file mode 100644 index 214fb0a6d662e806bd7f6bdd1b8962bc1639026e..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some ascend-kernel source files, which are like cuda-kernels and supply some ops that can be run on ascend device. \ No newline at end of file diff --git a/mx_driving/spconv/ops/kernels/op_host/CMakeLists.txt b/mx_driving/spconv/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index 7e8c1aa351dc3e9bfa77dd39afa8885c55943c2b..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") - -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/spconv/ops/kernels/op_host/OWNERS b/mx_driving/spconv/ops/kernels/op_host/OWNERS deleted file mode 100644 index 6d60158d26b6a9b3c818a73e78f09a6aa3700cf7..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,8 +0,0 @@ -approvers: -- wangxiaoxin-sherie -- liu_zhi_xu -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/spconv/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/spconv/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/spconv/ops/sparse_ops.py b/mx_driving/spconv/ops/sparse_ops.py deleted file mode 100644 index 065774559d9b77c2dbf3e899fc6c16755a200aa7..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/sparse_ops.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2024, Huawei Technologies.All rights reserved. -# Copyright 2019 Yan Yan -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -def get_conv_output_size(input_size, kernel_size, stride, padding, dilation): - ndim = len(input_size) - output_size = [] - for i in range(ndim): - size = (input_size[i] + 2 * padding[i] - dilation[i] * - (kernel_size[i] - 1) - 1) // stride[i] + 1 - if kernel_size[i] == -1: - output_size.append(1) - else: - output_size.append(size) - return output_size - - -def get_inverse_conv_output_size(input_size, kernel_size, stride, padding, dilation, output_padding): - ndim = len(input_size) - output_size = [] - for i in range(ndim): - size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + dilation[i] * (kernel_size[i] - 1) + output_padding[i] + 1 - if kernel_size[i] == -1: - output_size.append(1) - else: - output_size.append(size) - return output_size - - -def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation, - output_padding): - ndim = len(input_size) - output_size = [] - for i in range(ndim): - if kernel_size[i] == -1: - raise ValueError("deconv don't support kernel_size < 0") - size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[ - i] + output_padding[i] - output_size.append(size) - return output_size diff --git a/onnx_plugin/CMakeLists.txt b/onnx_plugin/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb625814f17edf9774c4201c36318cf8801341d3 --- /dev/null +++ b/onnx_plugin/CMakeLists.txt @@ -0,0 +1,33 @@ +file(GLOB ONNX_PLUGIN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +if(BUILD_STAGE EQUAL 1) + if(${ENABLE_ONNX}) + if(CANN_PATHS) + if(${ARCH} STREQUAL "aarch64") + protobuf_generate( + PROTO_FILE ${CANN_PATHS}/aarch64-linux/include/proto/ge_onnx.proto + OUT_DIR ${ASCEND_AUTOGEN_PATH}) + else() + protobuf_generate( + PROTO_FILE ${CANN_PATHS}/x86_64-linux/include/proto/ge_onnx.proto + OUT_DIR ${ASCEND_AUTOGEN_PATH}) + endif() + else() + protobuf_generate( + PROTO_FILE ${ASCEND_CANN_PACKAGE_PATH}/include/proto/ge_onnx.proto + OUT_DIR ${ASCEND_AUTOGEN_PATH}) + endif() + + add_library(cust_onnx_parsers SHARED ${ONNX_PLUGIN_SRC}) + target_compile_options( + cust_onnx_parsers + PRIVATE -O2 -Werror -Wno-deprecated-declarations -Dgoogle=ascend_private + "-fno-common" "-fno-strict-aliasing") + target_link_libraries(cust_onnx_parsers PRIVATE intf_pub) + target_include_directories( + cust_onnx_parsers PRIVATE ${PROJECT_SOURCE_DIR}/include + ${ASCEND_AUTOGEN_PATH}) + + install_target(TRG cust_onnx_parsers DST + packages/vendors/${vendor_name}/framework/onnx/) + endif() +endif() diff --git a/mx_driving/fused/ops/onnx/plugin/onnx_multi_scale_deformable_attn.cpp b/onnx_plugin/onnx_multi_scale_deformable_attn.cpp similarity index 100% rename from mx_driving/fused/ops/onnx/plugin/onnx_multi_scale_deformable_attn.cpp rename to onnx_plugin/onnx_multi_scale_deformable_attn.cpp diff --git a/mx_driving/detection/ops/onnx/plugin/onnx_roi_align_rotated.cpp b/onnx_plugin/onnx_roi_align_rotated.cpp similarity index 100% rename from mx_driving/detection/ops/onnx/plugin/onnx_roi_align_rotated.cpp rename to onnx_plugin/onnx_roi_align_rotated.cpp diff --git a/tests/onnx/roi_align_rotated_plugin.py b/tests/onnx/roi_align_rotated_plugin.py index 75b3193a36a658ad8ef55b933a4568cc7d7b991b..da53bdf900d1a65b6db0a31c5ed37eea79a1c9cd 100644 --- a/tests/onnx/roi_align_rotated_plugin.py +++ b/tests/onnx/roi_align_rotated_plugin.py @@ -1,6 +1,6 @@ import os -import onnx -from onnx import helper, TensorProto +import onnx_plugin +from onnx_plugin import helper, TensorProto def roi_align_rotated(): @@ -9,7 +9,7 @@ def 
roi_align_rotated(): output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [3, 48, 2, 2]) - node_def = onnx.helper.make_node('RoiAlignRotatedV2', + node_def = onnx_plugin.helper.make_node('RoiAlignRotatedV2', inputs=['input', 'rois'], outputs=['output'], spatial_scale=1.0, @@ -32,7 +32,7 @@ def roi_align_rotated(): current_path = os.path.abspath(__file__) idx = current_path.rfind('/') current_path = current_path[:idx] - onnx.save(model_def, os.path.join(current_path, "roi_align_rotated.onnx")) + onnx_plugin.save(model_def, os.path.join(current_path, "roi_align_rotated.onnx")) if __name__ == "__main__": roi_align_rotated() \ No newline at end of file diff --git a/tests/torch/test_npu_geometric_kernel_attention_func.py b/tests/torch/test_npu_geometric_kernel_attention_func.py index ba5254f6685b25e4c77a7f099373a0ff5bd0b020..b3f027fddcd16ad8d3debec186add5c641213205 100644 --- a/tests/torch/test_npu_geometric_kernel_attention_func.py +++ b/tests/torch/test_npu_geometric_kernel_attention_func.py @@ -79,10 +79,6 @@ class TestGeometricKernelAttentionFunc(TestCase): self.test_results = self.gen_results() def gen_results(self): - if DEVICE_NAME != "Ascend910B": - self.skipTest( - "OP `MultiScaleDeformableAttnFunction` is only supported on 910B, skipping test data generation!" - ) test_results = [] for shape, dtype in self.items: cpu_inputs, npu_inputs = self.gen_inputs(shape, dtype) @@ -157,7 +153,7 @@ class TestGeometricKernelAttentionFunc(TestCase): sampling_locations = npu_inputs.sampling_locations attn_weights = npu_inputs.attn_weights grad_output = npu_inputs.grad_output - npu_output = mx_driving.fused.npu_geometric_kernel_attention_func( + npu_output = mx_driving.fused.npu_geometric_kernel_attention( value, spatial_shapes, level_start_index, sampling_locations, attn_weights ) npu_output.backward(grad_output) diff --git a/tests/torch/test_roipoint_pool3d.py b/tests/torch/test_roipoint_pool3d.py index 0cb748ae7c28b40a3ef1b590cf75fba641f28e49..cf17193e6858e2122b17ba0257f7acaa0123bba6 100644 --- a/tests/torch/test_roipoint_pool3d.py +++ b/tests/torch/test_roipoint_pool3d.py @@ -11,21 +11,46 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
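The removed `sparse_ops.py` above computes spatial output sizes per dimension with the standard convolution formula `out = (in + 2*pad - dilation*(kernel - 1) - 1) // stride + 1` (plus inverse- and de-convolution variants). A short worked check of that arithmetic, ignoring the `kernel_size == -1` special case; the grid dimensions are illustrative:

```python
def conv_output_size(size, kernel, stride, padding, dilation):
    # Per-dimension formula used by get_conv_output_size in the removed sparse_ops.py.
    return [
        (s + 2 * p - d * (k - 1) - 1) // st + 1
        for s, k, st, p, d in zip(size, kernel, stride, padding, dilation)
    ]


# A 41 x 1600 x 1408 grid with a 3x3x3 kernel, stride 2, padding 1, dilation 1
# shrinks to 21 x 800 x 704.
assert conv_output_size([41, 1600, 1408], [3, 3, 3], [2, 2, 2], [1, 1, 1], [1, 1, 1]) == [21, 800, 704]
```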
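For reference, the test model in `roi_align_rotated_plugin.py` is assembled with the ONNX helper API: declare value infos for the inputs and output, create the custom `RoiAlignRotatedV2` node with its attributes, then wrap it into a graph and serialize it. A minimal standalone sketch; the input/rois shapes and the extra attributes are placeholders, only `spatial_scale=1.0` and the `[3, 48, 2, 2]` output shape come from the test above:

```python
import onnx
from onnx import TensorProto, helper

inp = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 48, 32, 32])  # placeholder shape
rois = helper.make_tensor_value_info("rois", TensorProto.FLOAT, [3, 6])           # placeholder shape
out = helper.make_tensor_value_info("output", TensorProto.FLOAT, [3, 48, 2, 2])

node = helper.make_node(
    "RoiAlignRotatedV2",       # custom op, resolved later by the onnx_plugin parser
    inputs=["input", "rois"],
    outputs=["output"],
    spatial_scale=1.0,
    pooled_height=2,           # placeholder attributes
    pooled_width=2,
)

graph = helper.make_graph([node], "roi_align_rotated", [inp, rois], [out])
onnx.save(helper.make_model(graph), "roi_align_rotated.onnx")
```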
-import sys import random import unittest + +import numpy as np import torch import torch_npu -import numpy as np from torch_npu.testing.testcase import TestCase, run_tests + from mx_driving.preprocess import RoIPointPool3d -sys.path.append("../utils") -from random_matrix import random_value DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] +# float16[-14,16], float32[-126,128], float64[-1022,1024], int16[0,15], int32[0,31], int64[0,63] +# random_value(-7, 8, (1, 2, 3), np.float32, True, True, False, False) +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def random_value( + min_log, max_log, size, dtype=np.float32, nega_flag=True, zero_flag=True, inf_flag=False, nan_flag=False +): + matrix_log = np.random.uniform(low=min_log, high=max_log, size=size).astype(np.float32) + matrix = np.exp2(matrix_log).astype(dtype) + flag_value = int(zero_flag) + int(inf_flag) + int(nan_flag) + size_value = np.prod(size) + p0 = 0.1 + if (flag_value > 0) and (size_value > 0): + p0 = 0.1 / flag_value / size_value # 10% + if nega_flag: + matrix *= np.random.choice(a=[1, -1], size=size, p=[0.5, 0.5]) + if zero_flag: + matrix *= np.random.choice(a=[1, 0], size=size, p=[1 - p0, p0]) + if inf_flag: + np_inf = np.array([np.inf]).astype(dtype)[0] + matrix += np.random.choice(a=[0, np_inf], size=size, p=[1 - p0, p0]) + if nan_flag: + np_nan = np.array([np.nan]).astype(dtype)[0] + matrix += np.random.choice(a=[0, np_nan], size=size, p=[1 - p0, p0]) + return matrix + + # points: (B, N, 3) 输入点 # point_features: (B, N, C) 输入点特征 # boxes3d: (B, M, 7) 边界框 @@ -53,106 +78,113 @@ def check_point_in_box3d(point, box3d): def roipoint_pool3d_forward(num_sampled_points, points, point_features, boxes3d, pooled_features): - point_num = points.shape[0] # N - feature_len = point_features.shape[1] # C - point_flag = np.zeros((point_num), dtype=np.int32) # (N) - point_idx = np.zeros((num_sampled_points), dtype=np.int32) # (num) + point_num = points.shape[0] # N + feature_len = point_features.shape[1] # C + point_flag = np.zeros((point_num), dtype=np.int32) # (N) + point_idx = np.zeros((num_sampled_points), dtype=np.int32) # (num) for pt_idx in range(point_num): point_flag[pt_idx] = check_point_in_box3d(points[pt_idx], boxes3d) cnt = 0 for pt_idx in range(point_num): - if (point_flag[pt_idx] == 0): + if point_flag[pt_idx] == 0: continue point_idx[cnt] = pt_idx cnt += 1 - if (cnt == num_sampled_points): + if cnt == num_sampled_points: break - if (cnt == 0): + if cnt == 0: return 1 - if (cnt < num_sampled_points): + if cnt < num_sampled_points: for spn_idx in range(cnt, num_sampled_points): point_idx[spn_idx] = point_idx[spn_idx % cnt] for sample_point_idx in range(num_sampled_points): src_point_idx = point_idx[sample_point_idx] pooled_features[sample_point_idx, 0:3] = points[src_point_idx, 0:3] - pooled_features[sample_point_idx, 3:3 + feature_len] = \ - point_features[src_point_idx, 0:feature_len] + pooled_features[sample_point_idx, 3 : 3 + feature_len] = point_features[src_point_idx, 0:feature_len] return 0 def cpu_roipoint_pool3d(num_sampled_points, points, point_features, boxes3d): # B=batch_size; N=point_num; M=boxes_num; C=feature_len; num = num_sampled_points - batch_size = points.shape[0] # B - feature_len = point_features.shape[2] # C - boxes_num = boxes3d.shape[1] # M + batch_size = points.shape[0] # B + feature_len = point_features.shape[2] # C + boxes_num = boxes3d.shape[1] # M pooled_features = np.zeros_like(points, shape=(batch_size, boxes_num, num_sampled_points, 3 + feature_len)) pooled_empty_flag = 
np.zeros((batch_size, boxes_num), dtype=np.int32) for bs_idx in range(batch_size): for boxes_idx in range(boxes_num): - pooled_empty_flag[bs_idx][boxes_idx] = roipoint_pool3d_forward(num_sampled_points, points[bs_idx], - point_features[bs_idx], boxes3d[bs_idx][boxes_idx], pooled_features[bs_idx][boxes_idx]) + pooled_empty_flag[bs_idx][boxes_idx] = roipoint_pool3d_forward( + num_sampled_points, + points[bs_idx], + point_features[bs_idx], + boxes3d[bs_idx][boxes_idx], + pooled_features[bs_idx][boxes_idx], + ) return pooled_features, pooled_empty_flag class TestRoipointPool3d(TestCase): - @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `RoipointPool3d` is only supported on 910B, skip this ut!") + @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `RoipointPool3d` is only supported on 910B, skip this ut!") def test_roipoint_pool3d_float(self): random.seed() - batch_size = random.randint(1, 4) # B - num_sampled_points = random.randint(1, 48) # num - boxes_num = random.randint(1, 48) # M - point_num = random.randint(max(num_sampled_points, boxes_num), 105) # N - points = random_value(-15.5, 16, (batch_size, point_num, 3), np.float32) # (B, N, 3) - point_features = points.copy() # (B, N, C) - boxes3d = np.zeros((batch_size, boxes_num, 7), dtype=np.float32) # (B, M, 7) + batch_size = random.randint(1, 4) # B + num_sampled_points = random.randint(1, 48) # num + boxes_num = random.randint(1, 48) # M + point_num = random.randint(max(num_sampled_points, boxes_num), 105) # N + points = random_value(-15.5, 16, (batch_size, point_num, 3), np.float32) # (B, N, 3) + point_features = points.copy() # (B, N, C) + boxes3d = np.zeros((batch_size, boxes_num, 7), dtype=np.float32) # (B, M, 7) boxes3d[0:, 0:, 0:3] = random_value(-15.5, 16, (batch_size, boxes_num, 3)) boxes3d[0:, 0:, 3:6] = random_value(-63, 64, (batch_size, boxes_num, 3), nega_flag=False) - boxes3d[0:, 0:, 6:] = \ - np.random.uniform(low=0, high=3.141592654, size=(batch_size, boxes_num, 1)).astype(np.float32) + boxes3d[0:, 0:, 6:] = np.random.uniform(low=0, high=3.141592654, size=(batch_size, boxes_num, 1)).astype( + np.float32 + ) cpu_pooled_features, cpu_pooled_empty_flag = cpu_roipoint_pool3d( - num_sampled_points, points, point_features, boxes3d) + num_sampled_points, points, point_features, boxes3d + ) roipoint_pool3d = RoIPointPool3d(num_sampled_points) - pooled_features, pooled_empty_flag = roipoint_pool3d(torch.from_numpy(points).npu(), - torch.from_numpy(point_features).npu(), torch.from_numpy(boxes3d).npu()) + pooled_features, pooled_empty_flag = roipoint_pool3d( + torch.from_numpy(points).npu(), torch.from_numpy(point_features).npu(), torch.from_numpy(boxes3d).npu() + ) float_pooled_features = pooled_features.cpu().numpy() float_pooled_empty_flag = pooled_empty_flag.cpu().numpy() - self.assertRtolEqual(float_pooled_features, cpu_pooled_features, prec=0.00005) # (B, M, num, 3+C) - self.assertRtolEqual(float_pooled_empty_flag, cpu_pooled_empty_flag, prec=0.00005) # (B, M) + self.assertRtolEqual(float_pooled_features, cpu_pooled_features, prec=0.00005) # (B, M, num, 3+C) + self.assertRtolEqual(float_pooled_empty_flag, cpu_pooled_empty_flag, prec=0.00005) # (B, M) - - @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `RoipointPool3d` is only supported on 910B, skip this ut!") + @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `RoipointPool3d` is only supported on 910B, skip this ut!") def test_roipoint_pool3d_half(self): random.seed() - batch_size = random.randint(1, 4) # B - num_sampled_points = random.randint(1, 60) # num - boxes_num = 
random.randint(1, 60) # M - point_num = random.randint(max(num_sampled_points, boxes_num), 105) # N - points = random_value(-3.5, 4, (batch_size, point_num, 3), np.float16) # (B, N, 3) - point_features = points.copy() # (B, N, C) - boxes3d = np.zeros((batch_size, boxes_num, 7), dtype=np.float16) # (B, M, 7) + batch_size = random.randint(1, 4) # B + num_sampled_points = random.randint(1, 60) # num + boxes_num = random.randint(1, 60) # M + point_num = random.randint(max(num_sampled_points, boxes_num), 105) # N + points = random_value(-3.5, 4, (batch_size, point_num, 3), np.float16) # (B, N, 3) + point_features = points.copy() # (B, N, C) + boxes3d = np.zeros((batch_size, boxes_num, 7), dtype=np.float16) # (B, M, 7) boxes3d[0:, 0:, 0:3] = random_value(-3.5, 4, (batch_size, boxes_num, 3), np.float16) boxes3d[0:, 0:, 3:6] = random_value(-14, 16, (batch_size, boxes_num, 3), np.float16, nega_flag=False) - boxes3d[0:, 0:, 6:] = \ - np.random.uniform(low=0, high=3.142, size=(batch_size, boxes_num, 1)).astype(np.float16) + boxes3d[0:, 0:, 6:] = np.random.uniform(low=0, high=3.142, size=(batch_size, boxes_num, 1)).astype(np.float16) - cpu_pooled_features, cpu_pooled_empty_flag = cpu_roipoint_pool3d(num_sampled_points, - points.astype(np.float32), point_features.astype(np.float32), boxes3d.astype(np.float32)) + cpu_pooled_features, cpu_pooled_empty_flag = cpu_roipoint_pool3d( + num_sampled_points, points.astype(np.float32), point_features.astype(np.float32), boxes3d.astype(np.float32) + ) roipoint_pool3d = RoIPointPool3d(num_sampled_points) - pooled_features, pooled_empty_flag = roipoint_pool3d(torch.from_numpy(points).npu(), - torch.from_numpy(point_features).npu(), torch.from_numpy(boxes3d).npu()) + pooled_features, pooled_empty_flag = roipoint_pool3d( + torch.from_numpy(points).npu(), torch.from_numpy(point_features).npu(), torch.from_numpy(boxes3d).npu() + ) half_pooled_features = pooled_features.cpu().numpy().astype(np.float32) half_pooled_empty_flag = pooled_empty_flag.cpu().numpy() - self.assertRtolEqual(half_pooled_features, cpu_pooled_features, prec=0.0005) # (B, M, num, 3+C) - self.assertRtolEqual(half_pooled_empty_flag, cpu_pooled_empty_flag, prec=0.0005) # (B, M) + self.assertRtolEqual(half_pooled_features, cpu_pooled_features, prec=0.0005) # (B, M, num, 3+C) + self.assertRtolEqual(half_pooled_empty_flag, cpu_pooled_empty_flag, prec=0.0005) # (B, M) if __name__ == "__main__": diff --git a/tests/utils/random_matrix.py b/tests/utils/random_matrix.py deleted file mode 100644 index a4f8879519de2e4a77c3dba04a648efd9c3ca23e..0000000000000000000000000000000000000000 --- a/tests/utils/random_matrix.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -import random -import numpy as np - - -# float16[-14,16], float32[-126,128], float64[-1022,1024], int16[0,15], int32[0,31], int64[0,63] -# random_value(-7, 8, (1, 2, 3), np.float32, True, True, False, False) -def random_value(min_log, max_log, size, dtype=np.float32, - nega_flag=True, zero_flag=True, inf_flag=False, nan_flag=False): - matrix_log = np.random.uniform(low=min_log, high=max_log, size=size).astype(np.float32) - matrix = np.exp2(matrix_log).astype(dtype) - flag_value = int(zero_flag) + int(inf_flag) + int(nan_flag) - size_value = np.prod(size) - p0 = 0.1 - if (flag_value > 0) and (size_value > 0): - p0 = 0.1 / flag_value / size_value # 10% - if nega_flag: - matrix *= np.random.choice(a=[1, -1], size=size, p=[0.5, 0.5]) - if zero_flag: - matrix *= np.random.choice(a=[1, 0], size=size, p=[1 - p0, p0]) - if inf_flag: - np_inf = 
np.array([np.inf]).astype(dtype)[0] - matrix += np.random.choice(a=[0, np_inf], size=size, p=[1 - p0, p0]) - if nan_flag: - np_nan = np.array([np.nan]).astype(dtype)[0] - matrix += np.random.choice(a=[0, np_nan], size=size, p=[1 - p0, p0]) - return matrix - - -# random_size(-7, 8, ([1, 4], [1, 4], [3]), np.float32, True, True, False, False) -def random_size(min_log, max_log, size_range, dtype=np.float32, - nega_flag=True, zero_flag=True, inf_flag=False, nan_flag=False): - size = [] - dim = len(size_range) - for i in range(dim): - if len(size_range[i]) == 1: - size.append(size_range[i][0]) - else: - random.seed() - size.append(random.randint(size_range[i][0], size_range[i][1])) - return random_value(min_log, max_log, size, dtype, nega_flag, zero_flag, inf_flag, nan_flag)
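The `random_value` generator, now inlined into `test_roipoint_pool3d.py` and removed here together with `tests/utils/random_matrix.py`, draws log2-uniform magnitudes in `[2**min_log, 2**max_log)` and can optionally flip signs and inject zeros, infs, or NaNs. A small usage check, assuming the definition above is in scope:

```python
import numpy as np

sample = random_value(-7, 8, (4, 6), np.float32, nega_flag=True, zero_flag=True)
assert sample.shape == (4, 6) and sample.dtype == np.float32

# Non-zero entries keep their magnitude inside the requested exponent range.
magnitudes = np.abs(sample[sample != 0])
assert np.all((magnitudes >= 2.0 ** -7) & (magnitudes <= 2.0 ** 8))
```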