diff --git a/CMakeLists.txt b/CMakeLists.txt
index f58f410ede3f17598fc63c0f05d612c9983b33ae..367e0be68283fb4b8f0393cd223980f371557fc2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,19 +5,6 @@ include(cmake/config.cmake)
 include(cmake/func.cmake)
 include(cmake/intf.cmake)
 
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/bind)
-set(MX_DRIVING_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mx_driving)
-add_subdirectory(${MX_DRIVING_DIR}/common)
-add_subdirectory(${MX_DRIVING_DIR}/preprocess)
-add_subdirectory(${MX_DRIVING_DIR}/fused)
-add_subdirectory(${MX_DRIVING_DIR}/point)
-add_subdirectory(${MX_DRIVING_DIR}/detection)
-add_subdirectory(${MX_DRIVING_DIR}/spconv)
-
-if(BUILD_STAGE EQUAL 0)
-  include(cmake/stage_0.cmake)
-elseif(BUILD_STAGE EQUAL 1)
-  include(cmake/stage_1.cmake)
-elseif(BUILD_STAGE EQUAL 2)
-  include(cmake/stage_2.cmake)
-endif()
+add_subdirectory(${PROJECT_SOURCE_DIR}/kernels)
+add_subdirectory(${PROJECT_SOURCE_DIR}/onnx_plugin)
+add_subdirectory(${PROJECT_SOURCE_DIR}/mx_driving/csrc)
diff --git a/Third_Party_Open_Source__Software_Notice b/Third_Party_Open_Source_Software_Notice
similarity index 100%
rename from Third_Party_Open_Source__Software_Notice
rename to Third_Party_Open_Source_Software_Notice
diff --git a/bind/CMakeLists.txt b/bind/CMakeLists.txt
deleted file mode 100644
index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000
--- a/bind/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
-     ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
-set(ASCEND_CSRC_SRC
-    ${ASCEND_CSRC_SRC} ${CSRC_SRC}
-    CACHE INTERNAL "")
diff --git a/bind/pybind.cpp b/bind/pybind.cpp
deleted file mode 100644
index a227bee8f5c9b147bf7c4bdcfd3c335367f36509..0000000000000000000000000000000000000000
--- a/bind/pybind.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "csrc/pybind.h"
-#include <torch/extension.h>
-
-#include <mutex>
-#include <string>
-
-std::string g_opApiSoPath;
-std::once_flag init_flag; // Flag for one-time initialization
-
-void init_op_api_so_path(const std::string& path)
-{
-    std::call_once(init_flag, [&]() { g_opApiSoPath = path; });
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("_init_op_api_so_path", &init_op_api_so_path);
-    init_common(m);
-    init_fused(m);
-    init_point(m);
-    init_preprocess(m);
-    init_detection(m);
-    init_spconv(m);
-}
diff --git a/cmake/config.cmake b/cmake/config.cmake
index cfb478e0713248f27c00e6c5f440ae2e4a90df9b..1c3690727932ee4b6b3323971448e1f7f402b7db 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -58,40 +58,8 @@ execute_process(
 set(ASCEND_TENSOR_COMPILER_PATH ${ASCEND_CANN_PACKAGE_PATH}/compiler)
 set(ASCEND_CCEC_COMPILER_PATH ${ASCEND_TENSOR_COMPILER_PATH}/ccec_compiler/bin)
 set(ASCEND_AUTOGEN_PATH ${CMAKE_BINARY_DIR}/autogen)
-set(ASCEND_KERNEL_PATH ${CMAKE_BINARY_DIR}/kernels)
-
-unset(ASCEND_CSRC_SRC CACHE)
-unset(ASCEND_HOST_SRC CACHE)
-unset(ASCEND_KERNEL_SRC CACHE)
-unset(ACLNN_SRC_CUSTOM CACHE)
-unset(ACLNN_INC_CUSTOM CACHE)
-unset(aclop_exclude CACHE)
-unset(ASCEND_ONNX_SRC CACHE)
-set(ASCEND_CSRC_SRC
-  ""
-  CACHE STRING "csrc source files")
-set(ASCEND_HOST_SRC
-  ""
-  CACHE STRING "host source files")
-set(ASCEND_KERNEL_SRC
-  ""
-  CACHE STRING "kernel source files")
-set(ACLNN_SRC_CUSTOM
-  ""
-  CACHE STRING "aclnn source files")
-set(ACLNN_INC_CUSTOM
-  ""
-  CACHE STRING "aclnn include files")
-set(aclop_exclude
-  ""
-  CACHE STRING "aclop exclude files")
-set(ASCEND_ONNX_SRC
-  ""
-  CACHE STRING "onnx source files")
-
 set(ASCEND_FRAMEWORK_TYPE tensorflow)
 file(MAKE_DIRECTORY ${ASCEND_AUTOGEN_PATH})
-file(MAKE_DIRECTORY ${ASCEND_KERNEL_PATH})
 set(CUSTOM_COMPILE_OPTIONS "custom_compile_options.ini")
 execute_process(COMMAND rm -rf ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS}
                 COMMAND touch ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS})
diff --git a/cmake/stage_0.cmake b/cmake/stage_0.cmake
deleted file mode 100644
index edac7db17d85196fef6851eca2e120e161e8b0eb..0000000000000000000000000000000000000000
--- a/cmake/stage_0.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-add_library(ascend_all_ops SHARED ${ASCEND_HOST_SRC})
-target_compile_options(ascend_all_ops PRIVATE -g -fPIC -std=c++11
-                       -D_GLIBCXX_USE_CXX11_ABI=0)
-target_include_directories(ascend_all_ops PRIVATE ${CANN_INCLUDE_PATH})
-target_link_libraries(ascend_all_ops PRIVATE intf_pub exe_graph register
-                      tiling_api ascendcl)
-add_custom_command(
-  TARGET ascend_all_ops
-  POST_BUILD
-  COMMAND ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build
-          $<TARGET_FILE:ascend_all_ops> ${ASCEND_AUTOGEN_PATH})
diff --git a/cmake/stage_1.cmake b/cmake/stage_1.cmake
deleted file mode 100644
index 502263afe6c29b90cde3bc42c2a988bdc6a866d8..0000000000000000000000000000000000000000
--- a/cmake/stage_1.cmake
+++ /dev/null
@@ -1,214 +0,0 @@
-# ===================Build proto ===================
-add_library(cust_op_proto SHARED ${ASCEND_AUTOGEN_PATH}/op_proto.cc)
-target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB)
-target_compile_options(cust_op_proto PRIVATE -fvisibility=hidden)
-target_link_libraries(
-  cust_op_proto
-  PRIVATE intf_pub
-          exe_graph
-          register
-          tiling_api
-          ascendcl
-          -Wl,--whole-archive
-          rt2_registry
-          -Wl,--no-whole-archive)
-set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME cust_opsproto_rt2.0)
-install_target(
-  TRG cust_op_proto DST
-  packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR})
-install_file(TRG cust_op_proto SRC ${ASCEND_AUTOGEN_PATH}/op_proto.h DST
-             packages/vendors/${vendor_name}/op_proto/inc)
-
-add_library(cust_optiling SHARED ${ASCEND_HOST_SRC})
-target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB)
-target_compile_options(cust_optiling PRIVATE -fvisibility=hidden)
-target_link_libraries(
-  cust_optiling
-  PRIVATE intf_pub
-          exe_graph
-          register
-          tiling_api
-          ascendcl
-          -Wl,--whole-archive
-          rt2_registry
-          -Wl,--no-whole-archive)
-set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME cust_opmaster_rt2.0)
-install_target(
-  TRG
-  cust_optiling
-  DST
-  packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}
-)
-# create liboptiling.so link
-add_custom_command(
-  TARGET cust_optiling
-  POST_BUILD
-  COMMAND
-    ${CMAKE_COMMAND} -E chdir
-    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling
-    ${CMAKE_COMMAND} -E create_symlink
-    lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$<TARGET_FILE_NAME:cust_optiling>
-    liboptiling.so)
-install(
-  FILES
-    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/liboptiling.so
-  DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling)
-
-if(${ENABLE_ONNX})
-  if(CANN_PATHS)
-    if(${ARCH} STREQUAL "aarch64")
-      protobuf_generate(
-        PROTO_FILE ${CANN_PATHS}/aarch64-linux/include/proto/ge_onnx.proto
-        OUT_DIR ${ASCEND_AUTOGEN_PATH})
-    else()
-      protobuf_generate(
-        PROTO_FILE ${CANN_PATHS}/x86_64-linux/include/proto/ge_onnx.proto
-        OUT_DIR ${ASCEND_AUTOGEN_PATH})
-    endif()
-  else()
-    protobuf_generate(
-      PROTO_FILE ${ASCEND_CANN_PACKAGE_PATH}/include/proto/ge_onnx.proto
-      OUT_DIR ${ASCEND_AUTOGEN_PATH})
-  endif()
-
-  add_library(cust_onnx_parsers SHARED ${ASCEND_ONNX_SRC})
-  target_compile_options(
-    cust_onnx_parsers
-    PRIVATE -O2 -Werror -Wno-deprecated-declarations -Dgoogle=ascend_private
-    "-fno-common" "-fno-strict-aliasing")
-  target_link_libraries(cust_onnx_parsers PRIVATE intf_pub)
-  target_include_directories(
-    cust_onnx_parsers PRIVATE ${PROJECT_SOURCE_DIR}/include
-    ${ASCEND_AUTOGEN_PATH})
-
-  install_target(TRG cust_onnx_parsers DST
-                 packages/vendors/${vendor_name}/framework/onnx/)
-endif()
-
-# ===================Build ACLNN===================
-file(GLOB ACLNN_SRC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp)
-file(GLOB ACLNN_INC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.h)
-set(ACLNN_SRC ${ACLNN_SRC_GEN} ${ACLNN_SRC_CUSTOM})
-set(ACLNN_INC ${ACLNN_INC_GEN} ${ACLNN_INC_CUSTOM})
-add_library(cust_opapi SHARED ${ACLNN_SRC})
-target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase opapi)
-install_target(TRG cust_opapi DST packages/vendors/${vendor_name}/op_api/lib)
-install_file(TRG cust_opapi SRC ${ACLNN_INC} DST
-             packages/vendors/${vendor_name}/op_api/include)
-
-# ===================Build Kernel===================
-# set custom compile options
-if("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
-  add_ops_compile_options(ALL OPTIONS -g -O0)
-endif()
-
-file(COPY ${ASCEND_KERNEL_SRC} DESTINATION ${ASCEND_KERNEL_PATH})
-
-foreach(compute_unit ${ASCEND_COMPUTE_UNIT})
-  if(EXISTS ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini)
-    # generate aic-${compute_unit}-ops-info.json
-    add_ops_info_target(
-      TARGET
-      ops_info_gen_${compute_unit}
-      OUTPUT
-      ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}/aic-${compute_unit}-ops-info.json
-      OPS_INFO
-      ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
-      INSTALL_DIR
-      packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}
-    )
-
-    # generate ascendc impl py once
-    if(NOT TARGET ascendc_impl_gen)
-      add_ops_impl_target(
-        TARGET
-        ascendc_impl_gen
-        OPS_INFO
-        ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
-        IMPL_DIR
-        ${ASCEND_KERNEL_PATH}
-        OUT_DIR
-        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl
-      )
-      install_file(
-        TRG
-        ascendc_impl_gen
-        SRC
-        ${ASCEND_KERNEL_SRC}
-        DST
-        packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic
-      )
-    endif()
-
-    # dynamic shape binary compile
-    if(${ENABLE_BINARY_PACKAGE})
-      add_bin_compile_target(
-        TARGET
-        ascendc_bin_${compute_unit}
-        OPS_INFO
-        ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
-        IMPL_DIR
-        ${ASCEND_KERNEL_PATH}
-        ADP_DIR
-        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic
-        OUT_DIR
-        ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}
-        KERNEL_DIR
-        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
-        INSTALL_DIR
-        packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
-        COMPUTE_UNIT
-        ${compute_unit})
-      add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen
-                       cust_optiling)
-    endif()
-  endif()
-endforeach()
-
-# generate npu_supported_ops.json
-add_npu_support_target(
-  TARGET
-  npu_supported_ops
-  OPS_INFO_DIR
-  ${ASCEND_AUTOGEN_PATH}
-  OUT_DIR
-  ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_info_cfg/ai_core
-  INSTALL_DIR
-  packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE})
-
-# ===================Build test===================
-# WARN: WIP
-if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases)
-  add_subdirectory(testcases)
-endif()
-
-get_system_info(SYSTEM_INFO)
-
-# gen version.info
-add_custom_target(
-  gen_version_info ALL
-  COMMAND
-    bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh
-    ${ASCEND_CANN_PACKAGE_PATH}
-    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name})
-
-install(FILES ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/version.info
-        DESTINATION packages/vendors/${vendor_name})
-
-if(COMPILE_OPP_PACKAGE)
-  # CPack config
-  set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME})
-  set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION})
-  set(CPACK_PACKAGE_DESCRIPTION "CPack opp project")
-  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project")
-  set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX})
-  set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run")
-  set(CPACK_GENERATOR External)
-  set(CPACK_CMAKE_GENERATOR "Unix Makefiles")
-  set(CPACK_EXTERNAL_ENABLE_STAGING TRUE)
-  set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake)
-  set(CPACK_EXTERNAL_BUILT_PACKAGES
-      ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}
-  )
-  include(CPack)
-endif()
diff --git a/cmake/stage_2.cmake b/cmake/stage_2.cmake
deleted file mode 100644
index 8d6da51b64b235ad2b129e3d5a37c9d16704ffd1..0000000000000000000000000000000000000000
--- a/cmake/stage_2.cmake
+++ /dev/null
@@ -1,59 +0,0 @@
-set(Python3_USE_STATIC_LIBS FALSE)
-find_package(Python3 COMPONENTS Interpreter Development)
-
-execute_process(
-  COMMAND ${Python3_EXECUTABLE} -c
-          "import os; import torch; print(os.path.dirname(torch.__file__))"
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE TORCH_PATH)
-execute_process(
-  COMMAND
-    ${Python3_EXECUTABLE} -c
-    "import os; import site; print(site.getsitepackages()[0] + '/torch_npu')"
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE TORCH_NPU_PATH)
-message("TORCH_PATH is ${TORCH_PATH}")
-message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
-
-set(EXT_CXX_FLAGS "${EXT_CXX_FLAGS}")
-separate_arguments(EXT_CXX_FLAGS)
-add_library(_C SHARED ${ASCEND_CSRC_SRC})
-set_target_properties(
-  _C
-  PROPERTIES OUTPUT_NAME "${MX_DRIVING_PATH}/_C.${Python3_SOABI}"
-             PREFIX ""
-             SUFFIX ".so")
-
-if(${COMPILE_WITH_XLA})
-  target_compile_definitions(_C PRIVATE COMPILE_WITH_XLA)
-endif()
-target_include_directories(
-  _C
-  PRIVATE ${Python3_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/include
-          ${TORCH_NPU_PATH}/include ${TORCH_PATH}/include
-          ${TORCH_PATH}/include/torch/csrc/api/include)
-target_compile_options(
-  _C
-  PRIVATE -fprofile-arcs
-          -ftest-coverage
-          -fPIC
-          $<$<CONFIG:Release>:-O3>
-          $<$<CONFIG:Debug>:-O0
-          -g>
-          -fstack-protector-all
-          -DTORCH_API_INCLUDE_EXTENSION_H
-          -DTORCH_EXTENSION_NAME=_C
-          -D_GLIBCXX_USE_CXX11_ABI=0
-          -D__FILENAME__=__FILE__
-          ${EXT_CXX_FLAGS})
-
-target_link_directories(_C PRIVATE ${TORCH_PATH}/lib ${TORCH_NPU_PATH}/lib)
-target_link_libraries(_C PRIVATE gcov c10 torch torch_python torch_npu)
-target_link_options(
-  _C
-  PRIVATE
-  $<$<STREQUAL:$<TARGET_PROPERTY:TYPE>,EXECUTABLE>:-pie>
-  $<$<CONFIG:Release>:-s>
-  -Wl,-z,relro
-  -Wl,-z,now
-  -Wl,-z,noexecstack)
diff --git a/docs/api/README.md b/docs/api/README.md
index 2986bfbec667653aac35e60d9a0cc9aaa6fc8dd2..50478d6208642c5d9767b985f42f04c78cd312e7 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -1,1455 +1,10 @@
-> Note: 以prototype标注的接口,表示该接口为预发布接口,可能会有变动,不建议在生产环境中使用。
-# 通用算子
-## scatter_max
-### 接口原型
-```python
-mx_driving.common.scatter_max(Tensor updates, Tensor indices, Tensor out=None) -> (Tensor out, Tensor argmax)
-```
-### 功能描述
-在第0维上,将输入张量`updates`中的元素按照`indices`中的索引进行分散,然后在第0维上取最大值,返回最大值和对应的索引。对于1维张量,公式如下:
-$$out_i = 
max(out_i, max_j(updates_j))$$ -$$argmax_i = argmax_j(updates_j)$$ -这里,$i = indices_j$。 -### 参数说明 -- `updates`:更新源张量,数据类型为`float32`,且 - - `updates`的第0维外其余轴合轴后必须32字节对齐。 -- `indices`:索引张量,数据类型为`int32`,且 - - `indices`的维度必须为`1`, - - `indices`第0维的长度必须与`updates`第0维的长度相同。 - - `indices`的最大值必须小于`491520`。 - - `indices`的取值必须为非负的有效索引值。 -- `out`:被更新张量,数据类型为`float32`,默认为`None`,且 - - `out`的维度必须与`updates`的维度相同。 - - `out`除第0维外其余维的长度必须与`updates`相同。 -### 返回值 -- `out`:更新后的张量,数据类型为`float32`。 -- `argmax`:最大值对应的索引张量,数据类型为`int32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.common import scatter_max -updates = torch.tensor([[2, 0, 1, 3, 1, 0, 0, 4], [0, 2, 1, 3, 0, 3, 4, 2], [1, 2, 3, 4, 4, 3, 2, 1]], dtype=torch.float32).npu() -indices = torch.tensor([0, 2, 0], dtype=torch.int32).npu() -out = updates.new_zeros((3, 8)) -out, argmax = scatter_max(updates, indices, out) -``` -## knn -### 接口原型 -```python -mx_driving.common.knn(int k, Tensor xyz, Tensor center_xyz, bool Transposed) -> Tensor -``` -### 功能描述 -对center_xyz中的每个点找到xyz中对应batch中的距离最近的k个点,并且返回此k个点的索引值。 -### 参数说明 -- `xyz(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32`。shape为`[B, N, 3]`(当Transposed=False)或`[B, 3, N]`(当Transposed=True)。其中`B`为batch size,`N`为点的数量。 -- `center_xyz(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32`。shape为`[B, npoint, 3]`(当Transposed=False)或`[B, 3, npoint]`(当Transposed=True)。其中`B`为batch size,`npoint`为点的数量。 -- `k(int)`:采样点的数量。 -- `Transposed(bool)`: 输入是否需要进行转置 -### 返回值 -- `idx(Tensor)`:采样后的索引数据,数据类型为`int32`。shape为`[B, k, npoint]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.common import knn -xyz = torch.tensor([[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]], dtype=torch.float32).npu() -center_xyz = torch.tensor([[[1, 2, 3]], [[1, 2, 3]]], dtype=torch.float32).npu() -idx = knn(2, xyz, center_xyz, False) -``` -### 算子约束 -1. k必须>0且<100。 -2. xyz中的每个batch中的任意一个点到center_xyz对应batch中的任意一个点的距离必须在1e10f以内。 -3. xyz和center_xyz的shape必须是3维,当Transposed=True时,xyz和center_xyz的shape的dim的第1维必须是3;当Transposed=False时,xyz和center_xyz的shape的dim的第2维必须是3。 -4. 
由于距离相同时排序为不稳定排序,存在距离精度通过但索引精度错误问题,与竞品无法完全对齐。 - -## scatter_mean -### 接口原型 -```python -mx_driving.common.scatter_mean(Tensor src, Tensor indices, int dim=0, Tensor out=None, int dim_size=None) -> Tensor -``` -### 功能描述 -将输入张量`src`中的元素按照`indices`中的索引在指定的`dim`维进行分组,并计算每组的平均值,返回平均值。 -### 参数说明 -- `src`:源张量,数据类型为`float32`。 -- `indices`:索引张量,数据类型为`int32`。 -- `out`:被更新张量,数据类型为`float32`,可选入参,默认为`None`,输入`out`不为`None`时,`out`中的元素参与平均值的计算。 -- `dim`:指定的维度,表示按照哪个维度进行分组平均计算,数据类型为`int32`,可选入参,默认取值为`0`。 -- `dim_size`:输出张量在`dim`维的长度,数据类型为`int32`,可选入参,默认为`None`,该参数仅在输入`out`为`None`时生效。 -### 返回值 -- `out`:求平均后的张量,数据类型为`float32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 - -```python -import torch, torch_npu -from mx_driving.common import scatter_mean -src = torch.randn(4, 5, 6).to(torch.float) -indices = torch.randint(5, (4, 5)).to(torch.int32) -dim = 0 -src.requires_grad = True -out = scatter_mean(src.npu(), indices.npu(), None, dim) -grad_out_tensor = torch.ones_like(out) -out.backward(grad_out_tensor) -``` -### 算子约束 -- `indices`的维度必须小于等于`src`的维度,且每一维的长度均必须与`src`长度相同。 -- `indices`的取值必须为非负的有效索引值,参数`out`或`data_size`不为`None`时,`indices`的取值应该为输出张量在`dim`维的有效索引值。 -- `out`的维度必须与`src`的维度相同,且除第`dim`维外其余维的长度必须与`src`相同。 -- `dim`取值不能超过`indices`的维度。 -- `dim_size`的取值必须为非负的有效长度值。 -- `src`和`out`不支持`inf`、`-inf`和`nan`。 -### 其他说明 -- 该算子对尾块较大的场景较为亲和,对尾块很小的场景不亲和,其中,尾块表示`src`后`N`维的大小,`N = src.dim() - indices.dim()`。 - -## three_interpolate -### 接口原型 -```python -mx_driving.common.three_interpolate(features: torch.Tensor, indices: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: -``` -### 功能描述 -对三维数据进行加权最近邻线性插值处理 -### 参数说明 -- `features`:需要被插值的特征,数据类型为`float32|float16`,维度为(B, C, M)。 -- `indices`:获取目标特征计算的索引,数据类型为`int32`,维度为(B, N, 3), - - `indices`的元素值需小于`features`的第三维度,即值在[0, M)。 -- `weight`:获取目标特征计算的权重,数据类型为`float32|float16`,维度为(B, N, 3)。 - - `weight`数据类型与`features`须一致。 -- `features`,`indices`,`weights`三个参数的每个维度须小于10000。 -- `features`,`indices`,`weights`三个参数的大小请勿超过2^24。 -### 返回值 -- `output`:目标特征张量,数据类型为`float32|float16`,维度为(B, C, N)。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -from mx_driving.common import three_interpolate - - -features = torch.tensor( - [[[2.4350, 4.7516, 4.4995, 2.4350, 2.4350, 2.4350], - [3.1236, 2.6278, 3.0447, 3.1236, 3.1236, 3.1236], - [2.6732, 2.8677, 2.6436, 2.6732, 2.6732, 2.6732], - [0.0124, 7.0150, 7.0199, 0.0124, 0.0124, 0.0124], - [0.3207, 0.0000, 0.3411, 0.3207, 0.3207, 0.3207]], - [[0.0000, 0.9544, 2.4532, 0.0000, 0.0000, 0.0000], - [0.5346, 1.9176, 1.4715, 0.5346, 0.5346, 0.5346], - [0.0000, 0.2744, 2.0842, 0.0000, 0.0000, 0.0000], - [0.3414, 1.5063, 1.6209, 0.3414, 0.3414, 0.3414], - [0.5814, 0.0103, 0.0000, 0.5814, 0.5814, 0.5814]]], - ).npu() -idx = torch.tensor( - [[[0, 1, 2], [2, 3, 4], [2, 3, 4], [0, 1, 2], [0, 1, 2], [0, 1, 3]], - [[0, 2, 3], [1, 3, 4], [2, 1, 4], [0, 2, 4], [0, 2, 4], [0, 1, 2]]], - ).int().npu() -weight = torch.tensor( - [[[3.3333e-01, 3.3333e-01, 3.3333e-01], - [1.0000e+00, 5.8155e-08, 2.2373e-08], - [1.0000e+00, 1.7737e-08, 1.7356e-08], - [3.3333e-01, 3.3333e-01, 3.3333e-01], - [3.3333e-01, 3.3333e-01, 3.3333e-01], - [3.3333e-01, 3.3333e-01, 3.3333e-01]], - [[3.3333e-01, 3.3333e-01, 3.3333e-01], - [1.0000e+00, 1.3651e-08, 7.7312e-09], - [1.0000e+00, 1.7148e-08, 1.4070e-08], - [3.3333e-01, 3.3333e-01, 3.3333e-01], - [3.3333e-01, 3.3333e-01, 3.3333e-01], - [3.3333e-01, 3.3333e-01, 3.3333e-01]]], - ).npu() -output = three_interpolate(features, idx, weight) -``` - - -## three_nn -### 接口原型 -```python -mx_driving.common.three_nn(Tensor target, Tensor 
source) -> (Tensor dist, Tensor idx) -``` -### 功能描述 -对target中的每个点找到source中对应batch中的距离最近的3个点,并且返回此3个点的距离和索引值。 -### 参数说明 -- `target(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32/float16`。shape为`[B, npoint, 3]`。其中`B`为batch size,`npoint`为点的数量。 -- `source(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32/float16`。shape为`[B, N, 3]`。其中`B`为batch size,`N`为点的数量。 -### 返回值 -- `dist(Tensor)`:采样后的索引数据,数据类型为`float32/float16`。shape为`[B, npoint, 3]`。 -- `idx(Tensor)`:采样后的索引数据,数据类型为`int32/int32`。shape为`[B, npoint, 3]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.common import three_nn -source = torch.tensor([[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]], dtype=torch.float32).npu() -target = torch.tensor([[[1, 2, 3]], [[1, 2, 3]]], dtype=torch.float32).npu() -dist, idx = three_nn(target, source) -``` -### 算子约束 -1. source和target的shape必须是3维,且source和target的shape的dim的第2维必须是3。 -2. 距离相同时排序为不稳定排序,存在距离精度通过但索引精度错误问题,与竞品无法完全对齐。 - - -## hypot -### 接口原型 -```python -mx_driving.common.hypot(Tensor input, Tensor other) -> Tensor -``` -### 功能描述 -给出直角三角形的两边,返回它的斜边。 -### 参数说明 -- `input(Tensor)`:代表直角三角形第一条直角边的输入张量,数据类型为`float32`。 -- `other(Tensor)`:代表直角三角形第二条直角边的输入张量,数据类型为`float32`。 -### 返回值 -- `Tensor`:经过计算后的直角三角形斜边,数据类型为`float32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.common import hypot -input = torch.tensor([3,3,3], dtype=torch.float32).npu() -other = torch.tensor([4,4,4], dtype=torch.float32).npu() -out = hypot(input, other) # tensor([5.,5.,5.]) -``` -### 算子约束 -1. input和other的shape必须是可广播的。 - - -## assign_score_withk -### 接口原型 -```python -mx_driving.common.assign_score_withk(Tensor scores, Tensor point_features, Tensor center_features, Tensor knn_idx, str aggregate='sum') -> Tensor -``` -### 功能描述 -根据`knn_idx`得到采样点及其邻居点的索引,计算`point_features`和`center_features`的差,并与`scores`相乘后在特征维度进行聚合,返回采样点的特征。 -### 参数说明 -- `scores(Tensor)`:权重矩阵的重要系数,数据类型为`float32`。Shape为`[B, npoint, K, M]`,其中`B`为batch size,`npoint`为采样点的数量,`K`为一个样本点及其邻居点的数量之和,`M`为权重矩阵集合的规模。 -- `point_features(Tensor)`:所有点的特征,数据类型为`float32`。Shape为`[B, N, M, O]`,其中`N`为所有点的数量,`O`为特征数量。 -- `center_features(Tensor)`:所有点的中心特征,数据类型为`float32`。Shape为`[B, N, M, O]`。 -- `knn_idx[Tensor]`:采样点及其邻居点的索引,数据类型为`int64`。Shape为`[B, npoint, K]`。 -- `aggregate`:聚合方式,默认为`sum`,数据类型为`str`。 -### 返回值 -- `output`:聚合后采样点的特征,数据类型为`float32`。Shape为`[B, O, npoint, K]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 - -```python -import torch, torch_npu -from mx_driving.common import assign_score_withk -points = np.random.rand(4, 100, 8, 16).astype(np.float32) -centers = np.random.rand(4, 100, 8, 16).astype(np.float32) -scores = np.random.rand(4, 64, 10, 8).astype(np.float32) -knn_idx = np.random.randint(0, N, size=(4, 64, 10)).astype(np.int64) -output = assign_score_withk(torch.from_numpy(scores).npu(), - torch.from_numpy(points).npu(), - torch.from_numpy(centers).npu(), - torch.from_numpy(knn_idx).npu(), - "sum") -``` -### 算子约束 -- `npoint`和`K`都不大于`N`。 - - -# 数据预处理算子 -## npu_points_in_box -### 接口原型 -```python -mx_driving.preprocess.npu_points_in_box(Tensor boxes, Tensor points) -> Tensor -``` -### 功能描述 -判断点是否在框内。 -### 参数说明 -- `boxes(Tensor)`:框张量,数据类型为`float32`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -- `points(Tensor)`:点张量,数据类型为`float32`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 -### 返回值 -- `boxes_idx_of_points(Tensor)`:点在框内的索引张量,数据类型为`int32`。shape 为`[B, N]`。 -### 约束说明 -- `boxes`和`points`的`B`必须相同,且只能为`1`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, 
torch_npu -from mx_driving.preprocess import npu_points_in_box -boxes = torch.tensor([[[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]]], dtype=torch.float32).npu() -points = torch.tensor([[[1, 2, 3], [3, 4, 5]]], dtype=torch.float32).npu() -out = npu_points_in_box(boxes, points) -``` - -## npu_points_in_box_all -Note: 该接口命名将于2025年改为`points_in_boxes_all`。 -### 接口原型 -```python -mx_driving.preprocess.npu_points_in_box_all(Tensor boxes, Tensor points) -> Tensor -``` -### 功能描述 -判断点是否在框内。 -### 参数说明 -- `boxes(Tensor)`:框张量,数据类型为`float32`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -- `points(Tensor)`:点张量,数据类型为`float32`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 -### 返回值 -- `boxes_idx_of_points(Tensor)`:同一`batch`下,各点是否在各框内的张量,数据类型为`int32`。shape 为`[B, N, M]`。 -### 约束说明 -- `boxes`和`points`的`B`必须相同。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.preprocess import npu_points_in_box_all -boxes = torch.tensor([[[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]]], dtype=torch.float32).npu() -points = torch.tensor([[[1, 2, 5], [3, 4, 8]]], dtype=torch.float32).npu() -out = npu_points_in_box_all(boxes, points) -``` - -## RoipointPool3d -### 接口原型 -```python -mx_driving.preprocess.RoipointPool3d(int num_sampled_points, Tensor points, Tensor point_features, Tensor boxes3d) -> (Tensor pooled_features, Tensor pooled_empty_flag) -``` -### 功能描述 -对每个3D方案的几何特定特征进行编码。 -### 参数说明 -- `num_sampled_points(int)`:特征点的数量,正整数。 -- `points(Tensor)`:点张量,数据类型为`float32, float16`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 -- `point_features(Tensor)`:点特征张量,数据类型为`float32, float16`。shape 为`[B, N, C]`。`C`分别代表`x, y, z`。 -- `boxes3d(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -### 返回值 -- `pooled_features(Tensor)`:点在框内的特征张量,数据类型为`float32, float16`。shape 为`[B, M, num, 3+C]`。 -- `pooled_empty_flag(Tensor)`:所有点不在框内的空标记张量,数据类型为`int32`。shape 为`[B, M]`。 -### 约束说明 -- `points`、`point_features`和`boxes3d`的数据类型必须相同,以及`B`也必须相同。 -- `num_sampled_points`必须小于等于`N`。 -- 数据类型为`float32`时,建议`B`小于100、`N`小于等于2640、`M`小于等于48、`num_sampled_points`小于等于48,个别shape值略微超过建议值无影响,但所有shape值均大于建议值时,算子执行会发生错误。 -- 数据类型为`float16`时,建议`B`小于100、`N`小于等于3360、`M`小于等于60、`num_sampled_points`小于等于60,个别shape值略微超过建议值无影响,但所有shape值均大于建议值时,算子执行会发生错误。 -- `N`/`M`的值越大,性能劣化越严重,建议`N`小于`M`的六百倍,否则性能可能会低于0.1x A100。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.preprocess import RoIPointPool3d -num_sampled_points = 1 -points = torch.tensor([[[1, 2, 3]]], dtype=torch.float).npu() -point_features = points.clone() -boxes3d = torch.tensor([[[1, 2, 3, 4, 5, 6, 1]]], dtype=torch.float).npu() -roipoint_pool3d = RoIPointPool3d(num_sampled_points) -pooled_features, pooled_empty_flag = roipoint_pool3d(points, point_features, boxes3d) -``` - - -# 目标检测算子 -## npu_boxes_overlap_bev -Note: 该接口命名将于2025年改为`boxes_overlap_bev`。 -### 接口原型 -```python -mx_driving.detection.npu_boxes_overlap_bev(Tensor boxes_a, Tensor boxes_b) -> Tensor -``` -### 功能描述 -计算bev视角下中两个边界框的重叠面积。 -### 参数说明 -- `boxes_a (Tensor)`:第一组bounding boxes,数据类型为`float32`。shape为`[M, 5]`。其中`5`分别代表`x1, y1, x2, y2, angle`, `x1, y1, x2, y2`代表box四个顶点的横纵坐标,`angle`代表box的弧度制旋转角。 -- `boxes_b (Tensor)`:第二组bounding boxes,数据类型为`float32`。shape为`[N, 5]`。其中`5`分别代表`x1, y1, x2, y2, angle`, `x1, y1, x2, y2`代表box四个顶点的横纵坐标,`angle`代表box的弧度制旋转角。 -### 返回值 -- `area_overlap(Tensor)`:包含两组bounding boxes交叠面积的张量,数据类型为`float32`。shape为`[M, N]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from 
mx_driving.detection import npu_boxes_overlap_bev -boxes_a = torch.tensor([[0, 0, 2, 2, 0]], dtype=torch.float32).npu() -boxes_b = torch.tensor([[1, 1, 3, 3, 0]], dtype=torch.float32).npu() -area_overlap = npu_boxes_overlap_bev(boxes_a, boxes_b) -``` -## box_iou_quadri -### 接口原型 -```python -mx_driving.detection.box_iou_quadri(Tensor boxes_a, Tensor boxes_b, str mode='iou', bool aligned=False) -> Tensor -``` -### 功能描述 -计算两个边界框的IoU。 -### 参数说明 -- `boxes_a (Tensor)`:第一组bounding boxes,数据类型为`float32`。shape为`[M, 8]`。其中`8`分别代表`x1, y1, x2, y2, x3, y3, x4, y4`, 表示box四个顶点的横纵坐标。 -- `boxes_b (Tensor)`:第二组bounding boxes,数据类型为`float32`。shape为`[N, 8]`。其中`8`分别代表`x1, y1, x2, y2, x3, y3, x4, y4`, 表示box四个顶点的横纵坐标。 -- `mode (str)`:取值为`"iou"`时,计算IoU(intersection over union);取值为`"iof"`时,计算IoF(intersection over foregroud)。 -- `aligned (bool)`:取值为`True`时,只计算配对的box之间的结果;取值为`False`时,计算每对box之间的结果。 -### 返回值 -- `ious(Tensor)`:包含两组bounding boxes的IoU(`mode="iou"`)或IoF(`mode="iof"`)的张量,数据类型为`float32`。shape为`[M]`(`aligned=True`)或`[M, N]`(`aligned=False`)。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.detection import box_iou_quadri -boxes_a = torch.tensor([[7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0]], dtype=torch.float32).npu() -boxes_b = torch.tensor([[7.0, 6.0, 7.0, 8.0, 9.0, 8.0, 9.0, 6.0]], dtype=torch.float32).npu() -ious = box_iou_quadri(boxes_a, boxes_b, mode="iou", aligned=False) -``` -## npu_nms3d -### 接口原型 -```python -mx_driving.detection.npu_nms3d(Tensor boxes, Tensor scores, float: iou_threshold) -> Tensor -``` -### 功能描述 -3D非极大值抑制,在bev视角下剔除多个3d box交并比大于阈值的box。 -### 参数说明 -- `boxes(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[N, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -- `scores(Tensor)`:评分张量,数据类型为`float32, float16`。shape 为`[N]`。 -- `iou_threshold(float)`:IoU阈值。 -### 返回值 -- `Tensor`:NMS后的框张量,数据类型为`int32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.detection import npu_nms3d -boxes = torch.tensor([[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]], dtype=torch.float32).npu() -scores = torch.tensor([1, 2], dtype=torch.float32).npu() -out = npu_nms3d(boxes, scores, 0.5) -``` -## npu_nms3d_normal -### 接口原型 -```python -mx_driving.detection.npu_nms3d_normal(Tensor boxes, Tensor scores, float: iou_threshold) -> Tensor -``` -### 功能描述 -3D非极大值抑制。 -### 参数说明 -- `boxes(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[N, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 -- `scores(Tensor)`:评分张量,数据类型为`float32, float16`。shape 为`[N]`。 -- `iou_threshold(float)`:IoU阈值。 -### 返回值 -- `Tensor`:NMS后的框张量,数据类型为`int32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.detection import npu_nms3d_normal -boxes = torch.tensor([[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]], dtype=torch.float32).npu() -scores = torch.tensor([1, 2], dtype=torch.float32).npu() -out = npu_nms3d_normal(boxes, scores, 0.5) -``` -## npu_rotated_iou -### 接口原型 -```python -mx_driving.detection.npu_rotated_iou(Tensor self, Tensor query_boxes, bool trans=False, int mode=0, bool is_cross=True, float v_threshold=0.0, float e_threshold=0.0) -> Tensor -``` -### 功能描述 -计算旋转框的IoU。 -### 参数说明 -- `self(Tensor)`:边界框张量,数据类型为`float32, float16`,形状为`[B, N, 5]`。 -- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, M, 5]`。 -- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`,其中`t`为角度制。 -- `is_cross(bool)`:值为`True`时,则对两组边界框中每个边界框之间进行计算。值为`False`时,只对对齐的边界框之间进行计算。 -- 
`mode(int)`:计算IoU的模式。默认值为`0`。值为`0`时,表示计算`IoU`,值为`1`时,表示计算`IoF`。 -- `v_threshold(float)`:顶点判断的容忍阈值。 -- `e_threshold(float)`:边相交判断的容忍阈值。 -### 返回值 -- `Tensor`:IoU张量,数据类型为`float32, float16`,`is_cross`为`True`时形状为`[B, N, M],反之则为`[B, N]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -import numpy as np -from mx_driving.detection import npu_rotated_iou -a = np.random.uniform(0, 1, (2, 2, 5)).astype(np.float16) -b = np.random.uniform(0, 1, (2, 3, 5)).astype(np.float16) -box1 = torch.from_numpy(a).npu() -box2 = torch.from_numpy(b).npu() -iou = npu_rotated_iou(box1, box2, False, 0, True, 1e-5, 1e-5) -``` -## npu_rotated_overlaps -### 接口原型 -```python -mx_driving.detection.npu_rotated_overlaps(Tensor self, Tensor query_boxes, bool trans=False) -> Tensor -``` -### 功能描述 -计算旋转框的重叠面积。 -### 参数说明 -- `self(Tensor)`:边界框张量,数据类型为`float32, float16`,形状为`[B, N, 5]`。 -- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, M, 5]`。 -- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`。 -### 返回值 -- `Tensor`:重叠面积张量,数据类型为`float32, float16`,形状为`[B, N, M]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -import numpy as np -from mx_driving.detection import npu_rotated_overlaps -a = np.random.uniform(0, 1, (1, 3, 5)).astype(np.float16) -b = np.random.uniform(0, 1, (1, 2, 5)).astype(np.float16) -box1 = torch.from_numpy(a).npu() -box2 = torch.from_numpy(b).npu() -output = npu_rotated_overlaps(box1, box2, True) -``` -## roi_align_rotated[beta] -### 接口原型 -```python -mx_driving.detection.roi_align_rotated(Tensor feature_map, Tensor rois, float: spatial_scale, - int: sampling_ratio, int: pooled_height, int: pooled_width, bool: aligned, bool: clockwise) -> Tensor -``` -### 功能描述 -计算旋转候选框的RoI Align池化特征图。 -### 参数说明 -- `feature map(Tensor)`:特征图张量,数据类型为`float32`,形状为`[B, C, H, W]`。 -- `rois(Tensor)`:感兴趣区域张量,数据类型为`float32`,形状为`[n, 6]`。 -- `spatial_scale(float)`:感兴趣区域边界框的缩放率,数据类型为`float32`。 -- `sampling_ratio(int)`:采样率,数据类型为`int`。取值范围为非负整数。 -- `pooled_height(int)`:池化特征图高度,数据类型为`int`。 -- `pooled_width(int)`:池化特征图宽度,数据类型为`int`。 -- `aligned(bool)`:是否对齐,数据类型为`bool`。值为`True`时,表示对齐, 值为`False`时,表示不对齐。 -- `clockwise(bool)`:旋转候选框的旋转方向,数据类型为`bool`。值为`True`时,表示逆时针旋转,值为`False`时,表示顺时针旋转。 -### 返回值 -- `Tensor`:池化特征图张量,数据类型为`float32`,形状为`[n, C, pooled_height, pooled_width]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import math -import torch, torch_npu -import numpy as np -from mx_driving.detection import roi_align_rotated - -feature_map = torch.rand([1, 3, 16, 16]) -feature_map.requires_grad = True -rois = torch.Tensor(6, 8) -rois[0] = torch.randint(0, 1, (8,)) -rois[1].uniform_(0, 16) -rois[2].uniform_(0, 16) -rois[3].uniform_(0, 16) -rois[4].uniform_(0, 16) -rois[5].uniform_(0, math.pi) - -output = roi_align_rotated(feature_map.npu(), rois.npu(), 1, 1, 7, 7, True, True) -output.backward(torch.ones_like(output)) -``` -### 其他说明 -在双线性插值采样过程中,当采样点`x`接近`-1`或`W`位置,`y`接近`-1`或`H`位置时,由于平台差异和计算误差,可能导致该采样点的精度无法与竞品精度完全对齐。 - -## roiaware_pool3d -### 接口原型 -```python -mx_driving.detection.roiaware_pool3d(Tensor rois, Tensor pts, Tensor pts_feature, - Union[int, tuple] out_size, int max_pts_per_voxel, int mode) -> Tensor -``` -### 功能描述 -将输入的点云特征在ROI框内进行池化 -### 参数说明 -- `rois (Tensor)`:输入的RoI框坐标与尺寸,数据类型为`float32/float16`,shape为`[Roi_num, 7]`。 -- `pts (Tensor)`:输入的点云坐标,数据类型为`float32/float16`,shape为`[Pts_num, 3]`。 -- `pts_feature (Tensor)`:输入的点的特征向量,数据类型为`float32/float16`,shape为`[Pts_num, Channels]`。 -- `out_size 
(Union)`:输出的RoI框内voxel的尺寸,数据类型为`int`或者`tuple`,shape为`[out_x, out_y, out_z]`。 -- `max_pts_per_voxel (int)`:每个voxel内最大的点的个数,数据类型为`int`。 -- `mode (int)`:池化的方式,0为maxpool, 1为avgpool,数据类型为`int`。 -### 返回值 -- `pooled_features (Tensor)`:池化得到的RoI框特征,数据类型为`float32/float16`,shape为`[Roi_num, out_x, out_y, out_z, Channels]`。 -### 约束说明 -- Roi_num <= 100 -- Pts_num <= 1000 -- Channels <= 1024 -- 1 <= max_pts_per_voxel <=256,max_pts_per_voxel <= Pts_num -- 反向具有相同约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import math -import torch_npu -import mx_driving.detection - -out_size = (5, 5, 5) -max_pts_per_voxel = 128 -mode = 1 - -N = 40 -npoints = 1000 -channels = 1024 - -xyz_coor = np.random.uniform(-1, 1, size = (N, 3)).astype(np.float32) -xyz_size_num = np.random.uniform(5, 50, size = (1, 3)) -xyz_size = (xyz_size_num * np.ones((N, 3))).astype(np.float32) -angle = np.radians(np.random.randint(0, 360, size = (N , 1))).astype(np.float32) - -rois = np.concatenate((xyz_coor, xyz_size), axis=1) -rois = np.concatenate((rois, angle), axis=1) - -pts = np.random.uniform(-5, 5, size = (npoints, 3)).astype(np.float32) -pts_feature = np.random.uniform(-1, 1, size=(npoints, channels)).astype(np.float32) - -pooled_features_npu = mx_driving.detection.roiaware_pool3d(torch.tensor(rois).npu(), torch.tensor(pts).npu(), - torch.tensor(pts_feature).npu(), out_size, max_pts_per_voxel, mode) -``` - -## border_align -### 接口原型 -```python -mx_driving.detection.border_align(Tensor feature_map, Tensor rois, int pooled_size) -> Tensor -``` -### 功能描述 -对输入的RoI框进行边缘特征提取。 -### 参数说明 -- `feature_map (Tensor)`:输入的特征图,数据类型为`float32`,shape为`[Batch_size, Channels, Height, Width]`。 -- `rois (Tensor)`:输入的RoI框坐标,数据类型为`int32`,shape为`[Batch_size, Height * Width, 4]`。 -- `pooled_size (int)`:在每条边上的采样点数,数据类型为`int`。 -### 返回值 -- `out_features (Tensor)`:提取到的RoI框特征,数据类型为`float32`,shape为`[Batch_size, Channels / 4, Height * Width, 4]`。 -### 约束说明 -- Batch_size <= 128 -- Channels <= 8192, Channels % 4 == 0 -- Height <= 256, Width <= 256 -- 2 <= pooled_size <= 20 -- 反向具有相同约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import torch_npu -import numpy as np -from mx_driving.detection import border_align - -def generate_features(feature_shape): - features = torch.rand(feature_shape) - return features - -def generate_rois(inputs): - num_boxes = inputs.shape[0] * inputs.shape[2] * inputs.shape[3] - xyxy = torch.rand(num_boxes, 4) - xyxy[:, 0::2] = xyxy[:, 0::2] * inputs.size(3) - xyxy[:, 1::2] = xyxy[:, 1::2] * inputs.size(2) - xyxy[:, 2:] = xyxy[:, 0:2] + xyxy[:, 2:] - rois = xyxy.view(inputs.shape[0], -1, 4).contiguous() - return rois - -batch_size = 2 -input_channels = 16 -input_height = 8 -input_width = 8 -pooled_size = 3 -features = generate_features([batch_size, input_channels, input_height, input_width]) -rois = generate_rois(features) -output = border_align(features.npu(), rois.npu(), pooled_size) -``` - -## pixel_group -### 接口原型 -```python -mx_driving.detection.pixel_group(Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, Tensor kernel_contour, int kernel_region_num, float distance_threshold) -> List[List] -``` -### 功能描述 -根据像素之间的嵌入向量和距离,将未被分组的像素分组。 -### 参数说明 -- `score (Tensor)`:前景得分矩阵,数据类型为`float32`,shape为`[Height, Width]`。 -- `mask (Tensor)`:前景掩码矩阵,数据类型为`bool`,shape为`[Height, Width]`。 -- `embedding (Tensor)`:特征向量,数据类型为`float32`,shape为`[Height, Width, Embedding_dim]`。 -- `kernel_label (Tensor)`:像素的实例标签,数据类型为`int32`,shape为`[Height, Width]`。 -- `kernel_contour 
(Tensor)`:内核的边界像素,数据类型为`uint8`,shape为`[Height, Width]`。 -- `kernel_region_num`:不同内核(分组)的数量,数据类型为`int`。 -- `distance_threshold`:嵌入向量的距离阈值,数据类型为`float`。 -### 返回值 -- `pixel_assignment (List)`:像素的分组信息,数据类型为`float32`,length为入参`kernel_region_num`。 -### 约束说明 -- mask = score > 0.5 -- `score`的取值范围在`[0, 1]`之间 -- `kernel_label`的最大值为`kernel_region_num`-1 -- `kernel_contour`的取值非0即1 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -import numpy as np -from mx_driving.detection import pixel_group -H, W, dim, num = 10, 10, 8, 3 -score = np.random.uniform(0, 1, [H, W]).astype(np.float32) -score = torch.from_numpy(score).npu() -mask = (score) > 0.5 -embedding = np.random.uniform(0, 10, [H, W, dim]).astype(np.float32) -embedding = torch.from_numpy(embedding).npu() -kernel_label = np.random.uniform(0, num, [H, W]).astype(np.int32) -kernel_label = torch.from_numpy(kernel_label).npu() -kernel_contour = np.random.uniform(0, 1, [H, W]).astype(np.uint8) -kernel_contour = torch.from_numpy(kernel_contour).npu() -kernel_region_num = num -distance_threshold = float(0.8) - -output = pixel_group(score, mask, embedding, kernel_label, kernel_contour, kernel_region_num, distance_threshold) -``` - -# 融合算子 -## multi_scale_deformable_attn(MultiScaleDeformableAttnFunction.Apply) -### 接口原型 -```python -mx_driving.fused.multi_scale_deformable_attn(Tensor value, Tensor value_spatial_shapes, Tensor value_level_start_index, Tensor sampling_locations, Tensor attention_weights) -> Tensor -``` -### 功能描述 -多尺度可变形注意力机制, 将多个视角的特征图进行融合。 -### 参数说明 -- `value(Tensor)`:特征张量,数据类型为`float32, float16`。shape为`[bs, num_keys, num_heads, embed_dims]`。其中`bs`为batch size,`num_keys`为特征图的大小,`num_heads`为头的数量,`embed_dims`为特征图的维度,其中`embed_dims`需要为8的倍数。 -- `value_spatial_shapes(Tensor)`:特征图的形状,数据类型为`int32, int64`。shape为`[num_levels, 2]`。其中`num_levels`为特征图的数量,`2`分别代表`H, W`。 -- `value_level_start_index(Tensor)`:偏移量张量,数据类型为`int32, int64`。shape为`[num_levels]`。 -- `sampling_locations(Tensor)`:位置张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads, num_levels, num_points, 2]`。其中`bs`为batch size,`num_queries`为查询的数量,`num_heads`为头的数量,`num_levels`为特征图的数量,`num_points`为采样点的数量,`2`分别代表`y, x`。 -- `attention_weights(Tensor)`:权重张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads, num_levels, num_points]`。其中`bs`为batch size,`num_queries`为查询的数量,`num_heads`为头的数量,`num_levels`为特征图的数量,`num_points`为采样点的数量。 -### 返回值 -- `output(Tensor)`:融合后的特征张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads*embed_dims]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `locations`的值在`[0, 1]`之间。 -- 当前版本只支持`num_keys` ≤ 8,`num_heads` ≤ 8,`embed_dims` == 16或32,`num_points` = 1或偶数。 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.fused import multi_scale_deformable_attn -bs, num_levels, num_heads, num_points, num_queries, embed_dims = 1, 1, 4, 8, 16, 32 - -shapes = torch.as_tensor([(100, 100)], dtype=torch.long) -num_keys = sum((H * W).item() for H, W in shapes) - -value = torch.rand(bs, num_keys, num_heads, embed_dims) * 0.01 -sampling_locations = torch.ones(bs, num_queries, num_heads, num_levels, num_points, 2) * 0.005 -attention_weights = torch.rand(bs, num_queries, num_heads, num_levels, num_points) + 1e-5 -level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) - -out = multi_scale_deformable_attn(value.npu(), shapes.npu(), level_start_index.npu(), sampling_locations.npu(), attention_weights.npu()) -``` - -## npu_max_pool2d -### 接口原型 -```python -mx_driving.fused.npu_max_pool2d(Tensor x, int kernel_size, int 
stride, int padding) -> Tensor -``` -### 功能描述 -对输入进行最大池化,并输出最大池化值。 -### 参数说明 -- `x (Tensor)`:一组待池化对象,数据类型为`float32`,format为NCHW,输入数据量不超过10亿。 -### 返回值 -- `y (Tensor)`:池化后的最大值,数据类型为`float32`,format为NCHW。 -### 约束说明 -kernel_size仅支持3,stride仅支持2,padding仅支持1,且输入C轴数据量要求为8的倍数,H和W需要大于100。 -性能在C值较大的场景下较优,建议使用规格为C>=64。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.fused import npu_max_pool2d -kernel_size = 3 -stride = 2 -padding = 1 -x = torch.randn(18, 64, 464, 800).npu() -res = npu_max_pool2d(x, kernel_size, stride, padding) -``` - -## npu_deformable_aggregation -### 接口原型 -```python -mx_driving.fused.npu_deformable_aggregation(Tensor feature_maps, Tensor spatial_shape, Tensor scale_start_index, Tensor sample_locations, Tensor weight) -> Tensor -``` -### 功能描述 -可变形聚合,对于每个锚点实例,对多个关键点的多时间戳、视图、缩放特征进行稀疏采样后分层融合为实例特征,实现精确的锚点细化。 -### 参数说明 -- `feature_maps(Tensor)`:特征张量,数据类型为`float32`。shape为`[bs, num_feat, c]`。其中`bs`为batch size,`num_feat`为特征图的大小,`c`为特征图的维度。 -- `spatial_shape(Tensor)`:特征图的形状,数据类型为`int32`。shape为`[cam, scale, 2]`。其中`cam`为相机数量,其中`scale`为每个相机的特征图数量,`2`分别代表H, W。 -- `scale_start_index(Tensor)`:每个特征图的偏移位置张量,数据类型为`int32`。shape为`[cam, scale]`,其中`cam`为相机数量,其中`scale`每个相机的特征图数量。 -- `sample_locations(Tensor)`:位置张量,数据类型为`float32`。shape为`[bs, anchor, pts, cam, 2]`。其中`bs`为batch size,`anchor`为锚点数量,`pts`为采样点的数量,`cam`为相机的数量,`2`分别代表y, x。 -- `weight(Tensor)`:权重张量,数据类型为`float32`。shape为`[bs, anchor, pts, cam, scale, group]`。其中`bs`为batch size,`anchor`为锚点数量,`pts`为采样点的数量,`cam`为相机的数量,`scale`每个相机的特征图数量,`group`为分组数。 -### 返回值 -- `output(Tensor)`:输出结果张量,数据类型为`float32`。shape为`[bs, anchor, c]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- bs <= 128 -- num_feat的值为spatial_shape中每幅图的特征数量之和 -- c <= 256,且为group的整数倍 -- cam <= 6 -- scale <= 4 -- anchor <= 2048 -- pts <= 2048 -- group <= 32,且为2的指数倍 -- sample_locations的值在[0, 1]之间。 -- 每个输入tensor的数据量不超过1.5亿。 -- 反向具有相同约束。 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.fused import npu_deformable_aggregation - -bs, num_feat, c, cam, anchor, pts, scale, group = 1, 2816, 256, 1, 10, 2000, 1, 8 - -feature_maps = torch.ones_like(torch.randn(bs,num_feat ,c)).to(torch.float16) -spatial_shape = torch.tensor([[[32, 88]]]) -scale_start_index = torch.tensor([[0]]) -sampling_location = torch.rand(bs, anchor, pts, cam, 2) -weights = torch.randn(bs, anchor, pts, cam, scale, group) - -out = npu_deformable_aggregation(feature_maps.npu(), spatial_shape.npu(), scale_start_index.npu(), sampling_location.npu(), weights.npu()) -``` - -## deform_conv2d(DeformConv2dFunction.apply) -### 接口原型 -```python -mx_driving.fused.deform_conv2d(Tensor x, Tensor offset, Tensor weight, Union[int, Tuple[int, ...]] stride, Union[int, Tuple[int, ...]] padding, Union[int, Tuple[int, ...]] dilation, int groups, int deformable_groups) -> Tensor -``` -### 功能描述 -可变形卷积。 -### 参数说明 -- `x(Tensor)`:输入特征,数据类型为`float32`,shape为`(n, c_in, h_in, w_in)`,其中`n`为 batch size,`c_in`为输入特征的通道数量,`h_in`为输入特征图的高,`w_in`为输入特征图的宽。 -- `offset(Tensor)`:偏移量,数据类型为`float32`,shape 为`(n, 2 * k * k, h_out, w_out)`,其中`n`为 batch size,`k` 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 -- `weight(Tensor)`:卷积核权重,数据类型为`float32`,shape 为 `(c_out, c_in, k, k)`,其中 `c_out` 为输出的通道数,`c_in` 为输入的通道数,`k` 为卷积核大小。 -- `stride(Union)`:卷积步长。 -- `padding(Union)`:卷积的填充大小。 -- `dilation(Union)`:空洞卷积大小。 -- `groups(int)`:分组卷积大小,当前只支持1。 -- `deformable_groups(int)`:将通道分成几组计算offsets,当前只支持1。 -### 返回值 -- `output(Tensor)`:输出张量,数据类型为`float32`,shape 为 `(n, c_out, h_out, w_out)`,其中`n`为 batch size,`c_out`为输出通道,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 
-### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -1. `deformable_groups`和`groups`当前只支持1。 -2. `h_in`,`w_in`,`h_out`,`w_out`需满足 -$$ -w_{out}=(w_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 \\ -h_{out}=(h_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 -$$ -3. `c_in`需要为64的倍数。 -### 调用示例 -```python -import torch -import torch_npu -from mx_driving.fused import deform_conv2d, DeformConv2dFunction - -n, c_in, h_in, w_in = 16, 64, 100, 200 -c_out, k, h_out, w_out = 64, 3, 50, 100 - -x = torch.randn((n, c_in, h_in, w_in)).npu() -offset = torch.randn((n, 2 * k * k, h_out, w_out)).npu() -weight = torch.randn((c_out, c_in, k, k)).npu() -stride = 1 -padding = 1 -dilation = 1 -groups = 1 -deformable_groups = 1 - -output = deform_conv2d(x, offset, weight, stride, padding, dilation, groups, deformable_groups) -output = DeformConv2dFunction.apply(x, offset, weight, stride, padding, dilation, groups, deformable_groups) -``` -## modulated_deform_conv2d(ModulatedDeformConv2dFunction.apply) -### 接口原型 -```python -mx_driving.fused.modulated_deform_conv2d(Tensor x, Tensor offset, Tensor mask, Tensor weight, Tensor bias, Union[int, Tuple[int, ...]] stride, Union[int, Tuple[int, ...]] padding, Union[int, Tuple[int, ...]] dilation, int groups, int deformable_groups) -> Tensor -``` -### 功能描述 -在可变形卷积的基础之上加上了 modulation 机制,通过调控输出特征的幅度,提升可变形卷积的聚焦相关区域的能力。 -### 参数说明 -- `x(Tensor)`:输入特征,数据类型为`float32`,shape为`(n, c_in, h_in, w_in)`,其中`n`为 batch size,`c_in`为输入特征的通道数量,`h_in`为输入特征图的高,`w_in`为输入特征图的宽。 -- `offset(Tensor)`:偏移量,数据类型为`float32`,shape 为`(n, 2 * k * k, h_out, w_out)`,其中`n`为 batch size,`k` 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 -- `mask(Tensor)`:掩码,用于调控输出特征的幅度,数据类型为`float32`,shape 为`(n, k * k, h_out, w_out)`,其中`n`为 batch size,k 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 -- `weight(Tensor)`:卷积核权重,数据类型为`float32`,shape 为 `(c_out, c_in, k, k)`,其中 `c_out` 为输出的通道数,`c_in` 为输入的通道数,`k` 为卷积核大小。 -- `bias(Tensor)`:偏置,暂不支持bias,传入 `None` 即可。 -- `stride(Union)`:卷积步长。 -- `padding(Union)`:卷积的填充大小。 -- `dilation(Union)`:空洞卷积大小。 -- `groups(int)`:分组卷积大小,当前只支持1。 -- `deformable_groups(int)`:将通道分成几组计算offsets,当前只支持1。 -### 返回值 -- `output(Tensor)`:输出张量,数据类型为`float32`,shape 为 `(n, c_out, h_out, w_out)`,其中`n`为 batch size,`c_out`为输出通道,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -1. `deformable_groups`和`groups`当前只支持1。 -2. `h_in`,`w_in`,`h_out`,`w_out`需满足 -$$ -w_{out}=(w_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 \\ -h_{out}=(h_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 -$$ -3. 
`c_in`需要为64的倍数。 -### 调用示例 -```python -import torch -import torch_npu -from mx_driving.fused import modulated_deform_conv2d, ModulatedDeformConv2dFunction - -n, c_in, h_in, w_in = 16, 64, 100, 200 -c_out, k, h_out, w_out = 64, 3, 50, 100 - -x = torch.randn((n, c_in, h_in, w_in)).npu() -offset = torch.randn((n, 2 * k * k, h_out, w_out)).npu() -mask = torch.randn((n, k * k, h_out, w_out)).npu() -weight = torch.randn((c_out, c_in, k, k)).npu() -bias = None -stride = 1 -padding = 1 -dilation = 1 -groups = 1 -deformable_groups = 1 - -output = modulated_deform_conv2d(x, offset, mask, weight, bias, - stride, padding, dilation, groups, deformable_groups) -output = ModulatedDeformConv2dFunction.apply(x, offset, mask, weight, bias, - stride, padding, dilation, groups, deformable_groups) -``` - -# 点云算子 -## bev_pool -### 接口原型 -```python -mx_driving.point.bev_pool(Tensor feat, Tensor geom_feat, int B, int D, int H, int W) -> Tensor -``` -### 功能描述 -BEV池化。可参考论文`BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation` -### 参数说明 -- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[N, C]`。其中`N`为原特征张量拉伸后的数量,`C`为特征的维度。 -- `geom_feat(Tensor)`:输出坐标张量,数据类型为`int32`。shape为`[N, 4]`。其中`4`分别代表`h, w, b, d`。 -- `B(int)`:batch size。 -- `D(int)`:输出池化深度。 -- `H(int)`:输出池化高度。 -- `W(int)`:输出池化宽度。 -### 返回值 -- `bev_pooled_feat(Tensor)`:采样后的点云数据,数据类型为`float32`。shape为`[B, D, H, W, C]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `geom_feat`的4个对应的值必须在`[0, H-1]`, `[0, W-1]`, `[0, B-1]`, `[0, D-1]`之间。 -- `geom_feat`和`feat`的第0维长度必须相同。 -- C <= 1024 -- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 -- 对于反向也是同样的约束。 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import bev_pool -feat = torch.rand(4, 256).npu() -feat.requires_grad_() -geom_feat = torch.tensor([[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 2], [0, 0, 0, 3]], dtype=torch.int32).npu() -bev_pooled_feat = bev_pool(feat, geom_feat, 4, 1, 256, 256) -loss = bev_pooled_feat.sum() -loss.backward() -``` -## bev_pool_v2 -### 接口原型 -```python -mx_driving.point.bev_pool_v2(Tensor depth, Tensor feat, Tensor ranks_depth, Tensor ranks_feat, Tensor ranks_bev, - List[int] bev_feat_shape, Tensor interval_starts, Tensor interval_lengths) -> Tensor -``` -### 功能描述 -BEV池化优化。可参考论文`BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View`。 -### 参数说明 -- `depth(Tensor)`:深度张量,数据类型为`float32`。shape为`[B, N, D, H, W]`。其中`B`为batch size,`N`为特征的数量,`D, H, W`分别代表深度、高度、宽度。 -- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[B, N, H, W, C]`。其中`B`为batch size,`N`为特征的数量,`H, W, C`分别代表高度、宽度、通道数。 -- `ranks_depth(Tensor)`:深度排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `ranks_feat(Tensor)`:特征排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `ranks_bev(Tensor)`:BEV排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `bev_feat_shape(List[int])`:BEV特征形状,数据类型为`int32`。长度为`5`, 分别代表`B, D, H, W, C`。 -- `interval_starts(Tensor)`:间隔开始张量,数据类型为`int32`。shape为`[N_INTERVALS]`。 -- `interval_lengths(Tensor)`:间隔长度张量,数据类型为`int32`。shape为`[N_INTERVALS]`。 -### 返回值 -- `bev_pooled_feat(Tensor)`:BEV池化后的特征张量,数据类型为`float32`。shape为`[B, D, H, W, C]`。 -### 约束说明 -- `ranks_depth`的值必须在`[0, B*B*D*H*W]`之间。 -- `ranks_feat`的值必须在`[0, B*N*H*W]`之间。 -- `ranks_bev`的值必须在`[0, B*D*H*W]`之间。 -- C <= 1024 -- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 -- N_RANKS <= 2^21 -- 对于反向也是同样的约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import bev_pool_v2 -depth = torch.rand(2, 1, 8, 256, 256).npu() -feat = torch.rand(2, 1, 256, 256, 64).npu() -feat.requires_grad_() -ranks_depth = 
torch.tensor([0, 1], dtype=torch.int32).npu() -ranks_feat = torch.tensor([0, 1], dtype=torch.int32).npu() -ranks_bev = torch.tensor([0, 1], dtype=torch.int32).npu() -bev_feat_shape = [2, 8, 256, 256, 64] -interval_starts = torch.tensor([0], dtype=torch.int32).npu() -interval_lengths = torch.tensor([2], dtype=torch.int32).npu() -bev_pooled_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) -loss = bev_pooled_feat.sum() -loss.backward() -``` -## bev_pool_v3 -### 接口原型 -```python -mx_driving.point.bev_pool_v3(Tensor depth, Tensor feat, Tensor ranks_depth, Tensor ranks_feat, Tensor ranks_bev, - List[int] bev_feat_shape) -> Tensor -``` -### 功能描述 -BEV池化优化。`bev_pool_v2`的NPU亲和版本,优先推荐使用。 -### 参数说明 -- `depth(Tensor)`:深度张量,数据类型为`float32`。shape为`[B, N, D, H, W]`。其中`B`为batch size,`N`为特征的数量,`D, H, W`分别代表深度、高度、宽度。 -- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[B, N, H, W, C]`。其中`B`为batch size,`N`为特征的数量,`H, W, C`分别代表高度、宽度、通道数。 -- `ranks_depth(Tensor)`:深度排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `ranks_feat(Tensor)`:特征排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `ranks_bev(Tensor)`:BEV排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 -- `bev_feat_shape(List[int])`:BEV特征形状,数据类型为`int32`。长度为`5`, 分别代表`B, D, H, W, C`。 -### 返回值 -- `bev_pooled_feat(Tensor)`:BEV池化后的特征张量,数据类型为`float32`。shape为`[B, D, H, W, C]`。 -### 约束说明 -- `ranks_depth`的值必须在`[0, B*B*D*H*W]`之间。 -- `ranks_feat`的值必须在`[0, B*N*H*W]`之间。 -- `ranks_bev`的值必须在`[0, B*D*H*W]`之间。 -- C 必须为8的倍数。 -- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 -- 对于反向也是同样的约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import bev_pool_v3 -depth = torch.rand(2, 1, 8, 256, 256).npu() -feat = torch.rand(2, 1, 256, 256, 64).npu() -feat.requires_grad_() -ranks_depth = torch.tensor([0, 1], dtype=torch.int32).npu() -ranks_feat = torch.tensor([0, 1], dtype=torch.int32).npu() -ranks_bev = torch.tensor([0, 1], dtype=torch.int32).npu() -bev_feat_shape = [2, 8, 256, 256, 64] -bev_pooled_feat = bev_pool_v3(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape) -loss = bev_pooled_feat.sum() -loss.backward() -``` -## furthest_point_sample_with_dist -### 接口原型 -```python -mx_driving.point.furthest_point_sample_with_dist(Tensor points, int num_points) -> Tensor -``` -### 功能描述 -与`npu_furthest_point_sampling`功能相同,但输入略有不同。 -### 参数说明 -- `points(Tensor)`:点云数据,表示各点间的距离,数据类型为`float32`。shape为`[B, N, N]`。其中`B`为batch size,`N`为点的数量。 -- `num_points(int)`:采样点的数量。 -### 返回值 -- `Tensor`:采样后的点云数据,数据类型为`float32`。shape为`[B, num_points]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import furthest_point_sample_with_dist -points = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() -out = furthest_point_sample_with_dist(points, 2) -``` -## npu_furthest_point_sampling -### 接口原型 -```python -mx_driving.point.npu_furthest_point_sampling(Tensor points, int num_points) -> Tensor -``` -### 功能描述 -点云数据的最远点采样。 -### 参数说明 -- `points(Tensor)`:点云数据,数据类型为`float32`。shape为`[B, N, 3]`。其中`B`为batch size,`N`为点的数量,`3`分别代表`x, y, z`。 -- `num_points(int)`:采样点的数量。 -### 返回值 -- `Tensor`:采样后的点云数据,数据类型为`float32`。shape为`[B, num_points]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import npu_furthest_point_sampling -points = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() -out = npu_furthest_point_sampling(points, 2) -``` - -### 算子约束 -1. 
points输入shape[B, N, 3]的总大小(B x N x 3)不应该超过383166 -## npu_group_points -Note:该接口命名将于2025年改为'group_points'。 -### 接口原型 -```python -mx_driving.point.npu_group_points(Tensor features, Tensor indices) -> Tensor -``` -### 功能描述 -点云数据按照索引重新分组。 -### 参数说明 -- `features`:需要被插值的特征,数据类型为`float32`,维度为(B, C, N)。 -- `indices`:获取目标特征计算的索引,数据类型为`int32`,维度为(B, npoints, nsample)。 -### 返回值 -- `output(Tensor)`:分组后的点云数据,数据类型为`float32`。shape为`[B, C, npoints, nsample]`。 -### 约束说明 -- `indices`的元素值需小于`features`的第三维度,即值在[0, N)。 -- C <= 1024 -- 反向具有相同约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import torch_npu -from mx_driving.point import npu_group_points - - -indices = torch.tensor([[[0, 2, 5, 5], [1, 0, 5, 0], [2, 1, 4, 4]]]).int().npu() -features = torch.tensor([[[0.9178, -0.7250, -1.6587, 0.0715, -0.2252, 0.4994], - [0.6190, 0.1755, -1.7902, -0.5852, -0.3311, 1.9764], - [1.7567, 0.0740, -1.1414, 0.4705, -0.3197, 1.1944], - [-0.2343, 0.1194, 0.4306, 1.3780, -1.4282, -0.6377], - [0.7239, 0.2321, -0.6578, -1.1395, -2.3874, 1.1281]]], - dtype=torch.float32).npu() -output = npu_group_points(features, indices) -``` - -## npu_add_relu -### 接口原型 -```python -mx_driving.fused.npu_add_relu(Tensor x, Tensor y) -> Tensor -``` -### 功能描述 -与`relu(x + y)`功能相同。 -### 参数说明 -- `x(Tensor)`:输入数据,数据类型为`float32`,shape无限制。 -- `y(Tensor)`:输入数据,数据类型为`float32`,shape需要和x一致。 -### 返回值 -- `Tensor`:输出数据,数据类型为`float32`,shape和x一致。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.fused import npu_add_relu -x = torch.tensor([[[1, 2, 3], [-1, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() -y = torch.tensor([[[1, 2, 3], [-1, -2, 6], [7, 8, 9]]], dtype=torch.float32).npu() -out = npu_add_relu(x, y) -``` -### 算子约束 -- 输入`x`与输入`y`的shape和dtype需要保持一致,不支持广播。 -- 仅在x的元素个数超过2000000时,相较于`relu(x + y)`有性能提升。 - -## voxelization -### 接口原型 -```python -mx_driving.point.voxelization(Tensor points, List[float] voxel_size, List[float] coors_range, int max_points=-1, int max_voxels=-1, bool deterministic=True) -> Tensor -``` -### 功能描述 -将点云数据进行体素化。 -### 参数说明 -- `points(Tensor)`:点云数据,数据类型为`float32`。shape为`[N, F]`。其中`N`为点的数量,`F`分别代表每个点的特征维度,其中`N > 0, F >= 3`。 -- `voxel_size(List[float])`:体素大小,数据类型为`float32`。shape为`[3]`。其中`3`分别代表`x, y, z`。 -- `coors_range(List[float])`:体素范围,数据类型为`float32`。shape为`[6]`。其中`6`分别代表`x_min, y_min, z_min, x_max, y_max, z_max`。 -- `max_points(int)`:每个体素的最大点数。默认值为`-1`。 -- `max_voxels(int)`:最大体素数。默认值为`-1`。 -- `deterministic(bool)`:是否确定性。默认值为`True`。 -### 返回值 -- `coors(Tensor)`:每个点所属的体素坐标,数据类型为`int32`。shape为`[N, 3]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import Voxelization -points = torch.randint(-20, 100, [16, 3], dtype=torch.float32).npu() -coors_range = [0, -40, -3, 70.4, 40, 1] -max_points = -1 -voxel_size = [0.5, 0.5, 0.5] -dynamic_voxelization = Voxelization(voxel_size, coors_range, max_points) -coors = dynamic_voxelization.forward(points) -``` -## npu_dynamic_scatter -### 接口原型 -```python -mx_driving.point.npu_dynamic_scatter(Tensor feats, Tensor coors, str reduce_type = 'max') -> Tuple[torch.Tensor, torch.Tensor] -``` -### 功能描述 -将点云特征点在对应体素中进行特征压缩。 -### 参数说明 -- `feats(Tensor)`:点云特征张量[N, C],仅支持两维,数据类型为`float32`,特征向量`C`长度上限为2048。 -- `coors(Tensor)`:体素坐标映射张量[N, 3],仅支持两维,数据类型为`int32`,此处以x, y, z指代体素三维坐标,其取值范围为`0 <= x, y < 2048`, `0 <= z < 256`。 -- `reduce_type(str)`:压缩类型。可选值为`'max'`, `'mean'`, `'sum'`。默认值为`'max'` -### 返回值 -- `voxel_feats(Tensor)`:压缩后的体素特征张量,仅支持两维,数据类型为`float32`。 -- 
`voxel_coors(Tensor)`:去重后的体素坐标,仅支持两维,数据类型为`int32`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from mx_driving.point import npu_dynamic_scatter - -feats = torch.tensor([[1, 2, 3], [3, 2, 1], [7, 8, 9], [9, 8, 7]], dtype=torch.float32).npu() -coors = torch.tensor([[1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]], dtype=torch.int32).npu() -voxel_feats, voxel_coors = npu_dynamic_scatter(feats, coors, 'max') - -``` -## unique_voxel -### 接口原型 -```python -mx_driving._C.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor -``` -### 功能描述 -对输入的点云数据进行去重处理。 -### 参数说明 -- `voxels (Tensor)`:数据语义为索引,数据类型为`int32`,shape为`[N]`。 -### 返回值 -- `num_voxels(int)`, 体素数量。 -- `uni_voxels(Tensor)`,去重后的体素数据,数据类型为`int32`,shape为`[num_voxels]`。 -- `uni_indices(Tensor)`, 去重后的索引数据,数据类型为`int32`,shape为`[num_voxels]`。 -- `argsort_indices(Tensor)`, 排序后的索引数据,数据类型为`int32`,shape为`[N]`。 -- `uni_argsort_indices(Tensor)`, 去重后的排序后的索引数据,数据类型为`int32`,shape为`[num_voxels]`。 -### 约束说明 -N的大小受限于内存大小,建议N小于等于2^32。 - -受限于芯片指令,输入的数据类型只能是int32,且>=0,<2^30。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import torch_npu -import numpy as np -from mx_driving._C import unique_voxel -voxels = np.random.randint(0, 1024, (100000,)).astype(np.int32) -voxels_npu = torch.from_numpy(voxels).npu() -num_voxels, uni_voxels, uni_indices, argsort_indices, uni_argsort_indices = unique_voxel(voxels_npu) - -``` - - -## voxel_pooling_train -### 接口原型 -```python -mx_driving.point.npu_voxel_pooling_train(Tensor geom_xyz, Tensor input_features, List[int] voxel_num) -> Tensor -``` -### 功能描述 -点云数据体素化。 -### 参数说明 -- `geom_xyz`:体素坐标,数据类型为`int32`,维度为(B, N, 3), 3表示x, y, z。 -- `input_features`:点云数据,数据类型为`float32|float16`,维度为(B, N, C)。 -- `voxel_num`:体素格子长宽高,数据类型为`int32`,维度为(3),3表示体素格子的长宽高。 -### 返回值 -- `output(Tensor)`:输出结果,数据类型为`float32|float16`。shape为`[B, num_voxel_y, num_voxel_x, C]`。 -### 约束说明 -- B <= 128 -- N <= 100000 -- C <= 256 -- num_voxel_x <= 1000 -- num_voxel_y <= 1000 -- num_voxel_z <= 10 -- B * num_voxel_y * num_voxel_x * C <= 100000000 -- B * N * C <= 100000000 -- 反向具有相同约束。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch -import torch_npu -import mx_driving.point - -def gen_data(geom_shape, feature_shape, coeff, batch_size, num_channels, dtype): - geom_xyz = torch.rand(geom_shape) * coeff - geom_xyz = geom_xyz.reshape(batch_size, -1, 3) - geom_xyz[:, :, 2] /= 100 - geom_xyz_cpu = geom_xyz.int() - features = torch.rand(feature_shape, dtype=dtype) - 0.5 - features_cpu = features.reshape(batch_size, -1, num_channels) - - return geom_xyz_cpu, features_cpu - -dtype = torch.float32 -coeff = 90 -voxel_num = [128, 128, 1] -batch_size = 2 -num_points = 40 -num_channel = 80 -xyz = 3 - -geom_shape = [batch_size, num_points, xyz] -feature_shape = [batch_size, num_points, num_channel] - -geom_cpu, feature_cpu = gen_data(geom_shape, feature_shape, coeff, batch_size, num_channel, dtype) - -geom_npu = geom_cpu.npu() -feature_npu = feature_cpu.npu() - -result_npu = mx_driving.point.npu_voxel_pooling_train(geom_npu, feature_npu, voxel_num) -``` -# 稀疏卷积算子(beta) -## SparseConv3d(beta) -### 接口原型 -```python -mx_driving.spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor -``` -### 功能描述 -稀疏卷积 -### 参数说明 -- `in_channels(int)`:输入数据的通道数 -- `out_channels(int)`:输出通道数 -- `kernel_size(List(int)/Tuple(int)/int)`:卷积神经网络中卷积核的大小 -- `stride(List(int)/Tuple(int)/int)`:卷积核在输入数据上滑动时的步长 -- 
`dilation(List(int)/Tuple(int)/int)`:空洞卷积大小 -- `groups(int)`:分组卷积 -- `bias(bool)`:偏置项 -- `indice_key(String)`:该输入用于复用之前计算的索引信息 -- `mode(String)`:区分了`mmcv`和`spconv`两种不同框架下的稀疏卷积 -### 返回值 -- `SparseConvTensor`:存储了输出的特征值`out_feature`,对应索引位置`out_indices`和对应的spatital_shape。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `kernel_size`当前支持数据类型为三维List/Tuple或Int,值域为`[1, 3]` -- `stride`当前支持数据类型为三维List/Tuple或Int -- `dilation`,`groups`当前仅支持值为1 -- 对于反向也是同样的约束。 -### 调用示例 -```python -import torch,torch_npu -import numpy as np -from mx_driving.spconv import SparseConv3d, SparseConvTensor - -def generate_indice(batch, height, width, depth, actual_num): - base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num] - base_indices = np.sort(base_indices) - b_indice = base_indices // (height * width * depth) - base_indices = base_indices % (height * width * depth) - h_indice = base_indices // (width * depth) - base_indices = base_indices // (width * depth) - w_indice = base_indices // depth - d_indice = base_indices % depth - indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num) - return indices - -actual_num = 20 -batch = 4 -spatial_shape = [9, 9, 9] -indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu() -feature = tensor_uniform = torch.rand(actual_num, 16).npu() -feature.requires_grad = True -x = SparseConvTensor(feature, indices, spatial_shape, batch) -net = SparseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu() -out = net(x) -dout = torch.ones_like(out.features).float().npu() -out.features.backward(dout) -``` - - -## SparseInverseConv3d(beta) -### 接口原型 -```python -mx_driving.spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, output_padding=0,bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor -``` -### 功能描述 -稀疏逆卷积 -### 参数说明 -- `in_channels(int)`:输入数据的通道数 -- `out_channels(int)`:输出通道数 -- `kernel_size(List(int)/Tuple(int)/int)`:卷积神经网络中卷积核的大小 -- `stride(List(int)/Tuple(int)/int)`:卷积核在输入数据上滑动时的步长 -- `dilation(List(int)/Tuple(int)/int)`:空洞卷积大小 -- `groups(int)`:分组卷积 -- `bias(bool)`:偏置项 -- `indice_key(String)`:该输入用于复用之前计算的索引信息 -- `mode(String)`:区分了`mmcv`和`spconv`两种不同框架下的稀疏卷积 -### 返回值 -- `SparseConvTensor`:存储了输出的特征值`out_feature`,对应索引位置`out_indices`和对应的spatital_shape。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `kernel_size`当前支持数据类型为三维List/Tuple或Int,值域为`[1, 3]` -- `stride`当前支持数据类型为三维List/Tuple或Int -- `dilation`,`groups`当前仅支持值为1 -- 对于反向也是同样的约束。 -### 调用示例 -```python -import torch,torch_npu -import numpy as np -from mx_driving.spconv import SparseInverseConv3d, SparseConvTensor - -def generate_indice(batch, height, width, depth, actual_num): - base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num] - base_indices = np.sort(base_indices) - b_indice = base_indices // (height * width * depth) - base_indices = base_indices % (height * width * depth) - h_indice = base_indices // (width * depth) - base_indices = base_indices // (width * depth) - w_indice = base_indices // depth - d_indice = base_indices % depth - indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num) - return indices - -actual_num = 20 -batch = 4 -spatial_shape = [9, 9, 9] -indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu() -feature = 
tensor_uniform = torch.rand(actual_num, 16).npu() -feature.requires_grad = True -x = SparseConvTensor(feature, indices, spatial_shape, batch) -net = SparseInverseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu() -out = net(x) -dout = torch.ones_like(out.features).float().npu() -out.features.backward(dout) -``` - - -## SubMConv3d(beta) -### 接口原型 -```python -mx_driving.spconv.SubMConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor -``` -### 功能描述 -稀疏卷积,只有当卷积核中心参与计算时,才会影响输出 -### 参数说明 -- `in_channels(int)`:输入数据的通道数 -- `out_channels(int)`:输出通道数 -- `kernel_size(List(int)/Tuple(int)/int)`:卷积神经网络中卷积核的大小 -- `stride(List(int)/Tuple(int)/int)`:卷积核在输入数据上滑动时的步长 -- `dilation(List(int)/Tuple(int)/int)`:空洞卷积大小 -- `groups(int)`:分组卷积 -- `bias(bool)`:偏置项 -- `indice_key(String)`:该输入用于复用之前计算的索引信息 -- `mode(String)`:区分了`mmcv`和`spconv`两种不同框架下的稀疏卷积 -### 返回值 -- `SparseConvTensor`:存储了输出的特征值`out_feature`,对应索引位置`out_indices`和对应的spatital_shape。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `kernel_size`当前支持数据类型为三维List/Tuple或Int,当前值仅支持1、3 -- `stride`当前支持数据类型为三维List/Tuple或Int,当前仅支持值为1 -- `dilation`,`groups`当前仅支持值为1 -- 对于反向也是同样的约束。 -### 调用示例 -```python -import torch,torch_npu -import numpy as np -from mx_driving.spconv import SubMConv3d, SparseConvTensor - -def generate_indice(batch, height, width, depth, actual_num): - base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num] - base_indices = np.sort(base_indices) - b_indice = base_indices // (height * width * depth) - base_indices = base_indices % (height * width * depth) - h_indice = base_indices // (width * depth) - base_indices = base_indices // (width * depth) - w_indice = base_indices // depth - d_indice = base_indices % depth - indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num) - return indices - -actual_num = 20 -batch = 4 -spatial_shape = [9, 9, 9] -indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu() -feature = tensor_uniform = torch.rand(actual_num, 16).npu() -feature.requires_grad = True -x = SparseConvTensor(feature, indices, spatial_shape, batch) -net = SubMConv3d(in_channels=16, out_channels=32, kernel_size=3).npu() -out = net(x) -dout = torch.ones_like(out.features).float().npu() -out.features.backward(dout) -``` \ No newline at end of file +# mxDriving API 汇总 + + +- 通用 +- 感知 +- 点云 +- 预处理 +- 稀疏 +- 融合 + - [multi_scale_deformable_attn](./context/multi_scale_deformable_attn.md) \ No newline at end of file diff --git a/docs/api/context/multi_scale_deformable_attn.md b/docs/api/context/multi_scale_deformable_attn.md new file mode 100644 index 0000000000000000000000000000000000000000..c1fc37a220f12a40d841175f1f6c9956d78121e5 --- /dev/null +++ b/docs/api/context/multi_scale_deformable_attn.md @@ -0,0 +1,36 @@ +## multi_scale_deformable_attn(MultiScaleDeformableAttnFunction.Apply) +### 接口原型 +```python +mx_driving.fused.multi_scale_deformable_attn(Tensor value, Tensor value_spatial_shapes, Tensor value_level_start_index, Tensor sampling_locations, Tensor attention_weights) -> Tensor +``` +### 功能描述 +多尺度可变形注意力机制, 将多个视角的特征图进行融合。 +### 参数说明 +- `value(Tensor)`:特征张量,数据类型为`float32, float16`。shape为`[bs, num_keys, num_heads, embed_dims]`。其中`bs`为batch size,`num_keys`为特征图的大小,`num_heads`为头的数量,`embed_dims`为特征图的维度,其中`embed_dims`需要为8的倍数。 +- `value_spatial_shapes(Tensor)`:特征图的形状,数据类型为`int32, 
int64`。shape为`[num_levels, 2]`。其中`num_levels`为特征图的数量,`2`分别代表`H, W`。 +- `value_level_start_index(Tensor)`:偏移量张量,数据类型为`int32, int64`。shape为`[num_levels]`。 +- `sampling_locations(Tensor)`:位置张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads, num_levels, num_points, 2]`。其中`bs`为batch size,`num_queries`为查询的数量,`num_heads`为头的数量,`num_levels`为特征图的数量,`num_points`为采样点的数量,`2`分别代表`y, x`。 +- `attention_weights(Tensor)`:权重张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads, num_levels, num_points]`。其中`bs`为batch size,`num_queries`为查询的数量,`num_heads`为头的数量,`num_levels`为特征图的数量,`num_points`为采样点的数量。 +### 返回值 +- `output(Tensor)`:融合后的特征张量,数据类型为`float32, float16`。shape为`[bs, num_queries, num_heads*embed_dims]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +- `locations`的值在`[0, 1]`之间。 +- 当前版本只支持`num_keys` ≤ 8,`num_heads` ≤ 8,`embed_dims` == 16或32,`num_points` = 1或偶数。 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.fused import multi_scale_deformable_attn +bs, num_levels, num_heads, num_points, num_queries, embed_dims = 1, 1, 4, 8, 16, 32 + +shapes = torch.as_tensor([(100, 100)], dtype=torch.long) +num_keys = sum((H * W).item() for H, W in shapes) + +value = torch.rand(bs, num_keys, num_heads, embed_dims) * 0.01 +sampling_locations = torch.ones(bs, num_queries, num_heads, num_levels, num_points, 2) * 0.005 +attention_weights = torch.rand(bs, num_queries, num_heads, num_levels, num_points) + 1e-5 +level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) + +out = multi_scale_deformable_attn(value.npu(), shapes.npu(), level_start_index.npu(), sampling_locations.npu(), attention_weights.npu()) +``` \ No newline at end of file diff --git a/docs/api/mxDriving.md b/docs/api/mxDriving.md new file mode 100644 index 0000000000000000000000000000000000000000..a0042ce32e4f0881d4edc67d40e6fe5713b6755a --- /dev/null +++ b/docs/api/mxDriving.md @@ -0,0 +1,1420 @@ +> Note: 以prototype标注的接口,表示该接口为预发布接口,可能会有变动,不建议在生产环境中使用。 +# 通用算子 +## scatter_max +### 接口原型 +```python +mx_driving.common.scatter_max(Tensor updates, Tensor indices, Tensor out=None) -> (Tensor out, Tensor argmax) +``` +### 功能描述 +在第0维上,将输入张量`updates`中的元素按照`indices`中的索引进行分散,然后在第0维上取最大值,返回最大值和对应的索引。对于1维张量,公式如下: +$$out_i = max(out_i, max_j(updates_j))$$ +$$argmax_i = argmax_j(updates_j)$$ +这里,$i = indices_j$。 +### 参数说明 +- `updates`:更新源张量,数据类型为`float32`,且 + - `updates`的第0维外其余轴合轴后必须32字节对齐。 +- `indices`:索引张量,数据类型为`int32`,且 + - `indices`的维度必须为`1`, + - `indices`第0维的长度必须与`updates`第0维的长度相同。 + - `indices`的最大值必须小于`491520`。 + - `indices`的取值必须为非负的有效索引值。 +- `out`:被更新张量,数据类型为`float32`,默认为`None`,且 + - `out`的维度必须与`updates`的维度相同。 + - `out`除第0维外其余维的长度必须与`updates`相同。 +### 返回值 +- `out`:更新后的张量,数据类型为`float32`。 +- `argmax`:最大值对应的索引张量,数据类型为`int32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.common import scatter_max +updates = torch.tensor([[2, 0, 1, 3, 1, 0, 0, 4], [0, 2, 1, 3, 0, 3, 4, 2], [1, 2, 3, 4, 4, 3, 2, 1]], dtype=torch.float32).npu() +indices = torch.tensor([0, 2, 0], dtype=torch.int32).npu() +out = updates.new_zeros((3, 8)) +out, argmax = scatter_max(updates, indices, out) +``` +## knn +### 接口原型 +```python +mx_driving.common.knn(int k, Tensor xyz, Tensor center_xyz, bool Transposed) -> Tensor +``` +### 功能描述 +对center_xyz中的每个点找到xyz中对应batch中的距离最近的k个点,并且返回此k个点的索引值。 +### 参数说明 +- `xyz(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32`。shape为`[B, N, 3]`(当Transposed=False)或`[B, 3, N]`(当Transposed=True)。其中`B`为batch size,`N`为点的数量。 +- `center_xyz(Tensor)`:点数据,表示(x, y, 
z)三维坐标,数据类型为`float32`。shape为`[B, npoint, 3]`(当Transposed=False)或`[B, 3, npoint]`(当Transposed=True)。其中`B`为batch size,`npoint`为点的数量。 +- `k(int)`:采样点的数量。 +- `Transposed(bool)`: 输入是否需要进行转置 +### 返回值 +- `idx(Tensor)`:采样后的索引数据,数据类型为`int32`。shape为`[B, k, npoint]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.common import knn +xyz = torch.tensor([[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]], dtype=torch.float32).npu() +center_xyz = torch.tensor([[[1, 2, 3]], [[1, 2, 3]]], dtype=torch.float32).npu() +idx = knn(2, xyz, center_xyz, False) +``` +### 算子约束 +1. k必须>0且<100。 +2. xyz中的每个batch中的任意一个点到center_xyz对应batch中的任意一个点的距离必须在1e10f以内。 +3. xyz和center_xyz的shape必须是3维,当Transposed=True时,xyz和center_xyz的shape的dim的第1维必须是3;当Transposed=False时,xyz和center_xyz的shape的dim的第2维必须是3。 +4. 由于距离相同时排序为不稳定排序,存在距离精度通过但索引精度错误问题,与竞品无法完全对齐。 + +## scatter_mean +### 接口原型 +```python +mx_driving.common.scatter_mean(Tensor src, Tensor indices, int dim=0, Tensor out=None, int dim_size=None) -> Tensor +``` +### 功能描述 +将输入张量`src`中的元素按照`indices`中的索引在指定的`dim`维进行分组,并计算每组的平均值,返回平均值。 +### 参数说明 +- `src`:源张量,数据类型为`float32`。 +- `indices`:索引张量,数据类型为`int32`。 +- `out`:被更新张量,数据类型为`float32`,可选入参,默认为`None`,输入`out`不为`None`时,`out`中的元素参与平均值的计算。 +- `dim`:指定的维度,表示按照哪个维度进行分组平均计算,数据类型为`int32`,可选入参,默认取值为`0`。 +- `dim_size`:输出张量在`dim`维的长度,数据类型为`int32`,可选入参,默认为`None`,该参数仅在输入`out`为`None`时生效。 +### 返回值 +- `out`:求平均后的张量,数据类型为`float32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 + +```python +import torch, torch_npu +from mx_driving.common import scatter_mean +src = torch.randn(4, 5, 6).to(torch.float) +indices = torch.randint(5, (4, 5)).to(torch.int32) +dim = 0 +src.requires_grad = True +out = scatter_mean(src.npu(), indices.npu(), None, dim) +grad_out_tensor = torch.ones_like(out) +out.backward(grad_out_tensor) +``` +### 算子约束 +- `indices`的维度必须小于等于`src`的维度,且每一维的长度均必须与`src`长度相同。 +- `indices`的取值必须为非负的有效索引值,参数`out`或`data_size`不为`None`时,`indices`的取值应该为输出张量在`dim`维的有效索引值。 +- `out`的维度必须与`src`的维度相同,且除第`dim`维外其余维的长度必须与`src`相同。 +- `dim`取值不能超过`indices`的维度。 +- `dim_size`的取值必须为非负的有效长度值。 +- `src`和`out`不支持`inf`、`-inf`和`nan`。 +### 其他说明 +- 该算子对尾块较大的场景较为亲和,对尾块很小的场景不亲和,其中,尾块表示`src`后`N`维的大小,`N = src.dim() - indices.dim()`。 + +## three_interpolate +### 接口原型 +```python +mx_driving.common.three_interpolate(features: torch.Tensor, indices: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: +``` +### 功能描述 +对三维数据进行加权最近邻线性插值处理 +### 参数说明 +- `features`:需要被插值的特征,数据类型为`float32|float16`,维度为(B, C, M)。 +- `indices`:获取目标特征计算的索引,数据类型为`int32`,维度为(B, N, 3), + - `indices`的元素值需小于`features`的第三维度,即值在[0, M)。 +- `weight`:获取目标特征计算的权重,数据类型为`float32|float16`,维度为(B, N, 3)。 + - `weight`数据类型与`features`须一致。 +- `features`,`indices`,`weights`三个参数的每个维度须小于10000。 +- `features`,`indices`,`weights`三个参数的大小请勿超过2^24。 +### 返回值 +- `output`:目标特征张量,数据类型为`float32|float16`,维度为(B, C, N)。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +from mx_driving.common import three_interpolate + + +features = torch.tensor( + [[[2.4350, 4.7516, 4.4995, 2.4350, 2.4350, 2.4350], + [3.1236, 2.6278, 3.0447, 3.1236, 3.1236, 3.1236], + [2.6732, 2.8677, 2.6436, 2.6732, 2.6732, 2.6732], + [0.0124, 7.0150, 7.0199, 0.0124, 0.0124, 0.0124], + [0.3207, 0.0000, 0.3411, 0.3207, 0.3207, 0.3207]], + [[0.0000, 0.9544, 2.4532, 0.0000, 0.0000, 0.0000], + [0.5346, 1.9176, 1.4715, 0.5346, 0.5346, 0.5346], + [0.0000, 0.2744, 2.0842, 0.0000, 0.0000, 0.0000], + [0.3414, 1.5063, 1.6209, 0.3414, 0.3414, 0.3414], + [0.5814, 0.0103, 0.0000, 0.5814, 0.5814, 0.5814]]], + ).npu() +idx = 
torch.tensor( + [[[0, 1, 2], [2, 3, 4], [2, 3, 4], [0, 1, 2], [0, 1, 2], [0, 1, 3]], + [[0, 2, 3], [1, 3, 4], [2, 1, 4], [0, 2, 4], [0, 2, 4], [0, 1, 2]]], + ).int().npu() +weight = torch.tensor( + [[[3.3333e-01, 3.3333e-01, 3.3333e-01], + [1.0000e+00, 5.8155e-08, 2.2373e-08], + [1.0000e+00, 1.7737e-08, 1.7356e-08], + [3.3333e-01, 3.3333e-01, 3.3333e-01], + [3.3333e-01, 3.3333e-01, 3.3333e-01], + [3.3333e-01, 3.3333e-01, 3.3333e-01]], + [[3.3333e-01, 3.3333e-01, 3.3333e-01], + [1.0000e+00, 1.3651e-08, 7.7312e-09], + [1.0000e+00, 1.7148e-08, 1.4070e-08], + [3.3333e-01, 3.3333e-01, 3.3333e-01], + [3.3333e-01, 3.3333e-01, 3.3333e-01], + [3.3333e-01, 3.3333e-01, 3.3333e-01]]], + ).npu() +output = three_interpolate(features, idx, weight) + + +## three_nn +### 接口原型 +```python +mx_driving.common.three_nn(Tensor target, Tensor source) -> (Tensor dist, Tensor idx) +``` +### 功能描述 +对target中的每个点找到source中对应batch中的距离最近的3个点,并且返回此3个点的距离和索引值。 +### 参数说明 +- `target(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32/float16`。shape为`[B, npoint, 3]`。其中`B`为batch size,`npoint`为点的数量。 +- `source(Tensor)`:点数据,表示(x, y, z)三维坐标,数据类型为`float32/float16`。shape为`[B, N, 3]`。其中`B`为batch size,`N`为点的数量。 +### 返回值 +- `dist(Tensor)`:最近3个点的距离数据,数据类型为`float32/float16`。shape为`[B, npoint, 3]`。 +- `idx(Tensor)`:最近3个点的索引数据,数据类型为`int32`。shape为`[B, npoint, 3]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.common import three_nn +source = torch.tensor([[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]], dtype=torch.float32).npu() +target = torch.tensor([[[1, 2, 3]], [[1, 2, 3]]], dtype=torch.float32).npu() +dist, idx = three_nn(target, source) +``` +### 算子约束 +1. source和target的shape必须是3维,且source和target的shape的dim的第2维必须是3。 +2. 距离相同时排序为不稳定排序,存在距离精度通过但索引精度错误问题,与竞品无法完全对齐。 + + +## hypot +### 接口原型 +```python +mx_driving.common.hypot(Tensor input, Tensor other) -> Tensor +``` +### 功能描述 +给出直角三角形的两边,返回它的斜边。 +### 参数说明 +- `input(Tensor)`:代表直角三角形第一条直角边的输入张量,数据类型为`float32`。 +- `other(Tensor)`:代表直角三角形第二条直角边的输入张量,数据类型为`float32`。 +### 返回值 +- `Tensor`:经过计算后的直角三角形斜边,数据类型为`float32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.common import hypot +input = torch.tensor([3,3,3], dtype=torch.float32).npu() +other = torch.tensor([4,4,4], dtype=torch.float32).npu() +out = hypot(input, other) # tensor([5.,5.,5.]) +``` +### 算子约束 +1. input和other的shape必须是可广播的,广播用法示意见下。
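+下面补充一个广播用法的最小示意(假设其广播语义与`torch.hypot`一致,形状`[2, 1]`与`[3]`的输入广播为`[2, 3]`的输出): +```python +import torch, torch_npu +from mx_driving.common import hypot +# a的shape为[2, 1],b的shape为[3],广播后逐元素计算斜边,输出shape为[2, 3] +a = torch.tensor([[3.0], [6.0]], dtype=torch.float32).npu() +b = torch.tensor([4.0, 8.0, 12.0], dtype=torch.float32).npu() +out = hypot(a, b) +```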
+ + +## assign_score_withk +### 接口原型 +```python +mx_driving.common.assign_score_withk(Tensor scores, Tensor point_features, Tensor center_features, Tensor knn_idx, str aggregate='sum') -> Tensor +``` +### 功能描述 +根据`knn_idx`得到采样点及其邻居点的索引,计算`point_features`和`center_features`的差,并与`scores`相乘后在特征维度进行聚合,返回采样点的特征。 +### 参数说明 +- `scores(Tensor)`:权重矩阵的重要系数,数据类型为`float32`。Shape为`[B, npoint, K, M]`,其中`B`为batch size,`npoint`为采样点的数量,`K`为一个样本点及其邻居点的数量之和,`M`为权重矩阵集合的规模。 +- `point_features(Tensor)`:所有点的特征,数据类型为`float32`。Shape为`[B, N, M, O]`,其中`N`为所有点的数量,`O`为特征数量。 +- `center_features(Tensor)`:所有点的中心特征,数据类型为`float32`。Shape为`[B, N, M, O]`。 +- `knn_idx(Tensor)`:采样点及其邻居点的索引,数据类型为`int64`。Shape为`[B, npoint, K]`。 +- `aggregate`:聚合方式,默认为`sum`,数据类型为`str`。 +### 返回值 +- `output`:聚合后采样点的特征,数据类型为`float32`。Shape为`[B, O, npoint, K]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 + +```python +import torch, torch_npu +import numpy as np +from mx_driving.common import assign_score_withk +N = 100  # 所有点的数量,knn_idx的索引取值范围为[0, N) +points = np.random.rand(4, 100, 8, 16).astype(np.float32) +centers = np.random.rand(4, 100, 8, 16).astype(np.float32) +scores = np.random.rand(4, 64, 10, 8).astype(np.float32) +knn_idx = np.random.randint(0, N, size=(4, 64, 10)).astype(np.int64) +output = assign_score_withk(torch.from_numpy(scores).npu(), + torch.from_numpy(points).npu(), + torch.from_numpy(centers).npu(), + torch.from_numpy(knn_idx).npu(), + "sum") +``` +### 算子约束 +- `npoint`和`K`都不大于`N`。 + + +# 数据预处理算子 +## npu_points_in_box +### 接口原型 +```python +mx_driving.preprocess.npu_points_in_box(Tensor boxes, Tensor points) -> Tensor +``` +### 功能描述 +判断点是否在框内。 +### 参数说明 +- `boxes(Tensor)`:框张量,数据类型为`float32`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +- `points(Tensor)`:点张量,数据类型为`float32`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 +### 返回值 +- `boxes_idx_of_points(Tensor)`:点在框内的索引张量,数据类型为`int32`。shape 为`[B, N]`。 +### 约束说明 +- `boxes`和`points`的`B`必须相同,且只能为`1`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.preprocess import npu_points_in_box +boxes = torch.tensor([[[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]]], dtype=torch.float32).npu() +points = torch.tensor([[[1, 2, 3], [3, 4, 5]]], dtype=torch.float32).npu() +out = npu_points_in_box(boxes, points) +``` + +## npu_points_in_box_all +Note: 该接口命名将于2025年改为`points_in_boxes_all`。 +### 接口原型 +```python +mx_driving.preprocess.npu_points_in_box_all(Tensor boxes, Tensor points) -> Tensor +``` +### 功能描述 +判断点是否在框内。 +### 参数说明 +- `boxes(Tensor)`:框张量,数据类型为`float32`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +- `points(Tensor)`:点张量,数据类型为`float32`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 +### 返回值 +- `boxes_idx_of_points(Tensor)`:同一`batch`下,各点是否在各框内的张量,数据类型为`int32`。shape 为`[B, N, M]`。 +### 约束说明 +- `boxes`和`points`的`B`必须相同。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.preprocess import npu_points_in_box_all +boxes = torch.tensor([[[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]]], dtype=torch.float32).npu() +points = torch.tensor([[[1, 2, 5], [3, 4, 8]]], dtype=torch.float32).npu() +out = npu_points_in_box_all(boxes, points) +``` + +## RoipointPool3d +### 接口原型 +```python +mx_driving.preprocess.RoipointPool3d(int num_sampled_points, Tensor points, Tensor point_features, Tensor boxes3d) -> (Tensor pooled_features, Tensor pooled_empty_flag) +``` +### 功能描述 +对每个3D候选框(proposal)的几何特定特征进行编码。 +### 参数说明 +- `num_sampled_points(int)`:特征点的数量,正整数。 +- `points(Tensor)`:点张量,数据类型为`float32, float16`。shape 为`[B, N, 3]`。`3`分别代表`x, y, z`。 +- 
`point_features(Tensor)`:点特征张量,数据类型为`float32, float16`。shape 为`[B, N, C]`。`C`分别代表`x, y, z`。 +- `boxes3d(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[B, M, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +### 返回值 +- `pooled_features(Tensor)`:点在框内的特征张量,数据类型为`float32, float16`。shape 为`[B, M, num, 3+C]`。 +- `pooled_empty_flag(Tensor)`:所有点不在框内的空标记张量,数据类型为`int32`。shape 为`[B, M]`。 +### 约束说明 +- `points`、`point_features`和`boxes3d`的数据类型必须相同,以及`B`也必须相同。 +- `num_sampled_points`必须小于等于`N`。 +- 数据类型为`float32`时,建议`B`小于100、`N`小于等于2640、`M`小于等于48、`num_sampled_points`小于等于48,个别shape值略微超过建议值无影响,但所有shape值均大于建议值时,算子执行会发生错误。 +- 数据类型为`float16`时,建议`B`小于100、`N`小于等于3360、`M`小于等于60、`num_sampled_points`小于等于60,个别shape值略微超过建议值无影响,但所有shape值均大于建议值时,算子执行会发生错误。 +- `N`/`M`的值越大,性能劣化越严重,建议`N`小于`M`的六百倍,否则性能可能会低于0.1x A100。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.preprocess import RoIPointPool3d +num_sampled_points = 1 +points = torch.tensor([[[1, 2, 3]]], dtype=torch.float).npu() +point_features = points.clone() +boxes3d = torch.tensor([[[1, 2, 3, 4, 5, 6, 1]]], dtype=torch.float).npu() +roipoint_pool3d = RoIPointPool3d(num_sampled_points) +pooled_features, pooled_empty_flag = roipoint_pool3d(points, point_features, boxes3d) +``` + + +# 目标检测算子 +## npu_boxes_overlap_bev +Note: 该接口命名将于2025年改为`boxes_overlap_bev`。 +### 接口原型 +```python +mx_driving.detection.npu_boxes_overlap_bev(Tensor boxes_a, Tensor boxes_b) -> Tensor +``` +### 功能描述 +计算bev视角下中两个边界框的重叠面积。 +### 参数说明 +- `boxes_a (Tensor)`:第一组bounding boxes,数据类型为`float32`。shape为`[M, 5]`。其中`5`分别代表`x1, y1, x2, y2, angle`, `x1, y1, x2, y2`代表box四个顶点的横纵坐标,`angle`代表box的弧度制旋转角。 +- `boxes_b (Tensor)`:第二组bounding boxes,数据类型为`float32`。shape为`[N, 5]`。其中`5`分别代表`x1, y1, x2, y2, angle`, `x1, y1, x2, y2`代表box四个顶点的横纵坐标,`angle`代表box的弧度制旋转角。 +### 返回值 +- `area_overlap(Tensor)`:包含两组bounding boxes交叠面积的张量,数据类型为`float32`。shape为`[M, N]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.detection import npu_boxes_overlap_bev +boxes_a = torch.tensor([[0, 0, 2, 2, 0]], dtype=torch.float32).npu() +boxes_b = torch.tensor([[1, 1, 3, 3, 0]], dtype=torch.float32).npu() +area_overlap = npu_boxes_overlap_bev(boxes_a, boxes_b) +``` +## box_iou_quadri +### 接口原型 +```python +mx_driving.detection.box_iou_quadri(Tensor boxes_a, Tensor boxes_b, str mode='iou', bool aligned=False) -> Tensor +``` +### 功能描述 +计算两个边界框的IoU。 +### 参数说明 +- `boxes_a (Tensor)`:第一组bounding boxes,数据类型为`float32`。shape为`[M, 8]`。其中`8`分别代表`x1, y1, x2, y2, x3, y3, x4, y4`, 表示box四个顶点的横纵坐标。 +- `boxes_b (Tensor)`:第二组bounding boxes,数据类型为`float32`。shape为`[N, 8]`。其中`8`分别代表`x1, y1, x2, y2, x3, y3, x4, y4`, 表示box四个顶点的横纵坐标。 +- `mode (str)`:取值为`"iou"`时,计算IoU(intersection over union);取值为`"iof"`时,计算IoF(intersection over foregroud)。 +- `aligned (bool)`:取值为`True`时,只计算配对的box之间的结果;取值为`False`时,计算每对box之间的结果。 +### 返回值 +- `ious(Tensor)`:包含两组bounding boxes的IoU(`mode="iou"`)或IoF(`mode="iof"`)的张量,数据类型为`float32`。shape为`[M]`(`aligned=True`)或`[M, N]`(`aligned=False`)。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.detection import box_iou_quadri +boxes_a = torch.tensor([[7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0]], dtype=torch.float32).npu() +boxes_b = torch.tensor([[7.0, 6.0, 7.0, 8.0, 9.0, 8.0, 9.0, 6.0]], dtype=torch.float32).npu() +ious = box_iou_quadri(boxes_a, boxes_b, mode="iou", aligned=False) +``` +## npu_nms3d +### 接口原型 +```python +mx_driving.detection.npu_nms3d(Tensor boxes, Tensor scores, float: iou_threshold) -> Tensor +``` +### 功能描述 
+3D非极大值抑制,在bev视角下剔除多个3d box交并比大于阈值的box。 +### 参数说明 +- `boxes(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[N, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +- `scores(Tensor)`:评分张量,数据类型为`float32, float16`。shape 为`[N]`。 +- `iou_threshold(float)`:IoU阈值。 +### 返回值 +- `Tensor`:NMS后的框张量,数据类型为`int32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.detection import npu_nms3d +boxes = torch.tensor([[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]], dtype=torch.float32).npu() +scores = torch.tensor([1, 2], dtype=torch.float32).npu() +out = npu_nms3d(boxes, scores, 0.5) +``` +## npu_nms3d_normal +### 接口原型 +```python +mx_driving.detection.npu_nms3d_normal(Tensor boxes, Tensor scores, float: iou_threshold) -> Tensor +``` +### 功能描述 +3D非极大值抑制。 +### 参数说明 +- `boxes(Tensor)`:框张量,数据类型为`float32, float16`。shape 为`[N, 7]`。`7`分别代表`x, y, z, x_size, y_size, z_size, rz`。 +- `scores(Tensor)`:评分张量,数据类型为`float32, float16`。shape 为`[N]`。 +- `iou_threshold(float)`:IoU阈值。 +### 返回值 +- `Tensor`:NMS后的框张量,数据类型为`int32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.detection import npu_nms3d_normal +boxes = torch.tensor([[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9]], dtype=torch.float32).npu() +scores = torch.tensor([1, 2], dtype=torch.float32).npu() +out = npu_nms3d_normal(boxes, scores, 0.5) +``` +## npu_rotated_iou +### 接口原型 +```python +mx_driving.detection.npu_rotated_iou(Tensor self, Tensor query_boxes, bool trans=False, int mode=0, bool is_cross=True, float v_threshold=0.0, float e_threshold=0.0) -> Tensor +``` +### 功能描述 +计算旋转框的IoU。 +### 参数说明 +- `self(Tensor)`:边界框张量,数据类型为`float32, float16`,形状为`[B, N, 5]`。 +- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, M, 5]`。 +- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`,其中`t`为角度制。 +- `is_cross(bool)`:值为`True`时,则对两组边界框中每个边界框之间进行计算。值为`False`时,只对对齐的边界框之间进行计算。 +- `mode(int)`:计算IoU的模式。默认值为`0`。值为`0`时,表示计算`IoU`,值为`1`时,表示计算`IoF`。 +- `v_threshold(float)`:顶点判断的容忍阈值。 +- `e_threshold(float)`:边相交判断的容忍阈值。 +### 返回值 +- `Tensor`:IoU张量,数据类型为`float32, float16`,`is_cross`为`True`时形状为`[B, N, M],反之则为`[B, N]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +import numpy as np +from mx_driving.detection import npu_rotated_iou +a = np.random.uniform(0, 1, (2, 2, 5)).astype(np.float16) +b = np.random.uniform(0, 1, (2, 3, 5)).astype(np.float16) +box1 = torch.from_numpy(a).npu() +box2 = torch.from_numpy(b).npu() +iou = npu_rotated_iou(box1, box2, False, 0, True, 1e-5, 1e-5) +``` +## npu_rotated_overlaps +### 接口原型 +```python +mx_driving.detection.npu_rotated_overlaps(Tensor self, Tensor query_boxes, bool trans=False) -> Tensor +``` +### 功能描述 +计算旋转框的重叠面积。 +### 参数说明 +- `self(Tensor)`:边界框张量,数据类型为`float32, float16`,形状为`[B, N, 5]`。 +- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, M, 5]`。 +- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`。 +### 返回值 +- `Tensor`:重叠面积张量,数据类型为`float32, float16`,形状为`[B, N, M]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +import numpy as np +from mx_driving.detection import npu_rotated_overlaps +a = np.random.uniform(0, 1, (1, 3, 5)).astype(np.float16) +b = np.random.uniform(0, 1, (1, 2, 5)).astype(np.float16) +box1 = torch.from_numpy(a).npu() +box2 = torch.from_numpy(b).npu() +output = npu_rotated_overlaps(box1, box2, True) +``` +## roi_align_rotated[beta] +### 接口原型 +```python +mx_driving.detection.roi_align_rotated(Tensor feature_map, Tensor rois, 
float: spatial_scale, + int: sampling_ratio, int: pooled_height, int: pooled_width, bool: aligned, bool: clockwise) -> Tensor +``` +### 功能描述 +计算旋转候选框的RoI Align池化特征图。 +### 参数说明 +- `feature map(Tensor)`:特征图张量,数据类型为`float32`,形状为`[B, C, H, W]`。 +- `rois(Tensor)`:感兴趣区域张量,数据类型为`float32`,形状为`[n, 6]`。 +- `spatial_scale(float)`:感兴趣区域边界框的缩放率,数据类型为`float32`。 +- `sampling_ratio(int)`:采样率,数据类型为`int`。取值范围为非负整数。 +- `pooled_height(int)`:池化特征图高度,数据类型为`int`。 +- `pooled_width(int)`:池化特征图宽度,数据类型为`int`。 +- `aligned(bool)`:是否对齐,数据类型为`bool`。值为`True`时,表示对齐, 值为`False`时,表示不对齐。 +- `clockwise(bool)`:旋转候选框的旋转方向,数据类型为`bool`。值为`True`时,表示逆时针旋转,值为`False`时,表示顺时针旋转。 +### 返回值 +- `Tensor`:池化特征图张量,数据类型为`float32`,形状为`[n, C, pooled_height, pooled_width]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import math +import torch, torch_npu +import numpy as np +from mx_driving.detection import roi_align_rotated + +feature_map = torch.rand([1, 3, 16, 16]) +feature_map.requires_grad = True +rois = torch.Tensor(6, 8) +rois[0] = torch.randint(0, 1, (8,)) +rois[1].uniform_(0, 16) +rois[2].uniform_(0, 16) +rois[3].uniform_(0, 16) +rois[4].uniform_(0, 16) +rois[5].uniform_(0, math.pi) + +output = roi_align_rotated(feature_map.npu(), rois.npu(), 1, 1, 7, 7, True, True) +output.backward(torch.ones_like(output)) +``` +### 其他说明 +在双线性插值采样过程中,当采样点`x`接近`-1`或`W`位置,`y`接近`-1`或`H`位置时,由于平台差异和计算误差,可能导致该采样点的精度无法与竞品精度完全对齐。 + +## roiaware_pool3d +### 接口原型 +```python +mx_driving.detection.roiaware_pool3d(Tensor rois, Tensor pts, Tensor pts_feature, + Union[int, tuple] out_size, int max_pts_per_voxel, int mode) -> Tensor +``` +### 功能描述 +将输入的点云特征在ROI框内进行池化 +### 参数说明 +- `rois (Tensor)`:输入的RoI框坐标与尺寸,数据类型为`float32/float16`,shape为`[Roi_num, 7]`。 +- `pts (Tensor)`:输入的点云坐标,数据类型为`float32/float16`,shape为`[Pts_num, 3]`。 +- `pts_feature (Tensor)`:输入的点的特征向量,数据类型为`float32/float16`,shape为`[Pts_num, Channels]`。 +- `out_size (Union)`:输出的RoI框内voxel的尺寸,数据类型为`int`或者`tuple`,shape为`[out_x, out_y, out_z]`。 +- `max_pts_per_voxel (int)`:每个voxel内最大的点的个数,数据类型为`int`。 +- `mode (int)`:池化的方式,0为maxpool, 1为avgpool,数据类型为`int`。 +### 返回值 +- `pooled_features (Tensor)`:池化得到的RoI框特征,数据类型为`float32/float16`,shape为`[Roi_num, out_x, out_y, out_z, Channels]`。 +### 约束说明 +- Roi_num <= 100 +- Pts_num <= 1000 +- Channels <= 1024 +- 1 <= max_pts_per_voxel <=256,max_pts_per_voxel <= Pts_num +- 反向具有相同约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import math +import torch_npu +import mx_driving.detection + +out_size = (5, 5, 5) +max_pts_per_voxel = 128 +mode = 1 + +N = 40 +npoints = 1000 +channels = 1024 + +xyz_coor = np.random.uniform(-1, 1, size = (N, 3)).astype(np.float32) +xyz_size_num = np.random.uniform(5, 50, size = (1, 3)) +xyz_size = (xyz_size_num * np.ones((N, 3))).astype(np.float32) +angle = np.radians(np.random.randint(0, 360, size = (N , 1))).astype(np.float32) + +rois = np.concatenate((xyz_coor, xyz_size), axis=1) +rois = np.concatenate((rois, angle), axis=1) + +pts = np.random.uniform(-5, 5, size = (npoints, 3)).astype(np.float32) +pts_feature = np.random.uniform(-1, 1, size=(npoints, channels)).astype(np.float32) + +pooled_features_npu = mx_driving.detection.roiaware_pool3d(torch.tensor(rois).npu(), torch.tensor(pts).npu(), + torch.tensor(pts_feature).npu(), out_size, max_pts_per_voxel, mode) +``` + +## border_align +### 接口原型 +```python +mx_driving.detection.border_align(Tensor feature_map, Tensor rois, int pooled_size) -> Tensor +``` +### 功能描述 +对输入的RoI框进行边缘特征提取。 +### 参数说明 +- `feature_map (Tensor)`:输入的特征图,数据类型为`float32`,shape为`[Batch_size, Channels, Height, 
Width]`。 +- `rois (Tensor)`:输入的RoI框坐标,数据类型为`int32`,shape为`[Batch_size, Height * Width, 4]`。 +- `pooled_size (int)`:在每条边上的采样点数,数据类型为`int`。 +### 返回值 +- `out_features (Tensor)`:提取到的RoI框特征,数据类型为`float32`,shape为`[Batch_size, Channels / 4, Height * Width, 4]`。 +### 约束说明 +- Batch_size <= 128 +- Channels <= 8192, Channels % 4 == 0 +- Height <= 256, Width <= 256 +- 2 <= pooled_size <= 20 +- 反向具有相同约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import torch_npu +import numpy as np +from mx_driving.detection import border_align + +def generate_features(feature_shape): + features = torch.rand(feature_shape) + return features + +def generate_rois(inputs): + num_boxes = inputs.shape[0] * inputs.shape[2] * inputs.shape[3] + xyxy = torch.rand(num_boxes, 4) + xyxy[:, 0::2] = xyxy[:, 0::2] * inputs.size(3) + xyxy[:, 1::2] = xyxy[:, 1::2] * inputs.size(2) + xyxy[:, 2:] = xyxy[:, 0:2] + xyxy[:, 2:] + rois = xyxy.view(inputs.shape[0], -1, 4).contiguous() + return rois + +batch_size = 2 +input_channels = 16 +input_height = 8 +input_width = 8 +pooled_size = 3 +features = generate_features([batch_size, input_channels, input_height, input_width]) +rois = generate_rois(features) +output = border_align(features.npu(), rois.npu(), pooled_size) +``` + +## pixel_group +### 接口原型 +```python +mx_driving.detection.pixel_group(Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, Tensor kernel_contour, int kernel_region_num, float distance_threshold) -> List[List] +``` +### 功能描述 +根据像素之间的嵌入向量和距离,将未被分组的像素分组。 +### 参数说明 +- `score (Tensor)`:前景得分矩阵,数据类型为`float32`,shape为`[Height, Width]`。 +- `mask (Tensor)`:前景掩码矩阵,数据类型为`bool`,shape为`[Height, Width]`。 +- `embedding (Tensor)`:特征向量,数据类型为`float32`,shape为`[Height, Width, Embedding_dim]`。 +- `kernel_label (Tensor)`:像素的实例标签,数据类型为`int32`,shape为`[Height, Width]`。 +- `kernel_contour (Tensor)`:内核的边界像素,数据类型为`uint8`,shape为`[Height, Width]`。 +- `kernel_region_num`:不同内核(分组)的数量,数据类型为`int`。 +- `distance_threshold`:嵌入向量的距离阈值,数据类型为`float`。 +### 返回值 +- `pixel_assignment (List)`:像素的分组信息,数据类型为`float32`,length为入参`kernel_region_num`。 +### 约束说明 +- mask = score > 0.5 +- `score`的取值范围在`[0, 1]`之间 +- `kernel_label`的最大值为`kernel_region_num`-1 +- `kernel_contour`的取值非0即1 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +import numpy as np +from mx_driving.detection import pixel_group +H, W, dim, num = 10, 10, 8, 3 +score = np.random.uniform(0, 1, [H, W]).astype(np.float32) +score = torch.from_numpy(score).npu() +mask = (score) > 0.5 +embedding = np.random.uniform(0, 10, [H, W, dim]).astype(np.float32) +embedding = torch.from_numpy(embedding).npu() +kernel_label = np.random.uniform(0, num, [H, W]).astype(np.int32) +kernel_label = torch.from_numpy(kernel_label).npu() +kernel_contour = np.random.uniform(0, 1, [H, W]).astype(np.uint8) +kernel_contour = torch.from_numpy(kernel_contour).npu() +kernel_region_num = num +distance_threshold = float(0.8) + +output = pixel_group(score, mask, embedding, kernel_label, kernel_contour, kernel_region_num, distance_threshold) +``` + +# 融合算子 + + +## npu_max_pool2d +### 接口原型 +```python +mx_driving.fused.npu_max_pool2d(Tensor x, int kernel_size, int stride, int padding) -> Tensor +``` +### 功能描述 +对输入进行最大池化,并输出最大池化值。 +### 参数说明 +- `x (Tensor)`:一组待池化对象,数据类型为`float32`,format为NCHW,输入数据量不超过10亿。 +### 返回值 +- `y (Tensor)`:池化后的最大值,数据类型为`float32`,format为NCHW。 +### 约束说明 +kernel_size仅支持3,stride仅支持2,padding仅支持1,且输入C轴数据量要求为8的倍数,H和W需要大于100。 +性能在C值较大的场景下较优,建议使用规格为C>=64。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, 
torch_npu +from mx_driving.fused import npu_max_pool2d +kernel_size = 3 +stride = 2 +padding = 1 +x = torch.randn(18, 64, 464, 800).npu() +res = npu_max_pool2d(x, kernel_size, stride, padding) +``` + +## npu_deformable_aggregation +### 接口原型 +```python +mx_driving.fused.npu_deformable_aggregation(Tensor feature_maps, Tensor spatial_shape, Tensor scale_start_index, Tensor sample_locations, Tensor weight) -> Tensor +``` +### 功能描述 +可变形聚合,对于每个锚点实例,对多个关键点的多时间戳、视图、缩放特征进行稀疏采样后分层融合为实例特征,实现精确的锚点细化。 +### 参数说明 +- `feature_maps(Tensor)`:特征张量,数据类型为`float32`。shape为`[bs, num_feat, c]`。其中`bs`为batch size,`num_feat`为特征图的大小,`c`为特征图的维度。 +- `spatial_shape(Tensor)`:特征图的形状,数据类型为`int32`。shape为`[cam, scale, 2]`。其中`cam`为相机数量,其中`scale`为每个相机的特征图数量,`2`分别代表H, W。 +- `scale_start_index(Tensor)`:每个特征图的偏移位置张量,数据类型为`int32`。shape为`[cam, scale]`,其中`cam`为相机数量,其中`scale`每个相机的特征图数量。 +- `sample_locations(Tensor)`:位置张量,数据类型为`float32`。shape为`[bs, anchor, pts, cam, 2]`。其中`bs`为batch size,`anchor`为锚点数量,`pts`为采样点的数量,`cam`为相机的数量,`2`分别代表y, x。 +- `weight(Tensor)`:权重张量,数据类型为`float32`。shape为`[bs, anchor, pts, cam, scale, group]`。其中`bs`为batch size,`anchor`为锚点数量,`pts`为采样点的数量,`cam`为相机的数量,`scale`每个相机的特征图数量,`group`为分组数。 +### 返回值 +- `output(Tensor)`:输出结果张量,数据类型为`float32`。shape为`[bs, anchor, c]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +- bs <= 128 +- num_feat的值为spatial_shape中每幅图的特征数量之和 +- c <= 256,且为group的整数倍 +- cam <= 6 +- scale <= 4 +- anchor <= 2048 +- pts <= 2048 +- group <= 32,且为2的指数倍 +- sample_locations的值在[0, 1]之间。 +- 每个输入tensor的数据量不超过1.5亿。 +- 反向具有相同约束。 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.fused import npu_deformable_aggregation + +bs, num_feat, c, cam, anchor, pts, scale, group = 1, 2816, 256, 1, 10, 2000, 1, 8 + +feature_maps = torch.ones_like(torch.randn(bs,num_feat ,c)).to(torch.float16) +spatial_shape = torch.tensor([[[32, 88]]]) +scale_start_index = torch.tensor([[0]]) +sampling_location = torch.rand(bs, anchor, pts, cam, 2) +weights = torch.randn(bs, anchor, pts, cam, scale, group) + +out = npu_deformable_aggregation(feature_maps.npu(), spatial_shape.npu(), scale_start_index.npu(), sampling_location.npu(), weights.npu()) +``` + +## deform_conv2d(DeformConv2dFunction.apply) +### 接口原型 +```python +mx_driving.fused.deform_conv2d(Tensor x, Tensor offset, Tensor weight, Union[int, Tuple[int, ...]] stride, Union[int, Tuple[int, ...]] padding, Union[int, Tuple[int, ...]] dilation, int groups, int deformable_groups) -> Tensor +``` +### 功能描述 +可变形卷积。 +### 参数说明 +- `x(Tensor)`:输入特征,数据类型为`float32`,shape为`(n, c_in, h_in, w_in)`,其中`n`为 batch size,`c_in`为输入特征的通道数量,`h_in`为输入特征图的高,`w_in`为输入特征图的宽。 +- `offset(Tensor)`:偏移量,数据类型为`float32`,shape 为`(n, 2 * k * k, h_out, w_out)`,其中`n`为 batch size,`k` 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +- `weight(Tensor)`:卷积核权重,数据类型为`float32`,shape 为 `(c_out, c_in, k, k)`,其中 `c_out` 为输出的通道数,`c_in` 为输入的通道数,`k` 为卷积核大小。 +- `stride(Union)`:卷积步长。 +- `padding(Union)`:卷积的填充大小。 +- `dilation(Union)`:空洞卷积大小。 +- `groups(int)`:分组卷积大小,当前只支持1。 +- `deformable_groups(int)`:将通道分成几组计算offsets,当前只支持1。 +### 返回值 +- `output(Tensor)`:输出张量,数据类型为`float32`,shape 为 `(n, c_out, h_out, w_out)`,其中`n`为 batch size,`c_out`为输出通道,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +1. `deformable_groups`和`groups`当前只支持1。 +2. `h_in`,`w_in`,`h_out`,`w_out`需满足 +$$ +w_{out}=(w_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 \\ +h_{out}=(h_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 +$$ +3. 
`c_in`需要为64的倍数。 +### 调用示例 +```python +import torch +import torch_npu +from mx_driving.fused import deform_conv2d, DeformConv2dFunction + +n, c_in, h_in, w_in = 16, 64, 100, 200 +c_out, k, h_out, w_out = 64, 3, 50, 100 + +x = torch.randn((n, c_in, h_in, w_in)).npu() +offset = torch.randn((n, 2 * k * k, h_out, w_out)).npu() +weight = torch.randn((c_out, c_in, k, k)).npu() +stride = 1 +padding = 1 +dilation = 1 +groups = 1 +deformable_groups = 1 + +output = deform_conv2d(x, offset, weight, stride, padding, dilation, groups, deformable_groups) +output = DeformConv2dFunction.apply(x, offset, weight, stride, padding, dilation, groups, deformable_groups) +``` +## modulated_deform_conv2d(ModulatedDeformConv2dFunction.apply) +### 接口原型 +```python +mx_driving.fused.modulated_deform_conv2d(Tensor x, Tensor offset, Tensor mask, Tensor weight, Tensor bias, Union[int, Tuple[int, ...]] stride, Union[int, Tuple[int, ...]] padding, Union[int, Tuple[int, ...]] dilation, int groups, int deformable_groups) -> Tensor +``` +### 功能描述 +在可变形卷积的基础之上加上了 modulation 机制,通过调控输出特征的幅度,提升可变形卷积的聚焦相关区域的能力。 +### 参数说明 +- `x(Tensor)`:输入特征,数据类型为`float32`,shape为`(n, c_in, h_in, w_in)`,其中`n`为 batch size,`c_in`为输入特征的通道数量,`h_in`为输入特征图的高,`w_in`为输入特征图的宽。 +- `offset(Tensor)`:偏移量,数据类型为`float32`,shape 为`(n, 2 * k * k, h_out, w_out)`,其中`n`为 batch size,`k` 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +- `mask(Tensor)`:掩码,用于调控输出特征的幅度,数据类型为`float32`,shape 为`(n, k * k, h_out, w_out)`,其中`n`为 batch size,k 为卷积核大小,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +- `weight(Tensor)`:卷积核权重,数据类型为`float32`,shape 为 `(c_out, c_in, k, k)`,其中 `c_out` 为输出的通道数,`c_in` 为输入的通道数,`k` 为卷积核大小。 +- `bias(Tensor)`:偏置,暂不支持bias,传入 `None` 即可。 +- `stride(Union)`:卷积步长。 +- `padding(Union)`:卷积的填充大小。 +- `dilation(Union)`:空洞卷积大小。 +- `groups(int)`:分组卷积大小,当前只支持1。 +- `deformable_groups(int)`:将通道分成几组计算offsets,当前只支持1。 +### 返回值 +- `output(Tensor)`:输出张量,数据类型为`float32`,shape 为 `(n, c_out, h_out, w_out)`,其中`n`为 batch size,`c_out`为输出通道,`h_out` 为输出特征图高,`w_out` 为输出特征图的宽。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +1. `deformable_groups`和`groups`当前只支持1。 +2. `h_in`,`w_in`,`h_out`,`w_out`需满足 +$$ +w_{out}=(w_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 \\ +h_{out}=(h_{in}+ 2 * padding - (dilation * (k - 1) + 1)) / stride + 1 +$$ +3. 
`c_in`需要为64的倍数。 +### 调用示例 +```python +import torch +import torch_npu +from mx_driving.fused import modulated_deform_conv2d, ModulatedDeformConv2dFunction + +n, c_in, h_in, w_in = 16, 64, 100, 200 +c_out, k, h_out, w_out = 64, 3, 50, 100 + +x = torch.randn((n, c_in, h_in, w_in)).npu() +offset = torch.randn((n, 2 * k * k, h_out, w_out)).npu() +mask = torch.randn((n, k * k, h_out, w_out)).npu() +weight = torch.randn((c_out, c_in, k, k)).npu() +bias = None +stride = 1 +padding = 1 +dilation = 1 +groups = 1 +deformable_groups = 1 + +output = modulated_deform_conv2d(x, offset, mask, weight, bias, + stride, padding, dilation, groups, deformable_groups) +output = ModulatedDeformConv2dFunction.apply(x, offset, mask, weight, bias, + stride, padding, dilation, groups, deformable_groups) +``` + +# 点云算子 +## bev_pool +### 接口原型 +```python +mx_driving.point.bev_pool(Tensor feat, Tensor geom_feat, int B, int D, int H, int W) -> Tensor +``` +### 功能描述 +BEV池化。可参考论文`BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation` +### 参数说明 +- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[N, C]`。其中`N`为原特征张量拉伸后的数量,`C`为特征的维度。 +- `geom_feat(Tensor)`:输出坐标张量,数据类型为`int32`。shape为`[N, 4]`。其中`4`分别代表`h, w, b, d`。 +- `B(int)`:batch size。 +- `D(int)`:输出池化深度。 +- `H(int)`:输出池化高度。 +- `W(int)`:输出池化宽度。 +### 返回值 +- `bev_pooled_feat(Tensor)`:采样后的点云数据,数据类型为`float32`。shape为`[B, D, H, W, C]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 约束说明 +- `geom_feat`的4个对应的值必须在`[0, H-1]`, `[0, W-1]`, `[0, B-1]`, `[0, D-1]`之间。 +- `geom_feat`和`feat`的第0维长度必须相同。 +- C <= 1024 +- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 +- 对于反向也是同样的约束。 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import bev_pool +feat = torch.rand(4, 256).npu() +feat.requires_grad_() +geom_feat = torch.tensor([[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 2], [0, 0, 0, 3]], dtype=torch.int32).npu() +bev_pooled_feat = bev_pool(feat, geom_feat, 4, 1, 256, 256) +loss = bev_pooled_feat.sum() +loss.backward() +``` +## bev_pool_v2 +### 接口原型 +```python +mx_driving.point.bev_pool_v2(Tensor depth, Tensor feat, Tensor ranks_depth, Tensor ranks_feat, Tensor ranks_bev, + List[int] bev_feat_shape, Tensor interval_starts, Tensor interval_lengths) -> Tensor +``` +### 功能描述 +BEV池化优化。可参考论文`BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View`。 +### 参数说明 +- `depth(Tensor)`:深度张量,数据类型为`float32`。shape为`[B, N, D, H, W]`。其中`B`为batch size,`N`为特征的数量,`D, H, W`分别代表深度、高度、宽度。 +- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[B, N, H, W, C]`。其中`B`为batch size,`N`为特征的数量,`H, W, C`分别代表高度、宽度、通道数。 +- `ranks_depth(Tensor)`:深度排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `ranks_feat(Tensor)`:特征排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `ranks_bev(Tensor)`:BEV排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `bev_feat_shape(List[int])`:BEV特征形状,数据类型为`int32`。长度为`5`, 分别代表`B, D, H, W, C`。 +- `interval_starts(Tensor)`:间隔开始张量,数据类型为`int32`。shape为`[N_INTERVALS]`。 +- `interval_lengths(Tensor)`:间隔长度张量,数据类型为`int32`。shape为`[N_INTERVALS]`。 +### 返回值 +- `bev_pooled_feat(Tensor)`:BEV池化后的特征张量,数据类型为`float32`。shape为`[B, D, H, W, C]`。 +### 约束说明 +- `ranks_depth`的值必须在`[0, B*B*D*H*W]`之间。 +- `ranks_feat`的值必须在`[0, B*N*H*W]`之间。 +- `ranks_bev`的值必须在`[0, B*D*H*W]`之间。 +- C <= 1024 +- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 +- N_RANKS <= 2^21 +- 对于反向也是同样的约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import bev_pool_v2 +depth = torch.rand(2, 1, 8, 256, 256).npu() +feat = torch.rand(2, 1, 256, 256, 64).npu() +feat.requires_grad_() +ranks_depth = 
torch.tensor([0, 1], dtype=torch.int32).npu() +ranks_feat = torch.tensor([0, 1], dtype=torch.int32).npu() +ranks_bev = torch.tensor([0, 1], dtype=torch.int32).npu() +bev_feat_shape = [2, 8, 256, 256, 64] +interval_starts = torch.tensor([0], dtype=torch.int32).npu() +interval_lengths = torch.tensor([2], dtype=torch.int32).npu() +bev_pooled_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) +loss = bev_pooled_feat.sum() +loss.backward() +``` +## bev_pool_v3 +### 接口原型 +```python +mx_driving.point.bev_pool_v3(Tensor depth, Tensor feat, Tensor ranks_depth, Tensor ranks_feat, Tensor ranks_bev, + List[int] bev_feat_shape) -> Tensor +``` +### 功能描述 +BEV池化优化。`bev_pool_v2`的NPU亲和版本,优先推荐使用。 +### 参数说明 +- `depth(Tensor)`:深度张量,数据类型为`float32`。shape为`[B, N, D, H, W]`。其中`B`为batch size,`N`为特征的数量,`D, H, W`分别代表深度、高度、宽度。 +- `feat(Tensor)`:特征张量,数据类型为`float32`。shape为`[B, N, H, W, C]`。其中`B`为batch size,`N`为特征的数量,`H, W, C`分别代表高度、宽度、通道数。 +- `ranks_depth(Tensor)`:深度排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `ranks_feat(Tensor)`:特征排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `ranks_bev(Tensor)`:BEV排序张量,数据类型为`int32`。shape为`[N_RANKS]`。 +- `bev_feat_shape(List[int])`:BEV特征形状,数据类型为`int32`。长度为`5`, 分别代表`B, D, H, W, C`。 +### 返回值 +- `bev_pooled_feat(Tensor)`:BEV池化后的特征张量,数据类型为`float32`。shape为`[B, D, H, W, C]`。 +### 约束说明 +- `ranks_depth`的值必须在`[0, B*B*D*H*W]`之间。 +- `ranks_feat`的值必须在`[0, B*N*H*W]`之间。 +- `ranks_bev`的值必须在`[0, B*D*H*W]`之间。 +- C 必须为8的倍数。 +- B * D * H * W * C <= 2^31, B, D <= 8, H, W <= 256 +- 对于反向也是同样的约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import bev_pool_v3 +depth = torch.rand(2, 1, 8, 256, 256).npu() +feat = torch.rand(2, 1, 256, 256, 64).npu() +feat.requires_grad_() +ranks_depth = torch.tensor([0, 1], dtype=torch.int32).npu() +ranks_feat = torch.tensor([0, 1], dtype=torch.int32).npu() +ranks_bev = torch.tensor([0, 1], dtype=torch.int32).npu() +bev_feat_shape = [2, 8, 256, 256, 64] +bev_pooled_feat = bev_pool_v3(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape) +loss = bev_pooled_feat.sum() +loss.backward() +``` +## furthest_point_sample_with_dist +### 接口原型 +```python +mx_driving.point.furthest_point_sample_with_dist(Tensor points, int num_points) -> Tensor +``` +### 功能描述 +与`npu_furthest_point_sampling`功能相同,但输入略有不同。 +### 参数说明 +- `points(Tensor)`:点云数据,表示各点间的距离,数据类型为`float32`。shape为`[B, N, N]`。其中`B`为batch size,`N`为点的数量。 +- `num_points(int)`:采样点的数量。 +### 返回值 +- `Tensor`:采样后的点云数据,数据类型为`float32`。shape为`[B, num_points]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import furthest_point_sample_with_dist +points = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() +out = furthest_point_sample_with_dist(points, 2) +``` +## npu_furthest_point_sampling +### 接口原型 +```python +mx_driving.point.npu_furthest_point_sampling(Tensor points, int num_points) -> Tensor +``` +### 功能描述 +点云数据的最远点采样。 +### 参数说明 +- `points(Tensor)`:点云数据,数据类型为`float32`。shape为`[B, N, 3]`。其中`B`为batch size,`N`为点的数量,`3`分别代表`x, y, z`。 +- `num_points(int)`:采样点的数量。 +### 返回值 +- `Tensor`:采样后的点云数据,数据类型为`float32`。shape为`[B, num_points]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import npu_furthest_point_sampling +points = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() +out = npu_furthest_point_sampling(points, 2) +``` + +### 算子约束 +1. 
points输入shape[B, N, 3]的总大小(B x N x 3)不应该超过383166 +## npu_group_points +Note:该接口命名将于2025年改为'group_points'。 +### 接口原型 +```python +mx_driving.point.npu_group_points(Tensor features, Tensor indices) -> Tensor +``` +### 功能描述 +点云数据按照索引重新分组。 +### 参数说明 +- `features`:需要被插值的特征,数据类型为`float32`,维度为(B, C, N)。 +- `indices`:获取目标特征计算的索引,数据类型为`int32`,维度为(B, npoints, nsample)。 +### 返回值 +- `output(Tensor)`:分组后的点云数据,数据类型为`float32`。shape为`[B, C, npoints, nsample]`。 +### 约束说明 +- `indices`的元素值需小于`features`的第三维度,即值在[0, N)。 +- C <= 1024 +- 反向具有相同约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import torch_npu +from mx_driving.point import npu_group_points + + +indices = torch.tensor([[[0, 2, 5, 5], [1, 0, 5, 0], [2, 1, 4, 4]]]).int().npu() +features = torch.tensor([[[0.9178, -0.7250, -1.6587, 0.0715, -0.2252, 0.4994], + [0.6190, 0.1755, -1.7902, -0.5852, -0.3311, 1.9764], + [1.7567, 0.0740, -1.1414, 0.4705, -0.3197, 1.1944], + [-0.2343, 0.1194, 0.4306, 1.3780, -1.4282, -0.6377], + [0.7239, 0.2321, -0.6578, -1.1395, -2.3874, 1.1281]]], + dtype=torch.float32).npu() +output = npu_group_points(features, indices) +``` + +## npu_add_relu +### 接口原型 +```python +mx_driving.fused.npu_add_relu(Tensor x, Tensor y) -> Tensor +``` +### 功能描述 +与`relu(x + y)`功能相同。 +### 参数说明 +- `x(Tensor)`:输入数据,数据类型为`float32`,shape无限制。 +- `y(Tensor)`:输入数据,数据类型为`float32`,shape需要和x一致。 +### 返回值 +- `Tensor`:输出数据,数据类型为`float32`,shape和x一致。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.fused import npu_add_relu +x = torch.tensor([[[1, 2, 3], [-1, 5, 6], [7, 8, 9]]], dtype=torch.float32).npu() +y = torch.tensor([[[1, 2, 3], [-1, -2, 6], [7, 8, 9]]], dtype=torch.float32).npu() +out = npu_add_relu(x, y) +``` +### 算子约束 +- 输入`x`与输入`y`的shape和dtype需要保持一致,不支持广播。 +- 仅在x的元素个数超过2000000时,相较于`relu(x + y)`有性能提升。 + +## voxelization +### 接口原型 +```python +mx_driving.point.voxelization(Tensor points, List[float] voxel_size, List[float] coors_range, int max_points=-1, int max_voxels=-1, bool deterministic=True) -> Tensor +``` +### 功能描述 +将点云数据进行体素化。 +### 参数说明 +- `points(Tensor)`:点云数据,数据类型为`float32`。shape为`[N, F]`。其中`N`为点的数量,`F`分别代表每个点的特征维度,其中`N > 0, F >= 3`。 +- `voxel_size(List[float])`:体素大小,数据类型为`float32`。shape为`[3]`。其中`3`分别代表`x, y, z`。 +- `coors_range(List[float])`:体素范围,数据类型为`float32`。shape为`[6]`。其中`6`分别代表`x_min, y_min, z_min, x_max, y_max, z_max`。 +- `max_points(int)`:每个体素的最大点数。默认值为`-1`。 +- `max_voxels(int)`:最大体素数。默认值为`-1`。 +- `deterministic(bool)`:是否确定性。默认值为`True`。 +### 返回值 +- `coors(Tensor)`:每个点所属的体素坐标,数据类型为`int32`。shape为`[N, 3]`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import Voxelization +points = torch.randint(-20, 100, [16, 3], dtype=torch.float32).npu() +coors_range = [0, -40, -3, 70.4, 40, 1] +max_points = -1 +voxel_size = [0.5, 0.5, 0.5] +dynamic_voxelization = Voxelization(voxel_size, coors_range, max_points) +coors = dynamic_voxelization.forward(points) +``` +## npu_dynamic_scatter +### 接口原型 +```python +mx_driving.point.npu_dynamic_scatter(Tensor feats, Tensor coors, str reduce_type = 'max') -> Tuple[torch.Tensor, torch.Tensor] +``` +### 功能描述 +将点云特征点在对应体素中进行特征压缩。 +### 参数说明 +- `feats(Tensor)`:点云特征张量[N, C],仅支持两维,数据类型为`float32`,特征向量`C`长度上限为2048。 +- `coors(Tensor)`:体素坐标映射张量[N, 3],仅支持两维,数据类型为`int32`,此处以x, y, z指代体素三维坐标,其取值范围为`0 <= x, y < 2048`, `0 <= z < 256`。 +- `reduce_type(str)`:压缩类型。可选值为`'max'`, `'mean'`, `'sum'`。默认值为`'max'` +### 返回值 +- `voxel_feats(Tensor)`:压缩后的体素特征张量,仅支持两维,数据类型为`float32`。 +- 
`voxel_coors(Tensor)`:去重后的体素坐标,仅支持两维,数据类型为`int32`。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch, torch_npu +from mx_driving.point import npu_dynamic_scatter + +feats = torch.tensor([[1, 2, 3], [3, 2, 1], [7, 8, 9], [9, 8, 7]], dtype=torch.float32).npu() +coors = torch.tensor([[1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]], dtype=torch.int32).npu() +voxel_feats, voxel_coors = npu_dynamic_scatter(feats, coors, 'max') + +``` +## unique_voxel +### 接口原型 +```python +mx_driving._C.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor +``` +### 功能描述 +对输入的点云数据进行去重处理。 +### 参数说明 +- `voxels (Tensor)`:数据语义为索引,数据类型为`int32`,shape为`[N]`。 +### 返回值 +- `num_voxels(int)`, 体素数量。 +- `uni_voxels(Tensor)`,去重后的体素数据,数据类型为`int32`,shape为`[num_voxels]`。 +- `uni_indices(Tensor)`, 去重后的索引数据,数据类型为`int32`,shape为`[num_voxels]`。 +- `argsort_indices(Tensor)`, 排序后的索引数据,数据类型为`int32`,shape为`[N]`。 +- `uni_argsort_indices(Tensor)`, 去重后的排序后的索引数据,数据类型为`int32`,shape为`[num_voxels]`。 +### 约束说明 +N的大小受限于内存大小,建议N小于等于2^32。 + +受限于芯片指令,输入的数据类型只能是int32,且>=0,<2^30。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import torch_npu +import numpy as np +from mx_driving._C import unique_voxel +voxels = np.random.randint(0, 1024, (100000,)).astype(np.int32) +voxels_npu = torch.from_numpy(voxels).npu() +num_voxels, uni_voxels, uni_indices, argsort_indices, uni_argsort_indices = unique_voxel(voxels_npu) + +``` + + +## voxel_pooling_train +### 接口原型 +```python +mx_driving.point.npu_voxel_pooling_train(Tensor geom_xyz, Tensor input_features, List[int] voxel_num) -> Tensor +``` +### 功能描述 +点云数据体素化。 +### 参数说明 +- `geom_xyz`:体素坐标,数据类型为`int32`,维度为(B, N, 3), 3表示x, y, z。 +- `input_features`:点云数据,数据类型为`float32|float16`,维度为(B, N, C)。 +- `voxel_num`:体素格子长宽高,数据类型为`int32`,维度为(3),3表示体素格子的长宽高。 +### 返回值 +- `output(Tensor)`:输出结果,数据类型为`float32|float16`。shape为`[B, num_voxel_y, num_voxel_x, C]`。 +### 约束说明 +- B <= 128 +- N <= 100000 +- C <= 256 +- num_voxel_x <= 1000 +- num_voxel_y <= 1000 +- num_voxel_z <= 10 +- B * num_voxel_y * num_voxel_x * C <= 100000000 +- B * N * C <= 100000000 +- 反向具有相同约束。 +### 支持的型号 +- Atlas A2 训练系列产品 +### 调用示例 +```python +import torch +import torch_npu +import mx_driving.point + +def gen_data(geom_shape, feature_shape, coeff, batch_size, num_channels, dtype): + geom_xyz = torch.rand(geom_shape) * coeff + geom_xyz = geom_xyz.reshape(batch_size, -1, 3) + geom_xyz[:, :, 2] /= 100 + geom_xyz_cpu = geom_xyz.int() + features = torch.rand(feature_shape, dtype=dtype) - 0.5 + features_cpu = features.reshape(batch_size, -1, num_channels) + + return geom_xyz_cpu, features_cpu + +dtype = torch.float32 +coeff = 90 +voxel_num = [128, 128, 1] +batch_size = 2 +num_points = 40 +num_channel = 80 +xyz = 3 + +geom_shape = [batch_size, num_points, xyz] +feature_shape = [batch_size, num_points, num_channel] + +geom_cpu, feature_cpu = gen_data(geom_shape, feature_shape, coeff, batch_size, num_channel, dtype) + +geom_npu = geom_cpu.npu() +feature_npu = feature_cpu.npu() + +result_npu = mx_driving.point.npu_voxel_pooling_train(geom_npu, feature_npu, voxel_num) +``` +# 稀疏卷积算子(beta) +## SparseConv3d(beta) +### 接口原型 +```python +mx_driving.spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor +``` +### 功能描述 +稀疏卷积 +### 参数说明 +- `in_channels(int)`:输入数据的通道数 +- `out_channels(int)`:输出通道数 +- `kernel_size(List(int)/Tuple(int)/int)`:卷积神经网络中卷积核的大小 +- `stride(List(int)/Tuple(int)/int)`:卷积核在输入数据上滑动时的步长 +- 
+## unique_voxel
+### Interface Prototype
+```python
+mx_driving._C.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor
+```
+### Function Description
+Deduplicates the input voxel indices.
+### Parameters
+- `voxels (Tensor)`: semantically an index tensor, of type `int32`, with shape `[N]`.
+### Returns
+- `num_voxels(int)`: the number of voxels.
+- `uni_voxels(Tensor)`: the deduplicated voxel data, of type `int32`, with shape `[num_voxels]`.
+- `uni_indices(Tensor)`: the deduplicated index data, of type `int32`, with shape `[num_voxels]`.
+- `argsort_indices(Tensor)`: the sorted index data, of type `int32`, with shape `[N]`.
+- `uni_argsort_indices(Tensor)`: the deduplicated sorted index data, of type `int32`, with shape `[num_voxels]`.
+### Constraints
+N is limited by the available memory; N <= 2^32 is recommended.
+
+Due to chip instruction limitations, the input dtype must be int32 and all values must be >= 0 and < 2^30.
+### Supported Products
+- Atlas A2 Training Series Products
+### Example
+```python
+import torch
+import torch_npu
+import numpy as np
+from mx_driving._C import unique_voxel
+voxels = np.random.randint(0, 1024, (100000,)).astype(np.int32)
+voxels_npu = torch.from_numpy(voxels).npu()
+num_voxels, uni_voxels, uni_indices, argsort_indices, uni_argsort_indices = unique_voxel(voxels_npu)
+
+```
+
+
+## voxel_pooling_train
+### Interface Prototype
+```python
+mx_driving.point.npu_voxel_pooling_train(Tensor geom_xyz, Tensor input_features, List[int] voxel_num) -> Tensor
+```
+### Function Description
+Voxelizes point-cloud data.
+### Parameters
+- `geom_xyz`: voxel coordinates, of type `int32`, with shape (B, N, 3), where 3 stands for x, y, z.
+- `input_features`: point-cloud data, of type `float32|float16`, with shape (B, N, C).
+- `voxel_num`: the number of voxel cells along each dimension (length, width, height), of type `int32`, with shape (3).
+### Returns
+- `output(Tensor)`: the output, of type `float32|float16`, with shape `[B, num_voxel_y, num_voxel_x, C]`.
+### Constraints
+- B <= 128
+- N <= 100000
+- C <= 256
+- num_voxel_x <= 1000
+- num_voxel_y <= 1000
+- num_voxel_z <= 10
+- B * num_voxel_y * num_voxel_x * C <= 100000000
+- B * N * C <= 100000000
+- The backward pass has the same constraints.
+### Supported Products
+- Atlas A2 Training Series Products
+### Example
+```python
+import torch
+import torch_npu
+import mx_driving.point
+
+def gen_data(geom_shape, feature_shape, coeff, batch_size, num_channels, dtype):
+    geom_xyz = torch.rand(geom_shape) * coeff
+    geom_xyz = geom_xyz.reshape(batch_size, -1, 3)
+    geom_xyz[:, :, 2] /= 100
+    geom_xyz_cpu = geom_xyz.int()
+    features = torch.rand(feature_shape, dtype=dtype) - 0.5
+    features_cpu = features.reshape(batch_size, -1, num_channels)
+
+    return geom_xyz_cpu, features_cpu
+
+dtype = torch.float32
+coeff = 90
+voxel_num = [128, 128, 1]
+batch_size = 2
+num_points = 40
+num_channel = 80
+xyz = 3
+
+geom_shape = [batch_size, num_points, xyz]
+feature_shape = [batch_size, num_points, num_channel]
+
+geom_cpu, feature_cpu = gen_data(geom_shape, feature_shape, coeff, batch_size, num_channel, dtype)
+
+geom_npu = geom_cpu.npu()
+feature_npu = feature_cpu.npu()
+
+result_npu = mx_driving.point.npu_voxel_pooling_train(geom_npu, feature_npu, voxel_num)
+```
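The size limits above lend themselves to a quick up-front check before dispatching to the NPU. The helper below is only an illustrative sketch derived from the constraints in this section; `check_voxel_pooling_limits` is not part of the mx_driving API:

```python
def check_voxel_pooling_limits(geom_xyz, input_features, voxel_num):
    # geom_xyz: (B, N, 3), input_features: (B, N, C), voxel_num: [num_voxel_x, num_voxel_y, num_voxel_z]
    b, n, _ = geom_xyz.shape
    c = input_features.shape[-1]
    num_x, num_y, num_z = voxel_num
    assert b <= 128 and n <= 100000 and c <= 256
    assert num_x <= 1000 and num_y <= 1000 and num_z <= 10
    assert b * num_y * num_x * c <= 100000000
    assert b * n * c <= 100000000

check_voxel_pooling_limits(geom_npu, feature_npu, voxel_num)
```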
+# Sparse Convolution Operators (beta)
+## SparseConv3d(beta)
+### Interface Prototype
+```python
+mx_driving.spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor
+```
+### Function Description
+Sparse convolution.
+### Parameters
+- `in_channels(int)`: number of input channels.
+- `out_channels(int)`: number of output channels.
+- `kernel_size(List(int)/Tuple(int)/int)`: size of the convolution kernel.
+- `stride(List(int)/Tuple(int)/int)`: stride with which the kernel slides over the input.
+- `dilation(List(int)/Tuple(int)/int)`: dilation of the convolution.
+- `groups(int)`: number of convolution groups.
+- `bias(bool)`: whether to use a bias term.
+- `indice_key(String)`: key used to reuse previously computed index information.
+- `mode(String)`: selects between the `mmcv` and `spconv` flavours of sparse convolution.
+### Returns
+- `SparseConvTensor`: holds the output features `out_feature`, the corresponding indices `out_indices`, and the corresponding spatial_shape.
+### Supported Products
+- Atlas A2 Training Series Products
+### Constraints
+- `kernel_size` currently supports a 3-element List/Tuple or an int, with values in `[1, 3]`.
+- `stride` currently supports a 3-element List/Tuple or an int.
+- `dilation` and `groups` currently only support the value 1.
+- The same constraints apply to the backward pass.
+### Example
+```python
+import torch, torch_npu
+import numpy as np
+from mx_driving.spconv import SparseConv3d, SparseConvTensor
+
+def generate_indice(batch, height, width, depth, actual_num):
+    base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num]
+    base_indices = np.sort(base_indices)
+    b_indice = base_indices // (height * width * depth)
+    base_indices = base_indices % (height * width * depth)
+    h_indice = base_indices // (width * depth)
+    base_indices = base_indices % (width * depth)
+    w_indice = base_indices // depth
+    d_indice = base_indices % depth
+    indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num)
+    return indices
+
+actual_num = 20
+batch = 4
+spatial_shape = [9, 9, 9]
+indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu()
+feature = torch.rand(actual_num, 16).npu()
+feature.requires_grad = True
+x = SparseConvTensor(feature, indices, spatial_shape, batch)
+net = SparseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu()
+out = net(x)
+dout = torch.ones_like(out.features).float().npu()
+out.features.backward(dout)
+```
+
+
+## SparseInverseConv3d(beta)
+### Interface Prototype
+```python
+mx_driving.spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, output_padding=0, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor
+```
+### Function Description
+Sparse inverse convolution.
+### Parameters
+- `in_channels(int)`: number of input channels.
+- `out_channels(int)`: number of output channels.
+- `kernel_size(List(int)/Tuple(int)/int)`: size of the convolution kernel.
+- `stride(List(int)/Tuple(int)/int)`: stride with which the kernel slides over the input.
+- `dilation(List(int)/Tuple(int)/int)`: dilation of the convolution.
+- `groups(int)`: number of convolution groups.
+- `bias(bool)`: whether to use a bias term.
+- `indice_key(String)`: key used to reuse previously computed index information.
+- `mode(String)`: selects between the `mmcv` and `spconv` flavours of sparse convolution.
+### Returns
+- `SparseConvTensor`: holds the output features `out_feature`, the corresponding indices `out_indices`, and the corresponding spatial_shape.
+### Supported Products
+- Atlas A2 Training Series Products
+### Constraints
+- `kernel_size` currently supports a 3-element List/Tuple or an int, with values in `[1, 3]`.
+- `stride` currently supports a 3-element List/Tuple or an int.
+- `dilation` and `groups` currently only support the value 1.
+- The same constraints apply to the backward pass.
+### Example
+```python
+import torch, torch_npu
+import numpy as np
+from mx_driving.spconv import SparseInverseConv3d, SparseConvTensor
+
+def generate_indice(batch, height, width, depth, actual_num):
+    base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num]
+    base_indices = np.sort(base_indices)
+    b_indice = base_indices // (height * width * depth)
+    base_indices = base_indices % (height * width * depth)
+    h_indice = base_indices // (width * depth)
+    base_indices = base_indices % (width * depth)
+    w_indice = base_indices // depth
+    d_indice = base_indices % depth
+    indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num)
+    return indices
+
+actual_num = 20
+batch = 4
+spatial_shape = [9, 9, 9]
+indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu()
+feature = torch.rand(actual_num, 16).npu()
+feature.requires_grad = True
+x = SparseConvTensor(feature, indices, spatial_shape, batch)
+net = SparseInverseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu()
+out = net(x)
+dout = torch.ones_like(out.features).float().npu()
+out.features.backward(dout)
+```
+
+
+## SubMConv3d(beta)
+### Interface Prototype
+```python
+mx_driving.spconv.SubMConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor
+```
+### Function Description
+Submanifold sparse convolution: an output position is affected only when the kernel center takes part in the computation.
+### Parameters
+- `in_channels(int)`: number of input channels.
+- `out_channels(int)`: number of output channels.
+- `kernel_size(List(int)/Tuple(int)/int)`: size of the convolution kernel.
+- `stride(List(int)/Tuple(int)/int)`: stride with which the kernel slides over the input.
+- `dilation(List(int)/Tuple(int)/int)`: dilation of the convolution.
+- `groups(int)`: number of convolution groups.
+- `bias(bool)`: whether to use a bias term.
+- `indice_key(String)`: key used to reuse previously computed index information.
+- `mode(String)`: selects between the `mmcv` and `spconv` flavours of sparse convolution.
+### Returns
+- `SparseConvTensor`: holds the output features `out_feature`, the corresponding indices `out_indices`, and the corresponding spatial_shape.
+### Supported Products
+- Atlas A2 Training Series Products
+### Constraints
+- `kernel_size` currently supports a 3-element List/Tuple or an int; only the values 1 and 3 are supported.
+- `stride` currently supports a 3-element List/Tuple or an int; only the value 1 is supported.
+- `dilation` and `groups` currently only support the value 1.
+- The same constraints apply to the backward pass.
+### Example
+```python
+import torch, torch_npu
+import numpy as np
+from mx_driving.spconv import SubMConv3d, SparseConvTensor
+
+def generate_indice(batch, height, width, depth, actual_num):
+    base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num]
+    base_indices = np.sort(base_indices)
+    b_indice = base_indices // (height * width * depth)
+    base_indices = base_indices % (height * width * depth)
+    h_indice = base_indices // (width * depth)
+    base_indices = base_indices % (width * depth)
+    w_indice = base_indices // depth
+    d_indice = base_indices % depth
+    indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num)
+    return indices
+
+actual_num = 20
+batch = 4
+spatial_shape = [9, 9, 9]
+indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu()
+feature = torch.rand(actual_num, 16).npu()
+feature.requires_grad = True
+x = SparseConvTensor(feature, indices, spatial_shape, batch)
+net = SubMConv3d(in_channels=16, out_channels=32, kernel_size=3).npu()
+out = net(x)
+dout = torch.ones_like(out.features).float().npu()
+out.features.backward(dout)
+```
\ No newline at end of file
diff --git a/mx_driving/point/ops/csrc/functions.h b/include/csrc/functions.h
similarity index 31%
rename from mx_driving/point/ops/csrc/functions.h
rename to include/csrc/functions.h
index 806ddd9504e6ec4a9839ad780cd4f8fd1359b5ba..98a171b0fa016366add9180eecf05197f0ddc3aa 100644
--- a/mx_driving/point/ops/csrc/functions.h
+++ b/include/csrc/functions.h
@@ -11,13 +11,90 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#ifndef CSRC_FUNCTIONS_H_ +#define CSRC_FUNCTIONS_H_ -#ifndef PERCEPTION_POINT_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_POINT_OPS_CSRC_FUNCTIONS_H_ -#include -#include +#include -#include +std::tuple knn( + const at::Tensor& xyz, const at::Tensor& center_xyz, int32_t k, bool is_from_knn); + +at::Tensor npu_three_interpolate( + int b, int c, int m, int n, const at::Tensor& points, const at::Tensor& idx, const at::Tensor& weight); + +at::Tensor npu_three_interpolate_backward( + int b, int c, int n, int m, const at::Tensor& grad_out, const at::Tensor& idx, const at::Tensor& weight); + +std::tuple scatter_max_with_argmax_v2( + const at::Tensor& updates, const at::Tensor& indices, c10::optional out); + +at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments); + +at::Tensor npu_scatter(const at::Tensor& self, const at::Tensor& indices, const at::Tensor& updates, int64_t dim); + +at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Tensor& count, int32_t dim); + +std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& index, c10::optional out, + c10::optional dim, c10::optional dim_size); +std::tuple npu_sort_pairs( + const at::Tensor& keys_in, const at::Tensor& values_in, int64_t dim, bool descending); + +at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y); + +std::tuple npu_hypot_grad( + const at::Tensor& x, const at::Tensor& y, const at::Tensor& out, const at::Tensor& out_grad); + +void assign_score_withk(const at::Tensor& points, const at::Tensor& centers, const at::Tensor& scores, + const at::Tensor& knn_idx, at::Tensor& output, int32_t B, int32_t N, int32_t npoint, int32_t M, int32_t K, + int32_t out_dim, int32_t aggregate); +at::Tensor npu_max_pool2d(const at::Tensor& x, int kernel_size, int stride, int padding); + +at::Tensor multi_scale_deformable_attn(const at::Tensor& value, const at::Tensor& value_spatial_shapes, + const at::Tensor& value_level_start_index, const at::Tensor& sampling_locations, + const at::Tensor& attention_weights); + +std::tuple multi_scale_deformable_attn_backward(const at::Tensor& value, + const at::Tensor& value_spatial_shapes, const at::Tensor& value_level_start_index, + const at::Tensor& sampling_locations, const at::Tensor& attention_weights, const at::Tensor& grad_output); + +std::tuple multi_scale_deformable_attn_grad_v2(const at::Tensor& value, + const at::Tensor& shape, const at::Tensor& level_start_index, const at::Tensor& location_trans, + const at::Tensor& attn_weight_trans, const at::Tensor& grad_output); + +at::Tensor npu_add_relu(at::Tensor& x, const at::Tensor& y); + +at::Tensor npu_add_relu_grad(at::Tensor& self, at::Tensor& grad_output); +std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& index, c10::optional out, + c10::optional dim, c10::optional dim_size); + +at::Tensor fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias, double negative_slop, double scale); + +at::Tensor deformable_aggregation(const at::Tensor& mc_ms_feat, const at::Tensor& spatial_shape, + const at::Tensor& scale_start_index, const at::Tensor& sampling_location, const at::Tensor& weights); +std::tuple deformable_aggregation_backward(const at::Tensor& mc_ms_feat, + const at::Tensor& spatial_shape, const at::Tensor& scale_start_index, const at::Tensor& sampling_location, + const at::Tensor& weights, const at::Tensor& grad_output, const at::Tensor& grad_mc_ms_feat, + const at::Tensor& grad_sampling_location, const at::Tensor& grad_weights); + +std::tuple 
deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, + const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, + at::IntArrayRef dilation, int64_t groups, int64_t deformable_groups); + +std::tuple modulated_deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, + const at::Tensor& mask, const at::Tensor& weight, const c10::optional& bias_opt, + at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, + int64_t groups, int64_t deformable_groups, int64_t with_bias); + +std::tuple deformable_conv2d_backward(const at::Tensor& input, + const at::Tensor& weight, const at::Tensor& offset, const at::Tensor& offset_output, const at::Tensor& grad_y, + at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, + int64_t groups, int64_t deformable_groups); + +std::tuple modulated_deformable_conv2d_backward( + const at::Tensor& input, const at::Tensor& offset, const at::Tensor& mask, const at::Tensor& weight, + const c10::optional& bias_opt, const at::Tensor& offset_output, const at::Tensor& grad_y, + at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, + int64_t groups, int64_t deformable_groups, int64_t with_bias); at::Tensor group_points( const at::Tensor& points, const at::Tensor& idx, int64_t b, int64_t c, int64_t n, int64_t npoints, int64_t nsample); @@ -78,4 +155,82 @@ at::Tensor npu_bev_pool_v3(const at::Tensor& depth, const at::Tensor& feat, cons const at::Tensor& ranks_feat, const at::Tensor& ranks_bev, int64_t b, int64_t d, int64_t h, int64_t w); std::tuple npu_bev_pool_v3_backward(const at::Tensor& grad_out, const at::Tensor& depth, const at::Tensor& feat, const at::Tensor& ranks_depth, const at::Tensor& ranks_feat, const at::Tensor& ranks_bev); -#endif // PERCEPTION_POINT_OPS_CSRC_FUNCTIONS_H_ +std::tuple npu_subm_sparse_conv3d(const at::Tensor& feature, + const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, int out_channel, + at::IntArrayRef outSpatialShape, int batch_size, const at::Tensor& temp); + +std::tuple multi_to_sparse(const at::Tensor& out_features, + const at::Tensor& unique_indices_offset, const at::Tensor& sorted_idx_to_former_indices, + const at::Tensor& outidx_pair); + +std::tuple multi_to_sparse_v2(const at::Tensor& features, const at::Tensor& weight, + const at::Tensor& unique_indices_offset, const at::Tensor& sorted_idx_to_former_indices, + const at::Tensor& outidx_pair); + +std::tuple npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size, + at::IntArrayRef stride, at::IntArrayRef padding, int out_channel, at::IntArrayRef outSpatialShape, int batch_size); + +std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, + const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, + at::IntArrayRef padding, at::IntArrayRef dilation, at::IntArrayRef output_padding, int out_channel, + at::IntArrayRef outSpatialShape, int batch_size); + +std::tuple npu_sparse_conv3d_grad(const at::Tensor& indices_offset, + const at::Tensor& former_sorted_indices, const at::Tensor& feature, const at::Tensor& weight, + const at::Tensor& grad); + +std::tuple npu_prepare_subm_conv3d( + const at::Tensor& flattenIndices, at::IntArrayRef outSpatialShape, int batch_size); + +std::tuple nms3d_normal(const at::Tensor& boxes, double nms_overlap_thresh); + +std::tuple nms3d(const at::Tensor& 
boxes, double threshold); + +at::Tensor npu_rotated_overlaps(const at::Tensor& self, const at::Tensor& query_boxes, bool trans); + +at::Tensor npu_rotated_iou(const at::Tensor& boxes, const at::Tensor& query_boxes, bool trans, int64_t mode, + bool is_cross, double v_threshold, double e_threshold); + +at::Tensor npu_boxes_overlap_bev(const at::Tensor& boxes_a, const at::Tensor& boxes_b); + +void roi_align_rotated_v2_forward_npu(const at::Tensor& input, const at::Tensor& rois_map, at::Tensor& output, + double spatial_scale, int32_t sampling_ratio, int32_t pooled_height, int32_t pooled_width, bool aligned, + bool clockwise); +at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, const at::Tensor& rois, const at::Tensor& grad_output, + int32_t pooled_height, int32_t pooled_width, double spatial_scale, int32_t sampling_ratio, bool aligned, + bool clockwise); + +at::Tensor npu_box_iou_quadri( + const at::Tensor& boxes_a, const at::Tensor& boxes_b, const int64_t mode_flag, const bool aligned); + +at::Tensor npu_box_iou_rotated( + const at::Tensor& boxes_a, const at::Tensor& boxes_b, const int64_t mode_flag, const bool aligned); + +void border_align(const at::Tensor& input, const at::Tensor& rois, at::Tensor& output, int32_t pooled_size); + +at::Tensor border_align_backward(const at::Tensor& grad_out, const at::Tensor& boxes, const at::Tensor& argmax_idx, + int32_t pool_size, int32_t height, int32_t width); + +void npu_roiaware_pool3d_forward(const at::Tensor& rois, const at::Tensor& pts, const at::Tensor& pts_feature, + at::Tensor& argmax, at::Tensor& pts_idx_of_voxels, at::Tensor& pooled_features, int32_t mode); +at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::Tensor& argmax, + const at::Tensor& grad_out, int32_t npoints, int64_t pool_method); + +std::vector> pixel_group(const at::Tensor& score, const at::Tensor& mask, + const at::Tensor& embedding, const at::Tensor& kernel_label, const at::Tensor& kernel_contour, + int kernel_region_num, double distance_threshold); + +at::Tensor npu_points_in_box(const at::Tensor& boxes, const at::Tensor& pts); + +at::Tensor npu_points_in_box_all(const at::Tensor& boxes, const at::Tensor& pts); + +std::tuple npu_roipoint_pool3d_forward(const int32_t num_sampled_points, + const at::Tensor& points, const at::Tensor& point_features, const at::Tensor& boxes3d); + +at::Tensor npu_geometric_kernel_attention(const at::Tensor& value, const at::Tensor& spatial_shapes, + const at::Tensor& level_start_index, const at::Tensor& sampling_locations, const at::Tensor& attn_weights); + +std::tuple npu_geometric_kernel_attention_backward(const at::Tensor& value, + const at::Tensor& spatial_shapes, const at::Tensor& level_start_index, const at::Tensor& sampling_locations, + const at::Tensor& attn_weights, const at::Tensor& grad_output); +#endif // CSRC_FUNCTIONS_H_ diff --git a/include/csrc/utils.h b/include/csrc/utils.h index 9ffbf71a525941cc3927678bd1af02a6615289e3..8c89c4b9174c3fd33faba9c58bafe850d4e57fd9 100644 --- a/include/csrc/utils.h +++ b/include/csrc/utils.h @@ -17,10 +17,9 @@ #ifndef CSRC_UTILS_H_ #define CSRC_UTILS_H_ -#include #include -template +template inline T1 Ceil(const T1& x, const T2& y) { if (y == 0) { @@ -29,7 +28,7 @@ inline T1 Ceil(const T1& x, const T2& y) return (x + y - 1) / y; } -template +template inline T1 AlignUp(const T1& x, const T2& y) { if (y == 0) { @@ -38,12 +37,12 @@ inline T1 AlignUp(const T1& x, const T2& y) return ((x + y - 1) / y) * y; } -template +template inline T1 Tail(const T1& x, const T2& 
y) { if (x == 0 || y == 0) { return 0; } - return (x - 1) % y + 1; + return (x - 1) % y + 1; } #endif // CSRC_UTILS_H_ \ No newline at end of file diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a3e5516a2089b1f447b2f32cefa4e3e96ddc96f --- /dev/null +++ b/kernels/CMakeLists.txt @@ -0,0 +1,173 @@ +file(GLOB ASCEND_HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/op_host/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/*.h) +file(GLOB ASCEND_KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/*.h) +if(BUILD_STAGE EQUAL 0) + add_library(ascend_all_ops SHARED ${ASCEND_HOST_SRC}) + target_compile_options(ascend_all_ops PRIVATE -g -fPIC -std=c++11 + -D_GLIBCXX_USE_CXX11_ABI=0) + target_include_directories(ascend_all_ops PRIVATE ${CANN_INCLUDE_PATH}) + target_link_libraries(ascend_all_ops PRIVATE intf_pub exe_graph register + tiling_api ascendcl) + add_custom_command( + TARGET ascend_all_ops + POST_BUILD + COMMAND ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + $ ${ASCEND_AUTOGEN_PATH}) +elseif(BUILD_STAGE EQUAL 1) + # ===================Build proto =================== + add_library(cust_op_proto SHARED ${ASCEND_AUTOGEN_PATH}/op_proto.cc) + target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) + target_compile_options(cust_op_proto PRIVATE -fvisibility=hidden) + target_link_libraries( + cust_op_proto + PRIVATE intf_pub + exe_graph + register + tiling_api + ascendcl + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive) + set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0) + install_target( + TRG cust_op_proto DST + packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR} + ) + install_file(TRG cust_op_proto SRC ${ASCEND_AUTOGEN_PATH}/op_proto.h DST + packages/vendors/${vendor_name}/op_proto/inc) + + add_library(cust_optiling SHARED ${ASCEND_HOST_SRC}) + target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) + target_compile_options(cust_optiling PRIVATE -fvisibility=hidden) + target_link_libraries( + cust_optiling + PRIVATE intf_pub + exe_graph + register + tiling_api + ascendcl + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive) + set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0) + install_target( + TRG + cust_optiling + DST + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR} + ) + # create liboptiling.so link + add_custom_command( + TARGET cust_optiling + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E chdir + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling + ${CMAKE_COMMAND} -E create_symlink + lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + liboptiling.so) + install( + FILES + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) + + # ===================Build ACLNN=================== + file(GLOB ACLNN_SRC ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) + file(GLOB ACLNN_INC ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) + add_library(cust_opapi SHARED ${ACLNN_SRC}) + target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase opapi) + install_target(TRG cust_opapi DST packages/vendors/${vendor_name}/op_api/lib) + install_file(TRG cust_opapi SRC ${ACLNN_INC} DST + packages/vendors/${vendor_name}/op_api/include) + if("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL 
OPTIONS -g -O0) + endif() + + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + if(EXISTS ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini) + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target( + TARGET + ops_info_gen_${compute_unit} + OUTPUT + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO + ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # generate ascendc impl py once + if(NOT TARGET ascendc_impl_gen) + add_ops_impl_target( + TARGET + ascendc_impl_gen + OPS_INFO + ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel + OUT_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl + ) + install_file( + TRG + ascendc_impl_gen + SRC + ${ASCEND_KERNEL_SRC} + DST + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) + endif() + + # dynamic shape binary compile + if(${ENABLE_BINARY_PACKAGE}) + add_bin_compile_target( + TARGET + ascendc_bin_${compute_unit} + OPS_INFO + ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel + ADP_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + OUT_DIR + ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + KERNEL_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + INSTALL_DIR + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + COMPUTE_UNIT + ${compute_unit}) + add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen + cust_optiling) + endif() + endif() + endforeach() + + # generate npu_supported_ops.json + add_npu_support_target( + TARGET + npu_supported_ops + OPS_INFO_DIR + ${ASCEND_AUTOGEN_PATH} + OUT_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_info_cfg/ai_core + INSTALL_DIR + packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE}) + + get_system_info(SYSTEM_INFO) + + # gen version.info + add_custom_target( + gen_version_info ALL + COMMAND + bash ${PROJECT_SOURCE_DIR}/cmake/util/gen_version_info.sh + ${ASCEND_CANN_PACKAGE_PATH} + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}) + + install(FILES ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/version.info + DESTINATION packages/vendors/${vendor_name}) +endif() diff --git a/mx_driving/common/ops/csrc/OWNERS b/kernels/op_host/OWNERS similarity index 100% rename from mx_driving/common/ops/csrc/OWNERS rename to kernels/op_host/OWNERS diff --git a/mx_driving/fused/ops/kernels/op_host/add_relu.cpp b/kernels/op_host/add_relu.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/add_relu.cpp rename to kernels/op_host/add_relu.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/add_relu_tiling.h b/kernels/op_host/add_relu_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/add_relu_tiling.h rename to kernels/op_host/add_relu_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/assign_score_withk.cpp b/kernels/op_host/assign_score_withk.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/assign_score_withk.cpp rename to kernels/op_host/assign_score_withk.cpp diff --git a/mx_driving/common/ops/kernels/op_host/assign_score_withk_tiling.h b/kernels/op_host/assign_score_withk_tiling.h similarity 
index 100% rename from mx_driving/common/ops/kernels/op_host/assign_score_withk_tiling.h rename to kernels/op_host/assign_score_withk_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/bev_pool.cpp b/kernels/op_host/bev_pool.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/bev_pool.cpp rename to kernels/op_host/bev_pool.cpp diff --git a/mx_driving/point/ops/kernels/op_host/bev_pool_tiling.h b/kernels/op_host/bev_pool_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/bev_pool_tiling.h rename to kernels/op_host/bev_pool_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/bev_pool_v3.cpp b/kernels/op_host/bev_pool_v3.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/bev_pool_v3.cpp rename to kernels/op_host/bev_pool_v3.cpp diff --git a/mx_driving/point/ops/kernels/op_host/bev_pool_v3_tiling.h b/kernels/op_host/bev_pool_v3_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/bev_pool_v3_tiling.h rename to kernels/op_host/bev_pool_v3_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/border_align.cpp b/kernels/op_host/border_align.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/border_align.cpp rename to kernels/op_host/border_align.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/border_align_grad.cpp b/kernels/op_host/border_align_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/border_align_grad.cpp rename to kernels/op_host/border_align_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/border_align_grad_tiling.h b/kernels/op_host/border_align_grad_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/border_align_grad_tiling.h rename to kernels/op_host/border_align_grad_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/border_align_tiling.h b/kernels/op_host/border_align_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/border_align_tiling.h rename to kernels/op_host/border_align_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/box_iou.cpp b/kernels/op_host/box_iou.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/box_iou.cpp rename to kernels/op_host/box_iou.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/box_iou_tiling.h b/kernels/op_host/box_iou_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/box_iou_tiling.h rename to kernels/op_host/box_iou_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/boxes_overlap_bev.cpp b/kernels/op_host/boxes_overlap_bev.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/boxes_overlap_bev.cpp rename to kernels/op_host/boxes_overlap_bev.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/boxes_overlap_bev_tiling.h b/kernels/op_host/boxes_overlap_bev_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/boxes_overlap_bev_tiling.h rename to kernels/op_host/boxes_overlap_bev_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/common.h b/kernels/op_host/common.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/common.h rename to kernels/op_host/common.h diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_aggregation.cpp b/kernels/op_host/deformable_aggregation.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_aggregation.cpp rename 
to kernels/op_host/deformable_aggregation.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_aggregation_grad.cpp b/kernels/op_host/deformable_aggregation_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_aggregation_grad.cpp rename to kernels/op_host/deformable_aggregation_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_aggregation_grad_tiling.h b/kernels/op_host/deformable_aggregation_grad_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_aggregation_grad_tiling.h rename to kernels/op_host/deformable_aggregation_grad_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_aggregation_tiling.h b/kernels/op_host/deformable_aggregation_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_aggregation_tiling.h rename to kernels/op_host/deformable_aggregation_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_conv2d.cpp b/kernels/op_host/deformable_conv2d.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_conv2d.cpp rename to kernels/op_host/deformable_conv2d.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_conv2d_grad.cpp b/kernels/op_host/deformable_conv2d_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_conv2d_grad.cpp rename to kernels/op_host/deformable_conv2d_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_conv2d_grad_tiling.h b/kernels/op_host/deformable_conv2d_grad_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_conv2d_grad_tiling.h rename to kernels/op_host/deformable_conv2d_grad_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/deformable_conv2d_tiling.h b/kernels/op_host/deformable_conv2d_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/deformable_conv2d_tiling.h rename to kernels/op_host/deformable_conv2d_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_scatter.cpp b/kernels/op_host/dynamic_scatter.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_scatter.cpp rename to kernels/op_host/dynamic_scatter.cpp diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_scatter_grad.cpp b/kernels/op_host/dynamic_scatter_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_scatter_grad.cpp rename to kernels/op_host/dynamic_scatter_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_scatter_grad_tiling.h b/kernels/op_host/dynamic_scatter_grad_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_scatter_grad_tiling.h rename to kernels/op_host/dynamic_scatter_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_scatter_tiling.h b/kernels/op_host/dynamic_scatter_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_scatter_tiling.h rename to kernels/op_host/dynamic_scatter_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_voxelization.cpp b/kernels/op_host/dynamic_voxelization.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/dynamic_voxelization.cpp rename to kernels/op_host/dynamic_voxelization.cpp diff --git a/mx_driving/point/ops/kernels/op_host/dynamic_voxelization_tiling.h b/kernels/op_host/dynamic_voxelization_tiling.h similarity index 100% rename from 
mx_driving/point/ops/kernels/op_host/dynamic_voxelization_tiling.h rename to kernels/op_host/dynamic_voxelization_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/furthest_point_sampling.cpp b/kernels/op_host/furthest_point_sampling.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/furthest_point_sampling.cpp rename to kernels/op_host/furthest_point_sampling.cpp diff --git a/mx_driving/point/ops/kernels/op_host/furthest_point_sampling_tiling.h b/kernels/op_host/furthest_point_sampling_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/furthest_point_sampling_tiling.h rename to kernels/op_host/furthest_point_sampling_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/furthest_point_sampling_with_dist.cpp b/kernels/op_host/furthest_point_sampling_with_dist.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/furthest_point_sampling_with_dist.cpp rename to kernels/op_host/furthest_point_sampling_with_dist.cpp diff --git a/mx_driving/point/ops/kernels/op_host/furthest_point_sampling_with_dist_tiling.h b/kernels/op_host/furthest_point_sampling_with_dist_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/furthest_point_sampling_with_dist_tiling.h rename to kernels/op_host/furthest_point_sampling_with_dist_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/fused_bias_leaky_relu_v2.cpp b/kernels/op_host/fused_bias_leaky_relu_v2.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/fused_bias_leaky_relu_v2.cpp rename to kernels/op_host/fused_bias_leaky_relu_v2.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/fused_bias_leaky_relu_v2_tiling.h b/kernels/op_host/fused_bias_leaky_relu_v2_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/fused_bias_leaky_relu_v2_tiling.h rename to kernels/op_host/fused_bias_leaky_relu_v2_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/gather_nms3d_mask_tiling.cpp b/kernels/op_host/gather_nms3d_mask_tiling.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/gather_nms3d_mask_tiling.cpp rename to kernels/op_host/gather_nms3d_mask_tiling.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/gather_nms3d_mask_tiling.h b/kernels/op_host/gather_nms3d_mask_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/gather_nms3d_mask_tiling.h rename to kernels/op_host/gather_nms3d_mask_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/geometric_kernel_attn_grad.cpp b/kernels/op_host/geometric_kernel_attn_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/geometric_kernel_attn_grad.cpp rename to kernels/op_host/geometric_kernel_attn_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/geometric_kernel_attn_grad_tiling.h b/kernels/op_host/geometric_kernel_attn_grad_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/geometric_kernel_attn_grad_tiling.h rename to kernels/op_host/geometric_kernel_attn_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/group_points.cpp b/kernels/op_host/group_points.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/group_points.cpp rename to kernels/op_host/group_points.cpp diff --git a/mx_driving/point/ops/kernels/op_host/group_points_grad.cpp b/kernels/op_host/group_points_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/group_points_grad.cpp rename to 
kernels/op_host/group_points_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_host/group_points_grad_tiling.h b/kernels/op_host/group_points_grad_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/group_points_grad_tiling.h rename to kernels/op_host/group_points_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/group_points_tiling.h b/kernels/op_host/group_points_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/group_points_tiling.h rename to kernels/op_host/group_points_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/hard_voxelize.cpp b/kernels/op_host/hard_voxelize.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/hard_voxelize.cpp rename to kernels/op_host/hard_voxelize.cpp diff --git a/mx_driving/point/ops/kernels/op_host/hard_voxelize_tiling.h b/kernels/op_host/hard_voxelize_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/hard_voxelize_tiling.h rename to kernels/op_host/hard_voxelize_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/hypot.cpp b/kernels/op_host/hypot.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/hypot.cpp rename to kernels/op_host/hypot.cpp diff --git a/mx_driving/common/ops/kernels/op_host/hypot_grad.cpp b/kernels/op_host/hypot_grad.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/hypot_grad.cpp rename to kernels/op_host/hypot_grad.cpp diff --git a/mx_driving/common/ops/kernels/op_host/hypot_tiling.h b/kernels/op_host/hypot_tiling.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/hypot_tiling.h rename to kernels/op_host/hypot_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/knn.cpp b/kernels/op_host/knn.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/knn.cpp rename to kernels/op_host/knn.cpp diff --git a/mx_driving/common/ops/kernels/op_host/knn_tiling.h b/kernels/op_host/knn_tiling.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/knn_tiling.h rename to kernels/op_host/knn_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_host/max_pool2d.cpp b/kernels/op_host/max_pool2d.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/max_pool2d.cpp rename to kernels/op_host/max_pool2d.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/max_pool2d.h b/kernels/op_host/max_pool2d.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/max_pool2d.h rename to kernels/op_host/max_pool2d.h diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn.cpp b/kernels/op_host/multi_scale_deformable_attn.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn.cpp rename to kernels/op_host/multi_scale_deformable_attn.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad.cpp b/kernels/op_host/multi_scale_deformable_attn_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad.cpp rename to kernels/op_host/multi_scale_deformable_attn_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_tiling.h b/kernels/op_host/multi_scale_deformable_attn_grad_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_tiling.h rename to kernels/op_host/multi_scale_deformable_attn_grad_tiling.h diff --git 
a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_tiling_v2.h b/kernels/op_host/multi_scale_deformable_attn_grad_tiling_v2.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_tiling_v2.h rename to kernels/op_host/multi_scale_deformable_attn_grad_tiling_v2.h diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_v2.cpp b/kernels/op_host/multi_scale_deformable_attn_grad_v2.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_grad_v2.cpp rename to kernels/op_host/multi_scale_deformable_attn_grad_v2.cpp diff --git a/mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_tiling.h b/kernels/op_host/multi_scale_deformable_attn_tiling.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_host/multi_scale_deformable_attn_tiling.h rename to kernels/op_host/multi_scale_deformable_attn_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/nms3d.cpp b/kernels/op_host/nms3d.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/nms3d.cpp rename to kernels/op_host/nms3d.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.cpp b/kernels/op_host/nms3d_normal_tiling.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.cpp rename to kernels/op_host/nms3d_normal_tiling.cpp diff --git a/mx_driving/common/ops/kernels/op_host/nms3d_normal_tiling.h b/kernels/op_host/nms3d_normal_tiling.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/nms3d_normal_tiling.h rename to kernels/op_host/nms3d_normal_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/nms3d_tiling.h b/kernels/op_host/nms3d_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/nms3d_tiling.h rename to kernels/op_host/nms3d_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/pixel_group.cpp b/kernels/op_host/pixel_group.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/pixel_group.cpp rename to kernels/op_host/pixel_group.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/pixel_group_tiling.h b/kernels/op_host/pixel_group_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/pixel_group_tiling.h rename to kernels/op_host/pixel_group_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/point_to_voxel.cpp b/kernels/op_host/point_to_voxel.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/point_to_voxel.cpp rename to kernels/op_host/point_to_voxel.cpp diff --git a/mx_driving/point/ops/kernels/op_host/point_to_voxel_tiling.h b/kernels/op_host/point_to_voxel_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/point_to_voxel_tiling.h rename to kernels/op_host/point_to_voxel_tiling.h diff --git a/mx_driving/preprocess/ops/kernels/op_host/points_in_box.cpp b/kernels/op_host/points_in_box.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/points_in_box.cpp rename to kernels/op_host/points_in_box.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_host/points_in_box_all.cpp b/kernels/op_host/points_in_box_all.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/points_in_box_all.cpp rename to kernels/op_host/points_in_box_all.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_host/points_in_box_all_tiling.h 
b/kernels/op_host/points_in_box_all_tiling.h similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/points_in_box_all_tiling.h rename to kernels/op_host/points_in_box_all_tiling.h diff --git a/mx_driving/preprocess/ops/kernels/op_host/points_in_box_tiling.h b/kernels/op_host/points_in_box_tiling.h similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/points_in_box_tiling.h rename to kernels/op_host/points_in_box_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roi_align_rotated_grad_v2.cpp b/kernels/op_host/roi_align_rotated_grad_v2.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roi_align_rotated_grad_v2.cpp rename to kernels/op_host/roi_align_rotated_grad_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roi_align_rotated_grad_v2_tiling.h b/kernels/op_host/roi_align_rotated_grad_v2_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roi_align_rotated_grad_v2_tiling.h rename to kernels/op_host/roi_align_rotated_grad_v2_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roi_align_rotated_v2.cpp b/kernels/op_host/roi_align_rotated_v2.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roi_align_rotated_v2.cpp rename to kernels/op_host/roi_align_rotated_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roi_align_rotated_v2_tiling.h b/kernels/op_host/roi_align_rotated_v2_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roi_align_rotated_v2_tiling.h rename to kernels/op_host/roi_align_rotated_v2_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_avgpool3d_grad.cpp b/kernels/op_host/roiaware_avgpool3d_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_avgpool3d_grad.cpp rename to kernels/op_host/roiaware_avgpool3d_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_avgpool3d_grad_tiling.h b/kernels/op_host/roiaware_avgpool3d_grad_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_avgpool3d_grad_tiling.h rename to kernels/op_host/roiaware_avgpool3d_grad_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_maxpool3d_grad.cpp b/kernels/op_host/roiaware_maxpool3d_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_maxpool3d_grad.cpp rename to kernels/op_host/roiaware_maxpool3d_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_maxpool3d_grad_tiling.h b/kernels/op_host/roiaware_maxpool3d_grad_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_maxpool3d_grad_tiling.h rename to kernels/op_host/roiaware_maxpool3d_grad_tiling.h diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_pool3d.cpp b/kernels/op_host/roiaware_pool3d.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_pool3d.cpp rename to kernels/op_host/roiaware_pool3d.cpp diff --git a/mx_driving/detection/ops/kernels/op_host/roiaware_pool3d_tiling.h b/kernels/op_host/roiaware_pool3d_tiling.h similarity index 100% rename from mx_driving/detection/ops/kernels/op_host/roiaware_pool3d_tiling.h rename to kernels/op_host/roiaware_pool3d_tiling.h diff --git a/mx_driving/preprocess/ops/kernels/op_host/roipoint_pool3d_forward.cpp b/kernels/op_host/roipoint_pool3d_forward.cpp similarity index 100% rename from 
mx_driving/preprocess/ops/kernels/op_host/roipoint_pool3d_forward.cpp rename to kernels/op_host/roipoint_pool3d_forward.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_host/roipoint_pool3d_forward_tiling.h b/kernels/op_host/roipoint_pool3d_forward_tiling.h similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_host/roipoint_pool3d_forward_tiling.h rename to kernels/op_host/roipoint_pool3d_forward_tiling.h diff --git a/mx_driving/common/ops/kernels/op_host/scatter_max_with_argmax_v2.cpp b/kernels/op_host/scatter_max_with_argmax_v2.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_max_with_argmax_v2.cpp rename to kernels/op_host/scatter_max_with_argmax_v2.cpp diff --git a/mx_driving/common/ops/kernels/op_host/scatter_max_with_argmax_v2.h b/kernels/op_host/scatter_max_with_argmax_v2.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_max_with_argmax_v2.h rename to kernels/op_host/scatter_max_with_argmax_v2.h diff --git a/mx_driving/common/ops/kernels/op_host/scatter_mean.cpp b/kernels/op_host/scatter_mean.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_mean.cpp rename to kernels/op_host/scatter_mean.cpp diff --git a/mx_driving/common/ops/kernels/op_host/scatter_mean.h b/kernels/op_host/scatter_mean.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_mean.h rename to kernels/op_host/scatter_mean.h diff --git a/mx_driving/common/ops/kernels/op_host/scatter_mean_grad.cpp b/kernels/op_host/scatter_mean_grad.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_mean_grad.cpp rename to kernels/op_host/scatter_mean_grad.cpp diff --git a/mx_driving/common/ops/kernels/op_host/scatter_mean_grad_tiling.h b/kernels/op_host/scatter_mean_grad_tiling.h similarity index 100% rename from mx_driving/common/ops/kernels/op_host/scatter_mean_grad_tiling.h rename to kernels/op_host/scatter_mean_grad_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_conv3d.cpp b/kernels/op_host/sparse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_conv3d.cpp rename to kernels/op_host/sparse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_grad_v2.cpp b/kernels/op_host/sparse_conv3d_grad_v2.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_grad_v2.cpp rename to kernels/op_host/sparse_conv3d_grad_v2.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_grad_v2_tiling.h b/kernels/op_host/sparse_conv3d_grad_v2_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_grad_v2_tiling.h rename to kernels/op_host/sparse_conv3d_grad_v2_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_tiling.h b/kernels/op_host/sparse_conv3d_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_conv3d_tiling.h rename to kernels/op_host/sparse_conv3d_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_inverse_conv3d.cpp b/kernels/op_host/sparse_inverse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/sparse_inverse_conv3d.cpp rename to kernels/op_host/sparse_inverse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/sparse_inverse_conv3d_tiling.h b/kernels/op_host/sparse_inverse_conv3d_tiling.h similarity index 100% rename from 
mx_driving/spconv/ops/kernels/op_host/sparse_inverse_conv3d_tiling.h rename to kernels/op_host/sparse_inverse_conv3d_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/subm_sparse_conv3d_tiling.cpp b/kernels/op_host/subm_sparse_conv3d_tiling.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/subm_sparse_conv3d_tiling.cpp rename to kernels/op_host/subm_sparse_conv3d_tiling.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/subm_sparse_conv3d_tiling.h b/kernels/op_host/subm_sparse_conv3d_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/subm_sparse_conv3d_tiling.h rename to kernels/op_host/subm_sparse_conv3d_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/to_sparse.cpp b/kernels/op_host/to_sparse.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/to_sparse.cpp rename to kernels/op_host/to_sparse.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/to_sparse_tiling.h b/kernels/op_host/to_sparse_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/to_sparse_tiling.h rename to kernels/op_host/to_sparse_tiling.h diff --git a/mx_driving/spconv/ops/kernels/op_host/to_sparse_v3.cpp b/kernels/op_host/to_sparse_v3.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/to_sparse_v3.cpp rename to kernels/op_host/to_sparse_v3.cpp diff --git a/mx_driving/spconv/ops/kernels/op_host/to_sparse_v3_tiling.h b/kernels/op_host/to_sparse_v3_tiling.h similarity index 100% rename from mx_driving/spconv/ops/kernels/op_host/to_sparse_v3_tiling.h rename to kernels/op_host/to_sparse_v3_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/unique_voxel.cpp b/kernels/op_host/unique_voxel.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/unique_voxel.cpp rename to kernels/op_host/unique_voxel.cpp diff --git a/mx_driving/point/ops/kernels/op_host/unique_voxel_tiling.h b/kernels/op_host/unique_voxel_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/unique_voxel_tiling.h rename to kernels/op_host/unique_voxel_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/vec_pool_grad.cpp b/kernels/op_host/vec_pool_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/vec_pool_grad.cpp rename to kernels/op_host/vec_pool_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_host/vec_pool_grad_tiling.h b/kernels/op_host/vec_pool_grad_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/vec_pool_grad_tiling.h rename to kernels/op_host/vec_pool_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/voxel_pooling_train.cpp b/kernels/op_host/voxel_pooling_train.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/voxel_pooling_train.cpp rename to kernels/op_host/voxel_pooling_train.cpp diff --git a/mx_driving/point/ops/kernels/op_host/voxel_pooling_train_grad.cpp b/kernels/op_host/voxel_pooling_train_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_host/voxel_pooling_train_grad.cpp rename to kernels/op_host/voxel_pooling_train_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_host/voxel_pooling_train_grad_tiling.h b/kernels/op_host/voxel_pooling_train_grad_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/voxel_pooling_train_grad_tiling.h rename to kernels/op_host/voxel_pooling_train_grad_tiling.h diff --git a/mx_driving/point/ops/kernels/op_host/voxel_pooling_train_tiling.h 
b/kernels/op_host/voxel_pooling_train_tiling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_host/voxel_pooling_train_tiling.h rename to kernels/op_host/voxel_pooling_train_tiling.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/add_relu.cpp b/kernels/op_kernel/add_relu.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/add_relu.cpp rename to kernels/op_kernel/add_relu.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/assign_score_withk.cpp b/kernels/op_kernel/assign_score_withk.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/assign_score_withk.cpp rename to kernels/op_kernel/assign_score_withk.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool.cpp b/kernels/op_kernel/bev_pool.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool.cpp rename to kernels/op_kernel/bev_pool.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool.h b/kernels/op_kernel/bev_pool.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool.h rename to kernels/op_kernel/bev_pool.h diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_grad.cpp b/kernels/op_kernel/bev_pool_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_grad.cpp rename to kernels/op_kernel/bev_pool_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v2.cpp b/kernels/op_kernel/bev_pool_v2.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v2.cpp rename to kernels/op_kernel/bev_pool_v2.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v2.h b/kernels/op_kernel/bev_pool_v2.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v2.h rename to kernels/op_kernel/bev_pool_v2.h diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v2_grad.cpp b/kernels/op_kernel/bev_pool_v2_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v2_grad.cpp rename to kernels/op_kernel/bev_pool_v2_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v3.cpp b/kernels/op_kernel/bev_pool_v3.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v3.cpp rename to kernels/op_kernel/bev_pool_v3.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/bev_pool_v3_grad.cpp b/kernels/op_kernel/bev_pool_v3_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/bev_pool_v3_grad.cpp rename to kernels/op_kernel/bev_pool_v3_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/border_align.cpp b/kernels/op_kernel/border_align.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/border_align.cpp rename to kernels/op_kernel/border_align.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/border_align_grad.cpp b/kernels/op_kernel/border_align_grad.cpp similarity index 96% rename from mx_driving/detection/ops/kernels/op_kernel/border_align_grad.cpp rename to kernels/op_kernel/border_align_grad.cpp index 5c05866288d97d4332c32a52c8225b3ce5f21c76..efb3a71adf8417bf869578767504ca1aa3952652 100644 --- a/mx_driving/detection/ops/kernels/op_kernel/border_align_grad.cpp +++ b/kernels/op_kernel/border_align_grad.cpp @@ -117,6 +117,8 @@ public: xStride = 0; yStride = -stride; break; + default: + break; } x = boxesLocal.GetValue((i / 2 * 2)); @@ -131,22 +133,26 @@ public: continue; } - if (y <= 0.0f) y = 0; - if (x <= 0.0f) x = 0; + if (y 
<= 0.0f) { + y = 0; + } + if (x <= 0.0f) { + x = 0; + } yLow = AscendC::ScalarCast(y); xLow = AscendC::ScalarCast(x); if (yLow >= height - 1) { yHigh = yLow = height - 1; - y = (float)yLow; + y = static_cast(yLow); } else { yHigh = yLow + 1; } if (xLow >= width - 1) { xHigh = xLow = width - 1; - x = (float)xLow; + x = static_cast(xLow); } else { xHigh = xLow + 1; } diff --git a/mx_driving/detection/ops/kernels/op_kernel/box_iou.cpp b/kernels/op_kernel/box_iou.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/box_iou.cpp rename to kernels/op_kernel/box_iou.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/boxes_overlap_bev.cpp b/kernels/op_kernel/boxes_overlap_bev.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/boxes_overlap_bev.cpp rename to kernels/op_kernel/boxes_overlap_bev.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/common.h b/kernels/op_kernel/common.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/common.h rename to kernels/op_kernel/common.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/deformable_aggregation.cpp b/kernels/op_kernel/deformable_aggregation.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/deformable_aggregation.cpp rename to kernels/op_kernel/deformable_aggregation.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/deformable_aggregation_grad.cpp b/kernels/op_kernel/deformable_aggregation_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/deformable_aggregation_grad.cpp rename to kernels/op_kernel/deformable_aggregation_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/deformable_conv2d.cpp b/kernels/op_kernel/deformable_conv2d.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/deformable_conv2d.cpp rename to kernels/op_kernel/deformable_conv2d.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/deformable_conv2d_grad.cpp b/kernels/op_kernel/deformable_conv2d_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/deformable_conv2d_grad.cpp rename to kernels/op_kernel/deformable_conv2d_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter.cpp b/kernels/op_kernel/dynamic_scatter.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter.cpp rename to kernels/op_kernel/dynamic_scatter.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_base.h b/kernels/op_kernel/dynamic_scatter_base.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_base.h rename to kernels/op_kernel/dynamic_scatter_base.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad.cpp b/kernels/op_kernel/dynamic_scatter_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad.cpp rename to kernels/op_kernel/dynamic_scatter_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_base.h b/kernels/op_kernel/dynamic_scatter_grad_base.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_base.h rename to kernels/op_kernel/dynamic_scatter_grad_base.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_max.h b/kernels/op_kernel/dynamic_scatter_grad_max.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_max.h rename to 
kernels/op_kernel/dynamic_scatter_grad_max.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_mean.h b/kernels/op_kernel/dynamic_scatter_grad_mean.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_mean.h rename to kernels/op_kernel/dynamic_scatter_grad_mean.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_sum.h b/kernels/op_kernel/dynamic_scatter_grad_sum.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_grad_sum.h rename to kernels/op_kernel/dynamic_scatter_grad_sum.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_max.h b/kernels/op_kernel/dynamic_scatter_max.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_max.h rename to kernels/op_kernel/dynamic_scatter_max.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_mean.h b/kernels/op_kernel/dynamic_scatter_mean.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_mean.h rename to kernels/op_kernel/dynamic_scatter_mean.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_sum.h b/kernels/op_kernel/dynamic_scatter_sum.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_scatter_sum.h rename to kernels/op_kernel/dynamic_scatter_sum.h diff --git a/mx_driving/point/ops/kernels/op_kernel/dynamic_voxelization.cpp b/kernels/op_kernel/dynamic_voxelization.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/dynamic_voxelization.cpp rename to kernels/op_kernel/dynamic_voxelization.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling.cpp b/kernels/op_kernel/furthest_point_sampling.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling.cpp rename to kernels/op_kernel/furthest_point_sampling.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling.h b/kernels/op_kernel/furthest_point_sampling.h similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling.h rename to kernels/op_kernel/furthest_point_sampling.h diff --git a/mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling_with_dist.cpp b/kernels/op_kernel/furthest_point_sampling_with_dist.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/furthest_point_sampling_with_dist.cpp rename to kernels/op_kernel/furthest_point_sampling_with_dist.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/fused_bias_leaky_relu_v2.cpp b/kernels/op_kernel/fused_bias_leaky_relu_v2.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/fused_bias_leaky_relu_v2.cpp rename to kernels/op_kernel/fused_bias_leaky_relu_v2.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/fused_bias_leaky_relu_v2.h b/kernels/op_kernel/fused_bias_leaky_relu_v2.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/fused_bias_leaky_relu_v2.h rename to kernels/op_kernel/fused_bias_leaky_relu_v2.h diff --git a/mx_driving/detection/ops/kernels/op_kernel/gather_nms3d_mask.cpp b/kernels/op_kernel/gather_nms3d_mask.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/gather_nms3d_mask.cpp rename to kernels/op_kernel/gather_nms3d_mask.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/geometric_kernel_attn_grad.cpp b/kernels/op_kernel/geometric_kernel_attn_grad.cpp similarity index 100% 
rename from mx_driving/fused/ops/kernels/op_kernel/geometric_kernel_attn_grad.cpp rename to kernels/op_kernel/geometric_kernel_attn_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/group_points.cpp b/kernels/op_kernel/group_points.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/group_points.cpp rename to kernels/op_kernel/group_points.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/group_points_grad.cpp b/kernels/op_kernel/group_points_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/group_points_grad.cpp rename to kernels/op_kernel/group_points_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/hard_voxelize.cpp b/kernels/op_kernel/hard_voxelize.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/hard_voxelize.cpp rename to kernels/op_kernel/hard_voxelize.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/hypot.cpp b/kernels/op_kernel/hypot.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/hypot.cpp rename to kernels/op_kernel/hypot.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/hypot_grad.cpp b/kernels/op_kernel/hypot_grad.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/hypot_grad.cpp rename to kernels/op_kernel/hypot_grad.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/knn.cpp b/kernels/op_kernel/knn.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/knn.cpp rename to kernels/op_kernel/knn.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/knn.h b/kernels/op_kernel/knn.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/knn.h rename to kernels/op_kernel/knn.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/max_pool2d.cpp b/kernels/op_kernel/max_pool2d.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/max_pool2d.cpp rename to kernels/op_kernel/max_pool2d.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_generic.h b/kernels/op_kernel/ms_deform_attn_generic.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_generic.h rename to kernels/op_kernel/ms_deform_attn_generic.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_generic.h b/kernels/op_kernel/ms_deform_attn_grad_generic.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_generic.h rename to kernels/op_kernel/ms_deform_attn_grad_generic.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_generic_v2.h b/kernels/op_kernel/ms_deform_attn_grad_generic_v2.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_generic_v2.h rename to kernels/op_kernel/ms_deform_attn_grad_generic_v2.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf.h b/kernels/op_kernel/ms_deform_attn_grad_high_perf.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf.h rename to kernels/op_kernel/ms_deform_attn_grad_high_perf.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf_v2.h b/kernels/op_kernel/ms_deform_attn_grad_high_perf_v2.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_grad_high_perf_v2.h rename to kernels/op_kernel/ms_deform_attn_grad_high_perf_v2.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_high_perf.h 
b/kernels/op_kernel/ms_deform_attn_high_perf.h similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/ms_deform_attn_high_perf.h rename to kernels/op_kernel/ms_deform_attn_high_perf.h diff --git a/mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn.cpp b/kernels/op_kernel/multi_scale_deformable_attn.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn.cpp rename to kernels/op_kernel/multi_scale_deformable_attn.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn_grad.cpp b/kernels/op_kernel/multi_scale_deformable_attn_grad.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn_grad.cpp rename to kernels/op_kernel/multi_scale_deformable_attn_grad.cpp diff --git a/mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn_grad_v2.cpp b/kernels/op_kernel/multi_scale_deformable_attn_grad_v2.cpp similarity index 100% rename from mx_driving/fused/ops/kernels/op_kernel/multi_scale_deformable_attn_grad_v2.cpp rename to kernels/op_kernel/multi_scale_deformable_attn_grad_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/nms3d.cpp b/kernels/op_kernel/nms3d.cpp similarity index 72% rename from mx_driving/detection/ops/kernels/op_kernel/nms3d.cpp rename to kernels/op_kernel/nms3d.cpp index 224a56e1c48e1a576865266ca819c9a16e827ffb..ac47bd22701f8cdf5c98802e57dfa27684357409 100644 --- a/mx_driving/detection/ops/kernels/op_kernel/nms3d.cpp +++ b/kernels/op_kernel/nms3d.cpp @@ -9,8 +9,7 @@ #define M_PI 3.14159265358979323846 /* pi */ using namespace AscendC; -constexpr int32_t -BUFFER_NUM = 2; +constexpr int32_t BUFFER_NUM = 2; constexpr float EPS = 1e-8; constexpr float ATAN2_DEFAULT_VALUE = 1000.0; @@ -32,16 +31,12 @@ struct Point { y = _y; } - __aicore__ Point - - operator+(const Point &b) const + __aicore__ Point operator+(const Point& b) const { return Point(x + b.x, y + b.y); } - __aicore__ Point - - operator-(const Point &b) const + __aicore__ Point operator-(const Point& b) const { return Point(x - b.x, y - b.y); } @@ -52,8 +47,7 @@ class KernelNms3d { public: __aicore__ inline KernelNms3d() {} - __aicore__ inline void Init(GM_ADDR boxes, GM_ADDR mask, - const Nms3dTilingData *__restrict tiling_data) + __aicore__ inline void Init(GM_ADDR boxes, GM_ADDR mask, const Nms3dTilingData* __restrict tiling_data) { ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); usedCoreNum = tiling_data->usedCoreNum; @@ -68,9 +62,8 @@ public: uint32_t core_id = GetBlockIdx(); isLastCore = (core_id == (tiling_data->usedCoreNum - 1)); - boxGm.SetGlobalBuffer(reinterpret_cast<__gm__ T * > (boxes), boxNum * 7); - maskGm.SetGlobalBuffer(reinterpret_cast<__gm__ int16_t * > (mask), - maskNum * boxNum); + boxGm.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(boxes), boxNum * 7); + maskGm.SetGlobalBuffer(reinterpret_cast<__gm__ int16_t*>(mask), maskNum * boxNum); pipe.InitBuffer(inQueueCur, BUFFER_NUM, dataAlign * sizeof(T)); pipe.InitBuffer(inQueueBox, BUFFER_NUM, dataAlign * 7 * sizeof(T)); @@ -90,7 +83,7 @@ public: pipe.InitBuffer(min2Buf, dataAlign * sizeof(T)); pipe.InitBuffer(max1Buf, dataAlign * sizeof(T)); pipe.InitBuffer(max2Buf, dataAlign * sizeof(T)); - if constexpr(sizeof(T) == sizeof(half)) { + if constexpr (sizeof(T) == sizeof(half)) { pipe.InitBuffer(calcBuf, dataAlign * 2 * 7 * sizeof(float)); curTemp = calcBuf.Get(dataAlign * 2 * 7); boxTemp = curTemp[8]; @@ -100,7 +93,7 @@ public: __aicore__ inline void Process() { uint32_t 
core_id = GetBlockIdx(); - LocalTensor oneLocal = oneMask.AllocTensor(); + LocalTensor oneLocal = oneMask.AllocTensor(); Duplicate(oneLocal, static_cast(1), dataAlign); for (size_t i = 0; i < boxNum; ++i) { for (size_t j = 0; j < loopTime; ++j) { @@ -119,24 +112,22 @@ public: } private: - __aicore__ inline void CopyIn(int32_t cur_box, int32_t com_box, - bool is_last) + __aicore__ inline void CopyIn(int32_t cur_box, int32_t com_box, bool is_last) { - LocalTensor curLocal = inQueueCur.AllocTensor(); - LocalTensor boxLocal = inQueueBox.AllocTensor(); + LocalTensor curLocal = inQueueCur.AllocTensor(); + LocalTensor boxLocal = inQueueBox.AllocTensor(); DataCopy(curLocal, boxGm[cur_box * 7], dataAlign); DataCopy(boxLocal, boxGm[com_box * 7], dataAlign * 7); inQueueCur.EnQue(curLocal); inQueueBox.EnQue(boxLocal); } - __aicore__ inline void Compute(int32_t cur_box, int32_t com_box, - bool is_last) + __aicore__ inline void Compute(int32_t cur_box, int32_t com_box, bool is_last) { uint32_t cmpNum = is_last ? tailNum : dataAlign; - if constexpr(sizeof(T) == sizeof(half)) { - LocalTensor curLocal = inQueueCur.DeQue(); - LocalTensor boxLocal = inQueueBox.DeQue(); + if constexpr (sizeof(T) == sizeof(half)) { + LocalTensor curLocal = inQueueCur.DeQue(); + LocalTensor boxLocal = inQueueBox.DeQue(); Cast(curTemp, curLocal, RoundMode::CAST_NONE, dataAlign); Cast(boxTemp, boxLocal, RoundMode::CAST_NONE, 7 * dataAlign); inQueueCur.FreeTensor(curLocal); @@ -147,7 +138,7 @@ private: } PipeBarrier(); - LocalTensor outLocal = outQueueMask.AllocTensor(); + LocalTensor outLocal = outQueueMask.AllocTensor(); for (size_t i = 0; i < cmpNum; i++) { if (cur_box >= com_box + i) { outLocal.SetValue(i, 1); @@ -166,7 +157,7 @@ private: } PipeBarrier(); outQueueMask.EnQue(outLocal); - if constexpr(sizeof(T) != sizeof(half)) { + if constexpr (sizeof(T) != sizeof(half)) { inQueueCur.FreeTensor(curTemp); inQueueBox.FreeTensor(boxTemp); } @@ -174,35 +165,30 @@ private: __aicore__ inline void CopyOut(int32_t cur_box, int32_t com_box) { - LocalTensor outLocal = outQueueMask.DeQue(); + LocalTensor outLocal = outQueueMask.DeQue(); DataCopy(maskGm[cur_box * maskNum + com_box], outLocal, dataAlign); outQueueMask.FreeTensor(outLocal); } private: - __aicore__ inline float cross(const Point &a, const Point &b) + __aicore__ inline float cross(const Point& a, const Point& b) { return a.x * b.y - a.y * b.x; } - __aicore__ inline float cross(const Point &p1, const Point &p2, - const Point &p0) + __aicore__ inline float cross(const Point& p1, const Point& p2, const Point& p0) { return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); } - __aicore__ int check_rect_cross(const Point &p1, const Point &p2, - const Point &q1, const Point &q2) + __aicore__ int check_rect_cross(const Point& p1, const Point& p2, const Point& q1, const Point& q2) { - int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && - min(q1.x, q2.x) <= max(p1.x, p2.x) && - min(p1.y, p2.y) <= max(q1.y, q2.y) && - min(q1.y, q2.y) <= max(p1.y, p2.y); + int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && min(q1.x, q2.x) <= max(p1.x, p2.x) && + min(p1.y, p2.y) <= max(q1.y, q2.y) && min(q1.y, q2.y) <= max(p1.y, p2.y); return ret; } - __aicore__ inline int check_in_box2d(const LocalTensor &box, - const Point &p) + __aicore__ inline int check_in_box2d(const LocalTensor& box, const Point& p) { const float MARGIN = 1e-2; float center_x = box.GetValue(0); @@ -215,17 +201,14 @@ private: Cos(cosLocal, angleLocal); float angle_cos = cosLocal.GetValue(0); float angle_sin = 
sinLocal.GetValue(0); - float rot_x = - (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; - return (abs(rot_x) < box.GetValue(3) / 2 + MARGIN && - abs(rot_y) < box.GetValue(4) / 2 + MARGIN); + return (abs(rot_x) < box.GetValue(3) / 2 + MARGIN && abs(rot_y) < box.GetValue(4) / 2 + MARGIN); } - __aicore__ inline int intersection(const Point &p1, const Point &p0, - const Point &q1, const Point &q0, - Point &ans_point) + __aicore__ inline int intersection( + const Point& p1, const Point& p0, const Point& q1, const Point& q0, Point& ans_point) { if (check_rect_cross(p0, p1, q0, q1) == 0) { return 0; @@ -234,8 +217,7 @@ private: float s2 = cross(p1, q1, p0); float s3 = cross(p0, q1, q0); float s4 = cross(q1, p1, q0); - if (!(s1 * s2 > static_cast(0.0) && - s3 * s4 > static_cast(0.0))) { + if (!(s1 * s2 > static_cast(0.0) && s3 * s4 > static_cast(0.0))) { return 0; } float s5 = cross(q1, p1, p0); @@ -259,22 +241,17 @@ private: return 1; } - __aicore__ inline void rotate_around_center(const Point ¢er, - const float angle_cos, - const float angle_sin, Point &p) + __aicore__ inline void rotate_around_center( + const Point& center, const float angle_cos, const float angle_sin, Point& p) { - float new_x = - (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; - float new_y = - (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + float new_x = (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; + float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; p.set(new_x, new_y); } - __aicore__ inline int point_cmp(const Point &a, const Point &b, - const Point ¢er) + __aicore__ inline int point_cmp(const Point& a, const Point& b, const Point& center) { - return math_atan2(a.y - center.y, a.x - center.x) > - math_atan2(b.y - center.y, b.x - center.x); + return math_atan2(a.y - center.y, a.x - center.x) > math_atan2(b.y - center.y, b.x - center.x); } __aicore__ inline float math_atan2(float a, float b) @@ -305,8 +282,7 @@ private: return atanLocal.GetValue(0); } - __aicore__ inline float box_overlap(const LocalTensor &boxATensor, - const LocalTensor &boxBTensor) + __aicore__ inline float box_overlap(const LocalTensor& boxATensor, const LocalTensor& boxBTensor) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] @@ -329,16 +305,8 @@ private: Point center_a(boxATensor.GetValue(0), boxATensor.GetValue(1)); Point center_b(boxBTensor.GetValue(0), boxBTensor.GetValue(1)); - Point box_a_corners[5] = {{a_x1, a_y1}, - {a_x2, a_y1}, - {a_x2, a_y2}, - {a_x1, a_y2}, - {a_x1, a_y1}}; - Point box_b_corners[5] = {{b_x1, b_y1}, - {b_x2, b_y1}, - {b_x2, b_y2}, - {b_x1, b_y2}, - {b_x1, b_y1}}; + Point box_a_corners[5] = {{a_x1, a_y1}, {a_x2, a_y1}, {a_x2, a_y2}, {a_x1, a_y2}, {a_x1, a_y1}}; + Point box_b_corners[5] = {{b_x1, b_y1}, {b_x2, b_y1}, {b_x2, b_y2}, {b_x1, b_y2}, {b_x1, b_y1}}; // get oriented corners LocalTensor angleLocal = angleBuf.Get(); @@ -354,10 +322,8 @@ private: float b_angle_sin = sinLocal.GetValue(1); for (int k = 0; k < 4; k++) { - rotate_around_center(center_a, a_angle_cos, a_angle_sin, - box_a_corners[k]); - rotate_around_center(center_b, b_angle_cos, b_angle_sin, - box_b_corners[k]); + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, 
b_angle_sin, box_b_corners[k]); } box_a_corners[4] = box_a_corners[0]; @@ -372,9 +338,8 @@ private: poly_center.set(0, 0); for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { - flag = intersection(box_a_corners[i + 1], box_a_corners[i], - box_b_corners[j + 1], box_b_corners[j], - cross_points[count]); + flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], + cross_points[count]); if (flag) { poly_center = poly_center + cross_points[count]; count++; @@ -412,15 +377,13 @@ private: float cross_area = 0; for (int k = 0; k < count - 1; k++) { - cross_area += cross(cross_points[k] - cross_points[0], - cross_points[k + 1] - cross_points[0]); + cross_area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); } return abs(cross_area) / static_cast(2.0); } - __aicore__ inline float iou_bev(const LocalTensor &boxATensor, - const LocalTensor &boxBTensor) + __aicore__ inline float iou_bev(const LocalTensor& boxATensor, const LocalTensor& boxBTensor) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] @@ -432,17 +395,17 @@ private: private: TPipe pipe; - TQue inQueueCur, inQueueBox; - TQue outQueueMask, oneMask; - TBuf calcBuf; - TBuf comBuf; + TQue inQueueCur, inQueueBox; + TQue outQueueMask, oneMask; + TBuf calcBuf; + TBuf comBuf; - TBuf p1Buf, p2Buf, q1Buf, q2Buf; - TBuf angleBuf, sinBuf, cosBuf, pointBuf; - TBuf min1Buf, min2Buf, max1Buf, max2Buf; + TBuf p1Buf, p2Buf, q1Buf, q2Buf; + TBuf angleBuf, sinBuf, cosBuf, pointBuf; + TBuf min1Buf, min2Buf, max1Buf, max2Buf; - GlobalTensor boxGm; - GlobalTensor maskGm; + GlobalTensor boxGm; + GlobalTensor maskGm; LocalTensor curTemp, boxTemp; uint32_t usedCoreNum; uint32_t loopTime; @@ -456,13 +419,10 @@ private: bool isLastCore; }; -extern "C" __global__ __aicore__ - -void nms3d(GM_ADDR boxes, GM_ADDR mask, - GM_ADDR workspace, GM_ADDR tiling) +extern "C" __global__ __aicore__ void nms3d(GM_ADDR boxes, GM_ADDR mask, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); - const Nms3dTilingData *__restrict tilingDevice = &tilingData; + const Nms3dTilingData* __restrict tilingDevice = &tilingData; if (TILING_KEY_IS(1)) { KernelNms3d op; op.Init(boxes, mask, tilingDevice); diff --git a/mx_driving/detection/ops/kernels/op_kernel/nms3d_normal.cpp b/kernels/op_kernel/nms3d_normal.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/nms3d_normal.cpp rename to kernels/op_kernel/nms3d_normal.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/pixel_group.cpp b/kernels/op_kernel/pixel_group.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/pixel_group.cpp rename to kernels/op_kernel/pixel_group.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/point_to_voxel.cpp b/kernels/op_kernel/point_to_voxel.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/point_to_voxel.cpp rename to kernels/op_kernel/point_to_voxel.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_kernel/points_in_box.cpp b/kernels/op_kernel/points_in_box.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_kernel/points_in_box.cpp rename to kernels/op_kernel/points_in_box.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_kernel/points_in_box_all.cpp b/kernels/op_kernel/points_in_box_all.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_kernel/points_in_box_all.cpp rename to 
kernels/op_kernel/points_in_box_all.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roi_align_rotated_grad_v2.cpp b/kernels/op_kernel/roi_align_rotated_grad_v2.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roi_align_rotated_grad_v2.cpp rename to kernels/op_kernel/roi_align_rotated_grad_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roi_align_rotated_v2.cpp b/kernels/op_kernel/roi_align_rotated_v2.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roi_align_rotated_v2.cpp rename to kernels/op_kernel/roi_align_rotated_v2.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roiaware_avgpool3d_grad.cpp b/kernels/op_kernel/roiaware_avgpool3d_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roiaware_avgpool3d_grad.cpp rename to kernels/op_kernel/roiaware_avgpool3d_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roiaware_maxpool3d_grad.cpp b/kernels/op_kernel/roiaware_maxpool3d_grad.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roiaware_maxpool3d_grad.cpp rename to kernels/op_kernel/roiaware_maxpool3d_grad.cpp diff --git a/mx_driving/detection/ops/kernels/op_kernel/roiaware_pool3d.cpp b/kernels/op_kernel/roiaware_pool3d.cpp similarity index 100% rename from mx_driving/detection/ops/kernels/op_kernel/roiaware_pool3d.cpp rename to kernels/op_kernel/roiaware_pool3d.cpp diff --git a/mx_driving/preprocess/ops/kernels/op_kernel/roipoint_pool3d_forward.cpp b/kernels/op_kernel/roipoint_pool3d_forward.cpp similarity index 100% rename from mx_driving/preprocess/ops/kernels/op_kernel/roipoint_pool3d_forward.cpp rename to kernels/op_kernel/roipoint_pool3d_forward.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_max_with_argmax_v2.cpp b/kernels/op_kernel/scatter_max_with_argmax_v2.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_max_with_argmax_v2.cpp rename to kernels/op_kernel/scatter_max_with_argmax_v2.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean.cpp b/kernels/op_kernel/scatter_mean.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean.cpp rename to kernels/op_kernel/scatter_mean.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_div.cpp b/kernels/op_kernel/scatter_mean_div.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_div.cpp rename to kernels/op_kernel/scatter_mean_div.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad.cpp b/kernels/op_kernel/scatter_mean_grad.cpp similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad.cpp rename to kernels/op_kernel/scatter_mean_grad.cpp diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad.h b/kernels/op_kernel/scatter_mean_grad.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad.h rename to kernels/op_kernel/scatter_mean_grad.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_base.h b/kernels/op_kernel/scatter_mean_grad_base.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_base.h rename to kernels/op_kernel/scatter_mean_grad_base.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_large.h b/kernels/op_kernel/scatter_mean_grad_large.h similarity index 100% rename from 
mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_large.h rename to kernels/op_kernel/scatter_mean_grad_large.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_line.h b/kernels/op_kernel/scatter_mean_grad_line.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_grad_line.h rename to kernels/op_kernel/scatter_mean_grad_line.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_normal.h b/kernels/op_kernel/scatter_mean_normal.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_normal.h rename to kernels/op_kernel/scatter_mean_normal.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_notail.h b/kernels/op_kernel/scatter_mean_notail.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_notail.h rename to kernels/op_kernel/scatter_mean_notail.h diff --git a/mx_driving/common/ops/kernels/op_kernel/scatter_mean_notail_bighead.h b/kernels/op_kernel/scatter_mean_notail_bighead.h similarity index 100% rename from mx_driving/common/ops/kernels/op_kernel/scatter_mean_notail_bighead.h rename to kernels/op_kernel/scatter_mean_notail_bighead.h diff --git a/mx_driving/spconv/ops/kernels/op_kernel/sparse_conv3d.cpp b/kernels/op_kernel/sparse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/sparse_conv3d.cpp rename to kernels/op_kernel/sparse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/sparse_conv3d_grad_v2.cpp b/kernels/op_kernel/sparse_conv3d_grad_v2.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/sparse_conv3d_grad_v2.cpp rename to kernels/op_kernel/sparse_conv3d_grad_v2.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/sparse_inverse_conv3d.cpp b/kernels/op_kernel/sparse_inverse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/sparse_inverse_conv3d.cpp rename to kernels/op_kernel/sparse_inverse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/subm_sparse_conv3d.cpp b/kernels/op_kernel/subm_sparse_conv3d.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/subm_sparse_conv3d.cpp rename to kernels/op_kernel/subm_sparse_conv3d.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/to_sparse.cpp b/kernels/op_kernel/to_sparse.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/to_sparse.cpp rename to kernels/op_kernel/to_sparse.cpp diff --git a/mx_driving/spconv/ops/kernels/op_kernel/to_sparse_v3.cpp b/kernels/op_kernel/to_sparse_v3.cpp similarity index 100% rename from mx_driving/spconv/ops/kernels/op_kernel/to_sparse_v3.cpp rename to kernels/op_kernel/to_sparse_v3.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/unique_voxel.cpp b/kernels/op_kernel/unique_voxel.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/unique_voxel.cpp rename to kernels/op_kernel/unique_voxel.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/vec_pool_grad.cpp b/kernels/op_kernel/vec_pool_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/vec_pool_grad.cpp rename to kernels/op_kernel/vec_pool_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/voxel_pooling_train.cpp b/kernels/op_kernel/voxel_pooling_train.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/voxel_pooling_train.cpp rename to kernels/op_kernel/voxel_pooling_train.cpp diff --git 
a/mx_driving/point/ops/kernels/op_kernel/voxel_pooling_train_grad.cpp b/kernels/op_kernel/voxel_pooling_train_grad.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/voxel_pooling_train_grad.cpp rename to kernels/op_kernel/voxel_pooling_train_grad.cpp diff --git a/mx_driving/point/ops/kernels/op_kernel/voxel_to_point.cpp b/kernels/op_kernel/voxel_to_point.cpp similarity index 100% rename from mx_driving/point/ops/kernels/op_kernel/voxel_to_point.cpp rename to kernels/op_kernel/voxel_to_point.cpp diff --git a/mx_driving/_C/__init__.pyi b/mx_driving/_C/__init__.pyi index 54fb2639b54657363024bb25f5bbf78cabdd033d..9a811c3a1ea61a2c21d3c43b5f5a8534ab8a61d1 100644 --- a/mx_driving/_C/__init__.pyi +++ b/mx_driving/_C/__init__.pyi @@ -76,7 +76,7 @@ def deformable_aggregation( sampling_location: torch.Tensor, weights: torch.Tensor, ) -> torch.Tensor: ... -def deformable_aggregation_grad( +def deformable_aggregation_backward( mc_ms_feat: torch.Tensor, spatial_shape: torch.Tensor, scale_start_index: torch.Tensor, @@ -234,7 +234,7 @@ def npu_box_iou_quadri(boxes_a: torch.Tensor, boxes_b: torch.Tensor, mode_flag: def npu_box_iou_rotated( boxes_a: torch.Tensor, boxes_b: torch.Tensor, mode_flag: int, aligned: bool ) -> torch.Tensor: ... -def border_align_forward_npu( +def border_align( input: torch.Tensor, rois: torch.Tensor, output: torch.Tensor, pooled_size: int ) -> None: ... def border_align_backward( @@ -417,7 +417,7 @@ __all__ = [ "npu_add_relu_grad", "fused_bias_leaky_relu", "deformable_aggregation", - "deformable_aggregation_grad", + "deformable_aggregation_backward", "deformable_conv2d", "modulated_deformable_conv2d", "deformable_conv2d_backward", diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py index 3ab0077d7278f0bcd0909676de66fa07328ac347..566532626d85b1a624a831aeb16fe9a7d5782596 100644 --- a/mx_driving/__init__.py +++ b/mx_driving/__init__.py @@ -1,9 +1,102 @@ import os -import torch -import torch_npu import mx_driving._C +from .modules.roi_point_pool_3d import RoIPointPool3d +from .modules.sparse_conv import SparseConv3d, SparseInverseConv3d, SubMConv3d +from .modules.sparse_modules import SparseConvTensor, SparseModule, SparseSequential +from .modules.voxelization import Voxelization +from .ops.assign_score_withk import assign_score_withk +from .ops.bev_pool import bev_pool +from .ops.bev_pool_v2 import bev_pool_v2 +from .ops.bev_pool_v3 import bev_pool_v3 +from .ops.border_align import border_align +from .ops.box_iou import box_iou_quadri +from .ops.boxes_overlap_bev import boxes_overlap_bev, npu_boxes_overlap_bev +from .ops.deform_conv2d import DeformConv2dFunction, deform_conv2d +from .ops.furthest_point_sampling import npu_furthest_point_sampling +from .ops.furthest_point_sampling_with_dist import furthest_point_sample_with_dist +from .ops.fused_bias_leaky_relu import npu_fused_bias_leaky_relu +from .ops.group_points import group_points, npu_group_points +from .ops.hypot import hypot +from .ops.knn import knn +from .ops.modulated_deform_conv2d import ModulatedDeformConv2dFunction, modulated_deform_conv2d +from .ops.multi_scale_deformable_attn import ( + MultiScaleDeformableAttnFunction, + multi_scale_deformable_attn, + npu_multi_scale_deformable_attn_function, +) +from .ops.nms3d_normal import npu_nms3d_normal +from .ops.npu_add_relu import npu_add_relu +from .ops.npu_deformable_aggregation import npu_deformable_aggregation +from .ops.npu_dynamic_scatter import npu_dynamic_scatter +from .ops.npu_max_pool2d import npu_max_pool2d +from .ops.npu_nms3d 
import npu_nms3d +from .ops.npu_points_in_box import npu_points_in_box +from .ops.npu_points_in_box_all import npu_points_in_box_all, points_in_boxes_all +from .ops.pixel_group import pixel_group +from .ops.roi_align_rotated import roi_align_rotated +from .ops.roiaware_pool3d import roiaware_pool3d +from .ops.rotated_iou import npu_rotated_iou +from .ops.rotated_overlaps import npu_rotated_overlaps +from .ops.scatter_max import scatter_max +from .ops.scatter_mean import scatter_mean +from .ops.three_interpolate import three_interpolate +from .ops.three_nn import three_nn +from .ops.voxel_pooling_train import npu_voxel_pooling_train +from .ops.voxelization import voxelization +from .ops.npu_geometric_kernel_attention import npu_geometric_kernel_attention + +__all__ = [ + "RoIPointPool3d", + "SparseConv3d", + "SparseInverseConv3d", + "SubMConv3d", + "SparseConvTensor", + "SparseModule", + "SparseSequential", + "Voxelization", + "assign_score_withk", + "bev_pool", + "bev_pool_v2", + "bev_pool_v3", + "border_align", + "box_iou_quadri", + "boxes_overlap_bev", + "npu_boxes_overlap_bev", + "deform_conv2d", + "furthest_point_sample_with_dist", + "furthest_point_sample_with_dist", + "npu_fused_bias_leaky_relu", + "group_points", + "npu_group_points", + "hypot", + "knn", + "modulated_deform_conv2d", + "multi_scale_deformable_attn", + "npu_multi_scale_deformable_attn_function", + "npu_nms3d_normal", + "npu_add_relu", + "npu_deformable_aggregation", + "npu_dynamic_scatter", + "npu_max_pool2d", + "npu_nms3d", + "npu_points_in_box", + "npu_points_in_box_all", + "points_in_boxes_all", + "pixel_group", + "roi_align_rotated", + "roiaware_pool3d", + "npu_rotated_iou", + "npu_rotated_overlaps", + "scatter_max", + "scatter_mean", + "three_interpolate", + "three_nn", + "npu_voxel_pooling_train", + "voxelization", +] + def _set_env(): mx_driving_root = os.path.dirname(os.path.abspath(__file__)) diff --git a/mx_driving/common/__init__.py b/mx_driving/common.py similarity index 43% rename from mx_driving/common/__init__.py rename to mx_driving/common.py index beecdaa5c544d780249725270ac4b52055e65504..1eb379bfeff78122b146576145954ebbea755461 100644 --- a/mx_driving/common/__init__.py +++ b/mx_driving/common.py @@ -1,7 +1,12 @@ +import warnings + +warnings.warn( + "This package is deprecated and will be removed in future. 
Please use `mx_driving.api` instead.", DeprecationWarning +) from .ops.three_interpolate import three_interpolate from .ops.scatter_max import scatter_max from .ops.knn import knn -from .ops.threeNN import three_nn +from .ops.three_nn import three_nn from .ops.scatter_mean import scatter_mean from .ops.hypot import hypot -from .ops.assign_score_withk import assign_score_withk +from .ops.assign_score_withk import assign_score_withk \ No newline at end of file diff --git a/mx_driving/common/CMakeLists.txt b/mx_driving/common/CMakeLists.txt deleted file mode 100644 index 807aa0c667560bcf0d75c6c6a26369daa624e9de..0000000000000000000000000000000000000000 --- a/mx_driving/common/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/common/components/README.md b/mx_driving/common/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/common/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. \ No newline at end of file diff --git a/mx_driving/common/ops/csrc/CMakeLists.txt b/mx_driving/common/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/common/ops/csrc/Hypot.cpp b/mx_driving/common/ops/csrc/Hypot.cpp deleted file mode 100644 index b9a008e8ef16b100c276e7f32030ebf7d6c3a73a..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/Hypot.cpp +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2024 Huawei Technologies Co., Ltd -// All rights reserved. - -#include "csrc/OpApiCommon.h" -#include "functions.h" - -at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y) -{ - auto out = at::empty_like(x, x.options()); - EXEC_NPU_CMD(aclnnHypot, x, y, out); - return out; -} - -std::tuple npu_hypot_grad(const at::Tensor& x, const at::Tensor& y, const at::Tensor& out, const at::Tensor& out_grad) -{ - auto x_grad = at::empty_like(x, x.options()); - auto y_grad = at::empty_like(y, y.options()); - EXEC_NPU_CMD(aclnnHypotGrad, x, y, out, out_grad, x_grad, y_grad); - return std::make_tuple(x_grad, y_grad); -} diff --git a/mx_driving/common/ops/csrc/README.md b/mx_driving/common/ops/csrc/README.md deleted file mode 100644 index 8073915fabe1c484db0488c9abc5e09b858c52c8..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/README.md +++ /dev/null @@ -1,6 +0,0 @@ -## Description -The `csrc` lib implements python interface, which use `pybind11` to wrap the C++ code. -There are 3 files you need to focus: -1. `pybind.cpp`: Define the python interface. -2. `functions.cpp`: Define the C++ interface. -3. The file naming in `Pascal` style: The implementation of the C++ interface. 
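As an aside, a minimal sketch of the three-file pattern this deleted README describes, assembled from the Hypot.cpp and pybind.cpp fragments appearing elsewhere in this diff; it is illustrative only, not a complete translation unit, and it relies on the EXEC_NPU_CMD macro provided by csrc/OpApiCommon.h.

```cpp
// Sketch of the old csrc layout described above (assembled from fragments in this diff).

// Hypot.cpp -- Pascal-named file implementing the C++ interface declared in functions.h
#include "csrc/OpApiCommon.h"
#include "functions.h"

at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y)
{
    auto out = at::empty_like(x, x.options());
    EXEC_NPU_CMD(aclnnHypot, x, y, out);  // dispatch to the aclnnHypot custom op
    return out;
}

// pybind.cpp -- exposes the C++ interface to Python via pybind11
void init_common(pybind11::module& m)
{
    m.def("npu_hypot", &npu_hypot);
}
```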
\ No newline at end of file diff --git a/mx_driving/common/ops/csrc/functions.h b/mx_driving/common/ops/csrc/functions.h deleted file mode 100644 index f226d907436037ca0e5211f9e1a771670ae61e2b..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/functions.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef COMMON_OPS_CSRC_FUNCTIONS_H_ -#define COMMON_OPS_CSRC_FUNCTIONS_H_ - -#include - -std::tuple knn(const at::Tensor& xyz, const at::Tensor& center_xyz, int32_t k, bool is_from_knn); - -at::Tensor npu_three_interpolate( - int b, int c, int m, int n, const at::Tensor& points, const at::Tensor& idx, const at::Tensor& weight); - -at::Tensor npu_three_interpolate_backward( - int b, int c, int n, int m, const at::Tensor& grad_out, const at::Tensor& idx, const at::Tensor& weight); - -std::tuple scatter_max_with_argmax_v2( - const at::Tensor& updates, const at::Tensor& indices, c10::optional out); - -at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments); - -at::Tensor npu_scatter(const at::Tensor& self, const at::Tensor& indices, const at::Tensor& updates, int64_t dim); - -at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Tensor& count, int32_t dim); - -std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& index, - c10::optional out, c10::optional dim, - c10::optional dim_size); -std::tuple npu_sort_pairs(const at::Tensor &keys_in, const at::Tensor &values_in, int64_t dim, bool descending); - -at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y); - -std::tuple npu_hypot_grad(const at::Tensor& x, const at::Tensor& y, const at::Tensor& out, const at::Tensor& out_grad); - -void assign_score_withk(const at::Tensor& points, const at::Tensor& centers, const at::Tensor& scores, const at::Tensor& knn_idx, - at::Tensor& output, int32_t B, int32_t N, int32_t npoint, int32_t M, int32_t K, int32_t out_dim, int32_t aggregate); - -#endif // COMMON_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/common/ops/csrc/pybind.cpp b/mx_driving/common/ops/csrc/pybind.cpp deleted file mode 100644 index 5e483ee6c7c0197d885892c2e0a21179e3fa03a2..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/csrc/pybind.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2024 Huawei Technologies Co., Ltd -// Copyright (c) 2019, Facebook CORPORATION. -// All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "csrc/pybind.h" - -#include - -#include "functions.h" - -void init_common(pybind11::module& m) -{ - // knn - m.def("knn", &knn); - - // npu_scatter_mean_grad - m.def("npu_scatter_mean_grad", &npu_scatter_mean_grad); - - // three_interpolate - m.def("npu_three_interpolate", &npu_three_interpolate); - m.def("npu_three_interpolate_backward", &npu_three_interpolate_backward); - - // scatter_mean - m.def("npu_scatter_mean", &npu_scatter_mean, "npu_scatter_mean NPU version"); - - // scatter_max - m.def("scatter_max_with_argmax_v2", &scatter_max_with_argmax_v2); - m.def("npu_scatter_max_backward", &npu_scatter_max_backward); - - // npu_sort_pairs - m.def("npu_sort_pairs", &npu_sort_pairs, "sort_pairs NPU version"); - - // npu_hypot - m.def("npu_hypot", &npu_hypot); - m.def("npu_hypot_grad", &npu_hypot_grad); - - // assign_score_withk - m.def("assign_score_withk", &assign_score_withk); -} diff --git a/mx_driving/common/ops/kernels/CMakeLists.txt b/mx_driving/common/ops/kernels/CMakeLists.txt deleted file mode 100644 index 3b1f8543b1fd15189db362166f9afad80f056ffd..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() - diff --git a/mx_driving/common/ops/kernels/README.md b/mx_driving/common/ops/kernels/README.md deleted file mode 100644 index 1e6645553e8d86a84a9833a13610741b59930494..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/README.md +++ /dev/null @@ -1,13 +0,0 @@ -## 算子原型 - - - - - - - - - - - -
Operator type (OpType): Add
Operator inputs: name / shape / data type / format
  x / - / float / ND
  y / - / float / ND
Operator output: z / - / float / ND
Kernel function name: add_custom
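For context, a signature-level sketch of the kernel entry point implied by the add_custom row above, written in the extern "C" __global__ __aicore__ style that nms3d.cpp uses later in this diff; the parameter names and the elided body are illustrative assumptions, not taken from the source.

```cpp
// Hypothetical signature only: follows the AscendC kernel-entry convention seen in
// nms3d.cpp within this diff. x and y are the float/ND inputs and z the output from
// the table above; the body is intentionally elided.
extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z,
                                                 GM_ADDR workspace, GM_ADDR tiling)
{
    // GET_TILING_DATA(tilingData, tiling);
    // ... copy x and y tiles in from global memory, add them, copy the result to z ...
}
```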
\ No newline at end of file diff --git a/mx_driving/common/ops/kernels/inc/base.h b/mx_driving/common/ops/kernels/inc/base.h deleted file mode 100644 index a0b8bfae346605551e85d2fb8db80595e644de08..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/inc/base.h +++ /dev/null @@ -1,34 +0,0 @@ -// REG_OP(Add) -// .INPUT(x1, TensorType({DT_FLOAT})) -// .INPUT(x2, TensorType({DT_FLOAT})) -// .OUTPUT(y, TensorType({DT_FLOAT})) -// .OP_END_FACTORY_REG(Add) - -// REG_OP(FurthestPointSamplingWithDist) -// .INPUT(points_dist, TensorType({DT_FLOAT})) -// .INPUT(nearest_temp, TensorType({DT_FLOAT})) -// .OUTPUT(index, TensorType({DT_INT32})) -// .REQUIRED_ATTR(num_points, Int) -// .OP_END_FACTORY_REG(FurthestPointSamplingWithDist) - -// REG_OP(Nms3dNormal) -// .INPUT(boxes, TensorType({DT_FLOAT, DT_FLOAT16})) -// .OUTPUT(keep, TensorType({DT_INT16})) -// .REQUIRED_ATTR(nms_overlap_thresh, Float) -// .OP_END_FACTORY_REG(Nms3dNormal) - -// REG_OP(FurthestPointSampling) -// .INPUT(point_xyz, TensorType({DT_FLOAT})) -// .INPUT(nearest_temp, TensorType({DT_FLOAT})) -// .OUTPUT(index, TensorType({DT_INT32})) -// .REQUIRED_ATTR(num_points, Int) -// .OP_END_FACTORY_REG(FurthestPointSampling) - -// REG_OP(DynamicScatterGrad) -// .INPUT(grad_voxel_feats, TensorType({DT_FLOAT})) -// .INPUT(prefix_sum_point_per_voxel, TensorType({DT_INT32})) -// .INPUT(argsort_coor, TensorType({DT_INT32})) -// .INPUT(compare_mask, TensorType({DT_UINT16})) -// .OUTPUT(grad_point_feats, TensorType({DT_FLOAT})) -// .ATTR(reduce_type, String, "max") -// .OP_END_FACTORY_REG(DynamicScatterGrad) diff --git a/mx_driving/common/ops/kernels/op_host/CMakeLists.txt b/mx_driving/common/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index c44b2b0174f28f0144a7c03fc6c40cc5b389c14e..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/common/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/common/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/common/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/ops/csrc/AddRelu.cpp b/mx_driving/csrc/AddRelu.cpp similarity index 88% rename from mx_driving/fused/ops/csrc/AddRelu.cpp rename to mx_driving/csrc/AddRelu.cpp index 6fe35c20b9d04772de7d83b170de9f219f16ddff..896a20e80d39ae71da5d5a0d623c98556c436a0b 100644 --- a/mx_driving/fused/ops/csrc/AddRelu.cpp +++ b/mx_driving/csrc/AddRelu.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_add_relu(at::Tensor& x, const at::Tensor& y) { @@ -27,10 +27,6 @@ at::Tensor npu_add_relu_grad(at::Tensor& self, at::Tensor& grad_output) { auto result = at::empty_like(self, self.options()); at_npu::native::OpCommand cmd; - cmd.Name("ReluGrad") - .Input(grad_output) - .Input(self) - .Output(result) - .Run(); + cmd.Name("ReluGrad").Input(grad_output).Input(self).Output(result).Run(); return result; } diff --git a/mx_driving/common/ops/csrc/AssignScoreWithk.cpp b/mx_driving/csrc/AssignScoreWithk.cpp similarity index 81% rename from mx_driving/common/ops/csrc/AssignScoreWithk.cpp rename to mx_driving/csrc/AssignScoreWithk.cpp index a7cecce21eed3db78f111763a3b2cc11faae054a..4a028d92f5bead67c59624e2bc4634bd740f9a6d 100644 --- a/mx_driving/common/ops/csrc/AssignScoreWithk.cpp +++ b/mx_driving/csrc/AssignScoreWithk.cpp @@ -15,22 +15,11 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -void assign_score_withk( - const at::Tensor& points, - const at::Tensor& centers, - const at::Tensor& scores, - const at::Tensor& knn_idx, - at::Tensor & output, - int32_t B, - int32_t N, - int32_t npoint, - int32_t M, - int32_t K, - int32_t out_dim, - int32_t aggregate - ) +void assign_score_withk(const at::Tensor& points, const at::Tensor& centers, const at::Tensor& scores, + const at::Tensor& knn_idx, at::Tensor& output, int32_t B, int32_t N, int32_t npoint, int32_t M, int32_t K, + int32_t out_dim, int32_t aggregate) { TORCH_CHECK_NPU(points); TORCH_CHECK_NPU(centers); @@ -48,5 +37,6 @@ void assign_score_withk( at::Tensor points_trans = points.permute({0, 3, 1, 2}); at::Tensor centers_trans = centers.permute({0, 3, 1, 2}); - EXEC_NPU_CMD_SYNC(aclnnAssignScoreWithk, points_trans, centers_trans, scores, knn_idx, B, N, npoint, M, K, out_dim, aggregate, output); -} \ No newline at end of file + EXEC_NPU_CMD_SYNC(aclnnAssignScoreWithk, points_trans, centers_trans, scores, knn_idx, B, N, npoint, M, K, out_dim, + aggregate, output); +} diff --git a/mx_driving/point/ops/csrc/BEVPool.cpp b/mx_driving/csrc/BEVPool.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPool.cpp rename to mx_driving/csrc/BEVPool.cpp index 47332f0711aed7388ed81faf2c82c5a600f07ee6..56742eabb146498a15e41578271ed90696325b47 100644 --- a/mx_driving/point/ops/csrc/BEVPool.cpp +++ b/mx_driving/csrc/BEVPool.cpp @@ -14,9 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t N_IDX = 0; diff --git a/mx_driving/point/ops/csrc/BEVPoolBackward.cpp b/mx_driving/csrc/BEVPoolBackward.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPoolBackward.cpp rename to mx_driving/csrc/BEVPoolBackward.cpp index 262c4584a736d6aa63874faeba886d24d790756f..129b059a9f6654318bc988af880df5d817eba2fe 100644 --- a/mx_driving/point/ops/csrc/BEVPoolBackward.cpp +++ b/mx_driving/csrc/BEVPoolBackward.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t N_IDX = 0; diff --git a/mx_driving/point/ops/csrc/BEVPoolV2.cpp b/mx_driving/csrc/BEVPoolV2.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPoolV2.cpp rename to mx_driving/csrc/BEVPoolV2.cpp index b2268d7a371d6437a1417c3be3492ad70d7b01a0..3c4b013f98d93160d507058debb791745bcdecdf 100644 --- a/mx_driving/point/ops/csrc/BEVPoolV2.cpp +++ b/mx_driving/csrc/BEVPoolV2.cpp @@ -16,7 +16,7 @@ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t C_IDX = 4; diff --git a/mx_driving/point/ops/csrc/BEVPoolV2Backward.cpp b/mx_driving/csrc/BEVPoolV2Backward.cpp similarity index 99% rename from mx_driving/point/ops/csrc/BEVPoolV2Backward.cpp rename to mx_driving/csrc/BEVPoolV2Backward.cpp index ebeca36c0c52e16824b1007dc9f99ca6aeb5aa56..2330da05c2658f446829feeb76fa753781966264 100644 --- a/mx_driving/point/ops/csrc/BEVPoolV2Backward.cpp +++ b/mx_driving/csrc/BEVPoolV2Backward.cpp @@ -14,9 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t C_IDX = 4; diff --git a/mx_driving/point/ops/csrc/BEVPoolV3.cpp b/mx_driving/csrc/BEVPoolV3.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPoolV3.cpp rename to mx_driving/csrc/BEVPoolV3.cpp index 0680801182bbdbfc448f3db43af559c4854c8272..d6461246e788c67eb42d7ce37abf16ca554b59bd 100644 --- a/mx_driving/point/ops/csrc/BEVPoolV3.cpp +++ b/mx_driving/csrc/BEVPoolV3.cpp @@ -14,9 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t C_IDX = 4; diff --git a/mx_driving/point/ops/csrc/BEVPoolV3Backward.cpp b/mx_driving/csrc/BEVPoolV3Backward.cpp similarity index 98% rename from mx_driving/point/ops/csrc/BEVPoolV3Backward.cpp rename to mx_driving/csrc/BEVPoolV3Backward.cpp index 8566f7c3d0d302277e7a948d03f2a20683528f04..6916416c9da7b28e53f25a4602127cec6fc4d60d 100644 --- a/mx_driving/point/ops/csrc/BEVPoolV3Backward.cpp +++ b/mx_driving/csrc/BEVPoolV3Backward.cpp @@ -14,9 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t C_IDX = 4; @@ -44,4 +43,4 @@ std::tuple npu_bev_pool_v3_backward(const at::Tensor& gr EXEC_NPU_CMD(aclnnBEVPoolV3Grad, grad_out, depth, feat, ranks_depth, ranks_feat, ranks_bev, grad_depth, grad_feat); return std::make_tuple(grad_depth, grad_feat); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/BorderAlign.cpp b/mx_driving/csrc/BorderAlign.cpp similarity index 86% rename from mx_driving/detection/ops/csrc/BorderAlign.cpp rename to mx_driving/csrc/BorderAlign.cpp index 68e246ed5c8645de6e29584b2af877580e911c0e..72363550145582837992dd32d5a9614e54b4f489 100644 --- a/mx_driving/detection/ops/csrc/BorderAlign.cpp +++ b/mx_driving/csrc/BorderAlign.cpp @@ -13,13 +13,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -void border_align_forward_npu(const at::Tensor& input, const at::Tensor& rois, at::Tensor& output, const int32_t pooled_size) +void border_align(const at::Tensor& input, const at::Tensor& rois, at::Tensor& output, int32_t pooled_size) { TORCH_CHECK(input.size(1) % 4 == 0, "The number of channels must be divisible by 4."); at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous(); at::Tensor rois_map = rois.contiguous(); EXEC_NPU_CMD(aclnnBorderAlign, feature_map, rois_map, pooled_size, output); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/BorderAlignGrad.cpp b/mx_driving/csrc/BorderAlignBackward.cpp similarity index 89% rename from mx_driving/detection/ops/csrc/BorderAlignGrad.cpp rename to mx_driving/csrc/BorderAlignBackward.cpp index 9d5ce70ad219fef6997afbc45afd2236edb97f25..599fadb884fc184b19eb72ddf5445cc3cec4bc5e 100644 --- a/mx_driving/detection/ops/csrc/BorderAlignGrad.cpp +++ b/mx_driving/csrc/BorderAlignBackward.cpp @@ -15,10 +15,10 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor border_align_backward(const at::Tensor& grad_out, const at::Tensor& boxes, const at::Tensor& argmax_idx, - int32_t pool_size, int32_t height, int32_t width) + int32_t pool_size, int32_t height, int32_t width) { TORCH_CHECK_NPU(grad_out); TORCH_CHECK_NPU(boxes); @@ -33,7 +33,8 @@ at::Tensor border_align_backward(const at::Tensor& grad_out, const at::Tensor& b int32_t box_size = boxes.size(1); at::Tensor grad_input = at::zeros({batch_size, feat_channels, height, width}, grad_out.options()); - - EXEC_NPU_CMD(aclnnBorderAlignGrad, grad_out, boxes, argmax_idx, channels, box_size, height, width, pool_size, batch_size, grad_input); + + EXEC_NPU_CMD(aclnnBorderAlignGrad, grad_out, boxes, argmax_idx, channels, box_size, height, width, pool_size, + batch_size, grad_input); return grad_input; -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/BoxIou.cpp b/mx_driving/csrc/BoxIou.cpp similarity index 89% rename from mx_driving/detection/ops/csrc/BoxIou.cpp rename to mx_driving/csrc/BoxIou.cpp index 84b22dfa5f198d724e86aab8b0cb5852945d77a8..e0abc69d4ce5541dd0d137c502f3d13f9d8fdaaa 100644 --- a/mx_driving/detection/ops/csrc/BoxIou.cpp +++ b/mx_driving/csrc/BoxIou.cpp @@ -15,12 +15,12 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t N_IDX = 0; -void check_npu(const at::Tensor &boxes_a, const at::Tensor &boxes_b) +void check_npu(const at::Tensor& boxes_a, const at::Tensor& boxes_b) { TORCH_CHECK_NPU(boxes_a); TORCH_CHECK_NPU(boxes_b); @@ -35,8 +35,8 @@ void check_npu(const at::Tensor &boxes_a, const at::Tensor &boxes_b) * @param aligned: False-calculate between each box of boxes_a and boxes_b, True-calculate between each aligned pair of boxes_a and boxes_b * @return ious: iou of boxes */ -at::Tensor npu_box_iou_quadri(const at::Tensor &boxes_a, const at::Tensor &boxes_b, - const int64_t mode_flag, const bool aligned) +at::Tensor npu_box_iou_quadri( + const at::Tensor& boxes_a, const at::Tensor& boxes_b, const int64_t mode_flag, const bool aligned) { TORCH_CHECK(boxes_a.size(1) == 8, "boxes_a must be 2D tensor (N, 8)"); TORCH_CHECK(boxes_b.size(1) == 8, "boxes_b must be 2D tensor (N, 8)"); @@ -61,8 +61,8 @@ at::Tensor npu_box_iou_quadri(const at::Tensor &boxes_a, const at::Tensor &boxes * @param aligned: False-calculate between each box of boxes_a and boxes_b, True-calculate between each aligned pair of boxes_a and boxes_b * @return ious: iou of boxes */ -at::Tensor npu_box_iou_rotated(const at::Tensor &boxes_a, const at::Tensor &boxes_b, - const int64_t mode_flag, const bool aligned) +at::Tensor npu_box_iou_rotated( + const at::Tensor& boxes_a, const at::Tensor& boxes_b, const int64_t mode_flag, const bool aligned) { TORCH_CHECK(boxes_a.size(1) == 5, "boxes_a must be 2D tensor (N, 5)"); TORCH_CHECK(boxes_b.size(1) == 5, "boxes_b must be 2D tensor (N, 5)"); diff --git a/mx_driving/detection/ops/csrc/BoxesOverlapBev.cpp b/mx_driving/csrc/BoxesOverlapBev.cpp similarity index 90% rename from mx_driving/detection/ops/csrc/BoxesOverlapBev.cpp rename to mx_driving/csrc/BoxesOverlapBev.cpp index cc06412b02595a0103065a3249867ce95687d2c7..db85fec26990876be75a18b8b885fe3edd0ab418 100644 --- a/mx_driving/detection/ops/csrc/BoxesOverlapBev.cpp +++ b/mx_driving/csrc/BoxesOverlapBev.cpp @@ -15,12 +15,12 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr int64_t N_IDX = 0; -void check_npu(const at::Tensor &boxes_a, const at::Tensor &boxes_b) +void check_npu(const at::Tensor& boxes_a, const at::Tensor& boxes_b) { TORCH_CHECK_NPU(boxes_a); TORCH_CHECK_NPU(boxes_b); @@ -33,7 +33,7 @@ void check_npu(const at::Tensor &boxes_a, const at::Tensor &boxes_b) * @param boxes_b: input boxes, 2D tensor(N, 5) * @return area_overlap: overlap area of boxes */ -at::Tensor npu_boxes_overlap_bev(const at::Tensor &boxes_a, const at::Tensor &boxes_b) +at::Tensor npu_boxes_overlap_bev(const at::Tensor& boxes_a, const at::Tensor& boxes_b) { TORCH_CHECK(boxes_a.size(1) == 5, "boxes_a must be 2D tensor (N, 5)"); TORCH_CHECK(boxes_b.size(1) == 5, "boxes_b must be 2D tensor (N, 5)"); diff --git a/mx_driving/csrc/CMakeLists.txt b/mx_driving/csrc/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..10fcddf9d354f07983689f83f6b5e7d7959405db --- /dev/null +++ b/mx_driving/csrc/CMakeLists.txt @@ -0,0 +1,62 @@ +file(GLOB ASCEND_CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +if(BUILD_STAGE EQUAL 2) + set(Python3_USE_STATIC_LIBS FALSE) + find_package(Python3 COMPONENTS Interpreter Development) + + execute_process( + COMMAND ${Python3_EXECUTABLE} -c + "import os; import torch; print(os.path.dirname(torch.__file__))" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE TORCH_PATH) + execute_process( + COMMAND + ${Python3_EXECUTABLE} -c + "import os; import site; print(site.getsitepackages()[0] + '/torch_npu')" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE TORCH_NPU_PATH) + message("TORCH_PATH is ${TORCH_PATH}") + message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") + + set(EXT_CXX_FLAGS "${EXT_CXX_FLAGS}") + separate_arguments(EXT_CXX_FLAGS) + add_library(_C SHARED ${ASCEND_CSRC_SRC}) + set_target_properties( + _C + PROPERTIES OUTPUT_NAME "${MX_DRIVING_PATH}/_C.${Python3_SOABI}" + PREFIX "" + SUFFIX ".so") + + if(${COMPILE_WITH_XLA}) + target_compile_definitions(_C PRIVATE COMPILE_WITH_XLA) + endif() + target_include_directories( + _C + PRIVATE ${Python3_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/include + ${TORCH_NPU_PATH}/include ${TORCH_PATH}/include + ${TORCH_PATH}/include/torch/csrc/api/include) + target_compile_options( + _C + PRIVATE -fprofile-arcs + -ftest-coverage + -fPIC + $<$:-O3> + $<$:-O0 + -g> + -fstack-protector-all + -DTORCH_API_INCLUDE_EXTENSION_H + -DTORCH_EXTENSION_NAME=_C + -D_GLIBCXX_USE_CXX11_ABI=0 + -D__FILENAME__=__FILE__ + ${EXT_CXX_FLAGS}) + + target_link_directories(_C PRIVATE ${TORCH_PATH}/lib ${TORCH_NPU_PATH}/lib) + target_link_libraries(_C PRIVATE gcov c10 torch torch_python torch_npu) + target_link_options( + _C + PRIVATE + $<$,EXECUTABLE>:-pie> + $<$:-s> + -Wl,-z,relro + -Wl,-z,now + -Wl,-z,noexecstack) +endif() diff --git a/mx_driving/fused/ops/csrc/DeformableAggregation.cpp b/mx_driving/csrc/DeformableAggregation.cpp similarity index 65% rename from mx_driving/fused/ops/csrc/DeformableAggregation.cpp rename to mx_driving/csrc/DeformableAggregation.cpp index da4027c22f0cc246db88af6c1d98eefbd73adc88..c2a769d49ab5d781d274466348a9f9ae75b8df64 100644 --- a/mx_driving/fused/ops/csrc/DeformableAggregation.cpp +++ b/mx_driving/csrc/DeformableAggregation.cpp @@ -1,6 +1,21 @@ -#include "csrc/OpApiCommon.h" -#include "functions.h" +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" at::Tensor deformable_aggregation(const at::Tensor& mc_ms_feat, const at::Tensor& spatial_shape, const at::Tensor& scale_start_index, const at::Tensor& sampling_location, const at::Tensor& weights) @@ -32,21 +47,13 @@ at::Tensor deformable_aggregation(const at::Tensor& mc_ms_feat, const at::Tensor EXEC_NPU_CMD(aclnnDeformableAggregation, mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights, batch_size, num_feat, num_embeds, num_anchors, num_pts, num_cams, num_scale, num_groups, out); - return out; } -std::tuple deformable_aggregation_grad( - const at::Tensor& mc_ms_feat, - const at::Tensor& spatial_shape, - const at::Tensor& scale_start_index, - const at::Tensor& sampling_location, - const at::Tensor& weights, - const at::Tensor& grad_output, - const at::Tensor& grad_mc_ms_feat, - const at::Tensor& grad_sampling_location, - const at::Tensor& grad_weights - ) +std::tuple deformable_aggregation_backward(const at::Tensor& mc_ms_feat, + const at::Tensor& spatial_shape, const at::Tensor& scale_start_index, const at::Tensor& sampling_location, + const at::Tensor& weights, const at::Tensor& grad_output, const at::Tensor& grad_mc_ms_feat, + const at::Tensor& grad_sampling_location, const at::Tensor& grad_weights) { TORCH_CHECK_NPU(mc_ms_feat); TORCH_CHECK_NPU(spatial_shape); @@ -61,20 +68,7 @@ std::tuple deformable_aggregation_grad( TORCH_CHECK(sampling_location.dim() == 5, "sampling_location.dim() must be 5, but got: ", sampling_location.dim()); TORCH_CHECK(weights.dim() == 6, "weights.dim() must be 6, but got: ", weights.dim()); - EXEC_NPU_CMD( - aclnnDeformableAggregationGrad, - mc_ms_feat, - spatial_shape, - scale_start_index, - sampling_location, - weights, - grad_output, - grad_mc_ms_feat, - grad_sampling_location, - grad_weights); - return std::make_tuple( - grad_mc_ms_feat, - grad_sampling_location, - grad_weights); + EXEC_NPU_CMD(aclnnDeformableAggregationGrad, mc_ms_feat, spatial_shape, scale_start_index, sampling_location, + weights, grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights); + return std::make_tuple(grad_mc_ms_feat, grad_sampling_location, grad_weights); } - diff --git a/mx_driving/fused/ops/csrc/DeformableConv2d.cpp b/mx_driving/csrc/DeformableConv2d.cpp similarity index 98% rename from mx_driving/fused/ops/csrc/DeformableConv2d.cpp rename to mx_driving/csrc/DeformableConv2d.cpp index f847861aa8f45029f945d9e2e4182cfa28a70a82..3f797463b032b3af42b62be1620ff82b650028c5 100644 --- a/mx_driving/fused/ops/csrc/DeformableConv2d.cpp +++ b/mx_driving/csrc/DeformableConv2d.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
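The deformable_aggregation / deformable_aggregation_backward pair above fills caller-provided gradient buffers and returns them. A hedged sketch of the calling pattern, assuming both are declared in csrc/functions.h; pre-allocating with zeros_like is an assumption here, not something the hunk mandates.

#include <tuple>
#include "csrc/functions.h"  // assumed to declare deformable_aggregation_backward

// Hypothetical helper: allocates the three gradient buffers the backward
// wrapper writes into, then forwards everything to the aclnn kernel.
std::tuple<at::Tensor, at::Tensor, at::Tensor> dfa_backward(const at::Tensor& mc_ms_feat,
    const at::Tensor& spatial_shape, const at::Tensor& scale_start_index,
    const at::Tensor& sampling_location, const at::Tensor& weights, const at::Tensor& grad_output)
{
    at::Tensor grad_feat = at::zeros_like(mc_ms_feat);
    at::Tensor grad_loc = at::zeros_like(sampling_location);
    at::Tensor grad_w = at::zeros_like(weights);
    return deformable_aggregation_backward(mc_ms_feat, spatial_shape, scale_start_index,
        sampling_location, weights, grad_output, grad_feat, grad_loc, grad_w);
}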
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, diff --git a/mx_driving/fused/ops/csrc/DeformableConv2dBackward.cpp b/mx_driving/csrc/DeformableConv2dBackward.cpp similarity index 73% rename from mx_driving/fused/ops/csrc/DeformableConv2dBackward.cpp rename to mx_driving/csrc/DeformableConv2dBackward.cpp index 038bb3760014f1d05b0c9ddbc0c2f45bd5ba3585..dd6444a7d81f33646d28c77d69139a049442ae27 100644 --- a/mx_driving/fused/ops/csrc/DeformableConv2dBackward.cpp +++ b/mx_driving/csrc/DeformableConv2dBackward.cpp @@ -1,5 +1,21 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple deformable_conv2d_backward(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& offset, const at::Tensor& offset_output, const at::Tensor& grad_y, diff --git a/mx_driving/point/ops/csrc/DynamicScatter.cpp b/mx_driving/csrc/DynamicScatter.cpp similarity index 98% rename from mx_driving/point/ops/csrc/DynamicScatter.cpp rename to mx_driving/csrc/DynamicScatter.cpp index fb50a1431e3650c63ec2ce7dd22ad45bc0deaffa..48984495885728ac1dc725d63fdf1e696d5115e1 100644 --- a/mx_driving/point/ops/csrc/DynamicScatter.cpp +++ b/mx_driving/csrc/DynamicScatter.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" namespace { constexpr uint32_t BLOCK_NUM = 8; @@ -63,4 +63,4 @@ void npu_dynamic_scatter_grad(at::Tensor& grad_point_feats, const at::Tensor& gr EXEC_NPU_CMD(aclnnDynamicScatterGrad, grad_voxel_feats, prefix_sum_point_per_voxel, argsort_coor, compare_mask, reduce_type, grad_point_feats); } -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/DynamicVoxelization.cpp b/mx_driving/csrc/DynamicVoxelization.cpp similarity index 96% rename from mx_driving/point/ops/csrc/DynamicVoxelization.cpp rename to mx_driving/csrc/DynamicVoxelization.cpp index 7b3ae41b7b39c4b7420b99266a1b7ca7e807f4d5..7c049c6411f889458c1520302bf8bbcf964c2efd 100644 --- a/mx_driving/point/ops/csrc/DynamicVoxelization.cpp +++ b/mx_driving/csrc/DynamicVoxelization.cpp @@ -14,10 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor dynamic_voxelization(const at::Tensor& points, at::Tensor& coors, int grid_x, int grid_y, int grid_z, double voxel_x, double voxel_y, double voxel_z, double coors_min_x, double coors_min_y, double coorsMinZ) diff --git a/mx_driving/point/ops/csrc/FurthestPointSampling.cpp b/mx_driving/csrc/FurthestPointSampling.cpp similarity index 97% rename from mx_driving/point/ops/csrc/FurthestPointSampling.cpp rename to mx_driving/csrc/FurthestPointSampling.cpp index ab097e94e4871d14f5e7629a489393ea6fa8b378..0db88d3414a51dfc1af35b3196f69f846a51584e 100644 --- a/mx_driving/point/ops/csrc/FurthestPointSampling.cpp +++ b/mx_driving/csrc/FurthestPointSampling.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_furthest_point_sampling(const at::Tensor& point_xyz, const at::Tensor& nearset_temp, int32_t num_points) { @@ -23,4 +23,4 @@ at::Tensor npu_furthest_point_sampling(const at::Tensor& point_xyz, const at::Te nearset_temp.options().dtype(at::kInt)); EXEC_NPU_CMD(aclnnFurthestPointSampling, point_xyz, nearset_temp, num_points, output); return output; -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/FurthestPointSamplingWithDist.cpp b/mx_driving/csrc/FurthestPointSamplingWithDist.cpp similarity index 97% rename from mx_driving/point/ops/csrc/FurthestPointSamplingWithDist.cpp rename to mx_driving/csrc/FurthestPointSamplingWithDist.cpp index cfef3b81c6034f8338eebe7fc1ad271cc955bb0a..bb36399d89bfa994d0c508bfae1867f6fa980dd8 100644 --- a/mx_driving/point/ops/csrc/FurthestPointSamplingWithDist.cpp +++ b/mx_driving/csrc/FurthestPointSamplingWithDist.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor furthest_point_sampling_with_dist( const at::Tensor& points_dist, const at::Tensor& nearest_temp, int32_t num_points) diff --git a/mx_driving/fused/ops/csrc/FusedBiasLeakyRelu.cpp b/mx_driving/csrc/FusedBiasLeakyRelu.cpp similarity index 89% rename from mx_driving/fused/ops/csrc/FusedBiasLeakyRelu.cpp rename to mx_driving/csrc/FusedBiasLeakyRelu.cpp index 720d9a49703cdcccf4b0097460509d22c93f7908..006740badcf68e97e8c57b92c86ec603e5834c9a 100644 --- a/mx_driving/fused/ops/csrc/FusedBiasLeakyRelu.cpp +++ b/mx_driving/csrc/FusedBiasLeakyRelu.cpp @@ -14,12 +14,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -at::Tensor fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias, const double negative_slope, const double scale) +at::Tensor fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias, double negative_slope, double scale) { TORCH_CHECK_NPU(x); TORCH_CHECK_NPU(bias); @@ -28,4 +26,4 @@ at::Tensor fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias, co EXEC_NPU_CMD(aclnnFusedBiasLeakyReluV2, x, bias, negative_slope, scale, output); return output; -} \ No newline at end of file +} diff --git a/mx_driving/fused/ops/csrc/GeometricKernelAttentionFunc.cpp b/mx_driving/csrc/GeometricKernelAttentionFunc.cpp similarity index 98% rename from mx_driving/fused/ops/csrc/GeometricKernelAttentionFunc.cpp rename to mx_driving/csrc/GeometricKernelAttentionFunc.cpp index cfff39e99d4d0c45c2238ffc81a4dac2cb4c24a8..ebfc0947c6e1f668dcc4bcd381852f4c653a6e71 100644 --- a/mx_driving/fused/ops/csrc/GeometricKernelAttentionFunc.cpp +++ b/mx_driving/csrc/GeometricKernelAttentionFunc.cpp @@ -16,7 +16,7 @@ #include "csrc/OpApiCommon.h" #include "csrc/utils.h" -#include "functions.h" +#include "csrc/functions.h" constexpr size_t VALUE_BATCH_SIZE_DIM = 0; constexpr size_t VALUE_NUM_KEYS_DIM = 1; @@ -30,7 +30,7 @@ constexpr size_t ATTN_WEIGHTS_NUM_POINTS_DIM = 4; constexpr size_t FLOAT32_BYTES = 4; constexpr size_t BLOCK_BYTES = 32; -at::Tensor npu_geometric_kernel_attention_func(const at::Tensor& value, const at::Tensor& spatial_shapes, +at::Tensor npu_geometric_kernel_attention(const at::Tensor& value, const at::Tensor& spatial_shapes, const at::Tensor& level_start_index, const at::Tensor& sampling_locations, const at::Tensor& attn_weights) { TORCH_CHECK(value.scalar_type() == at::kHalf || value.scalar_type() == at::kFloat, diff --git a/mx_driving/point/ops/csrc/GroupPoints.cpp b/mx_driving/csrc/GroupPoints.cpp similarity index 91% rename from mx_driving/point/ops/csrc/GroupPoints.cpp rename to mx_driving/csrc/GroupPoints.cpp index 03e8b4797a97fea0fe82737a648a5e91a8f58be7..94f052eae48c48434cc350e7fee1347ebbd3cf26 100644 --- a/mx_driving/point/ops/csrc/GroupPoints.cpp +++ b/mx_driving/csrc/GroupPoints.cpp @@ -15,15 +15,15 @@ // limitations under the License. 
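A short usage sketch for the simplified fused_bias_leaky_relu signature above (assumed to be declared in csrc/functions.h); both tensors must already live on NPU, and the slope/scale literals are common defaults rather than values fixed by this patch.

#include "csrc/functions.h"  // assumed to declare fused_bias_leaky_relu

at::Tensor apply_fused_bias_leaky_relu(const at::Tensor& x, const at::Tensor& bias)
{
    const double negative_slope = 0.2;        // illustrative default
    const double scale = 1.4142135623730951;  // sqrt(2), illustrative
    return fused_bias_leaky_relu(x, bias, negative_slope, scale);
}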
#include "csrc/OpApiCommon.h" -#include "functions.h" - +#include "csrc/functions.h" at::Tensor group_points( const at::Tensor& points, const at::Tensor& idx, int64_t b, int64_t c, int64_t n, int64_t npoints, int64_t nsample) { TORCH_CHECK_NPU(points); TORCH_CHECK_NPU(idx); - TORCH_CHECK(points.scalar_type() == at::kHalf || points.scalar_type() == at::kFloat, "group_points only support float16 or float32 tensor.") + TORCH_CHECK(points.scalar_type() == at::kHalf || points.scalar_type() == at::kFloat, + "group_points only support float16 or float32 tensor.") TORCH_CHECK(points.dim() == 3, "points.dim() must be 3, but got: ", points.dim()); TORCH_CHECK(idx.dim() == 3, "idx.dim() must be 3, but got: ", idx.dim()); TORCH_CHECK(points.size(0) == idx.size(0), "the input first dimension must be the same.") @@ -43,8 +43,8 @@ at::Tensor group_points( } -at::Tensor group_points_backward(const at::Tensor& grad_out, const at::Tensor& idx, int64_t b, - int64_t c, int64_t n, int64_t npoints, int64_t nsample) +at::Tensor group_points_backward(const at::Tensor& grad_out, const at::Tensor& idx, int64_t b, int64_t c, int64_t n, + int64_t npoints, int64_t nsample) { TORCH_CHECK_NPU(grad_out); TORCH_CHECK_NPU(idx); @@ -61,4 +61,4 @@ at::Tensor group_points_backward(const at::Tensor& grad_out, const at::Tensor& i at::Tensor grad_points = out.transpose(1, 2); return grad_points; -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/HardVoxelize.cpp b/mx_driving/csrc/HardVoxelize.cpp similarity index 99% rename from mx_driving/point/ops/csrc/HardVoxelize.cpp rename to mx_driving/csrc/HardVoxelize.cpp index 25964218ac2f8a64d177c57bedf6c85c342fa721..27c779541f4fea9685a2adf070d8a9e5711feacb 100644 --- a/mx_driving/point/ops/csrc/HardVoxelize.cpp +++ b/mx_driving/csrc/HardVoxelize.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" constexpr size_t NUM_VOXELS_IDX = 0; constexpr size_t UNI_VOXELS_IDX = 1; diff --git a/include/csrc/pybind.h b/mx_driving/csrc/Hypot.cpp similarity index 52% rename from include/csrc/pybind.h rename to mx_driving/csrc/Hypot.cpp index 49ac2037b041ffbe1d634508b46d35ad8ebb72f5..24a0d4a77133de34edcd55220f0c3dd60398619c 100644 --- a/include/csrc/pybind.h +++ b/mx_driving/csrc/Hypot.cpp @@ -13,14 +13,23 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef CSRC_PYBIND_H_ -#define CSRC_PYBIND_H_ -#include -void init_common(pybind11::module& m); -void init_fused(pybind11::module& m); -void init_point(pybind11::module& m); -void init_preprocess(pybind11::module& m); -void init_detection(pybind11::module& m); -void init_spconv(pybind11::module& m); -#endif // CSRC_PYBIND_H_ + +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" + +at::Tensor npu_hypot(const at::Tensor& x, const at::Tensor& y) +{ + auto out = at::empty_like(x, x.options()); + EXEC_NPU_CMD(aclnnHypot, x, y, out); + return out; +} + +std::tuple npu_hypot_grad( + const at::Tensor& x, const at::Tensor& y, const at::Tensor& out, const at::Tensor& out_grad) +{ + auto x_grad = at::empty_like(x, x.options()); + auto y_grad = at::empty_like(y, y.options()); + EXEC_NPU_CMD(aclnnHypotGrad, x, y, out, out_grad, x_grad, y_grad); + return std::make_tuple(x_grad, y_grad); +} diff --git a/mx_driving/common/ops/csrc/Knn.cpp b/mx_driving/csrc/Knn.cpp similarity index 97% rename from mx_driving/common/ops/csrc/Knn.cpp rename to mx_driving/csrc/Knn.cpp index e9c47973572ad7021acd3019fbd4567f004e2d14..12f03e1b3ac70504f1b51d70f81c778587c3c9bc 100644 --- a/mx_driving/common/ops/csrc/Knn.cpp +++ b/mx_driving/csrc/Knn.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple knn(const at::Tensor& xyz, const at::Tensor& center_xyz, int32_t k, bool is_from_knn) { @@ -28,4 +28,4 @@ std::tuple knn(const at::Tensor& xyz, const at::Tensor& EXEC_NPU_CMD_SYNC(aclnnKnn, xyz, center_xyz, is_from_knn, k, dist, idx); return std::tie(dist, idx); -} \ No newline at end of file +} diff --git a/mx_driving/fused/ops/csrc/MaxPool2d.cpp b/mx_driving/csrc/MaxPool2d.cpp similarity index 99% rename from mx_driving/fused/ops/csrc/MaxPool2d.cpp rename to mx_driving/csrc/MaxPool2d.cpp index 499c8e70462dd8a2daf3e6ea3ece6442b7476485..94e2e366bae227d8ebf2f8eacf4750d9eae9215d 100644 --- a/mx_driving/fused/ops/csrc/MaxPool2d.cpp +++ b/mx_driving/csrc/MaxPool2d.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_max_pool2d(const at::Tensor& x, int kernel_size, int stride, int padding) { diff --git a/mx_driving/fused/ops/csrc/ModulatedDeformableConv2d.cpp b/mx_driving/csrc/ModulatedDeformableConv2d.cpp similarity index 98% rename from mx_driving/fused/ops/csrc/ModulatedDeformableConv2d.cpp rename to mx_driving/csrc/ModulatedDeformableConv2d.cpp index 5fe0afa6d235868f5bacb006a46d2970cf68c2bc..c41d6c72a69ba596604b408bffb00265fed558a4 100644 --- a/mx_driving/fused/ops/csrc/ModulatedDeformableConv2d.cpp +++ b/mx_driving/csrc/ModulatedDeformableConv2d.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
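The new Hypot.cpp above defines both directions of the op. A minimal sketch of a combined forward/backward call, assuming the declarations live in csrc/functions.h:

#include <tuple>
#include "csrc/functions.h"  // assumed to declare npu_hypot and npu_hypot_grad

// out = hypot(x, y); the backward takes the saved forward output plus the
// incoming gradient and returns (x_grad, y_grad), matching the code above.
std::tuple<at::Tensor, at::Tensor> hypot_with_grad(
    const at::Tensor& x, const at::Tensor& y, const at::Tensor& out_grad)
{
    at::Tensor out = npu_hypot(x, y);
    return npu_hypot_grad(x, y, out, out_grad);
}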
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple modulated_deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, const at::Tensor& mask, const at::Tensor& weight, const c10::optional& bias_opt, diff --git a/mx_driving/fused/ops/csrc/ModulatedDeformableConv2dBackward.cpp b/mx_driving/csrc/ModulatedDeformableConv2dBackward.cpp similarity index 75% rename from mx_driving/fused/ops/csrc/ModulatedDeformableConv2dBackward.cpp rename to mx_driving/csrc/ModulatedDeformableConv2dBackward.cpp index 7b72604d206d9466f15c2b02addbacae32df10d5..94614e5531cadc7e02fd2dcf16fd6e439309f7e3 100644 --- a/mx_driving/fused/ops/csrc/ModulatedDeformableConv2dBackward.cpp +++ b/mx_driving/csrc/ModulatedDeformableConv2dBackward.cpp @@ -1,5 +1,21 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple modulated_deformable_conv2d_backward( const at::Tensor& input, const at::Tensor& offset, const at::Tensor& mask, const at::Tensor& weight, diff --git a/mx_driving/fused/ops/csrc/MultiScaleDeformableAttn.cpp b/mx_driving/csrc/MultiScaleDeformableAttn.cpp similarity index 99% rename from mx_driving/fused/ops/csrc/MultiScaleDeformableAttn.cpp rename to mx_driving/csrc/MultiScaleDeformableAttn.cpp index 2e81bdf243f05a2d1b4d154438f94231e41716c1..f0a0e7c7cc4f0bff337f7b14c5eb2cf99a91ec33 100644 --- a/mx_driving/fused/ops/csrc/MultiScaleDeformableAttn.cpp +++ b/mx_driving/csrc/MultiScaleDeformableAttn.cpp @@ -15,8 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" - +#include "csrc/functions.h" namespace { constexpr size_t BATCH_IDX = 0; constexpr size_t QUERY_IDX = 1; diff --git a/mx_driving/spconv/ops/csrc/MultiToSparse.cpp b/mx_driving/csrc/MultiToSparse.cpp similarity index 77% rename from mx_driving/spconv/ops/csrc/MultiToSparse.cpp rename to mx_driving/csrc/MultiToSparse.cpp index f6ba0d09f0a03ae681ada99e9a842ff11d82abaf..0424634e83bac86732701b443f04eb3f81bb04dc 100644 --- a/mx_driving/spconv/ops/csrc/MultiToSparse.cpp +++ b/mx_driving/csrc/MultiToSparse.cpp @@ -15,10 +15,11 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -std::tuple multi_to_sparse(const at::Tensor& out_features, const at::Tensor& unique_indices_offset, - const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair) +std::tuple multi_to_sparse(const at::Tensor& out_features, + const at::Tensor& unique_indices_offset, const at::Tensor& sorted_idx_to_former_indices, + const at::Tensor& outidx_pair) { TORCH_CHECK_NPU(out_features); TORCH_CHECK_NPU(unique_indices_offset); @@ -28,21 +29,21 @@ std::tuple multi_to_sparse(const at::Tensor& out_feature auto indices_size = unique_indices_offset.sizes(); auto features_size = out_features.sizes(); TORCH_CHECK(indices_size[0] > 1, - "indices_size zeros dim must be greater than 1 expected but got indices_size[0] value: ", - indices_size[0]); + "indices_size zeros dim must be greater than 1 expected but got indices_size[0] value: ", indices_size[0]); c10::SmallVector out_size = {indices_size[0] - 1, features_size[1]}; c10::SmallVector out_idx_size = {indices_size[0] - 1, 8}; at::Tensor sparse_value = at::empty(out_size, out_features.options()); at::Tensor sparse_indices = at::empty(out_idx_size, unique_indices_offset.options()); - EXEC_NPU_CMD(aclnnToSparse, unique_indices_offset, out_features, - sorted_idx_to_former_indices, outidx_pair, sparse_value, sparse_indices); + EXEC_NPU_CMD(aclnnToSparse, unique_indices_offset, out_features, sorted_idx_to_former_indices, outidx_pair, + sparse_value, sparse_indices); return std::tie(sparse_value, sparse_indices); } -std::tuple multi_to_sparse_v2(const at::Tensor& features, const at::Tensor& weight, const at::Tensor& unique_indices_offset, - const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair) +std::tuple multi_to_sparse_v2(const at::Tensor& features, const at::Tensor& weight, + const at::Tensor& unique_indices_offset, const at::Tensor& sorted_idx_to_former_indices, + const at::Tensor& outidx_pair) { TORCH_CHECK_NPU(features); TORCH_CHECK_NPU(weight); @@ -55,8 +56,7 @@ std::tuple multi_to_sparse_v2(const at::Tensor& features auto indices_size = unique_indices_offset.sizes(); TORCH_CHECK(indices_size[0] > 1, - "indices_size zeros dim must be greater than 1 expected but got indices_size[0] value: ", - indices_size[0]); + "indices_size zeros dim must be greater than 1 expected but got indices_size[0] value: ", indices_size[0]); c10::SmallVector out_size = {indices_size[0] - 1, weight_size[4]}; c10::SmallVector out_idx_size = {indices_size[0] - 1, 8}; @@ -64,7 +64,7 @@ std::tuple multi_to_sparse_v2(const at::Tensor& features at::Tensor sparse_value = at::empty(out_size, features.options()); at::Tensor sparse_indices = at::empty(out_idx_size, unique_indices_offset.options()); - EXEC_NPU_CMD(aclnnToSparseV3, features, weight, unique_indices_offset, - sorted_idx_to_former_indices, outidx_pair, sparse_value, sparse_indices); + EXEC_NPU_CMD(aclnnToSparseV3, features, weight, unique_indices_offset, sorted_idx_to_former_indices, outidx_pair, + sparse_value, sparse_indices); return std::tie(sparse_value, sparse_indices); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/Nms3d.cpp b/mx_driving/csrc/Nms3d.cpp similarity index 97% rename from mx_driving/detection/ops/csrc/Nms3d.cpp rename to mx_driving/csrc/Nms3d.cpp index c92cda16fa9ca924c10c193400a6176b774bfd27..587d8ea6b752fe60f6a82dc8a0e9018638485bfb 100644 --- a/mx_driving/detection/ops/csrc/Nms3d.cpp +++ b/mx_driving/csrc/Nms3d.cpp @@ -15,7 +15,7 @@ // limitations under 
the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple nms3d(const at::Tensor& boxes, double threshold) { diff --git a/mx_driving/detection/ops/csrc/Nms3dNormal.cpp b/mx_driving/csrc/Nms3dNormal.cpp similarity index 97% rename from mx_driving/detection/ops/csrc/Nms3dNormal.cpp rename to mx_driving/csrc/Nms3dNormal.cpp index 933dfd0792116fb5b6ea9bf3643c21de39cca680..387ee2bb9945625001374f6b11b0159758eb1236 100644 --- a/mx_driving/detection/ops/csrc/Nms3dNormal.cpp +++ b/mx_driving/csrc/Nms3dNormal.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple nms3d_normal(const at::Tensor& boxes, double nms_overlap_thresh) { diff --git a/mx_driving/common/ops/kernels/op_host/OWNERS b/mx_driving/csrc/OWNERS similarity index 100% rename from mx_driving/common/ops/kernels/op_host/OWNERS rename to mx_driving/csrc/OWNERS diff --git a/bind/OpApiCommon.cpp b/mx_driving/csrc/OpApiCommon.cpp similarity index 100% rename from bind/OpApiCommon.cpp rename to mx_driving/csrc/OpApiCommon.cpp diff --git a/mx_driving/detection/ops/csrc/PixelGroup.cpp b/mx_driving/csrc/PixelGroup.cpp similarity index 86% rename from mx_driving/detection/ops/csrc/PixelGroup.cpp rename to mx_driving/csrc/PixelGroup.cpp index ce16121c3a33fcfd5f4a222830699e700d1aea86..fc3e5175df43bd509194af786025197d462465de 100644 --- a/mx_driving/detection/ops/csrc/PixelGroup.cpp +++ b/mx_driving/csrc/PixelGroup.cpp @@ -15,11 +15,11 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -std::vector> pixel_group(const at::Tensor &score, const at::Tensor &mask, const at::Tensor &embedding, - const at::Tensor &kernel_label, const at::Tensor &kernel_contour, - int32_t kernel_region_num, double distance_threshold) +std::vector> pixel_group(const at::Tensor& score, const at::Tensor& mask, + const at::Tensor& embedding, const at::Tensor& kernel_label, const at::Tensor& kernel_contour, + int32_t kernel_region_num, double distance_threshold) { TORCH_CHECK_NPU(score); TORCH_CHECK_NPU(mask); @@ -42,8 +42,8 @@ std::vector> pixel_group(const at::Tensor &score, const at::T at::Tensor label_updated = at::empty(label_updated_size, kernel_label.options()); at::Tensor valid_mask = at::empty(label_updated_size, mask.options()); - EXEC_NPU_CMD(aclnnPixelGroup, score, mask, embedding, kernel_label, kernel_contour, - kernel_region_num, distance_threshold, point_vector, label_updated); + EXEC_NPU_CMD(aclnnPixelGroup, score, mask, embedding, kernel_label, kernel_contour, kernel_region_num, + distance_threshold, point_vector, label_updated); std::vector> pixel_assignment(kernel_region_num); at::Tensor point_vector_cpu = point_vector.to(at::kCPU); @@ -68,4 +68,4 @@ std::vector> pixel_group(const at::Tensor &score, const at::T } return pixel_assignment; -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/PointToVoxel.cpp b/mx_driving/csrc/PointToVoxel.cpp similarity index 98% rename from mx_driving/point/ops/csrc/PointToVoxel.cpp rename to mx_driving/csrc/PointToVoxel.cpp index 63214e75ceab1e328cae67f423feed0bc9f038a2..e1c0c270636ee6c1daee8e86197df0479a8ef49a 100644 --- a/mx_driving/point/ops/csrc/PointToVoxel.cpp +++ b/mx_driving/csrc/PointToVoxel.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
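Looking back at the MultiToSparse.cpp hunks a little earlier in the patch: both wrappers return a (values, indices) pair whose leading dimension is derived from unique_indices_offset. A hedged caller sketch, assuming the declaration sits in csrc/functions.h; the helper name is hypothetical.

#include <tuple>
#include "csrc/functions.h"  // assumed to declare multi_to_sparse

void gather_sparse_output(const at::Tensor& out_features, const at::Tensor& unique_indices_offset,
    const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair)
{
    at::Tensor sparse_value;
    at::Tensor sparse_indices;
    std::tie(sparse_value, sparse_indices) =
        multi_to_sparse(out_features, unique_indices_offset, sorted_idx_to_former_indices, outidx_pair);
    // sparse_value:   (unique_indices_offset.size(0) - 1, out_features.size(1))
    // sparse_indices: (unique_indices_offset.size(0) - 1, 8)
}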
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" constexpr float DEFAULT_VALUE = -1.0f; constexpr size_t VOXEL_SIZES_SIZE = 3; diff --git a/mx_driving/preprocess/ops/csrc/PointsInBox.cpp b/mx_driving/csrc/PointsInBox.cpp similarity index 97% rename from mx_driving/preprocess/ops/csrc/PointsInBox.cpp rename to mx_driving/csrc/PointsInBox.cpp index 92ad849ef5f8622eacdcf33310c492e711193a1d..c13a08f0329d6c2a8db8bb202a018fbfc37b3d31 100644 --- a/mx_driving/preprocess/ops/csrc/PointsInBox.cpp +++ b/mx_driving/csrc/PointsInBox.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_points_in_box(const at::Tensor& boxes, const at::Tensor& pts) { diff --git a/mx_driving/preprocess/ops/csrc/PointsInBoxAll.cpp b/mx_driving/csrc/PointsInBoxAll.cpp similarity index 98% rename from mx_driving/preprocess/ops/csrc/PointsInBoxAll.cpp rename to mx_driving/csrc/PointsInBoxAll.cpp index b6fb5e78ff323b66a7dbcd37837d9565f0db0f39..d2a8bd342723265aeb5b8f7d475b271386ec754b 100644 --- a/mx_driving/preprocess/ops/csrc/PointsInBoxAll.cpp +++ b/mx_driving/csrc/PointsInBoxAll.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_points_in_box_all(const at::Tensor& boxes, const at::Tensor& pts) { diff --git a/mx_driving/spconv/ops/csrc/PrepareSubmConv3d.cpp b/mx_driving/csrc/PrepareSubmConv3d.cpp similarity index 80% rename from mx_driving/spconv/ops/csrc/PrepareSubmConv3d.cpp rename to mx_driving/csrc/PrepareSubmConv3d.cpp index dd73a12dab58ff45cb1dda678741f33f67363fb9..c1017abc64903fd7b5c77eefd5993f49dd5fa16e 100644 --- a/mx_driving/spconv/ops/csrc/PrepareSubmConv3d.cpp +++ b/mx_driving/csrc/PrepareSubmConv3d.cpp @@ -15,14 +15,14 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -std::tuple npu_prepare_subm_conv3d(const at::Tensor& flattenIndices, - at::IntArrayRef outSpatialShape, int batch_size) +std::tuple npu_prepare_subm_conv3d( + const at::Tensor& flattenIndices, at::IntArrayRef outSpatialShape, int batch_size) { int64_t outputnum = 1; for (int32_t i = 0; i < outSpatialShape.size(); i++) { - outputnum *= outSpatialShape[i]; + outputnum *= outSpatialShape[i]; } c10::SmallVector output_size = {batch_size * outputnum}; auto temp = at::empty(output_size, flattenIndices.options().dtype(at::kFloat)).fill_(-1); diff --git a/mx_driving/detection/ops/csrc/RoiAlignRotatedGradV2.cpp b/mx_driving/csrc/RoiAlignRotatedGradV2.cpp similarity index 70% rename from mx_driving/detection/ops/csrc/RoiAlignRotatedGradV2.cpp rename to mx_driving/csrc/RoiAlignRotatedGradV2.cpp index d8736b9aa558cbacfff86e274dfa013cf98db9e4..d4b0439d19cd38e13bfbd3b59eaf4eac06f8bc1b 100644 --- a/mx_driving/detection/ops/csrc/RoiAlignRotatedGradV2.cpp +++ b/mx_driving/csrc/RoiAlignRotatedGradV2.cpp @@ -13,13 +13,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, - const at::Tensor& rois, const at::Tensor& grad_output, - int32_t pooled_height, int32_t pooled_width, double spatial_scale, - int32_t sampling_ratio, bool aligned, bool clockwise) +at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, const at::Tensor& rois, const at::Tensor& grad_output, + int32_t pooled_height, int32_t pooled_width, double spatial_scale, int32_t sampling_ratio, bool aligned, + bool clockwise) { auto ori_dtype = input.scalar_type(); @@ -27,9 +27,8 @@ at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, at::Tensor grad_input = at::zeros(grad_input_size, input.options()); - EXEC_NPU_CMD(aclnnRoiAlignRotatedGradV2, input, rois, grad_output, - pooled_height, pooled_width, spatial_scale, sampling_ratio, aligned, clockwise, - grad_input); + EXEC_NPU_CMD(aclnnRoiAlignRotatedGradV2, input, rois, grad_output, pooled_height, pooled_width, spatial_scale, + sampling_ratio, aligned, clockwise, grad_input); return grad_input.to(ori_dtype); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/RoiAlignRotatedV2.cpp b/mx_driving/csrc/RoiAlignRotatedV2.cpp similarity index 76% rename from mx_driving/detection/ops/csrc/RoiAlignRotatedV2.cpp rename to mx_driving/csrc/RoiAlignRotatedV2.cpp index dd7304fa7927e735b70673672043a069b49de0cc..bbf5017793e02b2bbb3bac16dafe0bcf7e30f1be 100644 --- a/mx_driving/detection/ops/csrc/RoiAlignRotatedV2.cpp +++ b/mx_driving/csrc/RoiAlignRotatedV2.cpp @@ -13,14 +13,16 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" void roi_align_rotated_v2_forward_npu(const at::Tensor& input, const at::Tensor& rois_map, at::Tensor& output, - double spatial_scale, int32_t sampling_ratio, int32_t pooled_height, int32_t pooled_width, - bool aligned, bool clockwise) + double spatial_scale, int32_t sampling_ratio, int32_t pooled_height, int32_t pooled_width, bool aligned, + bool clockwise) { at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous(); at::Tensor rois = rois_map.permute({1, 0}).contiguous(); - EXEC_NPU_CMD(aclnnRoiAlignRotatedV2, feature_map, rois, spatial_scale, sampling_ratio, pooled_height, pooled_width, aligned, clockwise, output); -} \ No newline at end of file + EXEC_NPU_CMD(aclnnRoiAlignRotatedV2, feature_map, rois, spatial_scale, sampling_ratio, pooled_height, pooled_width, + aligned, clockwise, output); +} diff --git a/mx_driving/detection/ops/csrc/RoiawarePool3d.cpp b/mx_driving/csrc/RoiawarePool3d.cpp similarity index 93% rename from mx_driving/detection/ops/csrc/RoiawarePool3d.cpp rename to mx_driving/csrc/RoiawarePool3d.cpp index e69adbd655b5b8c20830b9c97692d1c14255237a..faab5fb99eb6be1df75c980ba9aac856294ad71b 100644 --- a/mx_driving/detection/ops/csrc/RoiawarePool3d.cpp +++ b/mx_driving/csrc/RoiawarePool3d.cpp @@ -21,8 +21,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" void npu_roiaware_pool3d_forward(const at::Tensor& rois, const at::Tensor& pts, const at::Tensor& pts_feature, at::Tensor& argmax, at::Tensor& pts_idx_of_voxels, at::Tensor& pooled_features, int32_t mode) @@ -45,11 +46,12 @@ void npu_roiaware_pool3d_forward(const at::Tensor& rois, const at::Tensor& pts, uint32_t outy = pts_idx_of_voxels.size(2); uint32_t outz = pts_idx_of_voxels.size(3); - EXEC_NPU_CMD(aclnnRoiawarePool3d, rois_cast, pts_cast, pts_feature_cast, mode, max_pts_each_voxel, outx, outy, outz, argmax, pts_idx_of_voxels, pooled_features_cast); + EXEC_NPU_CMD(aclnnRoiawarePool3d, rois_cast, pts_cast, pts_feature_cast, mode, max_pts_each_voxel, outx, outy, outz, + argmax, pts_idx_of_voxels, pooled_features_cast); if (dtype == at::kHalf) { pooled_features_cast = pooled_features_cast.to(at::kHalf); } pooled_features.copy_(pooled_features_cast); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/RoiawarePool3dGrad.cpp b/mx_driving/csrc/RoiawarePool3dGrad.cpp similarity index 52% rename from mx_driving/detection/ops/csrc/RoiawarePool3dGrad.cpp rename to mx_driving/csrc/RoiawarePool3dGrad.cpp index dd4e316327170beec7a53c55414cca3e9d9d4770..a289519c0e9f3afcb572d6fe610feeef7da54f52 100644 --- a/mx_driving/detection/ops/csrc/RoiawarePool3dGrad.cpp +++ b/mx_driving/csrc/RoiawarePool3dGrad.cpp @@ -1,5 +1,29 @@ +// Copyright (c) 2023-2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Copyright (c) 2023-2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::Tensor& argmax, @@ -9,10 +33,11 @@ at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::T TORCH_CHECK_NPU(argmax); TORCH_CHECK_NPU(grad_out); - TORCH_CHECK(pts_idx_of_voxels.dim() == 5, "pts_idx_of_voxels must to be a 5D Tensor, but got: ", pts_idx_of_voxels.dim()); + TORCH_CHECK( + pts_idx_of_voxels.dim() == 5, "pts_idx_of_voxels must to be a 5D Tensor, but got: ", pts_idx_of_voxels.dim()); TORCH_CHECK(argmax.dim() == 5, "argmax as to be a 5D Tensor, but got: ", argmax.dim()); TORCH_CHECK(grad_out.dim() == 5, "grad_out has to be a 5D Tensor, but got: ", grad_out.dim()); - + int32_t boxes_num = grad_out.size(0); int32_t out_x = grad_out.size(1); int32_t out_y = grad_out.size(2); @@ -20,12 +45,13 @@ at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::T int32_t channels = grad_out.size(4); int32_t max_pts_per_voxel = pts_idx_of_voxels.size(4); - TORCH_CHECK((boxes_num != 0 && out_x != 0 && out_y != 0 && out_z != 0 && channels != 0 && npoints != 0), "Error, some dim equals zero!\n"); + TORCH_CHECK((boxes_num != 0 && out_x != 0 && out_y != 0 && out_z != 0 && channels != 0 && npoints != 0), + "Error, some dim equals zero!\n"); TORCH_CHECK((channels <= 2048), "channels must less equal than 2048, but got: ", channels); auto dtype = grad_out.dtype(); at::Tensor grad_out_cast = grad_out; - + at::Tensor grad_in = at::zeros({npoints, channels}, grad_out.options()); if (dtype == at::kHalf) { grad_out_cast = grad_out.to(at::kFloat); @@ -34,20 +60,21 @@ at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::T if (pool_method == 0) { // maxpool3d - EXEC_NPU_CMD(aclnnRoiawareMaxpool3dGrad, argmax, grad_out_cast, boxes_num, - out_x, out_y, out_z, channels, npoints, grad_in); + EXEC_NPU_CMD(aclnnRoiawareMaxpool3dGrad, argmax, grad_out_cast, boxes_num, out_x, out_y, out_z, channels, + npoints, grad_in); } else if (pool_method == 1) { // avgpool3d TORCH_CHECK(npoints >= max_pts_per_voxel, "npoints must greator than max_pts_per_voxel!"); TORCH_CHECK(max_pts_per_voxel != 0, "Error, some dim equals zero!"); - TORCH_CHECK((max_pts_per_voxel <= 2048), "max_pts_per_voxel must less equal than 2048, but got: ", max_pts_per_voxel); - - EXEC_NPU_CMD(aclnnRoiawareAvgpool3dGrad, pts_idx_of_voxels, grad_out_cast, boxes_num, - out_x, out_y, out_z, channels, npoints, max_pts_per_voxel, grad_in); + TORCH_CHECK( + (max_pts_per_voxel <= 2048), "max_pts_per_voxel must less equal than 2048, but got: ", max_pts_per_voxel); + + EXEC_NPU_CMD(aclnnRoiawareAvgpool3dGrad, pts_idx_of_voxels, grad_out_cast, boxes_num, out_x, out_y, out_z, + channels, npoints, max_pts_per_voxel, grad_in); } if (dtype == at::kHalf) { grad_in = grad_in.to(at::kHalf); } return grad_in; -} \ No newline at end of file +} diff --git a/mx_driving/preprocess/ops/csrc/RoipointPool3dForward.cpp b/mx_driving/csrc/RoipointPool3dForward.cpp similarity index 69% rename from mx_driving/preprocess/ops/csrc/RoipointPool3dForward.cpp rename to mx_driving/csrc/RoipointPool3dForward.cpp index aed16dd4290d40e48490494571a564dd06e75afe..b9ca5a59a904b5b5ece8181bbfb0f58e70da0418 100644 --- a/mx_driving/preprocess/ops/csrc/RoipointPool3dForward.cpp +++ b/mx_driving/csrc/RoipointPool3dForward.cpp @@ -13,9 +13,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. -#include + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" /* * points: (B, N, 3) @@ -25,19 +25,17 @@ * pooled_empty_flag: (B, M) */ std::tuple npu_roipoint_pool3d_forward(const int32_t num_sampled_points, - const at::Tensor &points, - const at::Tensor &point_features, - const at::Tensor &boxes3d) + const at::Tensor& points, const at::Tensor& point_features, const at::Tensor& boxes3d) { auto points_trans = points.transpose(1, 2).contiguous(); auto point_features_trans = point_features.transpose(1, 2).contiguous(); - c10::SmallVector features_trans_size = {points.size(0), boxes3d.size(1), - points.size(2) + point_features.size(2), num_sampled_points}; + c10::SmallVector features_trans_size = { + points.size(0), boxes3d.size(1), points.size(2) + point_features.size(2), num_sampled_points}; at::Tensor pooled_features_trans = at::empty(features_trans_size, points.options()); c10::SmallVector empty_flag_size = {boxes3d.size(0), boxes3d.size(1)}; at::Tensor pooled_empty_flag = at::empty(empty_flag_size, boxes3d.options().dtype(at::kInt)); - EXEC_NPU_CMD(aclnnRoipointPool3dForward, - points_trans, point_features_trans, boxes3d, num_sampled_points, pooled_features_trans, pooled_empty_flag); + EXEC_NPU_CMD(aclnnRoipointPool3dForward, points_trans, point_features_trans, boxes3d, num_sampled_points, + pooled_features_trans, pooled_empty_flag); auto pooled_features = pooled_features_trans.transpose(2, 3).contiguous(); return std::tie(pooled_features, pooled_empty_flag); -} \ No newline at end of file +} diff --git a/mx_driving/detection/ops/csrc/RotatedIou.cpp b/mx_driving/csrc/RotatedIou.cpp similarity index 97% rename from mx_driving/detection/ops/csrc/RotatedIou.cpp rename to mx_driving/csrc/RotatedIou.cpp index fd39e98507a4494a4215669ae6a625e442a1c499..f25da9ed0bcb5a5aa8954c759b19b6466061792d 100644 --- a/mx_driving/detection/ops/csrc/RotatedIou.cpp +++ b/mx_driving/csrc/RotatedIou.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "functions.h" -#include "torch_npu/csrc/framework/OpCommand.h" +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" namespace { at::Tensor& rotated_iou_npu_nocheck(at::Tensor& iou, const at::Tensor& boxes, const at::Tensor& query_boxes, bool trans, diff --git a/mx_driving/detection/ops/csrc/RotatedOverlaps.cpp b/mx_driving/csrc/RotatedOverlaps.cpp similarity index 70% rename from mx_driving/detection/ops/csrc/RotatedOverlaps.cpp rename to mx_driving/csrc/RotatedOverlaps.cpp index 2f49abd989a06f0372028c39d2e75fba9c717a9c..ee1f952e138176f63637da3c77a9e71e8b23b437 100644 --- a/mx_driving/detection/ops/csrc/RotatedOverlaps.cpp +++ b/mx_driving/csrc/RotatedOverlaps.cpp @@ -14,35 +14,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "torch_npu/csrc/framework/OpCommand.h" -#include "functions.h" +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" namespace { -at::Tensor &rotated_overlaps_npu_nocheck( - at::Tensor &overlaps, - const at::Tensor &self, - const at::Tensor &query_boxes, - bool trans) +at::Tensor& rotated_overlaps_npu_nocheck( + at::Tensor& overlaps, const at::Tensor& self, const at::Tensor& query_boxes, bool trans) { at_npu::native::OpCommand cmd; - cmd.Name("RotatedOverlaps") - .Input(self) - .Input(query_boxes) - .Output(overlaps) - .Attr("trans", trans) - .Run(); + cmd.Name("RotatedOverlaps").Input(self).Input(query_boxes).Output(overlaps).Attr("trans", trans).Run(); return overlaps; } } // namespace -at::Tensor npu_rotated_overlaps( - const at::Tensor &self, - const at::Tensor &query_boxes, - bool trans) +at::Tensor npu_rotated_overlaps(const at::Tensor& self, const at::Tensor& query_boxes, bool trans) { TORCH_CHECK(self.ndimension() == 3 && query_boxes.ndimension() == 3, - "boxes' dim should be equal to query_boxes' ndimension() ", - "and equal to 3!"); + "boxes' dim should be equal to query_boxes' ndimension() ", "and equal to 3!"); auto origin_dtype = self.scalar_type(); // the Op only support fp32 currently! at::Tensor self_cp = self.to(at::kFloat).permute({0, 2, 1}); diff --git a/mx_driving/common/ops/csrc/ScatterMax.cpp b/mx_driving/csrc/ScatterMax.cpp similarity index 80% rename from mx_driving/common/ops/csrc/ScatterMax.cpp rename to mx_driving/csrc/ScatterMax.cpp index 37c06e1523fa098d214e40edaa6407b2c4c785cf..a1c4ee4102716878fa9f590b1f5d75e33ea600bd 100644 --- a/mx_driving/common/ops/csrc/ScatterMax.cpp +++ b/mx_driving/csrc/ScatterMax.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" using namespace std; @@ -23,7 +23,7 @@ namespace { constexpr uint32_t MAX_INDICES_VALUE = 120000; constexpr uint32_t SUPPORT_UPDATES = 32; constexpr uint32_t MAX_SUPPORT_UPDATES = 512; -} +} // namespace void npu_scatter_max_check(const at::Tensor& updates, const at::Tensor& indices, const at::Tensor& result) { @@ -42,12 +42,14 @@ void npu_scatter_max_check(const at::Tensor& updates, const at::Tensor& indices, for (size_t i = 1; i < result.dim(); i++) { TORCH_CHECK(updatesSizes[i] == resultSizes[i], "updates and out should have the same size except for dim 0."); } - TORCH_CHECK(indicesLength == 1, "all the dims's range except the first dim of input tensor [indices] should be equal to 1."); - TORCH_CHECK(indices.sizes()[0] == updates.sizes()[0], "input's updates size of dim 0 should be equal to indices's size."); + TORCH_CHECK(indicesLength == 1, + "all the dims's range except the first dim of input tensor [indices] should be equal to 1."); + TORCH_CHECK( + indices.sizes()[0] == updates.sizes()[0], "input's updates size of dim 0 should be equal to indices's size."); } -std::tuple scatter_max_with_argmax_v2(const at::Tensor& updates, const at::Tensor& indices, - c10::optional out) +std::tuple scatter_max_with_argmax_v2( + const at::Tensor& updates, const at::Tensor& indices, c10::optional out) { auto sizes = updates.sizes().vec(); auto indicesMax = indices.max().item().toLong(); @@ -63,8 +65,7 @@ std::tuple scatter_max_with_argmax_v2(const at::Tensor& return std::tie(result, argmax); } -at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, - const at::Tensor& num_segments) +at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const 
at::Tensor& num_segments) { c10::SmallVector output_size; @@ -78,6 +79,12 @@ at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segme at::Tensor out = at::empty(output_size, x.options()); at_npu::native::OpCommand cmd; - cmd.Name("UnsortedSegmentSum").Input(x).Input(segment_ids).Input(num_segments).Output(out).Attr("check_ids", true).Run(); + cmd.Name("UnsortedSegmentSum") + .Input(x) + .Input(segment_ids) + .Input(num_segments) + .Output(out) + .Attr("check_ids", true) + .Run(); return out; -} \ No newline at end of file +} diff --git a/mx_driving/common/ops/csrc/ScatterMeanGrad.cpp b/mx_driving/csrc/ScatterMeanGrad.cpp similarity index 91% rename from mx_driving/common/ops/csrc/ScatterMeanGrad.cpp rename to mx_driving/csrc/ScatterMeanGrad.cpp index d7ada84800518c07fa373a4a13a58207e3f81f85..1f714f89fb353957630cde9bbf235b712e978444 100644 --- a/mx_driving/common/ops/csrc/ScatterMeanGrad.cpp +++ b/mx_driving/csrc/ScatterMeanGrad.cpp @@ -13,9 +13,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include + #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Tensor& count, int32_t dim) { @@ -30,10 +30,8 @@ at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Te TORCH_CHECK(grad_out.scalar_type() == at::kFloat, "grad_out: float32 tensor expected but got a tensor with dtype: ", grad_out.scalar_type()); TORCH_CHECK(index.scalar_type() == at::kInt, - "index: int32 tensor expected but got a tensor with dtype: ", - index.scalar_type()); - TORCH_CHECK(grad_out_dims != 0 && index_dims != 0, - "grad_out and index should not be empty"); + "index: int32 tensor expected but got a tensor with dtype: ", index.scalar_type()); + TORCH_CHECK(grad_out_dims != 0 && index_dims != 0, "grad_out and index should not be empty"); c10::SmallVector grad_in_size; for (uint32_t i = 0; i < grad_out_dims; i++) { @@ -42,8 +40,7 @@ at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Te dim = (dim + index_dims) % index_dims; grad_in_size[dim] = index_size[dim]; for (uint32_t i = 0; i < grad_out_dims; i++) { - TORCH_CHECK(i >= index_dims || grad_in_size[i] == index_size[i], - "the shape except dim should be the same"); + TORCH_CHECK(i >= index_dims || grad_in_size[i] == index_size[i], "the shape except dim should be the same"); } uint64_t tail = 1; for (uint32_t i = index_dims; i < grad_out_dims; i++) { @@ -67,4 +64,4 @@ at::Tensor npu_scatter_mean_grad(at::Tensor& grad_out, at::Tensor& index, at::Te EXEC_NPU_CMD(aclnnScatterMeanGrad, grad_out, index, count, dim, result); } return result; -} \ No newline at end of file +} diff --git a/mx_driving/common/ops/csrc/SortPairs.cpp b/mx_driving/csrc/SortPairs.cpp similarity index 89% rename from mx_driving/common/ops/csrc/SortPairs.cpp rename to mx_driving/csrc/SortPairs.cpp index 9f60bcfce2efb8f5926879cbf43dafb16a711075..9ea09a4dfcd6127fed4890b8c6ad5dee5c857350 100644 --- a/mx_driving/common/ops/csrc/SortPairs.cpp +++ b/mx_driving/csrc/SortPairs.cpp @@ -15,7 +15,7 @@ // limitations under the License. 
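Unlike most wrappers in this patch, npu_scatter_max_backward above goes through the graph-mode at_npu::native::OpCommand builder rather than EXEC_NPU_CMD. Pulled out as a standalone helper, the pattern reads as follows; it mirrors the UnsortedSegmentSum call above, and only the helper name is new.

#include "csrc/OpApiCommon.h"  // assumed to make at_npu::native::OpCommand visible

at::Tensor run_unsorted_segment_sum(
    const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments, at::Tensor& out)
{
    at_npu::native::OpCommand cmd;
    cmd.Name("UnsortedSegmentSum")
        .Input(x)
        .Input(segment_ids)
        .Input(num_segments)
        .Output(out)
        .Attr("check_ids", true)
        .Run();
    return out;
}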
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" /** * @brief sort key-value pairs @@ -25,8 +25,8 @@ * @param descending: true-descending, false-ascending * @return (keys_out, values_out): (sorted keys, corresponding values of sorted keys) */ -std::tuple npu_sort_pairs(const at::Tensor &keys_in, const at::Tensor &values_in, - int64_t dim, bool descending) +std::tuple npu_sort_pairs( + const at::Tensor& keys_in, const at::Tensor& values_in, int64_t dim, bool descending) { TORCH_CHECK_NPU(keys_in); TORCH_CHECK_NPU(values_in); diff --git a/mx_driving/spconv/ops/csrc/SparseConv3d.cpp b/mx_driving/csrc/SparseConv3d.cpp similarity index 68% rename from mx_driving/spconv/ops/csrc/SparseConv3d.cpp rename to mx_driving/csrc/SparseConv3d.cpp index beee90cb9b5efcf0f95faac009b4efcc36642206..e92532cf98875657398b4914d78242ffa85c78b8 100644 --- a/mx_driving/spconv/ops/csrc/SparseConv3d.cpp +++ b/mx_driving/csrc/SparseConv3d.cpp @@ -15,33 +15,31 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" -#include +#include "csrc/functions.h" -std::tuple npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - int out_channel, at::IntArrayRef outSpatialShape, int batch_size) +std::tuple npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size, + at::IntArrayRef stride, at::IntArrayRef padding, int out_channel, at::IntArrayRef outSpatialShape, int batch_size) { TORCH_CHECK_NPU(indices); - TORCH_CHECK(out_channel <= 128, "out_channel must less or equal than 128 expected but got out_channel: ", - out_channel); - TORCH_CHECK(out_channel % 8 == 0, "out_channel must be divisible by 8 but got out_channel: ", - out_channel); + TORCH_CHECK( + out_channel <= 128, "out_channel must less or equal than 128 expected but got out_channel: ", out_channel); + TORCH_CHECK(out_channel % 8 == 0, "out_channel must be divisible by 8 but got out_channel: ", out_channel); auto indices_size = indices.sizes(); int64_t kernelsum = 1; for (int32_t i = 0; i < kernel_size.size(); i++) { - kernelsum *= kernel_size[i]; + kernelsum *= kernel_size[i]; } int64_t outputsum = indices_size[0] * kernelsum; c10::SmallVector indices_out_size = {outputsum}; c10::SmallVector indices_pairs_size = {outputsum, indices_size[1]}; - c10::SmallVector spatial_size = {batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel}; - at::IntArrayRef outputShape = at::IntArrayRef(spatial_size); + c10::SmallVector spatial_size = { + batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel}; + at::IntArrayRef outputShape = at::IntArrayRef(spatial_size); at::Tensor indices_out = at::empty(indices_out_size, indices.options()).fill_(-1); at::Tensor indices_pairs = at::empty(indices_pairs_size, indices.options()).fill_(-1); - EXEC_NPU_CMD(aclnnSparseConv3d, indices, kernel_size, outputShape, - stride, padding, indices_out, indices_pairs); + EXEC_NPU_CMD(aclnnSparseConv3d, indices, kernel_size, outputShape, stride, padding, indices_out, indices_pairs); return std::tie(indices_pairs, indices_out); -} \ No newline at end of file +} diff --git a/mx_driving/spconv/ops/csrc/SparseConv3dGrad.cpp b/mx_driving/csrc/SparseConv3dGrad.cpp similarity index 81% rename from mx_driving/spconv/ops/csrc/SparseConv3dGrad.cpp rename to mx_driving/csrc/SparseConv3dGrad.cpp index 5192def8082e40f3ef5c80fe3035c51cde0d99e1..c5549b328bedd843eac30fbc9da36e65866cdfc6 100644 --- 
a/mx_driving/spconv/ops/csrc/SparseConv3dGrad.cpp +++ b/mx_driving/csrc/SparseConv3dGrad.cpp @@ -15,11 +15,11 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" -#include +#include "csrc/functions.h" -std::tuple npu_sparse_conv3d_grad(const at::Tensor& indices_offset, const at::Tensor& former_sorted_indices, - const at::Tensor& feature, const at::Tensor& weight, const at::Tensor& grad) +std::tuple npu_sparse_conv3d_grad(const at::Tensor& indices_offset, + const at::Tensor& former_sorted_indices, const at::Tensor& feature, const at::Tensor& weight, + const at::Tensor& grad) { TORCH_CHECK_NPU(indices_offset); TORCH_CHECK_NPU(former_sorted_indices); @@ -33,7 +33,7 @@ std::tuple npu_sparse_conv3d_grad(const at::Tensor& indi int64_t kernelsum = 1; for (int32_t i = 0; i < weight_size.size() - 2; i++) { - kernelsum *= weight_size[i]; + kernelsum *= weight_size[i]; } int64_t kernelIC = weight_size[3]; int64_t kernelOC = weight_size[4]; @@ -44,6 +44,7 @@ std::tuple npu_sparse_conv3d_grad(const at::Tensor& indi at::Tensor feature_grad = at::zeros(feature_grad_size, feature.options()); at::Tensor weight_grad = at::zeros(weight_size, feature.options()); - EXEC_NPU_CMD(aclnnSparseConv3dGradV2, indices_offset, former_sorted_indices, feature, weight_trans, grad, feature_grad, weight_grad); + EXEC_NPU_CMD(aclnnSparseConv3dGradV2, indices_offset, former_sorted_indices, feature, weight_trans, grad, + feature_grad, weight_grad); return std::tie(feature_grad, weight_grad); -} \ No newline at end of file +} diff --git a/mx_driving/spconv/ops/csrc/SparseInverseConv3d.cpp b/mx_driving/csrc/SparseInverseConv3d.cpp similarity index 67% rename from mx_driving/spconv/ops/csrc/SparseInverseConv3d.cpp rename to mx_driving/csrc/SparseInverseConv3d.cpp index b9f9fd6a1bc242731884308db443f1f86cf631f8..e4ad5023907a764a8f6f4748da807016303c2f7a 100644 --- a/mx_driving/spconv/ops/csrc/SparseInverseConv3d.cpp +++ b/mx_driving/csrc/SparseInverseConv3d.cpp @@ -15,13 +15,12 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" -#include +#include "csrc/functions.h" -std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, const at::Tensor& indices, const at::Tensor& weight, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - at::IntArrayRef dilation, at::IntArrayRef output_padding, - int out_channel, at::IntArrayRef outSpatialShape, int batch_size) +std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, + const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, + at::IntArrayRef padding, at::IntArrayRef dilation, at::IntArrayRef output_padding, int out_channel, + at::IntArrayRef outSpatialShape, int batch_size) { // check Tensor Device is NPU TORCH_CHECK_NPU(feature); @@ -31,7 +30,7 @@ std::tuple npu_sparse_inverse_conv3d(const a // Calculate kernelSize int64_t kernelsum = 1; for (int32_t i = 0; i < kernel_size.size(); i++) { - kernelsum *= kernel_size[i]; + kernelsum *= kernel_size[i]; } // to create memory of teh output auto indices_size = indices.sizes(); @@ -44,11 +43,12 @@ std::tuple npu_sparse_inverse_conv3d(const a at::Tensor indices_out = at::empty(indices_out_size, indices.options()).fill_(-1); at::Tensor indices_pairs = at::empty(indices_pairs_size, indices.options()).fill_(-1); - c10::SmallVector spatial_size = {batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel}; - at::IntArrayRef outputShape = at::IntArrayRef(spatial_size); + c10::SmallVector spatial_size = { + batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel}; + at::IntArrayRef outputShape = at::IntArrayRef(spatial_size); // weight [,,,in_channels, out_channels] -> [,,,out_channels, in_channels] at::Tensor weight_trans = weight.transpose(-1, -2).contiguous(); - EXEC_NPU_CMD(aclnnSparseInverseConv3d, feature, indices, weight_trans, outputShape, - stride, padding, dilation, output_padding, out, indices_out, indices_pairs); + EXEC_NPU_CMD(aclnnSparseInverseConv3d, feature, indices, weight_trans, outputShape, stride, padding, dilation, + output_padding, out, indices_out, indices_pairs); return std::tie(out, indices_pairs, indices_out); -} \ No newline at end of file +} diff --git a/mx_driving/spconv/ops/csrc/SubmSparseCov3d.cpp b/mx_driving/csrc/SubmSparseCov3d.cpp similarity index 72% rename from mx_driving/spconv/ops/csrc/SubmSparseCov3d.cpp rename to mx_driving/csrc/SubmSparseCov3d.cpp index e0502bde8c2257e7969d73d1c162f916da31b039..140179189a3df108618240f761c7174b70f8b788 100644 --- a/mx_driving/spconv/ops/csrc/SubmSparseCov3d.cpp +++ b/mx_driving/csrc/SubmSparseCov3d.cpp @@ -15,20 +15,18 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -std::tuple npu_subm_sparse_conv3d(const at::Tensor& feature, const at::Tensor& indices, - const at::Tensor& weight, - at::IntArrayRef kernel_size, int out_channel, - at::IntArrayRef outSpatialShape, int batch_size, - const at::Tensor& temp) +std::tuple npu_subm_sparse_conv3d(const at::Tensor& feature, + const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, int out_channel, + at::IntArrayRef outSpatialShape, int batch_size, const at::Tensor& temp) { auto indices_size = indices.sizes(); auto feature_size = feature.sizes(); auto weight_dim = weight.dim(); int64_t kernelsum = 1; for (int32_t i = 0; i < kernel_size.size(); i++) { - kernelsum *= kernel_size[0]; + kernelsum *= kernel_size[0]; } int64_t outputsum = indices_size[0] * kernelsum; c10::SmallVector output_size = {indices_size[0], kernelsum, feature_size[1]}; @@ -38,7 +36,7 @@ std::tuple npu_subm_sparse_conv3d(const at:: at::Tensor out = at::empty(output_size, feature.options()).fill_(0); at::Tensor indices_out = at::empty(indices_out_size, feature.options().dtype(at::kInt)).fill_(-1); at::Tensor indices_pairs = at::empty(indices_pairs_size, feature.options().dtype(at::kInt)); - EXEC_NPU_CMD(aclnnSubmSparseConv3d, feature, indices_trans, weight, temp, kernel_size, - out_channel, outSpatialShape, batch_size, out, indices_out, indices_pairs); + EXEC_NPU_CMD(aclnnSubmSparseConv3d, feature, indices_trans, weight, temp, kernel_size, out_channel, outSpatialShape, + batch_size, out, indices_out, indices_pairs); return std::tie(out, indices_pairs, indices_out); } diff --git a/mx_driving/common/ops/csrc/ThreeInterpolate.cpp b/mx_driving/csrc/ThreeInterpolate.cpp similarity index 72% rename from mx_driving/common/ops/csrc/ThreeInterpolate.cpp rename to mx_driving/csrc/ThreeInterpolate.cpp index f781887dec014c554528a88f54bb0b9eb28c15fa..a56646351f75ca86ee651e18d521522f42482863 100644 --- a/mx_driving/common/ops/csrc/ThreeInterpolate.cpp +++ b/mx_driving/csrc/ThreeInterpolate.cpp @@ -15,9 +15,10 @@ // limitations under the License. 
#include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& points, const at::Tensor& idx, const at::Tensor& weight) +at::Tensor npu_three_interpolate( + int b, int c, int m, int n, const at::Tensor& points, const at::Tensor& idx, const at::Tensor& weight) { TORCH_CHECK_NPU(points); TORCH_CHECK_NPU(idx); @@ -27,8 +28,10 @@ at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& p auto idx_dtype = idx.scalar_type(); auto weight_dtype = weight.scalar_type(); - TORCH_CHECK((point_dtype == at::kFloat || point_dtype == at::kHalf), "three_interpolate_forward ascend only support fp32 and fp16."); - TORCH_CHECK((weight_dtype == at::kFloat || weight_dtype == at::kHalf), "three_interpolate_forward ascend only support fp32 and fp16."); + TORCH_CHECK((point_dtype == at::kFloat || point_dtype == at::kHalf), + "three_interpolate_forward ascend only support fp32 and fp16."); + TORCH_CHECK((weight_dtype == at::kFloat || weight_dtype == at::kHalf), + "three_interpolate_forward ascend only support fp32 and fp16."); TORCH_CHECK((point_dtype == weight_dtype), "input dtype is inconsistent."); TORCH_CHECK((idx_dtype == at::kInt), "indices: int32 tensor expected but got a tensor with dtype: ", idx_dtype); @@ -36,13 +39,15 @@ at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& p auto idx_size = idx.sizes(); auto weight_size = weight.sizes(); - TORCH_CHECK((point_size.size() == 3 && idx_size.size() == 3 && weight_size.size() == 3), "input dimension should be 3."); - TORCH_CHECK((point_size[0] == idx_size[0] && point_size[0] == weight_size[0] && idx_size[0] == weight_size[0]), "the first dimension of input should be the same."); + TORCH_CHECK( + (point_size.size() == 3 && idx_size.size() == 3 && weight_size.size() == 3), "input dimension should be 3."); + TORCH_CHECK((point_size[0] == idx_size[0] && point_size[0] == weight_size[0] && idx_size[0] == weight_size[0]), + "the first dimension of input should be the same."); TORCH_CHECK((idx_size[1] == weight_size[1]), "the second dimension of indices and weight should be the same."); TORCH_CHECK((idx_size[2] == 3 && weight_size[2] == 3), "the third dimension of indices and weight should be 3."); TORCH_CHECK((b < 10001 && c < 10001 && m < 10001 && n < 10001), "input dimension is too heavy."); - + auto point_c_trans = points.transpose(1, 2).to(at::kFloat); auto weight_cast = weight.to(at::kFloat); @@ -50,13 +55,8 @@ at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& p at::Tensor out_cast = at::zeros(output_size, points.options()).to(at::kFloat); at_npu::native::OpCommand cmd; - cmd.Name("ThreeInterpolate") - .Input(point_c_trans) - .Input(idx) - .Input(weight_cast) - .Output(out_cast) - .Run(); - + cmd.Name("ThreeInterpolate").Input(point_c_trans).Input(idx).Input(weight_cast).Output(out_cast).Run(); + auto out = out_cast; if (point_dtype == at::kHalf) { out = out_cast.to(at::kHalf); @@ -64,11 +64,12 @@ at::Tensor npu_three_interpolate(int b, int c, int m, int n, const at::Tensor& p auto output = out_cast.view({b, n, c}).transpose(1, 2); auto res = output.contiguous(); out.copy_(res); - + return out; } -at::Tensor npu_three_interpolate_backward(int b, int c, int n, int m, const at::Tensor& grad_out, const at::Tensor& idx, const at::Tensor& weight) +at::Tensor npu_three_interpolate_backward( + int b, int c, int n, int m, const at::Tensor& grad_out, const at::Tensor& idx, const at::Tensor& 
weight) { TORCH_CHECK_NPU(grad_out); TORCH_CHECK_NPU(idx); @@ -78,8 +79,10 @@ at::Tensor npu_three_interpolate_backward(int b, int c, int n, int m, const at:: auto idx_dtype = idx.scalar_type(); auto weight_dtype = weight.scalar_type(); - TORCH_CHECK((grad_dtype == at::kFloat || grad_dtype == at::kHalf), "three_interpolate_forward ascend only support fp32 and fp16."); - TORCH_CHECK((weight_dtype == at::kFloat || weight_dtype == at::kHalf), "three_interpolate_forward ascend only support fp32 and fp16."); + TORCH_CHECK((grad_dtype == at::kFloat || grad_dtype == at::kHalf), + "three_interpolate_forward ascend only support fp32 and fp16."); + TORCH_CHECK((weight_dtype == at::kFloat || weight_dtype == at::kHalf), + "three_interpolate_forward ascend only support fp32 and fp16."); TORCH_CHECK((grad_dtype == weight_dtype), "input dtype is inconsistent."); TORCH_CHECK((idx_dtype == at::kInt), "indices: int32 tensor expected but got a tensor with dtype: ", idx_dtype); @@ -87,11 +90,14 @@ at::Tensor npu_three_interpolate_backward(int b, int c, int n, int m, const at:: auto idx_size = idx.sizes(); auto weight_size = weight.sizes(); - TORCH_CHECK((grad_size.size() == 3 && idx_size.size() == 3 && weight_size.size() == 3), "the input dimension should be 3."); - TORCH_CHECK((grad_size[0] == idx_size[0] && grad_size[0] == weight_size[0] && idx_size[0] == weight_size[0]), "the first dimension of input should be the same."); - TORCH_CHECK((grad_size[2] == idx_size[1] && grad_size[2] == weight_size[1] && idx_size[1] == weight_size[1]), "the second dimension of indices and weight should be the same."); + TORCH_CHECK( + (grad_size.size() == 3 && idx_size.size() == 3 && weight_size.size() == 3), "the input dimension should be 3."); + TORCH_CHECK((grad_size[0] == idx_size[0] && grad_size[0] == weight_size[0] && idx_size[0] == weight_size[0]), + "the first dimension of input should be the same."); + TORCH_CHECK((grad_size[2] == idx_size[1] && grad_size[2] == weight_size[1] && idx_size[1] == weight_size[1]), + "the second dimension of indices and weight should be the same."); TORCH_CHECK((idx_size[2] == 3 && weight_size[2] == 3), "the third dimension of indices and weight should be 3."); - + TORCH_CHECK((b < 10001 && c < 10001 && m < 10001 && n < 10001), "input dimension is too heavy."); at::Tensor grad_points = at::zeros({b, c, m}, grad_out.options()); diff --git a/mx_driving/point/ops/csrc/UniqueVoxel.cpp b/mx_driving/csrc/UniqueVoxel.cpp similarity index 94% rename from mx_driving/point/ops/csrc/UniqueVoxel.cpp rename to mx_driving/csrc/UniqueVoxel.cpp index 31ac61f10e9e51c14d11b31c1eef4014997b0449..eed06c2eccd65c81b5a1a9e790ac82b9af6fdd85 100644 --- a/mx_driving/point/ops/csrc/UniqueVoxel.cpp +++ b/mx_driving/csrc/UniqueVoxel.cpp @@ -14,16 +14,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include "csrc/OpApiCommon.h" -#include "functions.h" - +#include "csrc/functions.h" std::tuple unique_voxel(const at::Tensor& voxels) { TORCH_CHECK_NPU(voxels); TORCH_CHECK(voxels.dim() == 1, "voxels.dim() must be 1, but got: ", voxels.dim()); - TORCH_CHECK(voxels.dtype() == at::kFloat || voxels.dtype() == at::kInt, "voxels.dtype() must be float or int32, but got: ", voxels.dtype()); + TORCH_CHECK(voxels.dtype() == at::kFloat || voxels.dtype() == at::kInt, + "voxels.dtype() must be float or int32, but got: ", voxels.dtype()); size_t num_points = voxels.size(0); diff --git a/mx_driving/point/ops/csrc/VecPoolBackward.cpp b/mx_driving/csrc/VecPoolBackward.cpp similarity index 54% rename from mx_driving/point/ops/csrc/VecPoolBackward.cpp rename to mx_driving/csrc/VecPoolBackward.cpp index aaa3e1af547cc203b31babefcac6f0cef98b341f..1ad326b26a6caa4b9eae10dff88ab2c8c0ddab89 100644 --- a/mx_driving/point/ops/csrc/VecPoolBackward.cpp +++ b/mx_driving/csrc/VecPoolBackward.cpp @@ -15,24 +15,16 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" -at::Tensor vec_pool_backward(const at::Tensor& grad_new_features, - const at::Tensor& point_cnt_of_grid, - const at::Tensor& grouped_idxs, - const int64_t n, - const int64_t num_c_in) +at::Tensor vec_pool_backward(const at::Tensor& grad_new_features, const at::Tensor& point_cnt_of_grid, + const at::Tensor& grouped_idxs, const int64_t n, const int64_t num_c_in) { - TORCH_CHECK(grad_new_features.dim() == 2, - "grad_new_features.dim() must be 2, but got: ", grad_new_features.dim()); - TORCH_CHECK(point_cnt_of_grid.dim() == 2, - "point_cnt_of_grid.dim() must be 2, but got: ", point_cnt_of_grid.dim()); - TORCH_CHECK(grouped_idxs.dim() == 2, - "grouped_idxs.dim() must be 2, but got: ", grouped_idxs.dim()); - TORCH_CHECK(point_cnt_of_grid.size(1) != 0, - "numTotalGrids can not be 0."); - TORCH_CHECK(grouped_idxs.size(1) == 3, - "grouped_idxs.shape[1] must be 3, but got: ", grouped_idxs.size(1)); + TORCH_CHECK(grad_new_features.dim() == 2, "grad_new_features.dim() must be 2, but got: ", grad_new_features.dim()); + TORCH_CHECK(point_cnt_of_grid.dim() == 2, "point_cnt_of_grid.dim() must be 2, but got: ", point_cnt_of_grid.dim()); + TORCH_CHECK(grouped_idxs.dim() == 2, "grouped_idxs.dim() must be 2, but got: ", grouped_idxs.dim()); + TORCH_CHECK(point_cnt_of_grid.size(1) != 0, "numTotalGrids can not be 0."); + TORCH_CHECK(grouped_idxs.size(1) == 3, "grouped_idxs.shape[1] must be 3, but got: ", grouped_idxs.size(1)); auto output_size = {n, num_c_in}; at::Tensor out = at::zeros(output_size, grad_new_features.options()); EXEC_NPU_CMD(aclnnVecPoolGrad, grad_new_features, point_cnt_of_grid, grouped_idxs, n, num_c_in, out); diff --git a/mx_driving/point/ops/csrc/VoxelPoolingTrain.cpp b/mx_driving/csrc/VoxelPoolingTrain.cpp similarity index 98% rename from mx_driving/point/ops/csrc/VoxelPoolingTrain.cpp rename to mx_driving/csrc/VoxelPoolingTrain.cpp index 5341a87cd7f9519b419cd9dcd7a740a8440f040d..7657918e5c4183088d1177b5f55843f140c3c76b 100644 --- a/mx_driving/point/ops/csrc/VoxelPoolingTrain.cpp +++ b/mx_driving/csrc/VoxelPoolingTrain.cpp @@ -14,10 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" std::tuple voxel_pooling_train(const at::Tensor& inputFeatures, const at::Tensor& geom, at::Tensor& outputFeatures, at::Tensor& posMemo, int batchSize, int numPoints, int numChannels, int numVoxelX, @@ -70,4 +68,4 @@ at::Tensor voxel_pool_train_backward(const at::Tensor& gradOut, const at::Tensor out = out.to(at::kHalf); } return out; -} \ No newline at end of file +} diff --git a/mx_driving/point/ops/csrc/VoxelToPoint.cpp b/mx_driving/csrc/VoxelToPoint.cpp similarity index 98% rename from mx_driving/point/ops/csrc/VoxelToPoint.cpp rename to mx_driving/csrc/VoxelToPoint.cpp index 3d1bc37fd4a3d34d251351fb05e7bf95d8f2b98f..fd6463333b6e6a040b5e41dd0b135243ba97f0d3 100644 --- a/mx_driving/point/ops/csrc/VoxelToPoint.cpp +++ b/mx_driving/csrc/VoxelToPoint.cpp @@ -15,7 +15,7 @@ // limitations under the License. #include "csrc/OpApiCommon.h" -#include "functions.h" +#include "csrc/functions.h" constexpr float DEFAULT_VALUE = -1.0f; diff --git a/mx_driving/csrc/pybind.cpp b/mx_driving/csrc/pybind.cpp new file mode 100644 index 0000000000000000000000000000000000000000..71cd0c44dcaf924c1e4ea5b16959c55dbfd47143 --- /dev/null +++ b/mx_driving/csrc/pybind.cpp @@ -0,0 +1,196 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "csrc/functions.h" +#include + +#include +#include + +std::string g_opApiSoPath; +std::once_flag init_flag; // Flag for one-time initialization + +void init_op_api_so_path(const std::string& path) +{ + std::call_once(init_flag, [&]() { g_opApiSoPath = path; }); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("_init_op_api_so_path", &init_op_api_so_path); + // knn + m.def("knn", &knn); + + // npu_scatter_mean_grad + m.def("npu_scatter_mean_grad", &npu_scatter_mean_grad); + + // three_interpolate + m.def("npu_three_interpolate", &npu_three_interpolate); + m.def("npu_three_interpolate_backward", &npu_three_interpolate_backward); + + // scatter_mean + m.def("npu_scatter_mean", &npu_scatter_mean, "npu_scatter_mean NPU version"); + + // scatter_max + m.def("scatter_max_with_argmax_v2", &scatter_max_with_argmax_v2); + m.def("npu_scatter_max_backward", &npu_scatter_max_backward); + + // npu_sort_pairs + m.def("npu_sort_pairs", &npu_sort_pairs, "sort_pairs NPU version"); + + // npu_hypot + m.def("npu_hypot", &npu_hypot); + m.def("npu_hypot_grad", &npu_hypot_grad); + + // assign_score_withk + m.def("assign_score_withk", &assign_score_withk); + // nms3d_normal + m.def("nms3d_normal", &nms3d_normal); + + // nms3d + m.def("nms3d", &nms3d); + + // roated overlap + m.def("npu_rotated_overlaps", &npu_rotated_overlaps, "npu_rotated_overlap NPU version"); + + // rotated iou + m.def("npu_rotated_iou", &npu_rotated_iou); + + // npu_boxes_overlap_bev + m.def("npu_boxes_overlap_bev", &npu_boxes_overlap_bev, "boxes_overlap_bev NPU version"); + + // roi_align_rotated_v2_forward_npu + m.def("roi_align_rotated_v2_forward_npu", &roi_align_rotated_v2_forward_npu); + + // npu_roi_align_rotated_grad_v2 + m.def("npu_roi_align_rotated_grad_v2", &npu_roi_align_rotated_grad_v2); + + // npu_box_iou_quadri + m.def("npu_box_iou_quadri", &npu_box_iou_quadri, "box_iou_quadri NPU version"); + + // npu_box_iou_rotated + m.def("npu_box_iou_rotated", &npu_box_iou_rotated, "box_iou_rotated NPU version"); + + // border_align_forward_npu + m.def("border_align", &border_align); + + // border_align_backward_npu + m.def("border_align_backward", &border_align_backward); + + // npu_roiaware_pool3d_forward + m.def("npu_roiaware_pool3d_forward", &npu_roiaware_pool3d_forward); + + // roiaware_pool3d_grad + m.def("roiaware_pool3d_grad", &roiaware_pool3d_grad, "roiaware_pool3d_grad NPU version"); + + // pixel_group + m.def("pixel_group", &pixel_group); + + // nnpu_max_pool2d + m.def("npu_max_pool2d", &npu_max_pool2d); + // mullti_scale_deformable_attn + m.def("multi_scale_deformable_attn", &multi_scale_deformable_attn); + m.def("multi_scale_deformable_attn_backward", &multi_scale_deformable_attn_backward); + + // npu_add_relu + m.def("npu_add_relu", &npu_add_relu); + m.def("npu_add_relu_grad", &npu_add_relu_grad); + + // fused_bias_leaky_relu + m.def("fused_bias_leaky_relu", &fused_bias_leaky_relu); + + // npu_deformable_aggregation + m.def("npu_deformable_aggregation", &deformable_aggregation); + m.def("npu_deformable_aggregation_backward", &deformable_aggregation_backward); + + // deformable_conv2d + m.def("deformable_conv2d", &deformable_conv2d); + m.def("modulated_deformable_conv2d", &modulated_deformable_conv2d); + m.def("deformable_conv2d_backward", &deformable_conv2d_backward); + m.def("modulated_deformable_conv2d_backward", &modulated_deformable_conv2d_backward); + + // npu_geometric_kernel_attention_func + m.def("npu_geometric_kernel_attention", &npu_geometric_kernel_attention); + 
m.def("npu_geometric_kernel_attention_backward", &npu_geometric_kernel_attention_backward); + + // group_points + m.def("group_points", &group_points); + m.def("group_points_backward", &group_points_backward); + + // vec_pool + m.def("vec_pool_backward", &vec_pool_backward); + + m.def("point_to_voxel", &point_to_voxel); + + m.def("voxel_to_point", &voxel_to_point); + + m.def("unique_voxel", &unique_voxel); + + m.def("hard_voxelize", &hard_voxelize); + + // bev_pool + m.def("npu_bev_pool", &npu_bev_pool, "npu_bev_pool NPU version"); + m.def("npu_bev_pool_backward", &npu_bev_pool_backward, "npu_bev_pool_backward NPU version"); + m.def("npu_bev_pool_v2", &npu_bev_pool_v2, "npu_bev_pool_v2 NPU version"); + m.def("npu_bev_pool_v2_backward", &npu_bev_pool_v2_backward, "npu_bev_pool_v2_backward NPU version"); + m.def("npu_bev_pool_v3", &npu_bev_pool_v3, "npu_bev_pool_v3 NPU version"); + m.def("npu_bev_pool_v3_backward", &npu_bev_pool_v3_backward, "npu_bev_pool_v3_backward NPU version"); + + // furthest_points_sampling_with_dist + m.def("furthest_point_sampling_with_dist", &furthest_point_sampling_with_dist); + + // npu_dynamic_scatter + m.def("npu_dynamic_scatter", &npu_dynamic_scatter); + m.def("npu_dynamic_scatter_grad", &npu_dynamic_scatter_grad); + + // dyn_voxelization + m.def("dynamic_voxelization", &dynamic_voxelization); + + // npu_furthest_point_sampling + m.def("npu_furthest_point_sampling", &npu_furthest_point_sampling); + + // voxel_pooling + m.def("voxel_pooling_train", &voxel_pooling_train); + m.def("voxel_pool_train_backward", &voxel_pool_train_backward); + + // npu_points_in_box + m.def("npu_points_in_box", &npu_points_in_box); + + // npu_points_in_box_all + m.def("npu_points_in_box_all", &npu_points_in_box_all); + + // npu_roipoint_pool3d_forward + m.def("npu_roipoint_pool3d_forward", &npu_roipoint_pool3d_forward); + + // npu_subm_sparse_conv3d + m.def("npu_subm_sparse_conv3d", &npu_subm_sparse_conv3d); + + // npu_sparse_conv3d + m.def("npu_sparse_conv3d", &npu_sparse_conv3d); + + // npu_sparse_inverse_conv3d + m.def("npu_sparse_inverse_conv3d", &npu_sparse_inverse_conv3d); + + // multi_to_sparse + m.def("multi_to_sparse", &multi_to_sparse); + + // multi_to_sparse_v2 + m.def("multi_to_sparse_v2", &multi_to_sparse_v2); + + // npu_sparse_conv3d_grad + m.def("npu_sparse_conv3d_grad", &npu_sparse_conv3d_grad); + + m.def("npu_prepare_subm_conv3d", &npu_prepare_subm_conv3d); +} diff --git a/mx_driving/common/ops/csrc/scatterMean.cpp b/mx_driving/csrc/scatterMean.cpp similarity index 88% rename from mx_driving/common/ops/csrc/scatterMean.cpp rename to mx_driving/csrc/scatterMean.cpp index ff188e4a7b14b20012020dee5d5a089abbaf6199..9df865e18e44dc6363b77c398862d2a7b55b0224 100644 --- a/mx_driving/common/ops/csrc/scatterMean.cpp +++ b/mx_driving/csrc/scatterMean.cpp @@ -14,20 +14,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include "csrc/OpApiCommon.h" -#include "functions.h" - +#include "csrc/functions.h" using namespace std; -static void npu_scatter_mean_shape_check(const at::Tensor& src, const at::Tensor& indices, const at::Tensor& out, int dim, int max_index) +static void npu_scatter_mean_shape_check( + const at::Tensor& src, const at::Tensor& indices, const at::Tensor& out, int dim, int max_index) { auto src_size = src.sizes(); auto out_size = out.sizes(); auto indices_size = indices.sizes(); auto indices_dim = indices.dim(); - TORCH_CHECK(dim < indices_dim, "Dimension out of range, dim expected to be in range of [", -indices_dim, ", ", indices_dim-1, "], but got ", dim); + TORCH_CHECK(dim < indices_dim, "Dimension out of range, dim expected to be in range of [", -indices_dim, ", ", + indices_dim - 1, "], but got ", dim); TORCH_CHECK(src.dim() == out.dim(), "out's dimension should be equal to src's dimension."); TORCH_CHECK(src.dim() >= indices.dim(), "indices's dimension should not larger than src's dimension."); // shape of out and src @@ -66,9 +66,8 @@ static int32_t get_available_dimnum(const at::Tensor& indices) return indices_dim - last_indices_dim; } -std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& indices, - c10::optional out, c10::optional dim, - c10::optional dim_size) +std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& indices, c10::optional out, + c10::optional dim, c10::optional dim_size) { TORCH_CHECK_NPU(src); TORCH_CHECK_NPU(indices); @@ -112,10 +111,7 @@ std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& at::Tensor count = at::zeros(out_trans.sizes(), src.options().dtype(at::kFloat)); EXEC_NPU_CMD(aclnnScatterMean, src, indices, out_trans, dim_input, out_trans, count); - count = at::where( - count == 0, - at::ones({}, count.options()), - count); + count = at::where(count == 0, at::ones({}, count.options()), count); out_trans = out_trans / count; out_trans = out_trans.transpose(true_dim, dim_input).contiguous(); @@ -130,4 +126,4 @@ std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& EXEC_NPU_CMD(aclnnScatterMeanDiv, true_out, count, true_out); return std::tie(true_out, count); } -} \ No newline at end of file +} diff --git a/mx_driving/detection/__init__.py b/mx_driving/detection.py similarity index 69% rename from mx_driving/detection/__init__.py rename to mx_driving/detection.py index f5605f4a8127548e01e39d14ce81bf89a349289b..12cac4f57087abe4cb2852ac321b9b76f1912517 100644 --- a/mx_driving/detection/__init__.py +++ b/mx_driving/detection.py @@ -1,3 +1,8 @@ +import warnings + +warnings.warn( + "This package is deprecated and will be removed in future. 
Please use `mx_driving.api` instead.", DeprecationWarning +) from .ops.boxes_overlap_bev import boxes_overlap_bev, npu_boxes_overlap_bev from .ops.nms3d_normal import npu_nms3d_normal from .ops.npu_nms3d import npu_nms3d @@ -7,4 +12,4 @@ from .ops.roi_align_rotated import roi_align_rotated from .ops.box_iou import box_iou_quadri from .ops.border_align import border_align from .ops.roiaware_pool3d import roiaware_pool3d -from .ops.pixel_group import pixel_group \ No newline at end of file +from .ops.pixel_group import pixel_group diff --git a/mx_driving/detection/CMakeLists.txt b/mx_driving/detection/CMakeLists.txt deleted file mode 100644 index 63ebf65165f490b26ec6fbb6cb034f1e8d947c59..0000000000000000000000000000000000000000 --- a/mx_driving/detection/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/detection/components/README.md b/mx_driving/detection/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/detection/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. \ No newline at end of file diff --git a/mx_driving/detection/ops/csrc/CMakeLists.txt b/mx_driving/detection/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/detection/ops/csrc/OWNERS b/mx_driving/detection/ops/csrc/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/detection/ops/csrc/README.md b/mx_driving/detection/ops/csrc/README.md deleted file mode 100644 index 0bbe4f394307b9d81004b5bd923e630eabd9a509..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some cpp source files, which provide code for adaptation of ascend kernels. It provide links for kernels and cpp interfaces. \ No newline at end of file diff --git a/mx_driving/detection/ops/csrc/functions.h b/mx_driving/detection/ops/csrc/functions.h deleted file mode 100644 index ba1d8a236b1e5fb718fcb69d1fb84de2ea7768c4..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/functions.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ - -#include -#include - -std::tuple nms3d_normal(const at::Tensor& boxes, double nms_overlap_thresh); - -std::tuple nms3d(const at::Tensor& boxes, double threshold); - -at::Tensor npu_rotated_overlaps(const at::Tensor& self, const at::Tensor& query_boxes, bool trans); - -at::Tensor npu_rotated_iou(const at::Tensor& boxes, const at::Tensor& query_boxes, bool trans, int64_t mode, - bool is_cross, double v_threshold, double e_threshold); - -at::Tensor npu_boxes_overlap_bev(const at::Tensor &boxes_a, const at::Tensor &boxes_b); - -void roi_align_rotated_v2_forward_npu(const at::Tensor& input, const at::Tensor& rois_map, at::Tensor& output, - double spatial_scale, int32_t sampling_ratio, int32_t pooled_height, int32_t pooled_width, - bool aligned, bool clockwise); -at::Tensor npu_roi_align_rotated_grad_v2(const at::Tensor& input, - const at::Tensor& rois, const at::Tensor& grad_output, - int32_t pooled_height, int32_t pooled_width, double spatial_scale, - int32_t sampling_ratio, bool aligned, bool clockwise); - -at::Tensor npu_box_iou_quadri(const at::Tensor &boxes_a, const at::Tensor &boxes_b, - const int64_t mode_flag, const bool aligned); - -at::Tensor npu_box_iou_rotated(const at::Tensor &boxes_a, const at::Tensor &boxes_b, - const int64_t mode_flag, const bool aligned); - -void border_align_forward_npu(const at::Tensor& input, const at::Tensor& rois, at::Tensor& output, const int32_t pooled_size); - -at::Tensor border_align_backward(const at::Tensor& grad_out, const at::Tensor& boxes, const at::Tensor& argmax_idx, - int32_t pool_size, int32_t height, int32_t width); - -void npu_roiaware_pool3d_forward(const at::Tensor& rois, const at::Tensor& pts, const at::Tensor& pts_feature, - at::Tensor& argmax, at::Tensor& pts_idx_of_voxels, at::Tensor& pooled_features, int32_t mode); -at::Tensor roiaware_pool3d_grad(const at::Tensor& pts_idx_of_voxels, const at::Tensor& argmax, - const at::Tensor& grad_out, int32_t npoints, int64_t pool_method); - -std::vector> pixel_group(const at::Tensor& score, const at::Tensor& mask, const at::Tensor& embedding, - const at::Tensor& kernel_label, const at::Tensor& kernel_contour, - int kernel_region_num, double distance_threshold); -#endif // PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/detection/ops/csrc/pybind.cpp b/mx_driving/detection/ops/csrc/pybind.cpp deleted file mode 100644 index 18f1e90b1984c19fc9694612f4d9bb01e90b4d9c..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/csrc/pybind.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include "csrc/pybind.h" -#include "functions.h" - -void init_detection(pybind11::module& m) -{ - // nms3d_normal - m.def("nms3d_normal", &nms3d_normal); - - // nms3d - m.def("nms3d", &nms3d); - - // roated overlap - m.def("npu_rotated_overlaps", &npu_rotated_overlaps, "npu_rotated_overlap NPU version"); - - // rotated iou - m.def("npu_rotated_iou", &npu_rotated_iou); - - // npu_boxes_overlap_bev - m.def("npu_boxes_overlap_bev", &npu_boxes_overlap_bev, "boxes_overlap_bev NPU 
version"); - - // roi_align_rotated_v2_forward_npu - m.def("roi_align_rotated_v2_forward_npu", &roi_align_rotated_v2_forward_npu); - - // npu_roi_align_rotated_grad_v2 - m.def("npu_roi_align_rotated_grad_v2", &npu_roi_align_rotated_grad_v2); - - // npu_box_iou_quadri - m.def("npu_box_iou_quadri", &npu_box_iou_quadri, "box_iou_quadri NPU version"); - - // npu_box_iou_rotated - m.def("npu_box_iou_rotated", &npu_box_iou_rotated, "box_iou_rotated NPU version"); - - // border_align_forward_npu - m.def("border_align_forward_npu", &border_align_forward_npu); - - // border_align_backward_npu - m.def("border_align_backward", &border_align_backward); - - // npu_roiaware_pool3d_forward - m.def("npu_roiaware_pool3d_forward", &npu_roiaware_pool3d_forward); - - // roiaware_pool3d_grad - m.def("roiaware_pool3d_grad", &roiaware_pool3d_grad, "roiaware_pool3d_grad NPU version"); - - // pixel_group - m.def("pixel_group", &pixel_group); -} diff --git a/mx_driving/detection/ops/kernels/CMakeLists.txt b/mx_driving/detection/ops/kernels/CMakeLists.txt deleted file mode 100644 index 179d9da23345abf75fb87954f266055922527742..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) - add_subdirectory(framework) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/detection/ops/kernels/README.md b/mx_driving/detection/ops/kernels/README.md deleted file mode 100644 index 214fb0a6d662e806bd7f6bdd1b8962bc1639026e..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some ascend-kernel source files, which are like cuda-kernels and supply some ops that can be run on ascend device. 
\ No newline at end of file diff --git a/mx_driving/detection/ops/kernels/op_host/CMakeLists.txt b/mx_driving/detection/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index 7e8c1aa351dc3e9bfa77dd39afa8885c55943c2b..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") - -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/detection/ops/kernels/op_host/OWNERS b/mx_driving/detection/ops/kernels/op_host/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.h b/mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.h deleted file mode 100644 index 9976c2486c9e45dc8d867da1f7ad8740f8dbc90b..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/op_host/nms3d_normal_tiling.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. - */ -#ifndef NMS3D_NORMAL_TILING_H -#define NMS3D_NORMAL_TILING_H - -#include "register/tilingdata_base.h" - -namespace optiling { -BEGIN_TILING_DATA_DEF(Nms3dNormalTilingData) - TILING_DATA_FIELD_DEF(uint32_t, usedCoreNum) // used cores - TILING_DATA_FIELD_DEF(uint32_t, boxNum) // count of boxes - TILING_DATA_FIELD_DEF(uint32_t, loopTime) // loop times - TILING_DATA_FIELD_DEF(uint32_t, eachSum) // count of each core, = loop_time * 8 - TILING_DATA_FIELD_DEF(uint32_t, tailSum) // count of tail core - TILING_DATA_FIELD_DEF(uint32_t, tailNum) // last time count of tail core - TILING_DATA_FIELD_DEF(uint32_t, maskNum) // mask align 32bit - TILING_DATA_FIELD_DEF(float, overlapThresh) -END_TILING_DATA_DEF; - -REGISTER_TILING_DATA_CLASS(Nms3dNormal, Nms3dNormalTilingData) -} // namespace optiling - -#endif // NMS3D_NORMAL_TILING_H diff --git a/mx_driving/detection/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/detection/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/detection/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/detection/ops/onnx/__init__.py b/mx_driving/detection/ops/onnx/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/detection/ops/onnx/plugin/CMakeLists.txt b/mx_driving/detection/ops/onnx/plugin/CMakeLists.txt deleted file mode 100644 index cc6034bd1fe09a766aef52f69cf0bb348ceaf2b5..0000000000000000000000000000000000000000 --- 
a/mx_driving/detection/ops/onnx/plugin/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB ONNX_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_ONNX_SRC - ${ASCEND_ONNX_SRC} ${ONNX_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/__init__.py b/mx_driving/fused.py similarity index 71% rename from mx_driving/fused/__init__.py rename to mx_driving/fused.py index 80a2b2cf2531379a50c340c24fb8f9c2aa79f1cf..71d05810e1c58c23d07c83c7a52818c788cfd327 100644 --- a/mx_driving/fused/__init__.py +++ b/mx_driving/fused.py @@ -1,3 +1,8 @@ +import warnings + +warnings.warn( + "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.", DeprecationWarning +) from .ops.deform_conv2d import DeformConv2dFunction, deform_conv2d from .ops.fused_bias_leaky_relu import npu_fused_bias_leaky_relu from .ops.modulated_deform_conv2d import (ModulatedDeformConv2dFunction, @@ -8,4 +13,4 @@ from .ops.multi_scale_deformable_attn import ( from .ops.npu_add_relu import npu_add_relu from .ops.npu_deformable_aggregation import npu_deformable_aggregation from .ops.npu_max_pool2d import npu_max_pool2d -from .ops.npu_geometric_kernel_attention_func import npu_geometric_kernel_attention_func +from .ops.npu_geometric_kernel_attention import npu_geometric_kernel_attention diff --git a/mx_driving/fused/CMakeLists.txt b/mx_driving/fused/CMakeLists.txt deleted file mode 100644 index 807aa0c667560bcf0d75c6c6a26369daa624e9de..0000000000000000000000000000000000000000 --- a/mx_driving/fused/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/fused/components/README.md b/mx_driving/fused/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/fused/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. 
\ No newline at end of file diff --git a/mx_driving/fused/ops/__init__.py b/mx_driving/fused/ops/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/fused/ops/csrc/CMakeLists.txt b/mx_driving/fused/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/ops/csrc/OWNERS b/mx_driving/fused/ops/csrc/OWNERS deleted file mode 100644 index 6d60158d26b6a9b3c818a73e78f09a6aa3700cf7..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/OWNERS +++ /dev/null @@ -1,8 +0,0 @@ -approvers: -- wangxiaoxin-sherie -- liu_zhi_xu -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/fused/ops/csrc/README.md b/mx_driving/fused/ops/csrc/README.md deleted file mode 100644 index 0bbe4f394307b9d81004b5bd923e630eabd9a509..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some cpp source files, which provide code for adaptation of ascend kernels. It provide links for kernels and cpp interfaces. \ No newline at end of file diff --git a/mx_driving/fused/ops/csrc/functions.h b/mx_driving/fused/ops/csrc/functions.h deleted file mode 100644 index 54a7c11468408f6aa73e1da92b3fc2fc8e1500e6..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/functions.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef PERCEPTION_FUSED_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_FUSED_OPS_CSRC_FUNCTIONS_H_ -#include -#include - -at::Tensor npu_max_pool2d(const at::Tensor& x, int kernel_size, int stride, int padding); - -at::Tensor multi_scale_deformable_attn(const at::Tensor& value, const at::Tensor& value_spatial_shapes, - const at::Tensor& value_level_start_index, const at::Tensor& sampling_locations, - const at::Tensor& attention_weights); - -std::tuple multi_scale_deformable_attn_backward(const at::Tensor& value, - const at::Tensor& value_spatial_shapes, const at::Tensor& value_level_start_index, - const at::Tensor& sampling_locations, const at::Tensor& attention_weights, const at::Tensor& grad_output); - -std::tuple multi_scale_deformable_attn_grad_v2(const at::Tensor& value, - const at::Tensor& shape, const at::Tensor& level_start_index, const at::Tensor& location_trans, - const at::Tensor& attn_weight_trans, const at::Tensor& grad_output); - -at::Tensor npu_add_relu(at::Tensor& x, const at::Tensor& y); - -at::Tensor npu_add_relu_grad(at::Tensor& self, at::Tensor& grad_output); -std::tuple npu_scatter_mean(at::Tensor& src, at::Tensor& index, c10::optional out, - c10::optional dim, c10::optional dim_size); - -at::Tensor fused_bias_leaky_relu( - const at::Tensor& x, const at::Tensor& bias, const double negative_slop, const double scale); - -at::Tensor deformable_aggregation(const at::Tensor& mc_ms_feat, const at::Tensor& spatial_shape, - const at::Tensor& scale_start_index, const at::Tensor& sampling_location, const at::Tensor& weights); -std::tuple deformable_aggregation_grad(const at::Tensor& mc_ms_feat, - const at::Tensor& spatial_shape, const at::Tensor& scale_start_index, const at::Tensor& sampling_location, - const at::Tensor& weights, const at::Tensor& grad_output, const at::Tensor& grad_mc_ms_feat, - const at::Tensor& grad_sampling_location, const at::Tensor& grad_weights); - -std::tuple deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, - const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - at::IntArrayRef dilation, int64_t groups, int64_t deformable_groups); - -std::tuple modulated_deformable_conv2d(const at::Tensor& input, const at::Tensor& offset, - const at::Tensor& mask, const at::Tensor& weight, const c10::optional& bias_opt, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, - int64_t groups, int64_t deformable_groups, int64_t with_bias); - -std::tuple deformable_conv2d_backward(const at::Tensor& input, - const at::Tensor& weight, const at::Tensor& offset, const at::Tensor& offset_output, const at::Tensor& grad_y, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, - int64_t groups, int64_t deformable_groups); - -std::tuple modulated_deformable_conv2d_backward( - const at::Tensor& input, const at::Tensor& offset, const at::Tensor& mask, const at::Tensor& weight, - const c10::optional& bias_opt, const at::Tensor& offset_output, const at::Tensor& grad_y, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, - int64_t groups, int64_t deformable_groups, int64_t with_bias); - -at::Tensor npu_geometric_kernel_attention_func(const at::Tensor& value, const at::Tensor& spatial_shapes, - const at::Tensor& level_start_index, const at::Tensor& sampling_locations, const at::Tensor& attn_weights); - -std::tuple npu_geometric_kernel_attention_backward(const 
at::Tensor& value, - const at::Tensor& spatial_shapes, const at::Tensor& level_start_index, const at::Tensor& sampling_locations, - const at::Tensor& attn_weights, const at::Tensor& grad_output); -#endif // PERCEPTION_FUSED_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/fused/ops/csrc/pybind.cpp b/mx_driving/fused/ops/csrc/pybind.cpp deleted file mode 100644 index acbdc51f3a881923a1aff4edecdd9fdd732f6a70..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/csrc/pybind.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include "csrc/pybind.h" - -#include - -#include "functions.h" -void init_fused(pybind11::module& m) -{ - // nnpu_max_pool2d - m.def("npu_max_pool2d", &npu_max_pool2d); - // mullti_scale_deformable_attn - m.def("multi_scale_deformable_attn", &multi_scale_deformable_attn); - m.def("multi_scale_deformable_attn_backward", &multi_scale_deformable_attn_backward); - - // npu_add_relu - m.def("npu_add_relu", &npu_add_relu); - m.def("npu_add_relu_grad", &npu_add_relu_grad); - - // fused_bias_leaky_relu - m.def("fused_bias_leaky_relu", &fused_bias_leaky_relu); - - // npu_deformable_aggregation - m.def("npu_deformable_aggregation", &deformable_aggregation); - m.def("npu_deformable_aggregation_grad", &deformable_aggregation_grad); - - // deformable_conv2d - m.def("deformable_conv2d", &deformable_conv2d); - m.def("modulated_deformable_conv2d", &modulated_deformable_conv2d); - m.def("deformable_conv2d_backward", &deformable_conv2d_backward); - m.def("modulated_deformable_conv2d_backward", &modulated_deformable_conv2d_backward); - - // npu_geometric_kernel_attention_func - m.def("npu_geometric_kernel_attention_func", &npu_geometric_kernel_attention_func); - m.def("npu_geometric_kernel_attention_backward", &npu_geometric_kernel_attention_backward); -} diff --git a/mx_driving/fused/ops/kernels/CMakeLists.txt b/mx_driving/fused/ops/kernels/CMakeLists.txt deleted file mode 100644 index b77ac594c4df44bf8700a3b2fa1867984111f27a..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/fused/ops/kernels/README.md b/mx_driving/fused/ops/kernels/README.md deleted file mode 100644 index 214fb0a6d662e806bd7f6bdd1b8962bc1639026e..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some ascend-kernel source files, which are like cuda-kernels and supply some ops that can be run on ascend device. 
\ No newline at end of file diff --git a/mx_driving/fused/ops/kernels/op_host/CMakeLists.txt b/mx_driving/fused/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index 7e8c1aa351dc3e9bfa77dd39afa8885c55943c2b..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") - -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/ops/kernels/op_host/OWNERS b/mx_driving/fused/ops/kernels/op_host/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/fused/ops/kernels/op_host/common.h b/mx_driving/fused/ops/kernels/op_host/common.h deleted file mode 100644 index 4580dff5fd0b206d1b94383f160932c22d1cb8a9..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/op_host/common.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. - */ -#ifndef COMMON_H -#define COMMON_H - -#include "register/op_def_registry.h" -#include "tiling/platform/platform_ascendc.h" -#include "tiling/tiling_api.h" -#include "register/tilingdata_base.h" - -inline uint32_t ceil_multiple(uint32_t num, uint32_t block) -{ - if (block == 0) { - return 0; - } - return (num + block - 1) / block; -} - -inline uint32_t ceil_value(uint32_t num, uint32_t block) -{ - if (block == 0) { - return 0; - } - return ((num + block - 1) / block) * block; -} - -#endif // COMMON_H diff --git a/mx_driving/fused/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/fused/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/fused/ops/onnx/__init__.py b/mx_driving/fused/ops/onnx/__init__.py deleted file mode 100644 index 3989a46992bc48cb7e9e30ca3cfe092a90d60ff2..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/onnx/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .wrapper_onnx_ops import NPUMultiScaleDeformableAttnOP - -onnx_msda = NPUMultiScaleDeformableAttnOP.apply diff --git a/mx_driving/fused/ops/onnx/plugin/CMakeLists.txt b/mx_driving/fused/ops/onnx/plugin/CMakeLists.txt deleted file mode 100644 index cc6034bd1fe09a766aef52f69cf0bb348ceaf2b5..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/onnx/plugin/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB ONNX_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_ONNX_SRC - ${ASCEND_ONNX_SRC} ${ONNX_SRC} - CACHE INTERNAL "") 
diff --git a/mx_driving/fused/ops/onnx/wrapper_onnx_ops.py b/mx_driving/fused/ops/onnx/wrapper_onnx_ops.py deleted file mode 100644 index 12b6baa60408ca9b3b3c330e6de7b7d625d1de5e..0000000000000000000000000000000000000000 --- a/mx_driving/fused/ops/onnx/wrapper_onnx_ops.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional, List -import torch -from torch import Tensor -import torch.onnx.symbolic_helper as sym_help -import mx_driving.fused - - -class NPUMultiScaleDeformableAttnOP(torch.autograd.Function): - @staticmethod - def forward(ctx, *args, **kwargs): - return mx_driving.fused.multi_scale_deformable_attn(*args, **kwargs) - - @staticmethod - # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def symbolic(g, value: Tensor, value_spatial_shapes: Tensor, value_level_start_index: Tensor, - sampling_locations: Tensor, attention_weights: Tensor): - return g.op("npu::MultiScaleDeformableAttn", - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights) diff --git a/mx_driving/common/ops/__init__.py b/mx_driving/modules/__init__.py similarity index 100% rename from mx_driving/common/ops/__init__.py rename to mx_driving/modules/__init__.py diff --git a/mx_driving/modules/roi_point_pool_3d.py b/mx_driving/modules/roi_point_pool_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..3efed8ed43fdb92cae7542f14c0b48bdf5092189 --- /dev/null +++ b/mx_driving/modules/roi_point_pool_3d.py @@ -0,0 +1,12 @@ +from torch.nn import Module + +from ..ops.npu_roipoint_pool3d import roipoint_pool3d + + +class RoIPointPool3d(Module): + def __init__(self, num_sampled_points: int = 512): + super().__init__() + self.num_sampled_points = num_sampled_points + + def forward(self, points, point_features, boxes3d): + return roipoint_pool3d(self.num_sampled_points, points, point_features, boxes3d) diff --git a/mx_driving/spconv/ops/sparse_conv.py b/mx_driving/modules/sparse_conv.py similarity index 53% rename from mx_driving/spconv/ops/sparse_conv.py rename to mx_driving/modules/sparse_conv.py index 8f1304d56856d027b20360bf4c50a49140729a0b..06576e113a3d6405ce4488ddaebd8416f5593d98 100644 --- a/mx_driving/spconv/ops/sparse_conv.py +++ b/mx_driving/modules/sparse_conv.py @@ -21,17 +21,58 @@ from torch.nn import init from torch.nn.init import calculate_gain from torch.nn.parameter import Parameter -from . import sparse_functional as Fsp -from . 
import sparse_ops as ops +from ..ops import sparse_functional as Fsp from .sparse_modules import SparseModule from .sparse_structure import SparseConvTensor +def get_conv_output_size(input_size, kernel_size, stride, padding, dilation): + ndim = len(input_size) + output_size = [] + for i in range(ndim): + size = (input_size[i] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) // stride[i] + 1 + if kernel_size[i] == -1: + output_size.append(1) + else: + output_size.append(size) + return output_size + + +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def get_inverse_conv_output_size(input_size, kernel_size, stride, padding, dilation, output_padding): + ndim = len(input_size) + output_size = [] + for i in range(ndim): + size = ( + (input_size[i] - 1) * stride[i] + - 2 * padding[i] + + dilation[i] * (kernel_size[i] - 1) + + output_padding[i] + + 1 + ) + if kernel_size[i] == -1: + output_size.append(1) + else: + output_size.append(size) + return output_size + + +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation, output_padding): + ndim = len(input_size) + output_size = [] + for i in range(ndim): + if kernel_size[i] == -1: + raise ValueError("deconv don't support kernel_size < 0") + size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[i] + output_padding[i] + output_size.append(size) + return output_size + + def _calculate_fan_in_and_fan_out_hwio(tensor): dimensions = tensor.ndimension() if dimensions < 2: - raise ValueError('fan in and fan out can not be computed for tensor' - 'with fewer than 2 dimensions') + raise ValueError("fan in and fan out can not be computed for tensor" "with fewer than 2 dimensions") if dimensions == 2: # Linear fan_in = tensor.size(-2) @@ -49,25 +90,26 @@ def _calculate_fan_in_and_fan_out_hwio(tensor): class SparseConvolution(SparseModule): - - # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def __init__(self, - ndim, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - subm=False, - output_padding=0, - transposed=False, - inverse=False, - indice_key=None, - fused_bn=False, - mode='mmcv'): + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def __init__( + self, + ndim, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + subm=False, + output_padding=0, + transposed=False, + inverse=False, + indice_key=None, + fused_bn=False, + mode="mmcv", + ): super().__init__() if groups != 1: raise RuntimeError("do not support group == 1") @@ -103,17 +145,16 @@ class SparseConvolution(SparseModule): self.fused_bn = fused_bn self.mode = mode - self.weight = Parameter( - torch.Tensor(*kernel_size, in_channels, out_channels)) + self.weight = Parameter(torch.Tensor(*kernel_size, in_channels, out_channels)) if bias: self.bias = Parameter(torch.Tensor(out_channels)) else: - self.register_parameter('bias', None) + self.register_parameter("bias", None) self.reset_parameters() def reset_parameters(self): fan_in, fan_out = _calculate_fan_in_and_fan_out_hwio(self.weight) - if self.mode == 'mmcv': + if self.mode == "mmcv": init.kaiming_uniform_(self.weight, a=math.sqrt(5)) else: self._custom_kaiming_uniform_(self.weight, a=math.sqrt(5), fan_in=fan_in, fan_out=fan_out) @@ -124,80 +165,111 @@ class SparseConvolution(SparseModule): bound = 1 / math.sqrt(fan_in) init.uniform_(self.bias, -bound, bound) - def 
_custom_kaiming_uniform_(self, - tensor, - a=0, - fan_in=0, - fan_out=0, - mode='fan_in', - nonlinearity='leaky_relu'): + def _custom_kaiming_uniform_(self, tensor, a=0, fan_in=0, fan_out=0, mode="fan_in", nonlinearity="leaky_relu"): fan = 0.0 - if mode == 'fan_in': + if mode == "fan_in": fan = float(fan_in) - elif mode == 'fan_out': + elif mode == "fan_out": fan = float(fan_out) gain = calculate_gain(nonlinearity, a) std = gain / math.sqrt(fan) bound = math.sqrt(3.0) * std with torch.no_grad(): tensor.uniform_(-bound, bound) - tensor.data = tensor.data.reshape(self.out_channels, np.prod(self.kernel_size) * self.in_channels).transpose(-1, -2).contiguous() + tensor.data = ( + tensor.data.reshape(self.out_channels, np.prod(self.kernel_size) * self.in_channels) + .transpose(-1, -2) + .contiguous() + ) tensor.data = tensor.data.reshape(*self.kernel_size, self.in_channels, self.out_channels) def forward(self, input): if not isinstance(input, SparseConvTensor): raise RuntimeError("input is not SparseConvTensor") if self.inverse: - out_spatial_shape = ops.get_inverse_conv_output_size( - input.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding) + out_spatial_shape = get_inverse_conv_output_size( + input.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding + ) out_spatial_shape = [int(i) for i in out_spatial_shape] if not isinstance(out_spatial_shape, list): out_spatial_shape = out_spatial_shape.tolist() - out_features, outidx = Fsp.indice_inverse_conv(input.features, input.indices, self.weight, out_spatial_shape, - self.out_channels, input.batch_size, - self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding, - self.groups, self.bias) + out_features, outidx = Fsp.indice_inverse_conv( + input.features, + input.indices, + self.weight, + out_spatial_shape, + self.out_channels, + input.batch_size, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.output_padding, + self.groups, + self.bias, + ) elif not self.subm: - out_spatial_shape = ops.get_conv_output_size( - input.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation) + out_spatial_shape = get_conv_output_size( + input.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation + ) out_spatial_shape = [int(i) for i in out_spatial_shape] if not isinstance(out_spatial_shape, list): out_spatial_shape = out_spatial_shape.tolist() - out_features, outidx = Fsp.indice_conv(input.features, input.indices, self.weight, out_spatial_shape, - self.out_channels, input.batch_size, - self.kernel_size, self.stride, self.padding, self.dilation, - self.groups, self.bias) + out_features, outidx = Fsp.indice_conv( + input.features, + input.indices, + self.weight, + out_spatial_shape, + self.out_channels, + input.batch_size, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.groups, + self.bias, + ) else: out_spatial_shape = input.spatial_shape out_spatial_shape = [int(i) for i in out_spatial_shape] if not isinstance(out_spatial_shape, list): out_spatial_shape = out_spatial_shape.tolist() - out_features, outidx = Fsp.indice_subm_conv(input.features, input.indices, self.weight, out_spatial_shape, - self.out_channels, input.batch_size, - self.kernel_size, self.stride, self.padding, self.dilation, - self.groups, self.bias) + out_features, outidx = Fsp.indice_subm_conv( + input.features, + input.indices, + self.weight, + out_spatial_shape, + self.out_channels, + input.batch_size, + 
self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.groups, + self.bias, + ) if self.bias is not None: out_features += self.bias - out_tensor = SparseConvTensor(out_features, outidx, out_spatial_shape, - input.batch_size) + out_tensor = SparseConvTensor(out_features, outidx, out_spatial_shape, input.batch_size) return out_tensor class SparseConv3d(SparseConvolution): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - indice_key=None, - mode='mmcv'): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None, + mode="mmcv", + ): super().__init__( 3, in_channels, @@ -209,22 +281,24 @@ class SparseConv3d(SparseConvolution): groups, bias, indice_key=indice_key, - mode=mode) + mode=mode, + ) class SubMConv3d(SparseConvolution): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - indice_key=None, - mode='mmcv'): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None, + mode="mmcv", + ): super().__init__( 3, in_channels, @@ -237,23 +311,25 @@ class SubMConv3d(SparseConvolution): bias, True, indice_key=indice_key, - mode=mode) + mode=mode, + ) class SparseInverseConv3d(SparseConvolution): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - inverse=True, - indice_key=None, - mode='mmcv'): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + inverse=True, + indice_key=None, + mode="mmcv", + ): super().__init__( 3, in_channels, @@ -267,4 +343,5 @@ class SparseInverseConv3d(SparseConvolution): subm=False, inverse=True, indice_key=indice_key, - mode=mode) \ No newline at end of file + mode=mode, + ) diff --git a/mx_driving/spconv/ops/sparse_modules.py b/mx_driving/modules/sparse_modules.py similarity index 100% rename from mx_driving/spconv/ops/sparse_modules.py rename to mx_driving/modules/sparse_modules.py diff --git a/mx_driving/spconv/ops/sparse_structure.py b/mx_driving/modules/sparse_structure.py similarity index 74% rename from mx_driving/spconv/ops/sparse_structure.py rename to mx_driving/modules/sparse_structure.py index 83907ab5563ff292e8c48715f5b1149a7d31f460..44ebf4627998638b2fbd404994277f129e9d1300 100644 --- a/mx_driving/spconv/ops/sparse_structure.py +++ b/mx_driving/modules/sparse_structure.py @@ -4,8 +4,7 @@ import numpy as np import torch -def scatter_nd(indices: torch.Tensor, updates: torch.Tensor, - shape: torch.Tensor) -> torch.Tensor: +def scatter_nd(indices: torch.Tensor, updates: torch.Tensor, shape: torch.Tensor) -> torch.Tensor: """pytorch edition of tensorflow scatter_nd. this function don't contain except handle code. 
so use this carefully when @@ -13,7 +12,7 @@ def scatter_nd(indices: torch.Tensor, updates: torch.Tensor, """ ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device) ndim = indices.shape[-1] - output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:] + output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1] :] flatted_indices = indices.view(-1, ndim) slices = [flatted_indices[:, i] for i in range(ndim)] slices += [Ellipsis] @@ -22,13 +21,14 @@ def scatter_nd(indices: torch.Tensor, updates: torch.Tensor, class SparseConvTensor: - - def __init__(self, - features: torch.Tensor, - indices: torch.Tensor, - spatial_shape: Union[List, Tuple], - batch_size: int, - grid: Optional[torch.Tensor] = None): + def __init__( + self, + features: torch.Tensor, + indices: torch.Tensor, + spatial_shape: Union[List, Tuple], + batch_size: int, + grid: Optional[torch.Tensor] = None, + ): self.features = features self.indices = indices if self.indices.dtype != torch.int32: @@ -50,8 +50,7 @@ class SparseConvTensor: return None def dense(self, channels_first: bool = True) -> torch.Tensor: - output_shape = [self.batch_size] + list( - self.spatial_shape) + [self.features.shape[1]] + output_shape = [self.batch_size] + list(self.spatial_shape) + [self.features.shape[1]] res = scatter_nd(self.indices.long(), self.features, output_shape) if not channels_first: return res @@ -62,5 +61,4 @@ class SparseConvTensor: @property def sparity(self): - return (self.indices.shape[0] / np.prod(self.spatial_shape) / - self.batch_size) + return self.indices.shape[0] / np.prod(self.spatial_shape) / self.batch_size diff --git a/mx_driving/modules/voxelization.py b/mx_driving/modules/voxelization.py new file mode 100644 index 0000000000000000000000000000000000000000..eaa128233792e3b43c617012bb8ed06053616adb --- /dev/null +++ b/mx_driving/modules/voxelization.py @@ -0,0 +1,23 @@ +import torch +from torch.nn import Module +from torch.nn.modules.utils import _pair + +from ..ops.voxelization import voxelization + + +class Voxelization(Module): + def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels=20000, deterministic=True): + super().__init__() + + self.voxel_size = voxel_size + self.point_cloud_range = point_cloud_range + self.max_num_points = max_num_points + self.max_voxels = max_voxels + self.max_voxels = max_voxels if isinstance(max_voxels, tuple) else _pair(max_voxels) + self.deterministic = deterministic + + def forward(self, points: torch.Tensor): + max_voxels = self.max_voxels[0] if self.training else self.max_voxels[1] + return voxelization( + points, self.voxel_size, self.point_cloud_range, self.max_num_points, max_voxels, self.deterministic + ) diff --git a/mx_driving/detection/ops/__init__.py b/mx_driving/ops/__init__.py similarity index 100% rename from mx_driving/detection/ops/__init__.py rename to mx_driving/ops/__init__.py diff --git a/mx_driving/common/ops/assign_score_withk.py b/mx_driving/ops/assign_score_withk.py similarity index 97% rename from mx_driving/common/ops/assign_score_withk.py rename to mx_driving/ops/assign_score_withk.py index f17773adb7b4d16da5e9ee8cafd0cc32618c9a82..7f05125fc3bc58cd187a36f0eec9086c8f0d20d4 100644 --- a/mx_driving/common/ops/assign_score_withk.py +++ b/mx_driving/ops/assign_score_withk.py @@ -10,7 +10,6 @@ Modification 1. 
Add support for Ascend NPU import torch import torch_npu from torch.autograd import Function -from torch.nn import Module import mx_driving._C diff --git a/mx_driving/point/ops/bev_pool.py b/mx_driving/ops/bev_pool.py similarity index 100% rename from mx_driving/point/ops/bev_pool.py rename to mx_driving/ops/bev_pool.py diff --git a/mx_driving/point/ops/bev_pool_v2.py b/mx_driving/ops/bev_pool_v2.py similarity index 100% rename from mx_driving/point/ops/bev_pool_v2.py rename to mx_driving/ops/bev_pool_v2.py diff --git a/mx_driving/point/ops/bev_pool_v3.py b/mx_driving/ops/bev_pool_v3.py similarity index 100% rename from mx_driving/point/ops/bev_pool_v3.py rename to mx_driving/ops/bev_pool_v3.py diff --git a/mx_driving/detection/ops/border_align.py b/mx_driving/ops/border_align.py similarity index 94% rename from mx_driving/detection/ops/border_align.py rename to mx_driving/ops/border_align.py index f91c9ded76e7e8b4a5138c2b84b0d7a15e5139e2..6ecbfc48094ef93ddca62be37b61caa27ebcea94 100644 --- a/mx_driving/detection/ops/border_align.py +++ b/mx_driving/ops/border_align.py @@ -22,7 +22,7 @@ class BorderAlignFunction(Function): feature_map.device ) - mx_driving._C.border_align_forward_npu(feature_map, rois, output, ctx.pooled_size) + mx_driving._C.border_align(feature_map, rois, output, ctx.pooled_size) npu_outputs, index = output.max(dim=-2) npu_outputs = ( diff --git a/mx_driving/detection/ops/box_iou.py b/mx_driving/ops/box_iou.py similarity index 100% rename from mx_driving/detection/ops/box_iou.py rename to mx_driving/ops/box_iou.py diff --git a/mx_driving/detection/ops/boxes_overlap_bev.py b/mx_driving/ops/boxes_overlap_bev.py similarity index 100% rename from mx_driving/detection/ops/boxes_overlap_bev.py rename to mx_driving/ops/boxes_overlap_bev.py diff --git a/mx_driving/fused/ops/deform_conv2d.py b/mx_driving/ops/deform_conv2d.py similarity index 100% rename from mx_driving/fused/ops/deform_conv2d.py rename to mx_driving/ops/deform_conv2d.py diff --git a/mx_driving/point/ops/furthest_point_sampling.py b/mx_driving/ops/furthest_point_sampling.py similarity index 85% rename from mx_driving/point/ops/furthest_point_sampling.py rename to mx_driving/ops/furthest_point_sampling.py index 708251dbf4e9e02ece44d1d8f7d73cfd16ce5ab9..59eb005f5c4a9c297dda837c88f00e5b3af8f571 100644 --- a/mx_driving/point/ops/furthest_point_sampling.py +++ b/mx_driving/ops/furthest_point_sampling.py @@ -6,12 +6,12 @@ Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU """ + import numpy as np import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -21,10 +21,10 @@ class AdsFurthestPointSampling(Function): B, N = point_xyz.size()[:2] point_xyz = point_xyz.permute(0, 2, 1).contiguous() - nearest_dist = torch.tensor(np.ones((B, N)) * 1e10, dtype=torch.float32, device='npu').contiguous() + nearest_dist = torch.tensor(np.ones((B, N)) * 1e10, dtype=torch.float32, device="npu").contiguous() output = mx_driving._C.npu_furthest_point_sampling(point_xyz, nearest_dist, num_points) return output -npu_furthest_point_sampling = AdsFurthestPointSampling.apply \ No newline at end of file +npu_furthest_point_sampling = AdsFurthestPointSampling.apply diff --git a/mx_driving/point/ops/furthest_point_sampling_with_dist.py b/mx_driving/ops/furthest_point_sampling_with_dist.py similarity index 90% rename from mx_driving/point/ops/furthest_point_sampling_with_dist.py rename to mx_driving/ops/furthest_point_sampling_with_dist.py index f56f104cf15c1836d11e3512d3f254a3015c7839..7c24bc970fee2d16bfa1e40ba3722f86afbd01e6 100644 --- a/mx_driving/point/ops/furthest_point_sampling_with_dist.py +++ b/mx_driving/ops/furthest_point_sampling_with_dist.py @@ -6,20 +6,21 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C class AdsFurthestPointSamplingWithDistFunction(Function): @staticmethod - def forward(ctx, points_dist, num_points): + def forward(ctx, points_dist, num_points): B, N = points_dist.size()[:2] nearest_temp = points_dist.new_zeros([B, N]).fill_(1e10) result = mx_driving._C.furthest_point_sampling_with_dist(points_dist, nearest_temp, num_points) return result + furthest_point_sample_with_dist = AdsFurthestPointSamplingWithDistFunction.apply diff --git a/mx_driving/fused/ops/fused_bias_leaky_relu.py b/mx_driving/ops/fused_bias_leaky_relu.py similarity index 65% rename from mx_driving/fused/ops/fused_bias_leaky_relu.py rename to mx_driving/ops/fused_bias_leaky_relu.py index 51a86fdf519d02bb6e42185370df0a701ee7cc82..03b88d3904be40cbd8d4bf228004a85e8abd6265 100644 --- a/mx_driving/fused/ops/fused_bias_leaky_relu.py +++ b/mx_driving/ops/fused_bias_leaky_relu.py @@ -5,22 +5,22 @@ Modification by: Huawei Developers Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU -""" +""" + import torch from torch.autograd import Function -import torch_npu import mx_driving._C -class FusedBiasLeakyRelu(Function): - +class FusedBiasLeakyReLU(Function): @staticmethod def forward(ctx, x, bias, negative_slope=0.2, scale=2**0.5): - bias = torch.broadcast_to(bias.to(x.dtype).reshape([-1 if i == 1 else 1 for i in range(x.ndim)]), - x.shape).contiguous() + bias = torch.broadcast_to( + bias.to(x.dtype).reshape([-1 if i == 1 else 1 for i in range(x.ndim)]), x.shape + ).contiguous() out = mx_driving._C.fused_bias_leaky_relu(x, bias, negative_slope, scale) return out -npu_fused_bias_leaky_relu = FusedBiasLeakyRelu.apply \ No newline at end of file +npu_fused_bias_leaky_relu = FusedBiasLeakyReLU.apply diff --git a/mx_driving/point/ops/group_points.py b/mx_driving/ops/group_points.py similarity index 75% rename from mx_driving/point/ops/group_points.py rename to mx_driving/ops/group_points.py index 523ef73a6b035557d304b0e8b62dec7cc25facee..d7122fec4336277f43c4128da5342cd48952c4e5 100644 --- a/mx_driving/point/ops/group_points.py +++ b/mx_driving/ops/group_points.py @@ -14,12 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import warnings -import torch + import numpy as np +import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -27,10 +27,7 @@ class AdsGroupPoints(Function): """Group feature with given index.""" @staticmethod - def forward( - ctx, - features: torch.Tensor, - indices: torch.Tensor): + def forward(ctx, features: torch.Tensor, indices: torch.Tensor): """ Args: features (Tensor): Tensor of features to group, input shape is (B, C, N). @@ -45,14 +42,7 @@ class AdsGroupPoints(Function): B, C, N = features.size() _, npoints, nsample = indices.size() - output = mx_driving._C.group_points( - features, - indices, - B, - C, - N, - npoints, - nsample) + output = mx_driving._C.group_points(features, indices, B, C, N, npoints, nsample) ctx.for_backwards = (indices, N) return output @@ -70,14 +60,7 @@ class AdsGroupPoints(Function): idx, N = ctx.for_backwards B, C, npoints, nsample = grad_out.size() - grad_features = mx_driving._C.group_points_backward( - grad_out, - idx, - B, - C, - N, - npoints, - nsample) + grad_features = mx_driving._C.group_points_backward(grad_out, idx, B, C, N, npoints, nsample) return grad_features, None @@ -86,5 +69,7 @@ def group_points(features: torch.Tensor, indices: torch.Tensor): def npu_group_points(features: torch.Tensor, indices: torch.Tensor): - warnings.warn("`npu_group_points` will be deprecated in future. Please use `group_points` instead.", DeprecationWarning) - return AdsGroupPoints.apply(features, indices) \ No newline at end of file + warnings.warn( + "`npu_group_points` will be deprecated in future. 
Please use `group_points` instead.", DeprecationWarning + ) + return AdsGroupPoints.apply(features, indices) diff --git a/mx_driving/common/ops/hypot.py b/mx_driving/ops/hypot.py similarity index 100% rename from mx_driving/common/ops/hypot.py rename to mx_driving/ops/hypot.py diff --git a/mx_driving/common/ops/knn.py b/mx_driving/ops/knn.py similarity index 64% rename from mx_driving/common/ops/knn.py rename to mx_driving/ops/knn.py index bbd9cd5ac84950d2427067747e8fd9bdf0676e6a..067c7b8f43193e40ebaf4535adde4bb03b969c8f 100644 --- a/mx_driving/common/ops/knn.py +++ b/mx_driving/ops/knn.py @@ -6,24 +6,23 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + from typing import Optional + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C -class AdsKnn(Function): +class Knn(Function): @staticmethod - def forward(ctx, - k: int, - xyz: torch.Tensor, - center_xyz: Optional[torch.Tensor] = None, - transposed: bool = False) -> torch.Tensor: + def forward( + ctx, k: int, xyz: torch.Tensor, center_xyz: Optional[torch.Tensor] = None, transposed: bool = False + ) -> torch.Tensor: if k <= 0 and k >= 100: - print('k should be in range (0, 100).') + print("k should be in range (0, 100).") return None if center_xyz is None: @@ -34,21 +33,21 @@ class AdsKnn(Function): else: xyz = xyz.transpose(2, 1).contiguous() - if not xyz.is_contiguous(): # [B, 3, N] + if not xyz.is_contiguous(): # [B, 3, N] return None - if not xyz.is_contiguous(): # [B, npoint, 3] + if not xyz.is_contiguous(): # [B, npoint, 3] return None if center_xyz.get_device() != xyz.get_device(): - print('center_xyz and xyz should be on the same device.') + print("center_xyz and xyz should be on the same device.") return None dist2, idx = mx_driving._C.knn(xyz, center_xyz, k, True) zeros_idx = torch.zeros(xyz.shape[0], center_xyz.shape[1], k, dtype=torch.int32).npu() idx.where(dist2 >= 1e10, zeros_idx) - idx = idx.transpose(2, 1).contiguous() # [B, k, npoint] + idx = idx.transpose(2, 1).contiguous() # [B, k, npoint] return idx.int() -knn = AdsKnn.apply +knn = Knn.apply diff --git a/mx_driving/fused/ops/modulated_deform_conv2d.py b/mx_driving/ops/modulated_deform_conv2d.py similarity index 100% rename from mx_driving/fused/ops/modulated_deform_conv2d.py rename to mx_driving/ops/modulated_deform_conv2d.py diff --git a/mx_driving/fused/ops/multi_scale_deformable_attn.py b/mx_driving/ops/multi_scale_deformable_attn.py similarity index 81% rename from mx_driving/fused/ops/multi_scale_deformable_attn.py rename to mx_driving/ops/multi_scale_deformable_attn.py index d2cdda814f272f7a5c04edbad2a79091ea9d7440..4c7ffef62799b07ac6ab7fb56f1904a5c6162a5e 100644 --- a/mx_driving/fused/ops/multi_scale_deformable_attn.py +++ b/mx_driving/ops/multi_scale_deformable_attn.py @@ -49,6 +49,25 @@ class MultiScaleDeformableAttnFunction(Function): ) return grad_value, None, None, grad_sampling_loc, grad_attn_weight + @staticmethod + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def symbolic( + g, + value: torch.Tensor, + value_spatial_shapes: torch.Tensor, + value_level_start_index: torch.Tensor, + sampling_locations: torch.Tensor, + attention_weights: torch.Tensor, + ): + return g.op( + "npu::MultiScaleDeformableAttn", + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) + multi_scale_deformable_attn = MultiScaleDeformableAttnFunction.apply diff --git 
a/mx_driving/detection/ops/nms3d_normal.py b/mx_driving/ops/nms3d_normal.py similarity index 90% rename from mx_driving/detection/ops/nms3d_normal.py rename to mx_driving/ops/nms3d_normal.py index c6b297cc9be74e518d0e057b7807b79e91dc6da5..d1751c07848f7faee693654eb28a29ccfda5cc56 100644 --- a/mx_driving/detection/ops/nms3d_normal.py +++ b/mx_driving/ops/nms3d_normal.py @@ -6,10 +6,11 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + import torch import torch_npu from torch.autograd import Function -from torch.nn import Module + import mx_driving._C @@ -17,11 +18,12 @@ class AdsNms3dNormalFunction(Function): @staticmethod def forward(ctx, boxes, scores, iou_threshold: float): if boxes.shape[1] != 7: - raise 'Input boxes shape should be (N, 7)' + raise "Input boxes shape should be (N, 7)" order = scores.sort(0, descending=True)[1] boxes = boxes[order].contiguous() keep, num_out = mx_driving._C.nms3d_normal(boxes, iou_threshold) return order[keep[:num_out].long()].contiguous() + npu_nms3d_normal = AdsNms3dNormalFunction.apply diff --git a/mx_driving/fused/ops/npu_add_relu.py b/mx_driving/ops/npu_add_relu.py similarity index 88% rename from mx_driving/fused/ops/npu_add_relu.py rename to mx_driving/ops/npu_add_relu.py index 62366a79b1b05917054bfeb2578798fb81729a7f..dbb66145ac93761dc7d7c51c7c5d5eabf68f9e29 100644 --- a/mx_driving/fused/ops/npu_add_relu.py +++ b/mx_driving/ops/npu_add_relu.py @@ -5,13 +5,13 @@ Modification by: Huawei Developers Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU -""" +""" + import torch +import torch.nn.functional as F +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu -import torch.nn.functional as F import mx_driving._C @@ -27,8 +27,9 @@ class AddReluFunction(Function): @staticmethod def backward(ctx, grad_output): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors result = mx_driving._C.npu_add_relu_grad(x, grad_output) return result, result -npu_add_relu = AddReluFunction.apply \ No newline at end of file + +npu_add_relu = AddReluFunction.apply diff --git a/mx_driving/fused/ops/npu_deformable_aggregation.py b/mx_driving/ops/npu_deformable_aggregation.py similarity index 87% rename from mx_driving/fused/ops/npu_deformable_aggregation.py rename to mx_driving/ops/npu_deformable_aggregation.py index d6076fb4876ff68403f36ec7bb9384b67d0b9bcc..46de6fb9832f782022c299f95cccba97487d510b 100644 --- a/mx_driving/fused/ops/npu_deformable_aggregation.py +++ b/mx_driving/ops/npu_deformable_aggregation.py @@ -1,9 +1,8 @@ -import torch import numpy as np +import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -12,13 +11,13 @@ class AdsDeformableAggregation(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def forward( - ctx, - mc_ms_feat: torch.Tensor, - spatial_shape: torch.Tensor, - scale_start_index: torch.Tensor, - sampling_location: torch.Tensor, - weights: torch.Tensor): - + ctx, + mc_ms_feat: torch.Tensor, + spatial_shape: torch.Tensor, + scale_start_index: torch.Tensor, + sampling_location: torch.Tensor, + weights: torch.Tensor, + ): mc_ms_feat = mc_ms_feat.contiguous().float() spatial_shape = spatial_shape.contiguous().int() @@ -41,8 +40,8 @@ class AdsDeformableAggregation(Function): weights, ) return output - - @staticmethod + + @staticmethod def backward(ctx, grad_output): ( mc_ms_feat, @@ 
-60,7 +59,7 @@ class AdsDeformableAggregation(Function): grad_mc_ms_feat = torch.zeros_like(mc_ms_feat) grad_sampling_location = torch.zeros_like(sampling_location) grad_weights = torch.zeros_like(weights) - grad_mc_ms_feat, grad_sampling_location, grad_weights = mx_driving._C.npu_deformable_aggregation_grad( + grad_mc_ms_feat, grad_sampling_location, grad_weights = mx_driving._C.npu_deformable_aggregation_backward( mc_ms_feat, spatial_shape, scale_start_index, @@ -80,4 +79,5 @@ class AdsDeformableAggregation(Function): grad_weights, ) + npu_deformable_aggregation = AdsDeformableAggregation.apply diff --git a/mx_driving/point/ops/npu_dynamic_scatter.py b/mx_driving/ops/npu_dynamic_scatter.py similarity index 67% rename from mx_driving/point/ops/npu_dynamic_scatter.py rename to mx_driving/ops/npu_dynamic_scatter.py index 81ae7ff6f6722becaa4e766752137eb322a54c70..76fddcfcaca1f6d2ca95a4369ea52d3970f1b517 100644 --- a/mx_driving/point/ops/npu_dynamic_scatter.py +++ b/mx_driving/ops/npu_dynamic_scatter.py @@ -6,29 +6,33 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + from typing import Any, Optional, Tuple import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C class DynamicScatterFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx: Any, feats: torch.Tensor, coors: torch.Tensor, - reduce_type: str = 'max') -> Tuple[torch.Tensor, torch.Tensor]: - if reduce_type not in ('max', 'sum', 'mean'): + def forward( + ctx: Any, feats: torch.Tensor, coors: torch.Tensor, reduce_type: str = "max" + ) -> Tuple[torch.Tensor, torch.Tensor]: + if reduce_type not in ("max", "sum", "mean"): raise ValueError("reduce_type should be 'max', 'sum' or 'mean', but now is %s." 
% reduce_type) voxel_idx = mx_driving._C.point_to_voxel(coors, [], [], "XYZ") - num_voxels, uniqued_voxel_idx, prefix_sum_point_per_voxel, argsort_coor, _ = mx_driving._C.unique_voxel(voxel_idx) + num_voxels, uniqued_voxel_idx, prefix_sum_point_per_voxel, argsort_coor, _ = mx_driving._C.unique_voxel( + voxel_idx + ) voxel_coors = mx_driving._C.voxel_to_point(uniqued_voxel_idx, [], [], "XYZ") - voxel_feats, compare_mask = mx_driving._C.npu_dynamic_scatter(feats, coors, prefix_sum_point_per_voxel, - argsort_coor, num_voxels, reduce_type) + voxel_feats, compare_mask = mx_driving._C.npu_dynamic_scatter( + feats, coors, prefix_sum_point_per_voxel, argsort_coor, num_voxels, reduce_type + ) ctx.reduce_type = reduce_type ctx.feats_shape = feats.shape @@ -39,13 +43,17 @@ class DynamicScatterFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments # 'pylint: disable=too-many-return-arguments,huawei-too-many-return-arguments - def backward(ctx: Any, - grad_voxel_feats: torch.Tensor, - grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple: + def backward(ctx: Any, grad_voxel_feats: torch.Tensor, grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple: (prefix_sum_point_per_voxel, argsort_coor, compare_mask) = ctx.saved_tensors grad_point_feats = torch.zeros(ctx.feats_shape, dtype=grad_voxel_feats.dtype, device=grad_voxel_feats.device) - mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum_point_per_voxel, - argsort_coor, compare_mask, ctx.reduce_type) + mx_driving._C.npu_dynamic_scatter_grad( + grad_point_feats, + grad_voxel_feats.contiguous(), + prefix_sum_point_per_voxel, + argsort_coor, + compare_mask, + ctx.reduce_type, + ) return grad_point_feats, None, None diff --git a/mx_driving/fused/ops/npu_geometric_kernel_attention_func.py b/mx_driving/ops/npu_geometric_kernel_attention.py similarity index 87% rename from mx_driving/fused/ops/npu_geometric_kernel_attention_func.py rename to mx_driving/ops/npu_geometric_kernel_attention.py index 1819053207777bd65ce56c4bc4f3480de3aacb45..82663f11aaba9ac4e94d5e51a4d3170848c59b0f 100644 --- a/mx_driving/fused/ops/npu_geometric_kernel_attention_func.py +++ b/mx_driving/ops/npu_geometric_kernel_attention.py @@ -9,7 +9,7 @@ class GeometricKernelAttentionFunc(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def forward(ctx, value, spatial_shapes, level_start_index, sampling_locations, attn_weights): - result = mx_driving._C.npu_geometric_kernel_attention_func( + result = mx_driving._C.npu_geometric_kernel_attention( value, spatial_shapes, level_start_index, sampling_locations, attn_weights ) ctx.save_for_backward(value, spatial_shapes, level_start_index, sampling_locations, attn_weights) @@ -24,4 +24,4 @@ class GeometricKernelAttentionFunc(Function): return grad_value, None, None, None, grad_attn_weights -npu_geometric_kernel_attention_func = GeometricKernelAttentionFunc.apply +npu_geometric_kernel_attention = GeometricKernelAttentionFunc.apply diff --git a/mx_driving/fused/ops/npu_max_pool2d.py b/mx_driving/ops/npu_max_pool2d.py similarity index 100% rename from mx_driving/fused/ops/npu_max_pool2d.py rename to mx_driving/ops/npu_max_pool2d.py diff --git a/mx_driving/detection/ops/npu_nms3d.py b/mx_driving/ops/npu_nms3d.py similarity index 100% rename from mx_driving/detection/ops/npu_nms3d.py rename to mx_driving/ops/npu_nms3d.py diff --git a/mx_driving/preprocess/ops/npu_points_in_box.py b/mx_driving/ops/npu_points_in_box.py 
similarity index 89% rename from mx_driving/preprocess/ops/npu_points_in_box.py rename to mx_driving/ops/npu_points_in_box.py index 056df051629a1a69daa0e3bbe31eefad04831f7c..540a5b77224ab79cd66cf911eac001e3335028bd 100644 --- a/mx_driving/preprocess/ops/npu_points_in_box.py +++ b/mx_driving/ops/npu_points_in_box.py @@ -6,11 +6,11 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -25,4 +25,5 @@ class PointsInBoxFunction(Function): def backward(ctx, grad_output): return None -npu_points_in_box = PointsInBoxFunction.apply \ No newline at end of file + +npu_points_in_box = PointsInBoxFunction.apply diff --git a/mx_driving/preprocess/ops/npu_points_in_box_all.py b/mx_driving/ops/npu_points_in_box_all.py similarity index 78% rename from mx_driving/preprocess/ops/npu_points_in_box_all.py rename to mx_driving/ops/npu_points_in_box_all.py index 8f31e175765c512dfd912f7f78efb8216cf152f9..d27ff9095883fa94a6457baeff2a3ba34c8c00a2 100644 --- a/mx_driving/preprocess/ops/npu_points_in_box_all.py +++ b/mx_driving/ops/npu_points_in_box_all.py @@ -6,12 +6,13 @@ Modification date: 2024-07-24 Modification Description: Modification 1. Add support for Ascend NPU """ + import warnings + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -32,5 +33,8 @@ def points_in_boxes_all(boxes, pts): def npu_points_in_box_all(boxes, pts): - warnings.warn("`npu_points_in_box_all` will be deprecated in future. Please use `points_in_boxes_all` instead.", DeprecationWarning) - return PointsInBoxAllFunction.apply(boxes, pts) \ No newline at end of file + warnings.warn( + "`npu_points_in_box_all` will be deprecated in future. Please use `points_in_boxes_all` instead.", + DeprecationWarning, + ) + return PointsInBoxAllFunction.apply(boxes, pts) diff --git a/mx_driving/preprocess/ops/npu_roipoint_pool3d.py b/mx_driving/ops/npu_roipoint_pool3d.py similarity index 59% rename from mx_driving/preprocess/ops/npu_roipoint_pool3d.py rename to mx_driving/ops/npu_roipoint_pool3d.py index eae3c1104c5348d06cb92d37c3af39b87aec7530..116a81979d29b1ae8d631f0324d3bc5059e84492 100644 --- a/mx_driving/preprocess/ops/npu_roipoint_pool3d.py +++ b/mx_driving/ops/npu_roipoint_pool3d.py @@ -6,10 +6,11 @@ Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU """ + import torch -from torch.autograd import Function -from torch.nn import Module import torch_npu +from torch.autograd import Function + import mx_driving._C @@ -17,19 +18,19 @@ class RoipointPool3dFunction(Function): @staticmethod def forward(ctx, num_sampled_points, points, point_features, boxes3d): if num_sampled_points <= 0: - raise Exception('Input num_sampled_points be more than 0') + raise Exception("Input num_sampled_points should be more than 0") if (points.size(0) != point_features.size(0)) or (points.size(0) != boxes3d.size(0)): - raise Exception('Input points/point_features/boxes3d shape should be (B, x, x)') + raise Exception("Input points/point_features/boxes3d shape should be (B, x, x)") if (len(points.shape) != 3) or (points.size(2) != 3): - raise Exception('Input points shape should be (B, N, 3)') + raise Exception("Input points shape should be (B, N, 3)") if (len(point_features.shape) != 3) or (points.size(1) != point_features.size(1)): - raise Exception('Input point_features shape should be (B, N, C)') + raise Exception("Input point_features shape should be (B, N, C)") if (len(boxes3d.shape) != 3) or (boxes3d.size(2) != 7): - raise Exception('Input boxes3d shape should be (B, M, 7)') + raise Exception("Input boxes3d shape should be (B, M, 7)") if (points.dtype != point_features.dtype) or (points.dtype != boxes3d.dtype): - raise Exception('Input points/point_features/boxes3d dtype should be the same.') - if (points.device.type != 'npu') or (point_features.device.type != 'npu') or (boxes3d.device.type != 'npu'): - raise ValueError('The device is not npu!') + raise Exception("Input points/point_features/boxes3d dtype should be the same.") + if (points.device.type != "npu") or (point_features.device.type != "npu") or (boxes3d.device.type != "npu"): + raise ValueError("The device is not npu!") # points: (B, N, 3) input points # point_features: (B, N, C) input point features # boxes3d: (B, M, 7) bounding boxes @@ -42,18 +43,10 @@ class RoipointPool3dFunction(Function): feature_len = point_features.size(2) # pooled_features = points.new_zeros((batch_size, boxes_num, num_sampled_points, 3 + feature_len)) # pooled_empty_flag = points.new_zeros((batch_size, boxes_num), dtype=torch.int) - pooled_features, pooled_empty_flag = \ - mx_driving._C.npu_roipoint_pool3d_forward(num_sampled_points, points, point_features, boxes3d) + pooled_features, pooled_empty_flag = mx_driving._C.npu_roipoint_pool3d_forward( + num_sampled_points, points, point_features, boxes3d + ) return pooled_features, pooled_empty_flag roipoint_pool3d = RoipointPool3dFunction.apply - - -class RoipointPool3d(Module): - def __init__(self, num_sampled_points: int = 512): - super().__init__() - self.num_sampled_points = num_sampled_points - - def forward(self, points, point_features, boxes3d): - return RoipointPool3dFunction.apply(self.num_sampled_points, points, point_features, boxes3d) diff --git a/mx_driving/detection/ops/pixel_group.py b/mx_driving/ops/pixel_group.py similarity index 100% rename from mx_driving/detection/ops/pixel_group.py rename to mx_driving/ops/pixel_group.py diff --git a/mx_driving/detection/ops/roi_align_rotated.py b/mx_driving/ops/roi_align_rotated.py similarity index 65% rename from mx_driving/detection/ops/roi_align_rotated.py rename to mx_driving/ops/roi_align_rotated.py index 3ed1c3eaf50419b5c57a0e6050b522e61624f8e4..e37aacc323d3646591d8a1e2d74883218d6b510f 100644 --- a/mx_driving/detection/ops/roi_align_rotated.py +++ b/mx_driving/ops/roi_align_rotated.py @@ -1,11 +1,12 @@ """ Copyright (c) 
OpenMMLab. All rights reserved. """ + from typing import Any, Optional, Tuple, Union import torch -import torch_npu import torch.nn as nn +import torch_npu from torch.autograd import Function import mx_driving._C @@ -13,8 +14,18 @@ import mx_driving._C class RoIAlignRotatedFunction(Function): @staticmethod - def forward(ctx: Any, feature_map: torch.Tensor, rois: torch.Tensor, spatial_scale: float, - sampling_ratio: int, pooled_height: int, pooled_width: int, aligned: bool = True, clockwise: bool = False) -> torch.Tensor: + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def forward( + ctx: Any, + feature_map: torch.Tensor, + rois: torch.Tensor, + spatial_scale: float, + sampling_ratio: int, + pooled_height: int, + pooled_width: int, + aligned: bool = True, + clockwise: bool = False, + ) -> torch.Tensor: ctx.pooled_height = pooled_height ctx.pooled_width = pooled_width ctx.spatial_scale = spatial_scale @@ -26,7 +37,9 @@ class RoIAlignRotatedFunction(Function): batch_size, num_channels, data_height, data_width = feature_map.size() num_rois = rois.size(0) - output = feature_map.new_zeros(num_rois, ctx.pooled_height, ctx.pooled_width, num_channels).to(feature_map.device) + output = feature_map.new_zeros(num_rois, ctx.pooled_height, ctx.pooled_width, num_channels).to( + feature_map.device + ) mx_driving._C.roi_align_rotated_v2_forward_npu( feature_map, @@ -37,21 +50,31 @@ class RoIAlignRotatedFunction(Function): ctx.pooled_height, ctx.pooled_width, ctx.aligned, - ctx.clockwise) + ctx.clockwise, + ) output = output.transpose(2, 3).transpose(1, 2).contiguous() return output - + @staticmethod + # pylint: disable=too-many-return-values def backward(ctx: Any, grad_output: torch.Tensor): feature_map, rois = ctx.saved_tensors rois_trans = torch.permute(rois, (1, 0)).contiguous() grad_output_trans = torch.permute(grad_output, (0, 2, 3, 1)).contiguous() grad_feature_map = mx_driving._C.npu_roi_align_rotated_grad_v2( - feature_map, rois_trans, grad_output_trans, - ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale, - ctx.sampling_ratio, ctx.aligned, ctx.clockwise) + feature_map, + rois_trans, + grad_output_trans, + ctx.pooled_height, + ctx.pooled_width, + ctx.spatial_scale, + ctx.sampling_ratio, + ctx.aligned, + ctx.clockwise, + ) grad_feature_map = grad_feature_map.permute(0, 3, 1, 2).contiguous() - + return grad_feature_map, None, None, None, None, None, None, None -roi_align_rotated = RoIAlignRotatedFunction.apply \ No newline at end of file + +roi_align_rotated = RoIAlignRotatedFunction.apply diff --git a/mx_driving/detection/ops/roiaware_pool3d.py b/mx_driving/ops/roiaware_pool3d.py similarity index 56% rename from mx_driving/detection/ops/roiaware_pool3d.py rename to mx_driving/ops/roiaware_pool3d.py index 7aa0279bc8f21547531166101de416795934b059..d355f4f13a1cc53c759d085a4fe97106325e858c 100644 --- a/mx_driving/detection/ops/roiaware_pool3d.py +++ b/mx_driving/ops/roiaware_pool3d.py @@ -6,11 +6,13 @@ Modification date: 2024-10-16 Modification Description: Modification 1. 
Add support for Ascend NPU """ + from typing import Any, Tuple, Union + import torch import torch_npu from torch.autograd import Function -from torch.nn import Module + import mx_driving._C @@ -23,48 +25,50 @@ def is_tuple_of(input_tuple, expected_type=int): class RoIAwarePool3dFunction(Function): @staticmethod - def forward(ctx: Any, rois: torch.Tensor, pts: torch.Tensor, pts_feature: torch.Tensor, - out_size: Union[int, tuple], max_pts_per_voxel: int, mode: int): + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def forward( + ctx: Any, + rois: torch.Tensor, + pts: torch.Tensor, + pts_feature: torch.Tensor, + out_size: Union[int, tuple], + max_pts_per_voxel: int, + mode: int, + ): if isinstance(out_size, int): out_x = out_y = out_z = out_size - elif (len(out_size) == 3 or is_tuple_of(out_size, int)): + elif len(out_size) == 3 or is_tuple_of(out_size, int): out_x, out_y, out_z = out_size else: raise Exception("outsize attr Error!\n") - + num_rois = rois.shape[0] num_channels = pts_feature.shape[-1] num_pts = pts.shape[0] - pooled_features = pts_feature.new_zeros( - (num_rois, out_x, out_y, out_z, num_channels)) - argmax = pts_feature.new_zeros( - (num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int32) - pts_idx_of_voxels = pts_feature.new_zeros( - (num_rois, out_x, out_y, out_z, max_pts_per_voxel), dtype=torch.int32) - + pooled_features = pts_feature.new_zeros((num_rois, out_x, out_y, out_z, num_channels)) + argmax = pts_feature.new_zeros((num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int32) + pts_idx_of_voxels = pts_feature.new_zeros((num_rois, out_x, out_y, out_z, max_pts_per_voxel), dtype=torch.int32) + mx_driving._C.npu_roiaware_pool3d_forward( - rois, - pts, - pts_feature, - argmax, - pts_idx_of_voxels, - pooled_features, - mode) - + rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features, mode + ) + ctx.save_for_backward(pts_idx_of_voxels, argmax, mode, num_pts, num_channels) - + return pooled_features - + @staticmethod def backward(ctx: Any, grad_out: torch.Tensor): ret = ctx.roiaware_pool3d_for_backward pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret # backward - grad_in = mx_driving._C.roiaware_pool3d_grad(pts_idx_of_voxels, argmax, - grad_out.contiguous(), num_pts, pool_method=mode) + grad_in = mx_driving._C.roiaware_pool3d_grad( + pts_idx_of_voxels, argmax, grad_out.contiguous(), num_pts, pool_method=mode + ) return None, None, grad_in, None, None, None - -roiaware_pool3d = RoIAwarePool3dFunction.apply \ No newline at end of file + + +roiaware_pool3d = RoIAwarePool3dFunction.apply diff --git a/mx_driving/detection/ops/rotated_iou.py b/mx_driving/ops/rotated_iou.py similarity index 100% rename from mx_driving/detection/ops/rotated_iou.py rename to mx_driving/ops/rotated_iou.py diff --git a/mx_driving/detection/ops/rotated_overlaps.py b/mx_driving/ops/rotated_overlaps.py similarity index 100% rename from mx_driving/detection/ops/rotated_overlaps.py rename to mx_driving/ops/rotated_overlaps.py diff --git a/mx_driving/common/ops/scatter_max.py b/mx_driving/ops/scatter_max.py similarity index 77% rename from mx_driving/common/ops/scatter_max.py rename to mx_driving/ops/scatter_max.py index b30c6139f61ef9edb952fde48fbacfd7c2088050..5a0e49eb0f7982011389a85d86af348c260c463c 100644 --- a/mx_driving/common/ops/scatter_max.py +++ b/mx_driving/ops/scatter_max.py @@ -6,11 +6,11 @@ Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU """ + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -28,13 +28,21 @@ class ScatterMaxFunction(Function): device = argmax.device grad_updates_index0 = argmax.unsqueeze(-1) - grad_updates_index1 = torch.tile(torch.arange(0, argmax.shape[1]), argmax.shape[0:1:1]).reshape(argmax.shape).unsqueeze(-1).to(device) + grad_updates_index1 = ( + torch.tile(torch.arange(0, argmax.shape[1]), argmax.shape[0:1:1]) + .reshape(argmax.shape) + .unsqueeze(-1) + .to(device) + ) grad_updates_indices = torch.concat((grad_updates_index0, grad_updates_index1), -1).to(device) - grad_updates_indices_uss = grad_updates_indices[..., 0] * grad_updates_indices.shape[1] + grad_updates_indices[..., 1] + grad_updates_indices_uss = ( + grad_updates_indices[..., 0] * grad_updates_indices.shape[1] + grad_updates_indices[..., 1] + ) num_segments = torch.tensor(updates.shape[0] * updates.shape[1]).to(device) grad = mx_driving._C.npu_scatter_max_backward(grad_output, grad_updates_indices_uss, num_segments) return grad.reshape(updates.shape), None, None + scatter_max = ScatterMaxFunction.apply diff --git a/mx_driving/common/ops/scatter_mean.py b/mx_driving/ops/scatter_mean.py similarity index 90% rename from mx_driving/common/ops/scatter_mean.py rename to mx_driving/ops/scatter_mean.py index 7e7e619b3283171c6169aa4ca01349fb55d59980..06e0c8a4fbda2394b0e53e1693447e8c89c98db1 100644 --- a/mx_driving/common/ops/scatter_mean.py +++ b/mx_driving/ops/scatter_mean.py @@ -1,8 +1,7 @@ import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C @@ -22,4 +21,5 @@ class ScatterMeanFunction(Function): result = mx_driving._C.npu_scatter_mean_grad(grad_out, index, count, dim) return result, None, None, None, None -scatter_mean = ScatterMeanFunction.apply \ No newline at end of file + +scatter_mean = ScatterMeanFunction.apply diff --git a/mx_driving/common/ops/sort_pairs.py b/mx_driving/ops/sort_pairs.py similarity index 100% rename from mx_driving/common/ops/sort_pairs.py rename to mx_driving/ops/sort_pairs.py diff --git a/mx_driving/spconv/ops/sparse_functional.py b/mx_driving/ops/sparse_functional.py similarity index 68% rename from mx_driving/spconv/ops/sparse_functional.py rename to mx_driving/ops/sparse_functional.py index ee4db416da3f1ddb4943ace4a023d21b3384c04b..a994ab4151ddd2173df784bff051953e7128c625 100644 --- a/mx_driving/spconv/ops/sparse_functional.py +++ b/mx_driving/ops/sparse_functional.py @@ -15,28 +15,39 @@ from typing import Any -import torch import numpy as np +import torch from torch.autograd import Function from torch.autograd.function import once_differentiable + import mx_driving._C -from . 
import sparse_ops as ops class SparseConvFunction(Function): - @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx: Any, features, indices, weight, out_spatial_shape, - out_channels, batch_size, - kernel_size, stride, padding, dilation, - groups, bias) -> torch.Tensor: + def forward( + ctx: Any, + features, + indices, + weight, + out_spatial_shape, + out_channels, + batch_size, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + ) -> torch.Tensor: device = features.device weight = weight.data # calculate the index pair - outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_conv3d(indices, kernel_size, stride, padding, - out_channels, out_spatial_shape, batch_size) + outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_conv3d( + indices, kernel_size, stride, padding, out_channels, out_spatial_shape, batch_size + ) # sort and nonezero to_insert = torch.tensor(-1).to(device) sorted_idx, sorted_idx_to_former_indices = torch.sort(ouidx_offset.view(torch.float32)) @@ -45,8 +56,9 @@ class SparseConvFunction(Function): sub_result = new_sorted_idx - new_sorted_idx_2 unique_indices_offset = torch.nonzero(sub_result != 0) # index_put and matmul - out_features, outidx = mx_driving._C.multi_to_sparse_v2(features, weight, unique_indices_offset.int(), - sorted_idx_to_former_indices.int(), outidx_pair.int()) + out_features, outidx = mx_driving._C.multi_to_sparse_v2( + features, weight, unique_indices_offset.int(), sorted_idx_to_former_indices.int(), outidx_pair.int() + ) outidx, outidx_ = torch.chunk(outidx, 2, dim=1) ctx.save_for_backward(features, weight, sorted_idx_to_former_indices.int(), unique_indices_offset.int()) @@ -55,12 +67,11 @@ class SparseConvFunction(Function): @staticmethod @once_differentiable # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple: + def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx=None) -> tuple: features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors - feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset, - sorted_idx_to_former_indices, - features, weight, grad_out_features) - + feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad( + unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features + ) return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None @@ -69,16 +80,38 @@ class SparseInverseConvFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx: Any, features, indices, weight, out_spatial_shape, - out_channels, batch_size, - kernel_size, stride, padding, dilation, output_padding, - groups, bias) -> torch.Tensor: + def forward( + ctx: Any, + features, + indices, + weight, + out_spatial_shape, + out_channels, + batch_size, + kernel_size, + stride, + padding, + dilation, + output_padding, + groups, + bias, + ) -> torch.Tensor: device = features.device weight = weight.data # calculate the index pair - out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_inverse_conv3d(features, indices, weight, - kernel_size, stride, padding, dilation, output_padding, - out_channels, out_spatial_shape, batch_size) + out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_inverse_conv3d( + features, + indices, + weight, + kernel_size, + stride, + padding, + dilation, + output_padding, + 
out_channels, + out_spatial_shape, + batch_size, + ) # sort and nonezero to_insert = torch.tensor(-1).to(device) sorted_idx, sorted_idx_to_former_indices = torch.sort(ouidx_offset.view(torch.float32)) @@ -87,8 +120,9 @@ class SparseInverseConvFunction(Function): sub_result = new_sorted_idx - new_sorted_idx_2 unique_indices_offset = torch.nonzero(sub_result != 0) # matmul - out_features, outidx = mx_driving._C.multi_to_sparse(out_features, unique_indices_offset.int(), - sorted_idx_to_former_indices.int(), outidx_pair.int()) + out_features, outidx = mx_driving._C.multi_to_sparse( + out_features, unique_indices_offset.int(), sorted_idx_to_former_indices.int(), outidx_pair.int() + ) outidx, outidx_ = torch.chunk(outidx, 2, dim=1) ctx.save_for_backward(features, weight, sorted_idx_to_former_indices.int(), unique_indices_offset.int()) @@ -97,11 +131,11 @@ class SparseInverseConvFunction(Function): @staticmethod @once_differentiable # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple: + def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx=None) -> tuple: features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors - feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset, - sorted_idx_to_former_indices, - features, weight, grad_out_features) + feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad( + unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features + ) return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None, None @@ -109,21 +143,36 @@ class SubMConvFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx: Any, features, indices, weight, out_spatial_shape, - out_channels, batch_size, - kernel_size, stride, padding, dilation, - groups, bias) -> torch.Tensor: + def forward( + ctx: Any, + features, + indices, + weight, + out_spatial_shape, + out_channels, + batch_size, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + ) -> torch.Tensor: device = features.device weight = weight.data # calculate the index pair indices_long = indices.long() - flatten_indices = indices_long[:, 0] * out_spatial_shape[0] * out_spatial_shape[1] * out_spatial_shape[2] + \ - indices_long[:, 1] * out_spatial_shape[1] * out_spatial_shape[2] + indices_long[:, 2] * out_spatial_shape[2] + indices_long[:, 3] + flatten_indices = ( + indices_long[:, 0] * out_spatial_shape[0] * out_spatial_shape[1] * out_spatial_shape[2] + + indices_long[:, 1] * out_spatial_shape[1] * out_spatial_shape[2] + + indices_long[:, 2] * out_spatial_shape[2] + + indices_long[:, 3] + ) temp, ordered_indices = mx_driving._C.npu_prepare_subm_conv3d(flatten_indices, out_spatial_shape, batch_size) temp[flatten_indices] = ordered_indices - output_iml2col, outidx_pair, ouidx_offset = mx_driving._C.npu_subm_sparse_conv3d(features, indices, weight, - kernel_size, out_channels, - out_spatial_shape, batch_size, temp) + output_iml2col, outidx_pair, ouidx_offset = mx_driving._C.npu_subm_sparse_conv3d( + features, indices, weight, kernel_size, out_channels, out_spatial_shape, batch_size, temp + ) weight_flatten = weight.view(kernel_size[0] * kernel_size[1] * kernel_size[2] * features.shape[1], out_channels) output_iml2col = output_iml2col.view(features.shape[0], -1) out_features = output_iml2col @ weight_flatten @@ -133,12 +182,14 @@ class 
SubMConvFunction(Function): @staticmethod @once_differentiable # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple: + def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx=None) -> tuple: features, weight, output_iml2col, ouidx_offset = ctx.saved_tensors weight_grad = output_iml2col.T @ grad_out_features weight_shape = weight.shape kernel_num = weight_shape[0] * weight_shape[1] * weight_shape[2] - weight_grad = weight_grad.view(weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3], weight_shape[4]) + weight_grad = weight_grad.view( + weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3], weight_shape[4] + ) weight = weight.view(kernel_num * weight_shape[3], weight_shape[4]) feature_grad_iml2col = grad_out_features @ (weight.T) feature_grad_iml2col = feature_grad_iml2col.view(features.shape[0], kernel_num, features.shape[1]) @@ -151,6 +202,7 @@ class SubMConvFunction(Function): feature_grad.index_put_((ouidx_offset,), feature_grad_iml2col, True) return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None + indice_conv = SparseConvFunction.apply indice_inverse_conv = SparseInverseConvFunction.apply -indice_subm_conv = SubMConvFunction.apply \ No newline at end of file +indice_subm_conv = SubMConvFunction.apply diff --git a/mx_driving/common/ops/three_interpolate.py b/mx_driving/ops/three_interpolate.py similarity index 92% rename from mx_driving/common/ops/three_interpolate.py rename to mx_driving/ops/three_interpolate.py index ed237c10166394d4ecd89d9695a07dd92ba62dd0..a0fdcdcb7536749bed1dfa1274108e7a12477094 100644 --- a/mx_driving/common/ops/three_interpolate.py +++ b/mx_driving/ops/three_interpolate.py @@ -10,19 +10,17 @@ Modification 1. Add support for Ascend NPU from typing import Any, Tuple import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C class ThreeInterpolateFunction(Function): @staticmethod - def forward(ctx: Any, features: torch.Tensor, indices: torch.Tensor, - weight: torch.Tensor) -> torch.Tensor: - + def forward(ctx: Any, features: torch.Tensor, indices: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + b, c, m = features.size() n = indices.size(1) ctx.three_interpolate_for_backward = (indices, weight, m) @@ -34,7 +32,7 @@ class ThreeInterpolateFunction(Function): @staticmethod def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - + b, c, n = grad_out.size() idx, weight, m = ctx.three_interpolate_for_backward @@ -46,7 +44,8 @@ class ThreeInterpolateFunction(Function): if grad_out_dtype == torch.half: grad_features = grad_features.to(torch.half) - + return grad_features, None, None + three_interpolate = ThreeInterpolateFunction.apply diff --git a/mx_driving/common/ops/threeNN.py b/mx_driving/ops/three_nn.py similarity index 92% rename from mx_driving/common/ops/threeNN.py rename to mx_driving/ops/three_nn.py index d259e5d07e76a310ffbea40db5ee90b08126a350..26cfafe52a31680ca086a7ed46f0e7ce721200de 100644 --- a/mx_driving/common/ops/threeNN.py +++ b/mx_driving/ops/three_nn.py @@ -6,16 +6,17 @@ Modification date: 2024-06-04 Modification Description: Modification 1. 
Add support for Ascend NPU """ + from typing import Any, Tuple + import torch +import torch_npu from torch.autograd import Function -from torch.nn import Module -import torch_npu import mx_driving._C -class AdsThreeNN(Function): +class ThreeNN(Function): @staticmethod def forward(ctx: Any, target: torch.Tensor, source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # target is center_xyz @@ -34,4 +35,4 @@ class AdsThreeNN(Function): return dist2, idx.int() -three_nn = AdsThreeNN.apply +three_nn = ThreeNN.apply diff --git a/mx_driving/point/ops/voxel_pooling_train.py b/mx_driving/ops/voxel_pooling_train.py similarity index 85% rename from mx_driving/point/ops/voxel_pooling_train.py rename to mx_driving/ops/voxel_pooling_train.py index 12c61c9d6ae876c4af3336a3de0f27b54574d914..41efc71a8b361ce2b14d9c95bd5669e55d4226bc 100644 --- a/mx_driving/point/ops/voxel_pooling_train.py +++ b/mx_driving/ops/voxel_pooling_train.py @@ -6,9 +6,10 @@ Modification date: 2024-06-04 Modification Description: Modification 1. Add support for Ascend NPU """ + import torch from torch.autograd import Function -from torch.nn import Module + import mx_driving._C @@ -18,12 +19,11 @@ class AdsVoxelPoolingFunction(Function): grad_input_features = torch.zeros_like(input_features) geom_xyz = geom_xyz.reshape(geom_xyz.shape[0], -1, geom_xyz.shape[-1]) input_features = input_features.reshape(geom_xyz.shape[0], -1, input_features.shape[-1]) - + batch_size = input_features.shape[0] num_points = input_features.shape[1] num_channels = input_features.shape[2] - output_features = input_features.new_zeros(batch_size, voxel_num[1], - voxel_num[0], num_channels) + output_features = input_features.new_zeros(batch_size, voxel_num[1], voxel_num[0], num_channels) pos_memo = geom_xyz.new_ones(batch_size, num_points, 3) * -1 pos, result = mx_driving._C.voxel_pooling_train( input_features, @@ -52,15 +52,10 @@ class AdsVoxelPoolingFunction(Function): W = grad_output_features.shape[3] result = mx_driving._C.voxel_pool_train_backward( - grad_output_features, - pos_memo, - batch_size, - num_points, - num_channels, - H, - W + grad_output_features, pos_memo, batch_size, num_points, num_channels, H, W ) grad_input_features = result.reshape(grad_input_features_shape) return None, grad_input_features, None -npu_voxel_pooling_train = AdsVoxelPoolingFunction.apply \ No newline at end of file + +npu_voxel_pooling_train = AdsVoxelPoolingFunction.apply diff --git a/mx_driving/point/ops/voxelization.py b/mx_driving/ops/voxelization.py similarity index 64% rename from mx_driving/point/ops/voxelization.py rename to mx_driving/ops/voxelization.py index 777133cb3ee8ad81b9a2a4604f879f5c42589c00..cd62fc782f472b614ce841420c8d5b0c67605783 100644 --- a/mx_driving/point/ops/voxelization.py +++ b/mx_driving/ops/voxelization.py @@ -4,8 +4,6 @@ Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. 
import torch from torch.autograd import Function -from torch.nn import Module -from torch.nn.modules.utils import _pair import mx_driving._C @@ -48,21 +46,3 @@ class _Voxelization(Function): voxelization = _Voxelization.apply - - -class Voxelization(torch.nn.Module): - def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels=20000, deterministic=True): - super().__init__() - - self.voxel_size = voxel_size - self.point_cloud_range = point_cloud_range - self.max_num_points = max_num_points - self.max_voxels = max_voxels - self.max_voxels = max_voxels if isinstance(max_voxels, tuple) else _pair(max_voxels) - self.deterministic = deterministic - - def forward(self, points: torch.Tensor): - max_voxels = self.max_voxels[0] if self.training else self.max_voxels[1] - return voxelization( - points, self.voxel_size, self.point_cloud_range, self.max_num_points, max_voxels, self.deterministic - ) diff --git a/mx_driving/point/__init__.py b/mx_driving/point.py similarity index 49% rename from mx_driving/point/__init__.py rename to mx_driving/point.py index 6aa7d099bac32bc1b6ca77cc39f73155ff5bcc31..3f79fd6526c834302d50565eb74157652cbbe88e 100644 --- a/mx_driving/point/__init__.py +++ b/mx_driving/point.py @@ -1,10 +1,16 @@ -from .ops.group_points import npu_group_points -from .ops.group_points import group_points +import warnings + + +warnings.warn( + "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.", DeprecationWarning +) +from .modules.voxelization import Voxelization from .ops.bev_pool import bev_pool from .ops.bev_pool_v2 import bev_pool_v2 -from .ops.furthest_point_sampling_with_dist import furthest_point_sample_with_dist +from .ops.bev_pool_v3 import bev_pool_v3 from .ops.furthest_point_sampling import npu_furthest_point_sampling +from .ops.furthest_point_sampling_with_dist import furthest_point_sample_with_dist +from .ops.group_points import group_points, npu_group_points from .ops.npu_dynamic_scatter import npu_dynamic_scatter -from .ops.voxelization import voxelization, Voxelization from .ops.voxel_pooling_train import npu_voxel_pooling_train -from .ops.bev_pool_v3 import bev_pool_v3 \ No newline at end of file +from .ops.voxelization import voxelization diff --git a/mx_driving/point/CMakeLists.txt b/mx_driving/point/CMakeLists.txt deleted file mode 100644 index 63ebf65165f490b26ec6fbb6cb034f1e8d947c59..0000000000000000000000000000000000000000 --- a/mx_driving/point/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/point/components/README.md b/mx_driving/point/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/point/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. 
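The relocated `three_nn` op above pairs each target (query) point with its three nearest source points, returning squared distances together with int32 indices. A plain PyTorch sketch of that contract, assuming the usual point-cloud layout of `(B, N, 3)` targets and `(B, M, 3)` sources (shapes are illustrative, not taken from this diff):

```python
import torch


def three_nn_reference(target: torch.Tensor, source: torch.Tensor):
    """CPU reference for the (dist2, idx.int()) pair returned by ThreeNN.apply."""
    dist2 = torch.cdist(target, source).pow(2)      # (B, N, M) squared distances
    d2, idx = dist2.topk(3, dim=-1, largest=False)  # three closest source points per target
    return d2, idx.int()


d2, idx = three_nn_reference(torch.rand(2, 5, 3), torch.rand(2, 8, 3))
assert d2.shape == (2, 5, 3) and idx.dtype == torch.int32
```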
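The new top-level `mx_driving/point.py` shown above keeps the old namespace importable: it emits a `DeprecationWarning` once at import time and then re-exports every op from its new flat location. A self-contained sketch of that warn-then-re-export pattern (the wrapper function is illustrative, not part of the diff):

```python
import warnings


def _load_deprecated_namespace() -> None:
    # Mirrors what mx_driving/point.py now does on import: warn first,
    # then re-export symbols from their new locations.
    warnings.warn(
        "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.",
        DeprecationWarning,
    )


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _load_deprecated_namespace()

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```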
\ No newline at end of file diff --git a/mx_driving/point/ops/__init__.py b/mx_driving/point/ops/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/point/ops/csrc/CMakeLists.txt b/mx_driving/point/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/point/ops/csrc/OWNERS b/mx_driving/point/ops/csrc/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/csrc/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/point/ops/csrc/README.md b/mx_driving/point/ops/csrc/README.md deleted file mode 100644 index 0bbe4f394307b9d81004b5bd923e630eabd9a509..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/csrc/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some cpp source files, which provide code for adaptation of ascend kernels. It provide links for kernels and cpp interfaces. \ No newline at end of file diff --git a/mx_driving/point/ops/csrc/pybind.cpp b/mx_driving/point/ops/csrc/pybind.cpp deleted file mode 100644 index e5b0c25d94faa44919839030d7e9b748eb6874a3..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/csrc/pybind.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include "csrc/pybind.h" - -#include - -#include "functions.h" - -void init_point(pybind11::module& m) -{ - // group_points - m.def("group_points", &group_points); - m.def("group_points_backward", &group_points_backward); - - // vec_pool - m.def("vec_pool_backward", &vec_pool_backward); - - m.def("point_to_voxel", &point_to_voxel); - - m.def("voxel_to_point", &voxel_to_point); - - m.def("unique_voxel", &unique_voxel); - - m.def("hard_voxelize", &hard_voxelize); - - // bev_pool - m.def("npu_bev_pool", &npu_bev_pool, "npu_bev_pool NPU version"); - m.def("npu_bev_pool_backward", &npu_bev_pool_backward, "npu_bev_pool_backward NPU version"); - m.def("npu_bev_pool_v2", &npu_bev_pool_v2, "npu_bev_pool_v2 NPU version"); - m.def("npu_bev_pool_v2_backward", &npu_bev_pool_v2_backward, "npu_bev_pool_v2_backward NPU version"); - m.def("npu_bev_pool_v3", &npu_bev_pool_v3, "npu_bev_pool_v3 NPU version"); - m.def("npu_bev_pool_v3_backward", &npu_bev_pool_v3_backward, "npu_bev_pool_v3_backward NPU version"); - - // furthest_points_sampling_with_dist - m.def("furthest_point_sampling_with_dist", &furthest_point_sampling_with_dist); - - // npu_dynamic_scatter - m.def("npu_dynamic_scatter", &npu_dynamic_scatter); - m.def("npu_dynamic_scatter_grad", &npu_dynamic_scatter_grad); - - // dyn_voxelization - m.def("dynamic_voxelization", &dynamic_voxelization); - - // npu_furthest_point_sampling - m.def("npu_furthest_point_sampling", &npu_furthest_point_sampling); - - // voxel_pooling - m.def("voxel_pooling_train", &voxel_pooling_train); - m.def("voxel_pool_train_backward", &voxel_pool_train_backward); -} diff --git a/mx_driving/point/ops/kernels/CMakeLists.txt b/mx_driving/point/ops/kernels/CMakeLists.txt deleted file mode 100644 index 
179d9da23345abf75fb87954f266055922527742..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) - add_subdirectory(framework) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/point/ops/kernels/README.md b/mx_driving/point/ops/kernels/README.md deleted file mode 100644 index 214fb0a6d662e806bd7f6bdd1b8962bc1639026e..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some ascend-kernel source files, which are like cuda-kernels and supply some ops that can be run on ascend device. \ No newline at end of file diff --git a/mx_driving/point/ops/kernels/op_host/CMakeLists.txt b/mx_driving/point/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index 7e8c1aa351dc3e9bfa77dd39afa8885c55943c2b..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") - -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/point/ops/kernels/op_host/OWNERS b/mx_driving/point/ops/kernels/op_host/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/point/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/point/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/point/ops/kernels/op_kernel/common.h b/mx_driving/point/ops/kernels/op_kernel/common.h deleted file mode 100644 index 2041af4985be2803dae6afeae4b2c56b59f1df1c..0000000000000000000000000000000000000000 --- a/mx_driving/point/ops/kernels/op_kernel/common.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef COMMON_H_ -#define COMMON_H_ - -#include "kernel_operator.h" - -constexpr int32_t TILING_ALIGN32B_FLAG = 1; -constexpr int32_t TILING_FP32_BIT = 1; -constexpr int32_t TILING_FP16_BIT = 2; -constexpr int32_t TILING_BF16_BIT = 3; - -class TaskIterator { -public: - __aicore__ inline TaskIterator( - int32_t blkIdx, int32_t blkDim, int32_t avgTaskNum, int32_t tailTaskNum, int32_t totalTaskNum) - : blkIdx_(blkIdx), blkDim_(blkDim), totalTaskNum_(totalTaskNum) - { - nextIdx_ = 
blkIdx * avgTaskNum + (blkIdx < tailTaskNum ? blkIdx : tailTaskNum); - endIdx_ = nextIdx_ + avgTaskNum + (blkIdx < tailTaskNum ? 1 : 0); - } - - __aicore__ inline bool HasNext() const - { - return nextIdx_ < endIdx_; - } - - __aicore__ inline int32_t Next() - { - return nextIdx_++; - } - - __aicore__ inline int32_t GetNext() const - { - return nextIdx_; - } - - __aicore__ inline int32_t GetTaskNum() const - { - return totalTaskNum_; - } - -private: - int32_t blkIdx_, blkDim_; - int32_t nextIdx_, endIdx_; - int32_t totalTaskNum_; -}; -#endif // COMMON_H_ \ No newline at end of file diff --git a/mx_driving/preprocess/__init__.py b/mx_driving/preprocess.py similarity index 45% rename from mx_driving/preprocess/__init__.py rename to mx_driving/preprocess.py index db975036570fc6b3f7fed6419cc4bf55701ce45f..520c307946ea5dbdb4417acb526c15b41ceb9e76 100644 --- a/mx_driving/preprocess/__init__.py +++ b/mx_driving/preprocess.py @@ -1,4 +1,9 @@ +import warnings + +warnings.warn( + "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.", DeprecationWarning +) from .ops.npu_points_in_box import npu_points_in_box from .ops.npu_points_in_box_all import npu_points_in_box_all from .ops.npu_points_in_box_all import points_in_boxes_all -from .ops.npu_roipoint_pool3d import RoipointPool3d as RoIPointPool3d \ No newline at end of file +from .modules.roi_point_pool_3d import RoIPointPool3d diff --git a/mx_driving/preprocess/CMakeLists.txt b/mx_driving/preprocess/CMakeLists.txt deleted file mode 100644 index 63ebf65165f490b26ec6fbb6cb034f1e8d947c59..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/preprocess/components/README.md b/mx_driving/preprocess/components/README.md deleted file mode 100644 index f1cf0540a17c9ebd79472f7ebcac5909a1bc078f..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/components/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some pytorch algorithm modules. 
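The `TaskIterator` deleted from `op_kernel/common.h` above assigns each AI core a contiguous slice of the task range: every core receives `avgTaskNum` tasks and the first `tailTaskNum` cores take one extra. The same partitioning arithmetic as a small Python sketch:

```python
def task_range(blk_idx: int, avg: int, tail: int) -> range:
    # Matches TaskIterator's nextIdx_/endIdx_ computation: the first `tail`
    # blocks take avg + 1 tasks, the remaining blocks take avg.
    start = blk_idx * avg + min(blk_idx, tail)
    return range(start, start + avg + (1 if blk_idx < tail else 0))


# 10 tasks over 4 blocks -> [0, 1, 2], [3, 4, 5], [6, 7], [8, 9]
total, blocks = 10, 4
avg, tail = total // blocks, total % blocks
assert [list(task_range(i, avg, tail)) for i in range(blocks)] == [[0, 1, 2], [3, 4, 5], [6, 7], [8, 9]]
```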
\ No newline at end of file diff --git a/mx_driving/preprocess/ops/__init__.py b/mx_driving/preprocess/ops/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/preprocess/ops/csrc/CMakeLists.txt b/mx_driving/preprocess/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/preprocess/ops/csrc/OWNERS b/mx_driving/preprocess/ops/csrc/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/preprocess/ops/csrc/README.md b/mx_driving/preprocess/ops/csrc/README.md deleted file mode 100644 index 8073915fabe1c484db0488c9abc5e09b858c52c8..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/README.md +++ /dev/null @@ -1,6 +0,0 @@ -## Description -The `csrc` lib implements python interface, which use `pybind11` to wrap the C++ code. -There are 3 files you need to focus: -1. `pybind.cpp`: Define the python interface. -2. `functions.cpp`: Define the C++ interface. -3. The file naming in `Pascal` style: The implementation of the C++ interface. \ No newline at end of file diff --git a/mx_driving/preprocess/ops/csrc/functions.h b/mx_driving/preprocess/ops/csrc/functions.h deleted file mode 100644 index e509d755e0b806a5f6c9cbee2f15ee186f0e4d45..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/functions.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#ifndef PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ - -#include -#include - -at::Tensor npu_points_in_box(const at::Tensor& boxes, const at::Tensor& pts); - -at::Tensor npu_points_in_box_all(const at::Tensor& boxes, const at::Tensor& pts); - -std::tuple npu_roipoint_pool3d_forward(const int32_t num_sampled_points, - const at::Tensor& points, const at::Tensor& point_features, const at::Tensor& boxes3d); -#endif // PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/preprocess/ops/csrc/pybind.cpp b/mx_driving/preprocess/ops/csrc/pybind.cpp deleted file mode 100644 index f9bc7205886a1ca8a859c435359a37bde3f9b3bb..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/csrc/pybind.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include -#include "csrc/pybind.h" -#include "functions.h" - -void init_preprocess(pybind11::module& m) -{ - // npu_points_in_box - m.def("npu_points_in_box", &npu_points_in_box); - - // npu_points_in_box_all - m.def("npu_points_in_box_all", &npu_points_in_box_all); - - // npu_roipoint_pool3d_forward - m.def("npu_roipoint_pool3d_forward", &npu_roipoint_pool3d_forward); -} diff --git a/mx_driving/preprocess/ops/kernels/CMakeLists.txt b/mx_driving/preprocess/ops/kernels/CMakeLists.txt deleted file mode 100644 index 179d9da23345abf75fb87954f266055922527742..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) - add_subdirectory(framework) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/preprocess/ops/kernels/README.md b/mx_driving/preprocess/ops/kernels/README.md deleted file mode 100644 index 1e6645553e8d86a84a9833a13610741b59930494..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/README.md +++ /dev/null @@ -1,13 +0,0 @@ -## 算子原型 - - - - - - - - - - - -
-| 算子类型(OpType) | Add |
-| 算子输入 | name | shape | data type | format |
-|  | x | - | float | ND |
-|  | y | - | float | ND |
-| 算子输出 | z | - | float | ND |
-| 核函数名 | add_custom |
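The `ceil_multiple`/`ceil_value` helpers in the removed `op_host/common.h` further up are the usual round-up-division and round-up-to-multiple utilities (with a guard for a zero block size). A quick Python restatement for reference:

```python
def ceil_multiple(num: int, block: int) -> int:
    # Number of blocks needed to cover `num` elements; 0 if block == 0.
    return 0 if block == 0 else (num + block - 1) // block


def ceil_value(num: int, block: int) -> int:
    # `num` rounded up to the next multiple of `block`.
    return ceil_multiple(num, block) * block


assert ceil_multiple(10, 8) == 2
assert ceil_value(10, 8) == 16
assert ceil_value(16, 8) == 16
```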
\ No newline at end of file diff --git a/mx_driving/preprocess/ops/kernels/framework/CMakeLists.txt b/mx_driving/preprocess/ops/kernels/framework/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/preprocess/ops/kernels/op_host/CMakeLists.txt b/mx_driving/preprocess/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index c44b2b0174f28f0144a7c03fc6c40cc5b389c14e..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/preprocess/ops/kernels/op_host/OWNERS b/mx_driving/preprocess/ops/kernels/op_host/OWNERS deleted file mode 100644 index 606fe9ac200dfccf4066f08fc109921e46d1db70..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,7 +0,0 @@ -approvers: -- wangxiaoxin-sherie -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/preprocess/ops/kernels/op_host/common.h b/mx_driving/preprocess/ops/kernels/op_host/common.h deleted file mode 100644 index 4580dff5fd0b206d1b94383f160932c22d1cb8a9..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/op_host/common.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. - */ -#ifndef COMMON_H -#define COMMON_H - -#include "register/op_def_registry.h" -#include "tiling/platform/platform_ascendc.h" -#include "tiling/tiling_api.h" -#include "register/tilingdata_base.h" - -inline uint32_t ceil_multiple(uint32_t num, uint32_t block) -{ - if (block == 0) { - return 0; - } - return (num + block - 1) / block; -} - -inline uint32_t ceil_value(uint32_t num, uint32_t block) -{ - if (block == 0) { - return 0; - } - return ((num + block - 1) / block) * block; -} - -#endif // COMMON_H diff --git a/mx_driving/preprocess/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/preprocess/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/preprocess/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/spconv.py b/mx_driving/spconv.py new file mode 100644 index 0000000000000000000000000000000000000000..248713a204292255a5a0eac58deb596a45fe8f73 --- /dev/null +++ b/mx_driving/spconv.py @@ -0,0 +1,8 @@ +import warnings + + +warnings.warn( + "This package is deprecated and will be removed in future. 
Please use `mx_driving.api` instead.", DeprecationWarning +) +from .modules.sparse_conv import SparseConv3d, SparseInverseConv3d, SubMConv3d +from .modules.sparse_modules import SparseConvTensor, SparseModule, SparseSequential diff --git a/mx_driving/spconv/CMakeLists.txt b/mx_driving/spconv/CMakeLists.txt deleted file mode 100644 index 63ebf65165f490b26ec6fbb6cb034f1e8d947c59..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) - add_subdirectory(ops/kernels) -endif() - -if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) - add_subdirectory(ops/onnx/plugin) -endif() - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) - add_subdirectory(ops/csrc) -endif() diff --git a/mx_driving/spconv/__init__.py b/mx_driving/spconv/__init__.py deleted file mode 100644 index 7435203cfc1b1c542a7ebc1a7d5c3c7b9e3714c7..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .ops.sparse_conv import SubMConv3d -from .ops.sparse_conv import SparseConv3d -from .ops.sparse_conv import SparseInverseConv3d -from .ops.sparse_modules import SparseSequential -from .ops.sparse_modules import SparseConvTensor -from .ops.sparse_modules import SparseModule diff --git a/mx_driving/spconv/ops/__init__.py b/mx_driving/spconv/ops/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mx_driving/spconv/ops/csrc/CMakeLists.txt b/mx_driving/spconv/ops/csrc/CMakeLists.txt deleted file mode 100644 index 4a75d495802c6e765b589b2913da0d5debbb750a..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_CSRC_SRC - ${ASCEND_CSRC_SRC} ${CSRC_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/spconv/ops/csrc/OWNERS b/mx_driving/spconv/ops/csrc/OWNERS deleted file mode 100644 index 6d60158d26b6a9b3c818a73e78f09a6aa3700cf7..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/OWNERS +++ /dev/null @@ -1,8 +0,0 @@ -approvers: -- wangxiaoxin-sherie -- liu_zhi_xu -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/spconv/ops/csrc/README.md b/mx_driving/spconv/ops/csrc/README.md deleted file mode 100644 index 0bbe4f394307b9d81004b5bd923e630eabd9a509..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some cpp source files, which provide code for adaptation of ascend kernels. It provide links for kernels and cpp interfaces. \ No newline at end of file diff --git a/mx_driving/spconv/ops/csrc/functions.h b/mx_driving/spconv/ops/csrc/functions.h deleted file mode 100644 index c1547c91050c634dc3c9bd3f397c6b0ccba7b492..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/functions.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2024, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ -#define PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ - -#include -#include - -std::tuple npu_subm_sparse_conv3d(const at::Tensor& feature, const at::Tensor& indices, - const at::Tensor& weight, - at::IntArrayRef kernel_size, int out_channel, - at::IntArrayRef outSpatialShape, int batch_size, - const at::Tensor& temp); - -std::tuple multi_to_sparse(const at::Tensor& out_features, const at::Tensor& unique_indices_offset, - const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair); - -std::tuple multi_to_sparse_v2(const at::Tensor& features, const at::Tensor& weight, const at::Tensor& unique_indices_offset, - const at::Tensor& sorted_idx_to_former_indices, const at::Tensor& outidx_pair); - -std::tuple npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - int out_channel, at::IntArrayRef outSpatialShape, int batch_size); - -std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, const at::Tensor& indices, const at::Tensor& weight, - at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, - at::IntArrayRef dilation, at::IntArrayRef output_padding, - int out_channel, at::IntArrayRef outSpatialShape, int batch_size); - -std::tuple npu_sparse_conv3d_grad(const at::Tensor& indices_offset, const at::Tensor& former_sorted_indices, - const at::Tensor& feature, const at::Tensor& weight, const at::Tensor& grad); - -std::tuple npu_prepare_subm_conv3d(const at::Tensor& flattenIndices, - at::IntArrayRef outSpatialShape, int batch_size); - -#endif // PERCEPTION_VISION_OPS_CSRC_FUNCTIONS_H_ diff --git a/mx_driving/spconv/ops/csrc/pybind.cpp b/mx_driving/spconv/ops/csrc/pybind.cpp deleted file mode 100644 index 26bfaf7a09a8ab588bb7ad66a551490c0d42bb88..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/csrc/pybind.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include -#include "csrc/pybind.h" -#include "functions.h" - -void init_spconv(pybind11::module &m) -{ - // npu_subm_sparse_conv3d - m.def("npu_subm_sparse_conv3d", &npu_subm_sparse_conv3d); - - // npu_sparse_conv3d - m.def("npu_sparse_conv3d", &npu_sparse_conv3d); - - // npu_sparse_inverse_conv3d - m.def("npu_sparse_inverse_conv3d", &npu_sparse_inverse_conv3d); - - // multi_to_sparse - m.def("multi_to_sparse", &multi_to_sparse); - - // multi_to_sparse_v2 - m.def("multi_to_sparse_v2", &multi_to_sparse_v2); - - // npu_sparse_conv3d_grad - m.def("npu_sparse_conv3d_grad", &npu_sparse_conv3d_grad); - - m.def("npu_prepare_subm_conv3d", &npu_prepare_subm_conv3d); -} diff --git a/mx_driving/spconv/ops/kernels/CMakeLists.txt b/mx_driving/spconv/ops/kernels/CMakeLists.txt deleted file mode 100644 index 179d9da23345abf75fb87954f266055922527742..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) - add_subdirectory(framework) -endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) - add_subdirectory(op_host) -endif() -if(EXISTS 
${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) - add_subdirectory(op_kernel) -endif() -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() diff --git a/mx_driving/spconv/ops/kernels/README.md b/mx_driving/spconv/ops/kernels/README.md deleted file mode 100644 index 214fb0a6d662e806bd7f6bdd1b8962bc1639026e..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Description -+ The folder contains some ascend-kernel source files, which are like cuda-kernels and supply some ops that can be run on ascend device. \ No newline at end of file diff --git a/mx_driving/spconv/ops/kernels/op_host/CMakeLists.txt b/mx_driving/spconv/ops/kernels/op_host/CMakeLists.txt deleted file mode 100644 index 7e8c1aa351dc3e9bfa77dd39afa8885c55943c2b..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/op_host/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_HOST_SRC - ${ASCEND_HOST_SRC} ${HOST_SRC} - CACHE INTERNAL "") - -# add the exclude files for aclnn -set(aclop_exclude - ${aclop_exclude} "" - CACHE INTERNAL "") -file(GLOB ACLNN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.cpp) -file(GLOB ACLNN_INC ${CMAKE_CURRENT_SOURCE_DIR}/aclnn*.h) -set(ACLNN_SRC_CUSTOM - ${ACLNN_SRC_CUSTOM} ${ACLNN_SRC} - CACHE INTERNAL "") -set(ACLNN_INC_CUSTOM - ${ACLNN_INC_CUSTOM} ${ACLNN_INC} - CACHE INTERNAL "") diff --git a/mx_driving/spconv/ops/kernels/op_host/OWNERS b/mx_driving/spconv/ops/kernels/op_host/OWNERS deleted file mode 100644 index 6d60158d26b6a9b3c818a73e78f09a6aa3700cf7..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/op_host/OWNERS +++ /dev/null @@ -1,8 +0,0 @@ -approvers: -- wangxiaoxin-sherie -- liu_zhi_xu -reviewers: -- zhuguodong1 -- captainjing -options: - no_parent_owners: true diff --git a/mx_driving/spconv/ops/kernels/op_kernel/CMakeLists.txt b/mx_driving/spconv/ops/kernels/op_kernel/CMakeLists.txt deleted file mode 100644 index c51870f18c2b530409e6df2c4529ab5a63b32953..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/kernels/op_kernel/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") diff --git a/mx_driving/spconv/ops/sparse_ops.py b/mx_driving/spconv/ops/sparse_ops.py deleted file mode 100644 index 065774559d9b77c2dbf3e899fc6c16755a200aa7..0000000000000000000000000000000000000000 --- a/mx_driving/spconv/ops/sparse_ops.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2024, Huawei Technologies.All rights reserved. -# Copyright 2019 Yan Yan -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -def get_conv_output_size(input_size, kernel_size, stride, padding, dilation): - ndim = len(input_size) - output_size = [] - for i in range(ndim): - size = (input_size[i] + 2 * padding[i] - dilation[i] * - (kernel_size[i] - 1) - 1) // stride[i] + 1 - if kernel_size[i] == -1: - output_size.append(1) - else: - output_size.append(size) - return output_size - - -def get_inverse_conv_output_size(input_size, kernel_size, stride, padding, dilation, output_padding): - ndim = len(input_size) - output_size = [] - for i in range(ndim): - size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + dilation[i] * (kernel_size[i] - 1) + output_padding[i] + 1 - if kernel_size[i] == -1: - output_size.append(1) - else: - output_size.append(size) - return output_size - - -def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation, - output_padding): - ndim = len(input_size) - output_size = [] - for i in range(ndim): - if kernel_size[i] == -1: - raise ValueError("deconv don't support kernel_size < 0") - size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[ - i] + output_padding[i] - output_size.append(size) - return output_size diff --git a/onnx_plugin/CMakeLists.txt b/onnx_plugin/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb625814f17edf9774c4201c36318cf8801341d3 --- /dev/null +++ b/onnx_plugin/CMakeLists.txt @@ -0,0 +1,33 @@ +file(GLOB ONNX_PLUGIN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +if(BUILD_STAGE EQUAL 1) + if(${ENABLE_ONNX}) + if(CANN_PATHS) + if(${ARCH} STREQUAL "aarch64") + protobuf_generate( + PROTO_FILE ${CANN_PATHS}/aarch64-linux/include/proto/ge_onnx.proto + OUT_DIR ${ASCEND_AUTOGEN_PATH}) + else() + protobuf_generate( + PROTO_FILE ${CANN_PATHS}/x86_64-linux/include/proto/ge_onnx.proto + OUT_DIR ${ASCEND_AUTOGEN_PATH}) + endif() + else() + protobuf_generate( + PROTO_FILE ${ASCEND_CANN_PACKAGE_PATH}/include/proto/ge_onnx.proto + OUT_DIR ${ASCEND_AUTOGEN_PATH}) + endif() + + add_library(cust_onnx_parsers SHARED ${ONNX_PLUGIN_SRC}) + target_compile_options( + cust_onnx_parsers + PRIVATE -O2 -Werror -Wno-deprecated-declarations -Dgoogle=ascend_private + "-fno-common" "-fno-strict-aliasing") + target_link_libraries(cust_onnx_parsers PRIVATE intf_pub) + target_include_directories( + cust_onnx_parsers PRIVATE ${PROJECT_SOURCE_DIR}/include + ${ASCEND_AUTOGEN_PATH}) + + install_target(TRG cust_onnx_parsers DST + packages/vendors/${vendor_name}/framework/onnx/) + endif() +endif() diff --git a/mx_driving/fused/ops/onnx/plugin/onnx_multi_scale_deformable_attn.cpp b/onnx_plugin/onnx_multi_scale_deformable_attn.cpp similarity index 100% rename from mx_driving/fused/ops/onnx/plugin/onnx_multi_scale_deformable_attn.cpp rename to onnx_plugin/onnx_multi_scale_deformable_attn.cpp diff --git a/mx_driving/detection/ops/onnx/plugin/onnx_roi_align_rotated.cpp b/onnx_plugin/onnx_roi_align_rotated.cpp similarity index 100% rename from mx_driving/detection/ops/onnx/plugin/onnx_roi_align_rotated.cpp rename to onnx_plugin/onnx_roi_align_rotated.cpp diff --git a/tests/onnx/roi_align_rotated_plugin.py b/tests/onnx/roi_align_rotated_plugin.py index 75b3193a36a658ad8ef55b933a4568cc7d7b991b..da53bdf900d1a65b6db0a31c5ed37eea79a1c9cd 100644 --- a/tests/onnx/roi_align_rotated_plugin.py +++ b/tests/onnx/roi_align_rotated_plugin.py @@ -1,6 +1,6 @@ import os -import onnx -from onnx import helper, TensorProto +import onnx_plugin +from onnx_plugin import helper, TensorProto def roi_align_rotated(): @@ -9,7 +9,7 @@ def 
roi_align_rotated(): output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [3, 48, 2, 2]) - node_def = onnx.helper.make_node('RoiAlignRotatedV2', + node_def = onnx_plugin.helper.make_node('RoiAlignRotatedV2', inputs=['input', 'rois'], outputs=['output'], spatial_scale=1.0, @@ -32,7 +32,7 @@ def roi_align_rotated(): current_path = os.path.abspath(__file__) idx = current_path.rfind('/') current_path = current_path[:idx] - onnx.save(model_def, os.path.join(current_path, "roi_align_rotated.onnx")) + onnx_plugin.save(model_def, os.path.join(current_path, "roi_align_rotated.onnx")) if __name__ == "__main__": roi_align_rotated() \ No newline at end of file diff --git a/tests/torch/test_npu_geometric_kernel_attention_func.py b/tests/torch/test_npu_geometric_kernel_attention_func.py index ba5254f6685b25e4c77a7f099373a0ff5bd0b020..b3f027fddcd16ad8d3debec186add5c641213205 100644 --- a/tests/torch/test_npu_geometric_kernel_attention_func.py +++ b/tests/torch/test_npu_geometric_kernel_attention_func.py @@ -79,10 +79,6 @@ class TestGeometricKernelAttentionFunc(TestCase): self.test_results = self.gen_results() def gen_results(self): - if DEVICE_NAME != "Ascend910B": - self.skipTest( - "OP `MultiScaleDeformableAttnFunction` is only supported on 910B, skipping test data generation!" - ) test_results = [] for shape, dtype in self.items: cpu_inputs, npu_inputs = self.gen_inputs(shape, dtype) @@ -157,7 +153,7 @@ class TestGeometricKernelAttentionFunc(TestCase): sampling_locations = npu_inputs.sampling_locations attn_weights = npu_inputs.attn_weights grad_output = npu_inputs.grad_output - npu_output = mx_driving.fused.npu_geometric_kernel_attention_func( + npu_output = mx_driving.fused.npu_geometric_kernel_attention( value, spatial_shapes, level_start_index, sampling_locations, attn_weights ) npu_output.backward(grad_output) diff --git a/tests/torch/test_roipoint_pool3d.py b/tests/torch/test_roipoint_pool3d.py index 0cb748ae7c28b40a3ef1b590cf75fba641f28e49..cf17193e6858e2122b17ba0257f7acaa0123bba6 100644 --- a/tests/torch/test_roipoint_pool3d.py +++ b/tests/torch/test_roipoint_pool3d.py @@ -11,21 +11,46 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
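The removed `sparse_ops.py` above computes spatial output sizes per dimension with the standard convolution formula `out = (in + 2*pad - dilation*(kernel - 1) - 1) // stride + 1` (plus inverse- and de-convolution variants). A short worked check of that arithmetic, ignoring the `kernel_size == -1` special case; the grid dimensions are illustrative:

```python
def conv_output_size(size, kernel, stride, padding, dilation):
    # Per-dimension formula used by get_conv_output_size in the removed sparse_ops.py.
    return [
        (s + 2 * p - d * (k - 1) - 1) // st + 1
        for s, k, st, p, d in zip(size, kernel, stride, padding, dilation)
    ]


# A 41 x 1600 x 1408 grid with a 3x3x3 kernel, stride 2, padding 1, dilation 1
# shrinks to 21 x 800 x 704.
assert conv_output_size([41, 1600, 1408], [3, 3, 3], [2, 2, 2], [1, 1, 1], [1, 1, 1]) == [21, 800, 704]
```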
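For reference, the test model in `roi_align_rotated_plugin.py` is assembled with the ONNX helper API: declare value infos for the inputs and output, create the custom `RoiAlignRotatedV2` node with its attributes, then wrap it into a graph and serialize it. A minimal standalone sketch; the input/rois shapes and the extra attributes are placeholders, only `spatial_scale=1.0` and the `[3, 48, 2, 2]` output shape come from the test above:

```python
import onnx
from onnx import TensorProto, helper

inp = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 48, 32, 32])  # placeholder shape
rois = helper.make_tensor_value_info("rois", TensorProto.FLOAT, [3, 6])           # placeholder shape
out = helper.make_tensor_value_info("output", TensorProto.FLOAT, [3, 48, 2, 2])

node = helper.make_node(
    "RoiAlignRotatedV2",       # custom op, resolved later by the onnx_plugin parser
    inputs=["input", "rois"],
    outputs=["output"],
    spatial_scale=1.0,
    pooled_height=2,           # placeholder attributes
    pooled_width=2,
)

graph = helper.make_graph([node], "roi_align_rotated", [inp, rois], [out])
onnx.save(helper.make_model(graph), "roi_align_rotated.onnx")
```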
-import sys import random import unittest + +import numpy as np import torch import torch_npu -import numpy as np from torch_npu.testing.testcase import TestCase, run_tests + from mx_driving.preprocess import RoIPointPool3d -sys.path.append("../utils") -from random_matrix import random_value DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] +# float16[-14,16], float32[-126,128], float64[-1022,1024], int16[0,15], int32[0,31], int64[0,63] +# random_value(-7, 8, (1, 2, 3), np.float32, True, True, False, False) +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def random_value( + min_log, max_log, size, dtype=np.float32, nega_flag=True, zero_flag=True, inf_flag=False, nan_flag=False +): + matrix_log = np.random.uniform(low=min_log, high=max_log, size=size).astype(np.float32) + matrix = np.exp2(matrix_log).astype(dtype) + flag_value = int(zero_flag) + int(inf_flag) + int(nan_flag) + size_value = np.prod(size) + p0 = 0.1 + if (flag_value > 0) and (size_value > 0): + p0 = 0.1 / flag_value / size_value # 10% + if nega_flag: + matrix *= np.random.choice(a=[1, -1], size=size, p=[0.5, 0.5]) + if zero_flag: + matrix *= np.random.choice(a=[1, 0], size=size, p=[1 - p0, p0]) + if inf_flag: + np_inf = np.array([np.inf]).astype(dtype)[0] + matrix += np.random.choice(a=[0, np_inf], size=size, p=[1 - p0, p0]) + if nan_flag: + np_nan = np.array([np.nan]).astype(dtype)[0] + matrix += np.random.choice(a=[0, np_nan], size=size, p=[1 - p0, p0]) + return matrix + + # points: (B, N, 3) 输入点 # point_features: (B, N, C) 输入点特征 # boxes3d: (B, M, 7) 边界框 @@ -53,106 +78,113 @@ def check_point_in_box3d(point, box3d): def roipoint_pool3d_forward(num_sampled_points, points, point_features, boxes3d, pooled_features): - point_num = points.shape[0] # N - feature_len = point_features.shape[1] # C - point_flag = np.zeros((point_num), dtype=np.int32) # (N) - point_idx = np.zeros((num_sampled_points), dtype=np.int32) # (num) + point_num = points.shape[0] # N + feature_len = point_features.shape[1] # C + point_flag = np.zeros((point_num), dtype=np.int32) # (N) + point_idx = np.zeros((num_sampled_points), dtype=np.int32) # (num) for pt_idx in range(point_num): point_flag[pt_idx] = check_point_in_box3d(points[pt_idx], boxes3d) cnt = 0 for pt_idx in range(point_num): - if (point_flag[pt_idx] == 0): + if point_flag[pt_idx] == 0: continue point_idx[cnt] = pt_idx cnt += 1 - if (cnt == num_sampled_points): + if cnt == num_sampled_points: break - if (cnt == 0): + if cnt == 0: return 1 - if (cnt < num_sampled_points): + if cnt < num_sampled_points: for spn_idx in range(cnt, num_sampled_points): point_idx[spn_idx] = point_idx[spn_idx % cnt] for sample_point_idx in range(num_sampled_points): src_point_idx = point_idx[sample_point_idx] pooled_features[sample_point_idx, 0:3] = points[src_point_idx, 0:3] - pooled_features[sample_point_idx, 3:3 + feature_len] = \ - point_features[src_point_idx, 0:feature_len] + pooled_features[sample_point_idx, 3 : 3 + feature_len] = point_features[src_point_idx, 0:feature_len] return 0 def cpu_roipoint_pool3d(num_sampled_points, points, point_features, boxes3d): # B=batch_size; N=point_num; M=boxes_num; C=feature_len; num = num_sampled_points - batch_size = points.shape[0] # B - feature_len = point_features.shape[2] # C - boxes_num = boxes3d.shape[1] # M + batch_size = points.shape[0] # B + feature_len = point_features.shape[2] # C + boxes_num = boxes3d.shape[1] # M pooled_features = np.zeros_like(points, shape=(batch_size, boxes_num, num_sampled_points, 3 + feature_len)) pooled_empty_flag = 
np.zeros((batch_size, boxes_num), dtype=np.int32) for bs_idx in range(batch_size): for boxes_idx in range(boxes_num): - pooled_empty_flag[bs_idx][boxes_idx] = roipoint_pool3d_forward(num_sampled_points, points[bs_idx], - point_features[bs_idx], boxes3d[bs_idx][boxes_idx], pooled_features[bs_idx][boxes_idx]) + pooled_empty_flag[bs_idx][boxes_idx] = roipoint_pool3d_forward( + num_sampled_points, + points[bs_idx], + point_features[bs_idx], + boxes3d[bs_idx][boxes_idx], + pooled_features[bs_idx][boxes_idx], + ) return pooled_features, pooled_empty_flag class TestRoipointPool3d(TestCase): - @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `RoipointPool3d` is only supported on 910B, skip this ut!") + @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `RoipointPool3d` is only supported on 910B, skip this ut!") def test_roipoint_pool3d_float(self): random.seed() - batch_size = random.randint(1, 4) # B - num_sampled_points = random.randint(1, 48) # num - boxes_num = random.randint(1, 48) # M - point_num = random.randint(max(num_sampled_points, boxes_num), 105) # N - points = random_value(-15.5, 16, (batch_size, point_num, 3), np.float32) # (B, N, 3) - point_features = points.copy() # (B, N, C) - boxes3d = np.zeros((batch_size, boxes_num, 7), dtype=np.float32) # (B, M, 7) + batch_size = random.randint(1, 4) # B + num_sampled_points = random.randint(1, 48) # num + boxes_num = random.randint(1, 48) # M + point_num = random.randint(max(num_sampled_points, boxes_num), 105) # N + points = random_value(-15.5, 16, (batch_size, point_num, 3), np.float32) # (B, N, 3) + point_features = points.copy() # (B, N, C) + boxes3d = np.zeros((batch_size, boxes_num, 7), dtype=np.float32) # (B, M, 7) boxes3d[0:, 0:, 0:3] = random_value(-15.5, 16, (batch_size, boxes_num, 3)) boxes3d[0:, 0:, 3:6] = random_value(-63, 64, (batch_size, boxes_num, 3), nega_flag=False) - boxes3d[0:, 0:, 6:] = \ - np.random.uniform(low=0, high=3.141592654, size=(batch_size, boxes_num, 1)).astype(np.float32) + boxes3d[0:, 0:, 6:] = np.random.uniform(low=0, high=3.141592654, size=(batch_size, boxes_num, 1)).astype( + np.float32 + ) cpu_pooled_features, cpu_pooled_empty_flag = cpu_roipoint_pool3d( - num_sampled_points, points, point_features, boxes3d) + num_sampled_points, points, point_features, boxes3d + ) roipoint_pool3d = RoIPointPool3d(num_sampled_points) - pooled_features, pooled_empty_flag = roipoint_pool3d(torch.from_numpy(points).npu(), - torch.from_numpy(point_features).npu(), torch.from_numpy(boxes3d).npu()) + pooled_features, pooled_empty_flag = roipoint_pool3d( + torch.from_numpy(points).npu(), torch.from_numpy(point_features).npu(), torch.from_numpy(boxes3d).npu() + ) float_pooled_features = pooled_features.cpu().numpy() float_pooled_empty_flag = pooled_empty_flag.cpu().numpy() - self.assertRtolEqual(float_pooled_features, cpu_pooled_features, prec=0.00005) # (B, M, num, 3+C) - self.assertRtolEqual(float_pooled_empty_flag, cpu_pooled_empty_flag, prec=0.00005) # (B, M) + self.assertRtolEqual(float_pooled_features, cpu_pooled_features, prec=0.00005) # (B, M, num, 3+C) + self.assertRtolEqual(float_pooled_empty_flag, cpu_pooled_empty_flag, prec=0.00005) # (B, M) - - @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `RoipointPool3d` is only supported on 910B, skip this ut!") + @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `RoipointPool3d` is only supported on 910B, skip this ut!") def test_roipoint_pool3d_half(self): random.seed() - batch_size = random.randint(1, 4) # B - num_sampled_points = random.randint(1, 60) # num - boxes_num = 
random.randint(1, 60) # M - point_num = random.randint(max(num_sampled_points, boxes_num), 105) # N - points = random_value(-3.5, 4, (batch_size, point_num, 3), np.float16) # (B, N, 3) - point_features = points.copy() # (B, N, C) - boxes3d = np.zeros((batch_size, boxes_num, 7), dtype=np.float16) # (B, M, 7) + batch_size = random.randint(1, 4) # B + num_sampled_points = random.randint(1, 60) # num + boxes_num = random.randint(1, 60) # M + point_num = random.randint(max(num_sampled_points, boxes_num), 105) # N + points = random_value(-3.5, 4, (batch_size, point_num, 3), np.float16) # (B, N, 3) + point_features = points.copy() # (B, N, C) + boxes3d = np.zeros((batch_size, boxes_num, 7), dtype=np.float16) # (B, M, 7) boxes3d[0:, 0:, 0:3] = random_value(-3.5, 4, (batch_size, boxes_num, 3), np.float16) boxes3d[0:, 0:, 3:6] = random_value(-14, 16, (batch_size, boxes_num, 3), np.float16, nega_flag=False) - boxes3d[0:, 0:, 6:] = \ - np.random.uniform(low=0, high=3.142, size=(batch_size, boxes_num, 1)).astype(np.float16) + boxes3d[0:, 0:, 6:] = np.random.uniform(low=0, high=3.142, size=(batch_size, boxes_num, 1)).astype(np.float16) - cpu_pooled_features, cpu_pooled_empty_flag = cpu_roipoint_pool3d(num_sampled_points, - points.astype(np.float32), point_features.astype(np.float32), boxes3d.astype(np.float32)) + cpu_pooled_features, cpu_pooled_empty_flag = cpu_roipoint_pool3d( + num_sampled_points, points.astype(np.float32), point_features.astype(np.float32), boxes3d.astype(np.float32) + ) roipoint_pool3d = RoIPointPool3d(num_sampled_points) - pooled_features, pooled_empty_flag = roipoint_pool3d(torch.from_numpy(points).npu(), - torch.from_numpy(point_features).npu(), torch.from_numpy(boxes3d).npu()) + pooled_features, pooled_empty_flag = roipoint_pool3d( + torch.from_numpy(points).npu(), torch.from_numpy(point_features).npu(), torch.from_numpy(boxes3d).npu() + ) half_pooled_features = pooled_features.cpu().numpy().astype(np.float32) half_pooled_empty_flag = pooled_empty_flag.cpu().numpy() - self.assertRtolEqual(half_pooled_features, cpu_pooled_features, prec=0.0005) # (B, M, num, 3+C) - self.assertRtolEqual(half_pooled_empty_flag, cpu_pooled_empty_flag, prec=0.0005) # (B, M) + self.assertRtolEqual(half_pooled_features, cpu_pooled_features, prec=0.0005) # (B, M, num, 3+C) + self.assertRtolEqual(half_pooled_empty_flag, cpu_pooled_empty_flag, prec=0.0005) # (B, M) if __name__ == "__main__": diff --git a/tests/utils/random_matrix.py b/tests/utils/random_matrix.py deleted file mode 100644 index a4f8879519de2e4a77c3dba04a648efd9c3ca23e..0000000000000000000000000000000000000000 --- a/tests/utils/random_matrix.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -import random -import numpy as np - - -# float16[-14,16], float32[-126,128], float64[-1022,1024], int16[0,15], int32[0,31], int64[0,63] -# random_value(-7, 8, (1, 2, 3), np.float32, True, True, False, False) -def random_value(min_log, max_log, size, dtype=np.float32, - nega_flag=True, zero_flag=True, inf_flag=False, nan_flag=False): - matrix_log = np.random.uniform(low=min_log, high=max_log, size=size).astype(np.float32) - matrix = np.exp2(matrix_log).astype(dtype) - flag_value = int(zero_flag) + int(inf_flag) + int(nan_flag) - size_value = np.prod(size) - p0 = 0.1 - if (flag_value > 0) and (size_value > 0): - p0 = 0.1 / flag_value / size_value # 10% - if nega_flag: - matrix *= np.random.choice(a=[1, -1], size=size, p=[0.5, 0.5]) - if zero_flag: - matrix *= np.random.choice(a=[1, 0], size=size, p=[1 - p0, p0]) - if inf_flag: - np_inf = 
np.array([np.inf]).astype(dtype)[0] - matrix += np.random.choice(a=[0, np_inf], size=size, p=[1 - p0, p0]) - if nan_flag: - np_nan = np.array([np.nan]).astype(dtype)[0] - matrix += np.random.choice(a=[0, np_nan], size=size, p=[1 - p0, p0]) - return matrix - - -# random_size(-7, 8, ([1, 4], [1, 4], [3]), np.float32, True, True, False, False) -def random_size(min_log, max_log, size_range, dtype=np.float32, - nega_flag=True, zero_flag=True, inf_flag=False, nan_flag=False): - size = [] - dim = len(size_range) - for i in range(dim): - if len(size_range[i]) == 1: - size.append(size_range[i][0]) - else: - random.seed() - size.append(random.randint(size_range[i][0], size_range[i][1])) - return random_value(min_log, max_log, size, dtype, nega_flag, zero_flag, inf_flag, nan_flag)
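The `random_value` generator, now inlined into `test_roipoint_pool3d.py` and removed here together with `tests/utils/random_matrix.py`, draws log2-uniform magnitudes in `[2**min_log, 2**max_log)` and can optionally flip signs and inject zeros, infs, or NaNs. A small usage check, assuming the definition above is in scope:

```python
import numpy as np

sample = random_value(-7, 8, (4, 6), np.float32, nega_flag=True, zero_flag=True)
assert sample.shape == (4, 6) and sample.dtype == np.float32

# Non-zero entries keep their magnitude inside the requested exponent range.
magnitudes = np.abs(sample[sample != 0])
assert np.all((magnitudes >= 2.0 ** -7) & (magnitudes <= 2.0 ** 8))
```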