From af35f966e3827bc378f3ef5518e41dbe0a1eb4ae Mon Sep 17 00:00:00 2001 From: caishangqiu Date: Thu, 28 Aug 2025 12:42:34 +0800 Subject: [PATCH 1/2] refactoring project files --- debug/accuracy_tools/msprobe/.clang-format | 36 ++++ debug/accuracy_tools/msprobe/CMakeLists.txt | 29 ++- debug/accuracy_tools/msprobe/LICENSE | 201 ++++++++++++++++++ debug/accuracy_tools/msprobe/README.md | 111 +--------- debug/accuracy_tools/msprobe/__init__.py | 14 +- debug/accuracy_tools/msprobe/build.sh | 83 ++++++++ .../msprobe/cmake/Findcpython.cmake | 14 ++ .../msprobe/cmake/Findgtest.cmake | 45 ++++ .../msprobe/cmake/Findmockcpp.cmake | 42 ++++ debug/accuracy_tools/msprobe/cmake/config.ini | 7 + .../msprobe/cmake/download_opensource.sh | 110 ++++++++++ .../accuracy_tools/msprobe/cmake/utils.cmake | 46 ++++ .../msprobe/docs/0001.capability_matrix.md | 0 .../msprobe/docs/0002.installation.md | 55 +++++ .../msprobe/docs/0003.config_introduction.md | 0 .../msprobe/docs/0004.config_examples.md | 0 .../msprobe/docs/0101.dump_offline_model.md | 0 .../docs/0102.dump_mindie_llm_for_atb.md | 0 debug/accuracy_tools/msprobe/docs/README.md | 29 +++ debug/accuracy_tools/msprobe/pyproject.toml | 11 + .../msprobe/requirements/requirements.txt | 7 + .../msprobe/requirements/requirements_tf.txt | 10 + debug/accuracy_tools/msprobe/setup.py | 90 ++++++++ .../accuracy_tools/msprobe/third_party/.keep | 0 24 files changed, 832 insertions(+), 108 deletions(-) create mode 100644 debug/accuracy_tools/msprobe/.clang-format create mode 100644 debug/accuracy_tools/msprobe/LICENSE create mode 100644 debug/accuracy_tools/msprobe/build.sh create mode 100644 debug/accuracy_tools/msprobe/cmake/Findcpython.cmake create mode 100644 debug/accuracy_tools/msprobe/cmake/Findgtest.cmake create mode 100644 debug/accuracy_tools/msprobe/cmake/Findmockcpp.cmake create mode 100644 debug/accuracy_tools/msprobe/cmake/config.ini create mode 100644 debug/accuracy_tools/msprobe/cmake/download_opensource.sh create mode 100644 
debug/accuracy_tools/msprobe/cmake/utils.cmake create mode 100644 debug/accuracy_tools/msprobe/docs/0001.capability_matrix.md create mode 100644 debug/accuracy_tools/msprobe/docs/0002.installation.md create mode 100644 debug/accuracy_tools/msprobe/docs/0003.config_introduction.md create mode 100644 debug/accuracy_tools/msprobe/docs/0004.config_examples.md create mode 100644 debug/accuracy_tools/msprobe/docs/0101.dump_offline_model.md create mode 100644 debug/accuracy_tools/msprobe/docs/0102.dump_mindie_llm_for_atb.md create mode 100644 debug/accuracy_tools/msprobe/docs/README.md create mode 100644 debug/accuracy_tools/msprobe/pyproject.toml create mode 100644 debug/accuracy_tools/msprobe/requirements/requirements.txt create mode 100644 debug/accuracy_tools/msprobe/requirements/requirements_tf.txt create mode 100644 debug/accuracy_tools/msprobe/setup.py create mode 100644 debug/accuracy_tools/msprobe/third_party/.keep diff --git a/debug/accuracy_tools/msprobe/.clang-format b/debug/accuracy_tools/msprobe/.clang-format new file mode 100644 index 0000000000..72ffb18052 --- /dev/null +++ b/debug/accuracy_tools/msprobe/.clang-format @@ -0,0 +1,36 @@ +BasedOnStyle: LLVM + +IndentWidth: 4 +TabWidth: 4 +UseTab: Never + +ColumnLimit: 120 +BreakBeforeBraces: Custom + +BraceWrapping: + AfterNamespace: false + AfterFunction: true + AfterClass: false + AfterControlStatement: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyRecord: false + SplitEmptyFunction: false + AfterEnum: true + +AccessModifierOffset: -4 +IndentCaseLabels: true +SpaceBeforeParens: ControlStatements + +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortBlocksOnASingleLine: false + +BinPackParameters: false +BinPackArguments: false + +NamespaceIndentation: All +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true diff --git a/debug/accuracy_tools/msprobe/CMakeLists.txt 
b/debug/accuracy_tools/msprobe/CMakeLists.txt index 66085a4b0b..bc9134a369 100644 --- a/debug/accuracy_tools/msprobe/CMakeLists.txt +++ b/debug/accuracy_tools/msprobe/CMakeLists.txt @@ -1,5 +1,28 @@ -add_subdirectory(ccsrc) +cmake_minimum_required(VERSION 3.14) +project(msprobe) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED OFF) +set(CMAKE_CXX_EXTENSIONS OFF) + +execute_process( + COMMAND uname -m + OUTPUT_VARIABLE machine_arch + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +if (DEFINED ARCH_TYPE AND NOT "${ARCH_TYPE}" STREQUAL "${machine_arch}") + message(FATAL_ERROR + "Cross-compilation is not supported currently. (compile ${ARCH_TYPE} on ${machine_arch})" + ) +endif() + +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") +set(ENV{PROJECT_ROOT_PATH} "${CMAKE_SOURCE_DIR}") + +include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake") +add_subdirectory(msprobe) if (DEFINED BUILD_TEST_CASE AND "${BUILD_TEST_CASE}" STREQUAL "True") -add_subdirectory(test) -endif() \ No newline at end of file + add_subdirectory(test/UT) +endif() diff --git a/debug/accuracy_tools/msprobe/LICENSE b/debug/accuracy_tools/msprobe/LICENSE new file mode 100644 index 0000000000..261eeb9e9f --- /dev/null +++ b/debug/accuracy_tools/msprobe/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/debug/accuracy_tools/msprobe/README.md b/debug/accuracy_tools/msprobe/README.md index a11292f4b2..3b1683324a 100644 --- a/debug/accuracy_tools/msprobe/README.md +++ b/debug/accuracy_tools/msprobe/README.md @@ -1,116 +1,19 @@ # 📖 msprobe 使用手册 -![version](https://img.shields.io/badge/version-1.0.4-blueviolet) -![python](https://img.shields.io/badge/python-3.8|3.9|3.10-blue) ![platform](https://img.shields.io/badge/platform-Linux-yellow) +![License: Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-green) -**msprobe** 是 MindStudio Training Tools 工具链下精度调试部分的工具包。主要包括精度预检、溢出检测和精度比对等功能,目前适配 [PyTorch](https://pytorch.org/) 和 [MindSpore](https://www.mindspore.cn/) 框架。这些子工具侧重不同的训练场景,可以定位模型训练中的精度问题。 +## 用前必看 -为方便使用,本工具提供了一个统一、简易的程序接口,**PrecisionDebugger**,以 PyTorch 框架为例,通过以下示例模板和 **config.json** 可轻松使用各种功能。 +使用工具前,请先浏览[工具模块简介、适用场景和当前版本局限](./docs/0001.capability_matrix.md)。 -```python -from msprobe.pytorch import PrecisionDebugger - -debugger = PrecisionDebugger(config_path='./config.json') -... -debugger.start() # 一般在训练循环开头启动工具 -... 
# 循环体 -debugger.stop() # 一般在训练循环末尾结束工具 -debugger.step() # 在训练循环的最后需要重置工具,非循环场景不需要 -``` - -除了在训练脚本中调用接口函数,还可以通过命令行使用 **msprobe** 的其他功能,具体的使用规则和 **config.json** 的配置要求详见以下章节。 - -此外,可以在 shell 脚本添加 `export MSPROBE_LOG_LEVEL=1` 设置日志分级,细则如下: -- MSPROBE_LOG_LEVEL=4,不打印任何日志; -- MSPROBE_LOG_LEVEL=3,仅打印 ERROR; -- MSPROBE_LOG_LEVEL=2,仅打印 WARNING、ERROR; -- MSPROBE_LOG_LEVEL=1,仅打印 INFO、WARNING、ERROR(默认配置); -- MSPROBE_LOG_LEVEL=0,打印 DEBUG、INFO、WARNING、ERROR。 - -## ⚙️ [安装](./docs/01.installation.md) - -## 🛠️ config.json [介绍](./docs/02.config_introduction.md) 和 [示例](./docs/03.config_examples.md) +## ⚙️ [安装](./docs/0002.installation.md) +## 🛠️ config.json [介绍](./docs/0003.config_introduction.md) 和 [示例](./docs/0004.config_examples.md) ## 🧰 主要功能 ### 1 数据采集 -msprobe 通过在训练脚本中添加 PrecisionDebugger 接口的方式对 API 执行精度数据 dump 操作,对应 config.json 中的 task 为 statistics 或 tensor。 - -[PyTorch 场景的数据采集](./docs/05.data_dump_PyTorch.md) - -[MindSpore 场景的数据采集](./docs/06.data_dump_MindSpore.md) - -### 2 精度预检 - -精度预检旨在昇腾 NPU 上扫描训练模型中的所有 API 进行 API 复现,给出精度情况的诊断和分析。对应 config.json 中的 task 为 run_ut。 - -PyTorch 场景的[离线预检](./docs/07.accuracy_checker_PyTorch.md)和[在线预检](./docs/08.accuracy_checker_online_PyTorch.md) - -MindSpore 动态图场景的[离线预检](./docs/09.accuracy_checker_MindSpore.md) - -### 3 精度比对 - -该功能进行 PyTorch 整网 API 粒度的数据 dump、精度比对,进而定位训练场景下的精度问题。 - -[PyTorch 场景的精度比对](./docs/10.accuracy_compare_PyTorch.md) - -[MindSpore 场景的精度比对](./docs/11.accuracy_compare_MindSpore.md) - -### 4 溢出检测与解析 - -溢出检测与解析是在执行精度数据 dump 时,判断是否存在输入正常但输出存在溢出的 API,从而判断是否为正常溢出。对应 config.json 中的 overflow_check。 - -[PyTorch 场景的溢出检测与解析](./docs/12.overflow_check_PyTorch.md) - -[MindSpore 场景的溢出检测与解析](./docs/13.overflow_check_MindSpore.md) - -### 5 数据解析 - -该功能用于比对前后两次 NPU ACL 层级 dump 数据的一致性。 - -[PyTorch 场景的数据解析](./docs/14.data_parse_PyTorch.md) - -### 6 无标杆比对 - -[PyTorch 场景的无标杆比对](./docs/15.free_benchmarking_PyTorch.md) - -[MindSpore 场景的无标杆比对](./docs/16.overflow_check_MindSpore.md)(待补充) - -### 7 梯度状态监测 - -本功能用于采集梯度数据并进行梯度相似度比对,可以精准定位出现问题的 step。 - -[兼容 
PyTorch 和 MindSpore 框架的梯度监测](./docs/17.grad_probe.md) - -## 🌟 新版本特性 - -【数据采集】 -- 支持 config.json 中的 step 传入范围; -- 优化了 MindSpore 场景的 step 机制,step 结束后训练继续运行。 - -【精度预检】 -- 在 PyTorch 场景,支持部分 NPU 融合算子预检。 - -【精度比对】 -- 解决了使用 MindSpore 需要安装 PyTorch 的问题。 - -【无标杆比对】 -- 补充在 PyTorch 场景的性能基线报告; -- 支持 MindSpore 场景的 change_value 扰动模式。 - -## 📑 补充材料 - -[msprobe 性能基线报告](./docs/S01.report_msprobe_dump_standard_performance_baseline.md) - -[无标杆工具场景验证和性能基线报告](./docs/S02.report_free_benchmarking_validation_performance_baseline.md) - -## ❗ 免责声明 -本工具建议执行用户与安装用户保持一致,如果您要使用root执行,请自行关注root高权限触及的安全风险。 - -## ❓ FAQ - -[FAQ for PyTorch](./docs/FAQ_PyTorch.md) +[离线模型 ONNX、TensorFlow (.pb, saved model)、Ascend OM 场景](./docs/0101.dump_offline_model.md) -FAQ for MindSpore +[以 Ascend Transformer Boost (ATB) 为后端的 MindIE-LLM 场景](./docs/0102.dump_mindie_llm_for_atb.md) diff --git a/debug/accuracy_tools/msprobe/__init__.py b/debug/accuracy_tools/msprobe/__init__.py index ade5d3d36c..53529bc8d3 100644 --- a/debug/accuracy_tools/msprobe/__init__.py +++ b/debug/accuracy_tools/msprobe/__init__.py @@ -1 +1,13 @@ -from msprobe.core.grad_probe.grad_compare import GradComparator \ No newline at end of file +# Copyright (c) 2025-2025 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/debug/accuracy_tools/msprobe/build.sh b/debug/accuracy_tools/msprobe/build.sh new file mode 100644 index 0000000000..90b120efaa --- /dev/null +++ b/debug/accuracy_tools/msprobe/build.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +set -e + +BUILD_PATH=$(pwd) + +BUILD_ARGS=$(getopt \ + -o ha:v:j:ft \ + --long help,release,debug,arch:,python-version:,jobs:,force-rebuild,local,test-cases \ + -- "$@") +eval set -- "${BUILD_ARGS}" + +ARCH_TYPE=$(uname -m) +BUILD_TYPE=release +CONCURRENT_JOBS=16 +BUILD_TEST_CASE=False +USE_LOCAL_FIRST=False +PYTHON_VERSION="" + +HELP_DOC=$(cat << EOF +Usage: build.sh [OPTION]...\n +Build the C++ part of msprobe.\n +\n +Arguments:\n + -a, --arch Specify the target architecture, which generally does not need to be set up.\n + -j, --jobs Specify the number of compilation jobs(default 16).\n + -f, --force-rebuild Clean up the cache before building.\n + -t, --test-cases Build test cases.\n + --local Prioritize the use of on-premises, third-party resources as dependencies.\n + --release Build the release version(default).\n + --debug Build the debug version.\n + -v, --python-version Specify version of python. +EOF +) + +while true; do + case "$1" in + -h | --help) + echo -e ${HELP_DOC} + exit 0 ;; + -a | --arch) + ARCH_TYPE="$2" ; shift 2 ;; + -v | --python-version) + PYTHON_VERSION="$2" ; shift 2 ;; + --release) + BUILD_TYPE=release ; shift ;; + --debug) + BUILD_TYPE=debug ; shift ;; + -j | --jobs) + CONCURRENT_JOBS="$2" ; shift 2 ;; + --local) + USE_LOCAL_FIRST=True ; shift ;; + -f | --force-rebuild) + rm -rf "${BUILD_PATH}/lib" "${BUILD_PATH}/output" "${BUILD_PATH}/msprobe/lib/msprobe_c.so" + shift ;; + -t | --test-cases) + BUILD_TEST_CASE=True ; shift ;; + --) + shift ; break ;; + *) + echo "Unknown argument $1" + exit 1 ;; + esac +done + +BUILD_OUTPUT_PATH=${BUILD_PATH}/output/${BUILD_TYPE} + +cmake -B ${BUILD_OUTPUT_PATH} -S .
-DARCH_TYPE=${ARCH_TYPE} -DBUILD_TYPE=${BUILD_TYPE} \ + -DUSE_LOCAL_FIRST=${USE_LOCAL_FIRST} -DBUILD_TEST_CASE=${BUILD_TEST_CASE} \ + -DPYTHON_VERSION=${PYTHON_VERSION} +cd ${BUILD_OUTPUT_PATH} +make -j${CONCURRENT_JOBS} + +if [[ ! -e ${BUILD_OUTPUT_PATH}/msprobe/csrc/libmsprobe_c.so ]]; then + echo "Failed to build libmsprobe_c.so." + exit 1 +fi + +if [[ ! -e ${BUILD_PATH}/msprobe/lib ]]; then + mkdir ${BUILD_PATH}/msprobe/lib +fi + +cp ${BUILD_OUTPUT_PATH}/msprobe/csrc/libmsprobe_c.so ${BUILD_PATH}/msprobe/lib/msprobe_c.so diff --git a/debug/accuracy_tools/msprobe/cmake/Findcpython.cmake b/debug/accuracy_tools/msprobe/cmake/Findcpython.cmake new file mode 100644 index 0000000000..577d42dcda --- /dev/null +++ b/debug/accuracy_tools/msprobe/cmake/Findcpython.cmake @@ -0,0 +1,14 @@ +set(PKG_NAME cpython) + +if (NOT ${PKG_NAME}_FOUND) + find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development) + + if (NOT Python3_FOUND) + message(FATAL_ERROR "Python3 ${PYTHON_VERSION} is not found.") + endif() + + set(PACKAGE_VERSION ${Python3_VERSION}) + include_directories(${Python3_INCLUDE_DIRS}) + set(${PKG_NAME}_LIBRARIES ${Python3_LIBRARIES}) + set(${PKG_NAME}_FOUND TRUE) +endif() diff --git a/debug/accuracy_tools/msprobe/cmake/Findgtest.cmake b/debug/accuracy_tools/msprobe/cmake/Findgtest.cmake new file mode 100644 index 0000000000..aa5606d311 --- /dev/null +++ b/debug/accuracy_tools/msprobe/cmake/Findgtest.cmake @@ -0,0 +1,45 @@ +set(PACKAGE_VERSION 1.12.1) + +set(PKG_NAME gtest) +set(SHA256_VALUE "81964fe578e9bd7c94dfdb09c8e4d6e6759e19967e397dbea48d1c10e45d0df2") +set(DOWNLOAD_PATH "$ENV{PROJECT_ROOT_PATH}/third_party") +set(DIR_NAME "${DOWNLOAD_PATH}/googletest-release-1.12.1") + +if (NOT ${PKG_NAME}_FOUND) + download_opensource_pkg(${PKG_NAME} + SHA256 ${SHA256_VALUE} + DOWNLOAD_PATH ${DOWNLOAD_PATH} + ) + include_directories(${DIR_NAME}/googletest/include) + include_directories(${DIR_NAME}/googlemock/include) + set(BUILD_DEPENDENCY_PATH
"$ENV{PROJECT_ROOT_PATH}/build_dependency") + + execute_process( + WORKING_DIRECTORY ${DIR_NAME} + COMMAND cmake . -DBUILD_SHARED_LIBS=ON + RESULT_VARIABLE RESULT + ) + if (NOT RESULT EQUAL 0) + message(FATAL_ERROR "Failed to build gtest. ${RESULT}") + endif() + + execute_process( + WORKING_DIRECTORY ${DIR_NAME} + COMMAND make -j16 + RESULT_VARIABLE RESULT + ) + if (NOT RESULT EQUAL 0) + message(FATAL_ERROR "Failed to build gtest. ${RESULT}") + endif() + + file(GLOB GTEST_SO "${DIR_NAME}/lib/libgtest.so") + file(GLOB GMOCK_SO "${DIR_NAME}/lib/libgmock.so") + file(GLOB GTEST_MAIN_SO "${DIR_NAME}/lib/libgtest_main.so") + file(GLOB GMOCK_MAIN_SO "${DIR_NAME}/lib/libgmock_main.so") + + if (NOT GTEST_SO OR NOT GMOCK_SO OR NOT GTEST_MAIN_SO OR NOT GMOCK_MAIN_SO) + message(FATAL_ERROR "Failed to build gtest.") + endif() + set(${PKG_NAME}_LIBRARIES "${GTEST_SO};${GMOCK_SO};${GTEST_MAIN_SO};${GMOCK_MAIN_SO}") + set(${PKG_NAME}_FOUND TRUE) +endif() diff --git a/debug/accuracy_tools/msprobe/cmake/Findmockcpp.cmake b/debug/accuracy_tools/msprobe/cmake/Findmockcpp.cmake new file mode 100644 index 0000000000..da938b9c6e --- /dev/null +++ b/debug/accuracy_tools/msprobe/cmake/Findmockcpp.cmake @@ -0,0 +1,42 @@ +set(PACKAGE_VERSION 2.7) + +set(PKG_NAME mockcpp) +set(SHA256_VALUE "0dc7111c5be9785d0550ed3b68db7e12fd5d7802b7bc6548c52ac7b9e727fcc1") +set(DOWNLOAD_PATH "$ENV{PROJECT_ROOT_PATH}/third_party") +set(DIR_NAME "${DOWNLOAD_PATH}/mockcpp-v2.7") + +if (NOT ${PKG_NAME}_FOUND) + download_opensource_pkg(${PKG_NAME} + SHA256 ${SHA256_VALUE} + DOWNLOAD_PATH ${DOWNLOAD_PATH} + ) + include_directories(${DIR_NAME}/include) + include_directories(${DIR_NAME}/3rdparty) + + execute_process( + WORKING_DIRECTORY ${DIR_NAME} + COMMAND cmake . + RESULT_VARIABLE RESULT + ) + if (NOT RESULT EQUAL 0) + message(FATAL_ERROR "Failed to build mockcpp. 
${RESULT}") + endif() + + execute_process( + WORKING_DIRECTORY ${DIR_NAME} + COMMAND make -j16 + RESULT_VARIABLE RESULT + ) + if (NOT RESULT EQUAL 0) + message(FATAL_ERROR "Failed to build mockcpp. ${RESULT}") + endif() + + file(GLOB MOCKCPP_LIB "${DIR_NAME}/src/libmockcpp.a") + + if (NOT MOCKCPP_LIB) + message(FATAL_ERROR "Failed to build mockcpp.") + endif() + set(${PKG_NAME}_LIBRARIES "${MOCKCPP_LIB}") + set(${PKG_NAME}_FOUND TRUE) + +endif() diff --git a/debug/accuracy_tools/msprobe/cmake/config.ini b/debug/accuracy_tools/msprobe/cmake/config.ini new file mode 100644 index 0000000000..6940c5403f --- /dev/null +++ b/debug/accuracy_tools/msprobe/cmake/config.ini @@ -0,0 +1,7 @@ +[gtest] +url = https://tools.mindspore.cn/Ascend/mstt/libs/googletest/release-1.12.1.tar.gz +tag = release-1.12.1 + +[mockcpp] +url = https://tools.mindspore.cn/Ascend/mstt/libs/mockcpp/v2.7.zip +tag = v2.7 diff --git a/debug/accuracy_tools/msprobe/cmake/download_opensource.sh b/debug/accuracy_tools/msprobe/cmake/download_opensource.sh new file mode 100644 index 0000000000..ce9f13fdd5 --- /dev/null +++ b/debug/accuracy_tools/msprobe/cmake/download_opensource.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +if [ "$#" -lt 2 ]; then + echo "Usage: $0 <pkg_name> <path> [ <sha256_value> ] [ <tag> ]" + exit 1 +fi + +pkg_name=$1 +path=$2 + +if [ "$#" -ge 3 ]; then + sha256_value=$3 +fi +if [ "$#" -ge 4 ]; then + tag=$4 +fi + +url=$(awk -F " = " '/\['${pkg_name}'\]/{a=1}a==1&&$1~/url/{print $2;exit}' config.ini) +[ "$#" -ge 4 ] || tag=$(awk -F " = " '/\['${pkg_name}'\]/{a=1} a==1 && $1 ~ /tag/ {print $2; exit}' config.ini) + +if [[ ! $url = https* ]]; then + echo "[ERROR] The URL of $pkg_name is illegal." + exit 1 +fi + +echo "[INFO] Start to download ${url}..." + +if [ !
-d "$path" ]; then + echo "[ERROR] The specified path does not exist: $path" + exit 1 +fi +cd ${path} + +extension=$(echo "${url}" | awk -F'[./]' '{print $NF}') +fullname="${path}/$(basename "${url}")" +if [[ "${extension}" == "gz" || "${extension}" == "zip" ]]; then + if [[ -e "${fullname}" ]]; then + echo "[INFO] Source ${fullname} already exists, skipping download." + else + echo "[INFO] Start downloading: ${url}" + curl -L -k --fail --retry 3 --connect-timeout 10 -o "${fullname}" "${url}" + if [[ $? -ne 0 ]]; then + echo "[ERROR] Download failed: ${url}" + rm -f "${fullname}" + exit 1 + fi + + filesize=$(stat -c%s "${fullname}") + if [[ "${filesize}" -lt 10240 ]]; then + echo "[ERROR] Downloaded file too small (<10KB), possible error page: ${url}" + rm -f "${fullname}" + exit 1 + fi + + if file "${fullname}" | grep -q "HTML"; then + echo "[ERROR] Downloaded file is HTML, not a zip archive." + rm -f "${fullname}" + exit 1 + fi + + echo "[INFO] Download success: ${url} (${filesize} bytes)" + fi + + if [[ ! -z "${sha256_value}" ]]; then + sha256data=$(sha256sum "${fullname}" | cut -d' ' -f1) + if [[ "${sha256data}" != "${sha256_value}" ]]; then + echo "[ERROR] SHA256 verification failed: ${url}" + echo "[ERROR] Expected: ${sha256_value}" + echo "[ERROR] Actual : ${sha256data}" + exit 1 + fi + fi + + if [[ "${extension}" == "gz" ]]; then + tar -zxvf "${fullname}" -C ./ -n > /dev/null + elif [[ "${extension}" == "zip" ]]; then + unzip -n "${fullname}" -d ./ > /dev/null + fi +elif [[ "${extension}" == "git" ]]; then + repo_dir=$(basename "${url}" .git) + + if [[ -d "${repo_dir}" ]]; then + echo "[INFO] Repository already exists: ${repo_dir}, skipping clone." + if [[ -n "${tag}" ]]; then + cd "${repo_dir}" + echo "[INFO] Checking out ${tag}..." 
+ git fetch origin + git checkout "${tag}" || { + echo "[ERROR] Failed to checkout ${tag}" + exit 1 + } + cd - + fi + else + if [[ -n "${tag}" ]]; then + git clone --progress -b "${tag}" "${url}" + else + git clone --progress "${url}" + fi + if [[ $? -eq 0 ]]; then + echo "[INFO] Clone success: ${url}" + else + echo "[ERROR] Clone failed: ${url}" + exit 1 + fi + fi +else + echo "[ERROR] Unknown url type: ${url}" + exit 1 +fi diff --git a/debug/accuracy_tools/msprobe/cmake/utils.cmake b/debug/accuracy_tools/msprobe/cmake/utils.cmake new file mode 100644 index 0000000000..5d90991d92 --- /dev/null +++ b/debug/accuracy_tools/msprobe/cmake/utils.cmake @@ -0,0 +1,46 @@ +function(download_opensource_pkg pkg_name) + message("start to download ${pkg_name}...") + set(options) + set(oneValueArgs SHA256 GIT_TAG DOWNLOAD_PATH DIR_NAME BUILD_CMD) + set(multiValueArgs PATCHES) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if (NOT PKG_DOWNLOAD_PATH) + set(PKG_DOWNLOAD_PATH "${CMAKE_SOURCE_DIR}/../third_party") + endif() + file(MAKE_DIRECTORY ${PKG_DOWNLOAD_PATH}) + + execute_process( + WORKING_DIRECTORY $ENV{PROJECT_ROOT_PATH}/cmake + COMMAND bash download_opensource.sh ${pkg_name} ${PKG_DOWNLOAD_PATH} ${PKG_SHA256} ${PKG_GIT_TAG} + RESULT_VARIABLE RESULT + ) + if (NOT RESULT EQUAL 0) + message(FATAL_ERROR "Failed to download ${pkg_name}(${RESULT}).") + endif() + if (PKG_BUILD_CMD) + execute_process( + COMMAND bash -c "cd ${PKG_DOWNLOAD_PATH}/${PKG_DIR_NAME} && ${PKG_BUILD_CMD}" + ) + endif() +endfunction() + +function(compile_protobuf_file output_path) + if (NOT PROTOC_EXECUTABLE) + message(FATAL_ERROR "You shall install protobuf first.") + endif() + + file(MAKE_DIRECTORY ${output_path}) + + foreach(file ${ARGN}) + get_filename_component(abs_file_path ${file} ABSOLUTE) + get_filename_component(file_name ${file} NAME_WE) + get_filename_component(file_dir ${abs_file_path} PATH) + file(RELATIVE_PATH rel_path ${CMAKE_CURRENT_SOURCE_DIR}
${file_dir}) + + execute_process( + COMMAND ${PROTOC_EXECUTABLE} -I${file_dir} --cpp_out=${output_path} ${abs_file_path} + ) + + message("Compile protobuf file ${file}") + endforeach() +endfunction() diff --git a/debug/accuracy_tools/msprobe/docs/0001.capability_matrix.md b/debug/accuracy_tools/msprobe/docs/0001.capability_matrix.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/msprobe/docs/0002.installation.md b/debug/accuracy_tools/msprobe/docs/0002.installation.md new file mode 100644 index 0000000000..7a262b539d --- /dev/null +++ b/debug/accuracy_tools/msprobe/docs/0002.installation.md @@ -0,0 +1,55 @@ +# 安装 + +## 1 依赖 + +### 1.1 硬件环境 + +[昇腾产品形态说明](https://www.hiascend.com/document/detail/zh/canncommercial/80RC22/quickstart/quickstart/quickstart_18_0002.html) + +### 1.2 软件环境 + +[固件和驱动](https://www.hiascend.com/hardware/firmware-drivers/community?product=1&model=30&cann=8.2.RC1.alpha001&driver=Ascend+HDK+25.0.RC1) + +| 框架 | 是否必选 | 版本 | +| -------------------------------------------------------------------------------------------- | -------- | ----------------------------------------------------------- | +| [Python](https://www.python.org/) | 是 | 3.7 ~ 3.12 | +| [GCC](https://gcc.gnu.org/) | 是 | 需支持 C++14 标准 | +| [git](https://git-scm.com/) | 否 | 推荐稳定版本 2.34.x - 2.42.x | +| [CANN](https://www.hiascend.cn/developer/download/community/result?module=cann)*1 | 否 | 完全兼容,根据 CPU 架构和 NPU 型号选择 toolkit 和 kernel 包 | +| [PyTorch (CPU, GPU)](https://pytorch.org/) | 否 | 1.11、2.1 ~ 2.7,对应的 Python 版本最低为 3.7 | +| [PyTorch (NPU)](https://gitee.com/ascend/pytorch) | 否 | 1.11、2.1 ~ 2.7,对应的 Python 版本最低为 3.7 | +| [MindIE-LLM](https://gitee.com/ascend/MindIE-LLM)*2 | 否 | 1.0,2.0 | +| [TensorFlow](https://github.com/tensorflow/tensorflow/releases/tag/v2.6.5)*3 | 否 | 仅支持 2.6.5 版本,对应的 Python 版本为 3.7 ~ 3.9 | + +*1: **CANN** 安装参见[社区资料](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0002.html)。 + +*2: 
**MindIE-LLM** 非开源,如需查看请联系该组件的华为工程师。 + +*3: **TensorFlow** 模型在 **Ascend NPU** 的迁移,还需要安装 [TF 插件](https://gitee.com/ascend/tensorflow/releases)。 + +用户可以根据使用场景自行安装适配的 Python 和其他软件包,并在使用 msprobe 前确保所依赖的框架可以正常运行。 + +## 2 安装 msprobe + +### 2.1 从源码安装 + +```sh +git clone https://gitee.com/ascend/mstt.git -b poc +cd mstt/debug/accuracy_tools_infer + +pip install setuptools wheel + +python setup.py bdist_wheel [--compat tf] +cd ./dist +pip install mindstudio_probe*.whl +``` +**注意**:`--compat` 参数非必选,默认为无,当前支持 tf。 + + + + +## 3 查看 msprobe 工具信息 + +```sh +pip show mindstudio_probe_infer +``` diff --git a/debug/accuracy_tools/msprobe/docs/0003.config_introduction.md b/debug/accuracy_tools/msprobe/docs/0003.config_introduction.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/msprobe/docs/0004.config_examples.md b/debug/accuracy_tools/msprobe/docs/0004.config_examples.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/msprobe/docs/0101.dump_offline_model.md b/debug/accuracy_tools/msprobe/docs/0101.dump_offline_model.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/msprobe/docs/0102.dump_mindie_llm_for_atb.md b/debug/accuracy_tools/msprobe/docs/0102.dump_mindie_llm_for_atb.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/msprobe/docs/README.md b/debug/accuracy_tools/msprobe/docs/README.md new file mode 100644 index 0000000000..12c2ce3ba7 --- /dev/null +++ b/debug/accuracy_tools/msprobe/docs/README.md @@ -0,0 +1,29 @@ +# msprobe 文档编写查阅指南 + +## 1 文档编号 + +0. 公共文档:0001 - 0099 +1. 数据采集:0101 - 0199 +2. 溢出检测:0201 - 0299 +3. 精度预检:0301 - 0399 +4. 精度比对:0401 - 0499 +5. 模型改图:0501 - 0599 +6. 状态监控:0601 - 0699 +7. 数据解析:0701 - 0799 +8. 
参数检查:0801 - 0899 + +## 2 文档模板 + +```md +# 简介 + +# 接口介绍 + +# 使用示例 + +# 输出件介绍 + +# 约束 + +# 常见问题 +``` diff --git a/debug/accuracy_tools/msprobe/pyproject.toml b/debug/accuracy_tools/msprobe/pyproject.toml new file mode 100644 index 0000000000..1688f68d01 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pyproject.toml @@ -0,0 +1,11 @@ +[tool.black] +line-length = 120 # 设置最大行长 +target-version = ['py37', 'py38', 'py39', 'py310', 'py311', 'py312'] # 兼容的 Python 版本 + +[tool.isort] +profile = "black" # 使 isort 与 Black 兼容 +line_length = 120 # 统一最大行长 +multi_line_output = 3 # 按分组方式输出多行 import 语句 +force_grid_wrap = 0 # 控制换行时的显示方式 +use_parentheses = true # 使用括号包裹长 import 语句 +combine_as_imports = true # 合并多行的 as 导入 diff --git a/debug/accuracy_tools/msprobe/requirements/requirements.txt b/debug/accuracy_tools/msprobe/requirements/requirements.txt new file mode 100644 index 0000000000..1939d52001 --- /dev/null +++ b/debug/accuracy_tools/msprobe/requirements/requirements.txt @@ -0,0 +1,7 @@ +numpy < 2.0 +protobuf >= 3.18, < 5.0 +onnx >= 1.12.0, < 2.0 +onnxruntime >= 1.10, < 2.0 +pandas >= 1.3, < 3.0 +PyYAML +tqdm diff --git a/debug/accuracy_tools/msprobe/requirements/requirements_tf.txt b/debug/accuracy_tools/msprobe/requirements/requirements_tf.txt new file mode 100644 index 0000000000..745fc5dd1a --- /dev/null +++ b/debug/accuracy_tools/msprobe/requirements/requirements_tf.txt @@ -0,0 +1,10 @@ +numpy >= 1.19.2, <= 1.21.6 +protobuf >= 3.9.2, <= 3.20.3 +scipy >= 1.5.2, <= 1.7.3 +pandas >= 1.2.0, <= 1.3.5 +decorator +sympy +attrs +psutil +PyYAML +tqdm diff --git a/debug/accuracy_tools/msprobe/setup.py b/debug/accuracy_tools/msprobe/setup.py new file mode 100644 index 0000000000..50a8db9b08 --- /dev/null +++ b/debug/accuracy_tools/msprobe/setup.py @@ -0,0 +1,90 @@ +# Copyright (c) 2025-2025 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "8.1.1"

import os
import sys
from platform import machine
from subprocess import run

from setuptools import find_packages, setup

# Maps the value of the custom ``--compat`` option to a file under ``requirements/``.
_COMPAT_REQUIREMENTS_MAP = {"tf": "requirements_tf.txt", "default": "requirements.txt"}


def parse_args():
    """Extract the custom ``--compat <name>`` option from ``sys.argv``.

    The option and its value are deleted from ``sys.argv`` so that ``setup()``
    never sees them.

    Returns:
        The value that follows ``--compat``, or ``None`` when the option is absent.

    Raises:
        ValueError: If ``--compat`` is the last command-line argument.
    """
    if "--compat" not in sys.argv:
        return None
    index = sys.argv.index("--compat")
    if index + 1 >= len(sys.argv):
        raise ValueError("Missing argument for --compat.")
    compat_flag = sys.argv[index + 1]
    # Delete by position. ``list.remove(value)`` drops the FIRST equal element,
    # which could be an unrelated earlier argument with the same text.
    del sys.argv[index : index + 2]
    return compat_flag


def get_requirements(compat_name=None):
    """Read the requirement specifiers for the selected compatibility flavour.

    Args:
        compat_name: Key of ``_COMPAT_REQUIREMENTS_MAP``. ``None`` or an unknown
            key selects the ``"default"`` entry.

    Returns:
        A list of non-empty, non-comment lines from the requirements file.
    """
    requirements_parent_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements")
    requirements_file = _COMPAT_REQUIREMENTS_MAP.get(compat_name, _COMPAT_REQUIREMENTS_MAP["default"])
    with open(os.path.join(requirements_parent_path, requirements_file), encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f.read().splitlines())
    # Blank lines and ``#`` comments are not valid entries for ``install_requires``.
    return [line for line in stripped_lines if line and not line.startswith("#")]


compat = parse_args()
required = get_requirements(compat)

# The build script runs at module level, i.e. on every invocation of this file,
# before ``setup()`` is called. Architecture and Python version are passed through.
build_cmd = f"bash ./build.sh -j16 -a {machine()} -v {sys.version_info.major}.{sys.version_info.minor}"
p = run(build_cmd.split(), shell=False)
if p.returncode != 0:
    raise RuntimeError(f"Failed to build source({p.returncode})")


# NOTE(review): whitespace inside the long_description literal was collapsed in the
# source this was reconstructed from; a 4-space indent is assumed.
setup(
    name="mindstudio-probe-infer",
    version=__version__,
    description="Ascend Probe Utils",
    long_description="""
    MindStudio-Probe is a set of tools for diagnosing and improving model accuracy on Ascend NPU,
    including API accuracy, args checker, grad tool etc.
    """,
    long_description_content_type="text/markdown",
    url="https://gitee.com/ascend/mstt/tree/master/accuracy_tools/msprobe",
    author="Ascend Team",
    author_email="pmail_mindstudio@huawei.com",
    packages=find_packages(include=["msprobe", "msprobe*"]),
    package_data={"": ["LICENSE", "lib/*.so"]},
    license="Apache-2.0",
    keywords=["msprobe", "pytorch", "mindspore"],
    python_requires=">=3.7",
    install_requires=required,
    zip_safe=False,
    classifiers=[
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "Programming Language :: Python :: 3",
        "Programming Language :: C++",
        "Topic :: Scientific/Engineering",
        "Topic :: Scientific/Engineering :: Mathematics",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development",
        "Topic :: Software Development :: Libraries",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    entry_points={"console_scripts": ["msprobe_infer=msprobe.__main__:main"]},
)

# --- Remaining patch text that shared these source lines, kept as comments ---
# diff --git a/debug/accuracy_tools/msprobe/third_party/.keep b/debug/accuracy_tools/msprobe/third_party/.keep
# new file mode 100644
# index 0000000000..e69de29bb2
# -- Gitee
# From e681bd716bb3db9f08fe4d941aa7c12e8e1531f7 Mon Sep 17 00:00:00 2001
# From: caishangqiu
# Date: Thu, 28 Aug 2025 12:45:50 +0800
# Subject: [PATCH 2/2] refactoring base class
# ---
# .../msprobe/msprobe/base/__init__.py | 18 ++
# .../msprobe/msprobe/base/cmd.py | 81 +++++
# .../msprobe/base/component/__init__.py | 13 +
# .../msprobe/msprobe/base/component/manager.py | 278 ++++++++++++++++++
# .../msprobe/msprobe/base/config.py | 120 ++++++++
# .../msprobe/msprobe/base/service/__init__.py | 13 +
# .../msprobe/msprobe/base/service/manager.py | 114 +++++++
# 7 files changed, 637 insertions(+)
# create mode 100644 debug/accuracy_tools/msprobe/msprobe/base/__init__.py
# create mode 100644 debug/accuracy_tools/msprobe/msprobe/base/cmd.py
# create mode 100644
debug/accuracy_tools/msprobe/msprobe/base/component/__init__.py create mode 100644 debug/accuracy_tools/msprobe/msprobe/base/component/manager.py create mode 100644 debug/accuracy_tools/msprobe/msprobe/base/config.py create mode 100644 debug/accuracy_tools/msprobe/msprobe/base/service/__init__.py create mode 100644 debug/accuracy_tools/msprobe/msprobe/base/service/manager.py diff --git a/debug/accuracy_tools/msprobe/msprobe/base/__init__.py b/debug/accuracy_tools/msprobe/msprobe/base/__init__.py new file mode 100644 index 0000000000..f57ae818ae --- /dev/null +++ b/debug/accuracy_tools/msprobe/msprobe/base/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025-2025 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from msprobe.base.cmd import BaseCommand, Command +from msprobe.base.component.manager import BaseComponent, Component, ConsumerComp, ProducerComp, Scheduler +from msprobe.base.config import SIZE_1M, BaseConfig, Dict2Class +from msprobe.base.service.manager import BaseService, Service diff --git a/debug/accuracy_tools/msprobe/msprobe/base/cmd.py b/debug/accuracy_tools/msprobe/msprobe/base/cmd.py new file mode 100644 index 0000000000..e132a8702f --- /dev/null +++ b/debug/accuracy_tools/msprobe/msprobe/base/cmd.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025-2025 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from argparse import RawTextHelpFormatter
from sys import argv

from msprobe.utils.constants import CmdConst, MsgConst
from msprobe.utils.exceptions import MsprobeException


class Command:
    """
    A hierarchical command registration system that supports multi-level command structures.

    The registry lives in class-level state (``_cmd_map``), and ``__new__`` always
    returns the same instance.
    """

    _cmd_map = {}  # Internal storage: {parent_cmd: {name: command_class}}
    _instance = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super(Command, cls).__new__(cls)
        return cls._instance

    @classmethod
    def register(cls, parent_cmd, name):
        """Return a class decorator that registers the class as ``name`` under ``parent_cmd``."""

        def decorator(command_cls):
            cls._cmd_map.setdefault(parent_cmd, {})[name] = command_cls
            return command_cls

        return decorator

    @classmethod
    def get(cls, parent_cmd):
        """Return the ``{name: command_class}`` mapping for ``parent_cmd`` (empty dict if none)."""
        return cls._cmd_map.get(parent_cmd, {})


class BaseCommand(ABC):
    """Base class for commands that build nested argparse sub-parsers.

    ``subcommand_level`` is read by ``service_key`` and ``build_parser`` but is not
    assigned in ``__init__`` here. Presumably subclasses define it; TODO confirm.
    """

    def __init__(self):
        self.formatter_class = RawTextHelpFormatter

    @property
    def service_key(self):
        """Return ``sys.argv[subcommand_level]``, or ``None`` if argv is too short.

        Raises:
            MsprobeException: If ``subcommand_level`` is not a positive integer.
        """
        if isinstance(self.subcommand_level, int) and self.subcommand_level > 0:
            return argv[self.subcommand_level] if len(argv) > self.subcommand_level else None
        else:
            raise MsprobeException(MsgConst.INVALID_ARGU, "Subcommand level must be a positive integer.")

    @abstractmethod
    def add_arguments(self, parse):
        pass

    def build_parser(self, parent_parser, parent_cmd_class):
        """Recursively attach sub-parsers for every command registered under ``parent_cmd_class``.

        Each nesting level gets its own ``dest`` of the form ``L<depth>command``.
        When the call returns, ``subcommand_level`` holds the deepest level that
        was built below this call. For a single chain of sub-commands this is the
        same value the counter reached before this fix.

        Raises:
            MsprobeException: If the nesting exceeds ``MsgConst.MAX_RECURSION_DEPTH``.
        """
        if self.subcommand_level > MsgConst.MAX_RECURSION_DEPTH:
            raise MsprobeException(
                MsgConst.RISK_ALERT, f"Maximum recursion depth of {MsgConst.MAX_RECURSION_DEPTH} exceeded."
            )
        subcommands = Command.get(parent_cmd_class)
        if not subcommands:
            return
        self.subcommand_level += 1
        level = self.subcommand_level
        deepest = level
        subparsers = parent_parser.add_subparsers(dest=f"L{level}command")
        for name, cmd_class in subcommands.items():
            cmd_parser = subparsers.add_parser(
                name=name, help=CmdConst.HELP_TASK_MAP.get(name), formatter_class=self.formatter_class
            )
            # NOTE(review): called on the registered class object, not on an instance.
            # Presumably registered commands expose add_arguments as a class/static
            # method; confirm against the concrete commands.
            cmd_class.add_arguments(cmd_parser)
            # Restore the depth before descending into each sibling. Without this a
            # second sibling that has children would be labelled one level too deep
            # (e.g. "L3command" for a depth-2 parser).
            self.subcommand_level = level
            self.build_parser(cmd_parser, cmd_class)
            deepest = max(deepest, self.subcommand_level)
        self.subcommand_level = deepest


# --- Remaining patch text that shared these source lines, kept as comments ---
# diff --git a/debug/accuracy_tools/msprobe/msprobe/base/component/__init__.py b/debug/accuracy_tools/msprobe/msprobe/base/component/__init__.py
# new file mode 100644
# index 0000000000..53529bc8d3
# --- /dev/null
# +++ b/debug/accuracy_tools/msprobe/msprobe/base/component/__init__.py
# @@ -0,0 +1,13 @@
# Copyright (c) 2025-2025 Huawei Technologies Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# diff --git a/debug/accuracy_tools/msprobe/msprobe/base/component/manager.py b/debug/accuracy_tools/msprobe/msprobe/base/component/manager.py
# new file mode 100644
# index 0000000000..a3a4d337d2
# --- /dev/null
# +++ b/debug/accuracy_tools/msprobe/msprobe/base/component/manager.py
# @@ -0,0 +1,278 @@
# Copyright (c) 2025-2025 Huawei Technologies Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from collections import deque
from threading import RLock

from msprobe.utils.constants import MsgConst
from msprobe.utils.exceptions import MsprobeException
from msprobe.utils.toolkits import register


class BaseComponent(object):
    """
    Methods that need to be implemented:
        activate: Called when service.start() is invoked.
        deactivate: Called when service.stop() is invoked.
    """

    def __init__(self, priority=100):
        self.activated = False
        self.priority = priority

    @property
    def is_activated(self):
        return self.activated

    def activate(self, *args, **kwargs):
        pass

    def deactivate(self, *args, **kwargs):
        pass

    def do_activate(self):
        """Call ``activate()`` once; a no-op when already activated."""
        if self.activated:
            return
        self.activate()
        self.activated = True

    def do_deactivate(self):
        """Call ``deactivate()`` once; a no-op when not activated."""
        if not self.activated:
            return
        self.deactivate()
        self.activated = False


class ProducerComp(BaseComponent, ABC):
    """
    A ProducerComp can generate data.
    If the data is passively generated (e.g., when a consumer applies the data), implement "load_data".
    If the data is actively generated (e.g., when an interest event occurs),
    call "publish" to send it to subscribers.
    """

    def __init__(self, priority):
        super(ProducerComp, self).__init__(priority)
        self.output_buffer = deque()
        self.subscribers = set()

    @property
    def is_ready(self):
        """True when at least one package is waiting in the output buffer."""
        return len(self.output_buffer) > 0

    @abstractmethod
    def load_data(self):
        pass

    def publish(self, data, msg_id=0):
        """
        Wrap the data and pack it into the output buffer.

        The package is the list ``[self, data, msg_id]``. The producer is then
        enqueued on the Scheduler; this method does not run the scheduler loop itself.
        """
        self.output_buffer.append([self, data, msg_id])
        Scheduler().enqueue([self])

    def on_subscribe(self, comp):
        """Record ``comp`` as a subscriber.

        Raises:
            MsprobeException: If ``comp`` is not a ConsumerComp.
        """
        if not isinstance(comp, ConsumerComp):
            raise MsprobeException(MsgConst.INVALID_DATA_TYPE, "Only ConsumerComp can subscribe to ProducerComp.")
        self.subscribers.add(comp)

    def retrieve(self):
        """Pop and return the oldest buffered package, or ``None`` when the buffer is empty."""
        if self.output_buffer:
            return self.output_buffer.popleft()
        else:
            return None

    def do_load_data(self):
        """Publish the result of ``load_data()`` when the buffer is empty and the result is truthy."""
        if self.output_buffer:
            return
        data = self.load_data()
        if data:
            self.publish(data)

    def get_subscribers(self):
        return self.subscribers


class ConsumerComp(BaseComponent, ABC):
    """
    A ConsumerComp can consume data.
    Call "subscribe" to subscribe data from a ProducerComp.
    Implement "consume" to process data.
    """

    def __init__(self, priority):
        super(ConsumerComp, self).__init__(priority)
        # {producer: latest package from that producer, or None while waiting}
        self.dependencies = {}

    def subscribe(self, comp):
        """Subscribe this consumer to the producer ``comp``.

        Raises:
            MsprobeException: If ``comp`` is not a ProducerComp, if this consumer is
                already activated, or if the subscription would create a dependency cycle.
        """
        if not isinstance(comp, ProducerComp):
            # The previous wording ("Only ProducerComp can subscribe to ConsumerComp.")
            # stated the relationship backwards.
            raise MsprobeException(MsgConst.INVALID_DATA_TYPE, "ConsumerComp can only subscribe to ProducerComp.")
        if self.is_activated:
            raise MsprobeException(
                MsgConst.INVALID_DATA_TYPE, f"Component {comp} must be subscribed before activation."
            )
        if self.is_cycle(comp):
            raise MsprobeException(MsgConst.RISK_ALERT, "Cycle dependency detected! Subscription denied.")
        comp.on_subscribe(self)
        if comp not in self.dependencies:
            self.dependencies[comp] = None

    @abstractmethod
    def consume(self, packages):
        pass

    def is_cycle(self, comp, visited=None, stack=None):
        """Return True if subscribing this consumer to ``comp`` would close a dependency cycle.

        Walks ``dependencies`` upstream from ``comp`` depth-first. ``visited`` and
        ``stack`` are recursion state and are normally left as ``None`` by callers.
        """
        if visited is None:
            visited = set()
        if stack is None:
            # Top-level call: the subscriber itself is the start of the prospective
            # path. Without seeding it, a chain comp -> ... -> self (or comp is self)
            # was never reported because self was never on the stack.
            stack = {self}
        if comp in stack:
            return True
        if comp in visited:
            return False
        visited.add(comp)
        stack.add(comp)
        if isinstance(comp, ConsumerComp):
            for producer in comp.dependencies:
                if self.is_cycle(producer, visited, stack):
                    return True
        stack.remove(comp)
        return False

    def on_receive(self, package):
        """Store ``package`` under its producer, taken from ``package[0]``.

        Raises:
            MsprobeException: If storing the package fails for any reason.
        """
        try:
            self.dependencies[package[0]] = package
        except Exception as e:
            raise MsprobeException(
                MsgConst.PARSING_FAILED,
                "The first element in the data (self.output_buffer) published by the producer must be itself.",
            ) from e

    def get_empty_dependencies(self):
        """Return the producers for which no package is currently stored."""
        return [producer for producer, package in self.dependencies.items() if package is None]

    def do_consume(self):
        """
        Encapsulate the data in "dependencies" and invoke it using "consume".

        Does nothing while any dependency is still empty. Every slot is reset to
        ``None`` before ``consume`` is called.
        """
        if self.get_empty_dependencies():
            return
        packages = []
        for key in self.dependencies:
            packages.append(self.dependencies[key])
            self.dependencies[key] = None
        self.consume(packages)


class Component:
    """Name-to-class registry for component types."""

    _component_type_map = {}

    @classmethod
    def register(cls, name):
        return register(name, cls._component_type_map)

    @classmethod
    def get(cls, name):
        return cls._component_type_map.get(name)


class Scheduler:
    """Singleton that reference-counts components and drives the producer/consumer queue.

    Instance creation in ``__new__`` is guarded by a class-level ``RLock``.
    ``__init__`` initialises state only once.
    """

    _instance = None
    _lock = RLock()

    def __new__(cls):
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        if self._initialized:
            return
        self.comp_ref = {}  # {component: reference count}
        self.queue = deque()
        self.enqueued = set()  # mirrors the queue contents for O(1) de-duplication
        self.is_in_loop = False
        self._initialized = True

    def add(self, components):
        """Register ``components`` (activating new ones) and run the scheduling loop.

        NOTE(review): indentation was lost in the source this was reconstructed from.
        Activation and enqueueing are placed in the first-reference branch, mirroring
        ``remove()`` which deactivates on the last reference; confirm.
        """
        for comp in components:
            if comp in self.comp_ref:
                self.comp_ref[comp] += 1
            else:
                self.comp_ref[comp] = 1
                comp.do_activate()
                self.enqueue([comp])
        self.run_loop()

    def remove(self, components):
        """Drop one reference per component; deactivate and forget it on the last one."""
        for comp in components:
            if comp not in self.comp_ref:
                continue
            if self.comp_ref[comp] > 1:
                self.comp_ref[comp] -= 1
            else:
                comp.do_deactivate()
                del self.comp_ref[comp]

    def enqueue(self, comps):
        """Append components that are not already waiting in the queue."""
        for comp in comps:
            if comp not in self.enqueued:
                self.queue.append(comp)
                self.enqueued.add(comp)

    def run_loop(self):
        """Drain the queue. Re-entrant calls made while the loop is running return immediately."""
        if self.is_in_loop:
            return
        self.is_in_loop = True
        try:
            while self.queue:
                comp = self.queue.popleft()
                self.enqueued.remove(comp)
                # A component may be both consumer and producer, so both checks run.
                if isinstance(comp, ConsumerComp):
                    self._schedule_consumer(comp)
                if isinstance(comp, ProducerComp):
                    self._schedule_producer(comp)
        finally:
            self.is_in_loop = False

    def _schedule_producer(self, comp: ProducerComp):
        """Deliver one buffered package from ``comp`` to every subscriber and enqueue them."""
        if not comp.is_ready:
            return
        package = comp.retrieve()
        if not package:
            return
        subscribers = comp.get_subscribers()
        if not subscribers:
            return
        for subscriber in subscribers:
            subscriber.on_receive(package)
            self.enqueue([subscriber])

    def _schedule_consumer(self, comp: ConsumerComp):
        """Consume when all inputs are present; otherwise ask the missing producers to load data."""
        if not comp.dependencies:
            # A consumer with no subscriptions never has an "empty" dependency, so it
            # would be consumed and re-enqueued on every iteration and run_loop would
            # never terminate.
            return
        dependencies = comp.get_empty_dependencies()
        if not dependencies:
            comp.do_consume()
            # Re-enqueue so the next pass requests fresh data for the slots just reset.
            self.enqueue([comp])
            return
        for dependency in dependencies:
            dependency.do_load_data()
            if dependency.is_ready:
                self.enqueue([dependency])


# --- Remaining patch text that shared these source lines, kept as comments ---
# diff --git a/debug/accuracy_tools/msprobe/msprobe/base/config.py b/debug/accuracy_tools/msprobe/msprobe/base/config.py
# new file mode 100644
# index 0000000000..9368efdc6b
# --- /dev/null
# +++ b/debug/accuracy_tools/msprobe/msprobe/base/config.py
# @@ -0,0 +1,120 @@
# Copyright (c) 2025-2025 Huawei Technologies Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod

from msprobe.common.validation import (
    valid_buffer_size,
    valid_framework,
    valid_level,
    valid_log_level,
    valid_seed,
    valid_step_or_rank,
    valid_task,
)
from msprobe.utils.constants import CfgConst, MsgConst
from msprobe.utils.exceptions import MsprobeException
from msprobe.utils.io import load_json
from msprobe.utils.log import logger

SIZE_1M = 1_048_576  # 1024 * 1024


class BaseConfig(ABC):
    """Configuration loaded from a JSON file, with optional overrides passed to the constructor.

    Any access to ``check_config`` is intercepted by ``__getattribute__`` so that the
    common validation and the task-section lookup run before the subclass check.
    """

    def __init__(self, config_path, task="", framework="", step: list = None, level: list = None):
        self.config_path = config_path
        self.config = load_json(self.config_path)
        self.task_config = {}
        self.task = task
        self.framework = framework
        self.step = step
        self.level = level

    def __getattribute__(self, name):
        attr = object.__getattribute__(self, name)
        if name != "check_config" or not callable(attr):
            return attr

        def wrapper(*args, **kwargs):
            # Run the shared checks first, then the subclass-specific check.
            self._common_check()
            self._get_task_dict()
            return attr(*args, **kwargs)

        return wrapper

    @abstractmethod
    def check_config(self):
        pass

    def _get_task_dict(self):
        """Load the section named by the configured task into ``task_config``.

        Raises:
            MsprobeException: If that section is missing or empty.
        """
        self.task_config = self.config.get(self.config.get(CfgConst.TASK))
        if self.task_config:
            return
        raise MsprobeException(
            MsgConst.REQUIRED_ARGU_MISSING, f'Missing dictionary for key "{self.config.get(CfgConst.TASK)}".'
        )

    def _common_check(self):
        """Validate the shared top-level keys and write the validated values back into ``config``.

        Constructor overrides (task, framework, step, level) win over file values when truthy.
        """
        logger.info("Validating configuration file parameters.")
        cfg = self.config

        task_value = self.task or cfg.get(CfgConst.TASK, None)
        self._update_config(cfg, CfgConst.TASK, valid_task, task_value)

        framework_value = self.framework or cfg.get(CfgConst.FRAMEWORK, None)
        self._update_config(cfg, CfgConst.FRAMEWORK, valid_framework, framework_value)

        step_value = self.step or cfg.get(CfgConst.STEP, [])
        self._update_config(cfg, CfgConst.STEP, valid_step_or_rank, step_value)

        rank_value = cfg.get(CfgConst.RANK, [])
        self._update_config(cfg, CfgConst.RANK, valid_step_or_rank, rank_value)

        level_value = self.level or cfg.get(CfgConst.LEVEL, [CfgConst.LEVEL_API])
        self._update_config(cfg, CfgConst.LEVEL, valid_level, level_value)

        log_level_value = cfg.get(CfgConst.LOG_LEVEL, "info")
        self._update_config(cfg, CfgConst.LOG_LEVEL, valid_log_level, log_level_value)

        seed_value = cfg.get(CfgConst.SEED, None)
        self._update_config(cfg, CfgConst.SEED, valid_seed, seed_value)

        buffer_size_value = cfg.get(CfgConst.BUFFER_SIZE, SIZE_1M)
        self._update_config(cfg, CfgConst.BUFFER_SIZE, valid_buffer_size, buffer_size_value)

    def _update_config(self, dic: dict, key: str, check_fun, value: str):
        """Store ``check_fun(value)`` under ``key`` in ``dic``."""
        dic[key] = check_fun(value)


class Dict2Class:
    """Expose the entries of a (nested) dict as attributes.

    If the value stored under ``CfgConst.TASK`` is itself a key of ``data``, that
    section is popped and its entries become attributes as well. An ``"input"``
    entry of length 2 additionally yields ``input_shape`` and ``input_path``.

    NOTE(review): the section is popped from the caller's dict in place; confirm
    callers do not reuse ``data`` afterwards.
    """

    def __init__(self, data: dict, depth: int = 0):
        if depth > MsgConst.MAX_RECURSION_DEPTH:
            raise MsprobeException(
                MsgConst.RISK_ALERT, f"Maximum recursion depth of {MsgConst.MAX_RECURSION_DEPTH} exceeded."
            )
        if data.get(CfgConst.TASK) in data:
            task_section = data.pop(data.get(CfgConst.TASK))
            for key, value in task_section.items():
                if key == "input" and len(value) == 2:
                    self.input_shape = value[0]
                    self.input_path = value[1]
                setattr(self, key, value)
        for key, value in data.items():
            converted = Dict2Class(value, depth + 1) if isinstance(value, dict) else value
            setattr(self, key, converted)

    # Reached only when normal attribute lookup fails.
    # NOTE(review): hasattr()/getattr(obj, name, default) only swallow AttributeError;
    # whether MsprobeException derives from it is not visible here.
    @classmethod
    def __getattr__(cls, item):
        raise MsprobeException(MsgConst.ATTRIBUTE_ERROR, f"{cls.__name__} object has no attribute {item}.")


# --- Remaining patch text that shared these source lines, kept as comments ---
# diff --git a/debug/accuracy_tools/msprobe/msprobe/base/service/__init__.py b/debug/accuracy_tools/msprobe/msprobe/base/service/__init__.py
# new file mode 100644
# index 0000000000..53529bc8d3
# --- /dev/null
# +++ b/debug/accuracy_tools/msprobe/msprobe/base/service/__init__.py
# @@ -0,0 +1,13 @@
# Copyright (c) 2025-2025 Huawei Technologies Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# diff --git a/debug/accuracy_tools/msprobe/msprobe/base/service/manager.py b/debug/accuracy_tools/msprobe/msprobe/base/service/manager.py
# new file mode 100644
# index 0000000000..756f360981
# --- /dev/null
# +++ b/debug/accuracy_tools/msprobe/msprobe/base/service/manager.py
# @@ -0,0 +1,114 @@
# Copyright (c) 2025-2025 Huawei Technologies Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod

from msprobe.base import BaseComponent, Scheduler
from msprobe.common.validation import valid_task
from msprobe.utils.constants import CfgConst, CmdConst
from msprobe.utils.io import load_json
from msprobe.utils.toolkits import get_current_rank, register

# Tasks that are served by the "dump" service.
_TASK_SERVICE_MAP = {CfgConst.TASK_STAT: CmdConst.DUMP, CfgConst.TASK_TENSOR: CmdConst.DUMP}


class Service:
    """Facade that picks a registered service class, instantiates it and delegates to it.

    The service name comes from the ``serv_name`` keyword. When the ``cmd_namespace``
    keyword carries a config path, the name is instead derived from the task (taken
    from the task keyword, or from the JSON config file) via ``_TASK_SERVICE_MAP``.
    All positional and keyword arguments are forwarded to the selected class.

    NOTE(review): if no class is registered under the resolved name, ``get()``
    returns ``None`` and instantiation fails with a bare ``TypeError``.
    """

    _services_map = {}

    def __init__(self, *args, **kwargs):
        cmd_namespace = kwargs.get("cmd_namespace")
        serv_name = kwargs.get("serv_name")
        if hasattr(cmd_namespace, CfgConst.CONFIG_PATH):
            if not kwargs.get(CfgConst.TASK):
                config = load_json(cmd_namespace.config_path)
                task = valid_task(config.get(CfgConst.TASK))
            else:
                task = valid_task(kwargs.get(CfgConst.TASK))
            serv_name = _TASK_SERVICE_MAP.get(task)
        self.service_class = self.get(serv_name)
        self.service_instance = self.service_class(*args, **kwargs)

    def __getattr__(self, name):
        # __getattr__ runs only after normal lookup has failed. If service_instance
        # itself is missing (for example __init__ raised before assigning it),
        # delegating would re-enter this method without end.
        if name == "service_instance":
            raise AttributeError(name)
        return getattr(self.service_instance, name)

    @classmethod
    def register(cls, name):
        return register(name, cls._services_map)

    @classmethod
    def get(cls, name):
        return cls._services_map.get(name)


class BaseService(ABC):
    """Base class for services that own components and drive them through the Scheduler."""

    def __init__(self):
        self.comps = []
        self.current_step = 0
        self.scheduler = Scheduler()

    @property
    def is_skip(self):
        """When True, ``start``, ``step`` and ``stop`` return immediately. Always False here."""
        return False

    @property
    def current_rank(self):
        """Return the current rank as an int, or ``None`` if it cannot be determined."""
        try:
            return int(get_current_rank())
        except Exception:
            # Best effort: any failure to obtain or convert the rank yields None.
            return None

    @abstractmethod
    def construct(self):
        pass

    def start(self, *args, **kwargs):
        """
        Service startup workflow:
        1. Configure services (init_start).
        2. Build components (construct).
        3. Filter/prioritize components (ignore_actuator), then schedule execution.
        4. Schedule execution and cleanup.
        5. Post-processing (finalize_start).
        """
        if self.is_skip:
            return
        self.init_start()
        self.construct()
        # Every instance attribute that is a component is collected automatically.
        for attr in self.__dict__.values():
            if isinstance(attr, BaseComponent) and (attr not in self.comps):
                self.comps.append(attr)
                # NOTE(review): indentation was lost in the source this was
                # reconstructed from. The hook is placed so it runs once per newly
                # collected component; confirm.
                self.ignore_actuator(attr)
        self.comps.sort(key=lambda x: x.priority)
        self.scheduler.add(self.comps)
        self.finalize_start()

    def init_start(self):
        pass

    def ignore_actuator(self, attr):
        pass

    def finalize_start(self):
        pass

    def step(self, *args, **kwargs):
        if self.is_skip:
            return
        self.current_step += 1

    def stop(self, *args, **kwargs):
        """Release this service's components from the scheduler and clear the list."""
        if self.is_skip:
            return
        self.scheduler.remove(self.comps)
        self.comps.clear()


# --- Remaining patch text that shared these source lines, kept as comments ---
# -- Gitee