diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..b08433f072bf89f62edf88b3aff40d24c1040ea8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "dynolog_npu/third_party/dynolog"] + path = dynolog_npu/third_party/dynolog + url = https://github.com/facebookincubator/dynolog.git diff --git a/dynolog_npu/README.md b/dynolog_npu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..06ca0c4d929331b656ace5cc0a0f5802ce331c86 --- /dev/null +++ b/dynolog_npu/README.md @@ -0,0 +1,65 @@ +# Ascend Extension for dynolog + +## 安装 + +### 1. clone 代码 + +```bash +git clone https://gitee.com/ascend/mstt.git -b poc +``` + +### 2. 安装依赖 +dynolog的编译依赖,确保安装了以下依赖: + + + + + + + + + + + + + +
+| Language | Toolchain |
+| --- | --- |
+| C++ | gcc 8.5.0+ |
+| Rust | Rust 1.58.1 (1.56+ required for clap dependency) |
+ +- 安装rust + +```bash +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh + +source $HOME/.cargo/env +``` + +- 安装ninja + +```bash +# debian +sudo apt-get install -y cmake ninja-build + +# centos +sudo yum install -y cmake ninja +``` + +### 3. 编译 + +默认编译生成dyno和dynolog二进制文件, -t参数可以支持将二进制文件打包成deb包或rpm包. + +```bash +# 编译dyno和dynolog二进制文件 +bash scripts/build.sh + +# 编译deb包, 当前支持amd64和aarch64平台, 默认为amd64, 编译aarch64平台需要将third_party/dynolog/scripts/debian/control文件中的Architecture改为arm64 (Debian对aarch64的架构名) +bash scripts/build.sh -t deb + +# 编译rpm包, 当前只支持amd64平台 +bash scripts/build.sh -t rpm +``` diff --git a/dynolog_npu/dynolog_npu/dynolog/src/Main.cpp b/dynolog_npu/dynolog_npu/dynolog/src/Main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8e5177768327e37173d4e7661e334a9400bd6172 --- /dev/null +++ b/dynolog_npu/dynolog_npu/dynolog/src/Main.cpp @@ -0,0 +1,206 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +// Dynolog : A portable telemetry monitoring daemon. 
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <chrono>
+#include <nlohmann/json.hpp>
+#include <thread>
+#include "dynolog/src/CompositeLogger.h"
+#include "dynolog/src/FBRelayLogger.h"
+#include "dynolog/src/KernelCollector.h"
+#include "dynolog/src/Logger.h"
+#include "dynolog/src/ODSJsonLogger.h"
+#include "dynolog/src/PerfMonitor.h"
+#include "dynolog/src/ScubaLogger.h"
+#include "dynolog/src/ServiceHandler.h"
+#include "dynolog/src/gpumon/DcgmGroupInfo.h"
+#include "dynolog/src/rpc/SimpleJsonServer.h"
+#include "dynolog/src/rpc/SimpleJsonServerInl.h"
+#include "dynolog/src/tracing/IPCMonitor.h"
+#include "hbt/src/perf_event/BuiltinMetrics.h"
+
+#ifdef USE_PROMETHEUS
+#include "dynolog/src/PrometheusLogger.h"
+#endif
+
+using namespace dynolog;
+using json = nlohmann::json;
+namespace hbt = facebook::hbt;
+
+DEFINE_int32(port, 1778, "Port for listening RPC requests.");
+DEFINE_bool(use_JSON, false, "Emit metrics to JSON file through JSON logger");
+#ifdef USE_PROMETHEUS
+DEFINE_bool(use_prometheus, false, "Emit metrics to Prometheus");
+#endif
+DEFINE_bool(use_fbrelay, false, "Emit metrics to FB Relay on Lab machines");
+DEFINE_bool(use_ODS, false, "Emit metrics to ODS through ODS logger");
+DEFINE_bool(use_scuba, false, "Emit metrics to Scuba through Scuba logger");
+DEFINE_int32(
+    kernel_monitor_reporting_interval_s,
+    60,
+    "Duration in seconds to read and report metrics for kernel monitor");
+DEFINE_int32(
+    perf_monitor_reporting_interval_s,
+    60,
+    "Duration in seconds to read and report metrics for performance monitor");
+DEFINE_int32(
+    dcgm_reporting_interval_s,
+    10,
+    "Duration in seconds to read and report metrics for DCGM");
+DEFINE_bool(
+    enable_ipc_monitor,
+    false,
+    "Enabled IPC monitor for on system tracing requests.");
+DEFINE_bool(
+    enable_gpu_monitor,
+    false,
+    "Enabled GPU monitorng, currently supports NVIDIA GPUs.");
+DEFINE_bool(enable_perf_monitor, false, "Enable heartbeat perf monitoring.");
+// Builds a composite logger with one sink per enabled use_* flag.
+std::unique_ptr<Logger> getLogger(const std::string& scribe_category = "") {
+  std::vector<std::unique_ptr<Logger>> loggers;
+#ifdef USE_PROMETHEUS
+  if (FLAGS_use_prometheus) {
+    loggers.push_back(std::make_unique<PrometheusLogger>());
+  }
+#endif
+  if (FLAGS_use_fbrelay) {
+    loggers.push_back(std::make_unique<FBRelayLogger>());
+  }
+  if (FLAGS_use_ODS) {
+    loggers.push_back(std::make_unique<ODSJsonLogger>());
+  }
+  if (FLAGS_use_JSON) {
+    loggers.push_back(std::make_unique<JsonLogger>());
+  }
+  if (FLAGS_use_scuba && !scribe_category.empty()) {
+    loggers.push_back(std::make_unique<ScubaLogger>(scribe_category));
+  }
+  return std::make_unique<CompositeLogger>(std::move(loggers));
+}
+// Returns the steady-clock time point `sec` seconds from now.
+auto next_wakeup(int sec) {
+  return std::chrono::steady_clock::now() + std::chrono::seconds(sec);
+}
+// Runs forever: steps and logs the KernelCollector once per interval.
+void kernel_monitor_loop() {
+  KernelCollector kc;
+
+  LOG(INFO) << "Running kernel monitor loop : interval = "
+            << FLAGS_kernel_monitor_reporting_interval_s << " s.";
+
+  while (1) {
+    auto logger = getLogger();
+    auto wakeup_timepoint =
+        next_wakeup(FLAGS_kernel_monitor_reporting_interval_s);
+
+    kc.step();
+    kc.log(*logger);
+    logger->finalize();
+
+    /* sleep override */
+    std::this_thread::sleep_until(wakeup_timepoint);
+  }
+}
+// Runs forever: steps and logs the PerfMonitor once per interval.
+void perf_monitor_loop() {
+  PerfMonitor pm(
+      hbt::CpuSet::makeAllOnline(),
+      std::vector<std::string>{"instructions", "cycles"},
+      getDefaultPmuDeviceManager(),
+      getDefaultMetrics());
+
+  LOG(INFO) << "Running perf monitor loop : interval = "
+            << FLAGS_perf_monitor_reporting_interval_s << " s.";
+
+  while (1) {
+    auto logger = getLogger();
+    auto wakeup_timepoint =
+        next_wakeup(FLAGS_perf_monitor_reporting_interval_s);
+
+    pm.step();
+    pm.log(*logger);
+
+    logger->finalize();
+    /* sleep override */
+    std::this_thread::sleep_until(wakeup_timepoint);
+  }
+}
+// Creates the JSON RPC server for `handler`, listening on FLAGS_port.
+auto setup_server(std::shared_ptr<ServiceHandler> handler) {
+  return std::make_unique<SimpleJsonServer<ServiceHandler>>(
+      handler, FLAGS_port);
+}
+// Runs forever: updates and logs DCGM data once per interval.
+void gpu_monitor_loop(std::shared_ptr<gpumon::DcgmGroupInfo> dcgm) {
+  auto logger = getLogger(FLAGS_scribe_category);
+
+  LOG(INFO) << "Running DCGM loop : interval = "
+            << FLAGS_dcgm_reporting_interval_s << " s.";
+  LOG(INFO) << "DCGM fields: " << gpumon::FLAGS_dcgm_fields;
+
+  while (1) {
+    auto wakeup_timepoint = next_wakeup(FLAGS_dcgm_reporting_interval_s);
+
+    dcgm->update();
+    dcgm->log(*logger);
+
+    /* sleep override */
+    std::this_thread::sleep_until(wakeup_timepoint);
+  }
+}
+// Parses flags, starts the enabled monitors and the RPC server, then joins.
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  FLAGS_logtostderr = 1;
+  google::InitGoogleLogging(argv[0]);
+
+  LOG(INFO) << "Starting Ascend Extension for dynolog, version = " DYNOLOG_VERSION
+            << ", build git-hash = " DYNOLOG_GIT_REV;
+
+  std::shared_ptr<gpumon::DcgmGroupInfo> dcgm;
+
+  std::unique_ptr<tracing::IPCMonitor> ipcmon;
+  std::unique_ptr<std::thread> ipcmon_thread, gpumon_thread, pm_thread;
+
+  if (FLAGS_enable_ipc_monitor) {
+    LOG(INFO) << "Starting IPC Monitor";
+    ipcmon = std::make_unique<tracing::IPCMonitor>();
+    ipcmon_thread =
+        std::make_unique<std::thread>([&ipcmon]() { ipcmon->loop(); });
+  }
+
+  if (FLAGS_enable_gpu_monitor) {
+    dcgm = gpumon::DcgmGroupInfo::factory(
+        gpumon::FLAGS_dcgm_fields, FLAGS_dcgm_reporting_interval_s * 1000);
+    gpumon_thread = std::make_unique<std::thread>(gpu_monitor_loop, dcgm);
+  }
+  std::thread km_thread{kernel_monitor_loop};
+  if (FLAGS_enable_perf_monitor) {
+    pm_thread = std::make_unique<std::thread>(perf_monitor_loop);
+  }
+
+  // setup service
+  auto handler = std::make_shared<ServiceHandler>(dcgm);
+
+  // use simple json RPC server for now
+  auto server = setup_server(handler);
+  server->run();
+
+  km_thread.join();
+  if (pm_thread) {
+    pm_thread->join();
+  }
+  if (gpumon_thread) {
+    gpumon_thread->join();
+  }
+
+  server->stop();
+
+  return 0;
+}
diff --git a/dynolog_npu/scripts/apply_dyno_patches.sh b/dynolog_npu/scripts/apply_dyno_patches.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c492db74a2a56948433a47e9cffcccd4ac71e098
--- /dev/null
+++ b/dynolog_npu/scripts/apply_dyno_patches.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Applies every Ascend patch found in patches/ to the dynolog submodule.
+# Must be run from the directory that contains third_party/ and patches/.
+set -e
+
+# Applies ../../patches/*.patch from inside third_party/dynolog. Each patch is
+# validated with "git apply --check" before it is applied for real.
+# Returns 0 when every patch applies and 1 on the first failure; the starting
+# directory is restored before returning.
+apply_ascend_patches() {
+    cd ./third_party/dynolog || return 1
+
+    if [ ! -d "../../patches" ]; then
+        echo "ERROR: patches directory not found"
+        cd ../..
+        return 1
+    fi
+
+    for patch_file in ../../patches/*.patch; do
+        # An unmatched glob stays a literal string, so skip non-files.
+        [ -f "$patch_file" ] || continue
+
+        echo "Applying patch: $patch_file"
+        # Test the commands directly: with "set -e" a separate "$?" check is
+        # never reached because the shell exits at the failing command.
+        if ! git apply --check -p1 "$patch_file" ||
+           ! git apply -p1 "$patch_file"; then
+            echo "ERROR: Failed to apply patch: $(basename "$patch_file")"
+            cd ../..
+            return 1
+        fi
+    done
+
+    cd ../..
+    echo "Successfully applied all Ascend patches"
+    return 0
+}
+
+apply_ascend_patches
diff --git a/dynolog_npu/scripts/build.sh b/dynolog_npu/scripts/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..aa3508e14faa6bfea06afe0cd3083ad1a5317037
--- /dev/null
+++ b/dynolog_npu/scripts/build.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+# Builds dynolog with the Ascend patches applied; "-t deb" or "-t rpm" also
+# packages the result.
+set -e
+
+# Work from the parent of scripts/ so the relative paths below resolve no
+# matter which directory the script is invoked from.
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+# Checks that gcc is installed and is at least version 8.5.
+check_gcc_version() {
+    if ! command -v gcc >/dev/null 2>&1; then
+        echo "ERROR: gcc command not found"
+        return 1
+    fi
+
+    # Distro builds of gcc may print only the major version for -dumpversion;
+    # asking for -dumpfullversion first yields major.minor.patch on GCC 7+.
+    local GCC_VERSION GCC_MAJOR GCC_MINOR
+    GCC_VERSION=$(gcc -dumpfullversion -dumpversion)
+    GCC_MAJOR=$(echo "$GCC_VERSION" | cut -d. -f1)
+    # -s yields an empty string when there is no "." so a bare major
+    # version is treated as minor 0 instead of repeating the major number.
+    GCC_MINOR=$(echo "$GCC_VERSION" | cut -s -d. -f2)
+    GCC_MINOR=${GCC_MINOR:-0}
+
+    if [ "$GCC_MAJOR" -lt 8 ] || { [ "$GCC_MAJOR" -eq 8 ] && [ "$GCC_MINOR" -lt 5 ]; }; then
+        echo "ERROR: gcc version must be greater than or equal to 8.5.0"
+        echo "Current gcc version: $GCC_VERSION"
+        return 1
+    fi
+    echo "Check pass: current gcc version is $GCC_VERSION"
+    return 0
+}
+
+# Checks that rustc is installed and is at least version 1.56.
+check_rust_version() {
+    if ! command -v rustc >/dev/null 2>&1; then
+        echo "ERROR: rustc command not found"
+        return 1
+    fi
+
+    # The version is the second space-separated field of "rustc --version".
+    local RUST_VERSION RUST_MAJOR RUST_MINOR
+    RUST_VERSION=$(rustc --version | cut -d' ' -f2)
+    RUST_MAJOR=$(echo "$RUST_VERSION" | cut -d. -f1)
+    RUST_MINOR=$(echo "$RUST_VERSION" | cut -d. -f2)
+
+    if [ "$RUST_MAJOR" -lt 1 ] || { [ "$RUST_MAJOR" -eq 1 ] && [ "$RUST_MINOR" -lt 56 ]; }; then
+        echo "ERROR: Rust version must be greater than or equal to 1.56.0"
+        echo "Current Rust version: $RUST_VERSION"
+        return 1
+    fi
+    echo "Check pass: current Rust version is $RUST_VERSION"
+    return 0
+}
+
+# Initializes the git submodules and checks third_party/dynolog out at the
+# pinned commit.
+update_and_checkout_submodule() {
+    local DYNLOG_COMMIT_ID="a9b6aeddcd6363252f5388cb0dd942981a09a24b"
+
+    # Commands are tested directly: with "set -e" a separate "$?" check after
+    # a failing command is never reached.
+    if ! git submodule update --init --recursive; then
+        echo "ERROR: update git submodule failed"
+        return 1
+    fi
+
+    cd ./third_party/dynolog || return 1
+    if ! git checkout "${DYNLOG_COMMIT_ID}"; then
+        echo "ERROR: switch to dynolog specified commit failed"
+        # Go up two levels, like the success path, to return to the start.
+        cd ../..
+        return 1
+    fi
+    echo "Check pass: switch to dynolog specified commit ${DYNLOG_COMMIT_ID}"
+    cd ../../
+    return 0
+}
+
+PACKAGE_TYPE=""
+while getopts "t:" opt; do
+    case $opt in
+        t)
+            PACKAGE_TYPE="$OPTARG"
+            if [[ "$PACKAGE_TYPE" != "deb" && "$PACKAGE_TYPE" != "rpm" ]]; then
+                echo "ERROR: Invalid package type. Supported types: deb, rpm"
+                exit 1
+            fi
+            ;;
+        \?)
+            echo "Usage: $0 [-t package_type]"
+            echo "package_type: deb or rpm (optional, if not specified will only build)"
+            exit 1
+            ;;
+    esac
+done
+
+echo "------------------ Check GCC and Rust version ----------------------"
+check_gcc_version
+check_rust_version
+
+echo "------------------ Update and checkout submodule -------------------"
+update_and_checkout_submodule
+
+echo "------------------ Generate patch for Ascend -----------------------"
+bash scripts/gen_dyno_patches.sh
+
+echo "------------------ Apply patch for Ascend --------------------------"
+bash scripts/apply_dyno_patches.sh
+
+echo "------------------ Build dynolog patch for Ascend-------------------"
+cd third_party/dynolog
+rm -rf build
+if [ -z "$PACKAGE_TYPE" ]; then
+    bash scripts/build.sh
+    echo "Build dynolog success without packaging"
+elif [ "$PACKAGE_TYPE" = "deb" ]; then
+    bash scripts/debian/make_deb.sh
+    mv dynolog_*.deb ../../
+    echo "Build dynolog deb package success"
+elif [ "$PACKAGE_TYPE" = "rpm" ]; then
+    bash scripts/rpm/make_rpm.sh
+    mv dynolog_*.rpm ../../
+    echo "Build dynolog rpm package success"
+fi
diff --git a/dynolog_npu/scripts/gen_dyno_patches.sh b/dynolog_npu/scripts/gen_dyno_patches.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d8caef7eb015afde92efd7dbc146f88969dd7984
--- /dev/null
+++ b/dynolog_npu/scripts/gen_dyno_patches.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Generates one unified-diff patch for each file under dynolog_npu/ that also
+# exists in third_party/dynolog, and stores the patches in patches/.
+set -e
+
+WORK_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+PATCHES_DIR="${WORK_DIR}/patches"
+DYNOLOG_DIR="${WORK_DIR}/third_party/dynolog"
+MODIFIED_FILES_DIR="${WORK_DIR}/dynolog_npu"
+
+mkdir -p "${PATCHES_DIR}"
+
+# Rebuilds the contents of patches/ from scratch. Returns 1 when the
+# modified-files directory is missing or diff reports trouble, 0 otherwise.
+generate_patches() {
+    echo "Generating patches from modified files..."
+
+    # Make sure the directory holding the modified files exists.
+    if [ ! -d "${MODIFIED_FILES_DIR}" ]; then
+        echo "ERROR: dynolog_npu directory not found"
+        return 1
+    fi
+
+    # Remove patches left over from a previous run.
+    rm -f "${PATCHES_DIR}"/*.patch
+
+    local modified_file rel_path original_file patch_file diff_status
+
+    # Walk every modified file. NUL-delimited names keep unusual paths intact,
+    # and feeding the loop through process substitution keeps it in this
+    # shell, so "return" leaves the function.
+    while IFS= read -r -d '' modified_file; do
+        # Path relative to the modified-files directory.
+        rel_path="${modified_file#"${MODIFIED_FILES_DIR}"/}"
+        original_file="${DYNOLOG_DIR}/${rel_path}"
+
+        # Only files that also exist in the dynolog checkout can be diffed.
+        if [ ! -f "${original_file}" ]; then
+            echo "WARNING: Original file not found: ${original_file}"
+            continue
+        fi
+
+        # Patch name: the relative path with "/" replaced by "_".
+        patch_file="${PATCHES_DIR}/${rel_path//\//_}.patch"
+
+        echo "Generating patch for: ${rel_path}"
+
+        # diff exits 0 for identical files, 1 for differences and 2 on
+        # trouble; only the last is an error. The relative paths make the
+        # headers written by diff relative to WORK_DIR.
+        diff_status=0
+        (
+            cd "${WORK_DIR}" &&
+                diff -u "third_party/dynolog/${rel_path}" "dynolog_npu/${rel_path}" > "${patch_file}"
+        ) || diff_status=$?
+        if [ "${diff_status}" -gt 1 ]; then
+            echo "ERROR: diff failed for: ${rel_path}"
+            rm -f "${patch_file}"
+            return 1
+        fi
+
+        # An empty patch file means there were no differences; drop it.
+        if [ ! -s "${patch_file}" ]; then
+            rm "${patch_file}"
+            echo "No differences found for: ${rel_path}"
+        else
+            echo "Successfully generated patch: ${patch_file}"
+        fi
+    done < <(find "${MODIFIED_FILES_DIR}" -type f -print0)
+
+    echo "Patch generation completed"
+    return 0
+}
+
+generate_patches