From c7de815ffc8c08107a9f7dfd32a73a7584060d46 Mon Sep 17 00:00:00 2001 From: Gallium Date: Tue, 27 May 2025 17:47:52 +0800 Subject: [PATCH 1/3] =?UTF-8?q?msmonitor=20=E6=8E=A5=E5=85=A5tensorboard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- msmonitor/README.md | 13 ++ msmonitor/dynolog_npu/CMakeLists.txt | 88 +++++++++++ .../dynolog/src/DynologTensorBoardLogger.cpp | 139 ++++++++++++++++++ .../dynolog/src/DynologTensorBoardLogger.h | 97 ++++++++++++ msmonitor/dynolog_npu/dynolog/src/Main.cpp | 4 + .../dynolog/src/{Metric.cpp => Metrics.cpp} | 0 .../dynolog/src/MsMonitorMetrics.h | 33 +++++ .../dynolog/src/tracing/IPCMonitor.cpp | 10 +- msmonitor/scripts/build.sh | 8 +- 9 files changed, 388 insertions(+), 4 deletions(-) create mode 100644 msmonitor/dynolog_npu/CMakeLists.txt create mode 100644 msmonitor/dynolog_npu/dynolog/src/DynologTensorBoardLogger.cpp create mode 100644 msmonitor/dynolog_npu/dynolog/src/DynologTensorBoardLogger.h rename msmonitor/dynolog_npu/dynolog/src/{Metric.cpp => Metrics.cpp} (100%) create mode 100644 msmonitor/dynolog_npu/dynolog/src/MsMonitorMetrics.h diff --git a/msmonitor/README.md b/msmonitor/README.md index b28f387f4e2..00434165db6 100644 --- a/msmonitor/README.md +++ b/msmonitor/README.md @@ -49,6 +49,19 @@ sudo apt-get install -y cmake ninja-build sudo yum install -y cmake ninja ``` +- 安装protobuf (tensorboard_logger三方依赖,用于对接tensorboard展示) +安装 +```bash +# debian +sudo apt install -y protobuf-compiler libprotobuf-dev + +# centos +sudo yum install -y protobuf protobuf-devel protobuf-compiler + +# Python +pip install protobuf +``` + - 安装openssl(RPC TLS认证)& 生成证书密钥 安装 ```bash diff --git a/msmonitor/dynolog_npu/CMakeLists.txt b/msmonitor/dynolog_npu/CMakeLists.txt new file mode 100644 index 00000000000..d0da4c68f37 --- /dev/null +++ b/msmonitor/dynolog_npu/CMakeLists.txt @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+cmake_minimum_required(VERSION 3.16)
+
+project(Dynolog VERSION 1.0)
+option(BUILD_TESTS "Build the unit tests" ON)
+option(USE_ODS_GRAPH_API "Enable logger to Meta ODS using public Graph API."
+OFF)
+option(USE_JSON_GENERATED_PERF_EVENTS "Add performance events generated using
+Intel json spec, see hbt/src/perf_event/json_events/intel"
+OFF)
+option(USE_PROMETHEUS "Enable logging to prometheus, this requires
+prometheus-cpp to be installed on the system"
+OFF)
+
+if(USE_PROMETHEUS)
+  find_package(prometheus-cpp CONFIG REQUIRED)
+endif()
+
+file(READ "version.txt" DYNOLOG_VERSION)
+string(STRIP ${DYNOLOG_VERSION} DYNOLOG_VERSION)
+
+execute_process (
+  COMMAND git rev-parse --short HEAD
+  OUTPUT_VARIABLE DYNOLOG_GIT_REV
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+set(DYNOLOG_VERSION "\"${DYNOLOG_VERSION}\"")
+set(DYNOLOG_GIT_REV "\"${DYNOLOG_GIT_REV}\"")
+message("Dynolog version = ${DYNOLOG_VERSION}")
+message("Dynolog git rev = ${DYNOLOG_GIT_REV}")
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+set(CMAKE_POSITION_INDEPENDENT_CODE True)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
+
+if(BUILD_TESTS)
+  enable_testing()
+  add_subdirectory("third_party/googletest" "third_party/googletest")
+endif()
+
+include_directories(".")
+add_subdirectory(dynolog)
+add_subdirectory(cli)
+# The following dummy dependency ensures the cli is built
+add_dependencies(dynolog_lib dyno)
+add_subdirectory(hbt)
+
+# Third party deps
+set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "")
+set(BUILD_SAMPLES OFF CACHE BOOL "")
+set(BUILD_TEST OFF CACHE BOOL "")
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "")
+
+set(BUILD_TESTING OFF CACHE BOOL "")
+set(WITH_GFLAGS OFF CACHE BOOL "")
+add_subdirectory(third_party/glog)
+target_link_libraries(dynolog_lib PUBLIC glog::glog)
+
+set(GFLAGS_BUILD_TESTING OFF CACHE BOOL "")
+add_subdirectory(third_party/gflags)
+target_link_libraries(dynolog_lib PUBLIC gflags::gflags)
+
+# https://github.com/nlohmann/json#cmake
+set(JSON_BuildTests OFF CACHE INTERNAL "")
+add_subdirectory(third_party/json)
+target_link_libraries(dynolog_lib PUBLIC nlohmann_json::nlohmann_json)
+
+add_subdirectory(third_party/pfs)
+target_include_directories(dynolog_lib PUBLIC third_party/pfs/include)
+target_link_libraries(dynolog_lib PUBLIC pfs)
+
+add_subdirectory(third_party/fmt)
+target_link_libraries(dynolog_lib PUBLIC fmt::fmt)
+
+add_subdirectory(third_party/tensorboard_logger)
+target_include_directories(dynolog_lib PUBLIC third_party/tensorboard_logger/include)
+target_link_libraries(dynolog_lib PUBLIC tensorboard_logger)
+
+if(USE_ODS_GRAPH_API)
+  add_subdirectory(third_party/cpr)
+  target_link_libraries(dynolog_lib PUBLIC cpr::cpr)
+endif()
diff --git a/msmonitor/dynolog_npu/dynolog/src/DynologTensorBoardLogger.cpp b/msmonitor/dynolog_npu/dynolog/src/DynologTensorBoardLogger.cpp
new file mode 100644
index 00000000000..9b15a2a8a12
--- /dev/null
+++ b/msmonitor/dynolog_npu/dynolog/src/DynologTensorBoardLogger.cpp
@@ -0,0 +1,163 @@
+#include "DynologTensorBoardLogger.h"
+
+#include <glog/logging.h>
+
+#include "hbt/src/common/System.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+#include <filesystem>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <system_error>
+#include <utility>
+
+DEFINE_string(metric_log_dir, "", "The Path to store tensorboard logs");
+
+namespace dynolog {
+
+const std::string TensorBoardLoggerImpl::log_file_name_ = "tfevents.pb";
+std::filesystem::path TensorBoardLoggerManager::log_path_ = "";
+
+// Binds the logger to metric_log_dir. Throws std::runtime_error when the
+// directory fails validateLogDir(), so a bad configuration is not ignored.
+DynologTensorBoardLogger::DynologTensorBoardLogger(const std::string& metric_log_dir)
+    : logPath_(metric_log_dir) {
+  if (!validateLogDir(logPath_)) {
+    throw std::runtime_error("Unable to record logs in the target folder");
+  }
+
+  LOG(INFO) << "Initialized tensorboard logger on = " << logPath_;
+}
+
+// Forwards the "duration" value of the current record to the per-device
+// TensorBoard writer, then resets the per-record state.
+void DynologTensorBoardLogger::finalize() {
+  // find() rather than operator[]: reading must not insert empty entries.
+  const auto dynamicValue = [this](const std::string& key) {
+    const auto it = dynamic_metrics_.find(key);
+    return it == dynamic_metrics_.end() ? std::string{} : it->second;
+  };
+
+  const std::string kind = dynamicValue("kind");
+  const auto duration = kvs_.find("duration");
+  if (!kind.empty() && duration != kvs_.end()) {
+    std::string deviceId = dynamicValue("deviceId");
+    if (deviceId == "-1") {
+      deviceId = "host";
+    }
+    const std::string tag =
+        kind == "Marker" ? kind + "/" + dynamicValue("domain") : kind;
+
+    // Hold the manager lock before touching shared manager state, including
+    // the static log path.
+    auto logging_guard = TensorBoardLoggerManager::singleton();
+    TensorBoardLoggerManager::logPath(logPath_);
+    logging_guard.manager->log(
+        MsptiMetricDesc{deviceId, kind, tag, "duration", duration->second});
+  }
+
+  // Clear the record so a stale "domain" or "duration" cannot leak into the
+  // next one.
+  kvs_.clear();
+  dynamic_metrics_.clear();
+}
+
+// Returns true only if path exists, is a directory, is not itself a symlink,
+// and is owned by the calling user (any owner is accepted when running as root).
+bool DynologTensorBoardLogger::validateLogDir(const std::string& path) {
+  std::filesystem::path log_path(path);
+
+  if (!std::filesystem::exists(log_path)) {
+    LOG(ERROR) << "Error: Path does not exist: " << path;
+    return false;
+  }
+
+  if (!std::filesystem::is_directory(log_path)) {
+    LOG(ERROR) << "Error: Path is not a directory: " << path;
+    return false;
+  }
+
+  if (std::filesystem::is_symlink(log_path)) {
+    LOG(ERROR) << "Error: Path is a symbolic link: " << path;
+    return false;
+  }
+
+  struct stat info;
+  if (stat(path.c_str(), &info) != 0) {
+    LOG(ERROR) << "Error: Cannot stat path: " << path;
+    return false;
+  }
+
+  uid_t current_uid = getuid();
+  if (info.st_uid != current_uid && current_uid != 0) {
+    LOG(ERROR) << "Error: Path is not owned by current user";
+    return false;
+  }
+  return true;
+}
+
+// static: process-wide manager instance, created on first use.
+std::shared_ptr<TensorBoardLoggerManager> TensorBoardLoggerManager::singleton_() {
+  static std::shared_ptr<TensorBoardLoggerManager> manager_ =
+      std::make_shared<TensorBoardLoggerManager>();
+  return manager_;
+}
+
+// static: returns the manager together with a lock held on its mutex.
+TensorBoardLoggerManager::LoggingGuard TensorBoardLoggerManager::singleton() {
+  auto s = singleton_();
+  return LoggingGuard{.manager = s, .lock_guard = s->lock()};
+}
+
+// True when desc.kind_ is a known kind and desc.metric_name_ is one of the
+// metrics listed for it in validMetrics_.
+bool TensorBoardLoggerManager::isValidMetric(const MsptiMetricDesc& desc) {
+  const auto it = validMetrics_.find(desc.kind_);
+  return it != validMetrics_.end() && it->second.count(desc.metric_name_) > 0;
+}
+
+// Returns the current step for (device, kind) and advances it by one.
+uint64_t TensorBoardLoggerManager::getCurStep(const std::string& device, const std::string& kind) {
+  auto key = std::make_pair(device, kind);
+  return device_kind2_step_[key]++;
+}
+
+// Writes desc to the writer of its device; unknown kinds/metrics are dropped.
+void TensorBoardLoggerManager::log(const MsptiMetricDesc& desc) {
+  if (!isValidMetric(desc)) {
+    return;
+  }
+
+  const std::string& device = desc.device_id_;
+  // Look up the per-device TensorBoardLoggerImpl, creating it on first use.
+  auto it = device_loggers_.find(device);
+  if (it == device_loggers_.end()) {
+    const std::string device_log_path = log_path_ / ("device_" + device);
+    it = device_loggers_
+             .emplace(device, std::make_shared<TensorBoardLoggerImpl>(device_log_path, ""))
+             .first;
+  }
+  it->second->log(desc.tag_, desc.val_, getCurStep(device, desc.kind_));
+}
+
+// Adds one scalar sample, creating the log directory and the event writer on
+// first use.
+void TensorBoardLoggerImpl::log(const std::string& key, double val, uint64_t step) {
+  if (!std::filesystem::exists(log_path_)) {
+    std::error_code ec;
+    std::filesystem::create_directories(log_path_, ec);
+    if (ec) {
+      LOG(ERROR) << "failed to create log dir: " << log_path_ << " errorcode: " << ec.message();
+      return;
+    }
+  }
+
+  if (logger_ == nullptr) {
+    logger_ = std::make_shared<TensorBoardLogger>(log_path_ / log_file_name_);
+  }
+  logger_->add_scalar(key, step, val);
+}
+
+} // namespace dynolog
diff --git a/msmonitor/dynolog_npu/dynolog/src/DynologTensorBoardLogger.h b/msmonitor/dynolog_npu/dynolog/src/DynologTensorBoardLogger.h
new file mode 100644
index 00000000000..4b174dbaf98
--- /dev/null
+++ b/msmonitor/dynolog_npu/dynolog/src/DynologTensorBoardLogger.h
@@ -0,0 +1,113 @@
+#pragma once
+
+#include <cstdint>
+#include <filesystem>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include <gflags/gflags.h>
+
+#include "dynolog/src/Logger.h"
+
+#include "MsMonitorMetrics.h"
+
+#include "tensorboard_logger.h"
+
+DECLARE_string(metric_log_dir);
+
+namespace dynolog {
+
+// Writes scalar samples into one TensorBoard event file under log_path.
+class TensorBoardLoggerImpl {
+public:
+  explicit TensorBoardLoggerImpl(std::string log_path, std::string tag = "")
+      : log_path_(std::move(log_path)), tag_(std::move(tag)) {}
+
+  // Records val under key at the given step.
+  void log(const std::string& key, double val, uint64_t step);
+
+private:
+  std::filesystem::path log_path_;
+  std::string tag_;
+  static const std::string log_file_name_;
+  std::shared_ptr<TensorBoardLogger> logger_; // created on the first log() call
+};
+
+// Owns one TensorBoardLoggerImpl per device and the step counter of every
+// (device, kind) pair. Obtain it through singleton(), which also takes the lock.
+class TensorBoardLoggerManager {
+public:
+  // The manager plus a lock on its mutex that lives as long as the guard.
+  struct LoggingGuard {
+    std::shared_ptr<TensorBoardLoggerManager> manager;
+    std::lock_guard<std::mutex> lock_guard;
+  };
+
+  void log(const MsptiMetricDesc& desc);
+
+  // Sets the root directory under which the device_<id> folders are created.
+  static void logPath(const std::string& cfg_log_path) {
+    log_path_ = cfg_log_path;
+  }
+
+  static LoggingGuard singleton();
+
+  bool isValidMetric(const MsptiMetricDesc& desc);
+
+  uint64_t getCurStep(const std::string& device, const std::string& kind);
+
+private:
+  std::lock_guard<std::mutex> lock() {
+    return std::lock_guard<std::mutex>{mutex_};
+  }
+  static std::shared_ptr<TensorBoardLoggerManager> singleton_();
+
+  std::mutex mutex_;
+  static std::filesystem::path log_path_;
+
+  std::unordered_map<std::string, std::shared_ptr<TensorBoardLoggerImpl>> device_loggers_;
+  std::map<std::pair<std::string, std::string>, std::uint64_t> device_kind2_step_;
+};
+
+// Logger that buffers one record (numeric values plus the deviceId/kind/domain
+// strings) and hands it to TensorBoardLoggerManager on finalize().
+class DynologTensorBoardLogger final : public Logger {
+public:
+  explicit DynologTensorBoardLogger(const std::string& metric_log_dir);
+  void setTimestamp(Timestamp ts) override {}
+
+  void logInt(const std::string& key, int64_t val) override {
+    kvs_[key] = static_cast<double>(val);
+  }
+
+  void logFloat(const std::string& key, float val) override {
+    kvs_[key] = static_cast<double>(val);
+  }
+
+  void logUint(const std::string& key, uint64_t val) override {
+    kvs_[key] = static_cast<double>(val);
+  }
+
+  // Only keys listed in validDynamicMetrics_ are kept; others are ignored.
+  void logStr(const std::string& key, const std::string& val) override {
+    if (validDynamicMetrics_.count(key)) {
+      dynamic_metrics_[key] = val;
+    }
+  }
+
+  void finalize() override;
+
+private:
+  bool validateLogDir(const std::string& path);
+
+  std::unordered_map<std::string, double> kvs_;
+  std::unordered_map<std::string, std::string> dynamic_metrics_;
+  std::string logPath_;
+  std::string hostName_;
+};
+
+} // namespace dynolog
diff --git a/msmonitor/dynolog_npu/dynolog/src/Main.cpp b/msmonitor/dynolog_npu/dynolog/src/Main.cpp
index 758d9db3ed9..693b729bbd2 100644
--- a/msmonitor/dynolog_npu/dynolog/src/Main.cpp
+++ b/msmonitor/dynolog_npu/dynolog/src/Main.cpp
@@ -15,6 +15,7 @@
 #include "dynolog/src/KernelCollector.h"
 #include "dynolog/src/Logger.h"
 #include "dynolog/src/ODSJsonLogger.h"
+#include "dynolog/src/DynologTensorBoardLogger.h"
#include "dynolog/src/PerfMonitor.h" #include "dynolog/src/ScubaLogger.h" #include "dynolog/src/ServiceHandler.h" @@ -81,6 +82,9 @@ std::unique_ptr getLogger(const std::string& scribe_category = "") { if (FLAGS_use_scuba && !scribe_category.empty()) { loggers.push_back(std::make_unique(scribe_category)); } + if (!FLAGS_metric_log_dir.empty()) { + loggers.push_back(std::make_unique(FLAGS_metric_log_dir)); + } return std::make_unique(std::move(loggers)); } diff --git a/msmonitor/dynolog_npu/dynolog/src/Metric.cpp b/msmonitor/dynolog_npu/dynolog/src/Metrics.cpp similarity index 100% rename from msmonitor/dynolog_npu/dynolog/src/Metric.cpp rename to msmonitor/dynolog_npu/dynolog/src/Metrics.cpp diff --git a/msmonitor/dynolog_npu/dynolog/src/MsMonitorMetrics.h b/msmonitor/dynolog_npu/dynolog/src/MsMonitorMetrics.h new file mode 100644 index 00000000000..48334ba6e91 --- /dev/null +++ b/msmonitor/dynolog_npu/dynolog/src/MsMonitorMetrics.h @@ -0,0 +1,33 @@ +#ifndef DYNOLOG_NPU_MSMONITOR_METRICS_H +#define DYNOLOG_NPU_MSMONITOR_METRICS_H + +#include +#include +#include + +namespace dynolog { + +const std::unordered_set validDynamicMetrics_ { + {"deviceId", "kind", "domain"} +}; + +const std::unordered_map> validMetrics_ { + {"Marker", {"duration"}}, + {"Kernel", {"duration"}}, + {"API", {"duration"}}, + {"Hccl", {"duration"}}, + {"Memory", {"duration"}}, + {"MemSet", {"duration"}}, + {"MemCpy", {"duration"}} +}; + +struct MsptiMetricDesc { + std::string device_id_; + std::string kind_; + std::string tag_; + std::string metric_name_; + double val_; +}; +} // namespace dynolog + +#endif \ No newline at end of file diff --git a/msmonitor/dynolog_npu/dynolog/src/tracing/IPCMonitor.cpp b/msmonitor/dynolog_npu/dynolog/src/tracing/IPCMonitor.cpp index 811bae4e0de..ce2b6722e8e 100644 --- a/msmonitor/dynolog_npu/dynolog/src/tracing/IPCMonitor.cpp +++ b/msmonitor/dynolog_npu/dynolog/src/tracing/IPCMonitor.cpp @@ -90,8 +90,14 @@ void IPCMonitor::LogData(const nlohmann::json& result) 
logger_->logUint("timestamp", timestamp); auto duration = result["duration"].get(); logger_->logUint("duration", duration); - auto deviceId = result["deviceId"].get(); - logger_->logUint("deviceId", deviceId); + auto deviceId = result["deviceId"].get(); + logger_->logStr("deviceId", std::to_string(deviceId)); + auto kind = result["kind"].get(); + logger_->logStr("kind", kind); + if (result.contains("domain") && result["domain"].is_string()) { + auto domain = result["domain"].get(); + logger_->logStr("domain", domain); + } logger_->finalize(); } diff --git a/msmonitor/scripts/build.sh b/msmonitor/scripts/build.sh index 52cd5ad4f13..a7aaa726dee 100644 --- a/msmonitor/scripts/build.sh +++ b/msmonitor/scripts/build.sh @@ -1,6 +1,5 @@ #!/bin/bash set -e -export BUILD_PROMETHEUS=1 check_gcc_version() { if ! command -v gcc >/dev/null 2>&1; then @@ -43,7 +42,7 @@ check_rust_version() { update_and_checkout_submodule() { DYNLOG_COMMIT_ID="a9b6aeddcd6363252f5388cb0dd942981a09a24b" - git submodule update --init --recursive + git submodule update --init if [ $? -ne 0 ]; then echo "ERROR: update git submodule failed" return 1 @@ -51,6 +50,11 @@ update_and_checkout_submodule() { cd ./third_party/dynolog git checkout ${DYNLOG_COMMIT_ID} + + git submodule add https://github.com/RustingSword/tensorboard_logger.git ./third_party/tensorboard_logger + git submodule update --init --recursive + git commit -am "Add tensorboard_logger as submodule" + if [ $? -ne 0 ]; then echo "ERROR: switch to dynolog specified commit failed" cd .. 
-- Gitee From a243b345154049b5cc752ebce41ff786a35a9e17 Mon Sep 17 00:00:00 2001 From: Gallium Date: Thu, 5 Jun 2025 11:48:51 +0800 Subject: [PATCH 2/3] delete pro --- msmonitor/scripts/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/msmonitor/scripts/build.sh b/msmonitor/scripts/build.sh index a7aaa726dee..f2203d17345 100644 --- a/msmonitor/scripts/build.sh +++ b/msmonitor/scripts/build.sh @@ -1,5 +1,6 @@ #!/bin/bash set -e +export BUILD_PROMETHEUS=1 check_gcc_version() { if ! command -v gcc >/dev/null 2>&1; then -- Gitee From 8ee6f72fd90fa64a9c896818c04240ee0e4ce7b1 Mon Sep 17 00:00:00 2001 From: Gallium Date: Thu, 5 Jun 2025 11:49:22 +0800 Subject: [PATCH 3/3] pro --- msmonitor/scripts/build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/msmonitor/scripts/build.sh b/msmonitor/scripts/build.sh index f2203d17345..a7aaa726dee 100644 --- a/msmonitor/scripts/build.sh +++ b/msmonitor/scripts/build.sh @@ -1,6 +1,5 @@ #!/bin/bash set -e -export BUILD_PROMETHEUS=1 check_gcc_version() { if ! command -v gcc >/dev/null 2>&1; then -- Gitee