diff --git a/systrace/CMakeLists.txt b/systrace/CMakeLists.txt
index 4856e4521330306949be0053c0c38c80745478c1..9d90576b805d8fd45c2479771b747da2c8b38dbc 100644
--- a/systrace/CMakeLists.txt
+++ b/systrace/CMakeLists.txt
@@ -46,6 +46,7 @@ target_include_directories(common PUBLIC ${PROJECT_SOURCE_DIR}/include ${Python3
 add_subdirectory(protos)
 
 add_library(sysTrace_hook SHARED
+    ${PROJECT_SOURCE_DIR}/include/common/shared_constants.c
     ${PROJECT_SOURCE_DIR}/src/trace/systrace_manager.cc
     ${PROJECT_SOURCE_DIR}/src/trace/library_loader.cc
     ${PROJECT_SOURCE_DIR}/src/trace/python/pytorch_tracing_loader.cc
@@ -54,8 +55,12 @@ add_library(sysTrace_hook SHARED
     ${PROJECT_SOURCE_DIR}/src/ascend/hook.cc
     ${PROJECT_SOURCE_DIR}/src/mspti/mspti_tracker.cpp
     ${PROJECT_SOURCE_DIR}/src/cann/cann_hook.c
+    ${PROJECT_SOURCE_DIR}/server/monitor_server.cpp
 )
 
+add_executable(sysTrace_cli
+    ${PROJECT_SOURCE_DIR}/client/sysTracecli.cpp
+)
 set_target_properties(sysTrace_hook PROPERTIES OUTPUT_NAME "sysTrace")
 
 target_link_libraries(sysTrace_hook
diff --git a/systrace/client/sysTracecli.cpp b/systrace/client/sysTracecli.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..331b1875cbda7086f5c55d14526a564ad82ed8f2
--- /dev/null
+++ b/systrace/client/sysTracecli.cpp
@@ -0,0 +1,100 @@
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#define SOCKET_PATH "/tmp/sysTrace_socket"
+
+// Prints the command-line usage summary to stdout.
+void print_help() {
+    std::cout << "Usage: sysTrace_cli <command> [args]\n"
+              << "Commands:\n"
+              << "  set <level>=<true|false>    - Enable/disable dump level\n"
+              << "                                (levels: L0, L1, L2)\n"
+              << "  interval <level>=<minutes>  - Set dump interval in minutes\n"
+              << "                                (levels: L1, L2)\n"
+              << "  print [level|all]           - Print current settings\n"
+              << "                                (levels: L0, L1, L2, all)\n\n"
+              << "Examples:\n"
+              << "  sysTrace_cli set L1=true\n"
+              << "  sysTrace_cli interval L1=10\n"
+              << "  sysTrace_cli print all\n";
+}
+
+// Sends a single command, built from argv, to the monitor server listening on
+// SOCKET_PATH and echoes the server's reply to stdout.
+// Returns 0 on success (or after printing help) and 1 on any failure.
+int main(int argc, char *argv[])
+{
+    if (argc < 2 || std::string(argv[1]) == "help")
+    {
+        print_help();
+        return 0;
+    }
+
+    // Validate the command line before touching the socket so a malformed
+    // invocation is reported as such even when no server is running.
+    const std::string action = argv[1];
+    std::string command;
+    if (action == "print")
+    {
+        command = "print";
+        if (argc >= 3)
+            command += std::string(" ") + argv[2];
+    }
+    else if (argc >= 3)
+    {
+        command = action + " " + argv[2];
+    }
+    else
+    {
+        std::cerr << "Invalid command format\n";
+        return 1;
+    }
+
+    int sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (sockfd == -1)
+    {
+        perror("socket");
+        return 1;
+    }
+
+    struct sockaddr_un addr;
+    memset(&addr, 0, sizeof(addr));
+    addr.sun_family = AF_UNIX;
+    strncpy(addr.sun_path, SOCKET_PATH, sizeof(addr.sun_path) - 1);
+
+    if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) == -1)
+    {
+        perror("connect");
+        close(sockfd);
+        return 1;
+    }
+
+    if (write(sockfd, command.c_str(), command.size()) == -1)
+    {
+        perror("write");
+        close(sockfd);
+        return 1;
+    }
+
+    char buffer[1024];
+    ssize_t n = read(sockfd, buffer, sizeof(buffer) - 1);
+    if (n == -1)
+    {
+        perror("read");
+        close(sockfd);
+        return 1;
+    }
+    if (n > 0)
+    {
+        buffer[n] = '\0';
+        std::cout << buffer;
+    }
+
+    close(sockfd);
+    return 0;
+}
\ No newline at end of file
diff --git a/systrace/include/common/shared_constants.c b/systrace/include/common/shared_constants.c
new file mode 100644
index 0000000000000000000000000000000000000000..74020895c97574eba26c4797e67fa6f1f78c8c76
--- /dev/null
+++ b/systrace/include/common/shared_constants.c
@@ -0,0 +1,251 @@
+#include "shared_constants.h"
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+static SharedData *shared_data = NULL;
+static int shm_fd = -1;
+
+/* Unmaps the segment and closes its descriptor, resetting both handles so
+ * that a later get_shared_data() call starts from scratch. */
+static void release_mapping(void)
+{
+    if (shared_data != NULL)
+    {
+        munmap(shared_data, sizeof(SharedData));
+        shared_data = NULL;
+    }
+
+    if (shm_fd != -1)
+    {
+        close(shm_fd);
+        shm_fd = -1;
+    }
+}
+
+/* Initialises the process-shared mutex stored inside the mapped segment.
+ * pthread functions report failure through their return value, not errno,
+ * so messages are built with strerror() instead of perror().
+ * Returns 0 on success, -1 on failure. */
+static int init_shared_mutex(void)
+{
+    pthread_mutexattr_t mutex_attr;
+    int rc = pthread_mutexattr_init(&mutex_attr);
+    if (rc != 0)
+    {
+        fprintf(stderr, "pthread_mutexattr_init failed: %s\n", strerror(rc));
+        return -1;
+    }
+
+    const char *step = "pthread_mutexattr_setpshared";
+    rc = pthread_mutexattr_setpshared(&mutex_attr, PTHREAD_PROCESS_SHARED);
+    if (rc == 0)
+    {
+        step = "pthread_mutex_init";
+        rc = pthread_mutex_init(&shared_data->g_trace_mutex, &mutex_attr);
+    }
+    pthread_mutexattr_destroy(&mutex_attr);
+
+    if (rc != 0)
+    {
+        fprintf(stderr, "%s failed: %s\n", step, strerror(rc));
+        return -1;
+    }
+    return 0;
+}
+
+/* Creates (or opens) the SHM_NAME segment, maps it, initialises its mutex and
+ * resets every setting to its default: L0 enabled, L1/L2 disabled, both
+ * intervals 5 (minutes), timers stopped, no dump recorded or pending.
+ * A call made while the segment is already mapped does nothing.
+ * Returns 0 on success, -1 on failure (nothing is left mapped or open). */
+int init_shared_memory(void)
+{
+    if (shared_data != NULL)
+    {
+        return 0;
+    }
+
+    shm_fd = shm_open(SHM_NAME, O_CREAT | O_RDWR, 0666);
+    if (shm_fd == -1)
+    {
+        perror("shm_open failed");
+        return -1;
+    }
+
+    if (ftruncate(shm_fd, sizeof(SharedData)) == -1)
+    {
+        perror("ftruncate failed");
+        release_mapping();
+        return -1;
+    }
+
+    /* Keep MAP_FAILED out of shared_data: it is non-NULL, so storing it would
+     * make get_shared_data() hand out an invalid pointer afterwards. */
+    void *mapping = mmap(NULL, sizeof(SharedData), PROT_READ | PROT_WRITE,
+                         MAP_SHARED, shm_fd, 0);
+    if (mapping == MAP_FAILED)
+    {
+        perror("mmap failed");
+        release_mapping();
+        return -1;
+    }
+    shared_data = mapping;
+
+    if (init_shared_mutex() != 0)
+    {
+        release_mapping();
+        return -1;
+    }
+
+    shared_data->g_dump_L0 = true;
+    shared_data->g_dump_L1 = false;
+    shared_data->g_dump_L2 = false;
+    shared_data->g_dump_L1_interval = 5;
+    shared_data->g_dump_L2_interval = 5;
+    shared_data->g_L1_timer_active = false;
+    shared_data->g_L2_timer_active = false;
+    shared_data->g_L1_start_time = 0;
+    shared_data->g_L2_start_time = 0;
+    shared_data->dumped_L1 = false;
+    shared_data->dumped_L2 = false;
+    shared_data->need_dump_L1_once = false;
+    shared_data->need_dump_L2_once = false;
+    return 0;
+}
+
+/* Returns the mapped settings block, initialising it on first use.
+ * Returns NULL when the shared memory cannot be set up. */
+SharedData *get_shared_data(void)
+{
+    if (shared_data == NULL && init_shared_memory() != 0)
+    {
+        return NULL;
+    }
+    return shared_data;
+}
+
+/* Unmaps and closes the segment, then removes its name with shm_unlink(). */
+void cleanup_shared_memory(void)
+{
+    release_mapping();
+    shm_unlink(SHM_NAME);
+}
+
+/* Advances the dump timer of trace level 1 (L1) or 2 (L2).
+ *
+ * - Dump enabled, no timer running: starts the timer and returns true.
+ * - Timer running, interval (minutes) not yet elapsed: returns true.
+ * - Timer running, interval elapsed: clears the dump flag, stops the timer
+ *   and, if this level has not been dumped yet, sets its need_dump_*_once
+ *   flag; returns false.
+ * - Otherwise (unknown level, dump disabled, no shared memory): returns
+ *   false. */
+bool checkAndUpdateTimer(int level)
+{
+    SharedData *data = get_shared_data();
+    if (!data)
+    {
+        return false;
+    }
+
+    pthread_mutex_lock(&data->g_trace_mutex);
+
+    bool *dump_flag = NULL;
+    unsigned int *interval = NULL;
+    bool *timer_active = NULL;
+    time_t *start_time = NULL;
+    bool *dumped = NULL;
+    bool *need_dump_once = NULL;
+
+    switch (level)
+    {
+    case 1: /* L1 */
+        dump_flag = &data->g_dump_L1;
+        interval = &data->g_dump_L1_interval;
+        timer_active = &data->g_L1_timer_active;
+        start_time = &data->g_L1_start_time;
+        dumped = &data->dumped_L1;
+        need_dump_once = &data->need_dump_L1_once;
+        break;
+    case 2: /* L2 */
+        dump_flag = &data->g_dump_L2;
+        interval = &data->g_dump_L2_interval;
+        timer_active = &data->g_L2_timer_active;
+        start_time = &data->g_L2_start_time;
+        dumped = &data->dumped_L2;
+        need_dump_once = &data->need_dump_L2_once;
+        break;
+    default:
+        pthread_mutex_unlock(&data->g_trace_mutex);
+        return false;
+    }
+
+    bool result = false;
+
+    if (*dump_flag && !*timer_active)
+    {
+        *start_time = time(NULL);
+        *timer_active = true;
+        result = true;
+    }
+    else if (*timer_active)
+    {
+        double elapsed_minutes = difftime(time(NULL), *start_time) / 60;
+
+        if (elapsed_minutes >= *interval)
+        {
+            *dump_flag = false;
+            *timer_active = false;
+            /* Test the flag, not the pointer: the pointer is never NULL
+             * here, so checking it could never request the final dump. */
+            if (!*dumped)
+            {
+                *need_dump_once = true;
+            }
+        }
+        else
+        {
+            result = true;
+        }
+    }
+
+    pthread_mutex_unlock(&data->g_trace_mutex);
+
+    return result;
+}
+
+/* Reports whether a one-off L1 dump is pending; false without shared memory. */
+bool need_dump_L1_once(void)
+{
+    SharedData *data = get_shared_data();
+    if (!data)
+    {
+        return false;
+    }
+
+    pthread_mutex_lock(&data->g_trace_mutex);
+    bool result = data->need_dump_L1_once;
+    pthread_mutex_unlock(&data->g_trace_mutex);
+
+    return result;
+}
+
+/* Reports whether a one-off L2 dump is pending; false without shared memory. */
+bool need_dump_L2_once(void)
+{
+    SharedData *data = get_shared_data();
+    if (!data)
+    {
+        return false;
+    }
+
+    pthread_mutex_lock(&data->g_trace_mutex);
+    bool result = data->need_dump_L2_once;
+    pthread_mutex_unlock(&data->g_trace_mutex);
+
+    return result;
+}
diff --git a/systrace/include/common/shared_constants.h b/systrace/include/common/shared_constants.h
index d4408e4c5467ad41c68169dc3c41092d2662d618..831e33436293a3976e3d15250e4c9b1c8c5deca7 100644
--- a/systrace/include/common/shared_constants.h
+++ b/systrace/include/common/shared_constants.h
@@ -1,12 +1,52 @@
+#ifndef SHARED_CONSTANTS_H
+#define SHARED_CONSTANTS_H
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <time.h>
+
 #ifdef __cplusplus
 extern "C"
 {
 #endif
+
+#define SHM_NAME "/sysTrace_shared_mem"
+#define SYS_TRACE_ROOT_DIR "/home/sysTrace/"
 
     extern int global_stage_id;
     extern int global_stage_type;
-#define SYS_TRACE_ROOT_DIR "/home/sysTrace/"
+
+    /* Dump settings and timer state kept in the SHM_NAME shared-memory
+     * segment. g_trace_mutex guards the other fields. */
+    typedef struct
+    {
+        bool g_dump_L0;
+        bool g_dump_L1;
+        bool g_dump_L2;
+        unsigned int g_dump_L1_interval; // minutes
+        unsigned int g_dump_L2_interval; // minutes
+        bool g_L1_timer_active;
+        bool g_L2_timer_active;
+        bool dumped_L1;         // Indicates if L1 has been dumped
+        bool dumped_L2;         // Indicates if L2 has been dumped
+        bool need_dump_L1_once; // Indicates if L1 dump is needed once
+        bool need_dump_L2_once; // Indicates if L2 dump is needed once
+        time_t g_L1_start_time;
+        time_t g_L2_start_time;
+        pthread_mutex_t g_trace_mutex;
+    } SharedData;
+
+    int init_shared_memory(void);
+
+    SharedData *get_shared_data(void);
+
+    void cleanup_shared_memory(void);
+    bool checkAndUpdateTimer(int level);
+    bool need_dump_L1_once(void);
+    bool need_dump_L2_once(void);
 
 #ifdef __cplusplus
 }
-#endif
\ No newline at end of file
+#endif
+
+#endif // SHARED_CONSTANTS_H
\ No newline at end of file
diff --git a/systrace/include/common/util.cc b/systrace/include/common/util.cc
index c57827c927c8586353ae62c0fe5f842b070cc26e..4c2a6e0eedacffa41cbe69a9bddf8ec19610fd73 100644
--- a/systrace/include/common/util.cc
+++ b/systrace/include/common/util.cc
@@ -279,6 +279,5 @@ std::string GetPrimaryIP()
     freeifaddrs(ifaddr);
     return primaryIP;
 }
-
 } // namespace util
 } // namespace systrace
\ No newline at end of file
diff --git a/systrace/include/common/util.h b/systrace/include/common/util.h
index 50c266c215adba90d502c1d8a319626b886b2dca..ea166d535907e1c5cce42efc9c8dd0e603cabac4 100644
--- a/systrace/include/common/util.h
+++ b/systrace/include/common/util.h
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 
 namespace systrace
 {
diff --git a/systrace/server/monitor_server.cpp b/systrace/server/monitor_server.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2df6ab5a9cc9047967a25c5feffc4646a893f277
--- /dev/null
+++ b/systrace/server/monitor_server.cpp
@@ -0,0 +1,353 @@
+#include "monitor_server.hpp"
+#include "../include/common/shared_constants.h"
+#include <cctype>
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+namespace
+{
+// Parses a decimal interval that fits in unsigned int. Signs, trailing
+// characters, empty input and out-of-range values are all rejected.
+bool parse_interval(const std::string &text, unsigned int &out)
+{
+    if (text.empty())
+    {
+        return false;
+    }
+    for (const unsigned char c : text)
+    {
+        if (!std::isdigit(c))
+        {
+            return false;
+        }
+    }
+
+    try
+    {
+        const unsigned long parsed = std::stoul(text);
+        if (parsed > std::numeric_limits<unsigned int>::max())
+        {
+            return false;
+        }
+        out = static_cast<unsigned int>(parsed);
+        return true;
+    }
+    catch (const std::out_of_range &)
+    {
+        return false;
+    }
+}
+} // namespace
+
+MonitorServer *MonitorServer::instance_ = nullptr;
+
+// Returns the process-wide server, creating and starting it on first use.
+// Throws std::runtime_error when the shared memory or the control socket
+// cannot be set up.
+MonitorServer &MonitorServer::getInstance()
+{
+    std::call_once(init_flag_,
+                   []()
+                   {
+                       instance_ = new MonitorServer();
+                       instance_->start();
+                   });
+    return *instance_;
+}
+
+MonitorServer::MonitorServer()
+{
+    if (init_shared_memory() != 0)
+    {
+        throw std::runtime_error("Failed to initialize shared memory");
+    }
+}
+
+MonitorServer::~MonitorServer()
+{
+    stop();
+    cleanup_shared_memory();
+}
+
+// Binds the UNIX-domain control socket at SOCKET_PATH and launches the
+// accept loop on a background thread. Does nothing if already started.
+void MonitorServer::start()
+{
+    if (server_fd_ != -1)
+    {
+        return;
+    }
+
+    server_fd_ = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (server_fd_ == -1)
+    {
+        perror("socket");
+        throw std::runtime_error("Failed to create socket");
+    }
+    struct sockaddr_un addr;
+    memset(&addr, 0, sizeof(addr));
+    addr.sun_family = AF_UNIX;
+    strncpy(addr.sun_path, SOCKET_PATH, sizeof(addr.sun_path) - 1);
+
+    unlink(SOCKET_PATH);
+
+    if (bind(server_fd_, (struct sockaddr *)&addr, sizeof(addr)) == -1)
+    {
+        perror("bind");
+        close(server_fd_);
+        server_fd_ = -1;
+        throw std::runtime_error("Failed to bind socket");
+    }
+
+    if (listen(server_fd_, 5) == -1)
+    {
+        perror("listen");
+        close(server_fd_);
+        server_fd_ = -1;
+        throw std::runtime_error("Failed to listen on socket");
+    }
+
+    server_thread_ = std::thread(&MonitorServer::server_thread_func, this);
+}
+
+// Stops the accept loop, joins the server thread and removes the socket file.
+void MonitorServer::stop()
+{
+    if (server_fd_ != -1)
+    {
+        // Wake the accept() blocked in the server thread; merely closing
+        // the descriptor from this thread is not guaranteed to do that.
+        shutdown(server_fd_, SHUT_RDWR);
+    }
+
+    if (server_thread_.joinable())
+    {
+        server_thread_.join();
+    }
+
+    // Close only after the thread has exited, so it can never call accept()
+    // on a descriptor number that has been recycled.
+    if (server_fd_ != -1)
+    {
+        close(server_fd_);
+        server_fd_ = -1;
+    }
+
+    unlink(SOCKET_PATH);
+}
+
+// Accept loop: serves clients one at a time until accept() fails.
+void MonitorServer::server_thread_func()
+{
+    while (true)
+    {
+        struct sockaddr_un client_addr;
+        socklen_t client_len = sizeof(client_addr);
+
+        int client_fd =
+            accept(server_fd_, (struct sockaddr *)&client_addr, &client_len);
+        if (client_fd == -1)
+        {
+            if (errno == EINTR)
+            {
+                continue; // a signal interrupted the call; keep serving
+            }
+            break;
+        }
+
+        handle_client(client_fd);
+    }
+}
+
+// Reads one command from the client, answers it and closes the connection.
+void MonitorServer::handle_client(int client_fd)
+{
+    char buffer[1024];
+    ssize_t n = read(client_fd, buffer, sizeof(buffer) - 1);
+    if (n > 0)
+    {
+        buffer[n] = '\0';
+        std::string command(buffer);
+
+        // MSG_NOSIGNAL: a client that already hung up must produce an error
+        // return here, not a SIGPIPE delivered to the traced process.
+        auto send_response = [client_fd](const std::string &response)
+        { send(client_fd, response.c_str(), response.size(), MSG_NOSIGNAL); };
+
+        process_command(command, send_response);
+    }
+
+    // The descriptor is owned by this call; without the close every client
+    // connection would leak one descriptor.
+    close(client_fd);
+}
+
+// Executes one textual command against the shared settings:
+//   set <level>=<true|false>    (levels L0, L1, L2)
+//   interval <level>=<minutes>  (levels L1, L2)
+//   print [L0|L1|L2|all]
+// The outcome ("OK", the requested values, or an "Error: ..." line) is
+// passed to send_response.
+void MonitorServer::process_command(
+    const std::string &cmd,
+    const std::function<void(const std::string &)> &send_response)
+{
+    SharedData *shared_data = get_shared_data();
+    if (!shared_data)
+    {
+        send_response("Error: Shared memory not initialized\n");
+        return;
+    }
+
+    std::istringstream iss(cmd);
+    std::string action;
+    iss >> action;
+
+    if (action == "set")
+    {
+        std::string level_value;
+        iss >> level_value;
+
+        size_t eq_pos = level_value.find('=');
+        if (eq_pos == std::string::npos)
+        {
+            send_response("Error: Invalid set command format. Use 'set "
+                          "<level>=<true|false>'\n");
+            return;
+        }
+
+        std::string level = level_value.substr(0, eq_pos);
+        std::string value_str = level_value.substr(eq_pos + 1);
+        if (value_str != "true" && value_str != "false")
+        {
+            // Anything else used to be silently treated as "false".
+            send_response("Error: Invalid value '" + value_str +
+                          "'. Use true or false\n");
+            return;
+        }
+        bool value = (value_str == "true");
+
+        pthread_mutex_lock(&shared_data->g_trace_mutex);
+
+        if (level == "L0")
+        {
+            shared_data->g_dump_L0 = value;
+        }
+        else if (level == "L1")
+        {
+            shared_data->g_dump_L1 = value;
+        }
+        else if (level == "L2")
+        {
+            shared_data->g_dump_L2 = value;
+        }
+        else
+        {
+            pthread_mutex_unlock(&shared_data->g_trace_mutex);
+            send_response("Error: Unknown level '" + level + "'\n");
+            return;
+        }
+
+        pthread_mutex_unlock(&shared_data->g_trace_mutex);
+        send_response("OK\n");
+    }
+    else if (action == "interval")
+    {
+        std::string level_value;
+        iss >> level_value;
+
+        size_t eq_pos = level_value.find('=');
+        if (eq_pos == std::string::npos)
+        {
+            send_response("Error: Invalid interval command format. Use "
+                          "'interval <level>=<minutes>'\n");
+            return;
+        }
+
+        std::string level = level_value.substr(0, eq_pos);
+        unsigned int value = 0;
+        if (!parse_interval(level_value.substr(eq_pos + 1), value))
+        {
+            send_response("Error: Invalid interval value\n");
+            return;
+        }
+
+        pthread_mutex_lock(&shared_data->g_trace_mutex);
+
+        if (level == "L1")
+        {
+            shared_data->g_dump_L1_interval = value;
+        }
+        else if (level == "L2")
+        {
+            shared_data->g_dump_L2_interval = value;
+        }
+        else
+        {
+            pthread_mutex_unlock(&shared_data->g_trace_mutex);
+            send_response("Error: Unknown level '" + level + "'\n");
+            return;
+        }
+
+        pthread_mutex_unlock(&shared_data->g_trace_mutex);
+        send_response("OK\n");
+    }
+    else if (action == "print")
+    {
+        std::string level;
+        iss >> level;
+
+        std::ostringstream oss;
+
+        pthread_mutex_lock(&shared_data->g_trace_mutex);
+
+        if (level.empty() || level == "all")
+        {
+            oss << "Current settings:\n"
+                << " L0: " << (shared_data->g_dump_L0 ? "true" : "false")
+                << "\n"
+                << " L1: " << (shared_data->g_dump_L1 ? "true" : "false")
+                << "\n"
+                << " L2: " << (shared_data->g_dump_L2 ? "true" : "false")
+                << "\n"
+                << " L1_interval: " << shared_data->g_dump_L1_interval << "\n"
+                << " L2_interval: " << shared_data->g_dump_L2_interval << "\n";
+        }
+        else if (level == "L0")
+        {
+            oss << "L0: " << (shared_data->g_dump_L0 ? "true" : "false")
+                << "\n";
+        }
+        else if (level == "L1")
+        {
+            oss << "L1: " << (shared_data->g_dump_L1 ? "true" : "false") << "\n"
+                << "L1_interval: " << shared_data->g_dump_L1_interval << "\n";
+        }
+        else if (level == "L2")
+        {
+            oss << "L2: " << (shared_data->g_dump_L2 ? "true" : "false") << "\n"
+                << "L2_interval: " << shared_data->g_dump_L2_interval << "\n";
+        }
+        else
+        {
+            pthread_mutex_unlock(&shared_data->g_trace_mutex);
+            send_response("Error: Unknown level '" + level + "'\n");
+            return;
+        }
+
+        pthread_mutex_unlock(&shared_data->g_trace_mutex);
+        send_response(oss.str());
+    }
+    else
+    {
+        send_response("Error: Unknown command '" + action + "'\n");
+    }
+}
\ No newline at end of file
diff --git a/systrace/server/monitor_server.hpp b/systrace/server/monitor_server.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5af92bde5328f60a4a6d3cbb9d42b5676814f39f
--- /dev/null
+++ b/systrace/server/monitor_server.hpp
@@ -0,0 +1,42 @@
+#ifndef SYS_TRACE_SERVER_HPP
+#define SYS_TRACE_SERVER_HPP
+
+#include "../include/common/shared_constants.h"
+#include <functional>
+#include <mutex>
+#include <string>
+#include <thread>
+
+// Singleton that listens on a UNIX-domain socket and applies the "set",
+// "interval" and "print" commands it receives to the shared trace settings.
+class MonitorServer
+{
+  public:
+    static MonitorServer &getInstance();
+
+    void start();
+    void stop();
+
+    MonitorServer(const MonitorServer &) = delete;
+    MonitorServer &operator=(const MonitorServer &) = delete;
+
+  private:
+    MonitorServer();
+    ~MonitorServer();
+
+    void server_thread_func();
+    void handle_client(int client_fd);
+    void process_command(
+        const std::string &cmd,
+        const std::function<void(const std::string &)> &send_response);
+
+    static constexpr const char *SOCKET_PATH = "/tmp/sysTrace_socket";
+
+    int server_fd_{-1};
+    std::thread server_thread_;
+
+    static MonitorServer *instance_;
+    inline static std::once_flag init_flag_;
+};
+
+#endif // SYS_TRACE_SERVER_HPP
\ No newline at end of file
diff --git a/systrace/src/cann/cann_hook.c b/systrace/src/cann/cann_hook.c
index a1bd27d6f7756c9754df4e24e1f6ba384be2cd21..754c1b2f7db4dcd17f16447547455de35171ccbb 100644
--- a/systrace/src/cann/cann_hook.c
+++ b/systrace/src/cann/cann_hook.c
@@ -226,6 +226,10 @@ static char is_ready_to_write(ThreadData *td, time_t *current)
 
 static void write_protobuf_to_file()
 {
+    if (!checkAndUpdateTimer(2))
+    {
+        return;
+    }
     time_t current;
     uint8_t *buf;
     ThreadData *td = get_thread_data();
diff --git a/systrace/src/mspti/json_file_writer.h b/systrace/src/mspti/json_file_writer.h
index de298c3016c22d58525782e61756ab3f0e63632c..7de211254d95b963b077ae870f23cd8059f4b2f3 100644
--- a/systrace/src/mspti/json_file_writer.h
+++ b/systrace/src/mspti/json_file_writer.h
@@ -136,6 +136,11 @@ public:
     }
 
     void hcclActivityFormatToCSV() {
+        // Write only while the L1 dump timer allows it, or when one final
+        // dump has been requested after the interval expired.
+        if (!checkAndUpdateTimer(1) && !need_dump_L1_once()) {
+            return;
+        }
         std::lock_guard<std::mutex> lock(this->buffermtx);
         if (this->file.is_open()) {
             // enumerate the buffer and write to file
@@ -154,6 +159,17 @@ public:
                 }
             }
             this->markerActivityBuffer->clear();
+            SharedData *shared_data = get_shared_data();
+            if (!shared_data)
+            {
+                return;
+            }
+            // checkAndUpdateTimer() touches these flags under g_trace_mutex,
+            // so they are updated under the same lock here.
+            pthread_mutex_lock(&shared_data->g_trace_mutex);
+            shared_data->dumped_L1 = true;
+            shared_data->need_dump_L1_once = false;
+            pthread_mutex_unlock(&shared_data->g_trace_mutex);
         } else {
             std::cout << "File is not open" << std::endl;
         }
diff --git a/systrace/src/trace/systrace_manager.cc b/systrace/src/trace/systrace_manager.cc
index eef5606365b7e98465a732612ed575a82b767897..9b1606c22fa58aa9dec361213f4bb177723b7e98 100644
--- a/systrace/src/trace/systrace_manager.cc
+++ b/systrace/src/trace/systrace_manager.cc
@@ -67,7 +67,20 @@ void PyTorchTrace::registerTracingFunctions()
     }
 }
 
-bool PyTorchTrace::triggerTrace() { return has_trigger_trace_.exchange(true); }
+bool PyTorchTrace::triggerTrace()
+{
+    SharedData *shared_data = get_shared_data();
+    if (!shared_data) {
+        return false;
+    }
+    const bool already_triggered = has_trigger_trace_.exchange(true);
+    // g_dump_L0 is written by the monitor server under g_trace_mutex, so it
+    // is read under the same lock.
+    pthread_mutex_lock(&shared_data->g_trace_mutex);
+    const bool l0_enabled = shared_data->g_dump_L0;
+    pthread_mutex_unlock(&shared_data->g_trace_mutex);
+    return already_triggered && l0_enabled;
+}
 
 void PyTorchTrace::dumpPyTorchTracing()
 {
@@ -195,6 +208,7 @@ void SysTrace::initializeSystem()
         return;
 
     systrace::util::InitializeSystemUtilities();
+    MonitorServer::getInstance();
     MSPTITracker::getInstance();
     PyTorchTrace::getInstance();
 
diff --git a/systrace/src/trace/systrace_manager.h b/systrace/src/trace/systrace_manager.h
index c043aba0b1985269e8eebc963d77e98b0403f06c..46ee35990dc20fdb25d44a933de58a9fbc1add64 100644
--- a/systrace/src/trace/systrace_manager.h
+++ b/systrace/src/trace/systrace_manager.h
@@ -7,8 +7,10 @@
 
 #include "../../include/common/logging.h"
 #include "../../include/common/util.h"
+#include "../../include/common/shared_constants.h"
 #include "../../protos/systrace.pb.h"
 #include "../mspti/mspti_tracker.hpp"
+#include "../../server/monitor_server.hpp"
 #include "library_loader.h"
 #include "python/pytorch_tracing_loader.h"