From 132d570da27278b5e49746c7e77dc9167fd74e63 Mon Sep 17 00:00:00 2001 From: weli-l <1289113577@qq.com> Date: Mon, 14 Jul 2025 16:10:36 +0800 Subject: [PATCH] add io logic Signed-off-by: weli-l <1289113577@qq.com> --- systrace/CMakeLists.txt | 4 +- systrace/convert/convert_io_to_timeline.py | 75 +++ systrace/protos/systrace.v3.proto | 123 +++- systrace/src/ascend/hook.h | 1 + systrace/src/cann/common_hook.c | 66 +++ systrace/src/cann/common_hook.h | 36 ++ systrace/src/cann/io_hook.c | 533 ++++++++++++++++++ systrace/src/cann/{cann_hook.c => mem_hook.c} | 67 +-- systrace/src/os/os_probe.c | 6 +- 9 files changed, 816 insertions(+), 95 deletions(-) create mode 100644 systrace/convert/convert_io_to_timeline.py create mode 100644 systrace/src/cann/common_hook.c create mode 100644 systrace/src/cann/common_hook.h create mode 100644 systrace/src/cann/io_hook.c rename systrace/src/cann/{cann_hook.c => mem_hook.c} (85%) diff --git a/systrace/CMakeLists.txt b/systrace/CMakeLists.txt index dfa2e04..05d67f2 100644 --- a/systrace/CMakeLists.txt +++ b/systrace/CMakeLists.txt @@ -55,7 +55,9 @@ add_library(sysTrace_hook SHARED ${PROJECT_SOURCE_DIR}/src/trace/python/pytorch_tracing.c ${PROJECT_SOURCE_DIR}/src/ascend/hook.cc ${PROJECT_SOURCE_DIR}/src/mspti/mspti_tracker.cpp - ${PROJECT_SOURCE_DIR}/src/cann/cann_hook.c + ${PROJECT_SOURCE_DIR}/src/cann/common_hook.c + ${PROJECT_SOURCE_DIR}/src/cann/mem_hook.c + ${PROJECT_SOURCE_DIR}/src/cann/io_hook.c ${PROJECT_SOURCE_DIR}/server/monitor_server.cpp ) diff --git a/systrace/convert/convert_io_to_timeline.py b/systrace/convert/convert_io_to_timeline.py new file mode 100644 index 0000000..43f143c --- /dev/null +++ b/systrace/convert/convert_io_to_timeline.py @@ -0,0 +1,75 @@ +import json +import systrace_pb2 +import argparse +import glob +import os + + +def decode_filename(filename_bytes): + try: + return filename_bytes.decode('utf-8') + except UnicodeDecodeError: + return filename_bytes.decode('utf-8', errors='backslashreplace') + + +def 
process_io_file(input_path, trace_data): + with open(input_path, "rb") as f: + io_data = systrace_pb2.IO() + io_data.ParseFromString(f.read()) + + for entry in io_data.IO_entries: + filename = decode_filename(entry.file_name) + if 'socket' in filename.lower(): + continue + + io_type_str = systrace_pb2.IOType.Name(entry.io_type) + stage_type_str = systrace_pb2.StageType.Name(entry.stage_type) + + name = f"{stage_type_str}::{io_type_str}" + tid = f"{entry.rank}:{filename}" + + trace_data["traceEvents"].append({ + "name": name, + "cat": "io", + "ph": "X", # Complete event + "pid": entry.rank, + "tid": io_type_str, + "ts": entry.start_us, + "dur": entry.dur, + "args": { + "stage_id": entry.stage_id, + "file_name": filename, + "stage_type": stage_type_str, + "io_type": io_type_str + } + }) + + +def aggregate_io_files(output_path): + trace_data = { + "traceEvents": [], + "displayTimeUnit": "us", + "metadata": {"format": "IO Profiler"} + } + + pb_files = glob.glob("*.pb") + print(f"Found {len(pb_files)} .pb files to process") + + for pb_file in pb_files: + print(f"Processing {pb_file}") + process_io_file(pb_file, trace_data) + + trace_data["traceEvents"].sort(key=lambda x: x["ts"]) + + with open(output_path, "w") as f: + json.dump(trace_data, f, indent=None, separators=(',', ':')) + + print(f"Wrote {len(trace_data['traceEvents'])} events to {output_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Aggregate all *.pb files into a Chrome Trace JSON') + parser.add_argument('--output', required=True, help='Output JSON file path') + args = parser.parse_args() + + aggregate_io_files(args.output) \ No newline at end of file diff --git a/systrace/protos/systrace.v3.proto b/systrace/protos/systrace.v3.proto index 426e1aa..624e42f 100644 --- a/systrace/protos/systrace.v3.proto +++ b/systrace/protos/systrace.v3.proto @@ -1,8 +1,25 @@ syntax = "proto3"; +/***************************** + * Common Types * + *****************************/ + +enum 
StageType { + STAGE_UNKNOWN = 0; + STAGE_DATALOADER = 1; + STAGE_FORWARD = 2; + STAGE_BACKWARD = 3; + STAGE_SYNCHRONIZATION = 4; + STAGE_GC = 5; +} + +/***************************** + * Memory Tracing * + *****************************/ + message StackFrame { - uint64 address = 1; - string so_name = 2; + uint64 address = 1; + string so_name = 2; } message MemAllocEntry { @@ -25,15 +42,73 @@ message ProcMem { repeated MemFreeEntry mem_free_stacks = 3; } -enum StageType { - STAGE_UNKNOWN = 0; - STAGE_DATALOADER = 1; - STAGE_FORWARD = 2; - STAGE_BACKWARD = 3; - STAGE_SYNCHRONIZATION = 4; - STAGE_GC = 5; +message Mem { + repeated ProcMem proc_mem = 1; +} + +/***************************** + * I/O Tracing * + *****************************/ + +enum IOType { + IO_UNKNOWN = 0; + IO_READ = 1; + IO_WRITE = 2; + IO_FREAD = 3; + IO_FWRITE = 4; + IO_FOPEN = 5; + IO_FCLOSE = 6; + IO_FFLUSH = 7; + IO_REMOVE = 8; + IO_RENAME = 9; + IO_OPEN = 10; + IO_CLOSE = 11; + IO_FSYNC = 12; + IO_MKDIR = 13; + IO_RMDIR = 14; + IO_UNLINK = 15; + IO_OPENDIR = 16; + IO_CLOSEDIR = 17; +} + +message IOEntry { + uint64 start_us = 1; + uint64 dur = 2; + uint32 stage_id = 3; + StageType stage_type = 4; + IOType io_type = 5; + uint32 rank = 6; + bytes file_name = 7; +} + +message IO { + repeated IOEntry IO_entries = 1; +} + +/***************************** + * CPU Tracing * + *****************************/ + +message OSprobeEntry { + uint32 key = 1; + uint64 start_us = 2; + uint64 dur = 3; + uint64 rundelay = 4; + uint32 OS_event_type = 5; + uint32 rank = 6; + string comm = 7; + string nxt_comm = 8; + uint32 nxt_pid = 9; +} + +message OSprobe { + repeated OSprobeEntry OSprobe_entries = 1; } +/***************************** + * Pytorch Tracing * + *****************************/ + message GcDebugData { uint32 collected = 1; uint32 uncollectable = 2; @@ -53,26 +128,22 @@ message PytorchStage { message Pytorch { repeated PytorchStage pytorch_stages = 1; uint32 rank = 2; - uint32 step_id = 3; - string comm = 4; 
//任务名 + uint32 step_id = 3; + string comm = 4; } -message Mem { - repeated ProcMem proc_mem = 1; -} +/***************************** + * Mutex Tracing * + *****************************/ -message OSprobe { - repeated OSprobeEntry OSprobe_entries = 1; +message MutexEntry { + uint64 start_us = 1; + uint64 dur = 2; + uint32 stage_id = 3; + StageType stage_type = 4; + repeated StackFrame stack_frames = 5; } -message OSprobeEntry { - uint32 key = 1; - uint64 start_us = 2; - uint64 dur = 3; - uint64 rundelay = 4; - uint32 OS_event_type = 5; - uint32 rank = 6; - string comm = 7; - string nxt_comm = 8; - uint32 nxt_pid = 9; +message Mutex { + repeated MutexEntry mutex_entries = 1; } \ No newline at end of file diff --git a/systrace/src/ascend/hook.h b/systrace/src/ascend/hook.h index adbe0a0..31f2581 100644 --- a/systrace/src/ascend/hook.h +++ b/systrace/src/ascend/hook.h @@ -3,6 +3,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" diff --git a/systrace/src/cann/common_hook.c b/systrace/src/cann/common_hook.c new file mode 100644 index 0000000..a696452 --- /dev/null +++ b/systrace/src/cann/common_hook.c @@ -0,0 +1,66 @@ +#include "common_hook.h" +#include "../../include/common/shared_constants.h" +#include +#include +#include + +#define PATH_MAX 100 + +uint64_t get_current_us() { + struct timeval tv; + gettimeofday(&tv, NULL); + return (uint64_t)tv.tv_sec * 1000000 + tv.tv_usec; +} + +const char *get_so_name(uint64_t ip) +{ + Dl_info info; + const char *so_name; + if (dladdr((void *)ip, &info)) + { + so_name = strrchr(info.dli_fname, '/'); + return (so_name != NULL) ? so_name + 1 : info.dli_fname; + } + return "unknown"; +} + +unw_word_t get_so_base(unw_word_t addr) +{ + Dl_info info; + if (dladdr((void *)addr, &info) != 0) + { + return (unw_word_t)info.dli_fbase; + } + return 0; +} + +void get_log_filename(char *buf, size_t buf_size, const char *path_suffix) { + const char *rank_str = getenv("RANK"); + int rank = rank_str ? 
atoi(rank_str) : 0; + + char path[PATH_MAX] = {0}; + int ret = snprintf(path, sizeof(path), "%s/%s", SYS_TRACE_ROOT_DIR, path_suffix); + if (ret < 0 || (size_t)ret >= sizeof(path)) { + snprintf(buf, buf_size, "%s_trace_rank%d.pb", path_suffix, rank); + return; + } + if (access(path, F_OK) != 0) { + if (mkdir(path, 0755) != 0 && errno != EEXIST) { + perror("Failed to create directory"); + snprintf(buf, buf_size, "%s_trace_rank%d.pb", path_suffix, rank); + return; + } + } + snprintf(buf, buf_size, "%s/%s_trace_rank%d.pb", path, path_suffix, rank); +} + +void *load_symbol(void *lib, const char *symbol_name) +{ + void *sym = dlsym(lib, symbol_name); + if (!sym) + { + fprintf(stderr, "Failed to find symbol %s: %s\n", symbol_name, + dlerror()); + } + return sym; +} \ No newline at end of file diff --git a/systrace/src/cann/common_hook.h b/systrace/src/cann/common_hook.h new file mode 100644 index 0000000..0bae9be --- /dev/null +++ b/systrace/src/cann/common_hook.h @@ -0,0 +1,36 @@ +#ifndef COMMON_HOOK_H +#define COMMON_HOOK_H +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__aarch64__) +#include "../../thirdparty/aarch64/libunwind/libunwind.h" +#elif defined(__x86_64__) +#include "../../thirdparty/x86_64/libunwind/libunwind.h" +#else +#error "Unsupported architecture - only aarch64 and x86_64 are supported" +#endif + +#define LOG_INTERVAL_SEC 5 +#define LOG_ITEMS_MIN 10 + +uint64_t get_current_us(); +const char *get_so_name(uint64_t ip); +unw_word_t get_so_base(unw_word_t addr); +void get_log_filename(char *buf, size_t buf_size, const char *path_suffix); +void *load_symbol(void *lib, const char *symbol_name); +void common_write_protobuf_to_file(pthread_mutex_t *mutex, const char *filename, const void *data, size_t len); +void common_init_key(pthread_key_t *key, pthread_once_t *once, void *(*alloc_func)(void), void (*free_func)(void*)); +void common_atexit(void 
(*exit_handler)(void)); + + +#endif // COMMON_HOOK_H diff --git a/systrace/src/cann/io_hook.c b/systrace/src/cann/io_hook.c new file mode 100644 index 0000000..986f1a1 --- /dev/null +++ b/systrace/src/cann/io_hook.c @@ -0,0 +1,533 @@ +#define _GNU_SOURCE +#include "../../include/common/shared_constants.h" +#include "../../protos/systrace.pb-c.h" +#include "common_hook.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +typedef struct { + IO *io; + time_t last_log_time; +} ThreadData; + +typedef size_t (*halFReadFunc_t)(void *ptr, size_t size, size_t nmemb, FILE *stream); +typedef size_t (*halFWriteFunc_t)(const void *ptr, size_t size, size_t nmemb, FILE *stream); +typedef ssize_t(*halReadFunc_t)(int fd, void *buf, size_t count); +typedef ssize_t(*halWriteFunc_t)(int fd, const void *buf, size_t count); +typedef FILE* (*halFOpenFunc_t)(const char *path, const char *mode); +typedef int (*halFCloseFunc_t)(FILE *stream); +typedef int (*halFFlushFunc_t)(FILE *stream); +typedef int (*halRemoveFunc_t)(const char *filename); +typedef int (*halRenameFunc_t)(const char *oldname, const char *newname); +typedef int (*halOpenFunc_t)(const char *pathname, int flags, mode_t mode); +typedef int (*halCloseFunc_t)(int fd); +typedef int (*halFsyncFunc_t)(int fd); +typedef int (*halMkdirFunc_t)(const char *path, mode_t mode); +typedef int (*halRmdirFunc_t)(const char *path); +typedef int (*halUnlinkFunc_t)(const char *path); +typedef DIR* (*halOpendirFunc_t)(const char *name); +typedef int (*halClosedirFunc_t)(DIR *dir); + +static halFReadFunc_t orig_fread = NULL; +static halFWriteFunc_t orig_fwrite = NULL; +static halReadFunc_t orig_read = NULL; +static halWriteFunc_t orig_write = NULL; +static halFOpenFunc_t orig_fopen = NULL; +static halFCloseFunc_t orig_fclose = NULL; +static halFFlushFunc_t orig_fflush = NULL; +static halRemoveFunc_t orig_remove = NULL; +static halRenameFunc_t orig_rename = 
NULL; +static halOpenFunc_t orig_open = NULL; +static halCloseFunc_t orig_close = NULL; +static halFsyncFunc_t orig_fsync = NULL; +static halMkdirFunc_t orig_mkdir = NULL; +static halRmdirFunc_t orig_rmdir = NULL; +static halUnlinkFunc_t orig_unlink = NULL; +static halOpendirFunc_t orig_opendir = NULL; +static halClosedirFunc_t orig_closedir = NULL; + +static pthread_key_t thread_data_key; +static pthread_once_t key_once = PTHREAD_ONCE_INIT; +static pthread_mutex_t file_mutex = PTHREAD_MUTEX_INITIALIZER; +extern int global_stage_id; +extern int global_stage_type; + +static void make_key() { + pthread_key_create(&thread_data_key, NULL); +} + +static ThreadData *get_thread_data() { + ThreadData *td; + + pthread_once(&key_once, make_key); + td = pthread_getspecific(thread_data_key); + + if (!td) { + td = calloc(1, sizeof(ThreadData)); + td->io = calloc(1, sizeof(IO)); + io__init(td->io); + td->last_log_time = time(NULL); + pthread_setspecific(thread_data_key, td); + } + + return td; +} + +static char *get_filename_from_fd(int fd) { + char path[256]; + char resolved[256]; + + snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); + ssize_t len = readlink(path, resolved, sizeof(resolved) - 1); + if (len == -1) { + return strdup(""); + } + resolved[len] = '\0'; + + const char *filename = strrchr(resolved, '/'); + if (!filename) { + return strdup(resolved); + } + + return strdup(filename + 1); +} + +static int is_ready_to_write(ThreadData *td, time_t *current) { + *current = time(NULL); + if (*current - td->last_log_time >= LOG_INTERVAL_SEC || + (td->io && td->io->n_io_entries >= LOG_ITEMS_MIN)) { + return 1; + } + return 0; +} + +static void write_protobuf_to_file() { + time_t current; + uint8_t *buf = NULL; + ThreadData *td = get_thread_data(); + if (!td || !td->io) { + return; + } + + if (!is_ready_to_write(td, &current)) { + return; + } + + if (pthread_mutex_trylock(&file_mutex) == 0) { + char filename[256]; + get_log_filename(filename, sizeof(filename), "io_trace"); + + 
size_t len = io__get_packed_size(td->io); + buf = malloc(len); + io__pack(td->io, buf); + + FILE *fp = fopen(filename, "ab"); + if (fp) { + orig_fwrite(buf, len, 1, fp); + fclose(fp); + } + + pthread_mutex_unlock(&file_mutex); + } else { + return; + } + + if (buf) { + free(buf); + } + + for (size_t i = 0; i < td->io->n_io_entries; i++) { + IOEntry *entry = td->io->io_entries[i]; + free(entry); + } + td->io->n_io_entries = 0; + td->last_log_time = current; +} + +static void exit_handler(void) { write_protobuf_to_file(); } + +static void add_io_entry(int fd, uint64_t start_us, uint64_t duration, IOType operation) { + if (!checkAndUpdateTimer(2)) + { + return; + } + ThreadData *td = get_thread_data(); + if (!td || !td->io) return; + + + size_t frame_count = 0; + + IOEntry *entry = malloc(sizeof(IOEntry)); + ioentry__init(entry); + entry->start_us = start_us; + entry->dur = duration; + entry->stage_id = global_stage_id; + entry->stage_type = global_stage_type; + entry->io_type = operation; + char *filename = get_filename_from_fd(fd); + if (!filename) { + filename = strdup(""); + } + entry->file_name.data = (uint8_t *)filename; + entry->file_name.len = strlen(filename); + + const char *rank_str = getenv("RANK"); + entry->rank = rank_str ? 
atoi(rank_str) : 0; + + td->io->n_io_entries++; + td->io->io_entries = realloc(td->io->io_entries, td->io->n_io_entries * sizeof(IOEntry*)); + td->io->io_entries[td->io->n_io_entries - 1] = entry; +} + +int init_io_trace() { + void *lib = dlopen("/usr/lib64/libc.so.6", RTLD_LAZY); + if (!lib) { + fprintf(stderr, "dlopen failed: %s\n", dlerror()); + return -1; + } + + orig_fread = (halFReadFunc_t)dlsym(lib, "fread"); + orig_fwrite = (halFWriteFunc_t)dlsym(lib, "fwrite"); + orig_read = (halReadFunc_t)dlsym(lib, "read"); + orig_write = (halWriteFunc_t)dlsym(lib, "write"); + orig_fopen = (halFOpenFunc_t)dlsym(lib, "fopen"); + orig_fclose = (halFCloseFunc_t)dlsym(lib, "fclose"); + orig_fflush = (halFFlushFunc_t)dlsym(lib, "fflush"); + orig_remove = (halRemoveFunc_t)dlsym(lib, "remove"); + orig_rename = (halRenameFunc_t)dlsym(lib, "rename"); + orig_open = (halOpenFunc_t)dlsym(lib, "open"); + orig_close = (halCloseFunc_t)dlsym(lib, "close"); + orig_fsync = (halFsyncFunc_t)dlsym(lib, "fsync"); + orig_mkdir = (halMkdirFunc_t)dlsym(lib, "mkdir"); + orig_rmdir = (halRmdirFunc_t)dlsym(lib, "rmdir"); + orig_unlink = (halUnlinkFunc_t)dlsym(lib, "unlink"); + orig_opendir = (halOpendirFunc_t)dlsym(lib, "opendir"); + orig_closedir = (halClosedirFunc_t)dlsym(lib, "closedir"); + + if (!orig_fread || !orig_fwrite || !orig_read || !orig_write || + !orig_fopen || !orig_fclose || !orig_fflush || !orig_remove || + !orig_rename || !orig_open || !orig_close || !orig_fsync || + !orig_mkdir || !orig_rmdir || !orig_unlink || !orig_opendir || + !orig_closedir) { + fprintf(stderr, "dlsym failed: %s\n", dlerror()); + return -1; + } + + + atexit(exit_handler); + return 0; +} + +ssize_t read(int fd, void *buf, size_t count) { + if (!orig_read) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + ssize_t ret = orig_read(fd, buf, count); + uint64_t end_us = get_current_us(); + + if (ret > 0) { + add_io_entry(fd, start_us, end_us - start_us, IOTYPE__IO_READ); + } + + 
write_protobuf_to_file(); + return ret; +} + + +ssize_t write(int fd, const void *buf, size_t count) { + if (!orig_write) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + ssize_t ret = orig_write(fd, buf, count); + uint64_t end_us = get_current_us(); + + if (ret > 0) { + add_io_entry(fd, start_us, end_us - start_us, IOTYPE__IO_WRITE); + } + + write_protobuf_to_file(); + return ret; +} + +size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { + if (!orig_fwrite) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + ssize_t ret = orig_fwrite(ptr, size, nmemb, stream); + uint64_t end_us = get_current_us(); + + if (ret > 0) { + int fd = fileno(stream); + add_io_entry(fd, start_us, end_us - start_us, IOTYPE__IO_FWRITE); + } + + write_protobuf_to_file(); + return ret; +} + +size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { + if (!orig_fread) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + ssize_t ret = orig_fread(ptr, size, nmemb, stream); + uint64_t end_us = get_current_us(); + + if (ret > 0) { + int fd = fileno(stream); + add_io_entry(fd, start_us, end_us - start_us, IOTYPE__IO_FREAD); + } + + write_protobuf_to_file(); + return ret; +} + +FILE *fopen(const char *path, const char *mode) { + if (!orig_fopen) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + FILE *ret = orig_fopen(path, mode); + uint64_t end_us = get_current_us(); + + if (ret) { + int fd = fileno(ret); + add_io_entry(fd, start_us, end_us - start_us, IOTYPE__IO_FOPEN); + } + + write_protobuf_to_file(); + return ret; +} + +int fclose(FILE *stream) { + if (!orig_fclose) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_fclose(stream); + uint64_t end_us = get_current_us(); + + if (ret == 0) { + int fd = fileno(stream); + add_io_entry(fd, start_us, end_us - start_us, IOTYPE__IO_FCLOSE); + } + + write_protobuf_to_file(); + return ret; +} + +int fflush(FILE *stream) { + 
if (!orig_fflush) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_fflush(stream); + uint64_t end_us = get_current_us(); + + if (ret == 0 && stream) { + int fd = fileno(stream); + add_io_entry(fd, start_us, end_us - start_us, IOTYPE__IO_FFLUSH); + } + + write_protobuf_to_file(); + return ret; +} + +int remove(const char *filename) { + if (!orig_remove) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_remove(filename); + uint64_t end_us = get_current_us(); + + if (ret == 0) { + add_io_entry(-1, start_us, end_us - start_us, IOTYPE__IO_REMOVE); + } + + write_protobuf_to_file(); + return ret; +} + +int rename(const char *oldname, const char *newname) { + if (!orig_rename) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_rename(oldname, newname); + uint64_t end_us = get_current_us(); + + if (ret == 0) { + add_io_entry(-1, start_us, end_us - start_us, IOTYPE__IO_RENAME); + } + + write_protobuf_to_file(); + return ret; +} + +int open(const char *pathname, int flags, ...) 
{ + if (!orig_open) { + init_io_trace(); + } + + mode_t mode = 0; + if (flags & O_CREAT) { + va_list ap; + va_start(ap, flags); + mode = va_arg(ap, mode_t); + va_end(ap); + } + + uint64_t start_us = get_current_us(); + int ret = orig_open(pathname, flags, mode); + uint64_t end_us = get_current_us(); + + if (ret >= 0) { + add_io_entry(ret, start_us, end_us - start_us, IOTYPE__IO_OPEN); + } + + write_protobuf_to_file(); + return ret; +} + +int close(int fd) { + if (!orig_close) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_close(fd); + uint64_t end_us = get_current_us(); + + if (ret == 0) { + add_io_entry(fd, start_us, end_us - start_us, IOTYPE__IO_CLOSE); + } + + write_protobuf_to_file(); + return ret; +} + +int fsync(int fd) { + if (!orig_fsync) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_fsync(fd); + uint64_t end_us = get_current_us(); + + if (ret == 0) { + add_io_entry(fd, start_us, end_us - start_us, IOTYPE__IO_FSYNC); + } + + write_protobuf_to_file(); + return ret; +} + +int mkdir(const char *path, mode_t mode) { + if (!orig_mkdir) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_mkdir(path, mode); + uint64_t end_us = get_current_us(); + + if (ret == 0) { + add_io_entry(-1, start_us, end_us - start_us, IOTYPE__IO_MKDIR); + } + + write_protobuf_to_file(); + return ret; +} + +int rmdir(const char *path) { + if (!orig_rmdir) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_rmdir(path); + uint64_t end_us = get_current_us(); + + if (ret == 0) { + add_io_entry(-1, start_us, end_us - start_us, IOTYPE__IO_RMDIR); + } + + write_protobuf_to_file(); + return ret; +} + +int unlink(const char *path) { + if (!orig_unlink) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_unlink(path); + uint64_t end_us = get_current_us(); + + if (ret == 0) { + add_io_entry(-1, start_us, end_us - start_us, 
IOTYPE__IO_UNLINK); + } + + write_protobuf_to_file(); + return ret; +} + +DIR *opendir(const char *name) { + if (!orig_opendir) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + DIR *ret = orig_opendir(name); + uint64_t end_us = get_current_us(); + + if (ret) { + add_io_entry(-1, start_us, end_us - start_us, IOTYPE__IO_OPENDIR); + } + + write_protobuf_to_file(); + return ret; +} + +int closedir(DIR *dir) { + if (!orig_closedir) { + init_io_trace(); + } + + uint64_t start_us = get_current_us(); + int ret = orig_closedir(dir); + uint64_t end_us = get_current_us(); + + if (ret == 0) { + add_io_entry(-1, start_us, end_us - start_us, IOTYPE__IO_CLOSEDIR); + } + + write_protobuf_to_file(); + return ret; +} \ No newline at end of file diff --git a/systrace/src/cann/cann_hook.c b/systrace/src/cann/mem_hook.c similarity index 85% rename from systrace/src/cann/cann_hook.c rename to systrace/src/cann/mem_hook.c index 1782208..6569fe5 100644 --- a/systrace/src/cann/cann_hook.c +++ b/systrace/src/cann/mem_hook.c @@ -1,6 +1,7 @@ #define _GNU_SOURCE #include "../../include/common/shared_constants.h" #include "../../protos/systrace.pb-c.h" +#include "common_hook.h" #include #include #include @@ -12,13 +13,6 @@ #include #include #include -#if defined(__aarch64__) -#include "../../thirdparty/aarch64/libunwind/libunwind.h" -#elif defined(__x86_64__) -#include "../../thirdparty/x86_64/libunwind/libunwind.h" -#else -#error "Unsupported architecture - only aarch64 and x86_64 are supported" -#endif // export LD_PRELOAD=/home/MindSpeed-LLM-1.0.RC3/libascend_hal_jack.so // cd /home/hbdir/mspti_test-megatron @@ -32,9 +26,6 @@ // drv_mem_prop *prop, uint64_t flag); drvError_t halMemRelease // (drv_mem_handle_t *handle); -#define LOG_INTERVAL_SEC 120 -#define LOG_ITEMS_MIN 1000 - typedef int drvError_t; typedef enum aclrtMemMallocPolicy @@ -84,17 +75,6 @@ typedef struct time_t last_log_time; } ThreadData; -static void *load_symbol(void *lib, const char *symbol_name) -{ - 
void *sym = dlsym(lib, symbol_name); - if (!sym) - { - fprintf(stderr, "Failed to find symbol %s: %s\n", symbol_name, - dlerror()); - } - return sym; -} - static void free_proc_mem(ProcMem *proc_mem) { if (!proc_mem) @@ -168,38 +148,6 @@ static ThreadData *get_thread_data() return td; } -static const char *get_so_name(uint64_t ip) -{ - Dl_info info; - const char *so_name; - if (dladdr((void *)ip, &info)) - { - so_name = strrchr(info.dli_fname, '/'); - return (so_name != NULL) ? so_name + 1 : info.dli_fname; - } - return "unknown"; -} - -static void get_log_filename(time_t current, uint32_t pid, char *buf, - size_t buf_size) -{ - const char *rank_str = getenv("RANK"); - int rank = rank_str ? atoi(rank_str) : 0; - struct tm *tm = localtime(&current); - - const char *dir_path = SYS_TRACE_ROOT_DIR "cann"; - if (access(dir_path, F_OK) != 0) - { - if (mkdir(dir_path, 0755) != 0 && errno != EEXIST) - { - perror("Failed to create directory"); - snprintf(buf, buf_size, "mem_trace_rank%d.pb",rank); - return; - } - } - snprintf(buf, buf_size, "%s/mem_trace_rank%d.pb", dir_path, rank); -} - static char is_ready_to_write(ThreadData *td, time_t *current) { ProcMem *proc_mem = td->proc_mem; @@ -244,8 +192,7 @@ static void write_protobuf_to_file() if (pthread_mutex_trylock(&file_mutex) == 0) { // pthread_mutex_trylock or pthread_mutex_lock char filename[256]; - get_log_filename(current, td->proc_mem->pid, filename, - sizeof(filename)); + get_log_filename(filename, sizeof(filename), "mem_trace"); size_t len = proc_mem__get_packed_size(td->proc_mem); buf = malloc(len); @@ -310,16 +257,6 @@ int init_mem_trace() return 0; } -unw_word_t get_so_base(unw_word_t addr) -{ - Dl_info info; - if (dladdr((void *)addr, &info) != 0) - { - return (unw_word_t)info.dli_fbase; - } - return 0; -} - static void collect_stack_frames(MemAllocEntry *entry) { unw_cursor_t cursor; diff --git a/systrace/src/os/os_probe.c b/systrace/src/os/os_probe.c index da5025f..3784f0c 100644 --- a/systrace/src/os/os_probe.c 
+++ b/systrace/src/os/os_probe.c @@ -214,7 +214,7 @@ static void add_osprobe_entry(trace_event_data_t *evt_data) entry->comm = strdup(evt_data->comm); - if (entry->os_event_type == EVENT_TYPE_OFFCPU && evt_data->next_comm) { + if (entry->os_event_type == EVENT_TYPE_OFFCPU && evt_data->next_comm[0] != '\0') { entry->nxt_comm = strdup(evt_data->next_comm); entry->nxt_pid = evt_data->next_pid; } @@ -440,8 +440,8 @@ int update_filter_map_by_kernel_thread() { return -1; } for (int dev_id = 0; dev_id < 16; ++dev_id) { - char send_key[16] = {0}; - char task_key[16] = {0}; + char send_key[32] = {0}; + char task_key[32] = {0}; snprintf(send_key, sizeof(send_key), "dev%d_sq_send_wq", dev_id); snprintf(task_key, sizeof(task_key), "dev%d_sq_task", dev_id); -- Gitee