diff --git a/systrace/build.sh b/systrace/build.sh
index 65495c96ff9eb16addf94a3b6e8ead1f797face7..32cecad0510ead6dc4e8088d6f08f50310f00a86 100644
--- a/systrace/build.sh
+++ b/systrace/build.sh
@@ -10,11 +10,22 @@
 rm src/os/*.skel.h -rf
 cp -f config/PyFuncList /etc/systrace/config/PyFuncList
 
+# Major version of the installed protoc ("libprotoc 3.21.12" -> 3).
+PROTOC_VERSION=$(protoc --version | awk '{print $2}' | cut -d. -f1)
+
 cd protos
+# Copy (rather than move) the matching schema so the versioned sources stay
+# in place and the script can be re-run.
+if [ "${PROTOC_VERSION:-0}" -ge 3 ]; then
+    cp -f systrace.v3.proto systrace.proto
+else
+    cp -f systrace.v2.proto systrace.proto
+fi
 protoc --c_out=. systrace.proto
 protoc --cpp_out=. systrace.proto
 protoc --python_out=. systrace.proto
 cd ..
+
 cd build
 cmake ..
 make -j $(nproc)
\ No newline at end of file
diff --git a/systrace/convert/merge_json_by_rank.py b/systrace/convert/merge_json_by_rank.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc0a24f2ddcf1192a0de9eb4a30ac5a196e4c104
--- /dev/null
+++ b/systrace/convert/merge_json_by_rank.py
@@ -0,0 +1,78 @@
+"""Merge trace JSON files into one file, with events grouped by PID.
+
+Usage:
+    python merge_json_by_rank.py file1.json file2.json [...] --output merged.json
+"""
+
+import json
+import argparse
+from concurrent.futures import ThreadPoolExecutor
+from collections import defaultdict
+import threading
+
+
+def load_json_file(file_path):
+    """Parse the JSON document stored at *file_path* and return it."""
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def merge_events_by_pid(files):
+    """Group the ``traceEvents`` of every input file by their ``pid``.
+
+    Files are parsed concurrently in a thread pool; their events are then
+    folded into the result in the order the files were given, so the
+    grouping is deterministic.  An error raised while loading a file
+    (missing file, invalid JSON) propagates to the caller.
+
+    Returns a ``defaultdict(list)`` mapping pid to a list of event dicts.
+    """
+    pid_map = defaultdict(list)
+
+    with ThreadPoolExecutor() as executor:
+        # Iterating the map() result is what re-raises worker exceptions;
+        # a map() whose result is never consumed silently discards them.
+        for data in executor.map(load_json_file, files):
+            for event in data['traceEvents']:
+                pid_map[event['pid']].append(event)
+
+    return pid_map
+
+
+def save_merged_data(pid_map, output_file):
+    """Write every event to *output_file*, ordered by pid and then by ``ts``.
+
+    An event without a ``ts`` key is sorted as if its ``ts`` were 0 rather
+    than aborting the merge with a KeyError.
+    """
+    merged_events = []
+    for pid in sorted(pid_map):
+        merged_events.extend(
+            sorted(pid_map[pid], key=lambda event: event.get('ts', 0))
+        )
+
+    merged_data = {
+        "traceEvents": merged_events,
+        "displayTimeUnit": "ns",
+        "metadata": {
+            "format": "Merged Trace Data",
+            "merged_pids": len(pid_map),
+            "total_events": len(merged_events),
+        },
+    }
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        # Compact separators keep the merged file as small as possible.
+        json.dump(merged_data, f, indent=None, separators=(',', ':'))
+
+    print(f"Merged {len(pid_map)} PIDs with total {len(merged_events)} events to {output_file}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Merge JSON trace files by PID with multithreading')
+    parser.add_argument('input_files', nargs='+', help='Input JSON files to merge')
+    parser.add_argument('--output', required=True, help='Output JSON file path')
+    args = parser.parse_args()
+
+    pid_map = merge_events_by_pid(args.input_files)
+    save_merged_data(pid_map, args.output)
diff --git a/systrace/protos/systrace.v2.proto b/systrace/protos/systrace.v2.proto
new file mode 100644
index 0000000000000000000000000000000000000000..f583d0a4b88bfe6baca59781d420e30cc8c13c65
--- /dev/null
+++ b/systrace/protos/systrace.v2.proto
@@ -0,0 +1,76 @@
+syntax = "proto2";
+
+message StackFrame {
+    optional uint64 address = 1;
+    optional string so_name = 2;
+}
+
+message MemAllocEntry {
+    optional uint64 alloc_ptr = 1;
+    optional uint32 stage_id = 2;
+    optional StageType stage_type = 3;
+    optional uint64 mem_size = 4;
+    repeated StackFrame stack_frames = 5;
+}
+
+message MemFreeEntry {
+    optional uint64 alloc_ptr = 1;
+    optional uint32 stage_id = 2;
+    optional StageType stage_type = 3;
+}
+
+message ProcMem {
+    optional uint32 pid = 1;
+    repeated MemAllocEntry mem_alloc_stacks = 2;
+    repeated MemFreeEntry mem_free_stacks = 3;
+}
+
+enum StageType {
+    STAGE_UNKNOWN = 0;
+    STAGE_DATALOADER = 1;
+    STAGE_FORWARD = 2;
+    STAGE_BACKWARD = 3;
+    STAGE_SYNCHRONIZATION = 4;
+    STAGE_GC = 5;
+}
+
+message GcDebugData {
+    optional uint32 collected = 1;
+    optional uint32 uncollectable = 2;
+}
+
+message PytorchStage {
+    optional uint32 stage_id = 1;
+    optional string stage_type = 2;
+    optional uint64 start_us = 3;
+    optional uint64 end_us = 4;
+    repeated string stack_frames = 5;
+    optional GcDebugData gc_debug = 6;
+}
+
+message Pytorch {
+    repeated PytorchStage pytorch_stages = 1;
+    optional uint32 rank = 2;
+    optional uint32 step_id = 3;
+    optional string comm = 4; // task name
+}
+
+message Mem {
+    repeated ProcMem proc_mem = 1;
+}
+
+message OSprobe {
+    repeated OSprobeEntry OSprobe_entries = 1;
+}
+
+message OSprobeEntry {
+    optional uint32 key = 1;
+    optional uint64 start_us = 2;
+    optional uint64 dur = 3;
+    optional uint64 rundelay = 4;
+    optional uint32 OS_event_type = 5;
+    optional uint32 rank = 6;
+    optional string comm = 7;
+    optional string nxt_comm = 8;
+    optional uint32 nxt_pid = 9;
+}
diff --git a/systrace/protos/systrace.proto b/systrace/protos/systrace.v3.proto
similarity index 100%
rename from systrace/protos/systrace.proto
rename to systrace/protos/systrace.v3.proto
diff --git a/systrace/src/os/bpf_comm.h b/systrace/src/os/bpf_comm.h
index 90b1a58254c82aa2b26a276489253a0d0f24e6e4..0fbe65a9b9266ddb30ca89e37dfc94aff801e6d8 100644
--- a/systrace/src/os/bpf_comm.h
+++ b/systrace/src/os/bpf_comm.h
@@ -182,6 +182,30 @@ static __always_inline void emit_event(trace_event_data_t *event, void *ctx)
     case 7:
         bpf_ringbuf_output(&osprobe_map_7, event, sizeof(*event), 0);
         break;
+    case 8:
+        bpf_ringbuf_output(&osprobe_map_8, event, sizeof(*event), 0);
+        break;
+    case 9:
+        bpf_ringbuf_output(&osprobe_map_9, event, sizeof(*event), 0);
+        break;
+    case 10:
+        bpf_ringbuf_output(&osprobe_map_10, event, sizeof(*event), 0);
+        break;
+    case 11:
+        bpf_ringbuf_output(&osprobe_map_11, event, sizeof(*event), 0);
+        break;
+    case 12:
+        bpf_ringbuf_output(&osprobe_map_12, event, sizeof(*event), 0);
+        break;
+    case 13:
+        bpf_ringbuf_output(&osprobe_map_13, event, sizeof(*event), 0);
+        break;
+    case 14:
+        bpf_ringbuf_output(&osprobe_map_14, event, sizeof(*event), 0);
+        break;
+    case 15:
+        bpf_ringbuf_output(&osprobe_map_15, event, sizeof(*event), 0);
+        break;
     default:
         break;
     }