From 7f5e6ea8625ebab7597b4f4c99209f6cd2a2b258 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Mon, 27 Jan 2025 11:11:33 +0800 Subject: [PATCH] add npu trace --- dynolog_npu/README.md | 51 ++- .../dynolog_npu/cli/src/commands/mod.rs | 17 + .../dynolog_npu/cli/src/commands/nputrace.rs | 242 ++++++++++++++ dynolog_npu/dynolog_npu/cli/src/main.rs | 303 ++++++++++++++++++ dynolog_npu/scripts/gen_dyno_patches.sh | 5 +- 5 files changed, 616 insertions(+), 2 deletions(-) create mode 100644 dynolog_npu/dynolog_npu/cli/src/commands/mod.rs create mode 100644 dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs create mode 100644 dynolog_npu/dynolog_npu/cli/src/main.rs diff --git a/dynolog_npu/README.md b/dynolog_npu/README.md index a7ce0249f53..00afb3c3975 100644 --- a/dynolog_npu/README.md +++ b/dynolog_npu/README.md @@ -1,6 +1,6 @@ # Ascend Extension for dynolog -## 安装 +## 安装方式 ### 1. clone 代码 @@ -63,3 +63,52 @@ bash scripts/build.sh -t deb # 编译rpm包, 当前只支持amd64平台 bash scripts/build.sh -t rpm ``` + +## 使用方式 + +### Profiler trace dump功能 +Profiler trace dump功能基于dynolog开发,实现类似于动态profiling的动态触发Ascend Torch Profiler采集profiling的功能。用户基于dyno CLI命令行可以动态触发指定节点的训练进程trace dump。 + +- 查看nputrace支持的命令和帮助 + +```bash +dyno nputrace --help +``` + +- nputrace使用方式 + +```bash +dyno nputrace [SUBCOMMANDS] --log-file +``` + +nputrace子命令支持的参数选项 + +| 子命令 | 参数类型 | 说明 | +|-------|-------|-------| +| record_shapes | action | 是否采集算子的InputShapes和InputTypes,设置参数采集,默认不采集 | +| profile_memory | action | 是否采集算子内存信息,设置参数采集,默认不采集 | +| with_stack | action | 是否采集Python调用栈,设置参数采集,默认不采集 | +| with_flops | action | 是否采集算子flops,设置参数采集,默认不采集 | +| with_modules | action | 是否采集modules层级的Python调用栈,设置参数采集,默认不采集 | +| analyse | action | 采集后是否自动解析,设置参数解析,默认不解析 | +| l2_cache | action | 是否采集L2 Cache数据,设置参数采集,默认不采集 | +| op_attr | action | 是否采集算子属性信息,设置参数采集,默认不采集 | +| data_simplification | String | 解析完成后是否数据精简,可选值范围[`true`, `false`],默认值`true` | +| activities | String | 控制CPU、NPU事件采集范围,可选值范围[`CPU,NPU`, `NPU,CPU`, `CPU`, 
`NPU`],默认值`CPU,NPU` | +| profiler_level | String | 控制profiler的采集等级,可选值范围[`Level_none`, `Level0`, `Level1`, `Level2`],默认值`Level0`| +| aic_metrics | String | AI Core的性能指标采集项,可选值范围[`AiCoreNone`, `PipeUtilization`, `ArithmeticUtilization`, `Memory`, `MemoryL0`, `ResourceConflictRatio`, `MemoryUB`, `L2Cache`, `MemoryAccess`],默认值`AiCoreNone`| +| export_type | String | profiler解析导出数据的类型,可选值范围[`Text`, `Db`],默认值`Text`| +| gc_detect_threshold | Option | GC检测阈值,单位ms,只采集超过阈值的GC事件。该参数为可选参数,默认不设置时不开启GC检测 | + +- nputrace示例命令 + +```bash +# 示例1:采集框架、CANN和device数据,同时采集完后自动解析以及解析完成不做数据精简,落盘路径为/tmp/profile_data +dyno nputrace --activities CPU,NPU --analyse --data_simplification false --log-file /tmp/profile_data + +# 示例2:只采集CANN和device数据,同时采集完后自动解析以及解析完成后开启数据精简,落盘路径为/tmp/profile_data +dyno nputrace --activities NPU --analyse --data_simplification true --log-file /tmp/profile_data + +# 示例3:只采集CANN和device数据,只采集不解析,落盘路径为/tmp/profile_data +dyno nputrace --activities NPU --log-file /tmp/profile_data +``` diff --git a/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs b/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs new file mode 100644 index 00000000000..e4d92f8c6ce --- /dev/null +++ b/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs @@ -0,0 +1,17 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +// Export all command submodules to be used in main.rs +// Note: This "intermediate" commands module is purely for organizational purposes. +// This allows for a clear distinction between the command dispatching code and the command +// handling code. Additionally, explicitly "exporting" all the command modules here allows +// us to avoid having to explicitly list all the command modules in main.rs. + +pub mod dcgm; +pub mod gputrace; +pub mod nputrace; +pub mod status; +pub mod version; +// ... 
add new command modules here
\ No newline at end of file
diff --git a/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs b/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs
new file mode 100644
index 00000000000..4bf7132de33
--- /dev/null
+++ b/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs
@@ -0,0 +1,263 @@
+use std::net::TcpStream;
+
+use anyhow::Result;
+use serde_json::Value;
+
+#[path = "utils.rs"]
+mod utils;
+
+/// Trigger settings of a trace request: either time/duration based or step/iteration based.
+#[derive(Debug)]
+pub enum NpuTraceTriggerConfig {
+    /// Rendered as `PROFILE_START_TIME` and `ACTIVITIES_DURATION_MSECS`.
+    DurationBased {
+        profile_start_time: u64,
+        duration_ms: u64,
+    },
+    /// Rendered as `PROFILE_START_ITERATION=0`, `PROFILE_START_STEP` and `ACTIVITIES_ITERATIONS`.
+    IterationBased {
+        start_step: u64,
+        iterations: i64,
+    },
+}
+
+impl NpuTraceTriggerConfig {
+    /// Renders the trigger as newline-separated `KEY=value` lines without a trailing newline.
+    fn config(&self) -> String {
+        match *self {
+            NpuTraceTriggerConfig::DurationBased {
+                profile_start_time,
+                duration_ms,
+            } => format!(
+                "PROFILE_START_TIME={}\nACTIVITIES_DURATION_MSECS={}",
+                profile_start_time, duration_ms
+            ),
+            NpuTraceTriggerConfig::IterationBased {
+                start_step,
+                iterations,
+            } => format!(
+                r#"PROFILE_START_ITERATION=0
+PROFILE_START_STEP={}
+ACTIVITIES_ITERATIONS={}"#,
+                start_step, iterations
+            ),
+        }
+    }
+}
+
+/// torch npu profiler config; each field is rendered as one `PROFILE_*` line.
+#[derive(Debug)]
+pub struct NpuTraceOptions {
+    pub record_shapes: bool,
+    pub profile_memory: bool,
+    pub with_stack: bool,
+    pub with_flops: bool,
+    pub with_modules: bool,
+    pub activities: String,
+    pub analyse: bool,
+    pub profiler_level: String,
+    pub aic_metrics: String,
+    pub l2_cache: bool,
+    pub op_attr: bool,
+    pub gc_detect_threshold: Option<f32>,
+    pub data_simplification: String,
+    pub export_type: String,
+}
+
+impl NpuTraceOptions {
+    /// Renders every option as a `PROFILE_*=value` line; the output starts with a newline.
+    /// An unset `gc_detect_threshold` is rendered as `None`.
+    fn config(&self) -> String {
+        format!(
+            r#"
+PROFILE_RECORD_SHAPES={}
+PROFILE_PROFILE_MEMORY={}
+PROFILE_WITH_STACK={}
+PROFILE_WITH_FLOPS={}
+PROFILE_WITH_MODULES={}
+PROFILE_ACTIVITIES={}
+PROFILE_ANALYSE={}
+PROFILE_PROFILER_LEVEL={}
+PROFILE_AIC_METRICS={}
+PROFILE_L2_CACHE={}
+PROFILE_OP_ATTR={}
+PROFILE_GC_DETECT_THRESHOLD={}
+PROFILE_DATA_SIMPLIFICATION={}
+PROFILE_EXPORT_TYPE={}"#,
+            self.record_shapes,
+            self.profile_memory,
+            self.with_stack,
+            self.with_flops,
+            self.with_modules,
+            self.activities,
+            self.analyse,
+            self.profiler_level,
+            self.aic_metrics,
+            self.l2_cache,
+            self.op_attr,
+            self.gc_detect_threshold.map_or("None".to_string(), |v| v.to_string()),
+            self.data_simplification,
+            self.export_type
+        )
+    }
+}
+
+/// Complete on-demand trace request: output path, trigger and profiler options.
+#[derive(Debug)]
+pub struct NpuTraceConfig {
+    pub log_file: String,
+    pub trigger_config: NpuTraceTriggerConfig,
+    pub trace_options: NpuTraceOptions,
+}
+
+impl NpuTraceConfig {
+    /// Joins the `ACTIVITIES_LOG_FILE` line, the trigger lines and the option lines.
+    fn config(&self) -> String {
+        format!(
+            "ACTIVITIES_LOG_FILE={}\n{}{}",
+            self.log_file,
+            self.trigger_config.config(),
+            self.trace_options.config()
+        )
+    }
+}
+
+/// Sends a `setKinetOnDemandRequest` carrying `config` to the service behind `client`.
+///
+/// Prints the rendered config, the raw response and, for every pid listed in the
+/// response's `processesMatched` array, the per-process trace file name derived from
+/// `config.log_file` (`.json` becomes `_<pid>.json`).
+///
+/// # Errors
+/// Fails if the response is not valid JSON or lacks a `processesMatched` array.
+///
+/// # Panics
+/// Panics if sending or receiving fails, or if a matched pid is not an integer.
+pub fn run_nputrace(
+    client: TcpStream,
+    job_id: u64,
+    pids: &str,
+    process_limit: u32,
+    config: NpuTraceConfig,
+) -> Result<()> {
+    let config_str = config.config();
+    println!("NpuTrace config = \n{}", config_str);
+    let config_str = config_str.replace('\n', "\\n");
+
+    let request_json = format!(
+        r#"
+{{
+    "fn": "setKinetOnDemandRequest",
+    "config": "{}",
+    "job_id": {},
+    "pids": [{}],
+    "process_limit": {}
+}}"#,
+        config_str, job_id, pids, process_limit
+    );
+
+    utils::send_msg(&client, &request_json).expect("Error sending message to service");
+
+    let resp_str = utils::get_resp(&client).expect("Unable to decode output bytes");
+
+    println!("response = {}", resp_str);
+
+    let resp_v: Value = serde_json::from_str(&resp_str)?;
+    let processes = resp_v["processesMatched"]
+        .as_array()
+        .ok_or_else(|| anyhow::anyhow!("response has no `processesMatched` array"))?;
+
+    if processes.is_empty() {
+        println!("No processes were matched, please check --job-id or --pids flags");
+    } else {
+        println!("Matched {} processes", processes.len());
+        println!("Trace output files will be written to:");
+
+        for pid in processes {
+            let pid = pid.as_i64().unwrap();
+            println!(
+                "    {}",
+                config.log_file.replace(".json", &format!("_{}.json", pid))
+            );
+        }
+    }
+
+    Ok(())
+}
+
+
+#[cfg(test)]
+mod test {
+    use crate::*;
+
+    #[test]
+    fn test_nputrace_trigger_config() {
+        let trigger_config = NpuTraceTriggerConfig::DurationBased {
+            profile_start_time: 1000,
+            duration_ms: 1000,
+        };
+        assert_eq!(
+            trigger_config.config(),
+            r#"PROFILE_START_TIME=1000
+ACTIVITIES_DURATION_MSECS=1000"#
+        );
+
+        let trigger_config = NpuTraceTriggerConfig::IterationBased {
+            start_step: 1000,
+            iterations: 1000,
+        };
+        assert_eq!(
+            trigger_config.config(),
+            r#"PROFILE_START_ITERATION=0
+PROFILE_START_STEP=1000
+ACTIVITIES_ITERATIONS=1000"#
+        );
+    }
+
+    #[test]
+    fn test_nputrace_config() {
+        let config = NpuTraceConfig {
+            log_file: "test.json".to_string(),
+            trigger_config: NpuTraceTriggerConfig::DurationBased {
+                profile_start_time: 1000,
+                duration_ms: 1000,
+            },
+            trace_options: NpuTraceOptions {
+                record_shapes: true,
+                profile_memory: false,
+                with_stack: true,
+                with_flops: true,
+                with_modules: true,
+                activities: "CPU,NPU".to_string(),
+                analyse: false,
+                profiler_level: "Level0".to_string(),
+                aic_metrics: "AiCoreNone".to_string(),
+                l2_cache: true,
+                op_attr: true,
+                gc_detect_threshold: Some(0.1),
+                data_simplification: "true".to_string(),
+                export_type: "Text".to_string(),
+            },
+        };
+        assert_eq!(
+            config.config(),
+            r#"ACTIVITIES_LOG_FILE=test.json
+PROFILE_START_TIME=1000
+ACTIVITIES_DURATION_MSECS=1000
+PROFILE_RECORD_SHAPES=true
+PROFILE_PROFILE_MEMORY=false
+PROFILE_WITH_STACK=true
+PROFILE_WITH_FLOPS=true
+PROFILE_WITH_MODULES=true
+PROFILE_ACTIVITIES=CPU,NPU
+PROFILE_ANALYSE=false
+PROFILE_PROFILER_LEVEL=Level0
+PROFILE_AIC_METRICS=AiCoreNone
+PROFILE_L2_CACHE=true
+PROFILE_OP_ATTR=true
+PROFILE_GC_DETECT_THRESHOLD=0.1
+PROFILE_DATA_SIMPLIFICATION=true
+PROFILE_EXPORT_TYPE=Text"#
+        );
+    }
+}
diff --git a/dynolog_npu/dynolog_npu/cli/src/main.rs b/dynolog_npu/dynolog_npu/cli/src/main.rs
new file mode 100644
index 00000000000..7a3d04a7138
--- /dev/null
+++ b/dynolog_npu/dynolog_npu/cli/src/main.rs
@@ -0,0 +1,303 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+use std::net::TcpStream;
+use std::net::ToSocketAddrs;
+
+use anyhow::Result;
+use clap::Parser;
+
+// Make all the command modules accessible to this file.
+mod commands;
+use commands::gputrace::GpuTraceConfig;
+use commands::gputrace::GpuTraceOptions;
+use commands::gputrace::GpuTraceTriggerConfig;
+use commands::nputrace::NpuTraceConfig;
+use commands::nputrace::NpuTraceOptions;
+use commands::nputrace::NpuTraceTriggerConfig;
+use commands::*;
+
+/// Instructions on adding a new Dyno CLI command:
+///
+/// 1. Add a new variant to the `Command` enum.
+///    Please include a description of the command and, if applicable, its flags/subcommands.
+///
+/// 2. Create a new file for the command's implementation in the commands/ directory (ie
+///    commands/status.rs). This new file is where the command should be implemented.
+///    Make the new command's module accessible from this file by adding
+///    a new line with `pub mod <new_command>;` to commands/mod.rs.
+///
+///
+/// 3. Add a branch to the match statement in main() to handle the new enum variant (from step 1).
+///    From here, invoke the handling logic defined in the new file (from step 2). In an effort to keep
+///    the command dispatching logic clear and concise, please keep the code in the match branch to a minimum.
+
+const DYNO_PORT: u16 = 1778;
+
+#[derive(Debug, Parser)]
+struct Opts {
+    #[clap(long, default_value = "localhost")]
+    hostname: String,
+    #[clap(long, default_value_t = DYNO_PORT)]
+    port: u16,
+    #[clap(subcommand)]
+    cmd: Command,
+}
+
+#[derive(Debug, Parser)]
+enum Command {
+    /// Check the status of a dynolog process
+    Status,
+    /// Check the version of a dynolog process
+    Version,
+    /// Capture gputrace
+    Gputrace {
+        /// Job id of the application to trace
+        #[clap(long, default_value_t = 0)]
+        job_id: u64,
+        /// List of pids to capture trace for (comma separated).
+        #[clap(long, default_value = "0")]
+        pids: String,
+        /// Duration of trace to collect in ms.
+        #[clap(long, default_value_t = 500)]
+        duration_ms: u64,
+        /// Training iterations to collect, this takes precedence over duration.
+        #[clap(long, default_value_t = -1)]
+        iterations: i64,
+        /// Log file for trace.
+        #[clap(long)]
+        log_file: String,
+        /// Unix timestamp used for synchronized collection (milliseconds since epoch)
+        #[clap(long, default_value_t = 0)]
+        profile_start_time: u64,
+        /// Start iteration roundup, starts an iteration based trace at a multiple
+        /// of this value.
+        #[clap(long, default_value_t = 1)]
+        profile_start_iteration_roundup: u64,
+        /// Max number of processes to profile
+        #[clap(long, default_value_t = 3)]
+        process_limit: u32,
+        /// Record PyTorch operator input shapes and types
+        #[clap(long, action)]
+        record_shapes: bool,
+        /// Profile PyTorch memory
+        #[clap(long, action)]
+        profile_memory: bool,
+        /// Capture Python stacks in traces
+        #[clap(long, action)]
+        with_stacks: bool,
+        /// Annotate operators with analytical flops
+        #[clap(long, action)]
+        with_flops: bool,
+        /// Capture PyTorch operator modules in traces
+        #[clap(long, action)]
+        with_modules: bool,
+    },
+    /// Capture nputrace. Subcommand functions aligned with Ascend Torch Profiler
+    Nputrace {
+        /// Job id of the application to trace
+        #[clap(long, default_value_t = 0)]
+        job_id: u64,
+        /// List of pids to capture trace for (comma separated).
+        #[clap(long, default_value = "0")]
+        pids: String,
+        /// Duration of trace to collect in ms.
+        #[clap(long, default_value_t = 500)]
+        duration_ms: u64,
+        /// Training iterations to collect, this takes precedence over duration.
+        #[clap(long, default_value_t = -1)]
+        iterations: i64,
+        /// Log file for trace.
+        #[clap(long)]
+        log_file: String,
+        /// Unix timestamp used for synchronized collection (milliseconds since epoch)
+        #[clap(long, default_value_t = 0)]
+        profile_start_time: u64,
+        /// Number of steps to start profile
+        #[clap(long, default_value_t = 0)]
+        start_step: u64,
+        /// Max number of processes to profile
+        #[clap(long, default_value_t = 3)]
+        process_limit: u32,
+        /// Whether to record PyTorch operator input shapes and types
+        #[clap(long, action)]
+        record_shapes: bool,
+        /// Whether to profile PyTorch memory
+        #[clap(long, action)]
+        profile_memory: bool,
+        /// Whether to profile the Python call stack in trace
+        #[clap(long, action)]
+        with_stack: bool,
+        /// Annotate operators with analytical flops
+        #[clap(long, action)]
+        with_flops: bool,
+        /// Whether to profile PyTorch operator modules in traces
+        #[clap(long, action)]
+        with_modules: bool,
+        /// The scope of the profile's events
+        #[clap(long, value_parser = ["CPU,NPU", "NPU,CPU", "CPU", "NPU"], default_value = "CPU,NPU")]
+        activities: String,
+        /// Profiler level
+        #[clap(long, value_parser = ["Level0", "Level1", "Level2", "Level_none"], default_value = "Level0")]
+        profiler_level: String,
+        /// AIC metrics
+        #[clap(long, value_parser = ["AiCoreNone", "PipeUtilization", "ArithmeticUtilization", "Memory", "MemoryL0", "ResourceConflictRatio", "MemoryUB", "L2Cache", "MemoryAccess"], default_value = "AiCoreNone")]
+        aic_metrics: String,
+        /// Whether to analyse the data after collection
+        #[clap(long, action)]
+        analyse: bool,
+        /// Whether to collect L2 cache
+        #[clap(long, action)]
+        l2_cache: bool,
+        /// Whether to collect op attributes
+        #[clap(long, action)]
+        op_attr: bool,
+        /// GC detect threshold
+        #[clap(long)]
+        gc_detect_threshold: Option<f32>,
+        /// Whether to streamline data after analyse is complete
+        #[clap(long, value_parser = ["true", "false"], default_value = "true")]
+        data_simplification: String,
+        /// Types of data exported by the profiler
+        #[clap(long, value_parser = ["Text", "Db"], default_value = "Text")]
+        export_type: String,
+    },
+    /// Pause dcgm profiling. This enables running tools like Nsight compute and avoids conflicts.
+    DcgmPause {
+        /// Duration to pause dcgm profiling in seconds
+        #[clap(long, default_value_t = 300)]
+        duration_s: i32,
+    },
+    /// Resume dcgm profiling
+    DcgmResume,
+}
+
+/// Create a socket connection to dynolog at the first address that `host:port` resolves to
+fn create_dyno_client(host: &str, port: u16) -> Result<TcpStream> {
+    let addr = (host, port)
+        .to_socket_addrs()?
+        .next()
+        .ok_or_else(|| anyhow::anyhow!("Failed to connect to the server: {}:{} has no address", host, port))?;
+
+    TcpStream::connect(addr).map_err(|err| err.into())
+}
+
+fn main() -> Result<()> {
+    let Opts {
+        hostname,
+        port,
+        cmd,
+    } = Opts::parse();
+
+    let dyno_client = create_dyno_client(&hostname, port)
+        .map_err(|err| err.context("Couldn't connect to the server..."))?;
+
+    match cmd {
+        Command::Status => status::run_status(dyno_client),
+        Command::Version => version::run_version(dyno_client),
+        Command::Gputrace {
+            job_id,
+            pids,
+            log_file,
+            duration_ms,
+            iterations,
+            profile_start_time,
+            profile_start_iteration_roundup,
+            process_limit,
+            record_shapes,
+            profile_memory,
+            with_stacks,
+            with_flops,
+            with_modules,
+        } => {
+            let trigger_config = if iterations > 0 {
+                GpuTraceTriggerConfig::IterationBased {
+                    profile_start_iteration_roundup,
+                    iterations,
+                }
+            } else {
+                GpuTraceTriggerConfig::DurationBased {
+                    profile_start_time,
+                    duration_ms,
+                }
+            };
+            let trace_options = GpuTraceOptions {
+                record_shapes,
+                profile_memory,
+                with_stacks,
+                with_flops,
+                with_modules,
+            };
+            let trace_config = GpuTraceConfig {
+                log_file,
+                trigger_config,
+                trace_options,
+            };
+            gputrace::run_gputrace(dyno_client, job_id, &pids, process_limit, trace_config)
+        }
+        Command::Nputrace {
+            job_id,
+            pids,
+            log_file,
+            duration_ms,
+            iterations,
+            profile_start_time,
+            start_step,
+            process_limit,
+            record_shapes,
+            profile_memory,
+            with_stack,
+            with_flops,
+            with_modules,
+            activities,
+            analyse,
+            profiler_level,
+            aic_metrics,
+            l2_cache,
+            op_attr,
+            gc_detect_threshold,
+            data_simplification,
+            export_type,
+        } => {
+            let trigger_config = if iterations > 0 {
+                NpuTraceTriggerConfig::IterationBased {
+                    start_step,
+                    iterations,
+                }
+            } else {
+                NpuTraceTriggerConfig::DurationBased {
+                    profile_start_time,
+                    duration_ms,
+                }
+            };
+
+            let trace_options = NpuTraceOptions {
+                record_shapes,
+                profile_memory,
+                with_stack,
+                with_flops,
+                with_modules,
+                activities,
+                analyse,
+                profiler_level,
+                aic_metrics,
+                l2_cache,
+                op_attr,
+                gc_detect_threshold,
+                data_simplification,
+                export_type,
+            };
+            let trace_config = NpuTraceConfig {
+                log_file,
+                trigger_config,
+                trace_options,
+            };
+            nputrace::run_nputrace(dyno_client, job_id, &pids, process_limit, trace_config)
+        }
+        Command::DcgmPause { duration_s } => dcgm::run_dcgm_pause(dyno_client, duration_s),
+        Command::DcgmResume => dcgm::run_dcgm_resume(dyno_client),
+        // ... add new commands here
+    }
+}
\ No newline at end of file
diff --git a/dynolog_npu/scripts/gen_dyno_patches.sh b/dynolog_npu/scripts/gen_dyno_patches.sh
index 7fcd016d2a4..5ade74dbcfc 100644
--- a/dynolog_npu/scripts/gen_dyno_patches.sh
+++ b/dynolog_npu/scripts/gen_dyno_patches.sh
@@ -29,7 +29,10 @@ generate_patches() {
         echo "original_file: ${original_file}"
         # 检查原始文件是否存在
         if [ ! -f "${original_file}" ]; then
-            echo "ERROR: Original file not found: ${original_file}"
+            echo "WARN: Original file not found: ${original_file}"
+
+            cp "${modified_file}" "${original_file}"
+            echo "Copied ${modified_file} to ${original_file}"
             continue
         fi
 
-- 
Gitee