From 42356ac4114419ccf92a53bfaacd9cff282fd992 Mon Sep 17 00:00:00 2001
From: Mrtutu
Date: Mon, 17 Feb 2025 21:56:01 +0800
Subject: [PATCH] add npumonitor param

---
 .../dynolog_npu/cli/src/commands/mod.rs       |  1 +
 .../cli/src/commands/npumonitor.rs            | 78 +++++++++++++++++
 dynolog_npu/dynolog_npu/cli/src/main.rs       | 84 +++++++++++++------
 3 files changed, 136 insertions(+), 27 deletions(-)
 create mode 100644 dynolog_npu/dynolog_npu/cli/src/commands/npumonitor.rs

diff --git a/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs b/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs
index e4d92f8c6ce..18950d3c1a0 100644
--- a/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs
+++ b/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs
@@ -12,6 +12,7 @@
 pub mod dcgm;
 pub mod gputrace;
 pub mod nputrace;
+pub mod npumonitor;
 pub mod status;
 pub mod version;
 // ... add new command modules here
\ No newline at end of file
diff --git a/dynolog_npu/dynolog_npu/cli/src/commands/npumonitor.rs b/dynolog_npu/dynolog_npu/cli/src/commands/npumonitor.rs
new file mode 100644
index 00000000000..1edfaea5939
--- /dev/null
+++ b/dynolog_npu/dynolog_npu/cli/src/commands/npumonitor.rs
@@ -0,0 +1,78 @@
+use std::net::TcpStream;
+
+use anyhow::Result;
+
+#[path = "utils.rs"]
+mod utils;
+
+/// Options of the `NpuMonitor` command, forwarded to the service as a
+/// `KEY=value` config block.
+#[derive(Debug)]
+pub struct NpuMonitorConfig {
+    /// Emitted as `NPU_MONITOR_START`.
+    pub npu_monitor_start: bool,
+    /// Emitted as `NPU_MONITOR_STOP`.
+    pub npu_monitor_stop: bool,
+    /// Emitted as `REPORT_INTERVAL_S`.
+    pub report_interval_s: u32,
+    /// Emitted as `MSPTI_ACTIVITY_KIND`.
+    pub mspti_activity_kind: String,
+}
+
+impl NpuMonitorConfig {
+    /// Renders the options as newline-separated `KEY=value` lines.
+    ///
+    /// The returned string starts with a newline, mirroring the raw string below.
+    fn config(&self) -> String {
+        format!(
+            r#"
+NPU_MONITOR_START={}
+NPU_MONITOR_STOP={}
+REPORT_INTERVAL_S={}
+MSPTI_ACTIVITY_KIND={}"#,
+            self.npu_monitor_start,
+            self.npu_monitor_stop,
+            self.report_interval_s,
+            self.mspti_activity_kind
+        )
+    }
+}
+
+/// Sends the NPU monitor config to the service and prints its response.
+///
+/// The config is embedded in a `setKinetOnDemandRequest` JSON request with
+/// `job_id` 0, `pids` `[0]` and `process_limit` 3.
+///
+/// # Panics
+///
+/// Panics if sending the request or reading the response fails.
+pub fn run_npumonitor(
+    client: TcpStream,
+    config: NpuMonitorConfig,
+) -> Result<()> {
+    let config_str = config.config();
+    println!("Npu monitor config = \n{}", config_str);
+    // The config is placed inside a JSON string literal, so real newlines are
+    // replaced by the two-character sequence `\n`.
+    let config_str = config_str.replace('\n', "\\n");
+
+    let request_json = format!(
+        r#"
+{{
+    "fn": "setKinetOnDemandRequest",
+    "config": "{}",
+    "job_id": 0,
+    "pids": [0],
+    "process_limit": 3
+}}"#,
+        config_str
+    );
+
+    utils::send_msg(&client, &request_json).expect("Error sending message to service");
+
+    let resp_str = utils::get_resp(&client).expect("Unable to decode output bytes");
+
+    println!("response = {}", resp_str);
+
+    Ok(())
+}
diff --git a/dynolog_npu/dynolog_npu/cli/src/main.rs b/dynolog_npu/dynolog_npu/cli/src/main.rs
index 7a3d04a7138..6d48508268f 100644
--- a/dynolog_npu/dynolog_npu/cli/src/main.rs
+++ b/dynolog_npu/dynolog_npu/cli/src/main.rs
@@ -17,6 +17,7 @@ use commands::gputrace::GpuTraceTriggerConfig;
 use commands::nputrace::NpuTraceConfig;
 use commands::nputrace::NpuTraceOptions;
 use commands::nputrace::NpuTraceTriggerConfig;
+use commands::npumonitor::NpuMonitorConfig;
 use commands::*;
 
 /// Instructions on adding a new Dyno CLI command:
@@ -54,7 +55,7 @@ enum Command {
     Version,
     /// Capture gputrace
     Gputrace {
-        /// Job id of the application to trace
+        /// Job id of the application to trace.
         #[clap(long, default_value_t = 0)]
         job_id: u64,
         /// List of pids to capture trace for (comma separated).
@@ -69,35 +70,35 @@ enum Command {
         /// Log file for trace.
         #[clap(long)]
         log_file: String,
-        /// Unix timestamp used for synchronized collection (milliseconds since epoch)
+        /// Unix timestamp used for synchronized collection (milliseconds since epoch).
         #[clap(long, default_value_t = 0)]
         profile_start_time: u64,
         /// Start iteration roundup, starts an iteration based trace at a multiple
         /// of this value.
         #[clap(long, default_value_t = 1)]
         profile_start_iteration_roundup: u64,
-        /// Max number of processes to profile
+        /// Max number of processes to profile.
         #[clap(long, default_value_t = 3)]
         process_limit: u32,
-        /// Record PyTorch operator input shapes and types
+        /// Record PyTorch operator input shapes and types.
         #[clap(long, action)]
         record_shapes: bool,
-        /// Profile PyTorch memory
+        /// Profile PyTorch memory.
         #[clap(long, action)]
         profile_memory: bool,
-        /// Capture Python stacks in traces
+        /// Capture Python stacks in traces.
         #[clap(long, action)]
         with_stacks: bool,
-        /// Annotate operators with analytical flops
+        /// Annotate operators with analytical flops.
         #[clap(long, action)]
         with_flops: bool,
-        /// Capture PyTorch operator modules in traces
+        /// Capture PyTorch operator modules in traces.
         #[clap(long, action)]
         with_modules: bool,
     },
-    /// Capture nputrace. Subcommand functions aligned with Ascend Torch Profiler
+    /// Capture nputrace. Subcommand functions aligned with Ascend Torch Profiler.
     Nputrace {
-        /// Job id of the application to trace
+        /// Job id of the application to trace.
         #[clap(long, default_value_t = 0)]
         job_id: u64,
         /// List of pids to capture trace for (comma separated).
@@ -112,58 +113,73 @@ enum Command {
         /// Log file for trace.
         #[clap(long)]
         log_file: String,
-        /// Unix timestamp used for synchronized collection (milliseconds since epoch)
+        /// Unix timestamp used for synchronized collection (milliseconds since epoch).
         #[clap(long, default_value_t = 0)]
         profile_start_time: u64,
-        /// Number of steps to start profile
+        /// Number of steps to start profile.
         #[clap(long, default_value_t = 0)]
         start_step: u64,
-        /// Max number of processes to profile
+        /// Max number of processes to profile.
         #[clap(long, default_value_t = 3)]
         process_limit: u32,
-        /// Whether to record PyTorch operator input shapes and types
+        /// Whether to record PyTorch operator input shapes and types.
         #[clap(long, action)]
         record_shapes: bool,
-        /// Whether to profile PyTorch memory
+        /// Whether to profile PyTorch memory.
         #[clap(long, action)]
         profile_memory: bool,
-        /// Whether to profile the Python call stack in trace
+        /// Whether to profile the Python call stack in trace.
         #[clap(long, action)]
         with_stack: bool,
-        /// Annotate operators with analytical flops
+        /// Annotate operators with analytical flops.
         #[clap(long, action)]
         with_flops: bool,
-        /// Whether to profile PyTorch operator modules in traces
+        /// Whether to profile PyTorch operator modules in traces.
         #[clap(long, action)]
         with_modules: bool,
-        /// The scope of the profile's events
+        /// The scope of the profile's events.
         #[clap(long, value_parser = ["CPU,NPU", "NPU,CPU", "CPU", "NPU"], default_value = "CPU,NPU")]
         activities: String,
-        /// Profiler level
+        /// Profiler level.
         #[clap(long, value_parser = ["Level0", "Level1", "Level2", "Level_none"], default_value = "Level0")]
         profiler_level: String,
-        /// AIC metrics
+        /// AIC metrics.
         #[clap(long, value_parser = ["AiCoreNone", "PipeUtilization", "ArithmeticUtilization", "Memory", "MemoryL0", "ResourceConflictRatio", "MemoryUB", "L2Cache", "MemoryAccess"], default_value = "AiCoreNone")]
         aic_metrics: String,
-        /// Whether to analyse the data after collection
+        /// Whether to analyse the data after collection.
         #[clap(long, action)]
         analyse: bool,
-        /// Whether to collect L2 cache
+        /// Whether to collect L2 cache.
         #[clap(long, action)]
         l2_cache: bool,
-        /// Whether to collect op attributes
+        /// Whether to collect op attributes.
         #[clap(long, action)]
         op_attr: bool,
-        /// GC detect threshold
+        /// GC detect threshold.
         #[clap(long)]
         gc_detect_threshold: Option<f32>,
-        /// Whether to streamline data after analyse is complete
+        /// Whether to streamline data after analyse is complete.
         #[clap(long, value_parser = ["true", "false"], default_value = "true")]
         data_simplification: String,
-        /// Types of data exported by the profiler
+        /// Types of data exported by the profiler.
         #[clap(long, value_parser = ["Text", "Db"], default_value = "Text")]
         export_type: String,
     },
+    /// Ascend MSPTI Monitor.
+    NpuMonitor {
+        /// Start NPU monitor.
+        #[clap(long, action)]
+        npu_monitor_start: bool,
+        /// Stop NPU monitor.
+        #[clap(long, action)]
+        npu_monitor_stop: bool,
+        /// NPU monitor report interval in seconds.
+        #[clap(long, default_value_t = 60)]
+        report_interval_s: u32,
+        /// MSPTI collect activity kind.
+        #[clap(long, value_parser = ["Marker", "Kernel", "API", "Hccl", "Memory", "MemSet", "MemCpy"], default_value = "Marker")]
+        mspti_activity_kind: String,
+    },
     /// Pause dcgm profiling. This enables running tools like Nsight compute and avoids conflicts.
     DcgmPause {
         /// Duration to pause dcgm profiling in seconds
@@ -296,6 +312,20 @@ fn main() -> Result<()> {
             };
             nputrace::run_nputrace(dyno_client, job_id, &pids, process_limit, trace_config)
         }
+        Command::NpuMonitor {
+            npu_monitor_start,
+            npu_monitor_stop,
+            report_interval_s,
+            mspti_activity_kind,
+        } => {
+            let npu_mon_config = NpuMonitorConfig {
+                npu_monitor_start,
+                npu_monitor_stop,
+                report_interval_s,
+                mspti_activity_kind,
+            };
+            npumonitor::run_npumonitor(dyno_client, npu_mon_config)
+        }
         Command::DcgmPause { duration_s } => dcgm::run_dcgm_pause(dyno_client, duration_s),
         Command::DcgmResume => dcgm::run_dcgm_resume(dyno_client),
         // ... add new commands here
-- 
Gitee