From 7f5e6ea8625ebab7597b4f4c99209f6cd2a2b258 Mon Sep 17 00:00:00 2001
From: Mrtutu <zhangwei983@huawei.com>
Date: Mon, 27 Jan 2025 11:11:33 +0800
Subject: [PATCH] add npu trace

---
 dynolog_npu/README.md                         |  51 ++-
 .../dynolog_npu/cli/src/commands/mod.rs       |  17 +
 .../dynolog_npu/cli/src/commands/nputrace.rs  | 242 ++++++++++++++
 dynolog_npu/dynolog_npu/cli/src/main.rs       | 303 ++++++++++++++++++
 dynolog_npu/scripts/gen_dyno_patches.sh       |   5 +-
 5 files changed, 616 insertions(+), 2 deletions(-)
 create mode 100644 dynolog_npu/dynolog_npu/cli/src/commands/mod.rs
 create mode 100644 dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs
 create mode 100644 dynolog_npu/dynolog_npu/cli/src/main.rs
diff --git a/dynolog_npu/README.md b/dynolog_npu/README.md
index a7ce0249f53..00afb3c3975 100644
--- a/dynolog_npu/README.md
+++ b/dynolog_npu/README.md
@@ -1,6 +1,6 @@
 # Ascend Extension for dynolog
 
-## 安装
+## 安装方式
 
 ### 1. clone 代码
 
@@ -63,3 +63,52 @@ bash scripts/build.sh -t deb
 # 编译rpm包, 当前只支持amd64平台
 bash scripts/build.sh -t rpm
 ```
+
+## 使用方式
+
+### Profiler trace dump功能
+Profiler trace dump功能基于dynolog开发，实现类似于动态profiling的动态触发Ascend Torch Profiler采集profiling的功能。用户基于dyno CLI命令行可以动态触发指定节点的训练进程trace dump。
+
+- 查看nputrace支持的命令和帮助
+
+```bash
+dyno nputrace --help
+```
+
+- nputrace使用方式
+
+```bash
+dyno nputrace [OPTIONS] --log-file <LOG_FILE>
+```
+
+nputrace子命令支持的参数选项
+
+| 参数 | 参数类型 | 说明 |
+|-------|-------|-------|
+| `--record-shapes` | action | 是否采集算子的InputShapes和InputTypes，设置参数采集，默认不采集 |
+| `--profile-memory` | action | 是否采集算子内存信息，设置参数采集，默认不采集 |
+| `--with-stack` | action | 是否采集Python调用栈，设置参数采集，默认不采集 |
+| `--with-flops` | action | 是否采集算子flops，设置参数采集，默认不采集 |
+| `--with-modules` | action | 是否采集modules层级的Python调用栈，设置参数采集，默认不采集 |
+| `--analyse` | action | 采集后是否自动解析，设置参数解析，默认不解析 |
+| `--l2-cache` | action | 是否采集L2 Cache数据，设置参数采集，默认不采集 |
+| `--op-attr` | action | 是否采集算子属性信息，设置参数采集，默认不采集 |
+| `--data-simplification` | String | 解析完成后是否数据精简，可选值范围[`true`, `false`]，默认值`true` |
+| `--activities` | String | 控制CPU、NPU事件采集范围，可选值范围[`CPU,NPU`, `NPU,CPU`, `CPU`, `NPU`]，默认值`CPU,NPU` |
+| `--profiler-level` | String | 控制profiler的采集等级，可选值范围[`Level_none`, `Level0`, `Level1`, `Level2`]，默认值`Level0` |
+| `--aic-metrics` | String | AI Core的性能指标采集项，可选值范围[`AiCoreNone`, `PipeUtilization`, `ArithmeticUtilization`, `Memory`, `MemoryL0`, `ResourceConflictRatio`, `MemoryUB`, `L2Cache`, `MemoryAccess`]，默认值`AiCoreNone` |
+| `--export-type` | String | profiler解析导出数据的类型，可选值范围[`Text`, `Db`]，默认值`Text` |
+| `--gc-detect-threshold` | `Option<f32>` | GC检测阈值，单位ms，只采集超过阈值的GC事件。该参数为可选参数，默认不设置时不开启GC检测 |
+
+- nputrace示例命令
+
+```bash
+# 示例1：采集框架、CANN和device数据，同时采集完后自动解析以及解析完成不做数据精简，落盘路径为/tmp/profile_data
+dyno nputrace --activities CPU,NPU --analyse --data-simplification false --log-file /tmp/profile_data
+
+# 示例2：只采集CANN和device数据，同时采集完后自动解析以及解析完成后开启数据精简，落盘路径为/tmp/profile_data
+dyno nputrace --activities NPU --analyse --data-simplification true --log-file /tmp/profile_data
+
+# 示例3：只采集CANN和device数据，只采集不解析，落盘路径为/tmp/profile_data
+dyno nputrace --activities NPU --log-file /tmp/profile_data
+```
diff --git a/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs b/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs
new file mode 100644
index 00000000000..e4d92f8c6ce
--- /dev/null
+++ b/dynolog_npu/dynolog_npu/cli/src/commands/mod.rs
@@ -0,0 +1,17 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+// Export all command submodules to be used in main.rs
+// Note: This "intermediate" commands module is purely for organizational purposes.
+// This allows for a clear distinction between the command dispatching code and the command
+// handling code. Additionally, explicitly "exporting" all the command modules here allows
+// us to avoid having to explicitly list all the command modules in main.rs.
+
+pub mod dcgm;
+pub mod gputrace;
+pub mod nputrace;
+pub mod status;
+pub mod version;
+// ... add new command modules here
\ No newline at end of file
diff --git a/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs b/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs
new file mode 100644
index 00000000000..4bf7132de33
--- /dev/null
+++ b/dynolog_npu/dynolog_npu/cli/src/commands/nputrace.rs
@@ -0,0 +1,242 @@
+use std::net::TcpStream;
+
+use anyhow::Result;
+use serde_json::Value;
+
+#[path = "utils.rs"]
+mod utils;
+
+#[derive(Debug)]
+pub enum NpuTraceTriggerConfig {
+    DurationBased {
+        profile_start_time: u64, // emitted as PROFILE_START_TIME
+        duration_ms: u64, // emitted as ACTIVITIES_DURATION_MSECS
+    },
+    IterationBased {
+        start_step: u64, // emitted as PROFILE_START_STEP
+        iterations: i64, // emitted as ACTIVITIES_ITERATIONS
+    },
+}
+
+impl NpuTraceTriggerConfig {
+    fn config(&self) -> String {
+        match *self {
+            NpuTraceTriggerConfig::DurationBased {
+                profile_start_time,
+                duration_ms,
+            } => format!(
+                "PROFILE_START_TIME={}\nACTIVITIES_DURATION_MSECS={}",
+                profile_start_time, duration_ms
+            ),
+            NpuTraceTriggerConfig::IterationBased {
+                start_step,
+                iterations,
+            } => format!(
+                r#"PROFILE_START_ITERATION=0
+PROFILE_START_STEP={}
+ACTIVITIES_ITERATIONS={}"#,
+                start_step, iterations
+            ),
+        }
+    }
+}
+
+/// Ascend Torch Profiler options; `config()` renders each field as one `PROFILE_*` line.
+#[derive(Debug)]
+pub struct NpuTraceOptions {
+    pub record_shapes: bool,
+    pub profile_memory: bool,
+    pub with_stack: bool,
+    pub with_flops: bool,
+    pub with_modules: bool,
+    pub activities: String,
+    pub analyse: bool,
+    pub profiler_level: String,
+    pub aic_metrics: String,
+    pub l2_cache: bool,
+    pub op_attr: bool,
+    pub gc_detect_threshold: Option<f32>, // rendered as the text "None" when unset
+    pub data_simplification: String,
+    pub export_type: String,
+}
+
+impl NpuTraceOptions {
+    fn config(&self) -> String {
+        format!(
+            r#"
+PROFILE_RECORD_SHAPES={}
+PROFILE_PROFILE_MEMORY={}
+PROFILE_WITH_STACK={}
+PROFILE_WITH_FLOPS={}
+PROFILE_WITH_MODULES={}
+PROFILE_ACTIVITIES={}
+PROFILE_ANALYSE={}
+PROFILE_PROFILER_LEVEL={}
+PROFILE_AIC_METRICS={}
+PROFILE_L2_CACHE={}
+PROFILE_OP_ATTR={}
+PROFILE_GC_DETECT_THRESHOLD={}
+PROFILE_DATA_SIMPLIFICATION={}
+PROFILE_EXPORT_TYPE={}"#,
+            self.record_shapes,
+            self.profile_memory,
+            self.with_stack,
+            self.with_flops,
+            self.with_modules,
+            self.activities,
+            self.analyse,
+            self.profiler_level,
+            self.aic_metrics,
+            self.l2_cache,
+            self.op_attr,
+            self.gc_detect_threshold.map_or("None".to_string(), |v| v.to_string()),
+            self.data_simplification,
+            self.export_type
+        )
+    }
+}
+
+#[derive(Debug)]
+pub struct NpuTraceConfig {
+    pub log_file: String,
+    pub trigger_config: NpuTraceTriggerConfig,
+    pub trace_options: NpuTraceOptions,
+}
+
+impl NpuTraceConfig {
+    fn config(&self) -> String {
+        format!(
+            "ACTIVITIES_LOG_FILE={}\n{}{}",
+            self.log_file,
+            self.trigger_config.config(),
+            self.trace_options.config()
+        )
+    }
+}
+
+/// Sends an on-demand NPU trace request to dynolog and prints the matched processes.
+/// I/O failures and malformed replies are returned as `Err` instead of panicking.
+pub fn run_nputrace(
+    client: TcpStream,
+    job_id: u64,
+    pids: &str,
+    process_limit: u32,
+    config: NpuTraceConfig,
+) -> Result<()> {
+    let config_str = config.config();
+    println!("NpuTrace config = \n{}", config_str);
+    let config_str = config_str.replace('\n', "\\n");
+
+    let request_json = format!(
+        r#"
+{{
+    "fn": "setKinetOnDemandRequest",
+    "config": "{}",
+    "job_id": {},
+    "pids": [{}],
+    "process_limit": {}
+}}"#,
+        config_str, job_id, pids, process_limit
+    );
+
+    utils::send_msg(&client, &request_json)
+        .map_err(|e| anyhow::anyhow!("Error sending message to service: {:?}", e))?;
+    let resp_str = utils::get_resp(&client)
+        .map_err(|e| anyhow::anyhow!("Unable to decode output bytes: {:?}", e))?;
+    println!("response = {}", resp_str);
+
+    // Decode the pid list in one step so a missing or non-integer entry is an error.
+    let resp_v: Value = serde_json::from_str(&resp_str)?;
+    let processes: Vec<i64> = serde_json::from_value(resp_v["processesMatched"].clone())
+        .map_err(|e| anyhow::anyhow!("unexpected `processesMatched` in response: {}", e))?;
+
+    if processes.is_empty() {
+        println!("No processes were matched, please check --job-id or --pids flags");
+    } else {
+        println!("Matched {} processes", processes.len());
+        println!("Trace output files will be written to:");
+        for pid in processes {
+            let suffix = format!("_{}.json", pid);
+            println!("    {}", config.log_file.replace(".json", &suffix));
+        }
+    }
+
+    Ok(())
+}
+
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    // Each trigger variant renders only its own key=value lines, with no trailing newline.
+    #[test]
+    fn test_nputrace_trigger_config() {
+        let trigger_config = NpuTraceTriggerConfig::DurationBased {
+            profile_start_time: 1000,
+            duration_ms: 1000,
+        };
+        assert_eq!(
+            trigger_config.config(),
+            "PROFILE_START_TIME=1000\nACTIVITIES_DURATION_MSECS=1000"
+        );
+
+        let trigger_config = NpuTraceTriggerConfig::IterationBased {
+            start_step: 1000,
+            iterations: 1000,
+        };
+        assert_eq!(
+            trigger_config.config(),
+            "PROFILE_START_ITERATION=0\nPROFILE_START_STEP=1000\nACTIVITIES_ITERATIONS=1000"
+        );
+    }
+
+    // The options block begins with a newline, so it follows the trigger lines directly;
+    // an optional threshold that is set renders as its plain number.
+    #[test]
+    fn test_nputrace_config() {
+        let config = NpuTraceConfig {
+            log_file: "test.json".to_string(),
+            trigger_config: NpuTraceTriggerConfig::DurationBased {
+                profile_start_time: 1000,
+                duration_ms: 1000,
+            },
+            trace_options: NpuTraceOptions {
+                record_shapes: true,
+                profile_memory: false,
+                with_stack: true,
+                with_flops: true,
+                with_modules: true,
+                activities: "CPU,NPU".to_string(),
+                analyse: false,
+                profiler_level: "Level0".to_string(),
+                aic_metrics: "AiCoreNone".to_string(),
+                l2_cache: true,
+                op_attr: true,
+                gc_detect_threshold: Some(0.1),
+                data_simplification: "true".to_string(),
+                export_type: "Text".to_string(),
+            },
+        };
+        assert_eq!(
+            config.config(),
+            r#"ACTIVITIES_LOG_FILE=test.json
+PROFILE_START_TIME=1000
+ACTIVITIES_DURATION_MSECS=1000
+PROFILE_RECORD_SHAPES=true
+PROFILE_PROFILE_MEMORY=false
+PROFILE_WITH_STACK=true
+PROFILE_WITH_FLOPS=true
+PROFILE_WITH_MODULES=true
+PROFILE_ACTIVITIES=CPU,NPU
+PROFILE_ANALYSE=false
+PROFILE_PROFILER_LEVEL=Level0
+PROFILE_AIC_METRICS=AiCoreNone
+PROFILE_L2_CACHE=true
+PROFILE_OP_ATTR=true
+PROFILE_GC_DETECT_THRESHOLD=0.1
+PROFILE_DATA_SIMPLIFICATION=true
+PROFILE_EXPORT_TYPE=Text"#
+        );
+    }
+}
diff --git a/dynolog_npu/dynolog_npu/cli/src/main.rs b/dynolog_npu/dynolog_npu/cli/src/main.rs
new file mode 100644
index 00000000000..7a3d04a7138
--- /dev/null
+++ b/dynolog_npu/dynolog_npu/cli/src/main.rs
@@ -0,0 +1,303 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+use std::net::TcpStream;
+use std::net::ToSocketAddrs;
+
+use anyhow::Result;
+use clap::Parser;
+
+// Make all the command modules accessible to this file.
+mod commands;
+use commands::gputrace::GpuTraceConfig;
+use commands::gputrace::GpuTraceOptions;
+use commands::gputrace::GpuTraceTriggerConfig;
+use commands::nputrace::NpuTraceConfig;
+use commands::nputrace::NpuTraceOptions;
+use commands::nputrace::NpuTraceTriggerConfig;
+use commands::*;
+
+/// Instructions on adding a new Dyno CLI command:
+///
+/// 1. Add a new variant to the `Command` enum.
+///    Please include a description of the command and, if applicable, its flags/subcommands.
+///
+/// 2. Create a new file for the command's implementation in the commands/ directory (ie
+///    commands/status.rs). This new file is where the command should be implemented.
+///    Make the new command's module accessible from this file by adding
+///    a new line with `pub mod <newfile>;` to commands/mod.rs.
+///
+///
+/// 3. Add a branch to the match statement in main() to handle the new enum variant (from step 1).
+///    From here, invoke the handling logic defined in the new file (from step 2). In an effort to keep
+///    the command dispatching logic clear and concise, please keep the code in the match branch to a minimum.
+
+const DYNO_PORT: u16 = 1778;
+
+#[derive(Debug, Parser)]
+struct Opts {
+    #[clap(long, default_value = "localhost")]
+    hostname: String,
+    #[clap(long, default_value_t = DYNO_PORT)]
+    port: u16,
+    #[clap(subcommand)]
+    cmd: Command,
+}
+
+#[derive(Debug, Parser)]
+enum Command {
+    /// Check the status of a dynolog process
+    Status,
+    /// Check the version of a dynolog process
+    Version,
+    /// Capture gputrace
+    Gputrace {
+        /// Job id of the application to trace
+        #[clap(long, default_value_t = 0)]
+        job_id: u64,
+        /// List of pids to capture trace for (comma separated).
+        #[clap(long, default_value = "0")]
+        pids: String,
+        /// Duration of trace to collect in ms.
+        #[clap(long, default_value_t = 500)]
+        duration_ms: u64,
+        /// Training iterations to collect, this takes precedence over duration.
+        #[clap(long, default_value_t = -1)]
+        iterations: i64,
+        /// Log file for trace.
+        #[clap(long)]
+        log_file: String,
+        /// Unix timestamp used for synchronized collection (milliseconds since epoch)
+        #[clap(long, default_value_t = 0)]
+        profile_start_time: u64,
+        /// Start iteration roundup, starts an iteration based trace at a multiple
+        /// of this value.
+        #[clap(long, default_value_t = 1)]
+        profile_start_iteration_roundup: u64,
+        /// Max number of processes to profile
+        #[clap(long, default_value_t = 3)]
+        process_limit: u32,
+        /// Record PyTorch operator input shapes and types
+        #[clap(long, action)]
+        record_shapes: bool,
+        /// Profile PyTorch memory
+        #[clap(long, action)]
+        profile_memory: bool,
+        /// Capture Python stacks in traces
+        #[clap(long, action)]
+        with_stacks: bool,
+        /// Annotate operators with analytical flops
+        #[clap(long, action)]
+        with_flops: bool,
+        /// Capture PyTorch operator modules in traces
+        #[clap(long, action)]
+        with_modules: bool,
+    },
+    /// Capture nputrace. Subcommand functions aligned with Ascend Torch Profiler
+    Nputrace {
+        /// Job id of the application to trace
+        #[clap(long, default_value_t = 0)]
+        job_id: u64,
+        /// List of pids to capture trace for (comma separated).
+        #[clap(long, default_value = "0")]
+        pids: String,
+        /// Duration of trace to collect in ms.
+        #[clap(long, default_value_t = 500)]
+        duration_ms: u64,
+        /// Training iterations to collect, this takes precedence over duration.
+        #[clap(long, default_value_t = -1)]
+        iterations: i64,
+        /// Log file for trace.
+        #[clap(long)]
+        log_file: String,
+        /// Unix timestamp used for synchronized collection (milliseconds since epoch)
+        #[clap(long, default_value_t = 0)]
+        profile_start_time: u64,
+        /// Number of steps to start profile
+        #[clap(long, default_value_t = 0)]
+        start_step: u64,
+        /// Max number of processes to profile
+        #[clap(long, default_value_t = 3)]
+        process_limit: u32,
+        /// Whether to record PyTorch operator input shapes and types
+        #[clap(long, action)]
+        record_shapes: bool,
+        /// Whether to profile PyTorch memory
+        #[clap(long, action)]
+        profile_memory: bool,
+        /// Whether to profile the Python call stack in trace
+        #[clap(long, action)]
+        with_stack: bool,
+        /// Annotate operators with analytical flops
+        #[clap(long, action)]
+        with_flops: bool,
+        /// Whether to profile PyTorch operator modules in traces
+        #[clap(long, action)]
+        with_modules: bool,
+        /// The scope of the profile's events
+        #[clap(long, value_parser = ["CPU,NPU", "NPU,CPU", "CPU", "NPU"], default_value = "CPU,NPU")]
+        activities: String,
+        /// Profiler level
+        #[clap(long, value_parser = ["Level0", "Level1", "Level2", "Level_none"], default_value = "Level0")]
+        profiler_level: String,
+        /// AIC metrics
+        #[clap(long, value_parser = ["AiCoreNone", "PipeUtilization", "ArithmeticUtilization", "Memory", "MemoryL0", "ResourceConflictRatio", "MemoryUB", "L2Cache", "MemoryAccess"], default_value = "AiCoreNone")]
+        aic_metrics: String,
+        /// Whether to analyse the data after collection
+        #[clap(long, action)]
+        analyse: bool,
+        /// Whether to collect L2 cache
+        #[clap(long, action)]
+        l2_cache: bool,
+        /// Whether to collect op attributes
+        #[clap(long, action)]
+        op_attr: bool,
+        /// GC detect threshold
+        #[clap(long)]
+        gc_detect_threshold: Option<f32>,
+        /// Whether to streamline data after analyse is complete 
+        #[clap(long, value_parser = ["true", "false"], default_value = "true")]
+        data_simplification: String,
+        /// Types of data exported by the profiler
+        #[clap(long, value_parser = ["Text", "Db"], default_value = "Text")]
+        export_type: String,
+    },
+    /// Pause dcgm profiling. This enables running tools like Nsight compute and avoids conflicts.
+    DcgmPause {
+        /// Duration to pause dcgm profiling in seconds
+        #[clap(long, default_value_t = 300)]
+        duration_s: i32,
+    },
+    /// Resume dcgm profiling
+    DcgmResume,
+}
+
+/// Create a socket connection to dynolog.
+///
+/// Every address `host` resolves to is tried in turn (`TcpStream::connect` does this for
+/// a slice of addresses), and a host that resolves to nothing yields an error, not a panic.
+fn create_dyno_client(host: &str, port: u16) -> Result<TcpStream> {
+    let addrs: Vec<_> = (host, port).to_socket_addrs()?.collect();
+
+    TcpStream::connect(addrs.as_slice()).map_err(|err| err.into())
+}
+
+fn main() -> Result<()> {
+    let Opts {
+        hostname,
+        port,
+        cmd,
+    } = Opts::parse();
+
+    let dyno_client =
+        create_dyno_client(&hostname, port).expect("Couldn't connect to the server...");
+
+    match cmd {
+        Command::Status => status::run_status(dyno_client),
+        Command::Version => version::run_version(dyno_client),
+        Command::Gputrace {
+            job_id,
+            pids,
+            log_file,
+            duration_ms,
+            iterations,
+            profile_start_time,
+            profile_start_iteration_roundup,
+            process_limit,
+            record_shapes,
+            profile_memory,
+            with_stacks,
+            with_flops,
+            with_modules,
+        } => {
+            let trigger_config = if iterations > 0 {
+                GpuTraceTriggerConfig::IterationBased {
+                    profile_start_iteration_roundup,
+                    iterations,
+                }
+            } else {
+                GpuTraceTriggerConfig::DurationBased {
+                    profile_start_time,
+                    duration_ms,
+                }
+            };
+            let trace_options = GpuTraceOptions {
+                record_shapes,
+                profile_memory,
+                with_stacks,
+                with_flops,
+                with_modules,
+            };
+            let trace_config = GpuTraceConfig {
+                log_file,
+                trigger_config,
+                trace_options,
+            };
+            gputrace::run_gputrace(dyno_client, job_id, &pids, process_limit, trace_config)
+        }
+        Command::Nputrace {
+            job_id,
+            pids,
+            log_file,
+            duration_ms,
+            iterations,
+            profile_start_time,
+            start_step,
+            process_limit,
+            record_shapes,
+            profile_memory,
+            with_stack,
+            with_flops,
+            with_modules,
+            activities,
+            analyse,
+            profiler_level,
+            aic_metrics,
+            l2_cache,
+            op_attr,
+            gc_detect_threshold,
+            data_simplification,
+            export_type,
+        } => {
+            let trigger_config = if iterations > 0 {
+                NpuTraceTriggerConfig::IterationBased {
+                    start_step,
+                    iterations,
+                }
+            } else {
+                NpuTraceTriggerConfig::DurationBased {
+                    profile_start_time,
+                    duration_ms,
+                }
+            };
+
+            let trace_options = NpuTraceOptions {
+                record_shapes,
+                profile_memory,
+                with_stack,
+                with_flops,
+                with_modules,
+                activities,
+                analyse,
+                profiler_level,
+                aic_metrics,
+                l2_cache,
+                op_attr,
+                gc_detect_threshold,
+                data_simplification,
+                export_type,
+            };
+            let trace_config = NpuTraceConfig {
+                log_file,
+                trigger_config,
+                trace_options,
+            };
+            nputrace::run_nputrace(dyno_client, job_id, &pids, process_limit, trace_config)
+        }
+        Command::DcgmPause { duration_s } => dcgm::run_dcgm_pause(dyno_client, duration_s),
+        Command::DcgmResume => dcgm::run_dcgm_resume(dyno_client),
+        // ... add new commands here
+    }
+}
\ No newline at end of file
diff --git a/dynolog_npu/scripts/gen_dyno_patches.sh b/dynolog_npu/scripts/gen_dyno_patches.sh
index 7fcd016d2a4..5ade74dbcfc 100644
--- a/dynolog_npu/scripts/gen_dyno_patches.sh
+++ b/dynolog_npu/scripts/gen_dyno_patches.sh
@@ -29,7 +29,10 @@ generate_patches() {
         echo "original_file: ${original_file}"
         # 检查原始文件是否存在
         if [ ! -f "${original_file}" ]; then
-            echo "ERROR: Original file not found: ${original_file}"
+            echo "WARN: Original file not found: ${original_file}"
+
+            cp "${modified_file}" "${original_file}"
+            echo "Copied ${modified_file} to ${original_file}"
             continue
         fi
         
-- 
Gitee