diff --git a/msmonitor/docs/nputrace.md b/msmonitor/docs/nputrace.md index 379f7da34ad29d946c69f1ba8dea67762be3a61a..d5194edc24f88413dd342a2807c0940bf795f1b4 100644 --- a/msmonitor/docs/nputrace.md +++ b/msmonitor/docs/nputrace.md @@ -23,7 +23,7 @@ nputrace的SUBCOMMANDS(子命令)选项如下: | --duration-ms | u64 | 采集的周期,单位毫秒,默认值500,dynolog原生参数 | N | N | N | | --iterations | i64 | 采集总迭代数,默认值-1,dynolog原生参数,需与start-step参数同时指定 | Y | Y | Y | | --log-file | String | 采集落盘的路径 | Y | Y | Y | -| --start-step | u64 | 开始采集的迭代数,默认值0 | Y | Y | Y | +| --start-step | i64 | 开始采集的迭代数,默认值0,设置为-1时表示从下一个step开始采集 | Y | Y | Y | | --record-shapes | action | 是否采集算子的InputShapes和InputTypes,设置参数采集,默认不采集 | Y | Y | N | | --profile-memory | action | 是否采集算子内存信息,设置参数采集,默认不采集 | Y | Y | N | | --with-stack | action | 是否采集Python调用栈,设置参数采集,默认不采集 | Y | Y | N | @@ -77,13 +77,17 @@ Step 4:使用dyno CLI动态触发trace dump # 示例1:从第10个step开始采集,采集2个step,采集框架、CANN和device数据,同时采集完后自动解析以及解析完成不做数据精简,落盘路径为/tmp/profile_data dyno --certs-dir /home/client_certs nputrace --start-step 10 --iterations 2 --activities CPU,NPU --analyse --data-simplification false --log-file /tmp/profile_data -# 示例2:从第10个step开始采集,采集2个step,只采集CANN和device数据,同时采集完后自动解析以及解析完成后开启数据精简,落盘路径为/tmp/profile_data +# 示例2:从下一个step开始采集,采集2个step,采集框架、CANN和device数据,同时采集完后自动解析以及解析完成不做数据精简,落盘路径为/tmp/profile_data +dyno --certs-dir /home/client_certs nputrace --start-step -1 --iterations 2 --activities CPU,NPU --analyse --data-simplification false --log-file /tmp/profile_data + +# 示例3:从第10个step开始采集,采集2个step,只采集CANN和device数据,同时采集完后自动解析以及解析完成后开启数据精简,落盘路径为/tmp/profile_data dyno --certs-dir /home/client_certs nputrace --start-step 10 --iterations 2 --activities NPU --analyse --data-simplification true --log-file /tmp/profile_data -# 示例3:从第10个step开始采集,采集2个step,只采集CANN和device数据,只采集不解析,落盘路径为/tmp/profile_data +# 示例4:从第10个step开始采集,采集2个step,只采集CANN和device数据,只采集不解析,落盘路径为/tmp/profile_data + dyno --certs-dir /home/client_certs nputrace --start-step 10 --iterations 2 --activities NPU --log-file /tmp/profile_data -# 示例4:多机场景下向特定机器x.x.x.x发送参数信息,参数表示从第10个step开始采集,采集2个step,只采集CANN和device数据,只采集不解析,落盘路径为/tmp/profile_data +# 示例5:多机场景下向特定机器x.x.x.x发送参数信息,参数表示从第10个step开始采集,采集2个step,只采集CANN和device数据,只采集不解析,落盘路径为/tmp/profile_data dyno --certs-dir /home/client_certs --hostname x.x.x.x nputrace --start-step 10 --iterations 2 --activities NPU --log-file /tmp/profile_data ``` nputrace落盘的数据格式和交付件介绍请参考[Profiler数据目录说明](https://www.hiascend.com/document/detail/zh/mindstudio/81RC1/T&ITools/Profiling/atlasprofiling_16_0177.html#ZH-CN_TOPIC_0000002387356237) \ No newline at end of file diff --git a/msmonitor/dynolog_npu/cli/src/commands/nputrace.rs b/msmonitor/dynolog_npu/cli/src/commands/nputrace.rs index ef84b3038df9014a184a61e5d37d50263af3a722..66f5d576c8381e90e3debaf7b2d351670997cce9 100644 --- a/msmonitor/dynolog_npu/cli/src/commands/nputrace.rs +++ b/msmonitor/dynolog_npu/cli/src/commands/nputrace.rs @@ -1,3 +1,8 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// Copyright (c) 2025-2025. Huawei Technologies Co., Ltd. All rights reserved. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. use anyhow::Result; use serde_json::Value; use crate::DynoClient; @@ -10,7 +15,7 @@ pub enum NpuTraceTriggerConfig { duration_ms: u64, }, IterationBased { - start_step: u64, + start_step: i64, iterations: i64, }, } @@ -196,7 +201,7 @@ mod test { ACTIVITIES_DURATION_MSECS=1000"# ); - let trigger_config = NpuTraceTriggerConfig::IterationBased { + let trigger_config = NpuTraceTriggerConfig::IterationBased { profile_start_step: 1000, iterations: 1000, }; diff --git a/msmonitor/dynolog_npu/cli/src/main.rs b/msmonitor/dynolog_npu/cli/src/main.rs index 7ee9d1be02657e711a2984c3a69ca78b02b34845..d6ee4ebcaa9e34f57a594ecfeb07b6574ed1b121 100644 --- a/msmonitor/dynolog_npu/cli/src/main.rs +++ b/msmonitor/dynolog_npu/cli/src/main.rs @@ -78,7 +78,7 @@ fn parse_mspti_activity_kinds(src: &str) -> Result{ return Err(format!("Invalid MSPTI activity kind: {}, Possible values: {:?}.]", kind, allowed_values)); } } - + Ok(src.to_string()) } @@ -103,6 +103,16 @@ fn parse_host_sys(src: &str) -> Result{ Ok(result) } +const INSTANT_START_STEP: i64 = -1; // nputrace子命令,表示从下个step开启采集任务 + +fn parse_start_step(src: &str) -> Result { + let start_step = src.trim().parse::().map_err(|e| format!("{}", e))?; + if start_step < INSTANT_START_STEP { + return Err(format!("Must be non-negative integer or {}", INSTANT_START_STEP)); + } + Ok(start_step) +} + #[derive(Debug, Parser)] enum Command { /// Check the status of a dynolog process @@ -164,7 +174,7 @@ enum Command { #[clap(long, default_value_t = 500)] duration_ms: u64, /// Training iterations to collect, this takes precedence over duration. - #[clap(long, default_value_t = -1)] + #[clap(long, default_value_t = -1, allow_negative_numbers = true)] iterations: i64, /// Log file for trace. #[clap(long)] @@ -172,9 +182,9 @@ enum Command { /// Unix timestamp used for synchronized collection (milliseconds since epoch). #[clap(long, default_value_t = 0)] profile_start_time: u64, - /// Number of steps to start profile. - #[clap(long, default_value_t = 0)] - start_step: u64, + /// Number of steps to start profile, -1 means start from next step. + #[clap(long, default_value_t = 0, value_parser = parse_start_step, allow_negative_numbers = true)] + start_step: i64, /// Max number of processes to profile. #[clap(long, default_value_t = 3)] process_limit: u32, @@ -285,12 +295,12 @@ fn verify_certificate(cert_der: &[u8], is_root_cert: bool) -> Result<()> { // 检查证书签名算法 let sig_alg = cert.signature_algorithm.algorithm; - + // 定义不安全的算法 OID let md2_rsa = oid!(1.2.840.113549.1.1.2); // MD2 with RSA let md5_rsa = oid!(1.2.840.113549.1.1.4); // MD5 with RSA let sha1_rsa = oid!(1.2.840.113549.1.1.5); // SHA1 with RSA - + // 检查是否使用不安全的算法 if sig_alg == md2_rsa || sig_alg == md5_rsa || sig_alg == sha1_rsa { return Err(io::Error::new( @@ -428,7 +438,7 @@ fn is_cert_revoked(cert_der: &[u8], crl_path: &PathBuf) -> Result { let crl_data = read_to_string(crl_path)?; let (_, pem) = pem::parse_x509_pem(crl_data.as_bytes()) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, format!("Failed to parse CRL PEM: {:?}", e)))?; - + // 解析 CRL let (_, crl) = CertificateRevocationList::from_der(&pem.contents) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, format!("Failed to parse CRL: {:?}", e)))?; @@ -472,7 +482,7 @@ fn is_cert_revoked(cert_der: &[u8], crl_path: &PathBuf) -> Result { for revoked in crl.iter_revoked_certificates() { let revoked_serial = revoked.user_certificate.to_bigint() .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Failed to convert revoked certificate serial to BigInt"))?; - + if revoked_serial == cert_serial { return Ok(true); } @@ -486,7 +496,7 @@ enum DynoClient { } fn create_dyno_client( - host: &str, + host: &str, port: u16, certs_dir: &str, ) -> Result { @@ -507,7 +517,7 @@ fn create_dyno_client( } fn create_dyno_client_with_no_certs( - host: &str, + host: &str, port: u16, ) -> Result { let addr = (host, port) @@ -533,7 +543,7 @@ fn secure_clear_password(password: &mut Vec) { } fn create_dyno_client_with_certs( - host: &str, + host: &str, port: u16, config: &ClientConfigPath, ) -> Result> { @@ -563,7 +573,7 @@ fn create_dyno_client_with_certs( let cert_file = File::open(&config.cert_path)?; let mut cert_reader = BufReader::new(cert_file); let certs = rustls_pemfile::certs(&mut cert_reader)?; - + // 检查客户端证书的基本要求 for cert in &certs { verify_certificate(cert, false)?; // 验证客户端证书 @@ -601,7 +611,7 @@ fn create_dyno_client_with_certs( println!("Loading client key from: {}", config.key_path.display()); let key_file = File::open(&config.key_path)?; let mut key_reader = BufReader::new(key_file); - + // 检查私钥是否加密 let mut key_data = Vec::new(); key_reader.read_to_end(&mut key_data)?; @@ -614,10 +624,10 @@ fn create_dyno_client_with_certs( let mut password = prompt_password("Please enter the certificate password: ")?.into_bytes(); let pkey = PKey::private_key_from_pem_passphrase(&key_data, &password) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, format!("Failed to decrypt private key: {}", e)))?; - + // 手动清除密码 secure_clear_password(&mut password); - + // 返回私钥 vec![pkey.private_key_to_der() .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, format!("Failed to convert private key to DER: {}", e)))?] @@ -626,7 +636,7 @@ fn create_dyno_client_with_certs( let mut key_reader = BufReader::new(File::open(&config.key_path)?); rustls_pemfile::pkcs8_private_keys(&mut key_reader)? }; - + if keys.is_empty() { return Err(io::Error::new( io::ErrorKind::InvalidData,