diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index 60257b14b2ec2a5958d771e36e10c349f79aaaac..356b3ddd21d52e12df3c3a1557b7e0adc49f6334 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -215,16 +215,40 @@ class BaseDataProcessor: ndarray_json.update({'type': 'numpy.ndarray'}) ndarray_json.update({'dtype': str(ndarray.dtype)}) ndarray_json.update({'shape': ndarray.shape}) - if ndarray.size > 0: - ndarray_json.update({"Max": np.max(ndarray).item()}) - ndarray_json.update({"Min": np.min(ndarray).item()}) - ndarray_json.update({"Mean": np.mean(ndarray).item()}) - ndarray_json.update({"Norm": np.linalg.norm(ndarray).item()}) - else: - ndarray_json.update({"Max": None}) - ndarray_json.update({"Min": None}) - ndarray_json.update({"Mean": None}) - ndarray_json.update({"Norm": None}) + + # 先初始化默认值 + stats = { + "Max": None, + "Min": None, + "Mean": None, + "Norm": None + } + + try: + # 只有非空时才尝试计算 + if ndarray.size > 0: + stats = { + "Max": np.max(ndarray).item(), + "Min": np.min(ndarray).item(), + "Mean": np.mean(ndarray).item(), + "Norm": np.linalg.norm(ndarray).item() + } + except Exception as e: + # 决定打印内容或切片 + if ndarray.size <= 100: + data_repr = ndarray + else: + # 打印前 5 和最后 5 个元素 + flat = ndarray.flatten() + data_repr = np.concatenate([flat[:5], flat[-5:]]) + logger.warning( + "Failed to compute stats for ndarray (dtype=%s, shape=%s, data=%s): %s", + ndarray.dtype, ndarray.shape, data_repr, e, + exc_info=True + ) + + # 最后一次性更新 + ndarray_json.update(stats) return ndarray_json @staticmethod diff --git a/debug/accuracy_tools/msprobe/docs/33.generate_operator_MindSpore.md b/debug/accuracy_tools/msprobe/docs/33.generate_operator_MindSpore.md index 6d923be76a1ed0d63776456b3996a6ce37829887..5f7640ae22c6ba3ed65244f4f77c239c2b723f77 100644 --- 
a/debug/accuracy_tools/msprobe/docs/33.generate_operator_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/33.generate_operator_MindSpore.md @@ -17,13 +17,93 @@ a. 在生成单API脚本时可以选择由工具构造随机数获得 dump 数 4.(可选)当需要使用Msadapter时,由于需要环境中同时存在 Torch 与 Msadapter,所以只支持在**安装原生Torch**的场景下通过export PYTHONPATH="xx/msadapter/build/lib"等通过**环境变量使能Msadapter的方式**的环境中进行预检,预检工具能够自动索引得到所需的 Torch 与 Msadapter环境,环境安装详细参考:[msadapter官网](https://gitee.com/mindspore/msadapter)。 +### dump.json 示例(由dump生成) +``` +{ + "task": "statistics", + "level": "L1", + "framework": "mindtorch", + "dump_data_dir": null, + "data": { + "Tensor.reshape.779.forward": { + "input_args": [ + { + "type": "mindspore.Tensor", + "dtype": "BFloat16", + "shape": [ + 8192, + 896 + ], + "Max": 0.62890625, + "Min": -0.78515625, + "Mean": 0.00035858154296875, + "Norm": 105.0 + }, + { + "type": "int", + "value": -1 + }, + { + "type": "int", + "value": 896 + } + ], + "input_kwargs": {}, + "output": [ + { + "type": "mindspore.Tensor", + "dtype": "BFloat16", + "shape": [ + 8192, + 896 + ], + "Max": 0.62890625, + "Min": -0.78515625, + "Mean": 0.00035858154296875, + "Norm": 105.0 + } + ] + }, + "Tensor.reshape.779.backward": { + "input": [ + { + "type": "mindspore.Tensor", + "dtype": "BFloat16", + "shape": [ + 8192, + 896 + ], + "Max": 0.0, + "Min": 0.0, + "Mean": 0.0, + "Norm": 0.0 + } + ], + "output": [ + { + "type": "mindspore.Tensor", + "dtype": "BFloat16", + "shape": [ + 8192, + 896 + ], + "Max": 0.0, + "Min": 0.0, + "Mean": 0.0, + "Norm": 0.0 + } + ] + } + } + } +``` ### 2.1 配置config_op.json 单API复现参数配置如下(以复现softmax算子为例): ``` { "dump_json_path": "./dump.json", - "api_name": "Mint.split.1", - "extract_api_path": "Mint.split.1.json", + "api_name": "Tensor.reshape.779", + "extract_api_path": "Tensor.reshape.779.json", "propagation": "backward", "data_mode": "random_data", "random_seed": 42, @@ -32,15 +112,15 @@ a. 
在生成单API脚本时可以选择由工具构造随机数获得 dump 数 ``` **配置文件参数说明** - | 参数名称 | 解释 | 是否必选 | - | ---------------------------- |----------------------------------------------------------------------------| ---------------------------------- | - | dump_json_path | dump.json的文件路径,包含所有dump算子的信息;如果已经提取了可疑算子并保存可以不指定。 | 否 | - | api_name | 算子名,如Functional.softmax.3、Tensor.add.0、Torch.matmul.5等。 | 否 | - | extract_api_path | 提取可疑算子的json文件路径 | 是 | - | propagation | 选择复现算子的forward还是backward,默认为forward | 否 | - | data_mode | 选择复现算子的随机数据(random_data)还是真实数据(real_data)模式,默认为random_data | 否 | - | random_seed | 仅random_data模式有效,表示手动设定的随机种子,默认为1234 | 否 | - | iter_times | 仅random_data模式有效,表示单API运行的次数,由于安全相关原因,最大支持设置为1000 | 否 | + | 参数名称 | 解释 | + | ---------------------------- |-------------------------------------------------------------------------------------------------------------------| + | dump_json_path | dump.json的文件路径,包含所有dump算子的信息;如果已经提取了可疑算子并保存可以不指定。 | + | api_name | 算子名(目前只支持Mint,Tensor,Functional,Torch类中可自动求导api),如Mint.split.1,Functional.softmax.3、Tensor.add.0、Torch.matmul.5等。 | + | extract_api_path | 提取可疑算子的json文件路径 | + | propagation | 选择复现算子的forward还是backward,默认为forward | + | data_mode | 选择复现算子的随机数据(random_data)还是真实数据(real_data)模式,默认为random_data | + | random_seed | 仅random_data模式有效,表示手动设定的随机种子,默认为1234 | + | iter_times | 仅random_data模式有效,表示单API运行的次数,由于安全相关原因,最大支持设置为1000 | ### 2.2 运行命令生成单API脚本 config_op.json配置好后,运行如下命令: diff --git a/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py b/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py index d885c5d95d9f10c4e9c5bb475aecbe17978cd7f6..f546a9b6a7532b58a8943bc7a35345bbdff2e8a3 100644 --- a/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py +++ b/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/bench_functions/flash_attention_score.py @@ -52,8 +52,14 @@ def softmax_grad(dp, softmax_res): def 
broadcast_kv(num_heads, num_kv_heads, kv_tensor, dtype): + # 检查维度 + if kv_tensor.dim() != 4: + raise ValueError(f"broadcast_kv: kv_tensor 必须是 4 维 (B, N_kv, S, D),但得到 {kv_tensor.shape}") if num_kv_heads == 0 or num_kv_heads > num_heads: - raise ValueError(f"num_kv_heads must be non-zero and bigger than num_heads.") + raise ValueError("broadcast_kv: num_kv_heads 必须大于 0 且不超过 num_heads。") + if num_heads % num_kv_heads != 0: + raise ValueError(f"broadcast_kv: num_heads({num_heads}) 必须能被 num_kv_heads({num_kv_heads}) 整除。") + factor = num_heads // num_kv_heads kv_shape = kv_tensor.shape @@ -68,6 +74,17 @@ def broadcast_kv(num_heads, num_kv_heads, kv_tensor, dtype): def calculate_qk(q, k, attn_mask, pse, scalar_value): + # 基本形状检查 + if q.dim() < 4 or k.dim() < 4: + raise ValueError(f"calculate_qk: q,k 必须至少 4 维,q={q.dim()},k={k.dim()}") + # 检查 head_dim 一致性 + if q.size(-1) != k.size(-1): + raise ValueError(f"calculate_qk: q.head_dim({q.size(-1)}) != k.head_dim({k.size(-1)})") + # 检查序列长度匹配(可选) + if q.size(2) != k.size(2) and attn_mask is None: + # 只有在无 mask 时才严格要求 seq_len 相等 + torch._assert(q.size(2) == k.size(2), + f"calculate_qk: q.seq_len({q.size(2)}) != k.seq_len({k.size(2)})") if k.dim() != 4: raise ValueError(f"k tensor dimension must be 4, but got {k.dim()} dimensions (shape: {k.shape})") diff --git a/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py b/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py index 38304d525069b99367377235479cbd10ebd76158..0a098a23d4779b7b37bdf0f081bd106e1846551c 100644 --- a/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py +++ b/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/generate_op_script/op_generator.py @@ -116,7 +116,10 @@ class CommonConfig: filtered = {k: v for k, v in json_content.items() if k not in EXCLUED} - if len(filtered) > API_INFO: + if propagation == Const.FORWARD and 
len(filtered) > API_INFO - 1: + raise ValueError(f'json file has more than one API, the API only contains forward info') + + if propagation == Const.BACKWARD and len(filtered) > API_INFO: raise ValueError(f'json file has more than one API, the API only contains forward and backward info') # Retrieve the first API name and dictionary diff --git a/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template b/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template index b4e5747112218a6d89b6b1255b57ebcfed07c5fe..e6e22f256dc1ea40ee676b7c37ec6082b0c1fcc1 100644 --- a/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template +++ b/debug/accuracy_tools/msprobe/mindspore/api_accuracy_checker/generate_op_script/operator_replication.template @@ -326,7 +326,6 @@ if not is_valid_pt_mt_env: import torch - # ======= 常数类 ======= import numpy as np @@ -1430,6 +1429,17 @@ class GlobalContext: global_context = GlobalContext() +def seed_all(seed={random_seed}): + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(True) + torch_npu.npu.manual_seed_all(seed) + torch_npu.npu.manual_seed(seed) + mindspore.set_deterministic(True) + + # ======== 输入类型类 ======= class ApiInputAggregation: @@ -2024,7 +2034,8 @@ if __name__ == "__main__": api_name_str = ".".join(api_full_name.split(".")[:3]) propagation = "{propagation}" data_mode = "{data_mode}" - torch.manual_seed({random_seed}) + + seed_all({random_seed}) data_manager = DataManager("./op_result_output", None) create_directory("./op_result_output")