From d7ba38198bc9fb943ab966b39c7d3620f3ff60e0 Mon Sep 17 00:00:00 2001 From: gitee Date: Tue, 14 Jan 2025 20:14:51 +0800 Subject: [PATCH 01/10] fix bug --- .../msprobe/pytorch/online_dispatch/dispatch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py index 77bfa3f5e1..ba1656eacd 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py @@ -44,7 +44,7 @@ DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + ".csv" class PtdbgDispatch(TorchDispatchMode): - def __init__(self, dump_mode=Const.OFF, api_list=None, debug=False, dump_path=None, tag=None, process_num=0): + def __init__(self, dump_mode=Const.OFF, api_list=[], debug=False, dump_path=None, tag=None, process_num=0): super(PtdbgDispatch, self).__init__() logger.info(COMPARE_LOGO) if not is_npu: @@ -148,7 +148,7 @@ class PtdbgDispatch(TorchDispatchMode): return func(*args, **kwargs) self.enable_autograd(aten_api) - if aten_api in self.aten_ops_blacklist: + if aten_api in self.aten_ops_blacklist or Const.NPU_LOWERCASE in aten_api: npu_out = func(*args, **kwargs) return npu_out -- Gitee From 2820e9991b5c0e7ad8f52da8993f4add6093d17a Mon Sep 17 00:00:00 2001 From: gitee Date: Wed, 15 Jan 2025 09:57:28 +0800 Subject: [PATCH 02/10] fix bug --- debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py index ba1656eacd..f135abc851 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py @@ -141,6 +141,7 @@ class PtdbgDispatch(TorchDispatchMode): func_name_split_list = func.__name__.split(".") aten_api = func_name_split_list[0] + print("aten_api:", aten_api) try: aten_api_overload_name = func_name_split_list[1] except IndexError: -- Gitee From 28832aa824bb6dc7ab1a536f1b3786badaf46362 Mon Sep 17 00:00:00 2001 From: gitee Date: Wed, 15 Jan 2025 10:01:25 +0800 Subject: [PATCH 03/10] fix bug --- .../msprobe/pytorch/online_dispatch/dispatch.py | 1 - .../online_dispatch/torch_ops_config.yaml | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py index f135abc851..ba1656eacd 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py @@ -141,7 +141,6 @@ class PtdbgDispatch(TorchDispatchMode): func_name_split_list = func.__name__.split(".") aten_api = func_name_split_list[0] - print("aten_api:", aten_api) try: aten_api_overload_name = func_name_split_list[1] except IndexError: diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml index c5e06f471b..f211aeaa74 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml @@ -48,6 +48,23 @@ aten_ops_blacklist: - zero_ - zeros - zeros_like + - send + - recv + - broadcast + - all_reduce + - reduce + - all_gather + - gather + - isend + - irecv + - scatter + - reduce_scatter + - _reduce_scatter_base + - _all_gather_base + - all_to_all_single + - all_to_all + - all_gather_into_tensor + - reduce_scatter_tensor npu_adjust_autograd: - adaptive_avg_pool2d -- Gitee From 228f0f86899ab4d9638e357fc37ea486ba58e1fc Mon Sep 17 00:00:00 2001 From: gitee Date: Wed, 15 Jan 2025 10:03:10 +0800 Subject: [PATCH 04/10] add unsupport list --- .../online_dispatch/torch_ops_config.yaml | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml index f211aeaa74..a1b5f5b8a5 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml @@ -65,6 +65,92 @@ aten_ops_blacklist: - all_to_all - all_gather_into_tensor - reduce_scatter_tensor + - npu_sort_v2 + - npu_transpose + - npu_broadcast + - npu_dtype_cast + - empty_with_format + - npu_one_hot + - npu_stride_add + - npu_ps_roi_pooling + - npu_roi_align + - npu_nms_v4 + - npu_iou + - npu_nms_with_mask + - npu_pad + - npu_bounding_box_encode + - npu_bounding_box_decode + - npu_batch_nms + - npu_slice + - _npu_dropout + - npu_indexing + - npu_ifmr + - npu_max + - npu_scatter + - npu_layer_norm_eval + - npu_alloc_float_status + - npu_confusion_transpose + - npu_bmmV2 + - fast_gelu + - npu_sub_sample + - npu_deformable_conv2d + - npu_mish + - npu_anchor_response_flags + - npu_yolo_boxes_encode + - npu_grid_assign_positive + - npu_normalize_batch + - npu_masked_fill_range + - npu_linear + - npu_bert_apply_adam + - npu_giou + - npu_ciou + - npu_diou + - npu_sign_bits_pack + - npu_sign_bits_unpack + - npu_flash_attention + - npu_scaled_masked_softmax + - npu_rotary_mul + - npu_roi_align + - npu_roi_alignbk + - npu_ptiou + - npu_fusion_attention + - npu_dropout_with_add_softmax + - npu_random_choice_with_mask + - npu_rotated_iou + - npu_conv2d + - npu_conv3d + - npu_softmax_cross_entropy_with_logits + - npu_all_gather_base_mm + - npu_swiglu + - npu_rms_norm + - npu_mm_reduce_scatter_base + - npu_mm_all_reduce_base + - npu_conv_transpose2d + - npu_convolution + - npu_convolution_transpose + - npu_min + - npu_nms_rotated + - npu_reshape + - npu_rotated_box_decode + - npu_rotated_box_encode + - npu_rotated_overlaps + - npu_silu + - npu_fused_attention_score + - npu_multi_head_attention + - npu_gru + - npu_incre_flash_attention + - npu_prompt_flash_attention + - npu_lstm + - npu_apply_adam + - npu_apply_adam_w + - npu_anti_quant + - npu_grouped_matmu + - npu_quant_scatter + - npu_group_norm_silu + - npu_format_cast + - npu_moe_finalize_routing + - npu_moe_gating_top_k_softmax + - npu_trans_quant_param npu_adjust_autograd: - adaptive_avg_pool2d -- Gitee From b7af652f0bd219fea84a22ca43b8856425a778ad Mon Sep 17 00:00:00 2001 From: gitee Date: Wed, 15 Jan 2025 10:08:12 +0800 Subject: [PATCH 05/10] fix bug --- .../msprobe/pytorch/online_dispatch/dispatch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py index ba1656eacd..57b424d308 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py @@ -141,6 +141,8 @@ class PtdbgDispatch(TorchDispatchMode): func_name_split_list = func.__name__.split(".") aten_api = func_name_split_list[0] + print("aten_api:", aten_api) + print("aten_api in self.aten_ops_blacklist:", aten_api in self.aten_ops_blacklist) try: aten_api_overload_name = func_name_split_list[1] except IndexError: @@ -148,7 +150,7 @@ class PtdbgDispatch(TorchDispatchMode): return func(*args, **kwargs) self.enable_autograd(aten_api) - if aten_api in self.aten_ops_blacklist or Const.NPU_LOWERCASE in aten_api: + if aten_api in self.aten_ops_blacklist: npu_out = func(*args, **kwargs) return npu_out -- Gitee From 4657f72f22128fb961218686f9aef7f6e7e41387 Mon Sep 17 00:00:00 2001 From: gitee Date: Wed, 15 Jan 2025 10:19:59 +0800 Subject: [PATCH 06/10] fix bug --- .../pytorch/online_dispatch/dispatch.py | 8 +- .../online_dispatch/torch_ops_config.yaml | 103 ------------------ 2 files changed, 5 insertions(+), 106 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py index 57b424d308..bc3ec876bf 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py @@ -141,8 +141,6 @@ class PtdbgDispatch(TorchDispatchMode): func_name_split_list = func.__name__.split(".") aten_api = func_name_split_list[0] - print("aten_api:", aten_api) - print("aten_api in self.aten_ops_blacklist:", aten_api in self.aten_ops_blacklist) try: aten_api_overload_name = func_name_split_list[1] except IndexError: @@ -184,7 +182,11 @@ class PtdbgDispatch(TorchDispatchMode): npu_out_cpu = safe_get_value(npu_out_cpu, 0, "npu_out_cpu") with TimeStatistics("CPU RUN", run_param): - cpu_out = func(*cpu_args, **cpu_kwargs) + try: + cpu_out = func(*cpu_args, **cpu_kwargs) + except RuntimeError: + logger.warning(f"This aten_api {aten_api} does not support running on cpu, so skip it.") + return npu_out if isinstance(cpu_out, torch.Tensor) and cpu_out.dtype in [torch.bfloat16, torch.float16, torch.half]: cpu_out = cpu_out.float() diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml index a1b5f5b8a5..c5e06f471b 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/torch_ops_config.yaml @@ -48,109 +48,6 @@ aten_ops_blacklist: - zero_ - zeros - zeros_like - - send - - recv - - broadcast - - all_reduce - - reduce - - all_gather - - gather - - isend - - irecv - - scatter - - reduce_scatter - - _reduce_scatter_base - - _all_gather_base - - all_to_all_single - - all_to_all - - all_gather_into_tensor - - reduce_scatter_tensor - - npu_sort_v2 - - npu_transpose - - npu_broadcast - - npu_dtype_cast - - empty_with_format - - npu_one_hot - - npu_stride_add - - npu_ps_roi_pooling - - npu_roi_align - - npu_nms_v4 - - npu_iou - - npu_nms_with_mask - - npu_pad - - npu_bounding_box_encode - - npu_bounding_box_decode - - npu_batch_nms - - npu_slice - - _npu_dropout - - npu_indexing - - npu_ifmr - - npu_max - - npu_scatter - - npu_layer_norm_eval - - npu_alloc_float_status - - npu_confusion_transpose - - npu_bmmV2 - - fast_gelu - - npu_sub_sample - - npu_deformable_conv2d - - npu_mish - - npu_anchor_response_flags - - npu_yolo_boxes_encode - - npu_grid_assign_positive - - npu_normalize_batch - - npu_masked_fill_range - - npu_linear - - npu_bert_apply_adam - - npu_giou - - npu_ciou - - npu_diou - - npu_sign_bits_pack - - npu_sign_bits_unpack - - npu_flash_attention - - npu_scaled_masked_softmax - - npu_rotary_mul - - npu_roi_align - - npu_roi_alignbk - - npu_ptiou - - npu_fusion_attention - - npu_dropout_with_add_softmax - - npu_random_choice_with_mask - - npu_rotated_iou - - npu_conv2d - - npu_conv3d - - npu_softmax_cross_entropy_with_logits - - npu_all_gather_base_mm - - npu_swiglu - - npu_rms_norm - - npu_mm_reduce_scatter_base - - npu_mm_all_reduce_base - - npu_conv_transpose2d - - npu_convolution - - npu_convolution_transpose - - npu_min - - npu_nms_rotated - - npu_reshape - - npu_rotated_box_decode - - npu_rotated_box_encode - - npu_rotated_overlaps - - npu_silu - - npu_fused_attention_score - - npu_multi_head_attention - - npu_gru - - npu_incre_flash_attention - - npu_prompt_flash_attention - - npu_lstm - - npu_apply_adam - - npu_apply_adam_w - - npu_anti_quant - - npu_grouped_matmu - - npu_quant_scatter - - npu_group_norm_silu - - npu_format_cast - - npu_moe_finalize_routing - - npu_moe_gating_top_k_softmax - - npu_trans_quant_param npu_adjust_autograd: - adaptive_avg_pool2d -- Gitee From c29880f0647d72a6a8d104d301f046320286775c Mon Sep 17 00:00:00 2001 From: gitee Date: Wed, 15 Jan 2025 10:45:53 +0800 Subject: [PATCH 07/10] fix bug --- .../msprobe/pytorch/online_dispatch/dump_compare.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py index b185bc1110..cbe26fb07c 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py @@ -138,6 +138,8 @@ def dispatch_workflow(run_param: DispatchRunParam, data_info: DisPatchDataInfo): dump_data(npu_out_cpu, prefix_output, run_param.root_npu_path) if run_param.process_num == 0: + print("all summary", all_summary) + print("run_param.api_index", run_param.api_index) all_summary[run_param.api_index - 1] = copy.deepcopy(single_api_summary) else: save_temp_summary(run_param.api_index - 1, single_api_summary, run_param.root_cpu_path, lock) -- Gitee From 1ab0ed8354c09fd3a699d8e4632575db79c60d7e Mon Sep 17 00:00:00 2001 From: gitee Date: Wed, 15 Jan 2025 10:54:39 +0800 Subject: [PATCH 08/10] fix bug --- .../msprobe/pytorch/online_dispatch/dump_compare.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py index cbe26fb07c..59653e8cf0 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py @@ -139,7 +139,10 @@ def dispatch_workflow(run_param: DispatchRunParam, data_info: DisPatchDataInfo): if run_param.process_num == 0: print("all summary", all_summary) + print("len(all_summary)", len(all_summary)) print("run_param.api_index", run_param.api_index) + print("single_api_summary", single_api_summary) + print("all_summary[run_param.api_index - 1]",all_summary[run_param.api_index - 1]) all_summary[run_param.api_index - 1] = copy.deepcopy(single_api_summary) else: save_temp_summary(run_param.api_index - 1, single_api_summary, run_param.root_cpu_path, lock) -- Gitee From 720ec0be2fd7ace6e1145ca3dca5e6bf9272ecf8 Mon Sep 17 00:00:00 2001 From: gitee Date: Wed, 15 Jan 2025 11:08:43 +0800 Subject: [PATCH 09/10] fix bug --- debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py index bc3ec876bf..87e197bfcb 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dispatch.py @@ -185,6 +185,7 @@ class PtdbgDispatch(TorchDispatchMode): try: cpu_out = func(*cpu_args, **cpu_kwargs) except RuntimeError: + self.api_index -= 1 logger.warning(f"This aten_api {aten_api} does not support running on cpu, so skip it.") return npu_out -- Gitee From 7fcbb87cac6010da7646a84e26b397b145874441 Mon Sep 17 00:00:00 2001 From: gitee Date: Wed, 15 Jan 2025 14:11:24 +0800 Subject: [PATCH 10/10] fix bug --- .../msprobe/pytorch/online_dispatch/dump_compare.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py index 59653e8cf0..b185bc1110 100644 --- a/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/online_dispatch/dump_compare.py @@ -138,11 +138,6 @@ def dispatch_workflow(run_param: DispatchRunParam, data_info: DisPatchDataInfo): dump_data(npu_out_cpu, prefix_output, run_param.root_npu_path) if run_param.process_num == 0: - print("all summary", all_summary) - print("len(all_summary)", len(all_summary)) - print("run_param.api_index", run_param.api_index) - print("single_api_summary", single_api_summary) - print("all_summary[run_param.api_index - 1]",all_summary[run_param.api_index - 1]) all_summary[run_param.api_index - 1] = copy.deepcopy(single_api_summary) else: save_temp_summary(run_param.api_index - 1, single_api_summary, run_param.root_cpu_path, lock) -- Gitee