From fb52369e76b8a4ec553c04b309553ac438372451 Mon Sep 17 00:00:00 2001
From: l30036321 <lvkaimeng@huawei.com>
Date: Mon, 22 Jan 2024 17:36:59 +0800
Subject: [PATCH] init dump fix

---
 ...0\203\275\350\257\264\346\230\216_v4.0.md" | 40 +++++++++---------
 .../src/python/ptdbg_ascend/common/utils.py   |  2 +-
 .../src/python/ptdbg_ascend/dump/dump.py      | 42 +++++++++++++------
 .../ptdbg_ascend/hook_module/register_hook.py | 16 ++++---
 4 files changed, 58 insertions(+), 42 deletions(-)

diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.0.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.0.md"
index 23ee300d2..f13b3fe18 100644
--- "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.0.md"
+++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.0.md"
@@ -55,23 +55,23 @@ PyTorch训练场景的精度问题分析建议参考以下思路进行精度比
 
    ```python
    from ptdbg_ascend import *
-   
+
    # 在main函数开始前固定随机数
    seed_all()
-   
+
    # 配置dump数据目录路径和名称
    set_dump_path("./npu_dump", dump_tag='all')
-   
+
    # 注册dump回调函数
    register_hook(model, acc_cmp_dump)
-   
+
    ...
-   
+
    # 在第一个迭代开始的位置开启dump和堆栈模式，同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量
    set_dump_switch("ON", mode="api_stack", filter_switch="OFF")
-   
+
    ...
-   
+
    # 在第一个迭代结束的位置关闭dump
    set_dump_switch("OFF")
    ```
@@ -118,7 +118,7 @@ PyTorch训练场景的精度问题分析建议参考以下思路进行精度比
 
    ```python
    from ptdbg_ascend import *
-   
+
    # 提取dump信息中第1次调用的API：Torch_batch_normal的堆栈信息及数据统计信息
    parse("./npu_dump/all_v2.0/rank0/api_stack_dump.pkl", "Torch_batch_normal_1_forward")
    ```
@@ -137,17 +137,17 @@ PyTorch训练场景的精度问题分析建议参考以下思路进行精度比
 
      ```python
      from ptdbg_ascend import *
-     
+
      # 固定随机数，开启确定性计算
      seed_all(mode=True)
      set_dump_path("./dump_path", dump_tag='forward')
      register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json')
-     
+
      # dump指定前向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量
      set_dump_switch("ON", mode="acl", scope=["Tensor_permute_1_forward"], filter_switch="OFF")
-     
+
      ...
-     
+
      set_dump_switch("OFF")
      ```
 
@@ -155,18 +155,18 @@ PyTorch训练场景的精度问题分析建议参考以下思路进行精度比
 
      ```python
      from ptdbg_ascend import *
-     
+
      # 固定随机数，开启确定性计算
      seed_all(mode=True)
      set_dump_path("./dump_path", dump_tag='backward')
      register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json')
-     
+
      # dump指定反向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量
      set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"], filter_switch="OFF")
      set_backward_input(["./npu_dump/all_v2.0/rank0/api_stack_dump/Functional_conv2d_1_backward_input.0.npy"])
-     
+
      ...
-     
+
      set_dump_switch("OFF")
      ```
 
@@ -454,7 +454,7 @@ PrecisionDebugger(dump_path=None, hook_name=None, rank=None, step=[], enable_dat
 | rank              | 指定对某张卡上的数据进行dump或溢出检测，默认未配置（表示dump所有卡的数据），须根据实际卡的Rank ID配置。应配置为大于0的正整数，且须根据实际卡的Rank ID配置，若所配置的值大于实际训练所运行的卡的Rank ID，则dump数据为空，比如当前环境Rank ID为0~7，实际训练运行0~3卡，此时若配置Rank ID为4或不存在的10等其他值，此时dump数据为空。 | 否       |
 | step              | 指定dump某个step的数据，默认未配置，须指定为训练脚本中存在的step。step为list格式，可配置逐个step，例如：step=[0,1,2]；也可以配置step范围，例如：step=list(range(0,9))，表示dump第0到第8个step。 | 否       |
 | enable_dataloader | 自动控制开关，可取值True（开启）或False（关闭），默认为False。配置为True后自动识别dump step参数指定的迭代，并在该迭代执行完成后退出训练，此时start和stop函数可不配置，开启该开关要求训练脚本是通过torch.utils.data.dataloader方式加载数据；配置为False则需要配置start和stop函数，并在最后一个stop函数后或一个step结束的位置添加debugger.step()。 | 否       |
-| model             | 开启model模式，传入网络模型实例化的对象，配置该参数后，dump操作仅dump网络中init方法里调用的方法（nn.Module类），不会对所有API进行dump。参数示例： model=net，net为网络模型实例化的对象名称。默认未配置。<br/>配置该参数时，PrecisionDebugger模块请在模型实例化之后调用。<br/>该模式不支持“溢出检测”和“模块级精度数据dump”。 | 否       |
+| model             | 开启init dump模式，传入网络模型实例化的对象，配置该参数后，dump操作仅dump网络中init方法里调用的方法（nn.Module类），不会对所有API进行dump。参数示例： model=net，net为网络模型实例化的对象名称。默认未配置。<br/>配置该参数时，PrecisionDebugger模块请在模型实例化之后调用。<br/>该模式不支持“溢出检测”和“模块级精度数据dump”。此模式下dump文件名前缀为网络中定义的模块名或层名。 | 否       |
 
 ### configure_hook函数（可选）
 
@@ -593,9 +593,9 @@ configure_hook可配置多种dump模式，示例如下：
   debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check", step=[0])
   debugger.configure_hook(mode="acl", acl_config="./dump.json")
   ```
-  
+
   该场景会在原有数据基础上，额外在dump.json文件配置的dump_path目录下生成一份ACL算子数据，该数据可通过“**ptdbg_ascend.parse**”工具进行解析。
-  
+
   仅支持NPU环境。
 
 ### start函数（可选）
@@ -854,7 +854,7 @@ register_hook(model, hook, overflow_nums=overflow_nums, dump_mode=dump_mode, dum
 | model         | 传入网络模型实例化的对象。参数示例： model=net，net为网络模型实例化的对象名称。 | 是       |
 | hook          | 注册工具的dump和溢出检测钩子。可取值overflow_check（表示溢出检测）和acc_cmp_dump（表示dump数据），二选一。 | 是       |
 | overflow_nums | 控制溢出次数，表示第N次溢出时，停止训练，过程中检测到溢出API对应ACL数据均dump。参数示例：overflow_nums=3。配置overflow_check时可配置，默认不配置，即检测到1次溢出，训练停止，配置为-1时，表示持续检测溢出直到训练结束。 | 否       |
-| dump_mode     | 控制针对溢出API的dump模式，可取值"model"、"acl"或"api"。配置为"model"时，表示开启model模式，dump操作仅dump网络中init方法里调用的方法（nn.Module类），不会对所有API进行dump，不支持“溢出检测”和“模块级精度数据dump”；配置acl时，表示dump ACL级别的溢出数据，此时set_dump_path参数不生效，dump数据目录由dump_config的.json文件配置。参数示例：dump_mode="acl"。默认不配置，即dump API级别的溢出数据。 | 否       |
+| dump_mode     | 控制针对溢出API的dump模式，可取值"acl"或"api"。配置acl时，表示dump ACL级别的溢出数据，此时set_dump_path参数不生效，dump数据目录由dump_config的.json文件配置。参数示例：dump_mode="acl"。默认不配置，即dump API级别的溢出数据。 | 否       |
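-| dump_config   | acl dump的配置文件。dump_mode="acl"时，该参数必选；dump_mode="api"或"model"时，该参数不选。参数示例：dump_config='./dump.json'。 | 否       |
+| dump_config   | acl dump的配置文件。dump_mode="acl"时，该参数必选；dump_mode="api"时，该参数不选。参数示例：dump_config='./dump.json'。 | 否       |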
 
 **函数示例**
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
index 942391d4b..ec40e2c6c 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py
@@ -68,7 +68,7 @@ class Const:
     DUMP_RATIO_MAX = 100
     SUMMERY_DATA_NUMS = 256
     FLOAT_EPSILON = np.finfo(float).eps
-    SUPPORT_DUMP_MODE = ['api', 'acl', 'model']
+    SUPPORT_DUMP_MODE = ['api', 'acl']
     ON = 'ON'
     OFF = 'OFF'
     BACKWARD = 'backward'
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 2a00f83bf..83900eb60 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -32,8 +32,10 @@ except ImportError:
 else:
     is_gpu = False
 
-from .utils import DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist
-from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path, check_inplace_op, CompareConst
+from .utils import DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist,\
+    CompareException
+from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path, check_inplace_op, CompareConst,\
+    print_error_log
 from ..dump.utils import check_writable
 from ..common.file_check_util import FileOpen, change_mode, FileCheckConst, check_path_pattern_vaild, check_path_length
 
@@ -44,7 +46,7 @@ thread_lock = threading.Lock()
 pkl_name = ""
 rank = os.getpid()
 multi_output_apis = ["_sort_", "npu_flash_attention"]
-module_count = defaultdict(int)
+module_count = {}
 
 
 class APIList(list):
@@ -367,22 +369,61 @@ def dump_mode_backward_acl_dump(module, module_name, grad_path):
     print_info_log("Dump %s op file." % module_name)
 
 
+def module_count_func(name, name_template):
+    """Return the call index that fills the ``{}`` slot of a module hook name.
+
+    ``module_count`` maps a module name to ``[counter, stack]``. The first
+    forward call for a name starts at index 0; each later forward call
+    increments the counter and pushes it onto the stack. A backward call
+    pops the most recently pushed index.
+
+    Args:
+        name: Hook name; its third-from-last ``_``-separated field is the
+            ``module_count`` key.
+        name_template: Unformatted hook name, checked for ``Const.FORWARD``
+            to tell forward hooks from backward hooks.
+
+    Returns:
+        The index for this call.
+
+    Raises:
+        IndexError: If ``name`` has fewer than three fields, or on a backward
+            call when the stack for the module name is empty.
+        KeyError: On a backward call for a name never seen in a forward call.
+    """
+    module_name = name.split("_")[-3]
+    if Const.FORWARD in name_template:
+        if module_name not in module_count:
+            module_count[module_name] = [0, [0]]
+        else:
+            # Drop the stack top if it differs from the current counter.
+            if module_count[module_name][-1] and \
+                    module_count[module_name][0] != module_count[module_name][-1][-1]:
+                module_count[module_name][-1].pop()
+            module_count[module_name][0] += 1
+            module_count[module_name][-1].append(module_count[module_name][0])
+        index = module_count[module_name][0]
+    else:
+        index = module_count[module_name][-1].pop()
+    return index
+
+
 def acc_cmp_dump(name, **kwargs):
     dump_step = kwargs.get('dump_step', 1)
     pid = kwargs.get('pid')
+    name_template = name
     if not pid:
         return RuntimeError("Not get the specified process pid.")
 
     def acc_cmp_hook(module, in_feat, out_feat=None):
-        nonlocal name
-        if "_{}_" in name:
-            module_name = name.split("_")[1]
-            if Const.BACKWARD in name:
-                index = module_count[module_name] - 1
-                module_count[module_name] = index
-            else:
-                index = module_count[module_name]
-                module_count[module_name] = index + 1
-            name = name.format(index)
+        nonlocal name, name_template
+        if "_{}_" in name_template:
+            try:
+                index = module_count_func(name, name_template)
+            except IndexError as e:
+                print_error_log(f"Get module {name_template} index failed.")
+                raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e
+            # Format from the template: ``name`` has no ``{}`` left after the first call.
+            name = name_template.format(index)
         if pid == os.getpid():
             dump_acc_cmp(name, in_feat, out_feat, dump_step, module)
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py
index 5297ed70e..bc6b9c494 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py
@@ -70,10 +70,7 @@ def register_hook(model, hook, **kwargs):
     if dump_mode == 'acl':
         DumpUtil.dump_switch_mode = dump_mode
         DumpUtil.set_acl_config(dump_config_file)
-    if dump_mode == 'model':
-        register_hook_core(hook, model)
-    else:
-        register_hook_core(hook)
+    register_hook_core(hook)
 
 
 def init_overflow_nums(overflow_nums):
@@ -127,11 +124,12 @@ def register_hook_core(hook, model=None):
         if not isinstance(model, torch.nn.Module):
             print_error_log("The argument model must be an object of torch.nn.Module")
             raise CompareException(CompareException.INVALID_PARAM_ERROR)
-        for _, module in model.named_modules():
-            if "torch.nn.modules" in str(module.__class__):
-                prefix = "Module_" + module.__class__.__name__
-                module.register_forward_hook(hook(prefix + "_{}_" + "forward"))
-                module.register_backward_hook(hook(prefix + "_{}_" + "backward"))
+        for name, module in model.named_modules():
+            if module == model:
+                continue
+            prefix = name + "_" + module.__class__.__name__
+            module.register_forward_hook(hook(prefix + "_{}_" + "forward"))
+            module.register_backward_hook(hook(prefix + "_{}_" + "backward"))
     else:
         api_register.initialize_hook(hook)
         api_register.api_modularity()
-- 
Gitee