From 6795e67e56d27c4a4c58e327622db43e518c2b10 Mon Sep 17 00:00:00 2001
From: fuchao <1501312275@qq.com>
Date: Sat, 18 Jan 2025 12:50:52 +0800
Subject: [PATCH] =?UTF-8?q?=E9=9D=99=E6=80=81=E5=9B=BEcell=20dump=E9=87=87?=
 =?UTF-8?q?=E7=94=A8=E6=96=B0=E6=96=B9=E6=A1=88?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../mindspore/dump/cell_dump_process.py       | 135 +++++++++++-------
 1 file changed, 84 insertions(+), 51 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
index ea7cfd801..0df397d03 100644
--- a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
+++ b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py
@@ -33,6 +33,12 @@ CONSTRUCT_FILE_NAME = "construct.json"
 DEFAULT_RANK_DIR = "rank0"
 KEY_LAYERS = "layers"
 construct = {}
+cell_list = []
+KEY_SIDE_EFFECT = "side_effect_io"
+td = ops.TensorDump()
+td_in = ops.TensorDump("in")
+td.add_prim_attr(KEY_SIDE_EFFECT, False)
+td_in.add_prim_attr(KEY_SIDE_EFFECT, False)
 np_ms_dtype_dict = {
     "bool": ms.bool_,
     "int8": ms.int8,
@@ -62,6 +68,7 @@ np_ms_dtype_dict = {
     "complex128": ms.complex128
 }
 
+
 def generate_file_path(dump_path, cell_prefix, suffix, io_type, index):
     step_path = os.path.join(dump_path, "{step}")
     rank_path = os.path.join(step_path, "{rank}")
@@ -78,24 +85,30 @@ def partial_func(func, dump_path, cell_prefix, index, io_type):
 
 def clip_gradient(dump_path, cell_prefix, index, io_type, dx):
     if io_type == CoreConst.OUTPUT:
-        ops.TensorDump()(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx)
+        temp = td(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx)
+        dx = ops.depend(dx, temp)
     if io_type == CoreConst.INPUT:
-        ops.TensorDump("in")(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx)
+        temp = td_in(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx)
+        dx = ops.depend(dx, temp)
     return dx
 
 
 def cell_construct_wrapper(func, self):
     def new_construct(self, *args, **kwargs):
         new_args = []
+        out_list = []
 
+        index = 0
+        item = None
         # The inputs of the cell.
         for index, item in enumerate(args):
             if self.data_mode == "backward" or self.data_mode == "all":
                 if ops.is_tensor(item):
-                    item = self.input_clips[index](item)
+                    item = self.output_clips[index](item)
             if self.data_mode == "forward" or self.data_mode == "all":
                 if ops.is_tensor(item):
-                    ops.TensorDump("in")(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.INPUT, index), item)
+                    temp = td_in(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.INPUT, index), item)
+                    item = ops.depend(item, temp)
             new_args.append(item)
 
         out = func(*new_args, **kwargs)
@@ -105,17 +118,24 @@ def cell_construct_wrapper(func, self):
             for index, item in enumerate(out):
                 if self.data_mode == "backward" or self.data_mode == "all":
                     if ops.is_tensor(item):
-                        item = self.output_clips[index](item)
+                        item = self.input_clips[index](item)
                 if self.data_mode == "forward" or self.data_mode == "all":
                     if ops.is_tensor(item):
-                        ops.TensorDump()(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), item)
+                        temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), item)
+                        item = ops.depend(item, temp)
+                # Collect every element outside the forward-only branch; otherwise a cell
+                # running with data_mode == "backward" would return an empty tuple.
+                out_list.append(item)
+            out_list = tuple(out_list)
+            return out_list
         else:
             if self.data_mode == "backward" or self.data_mode == "all":
-                out = self.output_clips[0](out)
+                out = self.input_clips[0](out)
             if self.data_mode == "forward" or self.data_mode == "all":
-                ops.TensorDump()(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, 0), out)
-
-        return out
+                if ops.is_tensor(out):  # single output always uses index 0, matching input_clips[0] above
+                    temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, 0), out)
+                    out = ops.depend(out, temp)
+            return out
 
     return new_construct.__get__(self, type(self))
 
@@ -128,7 +148,7 @@ def sort_filenames(path):
     return filenames
 
 
-# 删除多余dump的文件
+# 删除重复dump的文件：自定义文件名相同，并且数据相同。
 def del_same_file(path, filenames):
     result_list = []
     seen_prefixes = {}
@@ -158,34 +178,22 @@ def rename_filename(path):
 
     filename_dict = {}
     for filename in filenames:
-        match = re.search(rf"{CoreConst.CELL}{CoreConst.SEP}(.*?){CoreConst.SEP}({CoreConst.INPUT}|{CoreConst.OUTPUT}){CoreConst.SEP}", filename)
-        mid_field = match.group(1)
-  
-        if mid_field in filename_dict:
-            filename_dict[mid_field].append(filename)
+        name_field = filename.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1)[0]
+
+        if name_field in filename_dict:
+            filename_dict[name_field] += 1
         else:
-            filename_dict[mid_field] = [filename]
-
-    # Change the file name and add the sequence number of the cell that is repeatedly called.
-    for _, filename_list in filename_dict.items():
-        last_second_index = filename_list[0].rfind(CoreConst.REPLACEMENT_CHARACTER, 0, filename_list[0].rfind(CoreConst.REPLACEMENT_CHARACTER))
-        first_file_sub = filename_list[0][:last_second_index]
-        index_list = []
-        for index, filename in enumerate(filename_list):
-            if first_file_sub in filename:
-                index_list.append(index)
-        index_list.append(len(filename_list))
-      
-        for i in range(len(index_list) - 1):
-            start_index = index_list[i]
-            end_index = index_list[i + 1]
-            for j in range(start_index, end_index):
-                if CoreConst.FORWARD_PATTERN in filename_list[j]:
-                    #Format: Cell.{cell_name}.{class_name}.{forward/backward}.{number}.{input/output}.{index}_{dtype}_{id}.npy
-                    newFileName = filename_list[j].replace(CoreConst.FORWARD_PATTERN, CoreConst.FORWARD_PATTERN + str(i) + CoreConst.SEP)
-                if CoreConst.BACKWARD_PATTERN in filename_list[j]:
-                    newFileName = filename_list[j].replace(CoreConst.BACKWARD_PATTERN, CoreConst.BACKWARD_PATTERN + str(i) + CoreConst.SEP)
-                os.rename(os.path.join(path,filename_list[j]), os.path.join(path,newFileName))
+            filename_dict[name_field] = 0
+        cell_index = filename_dict[name_field]
+
+        # Add the call sequence number of the repeatedly called cell after the forward/backward field.
+        # Format: Cell.{cell_name}.{class_name}.{forward/backward}.{number}.{input/output}.{index}_{dtype}_{id}.npy
+        newFileName = filename  # fallback: a name matching neither pattern is left unchanged
+        if CoreConst.FORWARD_PATTERN in filename:
+            newFileName = filename.replace(CoreConst.FORWARD_PATTERN, CoreConst.FORWARD_PATTERN + str(cell_index) + CoreConst.SEP)
+        if CoreConst.BACKWARD_PATTERN in filename:
+            newFileName = filename.replace(CoreConst.BACKWARD_PATTERN, CoreConst.BACKWARD_PATTERN + str(cell_index) + CoreConst.SEP)
+        os.rename(os.path.join(path, filename), os.path.join(path, newFileName))
     logger.info(f"==========The rename_filename phase is Finished!==========")
 
 
@@ -224,13 +232,12 @@ def check_relation(cell_name, parent_cell_name):
     return False
 
 
-def get_construct(cell_list):
-    parent_cell_stack = []
-    for cell in cell_list:
+def get_construct(cell_list_input):
+    for cell in cell_list_input:
         cell_name = get_cell_name(cell)
         cell_data_mode = get_data_mode(cell)
         found_flag = False
-        for parent_cell in reversed(parent_cell_stack):
+        for parent_cell in cell_list_input:
             parent_cell_name = get_cell_name(parent_cell)
             parent_data_mode = get_data_mode(parent_cell)
             has_relation = check_relation(cell_name, parent_cell_name)
@@ -240,7 +247,6 @@ def get_construct(cell_list):
                 break
         if not found_flag:
             construct.update({cell: None})
-        parent_cell_stack.append(cell)
 
 
 def generate_construct(path):
@@ -248,11 +254,23 @@ def generate_construct(path):
     filenames = sort_filenames(path)
 
     # 提取文件名中Cell.{cell_name}.{class_name}.{data_mode}.{重复调用此cell的序号}字段，并存入cell_list
-    cell_list = []
     for filename in filenames:
-        match = re.search(rf"({CoreConst.CELL}{CoreConst.SEP}).*?(?={CoreConst.SEP}(?:{CoreConst.INPUT}{CoreConst.SEP}|{CoreConst.OUTPUT}{CoreConst.SEP}))", filename)
-        mid_field = match.group(0)
-        cell_list.append(mid_field)
+        point_position = 3
+        mid_field = filename.rsplit(CoreConst.SEP, point_position)[0]
+        if CoreConst.INPUT in filename:
+            if mid_field in cell_list:
+                cell_list.remove(mid_field)
+            cell_list.append(mid_field)
+        else:
+            if mid_field not in cell_list:
+                index = filenames.index(filename)
+                output_field = mid_field + CoreConst.OUTPUT
+                find_flag = False
+                for filename_other in cell_list[index + 1:]:
+                    if output_field in filename_other:
+                        find_flag = True
+                if find_flag is False:
+                    cell_list.append(mid_field)
 
     get_construct(cell_list)
 
@@ -308,6 +326,11 @@ def process_file(file_path):
         return None, None, None
 
 
+def custom_sort(item, key_to_index):
+    """Sort key for a (key, value) pair: the rank of item[0] in key_to_index, or +inf if absent."""
+    return key_to_index.get(item[0], float('inf'))
+
+
 def generate_dump_info(path):
     if not os.path.exists(path):
         logger.error("The provided path does not exist.")
@@ -317,12 +340,10 @@ def generate_dump_info(path):
 
     with Pool(processes=10) as pool:
         file_paths = []
-        # 传入的path为工具生成的./dump_tensor_data，内容为npy文件
         for root, _, files in os.walk(path):
             for file in files:
                 if file.endswith(FileCheckConst.NUMPY_SUFFIX):
                     file_paths.append((os.path.join(root, file),))
-        file_paths.sort()
         results = pool.starmap(process_file, file_paths)
 
     # 收集结果
@@ -336,6 +357,12 @@ def generate_dump_info(path):
                 dump_data.get(CoreConst.DATA, {}).get(op_name, {})[key] = []
             dump_data.get(CoreConst.DATA, {}).get(op_name, {}).get(key, []).append(tensor_json)
 
+    # 根据cell_list排序
+    data_dict = dump_data.get(CoreConst.DATA, {})
+    key_to_index = {key: index for index, key in enumerate(cell_list)}
+    sorted_data_dict = dict(sorted(data_dict.items(), key=lambda item: custom_sort(item, key_to_index)))
+    dump_data[CoreConst.DATA] = sorted_data_dict
+
     # 将数据写入dump.json
     json_path = os.path.join(os.path.dirname(path), 'dump.json')
     save_json(json_path, dump_data, indent=1)
@@ -360,10 +387,10 @@ def generate_stack_info(path):
         # 文件名举例:Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0_float32_165.npy
         parts = os.path.basename(file_path).split(CoreConst.SEP)
         # op_name是Cell.network._backbone.loss.CrossEntropyLoss.forward.0
-        op_name = CoreConst.SEP.join(parts[:-3])  # 获取所需的部分路径
+        op_name = CoreConst.SEP.join(parts[:-3])
         stack_data.update({op_name: []})
 
-    # 将数据写入dump.json
+    # 将数据写入stack.json
     json_path = os.path.join(os.path.dirname(path), 'stack.json')
     save_json(json_path, stack_data, indent=1)
 
@@ -395,6 +422,12 @@ def start(net=None, dump_path="./", data_mode=CoreConst.ALL):
     for name, cell in net.cells_and_names():
         if name == "":
             continue
+
+        # 跳过框架内部的cell
+        class_name = cell.__class__.__name__
+        if class_name.startswith(CoreConst.REPLACEMENT_CHARACTER):
+            logger.warning(f"Cell {name}.{class_name} is skipped!")
+            continue
         else:
             #Format: Cell.{cell_name}.{class_name}
             cell.cell_prefix = CoreConst.SEP.join([CoreConst.CELL, name, cell.__class__.__name__])
-- 
Gitee