From 6795e67e56d27c4a4c58e327622db43e518c2b10 Mon Sep 17 00:00:00 2001 From: fuchao <1501312275@qq.com> Date: Sat, 18 Jan 2025 12:50:52 +0800 Subject: [PATCH] =?UTF-8?q?=E9=9D=99=E6=80=81=E5=9B=BEcell=20dump=E9=87=87?= =?UTF-8?q?=E7=94=A8=E6=96=B0=E6=96=B9=E6=A1=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../mindspore/dump/cell_dump_process.py | 135 +++++++++++------- 1 file changed, 84 insertions(+), 51 deletions(-) diff --git a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py index ea7cfd801..0df397d03 100644 --- a/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py +++ b/debug/accuracy_tools/msprobe/mindspore/dump/cell_dump_process.py @@ -33,6 +33,12 @@ CONSTRUCT_FILE_NAME = "construct.json" DEFAULT_RANK_DIR = "rank0" KEY_LAYERS = "layers" construct = {} +cell_list = [] +KEY_SIDE_EFFECT = "side_effect_io" +td = ops.TensorDump() +td_in = ops.TensorDump("in") +td.add_prim_attr(KEY_SIDE_EFFECT, False) +td_in.add_prim_attr(KEY_SIDE_EFFECT, False) np_ms_dtype_dict = { "bool": ms.bool_, "int8": ms.int8, @@ -62,6 +68,7 @@ np_ms_dtype_dict = { "complex128": ms.complex128 } + def generate_file_path(dump_path, cell_prefix, suffix, io_type, index): step_path = os.path.join(dump_path, "{step}") rank_path = os.path.join(step_path, "{rank}") @@ -78,24 +85,30 @@ def partial_func(func, dump_path, cell_prefix, index, io_type): def clip_gradient(dump_path, cell_prefix, index, io_type, dx): if io_type == CoreConst.OUTPUT: - ops.TensorDump()(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx) + temp = td(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx) + dx = ops.depend(dx, temp) if io_type == CoreConst.INPUT: - ops.TensorDump("in")(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx) + temp = td_in(generate_file_path(dump_path, cell_prefix, CoreConst.BACKWARD, io_type, index), dx) + dx = ops.depend(dx, temp) return dx def cell_construct_wrapper(func, self): def new_construct(self, *args, **kwargs): new_args = [] + out_list = [] + index = 0 + item = None # The inputs of the cell. for index, item in enumerate(args): if self.data_mode == "backward" or self.data_mode == "all": if ops.is_tensor(item): - item = self.input_clips[index](item) + item = self.output_clips[index](item) if self.data_mode == "forward" or self.data_mode == "all": if ops.is_tensor(item): - ops.TensorDump("in")(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.INPUT, index), item) + temp = td_in(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.INPUT, index), item) + item = ops.depend(item, temp) new_args.append(item) out = func(*new_args, **kwargs) @@ -105,17 +118,24 @@ def cell_construct_wrapper(func, self): for index, item in enumerate(out): if self.data_mode == "backward" or self.data_mode == "all": if ops.is_tensor(item): - item = self.output_clips[index](item) + item = self.input_clips[index](item) if self.data_mode == "forward" or self.data_mode == "all": if ops.is_tensor(item): - ops.TensorDump()(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), item) + temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), item) + item = ops.depend(item, temp) + out_list.append(item) + else: + out_list.append(item) + out_list = tuple(out_list) + return out_list else: if self.data_mode == "backward" or self.data_mode == "all": - out = self.output_clips[0](out) + out = self.input_clips[0](out) if self.data_mode == "forward" or self.data_mode == "all": - ops.TensorDump()(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, 0), out) - - return out + if ops.is_tensor(out): + temp = td(generate_file_path(self.dump_path, self.cell_prefix, CoreConst.FORWARD, CoreConst.OUTPUT, index), out) + out = ops.depend(out, temp) + return out return new_construct.__get__(self, type(self)) @@ -128,7 +148,7 @@ def sort_filenames(path): return filenames -# 删除多余dump的文件 +# 删除重复dump的文件:自定义文件名相同,并且数据相同。 def del_same_file(path, filenames): result_list = [] seen_prefixes = {} @@ -158,34 +178,22 @@ def rename_filename(path): filename_dict = {} for filename in filenames: - match = re.search(rf"{CoreConst.CELL}{CoreConst.SEP}(.*?){CoreConst.SEP}({CoreConst.INPUT}|{CoreConst.OUTPUT}){CoreConst.SEP}", filename) - mid_field = match.group(1) - - if mid_field in filename_dict: - filename_dict[mid_field].append(filename) + name_field = filename.rsplit(CoreConst.REPLACEMENT_CHARACTER, 1)[0] + + if name_field in filename_dict: + filename_dict[name_field] += 1 else: - filename_dict[mid_field] = [filename] - - # Change the file name and add the sequence number of the cell that is repeatedly called. - for _, filename_list in filename_dict.items(): - last_second_index = filename_list[0].rfind(CoreConst.REPLACEMENT_CHARACTER, 0, filename_list[0].rfind(CoreConst.REPLACEMENT_CHARACTER)) - first_file_sub = filename_list[0][:last_second_index] - index_list = [] - for index, filename in enumerate(filename_list): - if first_file_sub in filename: - index_list.append(index) - index_list.append(len(filename_list)) - - for i in range(len(index_list) - 1): - start_index = index_list[i] - end_index = index_list[i + 1] - for j in range(start_index, end_index): - if CoreConst.FORWARD_PATTERN in filename_list[j]: - #Format: Cell.{cell_name}.{class_name}.{forward/backward}.{number}.{input/output}.{index}_{dtype}_{id}.npy - newFileName = filename_list[j].replace(CoreConst.FORWARD_PATTERN, CoreConst.FORWARD_PATTERN + str(i) + CoreConst.SEP) - if CoreConst.BACKWARD_PATTERN in filename_list[j]: - newFileName = filename_list[j].replace(CoreConst.BACKWARD_PATTERN, CoreConst.BACKWARD_PATTERN + str(i) + CoreConst.SEP) - os.rename(os.path.join(path,filename_list[j]), os.path.join(path,newFileName)) + filename_dict[name_field] = 0 + + cell_index = filename_dict[name_field] + + # Change the file name and add the sequence number of the cell that is repeatedly called. + if CoreConst.FORWARD_PATTERN in filename: + #Format: Cell.{cell_name}.{class_name}.{forward/backward}.{number}.{input/output}.{index}_{dtype}_{id}.npy + newFileName = filename.replace(CoreConst.FORWARD_PATTERN, CoreConst.FORWARD_PATTERN + str(cell_index) + CoreConst.SEP) + if CoreConst.BACKWARD_PATTERN in filename: + newFileName = filename.replace(CoreConst.BACKWARD_PATTERN, CoreConst.BACKWARD_PATTERN + str(cell_index) + CoreConst.SEP) + os.rename(os.path.join(path, filename), os.path.join(path, newFileName)) logger.info(f"==========The rename_filename phase is Finished!==========") @@ -224,13 +232,12 @@ def check_relation(cell_name, parent_cell_name): return False -def get_construct(cell_list): - parent_cell_stack = [] - for cell in cell_list: +def get_construct(cell_list_input): + for cell in cell_list_input: cell_name = get_cell_name(cell) cell_data_mode = get_data_mode(cell) found_flag = False - for parent_cell in reversed(parent_cell_stack): + for parent_cell in cell_list_input: parent_cell_name = get_cell_name(parent_cell) parent_data_mode = get_data_mode(parent_cell) has_relation = check_relation(cell_name, parent_cell_name) @@ -240,7 +247,6 @@ def get_construct(cell_list): break if not found_flag: construct.update({cell: None}) - parent_cell_stack.append(cell) def generate_construct(path): @@ -248,11 +254,23 @@ def generate_construct(path): filenames = sort_filenames(path) # 提取文件名中Cell.{cell_name}.{class_name}.{data_mode}.{重复调用此cell的序号}字段,并存入cell_list - cell_list = [] for filename in filenames: - match = re.search(rf"({CoreConst.CELL}{CoreConst.SEP}).*?(?={CoreConst.SEP}(?:{CoreConst.INPUT}{CoreConst.SEP}|{CoreConst.OUTPUT}{CoreConst.SEP}))", filename) - mid_field = match.group(0) - cell_list.append(mid_field) + point_position = 3 + mid_field = filename.rsplit(CoreConst.SEP, point_position)[0] + if CoreConst.INPUT in filename: + if mid_field in cell_list: + cell_list.remove(mid_field) + cell_list.append(mid_field) + else: + if mid_field not in cell_list: + index = filenames.index(filename) + output_field = mid_field + CoreConst.OUTPUT + find_flag = False + for filename_other in cell_list[index + 1:]: + if output_field in filename_other: + find_flag = True + if find_flag is False: + cell_list.append(mid_field) get_construct(cell_list) @@ -308,6 +326,11 @@ def process_file(file_path): return None, None, None +def custom_sort(item, key_to_index): + key = item[0] + return key_to_index.get(key, float('inf')) + + def generate_dump_info(path): if not os.path.exists(path): logger.error("The provided path does not exist.") @@ -317,12 +340,10 @@ def generate_dump_info(path): with Pool(processes=10) as pool: file_paths = [] - # 传入的path为工具生成的./dump_tensor_data,内容为npy文件 for root, _, files in os.walk(path): for file in files: if file.endswith(FileCheckConst.NUMPY_SUFFIX): file_paths.append((os.path.join(root, file),)) - file_paths.sort() results = pool.starmap(process_file, file_paths) # 收集结果 @@ -336,6 +357,12 @@ def generate_dump_info(path): dump_data.get(CoreConst.DATA, {}).get(op_name, {})[key] = [] dump_data.get(CoreConst.DATA, {}).get(op_name, {}).get(key, []).append(tensor_json) + # 根据cell_list排序 + data_dict = dump_data.get(CoreConst.DATA, {}) + key_to_index = {key: index for index, key in enumerate(cell_list)} + sorted_data_dict = dict(sorted(data_dict.items(), key=lambda item: custom_sort(item, key_to_index))) + dump_data[CoreConst.DATA] = sorted_data_dict + # 将数据写入dump.json json_path = os.path.join(os.path.dirname(path), 'dump.json') save_json(json_path, dump_data, indent=1) @@ -360,10 +387,10 @@ def generate_stack_info(path): # 文件名举例:Cell.network._backbone.loss.CrossEntropyLoss.forward.0.input.0_float32_165.npy parts = os.path.basename(file_path).split(CoreConst.SEP) # op_name是Cell.network._backbone.loss.CrossEntropyLoss.forward.0 - op_name = CoreConst.SEP.join(parts[:-3]) # 获取所需的部分路径 + op_name = CoreConst.SEP.join(parts[:-3]) stack_data.update({op_name: []}) - # 将数据写入dump.json + # 将数据写入stack.json json_path = os.path.join(os.path.dirname(path), 'stack.json') save_json(json_path, stack_data, indent=1) @@ -395,6 +422,12 @@ def start(net=None, dump_path="./", data_mode=CoreConst.ALL): for name, cell in net.cells_and_names(): if name == "": continue + + # 跳过框架内部的cell + class_name = cell.__class__.__name__ + if class_name.startswith(CoreConst.REPLACEMENT_CHARACTER): + logger.warning(f"Cell {name}.{class_name} is skipped!") + continue else: #Format: Cell.{cell_name}.{class_name} cell.cell_prefix = CoreConst.SEP.join([CoreConst.CELL, name, cell.__class__.__name__]) -- Gitee