diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index a86b87ce0d064bdd1019b551968ad48d9381512c..027d187cced5fcd3b302b4cad83e3d2051b3b9bd 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -677,6 +677,7 @@ class FileCheckConst: IR_SUFFIX = ".ir" ZIP_SUFFIX = ".zip" SHELL_SUFFIX = ".sh" + LOG_SUFFIX = ".log" MAX_PKL_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 MAX_NUMPY_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024 MAX_JSON_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 @@ -689,6 +690,7 @@ class FileCheckConst: MAX_FILE_IN_ZIP_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 MAX_FILE_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 COMMOM_FILE_SIZE = 1048576 # 1 * 1024 * 1024 + MAX_LOG_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024 DIR = "dir" FILE = "file" DATA_DIR_AUTHORITY = 0o750 @@ -702,7 +704,8 @@ class FileCheckConst: XLSX_SUFFIX: MAX_XLSX_SIZE, YAML_SUFFIX: MAX_YAML_SIZE, IR_SUFFIX: MAX_IR_SIZE, - ZIP_SUFFIX: MAX_ZIP_SIZE + ZIP_SUFFIX: MAX_ZIP_SIZE, + LOG_SUFFIX: MAX_LOG_SIZE } CSV_BLACK_LIST = r'^[+-=%@\+\-=%@]|;[+-=%@\+\-=%@]' diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py index 42a8d8e9cf8d4813fdb58fc6ee6f533806f2dcdd..e9f0a7fccddaead29a2a6c949165a9102c227fee 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py @@ -121,10 +121,13 @@ class DataCollector: self.call_stack_collect(name) self.handle_data(name, data_info, flush=self.data_processor.is_terminated) - except Exception: + except Exception as e: + # Use the exception class name as the error "type" for deduplication + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}", 
+ error_type=error_type ) def forward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -142,10 +145,13 @@ class DataCollector: return self.handle_data(name, data_info, flush=self.data_processor.is_terminated) - except Exception: + except Exception as e: + # Use the exception class name as the error "type" for deduplication + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def forward_data_collect_only_tensor(self, name, module, pid, module_input_output): @@ -154,10 +160,13 @@ class DataCollector: return self.data_processor.analyze_forward(name, module, module_input_output) - except Exception: + except Exception as e: + # Use the exception class name as the error "type" for deduplication + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def forward_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -173,10 +182,12 @@ class DataCollector: self.call_stack_collect(name) self.handle_data(name, data_info, flush=self.data_processor.is_terminated) - except Exception: + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def backward_data_collect_only_tensor(self, name, module, pid, module_input_output, is_recompute=None): @@ -185,10 +196,12 @@ class DataCollector: return self.data_processor.analyze_backward(name, module, module_input_output) - except Exception: + except Exception as e: + error_type = type(e).__name__ tb 
= traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def backward_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -206,10 +219,12 @@ class DataCollector: self.backward_module_names[module_name] = True self.handle_data(name, data_info, flush=self.data_processor.is_terminated) - except Exception: + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def backward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -223,10 +238,12 @@ class DataCollector: self.set_is_recomputable(data_info, is_recompute) self.handle_data(name, data_info) - except Exception: + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def backward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -240,10 +257,12 @@ class DataCollector: self.set_is_recomputable(data_info, is_recompute) self.handle_data(name, data_info) - except Exception: + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def update_construct(self, name): @@ -291,11 +310,13 @@ class DataCollector: 
return data_info = self.data_processor.analyze_params(grad_name, param_name, data) self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated) - except Exception: + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] params_data_collect failed: " - f"name={name}, param_name={param_name}, pid={pid}\n{tb}" + f"name={name}, param_name={param_name}, pid={pid}\n{tb}", + error_type=error_type ) def debug_data_collect_forward(self, variable, name_with_count): diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index f263f8ad3f94e3806b95f5fb1dcc8903c1a9c306..422ab3e2e6b279eac3dee7c3f1ab7b9aebda4615 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -46,6 +46,7 @@ class DataWriter: self.cache_debug = {} self.stat_stack_list = [] self._error_log_initialized = False + self._cache_logged_error_types = set() @staticmethod def write_data_to_csv(result: list, result_header: tuple, file_path: str): @@ -107,6 +108,7 @@ class DataWriter: self.cache_stack = {} self.cache_construct = {} self.cache_debug = {} + self._cache_logged_error_types = set() def initialize_json_file(self, **kwargs): if kwargs["level"] == Const.LEVEL_DEBUG and not self.cache_debug: @@ -147,13 +149,19 @@ class DataWriter: if length % threshold == 0: self.write_json() - def write_error_log(self, message: str): + def write_error_log(self, message: str, error_type: str = None): """ 写错误日志: - 第一次调用时以 'w' 模式清空文件,之后都用 'a' 模式追加 - 添加时间戳 - 在 message 后写入当前的调用栈(方便追踪日志来源) """ + # Deduplicate by error type; error_type=None keeps the legacy always-log behavior + if error_type is not None: + if error_type in self._cache_logged_error_types: + return + self._cache_logged_error_types.add(error_type) + try: mode = "w" if not self._error_log_initialized else "a" self._error_log_initialized = True