From 8836d00ce68106cc237d45f0106b47d63f779e3d Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Wed, 16 Jul 2025 17:17:11 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E5=8E=BB=E9=87=8D=E6=8A=A5=E9=94=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/core/common/const.py | 5 ++- .../msprobe/core/data_dump/data_collector.py | 39 ++++++++++++++----- .../msprobe/core/data_dump/json_writer.py | 9 ++++- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index a86b87ce0d..027d187cce 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -677,6 +677,7 @@ class FileCheckConst: IR_SUFFIX = ".ir" ZIP_SUFFIX = ".zip" SHELL_SUFFIX = ".sh" + LOG_SUFFIX = ".log" MAX_PKL_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 MAX_NUMPY_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024 MAX_JSON_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 @@ -689,6 +690,7 @@ class FileCheckConst: MAX_FILE_IN_ZIP_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 MAX_FILE_SIZE = 1073741824 # 1 * 1024 * 1024 * 1024 COMMOM_FILE_SIZE = 1048576 # 1 * 1024 * 1024 + MAX_LOG_SIZE = 10737418240 # 10 * 1024 * 1024 * 1024 DIR = "dir" FILE = "file" DATA_DIR_AUTHORITY = 0o750 @@ -702,7 +704,8 @@ class FileCheckConst: XLSX_SUFFIX: MAX_XLSX_SIZE, YAML_SUFFIX: MAX_YAML_SIZE, IR_SUFFIX: MAX_IR_SIZE, - ZIP_SUFFIX: MAX_ZIP_SIZE + ZIP_SUFFIX: MAX_ZIP_SIZE, + LOG_SUFFIX: MAX_LOG_SIZE } CSV_BLACK_LIST = r'^[+-=%@\+\-=%@]|;[+-=%@\+\-=%@]' diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py index 42a8d8e9cf..f3fd354779 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py @@ -122,9 +122,12 @@ class DataCollector: self.handle_data(name, data_info, 
flush=self.data_processor.is_terminated) except Exception: + # 取异常类名作为“类型”做去重 + error_type = type(Exception).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def forward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -143,9 +146,12 @@ class DataCollector: self.handle_data(name, data_info, flush=self.data_processor.is_terminated) except Exception: + # 取异常类名作为“类型”做去重 + error_type = type(Exception).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def forward_data_collect_only_tensor(self, name, module, pid, module_input_output): @@ -155,9 +161,12 @@ class DataCollector: self.data_processor.analyze_forward(name, module, module_input_output) except Exception: + # 取异常类名作为“类型”做去重 + error_type = type(Exception).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def forward_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -174,9 +183,11 @@ class DataCollector: self.handle_data(name, data_info, flush=self.data_processor.is_terminated) except Exception: + error_type = type(Exception).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def backward_data_collect_only_tensor(self, name, module, pid, 
module_input_output, is_recompute=None): @@ -186,9 +197,11 @@ class DataCollector: self.data_processor.analyze_backward(name, module, module_input_output) except Exception: + error_type = type(Exception).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def backward_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -207,9 +220,11 @@ class DataCollector: self.handle_data(name, data_info, flush=self.data_processor.is_terminated) except Exception: + error_type = type(Exception).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def backward_input_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -224,9 +239,11 @@ class DataCollector: self.handle_data(name, data_info) except Exception: + error_type = type(Exception).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def backward_output_data_collect(self, name, module, pid, module_input_output, is_recompute=None): @@ -241,9 +258,11 @@ class DataCollector: self.handle_data(name, data_info) except Exception: + error_type = type(Exception).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( - f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}" + f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}", + error_type=error_type ) def update_construct(self, name): @@ -292,10 +311,12 @@ class 
DataCollector: data_info = self.data_processor.analyze_params(grad_name, param_name, data) self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated) except Exception: + error_type = type(Exception).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] params_data_collect failed: " - f"name={name}, param_name={param_name}, pid={pid}\n{tb}" + f"name={name}, param_name={param_name}, pid={pid}\n{tb}", + error_type=error_type ) def debug_data_collect_forward(self, variable, name_with_count): diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index f263f8ad3f..0fd83a7152 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -46,6 +46,7 @@ class DataWriter: self.cache_debug = {} self.stat_stack_list = [] self._error_log_initialized = False + self._logged_error_types = set() @staticmethod def write_data_to_csv(result: list, result_header: tuple, file_path: str): @@ -147,13 +148,19 @@ class DataWriter: if length % threshold == 0: self.write_json() - def write_error_log(self, message: str): + def write_error_log(self, message: str, error_type: str): """ 写错误日志: - 第一次调用时以 'w' 模式清空文件,之后都用 'a' 模式追加 - 添加时间戳 - 在 message 后写入当前的调用栈(方便追踪日志来源) """ + # 如果同类型错误已经记录过,跳过 + if error_type in self._logged_error_types: + return + # 否则添加到已记录集合,并继续写日志 + self._logged_error_types.add(error_type) + try: mode = "w" if not self._error_log_initialized else "a" self._error_log_initialized = True -- Gitee From 64ee77c316a1fe975750110bf9b7b6be8c3dcf87 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 17 Jul 2025 17:31:24 +0800 Subject: [PATCH 2/3] Update data_collector.py --- .../msprobe/core/data_dump/data_collector.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py 
b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py index f3fd354779..e9f0a7fccd 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_collector.py @@ -121,9 +121,9 @@ class DataCollector: self.call_stack_collect(name) self.handle_data(name, data_info, flush=self.data_processor.is_terminated) - except Exception: + except Exception as e: # 取异常类名作为“类型”做去重 - error_type = type(Exception).__name__ + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] forward_input_data_collect failed: name={name}, pid={pid}\n{tb}", @@ -145,9 +145,9 @@ class DataCollector: return self.handle_data(name, data_info, flush=self.data_processor.is_terminated) - except Exception: + except Exception as e: # 取异常类名作为“类型”做去重 - error_type = type(Exception).__name__ + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] forward_output_data_collect failed: name={name}, pid={pid}\n{tb}", @@ -160,9 +160,9 @@ class DataCollector: return self.data_processor.analyze_forward(name, module, module_input_output) - except Exception: + except Exception as e: # 取异常类名作为“类型”做去重 - error_type = type(Exception).__name__ + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] forward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}", @@ -182,8 +182,8 @@ class DataCollector: self.call_stack_collect(name) self.handle_data(name, data_info, flush=self.data_processor.is_terminated) - except Exception: - error_type = type(Exception).__name__ + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] forward_data_collect failed: name={name}, pid={pid}\n{tb}", @@ -196,8 +196,8 @@ class DataCollector: return self.data_processor.analyze_backward(name, module, module_input_output) - except Exception: - error_type = 
type(Exception).__name__ + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] backward_data_collect_only_tensor failed: name={name}, pid={pid}\n{tb}", @@ -219,8 +219,8 @@ class DataCollector: self.backward_module_names[module_name] = True self.handle_data(name, data_info, flush=self.data_processor.is_terminated) - except Exception: - error_type = type(Exception).__name__ + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] backward_data_collect failed: name={name}, pid={pid}\n{tb}", @@ -238,8 +238,8 @@ class DataCollector: self.set_is_recomputable(data_info, is_recompute) self.handle_data(name, data_info) - except Exception: - error_type = type(Exception).__name__ + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] backward_input_data_collect failed: name={name}, pid={pid}\n{tb}", @@ -257,8 +257,8 @@ class DataCollector: self.set_is_recomputable(data_info, is_recompute) self.handle_data(name, data_info) - except Exception: - error_type = type(Exception).__name__ + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] backward_output_data_collect failed: name={name}, pid={pid}\n{tb}", @@ -310,8 +310,8 @@ class DataCollector: return data_info = self.data_processor.analyze_params(grad_name, param_name, data) self.handle_data(grad_name, data_info, flush=self.data_processor.is_terminated) - except Exception: - error_type = type(Exception).__name__ + except Exception as e: + error_type = type(e).__name__ tb = traceback.format_exc() self.data_writer.write_error_log( f"[ERROR] params_data_collect failed: " -- Gitee From 8a59d9ed0ad68d3e40c272aef46626f8e8001960 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 17 Jul 2025 17:38:58 +0800 Subject: [PATCH 3/3] Update 
json_writer.py --- debug/accuracy_tools/msprobe/core/data_dump/json_writer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 0fd83a7152..422ab3e2e6 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -46,7 +46,7 @@ class DataWriter: self.cache_debug = {} self.stat_stack_list = [] self._error_log_initialized = False - self._logged_error_types = set() + self._cache_logged_error_types = set() @staticmethod def write_data_to_csv(result: list, result_header: tuple, file_path: str): @@ -108,6 +108,7 @@ class DataWriter: self.cache_stack = {} self.cache_construct = {} self.cache_debug = {} + self._cache_logged_error_types = set() def initialize_json_file(self, **kwargs): if kwargs["level"] == Const.LEVEL_DEBUG and not self.cache_debug: @@ -156,10 +157,10 @@ class DataWriter: - 在 message 后写入当前的调用栈(方便追踪日志来源) """ # 如果同类型错误已经记录过,跳过 - if error_type in self._logged_error_types: + if error_type in self._cache_logged_error_types: return # 否则添加到已记录集合,并继续写日志 - self._logged_error_types.add(error_type) + self._cache_logged_error_types.add(error_type) try: mode = "w" if not self._error_log_initialized else "a" -- Gitee