From 44ea84f49e933a307c6819118c4a37c7355ab71e Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Mon, 28 Jul 2025 20:36:42 +0800 Subject: [PATCH 01/14] =?UTF-8?q?md5=E5=88=9D=E7=89=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data_processor/mindspore_processor.py | 24 ++++++++- .../msprobe/core/data_dump/json_writer.py | 50 +++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index 1e8cb322f9..f71a2c8b09 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -13,6 +13,7 @@ # limitations under the License. # ============================================================================ +import os import zlib import mindspore as ms @@ -22,6 +23,7 @@ from mindspore._c_expression.typing import Number import numpy as np from msprobe.core.common.const import Const +from concurrent.futures import ThreadPoolExecutor from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo, ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs) from msprobe.core.common.file_utils import path_len_exceeds_limit @@ -53,6 +55,15 @@ class MindsporeDataProcessor(BaseDataProcessor): } self._async_dump_cache = {} self.api_register = get_api_register() + # self._crc_executor = ThreadPoolExecutor(max_workers=4) + self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count()) + + @staticmethod + def compute_crc32_bytes(tensor_bytes): + # 纯函数,方便多进程调用 + # import zlib + print("1111") + return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod def get_md5_for_tensor(x): @@ -188,8 +199,17 @@ class MindsporeDataProcessor(BaseDataProcessor): tensor_json.update({Const.TENSOR_STAT_INDEX: placeholder_index}) if self.config.summary_mode == Const.MD5 and not self.config.async_dump: - tensor_md5 = self.get_md5_for_tensor(tensor) - tensor_json.update({Const.MD5: tensor_md5}) + # 拷贝并搬到 CPU + tensor_bytes = tensor.asnumpy().tobytes() + + future = self._crc_executor.submit( + MindsporeDataProcessor.compute_crc32_bytes, + tensor_bytes + ) + + crc_placeholder = self.data_writer.append_crc32_to_buffer(future) + tensor_json[Const.MD5_INDEX] = crc_placeholder + return tensor_json def _analyze_and_save_tensor(self, tensor, suffix): diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 0119a692a8..59844bb0bd 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -18,6 +18,7 @@ import os import copy import threading +import concurrent from msprobe.core.common.const import Const, FileCheckConst from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json from msprobe.core.common.log import logger @@ -42,6 +43,47 @@ class DataWriter: self.cache_construct = {} self.cache_debug = {} self.stat_stack_list = [] + self._error_log_initialized = False + self._cache_logged_error_types = set() + self.crc32_stack_list = [] + self._crc_flush_threshold = 50 + + def append_crc32_to_buffer(self, future: concurrent.futures.Future) -> int: + """ + 把一个计算 CRC32 的 Future 放入队列,返回占位符索引 + """ + idx = len(self.crc32_stack_list) + self.crc32_stack_list.append(future) + return idx + + def flush_crc32_stack(self): + """ + 等待所有 CRC32 计算完成,返回结果列表 + """ + if not self.crc32_stack_list: + return [] + results = [f.result() for f in self.crc32_stack_list] + self.crc32_stack_list = [] + return results + + def _replace_crc32_placeholders(self, data, crc32_results): + """ + 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 + """ + if isinstance(data, dict): + for k, v in list(data.items()): + if k == Const.MD5_INDEX and isinstance(v, int): + idx = v + # 防越界 + crc = crc32_results[idx] if idx < len(crc32_results) else None + # 删除占位符,改成真实字段 + del data[k] + data[Const.MD5] = crc + else: + self._replace_crc32_placeholders(v, crc32_results) + elif isinstance(data, (list, tuple)): + for item in data: + self._replace_crc32_placeholders(item, crc32_results) @staticmethod def write_data_to_csv(result: list, result_header: tuple, file_path: str): @@ -241,6 +283,14 @@ class DataWriter: self._replace_stat_placeholders(self.cache_data, stat_result) if self.cache_debug: self._replace_stat_placeholders(self.cache_debug, stat_result) + + # 2) 再 flush CRC32 + crc32_result = self.flush_crc32_stack() + if crc32_result: + self._replace_crc32_placeholders(self.cache_data, crc32_result) + if self.cache_debug: + self._replace_crc32_placeholders(self.cache_debug, crc32_result) + if self.cache_data: self.write_data_json(self.dump_file_path) if self.cache_stack: -- Gitee From ac58c367d2643f736917068cf4f5e430ed69d203 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 31 Jul 2025 10:21:06 +0800 Subject: [PATCH 02/14] =?UTF-8?q?md5=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data_processor/mindspore_processor.py | 4 ++-- .../data_processor/pytorch_processor.py | 23 +++++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index f71a2c8b09..25797d66b1 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -56,13 +56,13 @@ class MindsporeDataProcessor(BaseDataProcessor): self._async_dump_cache = {} self.api_register = get_api_register() # self._crc_executor = ThreadPoolExecutor(max_workers=4) - self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count()) + self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count() // 2) @staticmethod def compute_crc32_bytes(tensor_bytes): # 纯函数,方便多进程调用 # import zlib - print("1111") + return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 8ab864b232..eea3f96d1f 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import zlib from dataclasses import asdict from typing import List @@ -23,6 +24,7 @@ from torch import distributed as dist from torch.distributed.distributed_c10d import _get_default_group from msprobe.core.common.const import Const +from concurrent.futures import ThreadPoolExecutor from msprobe.core.common.exceptions import MsprobeException from msprobe.core.common.file_utils import path_len_exceeds_limit from msprobe.core.common.log import logger @@ -65,6 +67,15 @@ class PytorchDataProcessor(BaseDataProcessor): "dtype": self.analyze_dtype_in_kwargs } self._async_dump_cache = {} + self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count() // 2) + + + @staticmethod + def compute_crc32_bytes(tensor_bytes): + # 纯函数,方便多进程调用 + # import zlib + + return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod def get_md5_for_tensor(x): @@ -248,8 +259,16 @@ class PytorchDataProcessor(BaseDataProcessor): tensor_json.update({"requires_grad": tensor.requires_grad}) if self.config.summary_mode == Const.MD5 and not self.config.async_dump: - tensor_md5 = self.get_md5_for_tensor(tensor) - tensor_json.update({Const.MD5: tensor_md5}) + # 拷贝并搬到 CPU + tensor_bytes = tensor.asnumpy().tobytes() + + future = self._crc_executor.submit( + PytorchDataProcessor.compute_crc32_bytes, + tensor_bytes + ) + + crc_placeholder = self.data_writer.append_crc32_to_buffer(future) + tensor_json[Const.MD5_INDEX] = crc_placeholder return tensor_json def _analyze_and_save_tensor(self, tensor, suffix): -- Gitee From 1b7097c51ec9f6f5780fe7ca378139aa88a184d0 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 31 Jul 2025 19:51:11 +0800 Subject: [PATCH 03/14] Update pytorch_processor.py --- .../core/data_dump/data_processor/pytorch_processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index eea3f96d1f..0d828491a5 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -260,7 +260,11 @@ class PytorchDataProcessor(BaseDataProcessor): if self.config.summary_mode == Const.MD5 and not self.config.async_dump: # 拷贝并搬到 CPU - tensor_bytes = tensor.asnumpy().tobytes() + if tensor.dtype == torch.bfloat16: + tensor = tensor.float() + tensor_bytes = tensor.cpu().detach().numpy().tobytes() + + # tensor_bytes = tensor.asnumpy().tobytes() future = self._crc_executor.submit( PytorchDataProcessor.compute_crc32_bytes, -- Gitee From c0c4509e9a384e2e8da1fb3f42dcca4d85209b8f Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 31 Jul 2025 19:56:20 +0800 Subject: [PATCH 04/14] Update const.py --- debug/accuracy_tools/msprobe/core/common/const.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index 327278fd47..a6e3aba0fd 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -24,6 +24,8 @@ class Const: Class for const """ TOOL_NAME = "msprobe" + MD5_INDEX = "md5_index" + MD5 = "md5" ipv4_pattern = "([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])(\.([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])){3}$" SEP = "." -- Gitee From 65921460b219abbc5c8849c392472a7977eb4515 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Sat, 2 Aug 2025 11:13:11 +0800 Subject: [PATCH 05/14] =?UTF-8?q?md5=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/data_dump/data_processor/mindspore_processor.py | 4 ---- .../core/data_dump/data_processor/pytorch_processor.py | 5 ----- debug/accuracy_tools/msprobe/core/data_dump/json_writer.py | 1 - 3 files changed, 10 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index 25797d66b1..fd3362a500 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -55,14 +55,10 @@ class MindsporeDataProcessor(BaseDataProcessor): } self._async_dump_cache = {} self.api_register = get_api_register() - # self._crc_executor = ThreadPoolExecutor(max_workers=4) self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count() // 2) @staticmethod def compute_crc32_bytes(tensor_bytes): - # 纯函数,方便多进程调用 - # import zlib - return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 0d828491a5..2b79f03090 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -72,9 +72,6 @@ class PytorchDataProcessor(BaseDataProcessor): @staticmethod def compute_crc32_bytes(tensor_bytes): - # 纯函数,方便多进程调用 - # import zlib - return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod @@ -264,8 +261,6 @@ class PytorchDataProcessor(BaseDataProcessor): tensor = tensor.float() tensor_bytes = tensor.cpu().detach().numpy().tobytes() - # tensor_bytes = tensor.asnumpy().tobytes() - future = self._crc_executor.submit( PytorchDataProcessor.compute_crc32_bytes, tensor_bytes diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 59844bb0bd..392dbc1c23 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -46,7 +46,6 @@ class DataWriter: self._error_log_initialized = False self._cache_logged_error_types = set() self.crc32_stack_list = [] - self._crc_flush_threshold = 50 def append_crc32_to_buffer(self, future: concurrent.futures.Future) -> int: """ -- Gitee From f1e62569777c32803a420f040c69a589dec3558b Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Sat, 2 Aug 2025 15:56:06 +0800 Subject: [PATCH 06/14] cleancode --- .../data_processor/mindspore_processor.py | 2 +- .../data_processor/pytorch_processor.py | 2 +- .../msprobe/core/data_dump/json_writer.py | 74 +++++++++---------- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index fd3362a500..7bd14ba0f4 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -15,6 +15,7 @@ import os import zlib +from concurrent.futures import ThreadPoolExecutor import mindspore as ms from mindspore import mint, ops, hal @@ -23,7 +24,6 @@ from mindspore._c_expression.typing import Number import numpy as np from msprobe.core.common.const import Const -from concurrent.futures import ThreadPoolExecutor from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo, ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs) from msprobe.core.common.file_utils import path_len_exceeds_limit diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 2b79f03090..c305ee766a 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -17,6 +17,7 @@ import os import zlib from dataclasses import asdict from typing import List +from concurrent.futures import ThreadPoolExecutor import numpy as np import torch @@ -24,7 +25,6 @@ from torch import distributed as dist from torch.distributed.distributed_c10d import _get_default_group from msprobe.core.common.const import Const -from concurrent.futures import ThreadPoolExecutor from msprobe.core.common.exceptions import MsprobeException from msprobe.core.common.file_utils import path_len_exceeds_limit from msprobe.core.common.log import logger diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 392dbc1c23..ef93e2a226 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -47,43 +47,6 @@ class DataWriter: self._cache_logged_error_types = set() self.crc32_stack_list = [] - def append_crc32_to_buffer(self, future: concurrent.futures.Future) -> int: - """ - 把一个计算 CRC32 的 Future 放入队列,返回占位符索引 - """ - idx = len(self.crc32_stack_list) - self.crc32_stack_list.append(future) - return idx - - def flush_crc32_stack(self): - """ - 等待所有 CRC32 计算完成,返回结果列表 - """ - if not self.crc32_stack_list: - return [] - results = [f.result() for f in self.crc32_stack_list] - self.crc32_stack_list = [] - return results - - def _replace_crc32_placeholders(self, data, crc32_results): - """ - 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 - """ - if isinstance(data, dict): - for k, v in list(data.items()): - if k == Const.MD5_INDEX and isinstance(v, int): - idx = v - # 防越界 - crc = crc32_results[idx] if idx < len(crc32_results) else None - # 删除占位符,改成真实字段 - del data[k] - data[Const.MD5] = crc - else: - self._replace_crc32_placeholders(v, crc32_results) - elif isinstance(data, (list, tuple)): - for item in data: - self._replace_crc32_placeholders(item, crc32_results) - @staticmethod def write_data_to_csv(result: list, result_header: tuple, file_path: str): if not result: @@ -139,12 +102,49 @@ class DataWriter: for item in data: self._replace_stat_placeholders(item, stat_result) + def _replace_crc32_placeholders(self, data, crc32_results): + """ + 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 + """ + if isinstance(data, dict): + for k, v in list(data.items()): + if k == Const.MD5_INDEX and isinstance(v, int): + idx = v + # 防越界 + crc = crc32_results[idx] if idx < len(crc32_results) else None + # 删除占位符,改成真实字段 + del data[k] + data[Const.MD5] = crc + else: + self._replace_crc32_placeholders(v, crc32_results) + elif isinstance(data, (list, tuple)): + for item in data: + self._replace_crc32_placeholders(item, crc32_results) + def reset_cache(self): self.cache_data = {} self.cache_stack = {} self.cache_construct = {} self.cache_debug = {} + def append_crc32_to_buffer(self, future: concurrent.futures.Future) -> int: + """ + 把一个计算 CRC32 的 Future 放入队列,返回占位符索引 + """ + idx = len(self.crc32_stack_list) + self.crc32_stack_list.append(future) + return idx + + def flush_crc32_stack(self): + """ + 等待所有 CRC32 计算完成,返回结果列表 + """ + if not self.crc32_stack_list: + return [] + results = [f.result() for f in self.crc32_stack_list] + self.crc32_stack_list = [] + return results + def initialize_json_file(self, **kwargs): if kwargs["level"] == Const.LEVEL_DEBUG and not self.cache_debug: # debug level case only create debug.json -- Gitee From fcba020bcaff78eaa07095a7a215267a12ff26db Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Sat, 2 Aug 2025 16:00:21 +0800 Subject: [PATCH 07/14] =?UTF-8?q?=E4=BF=AE=E6=94=B9ut?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data_dump/data_processor/test_pytorch_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py index 33071c0e01..34e79defce 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py @@ -316,10 +316,10 @@ class TestPytorchDataProcessor(unittest.TestCase): 'type': 'torch.Tensor', 'dtype': str(tensor.dtype), 'shape': tensor.shape, - 'requires_grad': tensor.requires_grad, - 'md5': 'mocked_md5' + 'requires_grad': tensor.requires_grad } result.pop('tensor_stat_index', None) + result.pop('md5', None) self.assertDictEqual(expected, result) def test_analyze_tensor_with_empty_tensor(self): -- Gitee From 537c7fc3044531ee6a09ba8b23bb5abc2fd7d6b9 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Mon, 4 Aug 2025 09:16:13 +0800 Subject: [PATCH 08/14] Update test_pytorch_processor.py --- .../core_ut/data_dump/data_processor/test_pytorch_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py index 34e79defce..7bb081e52b 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py @@ -319,7 +319,7 @@ class TestPytorchDataProcessor(unittest.TestCase): 'requires_grad': tensor.requires_grad } result.pop('tensor_stat_index', None) - result.pop('md5', None) + result.pop('md5_index', None) self.assertDictEqual(expected, result) def test_analyze_tensor_with_empty_tensor(self): -- Gitee From 6e6ad792f51eb5ca99f6f72db326505489863366 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Mon, 4 Aug 2025 09:27:49 +0800 Subject: [PATCH 09/14] 1 --- .../msprobe/core/data_dump/json_writer.py | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index ef93e2a226..3d57be4d40 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -102,25 +102,6 @@ class DataWriter: for item in data: self._replace_stat_placeholders(item, stat_result) - def _replace_crc32_placeholders(self, data, crc32_results): - """ - 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 - """ - if isinstance(data, dict): - for k, v in list(data.items()): - if k == Const.MD5_INDEX and isinstance(v, int): - idx = v - # 防越界 - crc = crc32_results[idx] if idx < len(crc32_results) else None - # 删除占位符,改成真实字段 - del data[k] - data[Const.MD5] = crc - else: - self._replace_crc32_placeholders(v, crc32_results) - elif isinstance(data, (list, tuple)): - for item in data: - self._replace_crc32_placeholders(item, crc32_results) - def reset_cache(self): self.cache_data = {} self.cache_stack = {} @@ -299,3 +280,21 @@ class DataWriter: if self.cache_debug: self.write_debug_info_json(self.debug_file_path) + def _replace_crc32_placeholders(self, data, crc32_results): + """ + 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 + """ + if isinstance(data, dict): + for k, v in list(data.items()): + if k == Const.MD5_INDEX and isinstance(v, int): + idx = v + # 防越界 + crc = crc32_results[idx] if idx < len(crc32_results) else None + # 删除占位符,改成真实字段 + del data[k] + data[Const.MD5] = crc + else: + self._replace_crc32_placeholders(v, crc32_results) + elif isinstance(data, (list, tuple)): + for item in data: + self._replace_crc32_placeholders(item, crc32_results) -- Gitee From 185064ad991c60a0393dcd96c78c0247a50d1b46 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Mon, 4 Aug 2025 14:19:11 +0800 Subject: [PATCH 10/14] Update test_mindspore_processor.py --- .../data_dump/data_processor/test_mindspore_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_mindspore_processor.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_mindspore_processor.py index 25141a9e77..99dc1d5eee 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_mindspore_processor.py @@ -138,12 +138,12 @@ class TestMindsporeDataProcessor(unittest.TestCase): expected_result = { 'type': 'mindspore.Tensor', 'dtype': 'Int32', - 'shape': (3,), - 'md5': 'test_md5', + 'shape': (3,) } result = self.processor._analyze_tensor(tensor, suffix) # 删除不必要的字段 result.pop('tensor_stat_index', None) + result.pop('md5_index', None) self.assertEqual(result, expected_result) -- Gitee From 0ea910d8b786f86b066020833a991cd30363cd79 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Mon, 4 Aug 2025 19:47:06 +0800 Subject: [PATCH 11/14] Update mindspore_processor.py --- .../msprobe/core/data_dump/data_processor/mindspore_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index 7bd14ba0f4..bc72d32b6a 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -195,6 +195,7 @@ class MindsporeDataProcessor(BaseDataProcessor): tensor_json.update({Const.TENSOR_STAT_INDEX: placeholder_index}) if self.config.summary_mode == Const.MD5 and not self.config.async_dump: + tensor = convert_bf16_to_fp32(tensor) # 拷贝并搬到 CPU tensor_bytes = tensor.asnumpy().tobytes() -- Gitee From 2e8f01faba1877f9e0e7231a08bdb81e5693fcb8 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Tue, 5 Aug 2025 14:32:56 +0800 Subject: [PATCH 12/14] =?UTF-8?q?MD5=E9=98=B2=E6=AD=A2host=E6=BA=A2?= =?UTF-8?q?=E5=87=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/core/data_dump/data_processor/base.py | 1 + debug/accuracy_tools/msprobe/core/data_dump/json_writer.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index b188fe5cbe..57fb5842c3 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -94,6 +94,7 @@ class BaseDataProcessor: def __init__(self, config, data_writer): self.data_writer = data_writer self.config = config + self.data_writer.config = config self.api_info_struct = {} self.stack_info_struct = {} self.current_api_or_module_name = None diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 3d57be4d40..0bdcd82b8d 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -159,7 +159,10 @@ class DataWriter: length = len(dump_data) - threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size + if self.config.summary_mode == Const.MD5: + threshold = self.flush_size + else: + threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size if length % threshold == 0: self.write_json() -- Gitee From 0d263d94ea2d98774c5e25d159c195f23afd38db Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Tue, 5 Aug 2025 15:43:01 +0800 Subject: [PATCH 13/14] Update json_writer.py --- debug/accuracy_tools/msprobe/core/data_dump/json_writer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 0bdcd82b8d..31f5fc5cc1 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -159,7 +159,12 @@ class DataWriter: length = len(dump_data) - if self.config.summary_mode == Const.MD5: + # 1) 先取到 config(如果没有,就拿 None) + cfg = getattr(self, "config", None) + # 2) 再取 summary_mode(如果 cfg 是 None 或者没 summary_mode,就拿 None) + summary_mode = getattr(cfg, "summary_mode", None) + + if summary_mode == Const.MD5: threshold = self.flush_size else: threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size -- Gitee From da8acb3c1dd793fdc397f2e31f1957b603e1fc2f Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Tue, 5 Aug 2025 16:41:47 +0800 Subject: [PATCH 14/14] Update base.py --- .../msprobe/core/data_dump/data_processor/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index 57fb5842c3..c43dc9deee 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -94,7 +94,8 @@ class BaseDataProcessor: def __init__(self, config, data_writer): self.data_writer = data_writer self.config = config - self.data_writer.config = config + if self.data_writer is not None: + self.data_writer.config = config self.api_info_struct = {} self.stack_info_struct = {} self.current_api_or_module_name = None -- Gitee