From 3dd012f6dcba6cd87d460b82f50be13420dcee14 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Mon, 28 Jul 2025 20:36:42 +0800 Subject: [PATCH 01/13] =?UTF-8?q?md5=E5=88=9D=E7=89=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data_processor/mindspore_processor.py | 24 ++++++++- .../msprobe/core/data_dump/json_writer.py | 50 +++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index 1e8cb322f9..f71a2c8b09 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -13,6 +13,7 @@ # limitations under the License. # ============================================================================ +import os import zlib import mindspore as ms @@ -22,6 +23,7 @@ from mindspore._c_expression.typing import Number import numpy as np from msprobe.core.common.const import Const +from concurrent.futures import ThreadPoolExecutor from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo, ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs) from msprobe.core.common.file_utils import path_len_exceeds_limit @@ -53,6 +55,15 @@ class MindsporeDataProcessor(BaseDataProcessor): } self._async_dump_cache = {} self.api_register = get_api_register() + # self._crc_executor = ThreadPoolExecutor(max_workers=4) + self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count()) + + @staticmethod + def compute_crc32_bytes(tensor_bytes): + # 纯函数,方便多进程调用 + # import zlib + print("1111") + return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod def get_md5_for_tensor(x): @@ -188,8 +199,17 @@ class MindsporeDataProcessor(BaseDataProcessor): tensor_json.update({Const.TENSOR_STAT_INDEX: placeholder_index}) if self.config.summary_mode == Const.MD5 and not self.config.async_dump: - tensor_md5 = self.get_md5_for_tensor(tensor) - tensor_json.update({Const.MD5: tensor_md5}) + # 拷贝并搬到 CPU + tensor_bytes = tensor.asnumpy().tobytes() + + future = self._crc_executor.submit( + MindsporeDataProcessor.compute_crc32_bytes, + tensor_bytes + ) + + crc_placeholder = self.data_writer.append_crc32_to_buffer(future) + tensor_json[Const.MD5_INDEX] = crc_placeholder + return tensor_json def _analyze_and_save_tensor(self, tensor, suffix): diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 2f8ef29e40..71961e7d0e 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -18,6 +18,7 @@ import os import copy import threading +import concurrent from msprobe.core.common.const import Const, FileCheckConst from msprobe.core.common.file_utils import change_mode, FileOpen, save_json, load_json from msprobe.core.common.log import logger @@ -42,6 +43,47 @@ class DataWriter: self.cache_construct = {} self.cache_debug = {} self.stat_stack_list = [] + self._error_log_initialized = False + self._cache_logged_error_types = set() + self.crc32_stack_list = [] + self._crc_flush_threshold = 50 + + def append_crc32_to_buffer(self, future: concurrent.futures.Future) -> int: + """ + 把一个计算 CRC32 的 Future 放入队列,返回占位符索引 + """ + idx = len(self.crc32_stack_list) + self.crc32_stack_list.append(future) + return idx + + def flush_crc32_stack(self): + """ + 等待所有 CRC32 计算完成,返回结果列表 + """ + if not self.crc32_stack_list: + return [] + results = [f.result() for f in self.crc32_stack_list] + self.crc32_stack_list = [] + return results + + def _replace_crc32_placeholders(self, data, crc32_results): + """ + 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 + """ + if isinstance(data, dict): + for k, v in list(data.items()): + if k == Const.MD5_INDEX and isinstance(v, int): + idx = v + # 防越界 + crc = crc32_results[idx] if idx < len(crc32_results) else None + # 删除占位符,改成真实字段 + del data[k] + data[Const.MD5] = crc + else: + self._replace_crc32_placeholders(v, crc32_results) + elif isinstance(data, (list, tuple)): + for item in data: + self._replace_crc32_placeholders(item, crc32_results) @staticmethod def write_data_to_csv(result: list, result_header: tuple, file_path: str): @@ -241,6 +283,14 @@ class DataWriter: self._replace_stat_placeholders(self.cache_data, stat_result) if self.cache_debug: self._replace_stat_placeholders(self.cache_debug, stat_result) + + # 2) 再 flush CRC32 + crc32_result = self.flush_crc32_stack() + if crc32_result: + self._replace_crc32_placeholders(self.cache_data, crc32_result) + if self.cache_debug: + self._replace_crc32_placeholders(self.cache_debug, crc32_result) + if self.cache_data: self.write_data_json(self.dump_file_path) if self.cache_stack: -- Gitee From aa26986893e8ebf2d25a8952ed4bde6b473763c6 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 31 Jul 2025 10:21:06 +0800 Subject: [PATCH 02/13] =?UTF-8?q?md5=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data_processor/mindspore_processor.py | 4 ++-- .../data_processor/pytorch_processor.py | 23 +++++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index f71a2c8b09..25797d66b1 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -56,13 +56,13 @@ class MindsporeDataProcessor(BaseDataProcessor): self._async_dump_cache = {} self.api_register = get_api_register() # self._crc_executor = ThreadPoolExecutor(max_workers=4) - self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count()) + self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count() // 2) @staticmethod def compute_crc32_bytes(tensor_bytes): # 纯函数,方便多进程调用 # import zlib - print("1111") + return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 9a4c5ee964..9a71b639c5 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import zlib from dataclasses import asdict from typing import List @@ -23,6 +24,7 @@ from torch import distributed as dist from torch.distributed.distributed_c10d import _get_default_group from msprobe.core.common.const import Const +from concurrent.futures import ThreadPoolExecutor from msprobe.core.common.exceptions import MsprobeException from msprobe.core.common.file_utils import path_len_exceeds_limit from msprobe.core.common.log import logger @@ -65,6 +67,15 @@ class PytorchDataProcessor(BaseDataProcessor): "dtype": self.analyze_dtype_in_kwargs } self._async_dump_cache = {} + self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count() // 2) + + + @staticmethod + def compute_crc32_bytes(tensor_bytes): + # 纯函数,方便多进程调用 + # import zlib + + return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod def get_md5_for_tensor(x): @@ -249,8 +260,16 @@ class PytorchDataProcessor(BaseDataProcessor): tensor_json.update({"requires_grad": tensor.requires_grad}) if self.config.summary_mode == Const.MD5 and not self.config.async_dump: - tensor_md5 = self.get_md5_for_tensor(tensor) - tensor_json.update({Const.MD5: tensor_md5}) + # 拷贝并搬到 CPU + tensor_bytes = tensor.asnumpy().tobytes() + + future = self._crc_executor.submit( + PytorchDataProcessor.compute_crc32_bytes, + tensor_bytes + ) + + crc_placeholder = self.data_writer.append_crc32_to_buffer(future) + tensor_json[Const.MD5_INDEX] = crc_placeholder return tensor_json def _analyze_and_save_tensor(self, tensor, suffix): -- Gitee From 88eb5fc2d02b71fef03af04084348331dd5f2f0a Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 31 Jul 2025 19:51:11 +0800 Subject: [PATCH 03/13] Update pytorch_processor.py --- .../core/data_dump/data_processor/pytorch_processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 9a71b639c5..1bd7ed0ff6 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -261,7 +261,11 @@ class PytorchDataProcessor(BaseDataProcessor): if self.config.summary_mode == Const.MD5 and not self.config.async_dump: # 拷贝并搬到 CPU - tensor_bytes = tensor.asnumpy().tobytes() + if tensor.dtype == torch.bfloat16: + tensor = tensor.float() + tensor_bytes = tensor.cpu().detach().numpy().tobytes() + + # tensor_bytes = tensor.asnumpy().tobytes() future = self._crc_executor.submit( PytorchDataProcessor.compute_crc32_bytes, -- Gitee From 5cab61c9ccdc2220db03fb8f771f6ec52ec3bbaf Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 31 Jul 2025 19:56:20 +0800 Subject: [PATCH 04/13] Update const.py --- debug/accuracy_tools/msprobe/core/common/const.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index e8b5814e11..fada928064 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -24,6 +24,8 @@ class Const: Class for const """ TOOL_NAME = "msprobe" + MD5_INDEX = "md5_index" + MD5 = "md5" ipv4_pattern = "([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])(\.([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])){3}$" SEP = "." -- Gitee From d933978de04c890f5c1efb272d5dd7f7c03d1764 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Sat, 2 Aug 2025 11:13:11 +0800 Subject: [PATCH 05/13] =?UTF-8?q?md5=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/data_dump/data_processor/mindspore_processor.py | 4 ---- .../core/data_dump/data_processor/pytorch_processor.py | 5 ----- debug/accuracy_tools/msprobe/core/data_dump/json_writer.py | 1 - 3 files changed, 10 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index 25797d66b1..fd3362a500 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -55,14 +55,10 @@ class MindsporeDataProcessor(BaseDataProcessor): } self._async_dump_cache = {} self.api_register = get_api_register() - # self._crc_executor = ThreadPoolExecutor(max_workers=4) self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count() // 2) @staticmethod def compute_crc32_bytes(tensor_bytes): - # 纯函数,方便多进程调用 - # import zlib - return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 1bd7ed0ff6..90ae4ba603 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -72,9 +72,6 @@ class PytorchDataProcessor(BaseDataProcessor): @staticmethod def compute_crc32_bytes(tensor_bytes): - # 纯函数,方便多进程调用 - # import zlib - return f"{zlib.crc32(tensor_bytes):08x}" @staticmethod @@ -265,8 +262,6 @@ class PytorchDataProcessor(BaseDataProcessor): tensor = tensor.float() tensor_bytes = tensor.cpu().detach().numpy().tobytes() - # tensor_bytes = tensor.asnumpy().tobytes() - future = self._crc_executor.submit( PytorchDataProcessor.compute_crc32_bytes, tensor_bytes diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 71961e7d0e..2a212ed768 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -46,7 +46,6 @@ class DataWriter: self._error_log_initialized = False self._cache_logged_error_types = set() self.crc32_stack_list = [] - self._crc_flush_threshold = 50 def append_crc32_to_buffer(self, future: concurrent.futures.Future) -> int: """ -- Gitee From 27339090b746aaf9c58d74185329693ec92354a9 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Sat, 2 Aug 2025 15:56:06 +0800 Subject: [PATCH 06/13] cleancode --- .../data_processor/mindspore_processor.py | 2 +- .../data_processor/pytorch_processor.py | 2 +- .../msprobe/core/data_dump/json_writer.py | 74 +++++++++---------- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index fd3362a500..7bd14ba0f4 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -15,6 +15,7 @@ import os import zlib +from concurrent.futures import ThreadPoolExecutor import mindspore as ms from mindspore import mint, ops, hal @@ -23,7 +24,6 @@ from mindspore._c_expression.typing import Number import numpy as np from msprobe.core.common.const import Const -from concurrent.futures import ThreadPoolExecutor from msprobe.core.data_dump.data_processor.base import (BaseDataProcessor, TensorStatInfo, ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs) from msprobe.core.common.file_utils import path_len_exceeds_limit diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 90ae4ba603..2f6ab80d71 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -17,6 +17,7 @@ import os import zlib from dataclasses import asdict from typing import List +from concurrent.futures import ThreadPoolExecutor import numpy as np import torch @@ -24,7 +25,6 @@ from torch import distributed as dist from torch.distributed.distributed_c10d import _get_default_group from msprobe.core.common.const import Const -from concurrent.futures import ThreadPoolExecutor from msprobe.core.common.exceptions import MsprobeException from msprobe.core.common.file_utils import path_len_exceeds_limit from msprobe.core.common.log import logger diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 2a212ed768..be5051b311 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -47,43 +47,6 @@ class DataWriter: self._cache_logged_error_types = set() self.crc32_stack_list = [] - def append_crc32_to_buffer(self, future: concurrent.futures.Future) -> int: - """ - 把一个计算 CRC32 的 Future 放入队列,返回占位符索引 - """ - idx = len(self.crc32_stack_list) - self.crc32_stack_list.append(future) - return idx - - def flush_crc32_stack(self): - """ - 等待所有 CRC32 计算完成,返回结果列表 - """ - if not self.crc32_stack_list: - return [] - results = [f.result() for f in self.crc32_stack_list] - self.crc32_stack_list = [] - return results - - def _replace_crc32_placeholders(self, data, crc32_results): - """ - 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 - """ - if isinstance(data, dict): - for k, v in list(data.items()): - if k == Const.MD5_INDEX and isinstance(v, int): - idx = v - # 防越界 - crc = crc32_results[idx] if idx < len(crc32_results) else None - # 删除占位符,改成真实字段 - del data[k] - data[Const.MD5] = crc - else: - self._replace_crc32_placeholders(v, crc32_results) - elif isinstance(data, (list, tuple)): - for item in data: - self._replace_crc32_placeholders(item, crc32_results) - @staticmethod def write_data_to_csv(result: list, result_header: tuple, file_path: str): if not result: @@ -139,12 +102,49 @@ class DataWriter: for item in data: self._replace_stat_placeholders(item, stat_result) + def _replace_crc32_placeholders(self, data, crc32_results): + """ + 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 + """ + if isinstance(data, dict): + for k, v in list(data.items()): + if k == Const.MD5_INDEX and isinstance(v, int): + idx = v + # 防越界 + crc = crc32_results[idx] if idx < len(crc32_results) else None + # 删除占位符,改成真实字段 + del data[k] + data[Const.MD5] = crc + else: + self._replace_crc32_placeholders(v, crc32_results) + elif isinstance(data, (list, tuple)): + for item in data: + self._replace_crc32_placeholders(item, crc32_results) + def reset_cache(self): self.cache_data = {} self.cache_stack = {} self.cache_construct = {} self.cache_debug = {} + def append_crc32_to_buffer(self, future: concurrent.futures.Future) -> int: + """ + 把一个计算 CRC32 的 Future 放入队列,返回占位符索引 + """ + idx = len(self.crc32_stack_list) + self.crc32_stack_list.append(future) + return idx + + def flush_crc32_stack(self): + """ + 等待所有 CRC32 计算完成,返回结果列表 + """ + if not self.crc32_stack_list: + return [] + results = [f.result() for f in self.crc32_stack_list] + self.crc32_stack_list = [] + return results + def initialize_json_file(self, **kwargs): if kwargs["level"] == Const.LEVEL_DEBUG and not self.cache_debug: # debug level case only create debug.json -- Gitee From 9aa89768e492aeb2fece52e3a94b70edad82d55d Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Sat, 2 Aug 2025 16:00:21 +0800 Subject: [PATCH 07/13] =?UTF-8?q?=E4=BF=AE=E6=94=B9ut?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data_dump/data_processor/test_pytorch_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py index 33071c0e01..34e79defce 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py @@ -316,10 +316,10 @@ class TestPytorchDataProcessor(unittest.TestCase): 'type': 'torch.Tensor', 'dtype': str(tensor.dtype), 'shape': tensor.shape, - 'requires_grad': tensor.requires_grad, - 'md5': 'mocked_md5' + 'requires_grad': tensor.requires_grad } result.pop('tensor_stat_index', None) + result.pop('md5', None) self.assertDictEqual(expected, result) def test_analyze_tensor_with_empty_tensor(self): -- Gitee From 4246feae14b1a86f7c8ea0c43198a7ddbe09db87 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Mon, 4 Aug 2025 09:16:13 +0800 Subject: [PATCH 08/13] Update test_pytorch_processor.py --- .../core_ut/data_dump/data_processor/test_pytorch_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py index 34e79defce..7bb081e52b 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py @@ -319,7 +319,7 @@ class TestPytorchDataProcessor(unittest.TestCase): 'requires_grad': tensor.requires_grad } result.pop('tensor_stat_index', None) - result.pop('md5', None) + result.pop('md5_index', None) self.assertDictEqual(expected, result) def test_analyze_tensor_with_empty_tensor(self): -- Gitee From ea2e00edaa7ad771e00a5fd392745f827150a82c Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Mon, 4 Aug 2025 09:27:49 +0800 Subject: [PATCH 09/13] 1 --- .../msprobe/core/data_dump/json_writer.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index be5051b311..6860a37fe1 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -102,25 +102,6 @@ class DataWriter: for item in data: self._replace_stat_placeholders(item, stat_result) - def _replace_crc32_placeholders(self, data, crc32_results): - """ - 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 - """ - if isinstance(data, dict): - for k, v in list(data.items()): - if k == Const.MD5_INDEX and isinstance(v, int): - idx = v - # 防越界 - crc = crc32_results[idx] if idx < len(crc32_results) else None - # 删除占位符,改成真实字段 - del data[k] - data[Const.MD5] = crc - else: - self._replace_crc32_placeholders(v, crc32_results) - elif isinstance(data, (list, tuple)): - for item in data: - self._replace_crc32_placeholders(item, crc32_results) - def reset_cache(self): self.cache_data = {} self.cache_stack = {} @@ -298,3 +279,22 @@ class DataWriter: self.write_construct_info_json(self.construct_file_path) if self.cache_debug: self.write_debug_info_json(self.debug_file_path) + + def _replace_crc32_placeholders(self, data, crc32_results): + """ + 遍历 JSON 结构,将所有 md5_index 占位符替换成真实的 CRC32 + """ + if isinstance(data, dict): + for k, v in list(data.items()): + if k == Const.MD5_INDEX and isinstance(v, int): + idx = v + # 防越界 + crc = crc32_results[idx] if idx < len(crc32_results) else None + # 删除占位符,改成真实字段 + del data[k] + data[Const.MD5] = crc + else: + self._replace_crc32_placeholders(v, crc32_results) + elif isinstance(data, (list, tuple)): + for item in data: + self._replace_crc32_placeholders(item, crc32_results) -- Gitee From a34428017595873b90eefb34fa62a16f2572b4ce Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Mon, 4 Aug 2025 19:47:06 +0800 Subject: [PATCH 10/13] Update mindspore_processor.py --- .../msprobe/core/data_dump/data_processor/mindspore_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py index 7bd14ba0f4..bc72d32b6a 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/mindspore_processor.py @@ -195,6 +195,7 @@ class MindsporeDataProcessor(BaseDataProcessor): tensor_json.update({Const.TENSOR_STAT_INDEX: placeholder_index}) if self.config.summary_mode == Const.MD5 and not self.config.async_dump: + tensor = convert_bf16_to_fp32(tensor) # 拷贝并搬到 CPU tensor_bytes = tensor.asnumpy().tobytes() -- Gitee From fd7e6b3f9a889c4fddd30628bb2d7689ab818fc7 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Tue, 5 Aug 2025 14:32:56 +0800 Subject: [PATCH 11/13] =?UTF-8?q?MD5=E9=98=B2=E6=AD=A2host=E6=BA=A2?= =?UTF-8?q?=E5=87=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/core/data_dump/data_processor/base.py | 1 + debug/accuracy_tools/msprobe/core/data_dump/json_writer.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index 6ff6b77197..17af8d43bd 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -94,6 +94,7 @@ class BaseDataProcessor: def __init__(self, config, data_writer): self.data_writer = data_writer self.config = config + self.data_writer.config = config self.api_info_struct = {} self.stack_info_struct = {} self.current_api_or_module_name = None diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 6860a37fe1..90e202b86b 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -158,7 +158,10 @@ class DataWriter: length = len(dump_data) - threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size + if self.config.summary_mode == Const.MD5: + threshold = self.flush_size + else: + threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size if length % threshold == 0: self.write_json() -- Gitee From 38fa24ef97bfc633ae63b3b0ee4cf58deadda049 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Tue, 5 Aug 2025 15:43:01 +0800 Subject: [PATCH 12/13] Update json_writer.py --- debug/accuracy_tools/msprobe/core/data_dump/json_writer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py index 90e202b86b..1d0e356d5c 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/json_writer.py @@ -158,7 +158,12 @@ class DataWriter: length = len(dump_data) - if self.config.summary_mode == Const.MD5: + # 1) 先取到 config(如果没有,就拿 None) + cfg = getattr(self, "config", None) + # 2) 再取 summary_mode(如果 cfg 是 None 或者没 summary_mode,就拿 None) + summary_mode = getattr(cfg, "summary_mode", None) + + if summary_mode == Const.MD5: threshold = self.flush_size else: threshold = self.flush_size if length < self.larger_flush_size else self.larger_flush_size -- Gitee From 42d5d0b6e5ee197d39826ae0fa2e97a447ca95a0 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Tue, 5 Aug 2025 16:41:47 +0800 Subject: [PATCH 13/13] Update base.py --- .../msprobe/core/data_dump/data_processor/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py index 17af8d43bd..42c91c189d 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/base.py @@ -94,7 +94,8 @@ class BaseDataProcessor: def __init__(self, config, data_writer): self.data_writer = data_writer self.config = config - self.data_writer.config = config + if self.data_writer is not None: + self.data_writer.config = config self.api_info_struct = {} self.stack_info_struct = {} self.current_api_or_module_name = None -- Gitee