From 9917e1d370f92d3636b900cd14e838643ace3a2f Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Wed, 27 Aug 2025 17:34:24 +0800 Subject: [PATCH 01/11] Update pytorch_processor.py --- .../data_processor/pytorch_processor.py | 63 +++++++++++++++++-- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 9e81ac5691..d89f8d81bc 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -136,6 +136,54 @@ class PytorchDataProcessor(BaseDataProcessor): crc32_hash = zlib.crc32(tensor_bytes) return f"{crc32_hash:08x}" + def _tensor_bytes_view_cpu(t: torch.Tensor, logger=None, tag: str = "crc"): + """ + 返回 t 在当前 dtype 下的原始字节视图(优先零拷贝)。 + 需保证:t 已在 CPU 且是 contiguous。 + 可能返回 memoryview 或 bytes(兜底拷贝),均可被 zlib.crc32 接受。 + """ + # 直接拿底层 storage(PyTorch 提供的无类型存储,单字节步长) + # nbytes = t.numel() * t.element_size() + # storage = t.untyped_storage() + # mv = memoryview(storage) # 这就是 u8 视图 + # return mv[:nbytes] # 截到有效字节长度 + + nbytes = t.numel() * t.element_size() + byte_offset = t.storage_offset() * t.element_size() + + if nbytes == 0: + # _log("empty_tensor", action="return_empty_memoryview") + return memoryview(b"") + + # A) 直接对 UntypedStorage 建立 memoryview + storage = t.untyped_storage() + + # C) ctypes 指针构造 memoryview(零拷贝 FFI) + import ctypes + try: + addr = storage.data_ptr() + byte_offset + buf = (ctypes.c_ubyte * nbytes).from_address(addr) + mv3 = memoryview(buf) + + return mv3 + except Exception as e3: + + pass + + def compute_crc32_from_tensor(t: torch.Tensor, *, logger=None) -> str: + """ + 直接对 Tensor 原始字节做 CRC32。 + : + - "raw": 保持 bfloat16 原始 16bit 字节(推荐,避免升精/增容) + """ + + # 取得字节视图(含多级回退),然后做 CRC + mv = PytorchDataProcessor._tensor_bytes_view_cpu(t) + + crc = zlib.crc32(mv) + + return f"{crc:08x}" + @staticmethod def analyze_device_in_kwargs(element): single_arg = {} @@ -299,14 +347,17 @@ class PytorchDataProcessor(BaseDataProcessor): if self.config.summary_mode == Const.MD5 and not self.config.async_dump: tensor_md5 = None if not self.tensor_handler.is_empty_data(tensor): - # 拷贝并搬到 CPU - if common_tensor.dtype == torch.bfloat16: - common_tensor = common_tensor.float() - tensor_bytes = common_tensor.cpu().detach().numpy() + + if common_tensor.device.type != "cpu": + t_cpu = common_tensor.to("cpu", non_blocking=False) + t_cpu = common_tensor.detach() + if not t_cpu.is_contiguous(): + t_cpu = t_cpu.contiguous() + future = self._crc_executor.submit( - PytorchDataProcessor.compute_crc32_bytes, - tensor_bytes + PytorchDataProcessor.compute_crc32_from_tensor, + t_cpu ) crc_placeholder = self.data_writer.append_crc32_to_buffer(future) -- Gitee From 22121b926794a6fb5323229bd7fe8617beb1f2a6 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 28 Aug 2025 16:20:27 +0800 Subject: [PATCH 02/11] 1 --- .../core/data_dump/data_processor/2.py | 138 ++++++++++++++++++ .../data_processor/pytorch_processor.py | 42 ++++-- 2 files changed, 164 insertions(+), 16 deletions(-) create mode 100644 debug/accuracy_tools/msprobe/core/data_dump/data_processor/2.py diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/2.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/2.py new file mode 100644 index 0000000000..37a24c1ad0 --- /dev/null +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/2.py @@ -0,0 +1,138 @@ +def _tensor_bytes_view_cpu(t: torch.Tensor, logger=None, tag: str = "crc"): + """ + 返回 t 在当前 dtype 下的原始字节视图(优先零拷贝)。 + 需保证:t 已在 CPU 且是 contiguous。 + 可能返回 memoryview 或 bytes(兜底拷贝),均可被 zlib.crc32 接受。 + """ + + # 直接拿底层 storage(PyTorch 提供的无类型存储,单字节步长) + # nbytes = t.numel() * t.element_size() + # storage = t.untyped_storage() + # mv = memoryview(storage) # 这就是 u8 视图 + # return mv[:nbytes] # 截到有效字节长度 + + # print("123124") + def _log(event: str, **kv): + msg = f"[{tag}] {_safe(event)} | " + " ".join(f"{k}={_safe(v)}" for k, v in kv.items()) + try: + if logger is not None: + logger.debug(msg) + else: + print(msg) + except Exception: + # 避免日志本身抛错影响主流程 + try: + print(msg) + except Exception: + pass + + def _safe(x): + try: + return str(x) + except Exception: + return f"" + + nbytes = t.numel() * t.element_size() + byte_offset = t.storage_offset() * t.element_size() + + # _log( + # "enter", + # dtype=t.dtype, + # device=t.device, + # shape=tuple(t.shape), + # contiguous=t.is_contiguous(), + # numel=t.numel(), + # elem_size=t.element_size(), + # nbytes=nbytes, + # storage_offset_bytes=byte_offset, + # ) + + if nbytes == 0: + # _log("empty_tensor", action="return_empty_memoryview") + return memoryview(b"") + + # A) 直接对 UntypedStorage 建立 memoryview + storage = t.untyped_storage() + # try: + # mv = memoryview(storage) # 有些发行版支持 + # mv_sliced = mv[byte_offset: byte_offset + nbytes] + # # _log( + # # "path_A_success", + # # mv_type=type(mv_sliced).__name__, + # # mv_len=len(mv_sliced), + # # head16=bytes(mv_sliced[:16]).hex() if len(mv_sliced) else "", + # # ) + # return mv_sliced + # except Exception as e1: + # pass + # # _log("path_A_failed", err=repr(e1)) + + # # B) 视作 uint8 再取 storage(依然零拷贝) + # try: + # t_u8 = t.view(torch.uint8) + # st_u8 = t_u8.untyped_storage() + # mv2 = memoryview(st_u8) # uint8 下 offset 单位就是字节 + # off_elems = byte_offset + # mv2_sliced = mv2[off_elems: off_elems + nbytes] + # # _log( + # # "path_B_success", + # # mv_type=type(mv2_sliced).__name__, + # # mv_len=len(mv2_sliced), + # # head16=bytes(mv2_sliced[:16]).hex() if len(mv2_sliced) else "", + # # ) + # return mv2_sliced + # except Exception as e2: + # # _log("path_B_failed", err=repr(e2)) + # pass + + # C) ctypes 指针构造 memoryview(零拷贝 FFI) + import ctypes + try: + addr = storage.data_ptr() + byte_offset + buf = (ctypes.c_ubyte * nbytes).from_address(addr) + mv3 = memoryview(buf) + # _log( + # "path_C_success", + # mv_type=type(mv3).__name__, + # mv_len=len(mv3), + # head16=bytes(mv3[:16]).hex() if len(mv3) else "", + # addr_hex=hex(addr), + # ) + return mv3 + except Exception as e3: + _log("path_C_failed", err=repr(e3)) + pass + # D) 兜底拷贝一份 bytes,确保不崩 + try: + data = ctypes.string_at(storage.data_ptr() + byte_offset, nbytes) + _log( + "path_D_copy_success", + bytes_len=len(data), + head16=data[:16].hex() if len(data) else "", + ) + return data # bytes 也可直接用于 zlib.crc32 + except Exception as e4: + _log("path_D_copy_failed", err=repr(e4)) + raise RuntimeError( + f"failed to obtain tensor bytes view; " + f" | C:{e3!r} | D:{e4!r}" + ) + + # E) 兜底拷贝一份 bytes,确保不崩 + try: + if t.dtype == torch.bfloat16: + t = t.float() + data = t.numpy() + # data = ctypes.string_at(storage.data_ptr() + byte_offset, nbytes) + # _log( + # "path_D_copy_success", + # bytes_len=len(data) + # "", + # ) + return data # bytes 也可直接用于 zlib.crc32 + except Exception as e5: + _log("path_E_copy_failed", err=repr(e5)) + raise RuntimeError( + f"failed to obtain tensor bytes view; " + f" | C:{e3!r} | D:{e4!r} | D:{e5!r}" + ) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index d89f8d81bc..7951dbd6f6 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -136,29 +136,23 @@ class PytorchDataProcessor(BaseDataProcessor): crc32_hash = zlib.crc32(tensor_bytes) return f"{crc32_hash:08x}" - def _tensor_bytes_view_cpu(t: torch.Tensor, logger=None, tag: str = "crc"): + @staticmethod + def tensor_bytes_view_cpu(t: torch.Tensor): """ 返回 t 在当前 dtype 下的原始字节视图(优先零拷贝)。 需保证:t 已在 CPU 且是 contiguous。 - 可能返回 memoryview 或 bytes(兜底拷贝),均可被 zlib.crc32 接受。 + 可能返回 memoryview 或 bytes(兜底拷贝)或者 转为numpy,均可被 zlib.crc32 接受。 """ - # 直接拿底层 storage(PyTorch 提供的无类型存储,单字节步长) - # nbytes = t.numel() * t.element_size() - # storage = t.untyped_storage() - # mv = memoryview(storage) # 这就是 u8 视图 - # return mv[:nbytes] # 截到有效字节长度 nbytes = t.numel() * t.element_size() byte_offset = t.storage_offset() * t.element_size() if nbytes == 0: - # _log("empty_tensor", action="return_empty_memoryview") return memoryview(b"") - # A) 直接对 UntypedStorage 建立 memoryview storage = t.untyped_storage() - # C) ctypes 指针构造 memoryview(零拷贝 FFI) + # ctypes 指针构造 memoryview(零拷贝 FFI) import ctypes try: addr = storage.data_ptr() + byte_offset @@ -166,11 +160,28 @@ class PytorchDataProcessor(BaseDataProcessor): mv3 = memoryview(buf) return mv3 - except Exception as e3: + except Exception as e1: + logger.warning(f"path_A_failed: {e1}.") + + try: + data = ctypes.string_at(storage.data_ptr() + byte_offset, nbytes) + + return data # bytes 也可直接用于 zlib.crc32 + except Exception as e2: + logger.warning(f"path_B_failed: {e2}.") - pass + try: + if t.dtype == torch.bfloat16: + t = t.float() + data = t.numpy() - def compute_crc32_from_tensor(t: torch.Tensor, *, logger=None) -> str: + return data + except Exception as e3: + logger.warning(f"path_C_failed: {e3}.") + return memoryview(b"") + + @staticmethod + def compute_crc32_from_tensor(t: torch.Tensor) -> str: """ 直接对 Tensor 原始字节做 CRC32。 : @@ -178,7 +189,7 @@ class PytorchDataProcessor(BaseDataProcessor): """ # 取得字节视图(含多级回退),然后做 CRC - mv = PytorchDataProcessor._tensor_bytes_view_cpu(t) + mv = PytorchDataProcessor.tensor_bytes_view_cpu(t) crc = zlib.crc32(mv) @@ -350,11 +361,10 @@ class PytorchDataProcessor(BaseDataProcessor): if common_tensor.device.type != "cpu": t_cpu = common_tensor.to("cpu", non_blocking=False) - t_cpu = common_tensor.detach() + t_cpu = t_cpu.detach() if not t_cpu.is_contiguous(): t_cpu = t_cpu.contiguous() - future = self._crc_executor.submit( PytorchDataProcessor.compute_crc32_from_tensor, t_cpu -- Gitee From 670d5a11e4ef210a15246a0da77c998fcb54f2ce Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 28 Aug 2025 16:37:59 +0800 Subject: [PATCH 03/11] Delete 2.py --- .../core/data_dump/data_processor/2.py | 138 ------------------ 1 file changed, 138 deletions(-) delete mode 100644 debug/accuracy_tools/msprobe/core/data_dump/data_processor/2.py diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/2.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/2.py deleted file mode 100644 index 37a24c1ad0..0000000000 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/2.py +++ /dev/null @@ -1,138 +0,0 @@ -def _tensor_bytes_view_cpu(t: torch.Tensor, logger=None, tag: str = "crc"): - """ - 返回 t 在当前 dtype 下的原始字节视图(优先零拷贝)。 - 需保证:t 已在 CPU 且是 contiguous。 - 可能返回 memoryview 或 bytes(兜底拷贝),均可被 zlib.crc32 接受。 - """ - - # 直接拿底层 storage(PyTorch 提供的无类型存储,单字节步长) - # nbytes = t.numel() * t.element_size() - # storage = t.untyped_storage() - # mv = memoryview(storage) # 这就是 u8 视图 - # return mv[:nbytes] # 截到有效字节长度 - - # print("123124") - def _log(event: str, **kv): - msg = f"[{tag}] {_safe(event)} | " + " ".join(f"{k}={_safe(v)}" for k, v in kv.items()) - try: - if logger is not None: - logger.debug(msg) - else: - print(msg) - except Exception: - # 避免日志本身抛错影响主流程 - try: - print(msg) - except Exception: - pass - - def _safe(x): - try: - return str(x) - except Exception: - return f"" - - nbytes = t.numel() * t.element_size() - byte_offset = t.storage_offset() * t.element_size() - - # _log( - # "enter", - # dtype=t.dtype, - # device=t.device, - # shape=tuple(t.shape), - # contiguous=t.is_contiguous(), - # numel=t.numel(), - # elem_size=t.element_size(), - # nbytes=nbytes, - # storage_offset_bytes=byte_offset, - # ) - - if nbytes == 0: - # _log("empty_tensor", action="return_empty_memoryview") - return memoryview(b"") - - # A) 直接对 UntypedStorage 建立 memoryview - storage = t.untyped_storage() - # try: - # mv = memoryview(storage) # 有些发行版支持 - # mv_sliced = mv[byte_offset: byte_offset + nbytes] - # # _log( - # # "path_A_success", - # # mv_type=type(mv_sliced).__name__, - # # mv_len=len(mv_sliced), - # # head16=bytes(mv_sliced[:16]).hex() if len(mv_sliced) else "", - # # ) - # return mv_sliced - # except Exception as e1: - # pass - # # _log("path_A_failed", err=repr(e1)) - - # # B) 视作 uint8 再取 storage(依然零拷贝) - # try: - # t_u8 = t.view(torch.uint8) - # st_u8 = t_u8.untyped_storage() - # mv2 = memoryview(st_u8) # uint8 下 offset 单位就是字节 - # off_elems = byte_offset - # mv2_sliced = mv2[off_elems: off_elems + nbytes] - # # _log( - # # "path_B_success", - # # mv_type=type(mv2_sliced).__name__, - # # mv_len=len(mv2_sliced), - # # head16=bytes(mv2_sliced[:16]).hex() if len(mv2_sliced) else "", - # # ) - # return mv2_sliced - # except Exception as e2: - # # _log("path_B_failed", err=repr(e2)) - # pass - - # C) ctypes 指针构造 memoryview(零拷贝 FFI) - import ctypes - try: - addr = storage.data_ptr() + byte_offset - buf = (ctypes.c_ubyte * nbytes).from_address(addr) - mv3 = memoryview(buf) - # _log( - # "path_C_success", - # mv_type=type(mv3).__name__, - # mv_len=len(mv3), - # head16=bytes(mv3[:16]).hex() if len(mv3) else "", - # addr_hex=hex(addr), - # ) - return mv3 - except Exception as e3: - _log("path_C_failed", err=repr(e3)) - pass - # D) 兜底拷贝一份 bytes,确保不崩 - try: - data = ctypes.string_at(storage.data_ptr() + byte_offset, nbytes) - _log( - "path_D_copy_success", - bytes_len=len(data), - head16=data[:16].hex() if len(data) else "", - ) - return data # bytes 也可直接用于 zlib.crc32 - except Exception as e4: - _log("path_D_copy_failed", err=repr(e4)) - raise RuntimeError( - f"failed to obtain tensor bytes view; " - f" | C:{e3!r} | D:{e4!r}" - ) - - # E) 兜底拷贝一份 bytes,确保不崩 - try: - if t.dtype == torch.bfloat16: - t = t.float() - data = t.numpy() - # data = ctypes.string_at(storage.data_ptr() + byte_offset, nbytes) - # _log( - # "path_D_copy_success", - # bytes_len=len(data) - # "", - # ) - return data # bytes 也可直接用于 zlib.crc32 - except Exception as e5: - _log("path_E_copy_failed", err=repr(e5)) - raise RuntimeError( - f"failed to obtain tensor bytes view; " - f" | C:{e3!r} | D:{e4!r} | D:{e5!r}" - ) -- Gitee From 90ad35f52348798a23f01076cce05afa50111c62 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 28 Aug 2025 16:41:56 +0800 Subject: [PATCH 04/11] Update pytorch_processor.py --- .../core/data_dump/data_processor/pytorch_processor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 7951dbd6f6..142036b9db 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -358,9 +358,9 @@ class PytorchDataProcessor(BaseDataProcessor): if self.config.summary_mode == Const.MD5 and not self.config.async_dump: tensor_md5 = None if not self.tensor_handler.is_empty_data(tensor): - - if common_tensor.device.type != "cpu": - t_cpu = common_tensor.to("cpu", non_blocking=False) + t_cpu = common_tensor + if t_cpu.device.type != "cpu": + t_cpu = t_cpu.to("cpu", non_blocking=False) t_cpu = t_cpu.detach() if not t_cpu.is_contiguous(): t_cpu = t_cpu.contiguous() -- Gitee From 1d5e1aaf4b8ea3f5792a7be37d91aeea67599802 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 28 Aug 2025 17:13:57 +0800 Subject: [PATCH 05/11] Update pytorch_processor.py --- .../core/data_dump/data_processor/pytorch_processor.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 142036b9db..52453da647 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -122,12 +122,6 @@ class PytorchDataProcessor(BaseDataProcessor): self.tensor_handler = TensorHandler() self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count() // 2) - - @staticmethod - def compute_crc32_bytes(tensor_bytes): - return f"{zlib.crc32(tensor_bytes):08x}" - - @staticmethod def get_md5_for_tensor(x): if x.dtype == torch.bfloat16: -- Gitee From c5810277f386a6fe1ab4529b7ed2b03a31ef1637 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 28 Aug 2025 19:20:00 +0800 Subject: [PATCH 06/11] Update pytorch_processor.py --- .../msprobe/core/data_dump/data_processor/pytorch_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 52453da647..c1d6b67c59 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -354,7 +354,7 @@ class PytorchDataProcessor(BaseDataProcessor): if not self.tensor_handler.is_empty_data(tensor): t_cpu = common_tensor if t_cpu.device.type != "cpu": - t_cpu = t_cpu.to("cpu", non_blocking=False) + t_cpu = t_cpu.to("cpu", non_blocking=True) t_cpu = t_cpu.detach() if not t_cpu.is_contiguous(): t_cpu = t_cpu.contiguous() -- Gitee From 9c647fcaae4d960ce71cc44e55264cb21dbcb118 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 28 Aug 2025 19:35:07 +0800 Subject: [PATCH 07/11] Update pytorch_processor.py --- .../msprobe/core/data_dump/data_processor/pytorch_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index c1d6b67c59..52453da647 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -354,7 +354,7 @@ class PytorchDataProcessor(BaseDataProcessor): if not self.tensor_handler.is_empty_data(tensor): t_cpu = common_tensor if t_cpu.device.type != "cpu": - t_cpu = t_cpu.to("cpu", non_blocking=True) + t_cpu = t_cpu.to("cpu", non_blocking=False) t_cpu = t_cpu.detach() if not t_cpu.is_contiguous(): t_cpu = t_cpu.contiguous() -- Gitee From c1a99d4268351d5a36bccb259f6eda5805858d64 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Thu, 28 Aug 2025 19:41:45 +0800 Subject: [PATCH 08/11] Update pytorch_processor.py --- .../msprobe/core/data_dump/data_processor/pytorch_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 52453da647..b86c928c3a 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -15,6 +15,7 @@ import os import zlib +import ctypes from collections.abc import Iterable from dataclasses import asdict from typing import List @@ -147,7 +148,6 @@ class PytorchDataProcessor(BaseDataProcessor): storage = t.untyped_storage() # ctypes 指针构造 memoryview(零拷贝 FFI) - import ctypes try: addr = storage.data_ptr() + byte_offset buf = (ctypes.c_ubyte * nbytes).from_address(addr) -- Gitee From 55ba9673b6952f4b45c4dff7461d53899eaea846 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Fri, 29 Aug 2025 09:25:52 +0800 Subject: [PATCH 09/11] Update pytorch_processor.py --- .../core/data_dump/data_processor/pytorch_processor.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index b86c928c3a..5e6a0273fa 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -123,13 +123,6 @@ class PytorchDataProcessor(BaseDataProcessor): self.tensor_handler = TensorHandler() self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count() // 2) - @staticmethod - def get_md5_for_tensor(x): - if x.dtype == torch.bfloat16: - x = x.float() - tensor_bytes = x.cpu().detach().numpy().tobytes() - crc32_hash = zlib.crc32(tensor_bytes) - return f"{crc32_hash:08x}" @staticmethod def tensor_bytes_view_cpu(t: torch.Tensor): -- Gitee From 392e44ac9cb35f9926be96f09470a991d008750d Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Fri, 29 Aug 2025 09:27:30 +0800 Subject: [PATCH 10/11] Update pytorch_processor.py --- .../core/data_dump/data_processor/pytorch_processor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 5e6a0273fa..b86c928c3a 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -123,6 +123,13 @@ class PytorchDataProcessor(BaseDataProcessor): self.tensor_handler = TensorHandler() self._crc_executor = ThreadPoolExecutor(max_workers=os.cpu_count() // 2) + @staticmethod + def get_md5_for_tensor(x): + if x.dtype == torch.bfloat16: + x = x.float() + tensor_bytes = x.cpu().detach().numpy().tobytes() + crc32_hash = zlib.crc32(tensor_bytes) + return f"{crc32_hash:08x}" @staticmethod def tensor_bytes_view_cpu(t: torch.Tensor): -- Gitee From 2bc6b401c3667176cd4671515ad22c22b9df4615 Mon Sep 17 00:00:00 2001 From: mnhdxnh Date: Fri, 29 Aug 2025 15:30:03 +0800 Subject: [PATCH 11/11] Update pytorch_processor.py --- .../data_dump/data_processor/pytorch_processor.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index b86c928c3a..9bef9ad2d8 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -353,8 +353,16 @@ class PytorchDataProcessor(BaseDataProcessor): tensor_md5 = None if not self.tensor_handler.is_empty_data(tensor): t_cpu = common_tensor - if t_cpu.device.type != "cpu": - t_cpu = t_cpu.to("cpu", non_blocking=False) + + # 根据设备类型做同步,确保数据已准备好 + if t_cpu.device.type == "cuda": + t_cpu = t_cpu.to("cpu", non_blocking=True) + torch.cuda.synchronize() + # 先异步搬运再进行同步可以显著提升性能 + elif t_cpu.device.type == "npu": + t_cpu = t_cpu.to("cpu", non_blocking=True) + torch.npu.synchronize() + t_cpu = t_cpu.detach() if not t_cpu.is_contiguous(): t_cpu = t_cpu.contiguous() -- Gitee