From c0b48888c3200f26c4e374dd23293407a464458e Mon Sep 17 00:00:00 2001 From: wangchao426 Date: Sat, 28 Jun 2025 09:55:33 +0800 Subject: [PATCH 1/7] =?UTF-8?q?=E3=80=90bugfix=E3=80=91=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E5=88=86=E6=9E=90=E6=95=B4=E7=BD=91=E9=A6=96=E6=BA=A2=E5=87=BA?= =?UTF-8?q?=E8=8A=82=E7=82=B9=E6=97=B6=E5=88=86=E7=BB=84=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/msprobe/nan_analyze/analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/nan_analyze/analyzer.py b/debug/accuracy_tools/msprobe/nan_analyze/analyzer.py index e147f23b7c7..a32ad7c351c 100644 --- a/debug/accuracy_tools/msprobe/nan_analyze/analyzer.py +++ b/debug/accuracy_tools/msprobe/nan_analyze/analyzer.py @@ -221,7 +221,7 @@ class NanAnalyzer: node = get_next_node(nodes) if not node: continue - if not groups or node.node_id in all_ids_in_groups: + if not groups or node.node_id not in all_ids_in_groups: new_group = find_all_members(node) groups.append(new_group) all_ids_in_groups.update(new_group) -- Gitee From ed66c5b84cc125f00d3505eb83bc309b4b219cc5 Mon Sep 17 00:00:00 2001 From: wangchao426 Date: Sat, 28 Jun 2025 11:04:25 +0800 Subject: [PATCH 2/7] =?UTF-8?q?=E6=97=A5=E5=BF=97=E8=A7=84=E8=8C=83?= =?UTF-8?q?=E6=80=A7=E4=BF=AE=E6=94=B9=EF=BC=9A=E5=A4=A7=E5=B0=8F=E5=86=99?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/msprobe/visualization/graph_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/visualization/graph_service.py b/debug/accuracy_tools/msprobe/visualization/graph_service.py index b14ccab0386..a9f7870beab 100644 --- a/debug/accuracy_tools/msprobe/visualization/graph_service.py +++ b/debug/accuracy_tools/msprobe/visualization/graph_service.py @@ -66,7 +66,7 @@ def _compare_graph_result(input_param, args): # 对两个数据进行构图 graph_n = _build_graph_info(input_param.get('npu_path'), args) graph_b = _build_graph_info(input_param.get('bench_path'), args) - logger.info('Model graphs built successfully, start Comparing graphs...') + logger.info('Model graphs built successfully, start comparing graphs...') # 基于graph、stack和data进行比较 graph_comparator = _compare_graph(graph_n, graph_b, input_param, args) # 增加micro step标记 -- Gitee From 891a8c6b6924f52deddfb0fe7f85c46fec0c0a4d Mon Sep 17 00:00:00 2001 From: wangchao426 Date: Mon, 30 Jun 2025 11:34:06 +0800 Subject: [PATCH 3/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=86=E6=9E=90?= =?UTF-8?q?=E9=A6=96=E5=BC=82=E5=B8=B8=E8=8A=82=E7=82=B9=E6=9E=84=E5=9B=BE?= =?UTF-8?q?=E6=97=B6layer=E9=94=99=E8=AF=AF=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/nan_analyze/graph.py | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/msprobe/nan_analyze/graph.py b/debug/accuracy_tools/msprobe/nan_analyze/graph.py index 13ec6144473..3e6e7d71e3a 100644 --- a/debug/accuracy_tools/msprobe/nan_analyze/graph.py +++ b/debug/accuracy_tools/msprobe/nan_analyze/graph.py @@ -99,13 +99,13 @@ class CommunicationNode: self.link_nodes = kwargs.get('link_nodes', {}) self.dst_nodes = kwargs.get('dst_nodes', {}) self.src_nodes = kwargs.get('src_nodes', {}) - self.next_nodes = kwargs.get('next_nodes', {}) + self.next_node = kwargs.get('next_node') self.compute_ops = kwargs.get('compute_ops', []) self.type = self._resolve_type() self.connected = False def add_next(self, node): - self.next_nodes[node.node_id] = node + self.next_node = node node.pre_node = self node.layer = self.layer + 1 node.data.layer = node.layer @@ -113,7 +113,9 @@ class CommunicationNode: def add_link(self, node): self.link_nodes[node.node_id] = node node.link_nodes[self.node_id] = self - node.layer = self.layer + layer = max(node.layer, self.layer) + self.update_layer(layer) + node.update_layer(layer) node.data.layer = node.layer self.connected = True node.connected = True @@ -121,14 +123,16 @@ class CommunicationNode: def add_dst(self, node): self.dst_nodes[node.node_id] = node node.src_nodes[self.node_id] = self - node.layer = self.layer + layer = max(node.layer, self.layer) + self.update_layer(layer) + node.update_layer(layer) node.data.layer = node.layer self.connected = True node.connected = True def delete(self): - for node in self.next_nodes.values(): - node.pre_node = None + if self.next_node: + self.next_node.pre_node = None for node in self.dst_nodes.values(): node.src_nodes.pop(self.node_id) for node in self.src_nodes.values(): @@ -138,9 +142,19 @@ class CommunicationNode: if self.pre_node: self.pre_node.next_nodes.pop(self.node_id) + def update_layer(self, layer): + if layer == self.layer: + return + self.layer = layer + next_node = self.next_node + while next_node: + layer += 1 + next_node.layer = layer + next_node = next_node.next_node + def has_nan_inf(self): return self.input_has_nan_inf() or check_item_anomaly(self.data.outputs) - + def input_has_nan_inf(self): return check_item_anomaly(self.data.input_args) or check_item_anomaly(self.data.input_kwargs) -- Gitee From ccec89c6bc17c98983f38413416c74260a949482 Mon Sep 17 00:00:00 2001 From: wangchao426 Date: Mon, 30 Jun 2025 15:51:55 +0800 Subject: [PATCH 4/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=86=E6=9E=90?= =?UTF-8?q?=E9=A6=96=E5=BC=82=E5=B8=B8=E8=8A=82=E7=82=B9=E6=9E=84=E5=9B=BE?= =?UTF-8?q?=E6=97=B6layer=E9=94=99=E8=AF=AF=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/msprobe/nan_analyze/graph.py | 12 ++++++++++-- .../test/nan_analyze_ut/test_nan_analyzer_graph.py | 4 ++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/msprobe/nan_analyze/graph.py b/debug/accuracy_tools/msprobe/nan_analyze/graph.py index 3e6e7d71e3a..979e9e296ae 100644 --- a/debug/accuracy_tools/msprobe/nan_analyze/graph.py +++ b/debug/accuracy_tools/msprobe/nan_analyze/graph.py @@ -16,8 +16,8 @@ from dataclasses import dataclass from msprobe.core.common.const import Const from msprobe.core.common.log import logger -from msprobe.nan_analyze.utils import FileCache, RankPath, is_ignore_op, check_item_anomaly, NanAnalyseConst from msprobe.core.common.exceptions import MsprobeException +from msprobe.nan_analyze.utils import FileCache, RankPath, is_ignore_op, check_item_anomaly, NanAnalyseConst @dataclass @@ -140,16 +140,24 @@ class CommunicationNode: for node in self.link_nodes.values(): node.link_nodes.pop(self.node_id) if self.pre_node: - self.pre_node.next_nodes.pop(self.node_id) + self.pre_node.next_node = None def update_layer(self, layer): if layer == self.layer: return + + def update_comm_layer(node): + nodes = set(node.src_nodes.values()) | set(node.dst_nodes.values()) | set(node.link_nodes.values()) + for comm_node in nodes: + comm_node.update_layer(layer) + self.layer = layer + update_comm_layer(self) next_node = self.next_node while next_node: layer += 1 next_node.layer = layer + update_comm_layer(next_node) next_node = next_node.next_node def has_nan_inf(self): diff --git a/debug/accuracy_tools/msprobe/test/nan_analyze_ut/test_nan_analyzer_graph.py b/debug/accuracy_tools/msprobe/test/nan_analyze_ut/test_nan_analyzer_graph.py index 9bd3777ab6e..ead3da12ff6 100644 --- a/debug/accuracy_tools/msprobe/test/nan_analyze_ut/test_nan_analyzer_graph.py +++ b/debug/accuracy_tools/msprobe/test/nan_analyze_ut/test_nan_analyzer_graph.py @@ -38,7 +38,7 @@ class TestCommunicationNode(unittest.TestCase): comm_node_0.add_next(comm_node_1) self.assertEqual(comm_node_0.layer + 1, comm_node_1.layer) self.assertTrue(comm_node_0 is comm_node_1.pre_node) - self.assertTrue(comm_node_1.node_id in comm_node_0.next_nodes) + self.assertTrue(comm_node_0.next_node is comm_node_1) def test_add_link(self): op_name = 'Distributed.all_gather.0.forward' @@ -68,7 +68,7 @@ class TestCommunicationNode(unittest.TestCase): comm_node_0.add_dst(comm_node_1) comm_node_0.delete() self.assertFalse(comm_node_1.src_nodes) - self.assertFalse(comm_node_2.next_nodes) + self.assertFalse(comm_node_2.next_node) def test_has_nan_inf(self): op_name = 'Distributed.broadcast.0.forward' -- Gitee From 9370e4a8e7b921607dee8ad42ee38401b3e81e7d Mon Sep 17 00:00:00 2001 From: wangchao426 Date: Mon, 30 Jun 2025 16:27:40 +0800 Subject: [PATCH 5/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=86=E6=9E=90?= =?UTF-8?q?=E9=A6=96=E5=BC=82=E5=B8=B8=E8=8A=82=E7=82=B9=E6=9E=84=E5=9B=BE?= =?UTF-8?q?=E6=97=B6layer=E9=94=99=E8=AF=AF=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/nan_analyze/graph.py | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/debug/accuracy_tools/msprobe/nan_analyze/graph.py b/debug/accuracy_tools/msprobe/nan_analyze/graph.py index 979e9e296ae..80c6481186e 100644 --- a/debug/accuracy_tools/msprobe/nan_analyze/graph.py +++ b/debug/accuracy_tools/msprobe/nan_analyze/graph.py @@ -104,6 +104,16 @@ class CommunicationNode: self.type = self._resolve_type() self.connected = False + def __hash__(self): + return self.node_id + + def __eq__(self, other): + if not other: + return False + if not isinstance(other, CommunicationNode): + return False + return self.node_id == other.node_id + def add_next(self, node): self.next_node = node node.pre_node = self @@ -142,23 +152,26 @@ class CommunicationNode: if self.pre_node: self.pre_node.next_node = None - def update_layer(self, layer): - if layer == self.layer: + def update_layer(self, top_layer): + if top_layer == self.layer: return - def update_comm_layer(node): - nodes = set(node.src_nodes.values()) | set(node.dst_nodes.values()) | set(node.link_nodes.values()) - for comm_node in nodes: - comm_node.update_layer(layer) + seen_node = set() + queue = [(self, top_layer)] + + def update_comm_layer(node, layer): + node.layer = layer + + while queue: + cur_node, cur_layer = queue.pop(0) + update_comm_layer(cur_node, cur_layer) + seen_node.add(cur_node) + nodes_in_same_layer = set(cur_node.src_nodes.values()) | set(cur_node.dst_nodes.values()) | \ + set(cur_node.link_nodes.values()) + queue.extend([(node, cur_layer) for node in nodes_in_same_layer.difference(seen_node)]) + if cur_node.next_node: + queue.append((cur_node.next_node, cur_layer + 1)) - self.layer = layer - update_comm_layer(self) - next_node = self.next_node - while next_node: - layer += 1 - next_node.layer = layer - update_comm_layer(next_node) - next_node = next_node.next_node def has_nan_inf(self): return self.input_has_nan_inf() or check_item_anomaly(self.data.outputs) -- Gitee From b3f1dfd8bf70988d1fce3f8b45eb0a9ec84bc109 Mon Sep 17 00:00:00 2001 From: wangchao426 Date: Mon, 30 Jun 2025 16:33:13 +0800 Subject: [PATCH 6/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=86=E6=9E=90?= =?UTF-8?q?=E9=A6=96=E5=BC=82=E5=B8=B8=E8=8A=82=E7=82=B9=E6=9E=84=E5=9B=BE?= =?UTF-8?q?=E6=97=B6layer=E9=94=99=E8=AF=AF=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/msprobe/nan_analyze/graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/nan_analyze/graph.py b/debug/accuracy_tools/msprobe/nan_analyze/graph.py index 80c6481186e..829fd7a7b29 100644 --- a/debug/accuracy_tools/msprobe/nan_analyze/graph.py +++ b/debug/accuracy_tools/msprobe/nan_analyze/graph.py @@ -105,7 +105,7 @@ class CommunicationNode: self.connected = False def __hash__(self): - return self.node_id + return hash(self.node_id) def __eq__(self, other): if not other: -- Gitee From 0f4c03b456486906ca11561937705bb8760e6e86 Mon Sep 17 00:00:00 2001 From: wangchao426 Date: Tue, 1 Jul 2025 15:27:23 +0800 Subject: [PATCH 7/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=86=E6=9E=90?= =?UTF-8?q?=E9=A6=96=E5=BC=82=E5=B8=B8=E8=8A=82=E7=82=B9=E6=9E=84=E5=9B=BE?= =?UTF-8?q?=E6=97=B6layer=E9=94=99=E8=AF=AF=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/nan_analyze/analyzer.py | 21 +++++++++- .../msprobe/nan_analyze/graph.py | 41 +------------------ 2 files changed, 21 insertions(+), 41 deletions(-) diff --git a/debug/accuracy_tools/msprobe/nan_analyze/analyzer.py b/debug/accuracy_tools/msprobe/nan_analyze/analyzer.py index a32ad7c351c..04f2b27f43b 100644 --- a/debug/accuracy_tools/msprobe/nan_analyze/analyzer.py +++ b/debug/accuracy_tools/msprobe/nan_analyze/analyzer.py @@ -83,6 +83,7 @@ class NanAnalyzer: logger.info('Start searching anomaly node during communication.') self._rank_comm_nodes_dict = {rank: self._analyze_comm_nodes(rank) for rank in self._paths} self._connect_comm_nodes() + self._verify_layers() self._pruning() self._search_first_anomaly() @@ -151,7 +152,6 @@ class NanAnalyzer: if search_node.type == NanAnalyseConst.DST: cur_node.add_dst(search_node) elif search_node.type == NanAnalyseConst.SRC: - search_node.layer = cur_node.layer search_node.add_dst(cur_node) else: cur_node.add_link(search_node) @@ -174,6 +174,25 @@ class NanAnalyzer: break return found + def _verify_layers(self): + nodes_queues = {rank: list(nodes.values()) for rank, nodes in self._rank_comm_nodes_dict.items()} + cur_layer = 0 + while any(nodes_queues.values()): + cur_batch = [nodes_queue.pop(0) for nodes_queue in nodes_queues.values() if nodes_queue] + for node in cur_batch: + relatives = {**node.src_nodes, **node.dst_nodes, **node.link_nodes} + max_layer = max([n.layer for n in relatives.values()] + [node.layer]) + if max_layer != cur_layer or set(relatives.keys()).difference({n.node_id for n in cur_batch}): + node.layer = max_layer + rank = int(node.node_id.split(Const.SEP, 1)[0]) + nodes_queues[rank].insert(0, node) + else: + node.layer = cur_layer + node.data.layer = cur_layer + if node.next_node: + node.next_node.layer = node.layer + 1 + cur_layer += 1 + def _pruning(self): deleted_node_id = [] for nodes in self._rank_comm_nodes_dict.values(): diff --git a/debug/accuracy_tools/msprobe/nan_analyze/graph.py b/debug/accuracy_tools/msprobe/nan_analyze/graph.py index 829fd7a7b29..862bc141cb9 100644 --- a/debug/accuracy_tools/msprobe/nan_analyze/graph.py +++ b/debug/accuracy_tools/msprobe/nan_analyze/graph.py @@ -96,24 +96,14 @@ class CommunicationNode: self.api = op_name_split[1] self.call_cnt = op_name_split[2] self.pre_node = kwargs.get('pre_node') + self.next_node = kwargs.get('next_node') self.link_nodes = kwargs.get('link_nodes', {}) self.dst_nodes = kwargs.get('dst_nodes', {}) self.src_nodes = kwargs.get('src_nodes', {}) - self.next_node = kwargs.get('next_node') self.compute_ops = kwargs.get('compute_ops', []) self.type = self._resolve_type() self.connected = False - def __hash__(self): - return hash(self.node_id) - - def __eq__(self, other): - if not other: - return False - if not isinstance(other, CommunicationNode): - return False - return self.node_id == other.node_id - def add_next(self, node): self.next_node = node node.pre_node = self @@ -123,20 +113,12 @@ class CommunicationNode: def add_link(self, node): self.link_nodes[node.node_id] = node node.link_nodes[self.node_id] = self - layer = max(node.layer, self.layer) - self.update_layer(layer) - node.update_layer(layer) - node.data.layer = node.layer self.connected = True node.connected = True def add_dst(self, node): self.dst_nodes[node.node_id] = node node.src_nodes[self.node_id] = self - layer = max(node.layer, self.layer) - self.update_layer(layer) - node.update_layer(layer) - node.data.layer = node.layer self.connected = True node.connected = True @@ -152,27 +134,6 @@ class CommunicationNode: if self.pre_node: self.pre_node.next_node = None - def update_layer(self, top_layer): - if top_layer == self.layer: - return - - seen_node = set() - queue = [(self, top_layer)] - - def update_comm_layer(node, layer): - node.layer = layer - - while queue: - cur_node, cur_layer = queue.pop(0) - update_comm_layer(cur_node, cur_layer) - seen_node.add(cur_node) - nodes_in_same_layer = set(cur_node.src_nodes.values()) | set(cur_node.dst_nodes.values()) | \ - set(cur_node.link_nodes.values()) - queue.extend([(node, cur_layer) for node in nodes_in_same_layer.difference(seen_node)]) - if cur_node.next_node: - queue.append((cur_node.next_node, cur_layer + 1)) - - def has_nan_inf(self): return self.input_has_nan_inf() or check_item_anomaly(self.data.outputs) -- Gitee