diff --git a/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4321793026bf41e9fe51b83d7ada66c1530beaec
--- /dev/null
+++ b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.analyzer.schedule.gc.gc_checker import GcChecker
+from profiler.advisor.display.html.render import HTMLRender
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+
+logger = logging.getLogger()
+
+
+class GcAnalyzer(BaseAnalyzer):
+    """Analyzer that detects abnormal Python garbage collection events in timeline data."""
+    dataset_cls_list = [TimelineEventDataset]
+
+    def __init__(self, collection_path, **kwargs):
+        super().__init__(collection_path, **kwargs)
+        self.result = OptimizeResult()
+        self.html_render = HTMLRender()
+        key = TimelineEventDataset.get_key()
+        self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key)
+
+    @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),))
+    def optimize(self, **kwargs):
+        """Run the GC check and fill the result/html report; kwargs may carry "rank_id" and "stage"."""
+        gc_checker = GcChecker()
+        gc_checker.check_gc(self.timeline_event_dataset, rank_id=kwargs.get("rank_id"), stage=kwargs.get("stage"))
+        gc_checker.make_record(self.result)
+        gc_checker.make_render(self.html_render)
+        return self.result
diff --git a/profiler/advisor/analyzer/schedule/gc/gc_checker.py b/profiler/advisor/analyzer/schedule/gc/gc_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3f713f8be369b4ed0996d359ebad91d1a9617bf
--- /dev/null
+++ b/profiler/advisor/analyzer/schedule/gc/gc_checker.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.cluster_analyse.common_func.file_manager import FileManager
+
+logger = logging.getLogger()
+
+
+class GcChecker:
+    """Checks timeline GC events against a duration threshold and reports abnormal ones."""
+    GC_TOPK_NUM = 10
+
+    def __init__(self):
+        self.stage = None
+        self.rank_id = None
+        self.optimization_item = []
+        self.gc_issues = False
+        self.desc = ""
+        self.suggestions = []
+        self.solutions = None
+        self.gc_threshold = 0
+        self.abnormal_gc_count = 0
+        self.abnormal_gc_duration = 0
+        self.abnormal_gc_list = []
+        self.headers = ["timestamp", "duration(us)"]
+        self._init_rule()
+
+    def check_gc(self, event_dataset: TimelineEventDataset, rank_id=None, stage=None):
+        """
+        Collect GC events whose duration exceeds the configured threshold.
+        :Param event_dataset: dataset of timeline event
+        :Param rank_id: rank id in cluster scenario, may legitimately be 0
+        :Param stage: stage name in cluster scenario
+        """
+        if not hasattr(event_dataset, "gc_events"):
+            logger.debug("Skip gc checker, because no gc event found")
+            return
+        self.rank_id = rank_id
+        self.stage = stage
+        for gc_event in event_dataset.gc_events:
+            if float(gc_event.dur) > self.gc_threshold:
+                self.gc_issues = True
+                self.abnormal_gc_count += 1
+                self.abnormal_gc_duration += float(gc_event.dur)
+                self.abnormal_gc_list.append([gc_event.ts, gc_event.dur])
+        # durations are accumulated in us; report the total in ms
+        self.abnormal_gc_duration = round(self.abnormal_gc_duration / 1000, 4)
+        self.abnormal_gc_list.sort(key=lambda x: x[1], reverse=True)
+        self.desc = self.desc.format(gc_count=self.abnormal_gc_count, gc_total_time=self.abnormal_gc_duration)
+
+    def make_record(self, result: OptimizeResult):
+        """
+        make record for what and how to optimize
+        """
+        if not self.gc_issues:
+            return
+
+        self.optimization_item.append(OptimizeItem("gc", self.desc, self.suggestions))
+        for optimization in self.optimization_item:
+            result.add(OptimizeRecord(optimization))
+        # rank_id may be 0, so compare with None instead of relying on truthiness
+        if self.rank_id is not None:
+            self.headers = ["Rank id"] + self.headers
+        sub_table_name = "GcAnalysis" if not self.stage else f"Stage-{self.stage}: GcAnalysis"
+        result.add_detail(sub_table_name, headers=self.headers)
+
+        for row in self.abnormal_gc_list:
+            if self.rank_id is not None:
+                row = [self.rank_id] + row
+            result.add_detail(sub_table_name, detail=row)
+
+    def make_render(self, html_render):
+        """Render the top GC_TOPK_NUM abnormal GC events into the html report."""
+        if not self.gc_issues:
+            return
+        html_render.render_template(key="schedule",
+                                    template_dir="templates",
+                                    template_name="gc.html",
+                                    desc=self.desc,
+                                    solutions=self.solutions,
+                                    headers=self.headers,
+                                    datas=self.abnormal_gc_list[:self.GC_TOPK_NUM],
+                                    num=self.GC_TOPK_NUM)
+
+    def _init_rule(self):
+        """Load gc threshold, problem description and solutions from rules/gc.yaml."""
+        gc_rule_path = os.path.join(
+            os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))),
+            "rules",
+            "gc.yaml"
+        )
+
+        gc_rule = FileManager.read_yaml_file(gc_rule_path)
+
+        self.gc_threshold = gc_rule.get("gc_threshold")
+        self.desc = gc_rule.get("problem")
+
+        self.solutions = gc_rule.get("solutions")
+        for solution in self.solutions:
+            for key, val in solution.items():
+                self.suggestions.append(f"{key}, {val.get('desc')}")
diff --git a/profiler/advisor/common/analyzer_scopes.py b/profiler/advisor/common/analyzer_scopes.py
index b947798c9e6bb708301d6c02c1df93e4665c4132..652e10b083730eb8c0bad5fc62d9e7247e92b4fe 100644
--- a/profiler/advisor/common/analyzer_scopes.py
+++ b/profiler/advisor/common/analyzer_scopes.py
@@ -33,3 +33,4 @@ class SupportedScopes:
SYNCBN = "syncbn"
SYNCHRONIZE_STREAM = "synchronize_stream"
FREQ_ANALYSIS = "freq_analysis"
+ GC_ANALYSIS = "gc_analysis"
diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py
index 2956e207500e866b20acf93553c1e1e0d8d5f5e8..5ed47a6c8246947c7c051bf2944bb68afbc4d88b 100644
--- a/profiler/advisor/dataset/timeline_event_dataset.py
+++ b/profiler/advisor/dataset/timeline_event_dataset.py
@@ -1,4 +1,3 @@
-import json
import logging
import os
from typing import List, Any
@@ -6,7 +5,6 @@ import traceback
import ijson
from tqdm import tqdm
-import yaml
from profiler.advisor.common import constant as const
from profiler.advisor.common.timeline.event import TimelineEvent
@@ -89,6 +87,7 @@ class TimelineEventDataset:
self._optimizer: List[Any] = []
self._dataloader: List[Any] = []
self._sync_batchnorm: List[Any] = []
+ self._gc: List[Any] = []
self._synchronize_stream = SynchronizeStreamCollector()
self.timeline_dir = collection_path
self.timeline_data_list = get_file_path_from_directory(collection_path,
@@ -151,6 +150,10 @@ class TimelineEventDataset:
def sync_batchnorm(self):
return self._sync_batchnorm
+ @property
+ def gc_events(self):
+ return self._gc
+
@property
def synchronize_stream(self):
return self._synchronize_stream
@@ -226,6 +229,10 @@ class TimelineEventDataset:
if event.name == const.OP_COMPILE_NAME or event.args.get("id") == const.OP_COMPILE_ID:
self._ops_compile.update(event)
+ def _add_gc(self, event: TimelineEvent):
+ if event.cat == "GC":
+ self._gc.append(event)
+
def _add_optimizer(self, event: TimelineEvent):
self._optimizer.append(TimelineEvent({"name": event.name, "dataset_index": event.dataset_index}))
@@ -260,6 +267,8 @@ class TimelineEventDataset:
self._add_dataloader(event)
# for analysis of syncBatchNorm operator, prompt users to replace source code of torch_npu's syncbn
self._add_sync_batchnorm(event)
+ # for analysis of GcAnalyzer
+ self._add_gc(event)
def _add_event(self, index, event):
event["dataset_index"] = index
diff --git a/profiler/advisor/display/html/templates/gc.html b/profiler/advisor/display/html/templates/gc.html
new file mode 100644
index 0000000000000000000000000000000000000000..e6357c92210012b58b2e6cf1f447a427543d545e
--- /dev/null
+++ b/profiler/advisor/display/html/templates/gc.html
@@ -0,0 +1,37 @@
+
+
+
+
+
{{ desc }}
+
+
+ Suggestions |
+
+ {% for item in solutions %}
+ {% set rowloop = loop %}
+ {% for key, value in item.items() %}
+
+ {{ rowloop.index }}. {{ value.desc }} |
+
+ {% endfor %}
+ {% endfor %}
+
+
The details of top {{ num }} garbage collection events are as follows:
+
+
+
+ {% for header in headers %}
+ {{ header }} |
+ {% endfor %}
+
+
+ {% for row in datas %}
+
+ {% for element in row %}
+ {{ element|safe }} |
+ {% endfor %}
+
+ {% endfor %}
+
+
+
diff --git a/profiler/advisor/interface/interface.py b/profiler/advisor/interface/interface.py
index 4908c275d05034666440e6c5e478dc8a68f1dad4..3fa10db64749681027b27d1d8a6f34d68d55e2f7 100644
--- a/profiler/advisor/interface/interface.py
+++ b/profiler/advisor/interface/interface.py
@@ -34,6 +34,7 @@ from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_an
from profiler.advisor.analyzer.dataloader.dataloader_analyzer import DataloaderAnalyzer
from profiler.advisor.analyzer.computation.ai_core_freq.ai_core_freq_analyzer import AICoreFreqAnalyzer
from profiler.advisor.analyzer.communication.packet_analyzer import PacketAnalyzer
+from profiler.advisor.analyzer.schedule.gc.gc_analyzer import GcAnalyzer
class Interface:
@@ -42,7 +43,8 @@ class Interface:
SupportedScopes.SYNCBN: SyncBNAnalyzer,
SupportedScopes.TIMELINE_OP_DISPATCH: OpDispatchAnalyzer,
SupportedScopes.SYNCHRONIZE_STREAM: SynchronizeStreamAnalyzer,
- SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer
+ SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer,
+ SupportedScopes.GC_ANALYSIS: GcAnalyzer
}),
"computation": OrderedDict({
SupportedScopes.DYNAMIC_SHAPE_ANALYSIS: DynamicShapeAnalyzer,
diff --git a/profiler/advisor/rules/gc.yaml b/profiler/advisor/rules/gc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b62063103b2f7a6aa0140a53b417435c68459d98
--- /dev/null
+++ b/profiler/advisor/rules/gc.yaml
@@ -0,0 +1,8 @@
+problem: "Abnormal garbage collection (GC) event is detected for {gc_count} times, and the total time is {gc_total_time} ms.\n
+The GC operation is time-consuming and blocks the entire process. As a result, some steps in the model training process take a longer time than other steps."
+gc_threshold: 1000 #us
+solutions:
+ - check circular references:
+ desc: "check whether circular references exist in the code. Eliminating circular references can prevent garbage collection."
+ - adjusting the GC threshold:
+ desc: "adjusting the garbage collection threshold can delay garbage collection, but this is a temporary solution."
diff --git a/profiler/advisor/utils/utils.py b/profiler/advisor/utils/utils.py
index 83f304c2d3c7d2e583b9c3979a4cf2c232020f55..2afd310a0982e41bc28442496ab24e3f84c53c3b 100644
--- a/profiler/advisor/utils/utils.py
+++ b/profiler/advisor/utils/utils.py
@@ -92,6 +92,12 @@ def singleton(cls):
_instance[cls][collection_path] = cls(*args, **kw)
return _instance[cls].get(collection_path)
+    def reset_all_instances():
+        """
+        Used by unit tests to clear all singleton instances so test cases do not interfere with each other.
+        """
+        _instance.clear()
+
# 保留原始类的属性和方法
_singleton.__name__ = cls.__name__
_singleton.__module__ = cls.__module__
@@ -110,6 +116,9 @@ def singleton(cls):
continue
setattr(_singleton, function_obj.__name__, function_obj)
+ _singleton.reset_all_instances = reset_all_instances
+ singleton.reset_all_instances = reset_all_instances
+
return _singleton
diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py
index f400a265b7bfcab1e5f19513a3eea43fea5250ce..677972084ba0a064cff83dd4c4d247324cf1c427 100644
--- a/profiler/cli/analyze_cli.py
+++ b/profiler/cli/analyze_cli.py
@@ -73,7 +73,7 @@ def analyze_cli(**kwargs):
type=click.Choice(constant.SUPPORTED_TORCH_VERSION, case_sensitive=False),
default=constant.DEFAULT_TORCH_VERSION,
help='The runtime torch version, which can be detected by exec command "pip show torch"')
-# @click.option('--is_inference', is_flag=True, help="Enable performance analysis of inference task")
+
@click.option("-pt",
"--profiling_type",
metavar="",
diff --git a/profiler/test/ut/advisor/schedule_advice/test_gc_advice.py b/profiler/test/ut/advisor/schedule_advice/test_gc_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..f18311ab130d72fef216768632d6db93ac5dbc8d
--- /dev/null
+++ b/profiler/test/ut/advisor/schedule_advice/test_gc_advice.py
@@ -0,0 +1,116 @@
+import os
+import shutil
+import stat
+import json
+
+import unittest
+from profiler.advisor.interface.interface import Interface
+from profiler.advisor.common.analyzer_scopes import SupportedScopes
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+
+
+class TestGcAdvice(unittest.TestCase):
+    TMP_DIR = "./ascend_pt"
+    OUTPUT_DIR = "./ascend_pt/ASCEND_PROFILER_OUTPUT"
+    interface = None
+
+    def tearDown(self):
+        if os.path.exists(TestGcAdvice.TMP_DIR):
+            shutil.rmtree(TestGcAdvice.TMP_DIR)
+        self.clear_htmls()
+        TimelineEventDataset.reset_all_instances()
+
+    def setUp(self):
+        if os.path.exists(TestGcAdvice.TMP_DIR):
+            shutil.rmtree(TestGcAdvice.TMP_DIR)
+        if not os.path.exists(TestGcAdvice.TMP_DIR):
+            os.makedirs(TestGcAdvice.TMP_DIR)
+        if not os.path.exists(TestGcAdvice.OUTPUT_DIR):
+            os.makedirs(TestGcAdvice.OUTPUT_DIR)
+        self.clear_htmls()
+
+    @classmethod
+    def clear_htmls(cls):
+        current_path = os.path.dirname(os.path.abspath(__file__))
+        for filename in os.listdir(current_path):
+            # remove html reports generated by the advisor (file names start with "mstt")
+            if filename.startswith("mstt"):
+                # build the full path of the file
+                file_path = os.path.join(current_path, filename)
+                # delete the file
+                os.remove(file_path)
+
+    @classmethod
+    def create_trace_view_with_gc_events(cls):
+        # Python pid
+        py_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 1, "args": {"name": "Python"}}
+        # Python GC pid
+        py_gc_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 2, "args": {"name": "Python GC"}}
+        # ascend pid
+        ascend_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 4, "args": {"name": "Ascend Hardware"}}
+        # cann pid
+        cann_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 5, "args": {"name": "CANN"}}
+        # ascend hardware ops
+        ah_event1 = {"ph": "X", "name": "Slice1", "ts": "1699529623106750", "dur": 100, "tid": 3, "pid": 4,
+                     "args": {"Task Type": "AI_CORE"}}
+        ah_event2 = {"ph": "X", "name": "Slice2", "ts": "1699529623106888", "dur": 80, "tid": 3, "pid": 4,
+                     "args": {"Task Type": "AI_CORE"}}
+        gc_event1 = {"ph": "X", "name": "GC", "ts": "1699529622103750", "dur": 1500, "tid": 3, "pid": 4, "cat": "GC",
+                     "args": {}}
+        gc_event2 = {"ph": "X", "name": "GC", "ts": "1699529623104750", "dur": 50, "tid": 3, "pid": 4, "cat": "GC",
+                     "args": {}}
+        gc_event3 = {"ph": "X", "name": "GC", "ts": "1699529623105750", "dur": 50000, "tid": 3, "pid": 4, "cat": "GC",
+                     "args": {}}
+        # flow event
+        flow_event_s = {"ph": "s", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "200", "args": {}}
+        flow_event_e = {"ph": "f", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "1699529623106750", "args": {}}
+
+        raw_data = [
+            py_pid_data, py_gc_data, ascend_pid_data, cann_pid_data, ah_event1, ah_event2, gc_event1, gc_event2,
+            gc_event3, flow_event_s, flow_event_e
+        ]
+        with os.fdopen(os.open(f"{TestGcAdvice.OUTPUT_DIR}/trace_view.json",
+                               os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp:
+            fp.write(json.dumps(raw_data))
+
+    @classmethod
+    def create_trace_view_without_gc_events(cls):
+        # Python pid
+        py_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 1, "args": {"name": "Python"}}
+        # ascend pid
+        ascend_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 4, "args": {"name": "Ascend Hardware"}}
+        # cann pid
+        cann_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 5, "args": {"name": "CANN"}}
+        # ascend hardware ops
+        ah_event1 = {"ph": "X", "name": "Slice1", "ts": "1699529623106750", "dur": 100, "tid": 3, "pid": 4,
+                     "args": {"Task Type": "AI_CORE"}}
+        ah_event2 = {"ph": "X", "name": "Slice2", "ts": "1699529623106888", "dur": 80, "tid": 3, "pid": 4,
+                     "args": {"Task Type": "AI_CORE"}}
+        # flow event
+        flow_event_s = {"ph": "s", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "200", "args": {}}
+        flow_event_e = {"ph": "f", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "1699529623106750", "args": {}}
+
+        raw_data = [
+            py_pid_data, ascend_pid_data, cann_pid_data, ah_event1, ah_event2, flow_event_s, flow_event_e
+        ]
+        with os.fdopen(os.open(f"{TestGcAdvice.OUTPUT_DIR}/trace_view.json",
+                               os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp:
+            fp.write(json.dumps(raw_data))
+
+    def test_run_should_run_success_when_trace_view_contain_gc_events(self):
+        self.create_trace_view_with_gc_events()
+        interface = Interface(profiling_path=self.TMP_DIR)
+        dimension = "schedule"
+        scope = SupportedScopes.GC_ANALYSIS
+        result = interface.get_result(dimension, scope, render_html=1, output_dict=False, profiling_path=self.TMP_DIR)
+        self.assertEqual(2, len(result.data.get("GcAnalysis", {}).get("data", [])))
+        result.clear()
+
+    def test_run_should_run_success_when_trace_view_not_contain_gc_events(self):
+        self.create_trace_view_without_gc_events()
+        interface = Interface(profiling_path=self.TMP_DIR)
+        dimension = "schedule"
+        scope = SupportedScopes.GC_ANALYSIS
+        result = interface.get_result(dimension, scope, render_html=1, output_dict=False, profiling_path=self.TMP_DIR)
+        self.assertEqual(0, len(result.data.get("GcAnalysis", [])))
+        result.clear()