diff --git a/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..4321793026bf41e9fe51b83d7ada66c1530beaec --- /dev/null +++ b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.schedule.gc.gc_checker import GcChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset + +logger = logging.getLogger() + + +class GcAnalyzer(BaseAnalyzer): + dataset_cls_list = [TimelineEventDataset] + + def __init__(self, collection_path, **kwargs): + super().__init__(collection_path, **kwargs) + self.result = OptimizeResult() + self.html_render = HTMLRender() + key = TimelineEventDataset.get_key() + self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) + + @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + def optimize(self, **kwargs): + gc_checker = GcChecker() + gc_checker.check_gc(self.timeline_event_dataset, rank_id=kwargs.get("rank_id"), stage=kwargs.get("stage")) + gc_checker.make_record(self.result) + 
gc_checker.make_render(self.html_render) + return self.result diff --git a/profiler/advisor/analyzer/schedule/gc/gc_checker.py b/profiler/advisor/analyzer/schedule/gc/gc_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..b3f713f8be369b4ed0996d359ebad91d1a9617bf --- /dev/null +++ b/profiler/advisor/analyzer/schedule/gc/gc_checker.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging
+import os
+
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.cluster_analyse.common_func.file_manager import FileManager
+
+logger = logging.getLogger()
+
+
+class GcChecker:
+    GC_TOPK_NUM = 10
+
+    def __init__(self):
+        self.stage = None
+        self.rank_id = None
+        self.optimization_item = []
+        self.gc_issues = False
+        self.desc = ""
+        self.suggestions = []
+        self.solutions = None
+        self.gc_threshold = 0
+        self.abnormal_gc_count = 0
+        self.abnormal_gc_duration = 0
+        self.abnormal_gc_list = []
+        self.headers = ["timestamp", "duration(us)"]
+        self._init_rule()
+
+    def check_gc(self, event_dataset: TimelineEventDataset, rank_id=None, stage=None):
+        """
+        :Param event_dataset: dataset of timeline event
+        """
+        if not hasattr(event_dataset, "gc_events"):
+            logger.debug("Skip gc checker, because no gc event found")
+            return
+        self.rank_id = rank_id
+        self.stage = stage
+        for gc_event in event_dataset.gc_events:
+            if float(gc_event.dur) > self.gc_threshold:
+                self.gc_issues = True
+                self.abnormal_gc_count += 1
+                self.abnormal_gc_duration += float(gc_event.dur)
+                self.abnormal_gc_list.append([gc_event.ts, gc_event.dur])
+        self.abnormal_gc_duration = round(self.abnormal_gc_duration / 1000, 4)
+        self.abnormal_gc_list.sort(key=lambda x: x[1], reverse=True)
+        self.desc = self.desc.format(gc_count=self.abnormal_gc_count, gc_total_time=self.abnormal_gc_duration)
+
+    def make_record(self, result: OptimizeResult):
+        """
+        make record for what and how to optimize
+        """
+        if not self.gc_issues:
+            return
+
+        self.optimization_item.append(OptimizeItem("gc", self.desc, self.suggestions))
+        for optimization in self.optimization_item:
+            result.add(OptimizeRecord(optimization))
+        if self.rank_id is not None:
+            self.headers = ["Rank id"] + self.headers
+        sub_table_name = "GcAnalysis" if not self.stage else f"Stage-{self.stage}: GcAnalysis"
+        result.add_detail(sub_table_name, headers=self.headers)
+
+        for row in self.abnormal_gc_list:
+            if self.rank_id is not None:
+                row = [self.rank_id] + row
+            result.add_detail(sub_table_name, detail=row)
+
+    def make_render(self, html_render):
+        if not self.gc_issues:
+            return
+        html_render.render_template(key="schedule",
+                                    template_dir="templates",
+                                    template_name="gc.html",
+                                    desc=self.desc,
+                                    solutions=self.solutions,
+                                    headers=self.headers,
+                                    datas=self.abnormal_gc_list[:self.GC_TOPK_NUM],
+                                    num=self.GC_TOPK_NUM)
+
+    def _init_rule(self):
+        gc_rule_path = os.path.join(
+            os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))),
+            "rules",
+            "gc.yaml"
+        )
+
+        gc_rule = FileManager.read_yaml_file(gc_rule_path)
+
+        self.gc_threshold = gc_rule.get("gc_threshold")
+        self.desc = gc_rule.get("problem")
+
+        self.solutions = gc_rule.get("solutions")
+        for solution in self.solutions:
+            for key, val in solution.items():
+                self.suggestions.append(f"{key}, {val.get('desc')}")
diff --git a/profiler/advisor/common/analyzer_scopes.py b/profiler/advisor/common/analyzer_scopes.py
index b947798c9e6bb708301d6c02c1df93e4665c4132..652e10b083730eb8c0bad5fc62d9e7247e92b4fe 100644
--- a/profiler/advisor/common/analyzer_scopes.py
+++ b/profiler/advisor/common/analyzer_scopes.py
@@ -33,3 +33,4 @@ class SupportedScopes:
     SYNCBN = "syncbn"
     SYNCHRONIZE_STREAM = "synchronize_stream"
     FREQ_ANALYSIS = "freq_analysis"
+    GC_ANALYSIS = "gc_analysis"
diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py
index 2956e207500e866b20acf93553c1e1e0d8d5f5e8..5ed47a6c8246947c7c051bf2944bb68afbc4d88b 100644
--- a/profiler/advisor/dataset/timeline_event_dataset.py
+++ b/profiler/advisor/dataset/timeline_event_dataset.py
@@ -1,4 +1,3 @@
-import json
 import logging
 import os
 from typing import List, Any
@@ -6,7 +5,6 @@
 import traceback
 import ijson
 from tqdm import
tqdm -import yaml from profiler.advisor.common import constant as const from profiler.advisor.common.timeline.event import TimelineEvent @@ -89,6 +87,7 @@ class TimelineEventDataset: self._optimizer: List[Any] = [] self._dataloader: List[Any] = [] self._sync_batchnorm: List[Any] = [] + self._gc: List[Any] = [] self._synchronize_stream = SynchronizeStreamCollector() self.timeline_dir = collection_path self.timeline_data_list = get_file_path_from_directory(collection_path, @@ -151,6 +150,10 @@ class TimelineEventDataset: def sync_batchnorm(self): return self._sync_batchnorm + @property + def gc_events(self): + return self._gc + @property def synchronize_stream(self): return self._synchronize_stream @@ -226,6 +229,10 @@ class TimelineEventDataset: if event.name == const.OP_COMPILE_NAME or event.args.get("id") == const.OP_COMPILE_ID: self._ops_compile.update(event) + def _add_gc(self, event: TimelineEvent): + if event.cat == "GC": + self._gc.append(event) + def _add_optimizer(self, event: TimelineEvent): self._optimizer.append(TimelineEvent({"name": event.name, "dataset_index": event.dataset_index})) @@ -260,6 +267,8 @@ class TimelineEventDataset: self._add_dataloader(event) # for analysis of syncBatchNorm operator, prompt users to replace source code of torch_npu's syncbn self._add_sync_batchnorm(event) + # for analysis of GcAnalyzer + self._add_gc(event) def _add_event(self, index, event): event["dataset_index"] = index diff --git a/profiler/advisor/display/html/templates/gc.html b/profiler/advisor/display/html/templates/gc.html new file mode 100644 index 0000000000000000000000000000000000000000..e6357c92210012b58b2e6cf1f447a427543d545e --- /dev/null +++ b/profiler/advisor/display/html/templates/gc.html @@ -0,0 +1,37 @@ + +
+

GC Analysis

+
+ {{ desc }} + + + + + {% for item in solutions %} + {% set rowloop = loop %} + {% for key, value in item.items() %} + + + + {% endfor %} + {% endfor %} +
Suggestions
{{ rowloop.index }}. {{ value.desc }}
+ The details of top {{ num }} garbage collection events are as follows: +

+ + + {% for header in headers %} + + {% endfor %} + + + {% for row in datas %} + + {% for element in row %} + + {% endfor %} + + {% endfor %} +
{{ header }}
{{ element|safe }}
+
+
diff --git a/profiler/advisor/interface/interface.py b/profiler/advisor/interface/interface.py index 4908c275d05034666440e6c5e478dc8a68f1dad4..3fa10db64749681027b27d1d8a6f34d68d55e2f7 100644 --- a/profiler/advisor/interface/interface.py +++ b/profiler/advisor/interface/interface.py @@ -34,6 +34,7 @@ from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_an from profiler.advisor.analyzer.dataloader.dataloader_analyzer import DataloaderAnalyzer from profiler.advisor.analyzer.computation.ai_core_freq.ai_core_freq_analyzer import AICoreFreqAnalyzer from profiler.advisor.analyzer.communication.packet_analyzer import PacketAnalyzer +from profiler.advisor.analyzer.schedule.gc.gc_analyzer import GcAnalyzer class Interface: @@ -42,7 +43,8 @@ class Interface: SupportedScopes.SYNCBN: SyncBNAnalyzer, SupportedScopes.TIMELINE_OP_DISPATCH: OpDispatchAnalyzer, SupportedScopes.SYNCHRONIZE_STREAM: SynchronizeStreamAnalyzer, - SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer + SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer, + SupportedScopes.GC_ANALYSIS: GcAnalyzer }), "computation": OrderedDict({ SupportedScopes.DYNAMIC_SHAPE_ANALYSIS: DynamicShapeAnalyzer, diff --git a/profiler/advisor/rules/gc.yaml b/profiler/advisor/rules/gc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b62063103b2f7a6aa0140a53b417435c68459d98 --- /dev/null +++ b/profiler/advisor/rules/gc.yaml @@ -0,0 +1,8 @@ +problem: "Abnormal garbage collection (GC) event is detected for {gc_count} times, and the total time is {gc_total_time} ms\n. +The GC operation is time-consuming and blocks the entire process. As a result, some steps in the model training process take a longer time than other steps." +gc_threshold: 1000 #us +solutions: + - check circular references: + desc: "check whether circular references exist in the code. Eliminating circular references can prevent garbage collection." 
+ - adjusting the GC threshold: + desc: "adjusting the garbage collection threshold can delay garbage collection, but this is a temporary solution." \ No newline at end of file diff --git a/profiler/advisor/utils/utils.py b/profiler/advisor/utils/utils.py index 83f304c2d3c7d2e583b9c3979a4cf2c232020f55..2afd310a0982e41bc28442496ab24e3f84c53c3b 100644 --- a/profiler/advisor/utils/utils.py +++ b/profiler/advisor/utils/utils.py @@ -92,6 +92,12 @@ def singleton(cls): _instance[cls][collection_path] = cls(*args, **kw) return _instance[cls].get(collection_path) + def reset_all_instances(): + """ + 用于ut使用,清空单例类,防止ut不同测试用例间相互干扰 + """ + _instance.clear() + # 保留原始类的属性和方法 _singleton.__name__ = cls.__name__ _singleton.__module__ = cls.__module__ @@ -110,6 +116,9 @@ def singleton(cls): continue setattr(_singleton, function_obj.__name__, function_obj) + _singleton.reset_all_instances = reset_all_instances + singleton.reset_all_instances = reset_all_instances + return _singleton diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py index f400a265b7bfcab1e5f19513a3eea43fea5250ce..677972084ba0a064cff83dd4c4d247324cf1c427 100644 --- a/profiler/cli/analyze_cli.py +++ b/profiler/cli/analyze_cli.py @@ -73,7 +73,7 @@ def analyze_cli(**kwargs): type=click.Choice(constant.SUPPORTED_TORCH_VERSION, case_sensitive=False), default=constant.DEFAULT_TORCH_VERSION, help='The runtime torch version, which can be detected by exec command "pip show torch"') -# @click.option('--is_inference', is_flag=True, help="Enable performance analysis of inference task") + @click.option("-pt", "--profiling_type", metavar="", diff --git a/profiler/test/ut/advisor/schedule_advice/test_gc_advice.py b/profiler/test/ut/advisor/schedule_advice/test_gc_advice.py new file mode 100644 index 0000000000000000000000000000000000000000..f18311ab130d72fef216768632d6db93ac5dbc8d --- /dev/null +++ b/profiler/test/ut/advisor/schedule_advice/test_gc_advice.py @@ -0,0 +1,116 @@ +import os +import shutil +import stat 
+import json
+
+import unittest
+from profiler.advisor.interface.interface import Interface
+from profiler.advisor.common.analyzer_scopes import SupportedScopes
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+
+
+class TestGcAdvice(unittest.TestCase):
+    TMP_DIR = "./ascend_pt"
+    OUTPUT_DIR = "./ascend_pt/ASCEND_PROFILER_OUTPUT"
+    interface = None
+
+    def tearDown(self):
+        if os.path.exists(TestGcAdvice.TMP_DIR):
+            shutil.rmtree(TestGcAdvice.TMP_DIR)
+        self.clear_htmls()
+        TimelineEventDataset.reset_all_instances()
+
+    def setUp(self):
+        if os.path.exists(TestGcAdvice.TMP_DIR):
+            shutil.rmtree(TestGcAdvice.TMP_DIR)
+        if not os.path.exists(TestGcAdvice.TMP_DIR):
+            os.makedirs(TestGcAdvice.TMP_DIR)
+        if not os.path.exists(TestGcAdvice.OUTPUT_DIR):
+            os.makedirs(TestGcAdvice.OUTPUT_DIR)
+        self.clear_htmls()
+
+    @classmethod
+    def clear_htmls(cls):
+        current_path = os.path.dirname(os.path.abspath(__file__))
+        for filename in os.listdir(current_path):
+            # check whether the file name starts with "mstt"
+            if filename.startswith("mstt"):
+                # build the absolute path of the file
+                file_path = os.path.join(current_path, filename)
+                # remove the file
+                os.remove(file_path)
+
+    @classmethod
+    def create_trace_view_with_gc_events(cls):
+        # Python pid
+        py_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 1, "args": {"name": "Python"}}
+        # Python GC pid
+        py_gc_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 2, "args": {"name": "Python GC"}}
+        # ascend pid
+        ascend_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 4, "args": {"name": "Ascend Hardware"}}
+        # ascend pid
+        cann_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 5, "args": {"name": "CANN"}}
+        # ascend hardware ops
+        ah_event1 = {"ph": "X", "name": "Slice1", "ts": "1699529623106750", "dur": 100, "tid": 3, "pid": 4,
+                     "args": {"Task Type": "AI_CORE"}}
+        ah_event2 = {"ph": "X", "name": "Slice2", "ts": "1699529623106888", "dur": 80, "tid": 3, "pid": 4,
+                     "args": {"Task Type": "AI_CORE"}}
+        gc_event1 = {"ph": "X", "name": "GC", "ts": "1699529622103750", "dur": 1500, "tid": 3, "pid": 4, "cat": "GC",
+                     "args": {}}
+        gc_event2 = {"ph": "X", "name": "GC", "ts": "1699529623104750", "dur": 50, "tid": 3, "pid": 4, "cat": "GC",
+                     "args": {}}
+        gc_event3 = {"ph": "X", "name": "GC", "ts": "1699529623105750", "dur": 50000, "tid": 3, "pid": 4, "cat": "GC",
+                     "args": {}}
+        # flow event
+        flow_event_s = {"ph": "s", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "200", "args": {}}
+        flow_event_e = {"ph": "f", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "1699529623106750", "args": {}}
+
+        raw_data = [
+            py_pid_data, py_gc_data, ascend_pid_data, cann_pid_data, ah_event1, ah_event2, gc_event1, gc_event2,
+            gc_event3, flow_event_s, flow_event_e
+        ]
+        with os.fdopen(os.open(f"{TestGcAdvice.OUTPUT_DIR}/trace_view.json",
+                               os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp:
+            fp.write(json.dumps(raw_data))
+
+    @classmethod
+    def create_trace_view_without_gc_events(cls):
+        # Python pid
+        py_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 1, "args": {"name": "Python"}}
+        # ascend pid
+        ascend_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 4, "args": {"name": "Ascend Hardware"}}
+        # ascend pid
+        cann_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 5, "args": {"name": "CANN"}}
+        # ascend hardware ops
+        ah_event1 = {"ph": "X", "name": "Slice1", "ts": "1699529623106750", "dur": 100, "tid": 3, "pid": 4,
+                     "args": {"Task Type": "AI_CORE"}}
+        ah_event2 = {"ph": "X", "name": "Slice2", "ts": "1699529623106888", "dur": 80, "tid": 3, "pid": 4,
+                     "args": {"Task Type": "AI_CORE"}}
+        # flow event
+        flow_event_s = {"ph": "s", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "200", "args": {}}
+        flow_event_e = {"ph": "f", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "1699529623106750", "args": {}}
+
+        raw_data = [
+            py_pid_data, ascend_pid_data, cann_pid_data, ah_event1, ah_event2, flow_event_s, flow_event_e
+        ]
+        with os.fdopen(os.open(f"{TestGcAdvice.OUTPUT_DIR}/trace_view.json",
+                               os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp:
+            fp.write(json.dumps(raw_data))
+
+    def test_run_should_run_success_when_trace_view_contain_gc_events(self):
+        self.create_trace_view_with_gc_events()
+        interface = Interface(profiling_path=self.TMP_DIR)
+        dimension = "schedule"
+        scope = SupportedScopes.GC_ANALYSIS
+        result = interface.get_result(dimension, scope, render_html=1, output_dict=False, profiling_path=self.TMP_DIR)
+        self.assertEqual(2, len(result.data.get("GcAnalysis", {}).get("data", [])))
+        result.clear()
+
+    def test_run_should_run_success_when_trace_view_not_contain_gc_events(self):
+        self.create_trace_view_without_gc_events()
+        interface = Interface(profiling_path=self.TMP_DIR)
+        dimension = "schedule"
+        scope = SupportedScopes.GC_ANALYSIS
+        result = interface.get_result(dimension, scope, render_html=1, output_dict=False, profiling_path=self.TMP_DIR)
+        self.assertEqual(0, len(result.data.get("GcAnalysis", [])))
+        result.clear()