From 04619f471413cb7df07071894be8eb3ef120574c Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 25 Sep 2024 09:44:48 +0800 Subject: [PATCH 1/2] bug fix --- profiler/advisor/analyzer/analyzer_controller.py | 4 ++++ profiler/advisor/common/constant.py | 2 ++ profiler/advisor/interface/interface.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/profiler/advisor/analyzer/analyzer_controller.py b/profiler/advisor/analyzer/analyzer_controller.py index 3b617b0365..8f9e6ff720 100644 --- a/profiler/advisor/analyzer/analyzer_controller.py +++ b/profiler/advisor/analyzer/analyzer_controller.py @@ -18,6 +18,7 @@ from profiler.advisor.analyzer.cluster.slow_link_analyzer import SlowLinkAnalyze from profiler.advisor.analyzer.computation.pp_stage_computation_analyzer import PPStageComputationAnalyzer from profiler.advisor.analyzer.overall.overall_summary_analyzer import OverallSummaryAnalyzer from profiler.advisor.config.config import Config +from profiler.advisor.common import constant as const from profiler.advisor.common.analyzer_scopes import SupportedScopes from profiler.advisor.common.async_analysis_status import AsyncAnalysisStatus from profiler.advisor.utils.utils import Timer, safe_index_value, safe_division, safe_index @@ -470,6 +471,9 @@ class AnalyzerController: def _profiling_comparison(self, compare_profiling_list): job_list = [] + if not os.getenv(const.ENABLE_PROFILING_COMPARISON, False): + logger.info("Skip profiling comparison due to longer processing time, manually set env 'ENABLE_PROFILING_COMPARISON=True' to enable profiling comparison.") + return job_list for index, _kwargs in enumerate(compare_profiling_list): kwargs = copy.deepcopy(self.kwargs) diff --git a/profiler/advisor/common/constant.py b/profiler/advisor/common/constant.py index d63f053c40..b3b163991c 100644 --- a/profiler/advisor/common/constant.py +++ b/profiler/advisor/common/constant.py @@ -145,3 +145,5 @@ MAX_READ_DB_FILE_BYTES = 8 * 1024 * 1024 * 1024 WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP WRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC + +ENABLE_PROFILING_COMPARISON = "ENABLE_PROFILING_COMPARISON" \ No newline at end of file diff --git a/profiler/advisor/interface/interface.py b/profiler/advisor/interface/interface.py index 72c24d6c88..d6d42e5c49 100644 --- a/profiler/advisor/interface/interface.py +++ b/profiler/advisor/interface/interface.py @@ -15,6 +15,7 @@ import os from collections import OrderedDict import sys +import logging sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "cluster_analyse")) @@ -45,6 +46,8 @@ from profiler.advisor.analyzer.communication.contention.bandwidth_contention_ana from profiler.advisor.analyzer.schedule.gc.gc_analyzer import GcAnalyzer from profiler.advisor.analyzer.comparison.comparison_analyzer import ComparisonAnalyzer +logger = logging.getLogger() + class Interface: SCHEDULE = "schedule" -- Gitee From 7e00132532e68a4f21941395b1e03dafd383d8eb Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 26 Sep 2024 11:37:29 +0800 Subject: [PATCH 2/2] =?UTF-8?q?bug=20fix,=20=E4=BB=8Etimeline=E4=B8=AD?= =?UTF-8?q?=E6=A3=80=E6=B5=8Bgc,=20html=E4=B8=AD=E6=AF=8F=E9=A1=B9?= =?UTF-8?q?=E5=88=86=E6=9E=90=E6=8F=90=E7=A4=BA=E5=88=86=E6=9E=90=E7=9A=84?= =?UTF-8?q?=E5=8D=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../advisor/analyzer/analyzer_controller.py | 58 ++++++++--- .../analyzer/cluster/slow_link_analyzer.py | 10 +- .../analyzer/cluster/slow_rank_analyzer.py | 5 +- .../ai_core_freq/ai_core_freq_analyzer.py | 5 +- .../ai_core_freq/ai_core_freq_checker.py | 17 ++-- .../computation/aicpu/aicpu_checker.py | 3 +- .../computation/bound/block_dim_checker.py | 3 +- .../bound/operator_bound_checker.py | 3 +- .../op_compile/dynamic_shape_checker.py | 16 ++-- .../analyzer/computation/operator_checker.py | 10 +- .../pp_stage_computation_analyzer.py | 2 +- .../computation/profiling_analyzer.py | 10 +- .../dataloader/dataloader_analyzer.py | 2 +- .../analyzer/dataloader/dataloader_checker.py | 3 +- .../analyzer/memory/memory_analyzer.py | 2 +- .../advisor/analyzer/memory/memory_checker.py | 3 +- .../dispatch/timeline_op_dispatch_analyzer.py | 5 +- .../fusion_ops/fusion_ops_analyzer.py | 8 +- .../analyzer/schedule/gc/gc_analyzer.py | 4 +- .../analyzer/schedule/gc/gc_checker.py | 96 ++++++++++++++++--- .../schedule/syncbn/syncbn_analyzer.py | 2 +- .../schedule/syncbn/syncbn_checker.py | 4 +- .../synchronize_stream_analyzer.py | 12 +-- .../synchronize_stream_checker.py | 46 +++++++-- .../advisor/common/async_analysis_status.py | 4 +- profiler/advisor/common/constant.py | 4 +- .../advisor/dataset/timeline_event_dataset.py | 8 +- .../timeline_op_collector.py | 50 ++++++++++ .../display/html/templates/affinity_api.html | 3 + .../html/templates/ai_core_frequency.html | 3 + .../advisor/display/html/templates/gc.html | 5 + .../display/html/templates/memory.html | 3 + .../html/templates/operator_ai_cpu.html | 3 + .../html/templates/operator_block_dim.html | 3 + .../html/templates/operator_dispatch.html | 4 +- .../templates/operator_dynamic_shape.html | 3 + .../html/templates/operator_no_bound.html | 3 + .../pp_stage_computation_analysis.html | 2 +- .../html/templates/slow_dataloader.html | 3 + .../html/templates/sync_batchnorm.html | 3 + .../html/templates/synchronize_stream.html | 3 + profiler/advisor/rules/aicpu_rules.yaml | 8 +- .../rules/environment_variable_info.yaml | 4 +- profiler/advisor/rules/gc.yaml | 8 +- profiler/advisor/rules/synchronize.yaml | 5 +- profiler/advisor/utils/utils.py | 4 +- 46 files changed, 357 insertions(+), 108 deletions(-) diff --git a/profiler/advisor/analyzer/analyzer_controller.py b/profiler/advisor/analyzer/analyzer_controller.py index bac7ee3ef0..74b1ef7f30 100644 --- a/profiler/advisor/analyzer/analyzer_controller.py +++ b/profiler/advisor/analyzer/analyzer_controller.py @@ -142,7 +142,7 @@ class AnalyzerController: self._do_analysis(dimensions, pid=pid, async_resp=resp, **kwargs) except Exception as e: - self._update_analysis_process_resp(pid, resp, status_code=AsyncAnalysisStatus.FAILED_STATUS_CODE, + self._update_analysis_process_resp(pid, resp, status_code=AsyncAnalysisStatus.INNER_ERROR_STATUS_CODE, status=AsyncAnalysisStatus.FAILED, error_msg=str(e)) logger.error(e) raise RuntimeError(e) @@ -158,7 +158,24 @@ class AnalyzerController: return async_analysis_process def get_response_by_pid(self, pid): - return self.analysis_process_resp.get(pid) + def _is_pid_exists(pid): + try: + psutil.Process(pid) + return True + except psutil.NoSuchProcess: + return False + + pid_not_exist_response = dict(id=pid, status_code=AsyncAnalysisStatus.NOT_FOUND_STATUS_CODE, + status=AsyncAnalysisStatus.FAILED, + error_msg="The advisor task id does not exist") + if pid not in self.analysis_process_resp: + return pid_not_exist_response + + response = self.analysis_process_resp.get(pid) + if response.get("status") not in [AsyncAnalysisStatus.FAILED, + AsyncAnalysisStatus.SUCCESS] and not _is_pid_exists(pid): + return pid_not_exist_response + return response def single_rank_analysis(self, profiling_path, benchmark_profiling_path=None): job_list = [] @@ -224,7 +241,7 @@ class AnalyzerController: overall_analyzer.optimize() def schedule_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, benchmark_step=None, - **kwargs): + rank=None, **kwargs): # 任意单卡的下发分析 kwargs = copy.deepcopy(self.kwargs) @@ -234,6 +251,7 @@ class AnalyzerController: kwargs["benchmark_profiling_path"] = benchmark_profiling_path kwargs["step"] = step kwargs["benchmark_step"] = benchmark_step + kwargs["rank"] = rank for dimension in [Interface.SCHEDULE]: for scope in Interface.get_scope(dimension): @@ -242,7 +260,7 @@ class AnalyzerController: return job_list def computation_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, - benchmark_step=None, stage=None, **kwargs): + benchmark_step=None, stage=None, rank=None, **kwargs): # 任意单卡的计算分析 kwargs = copy.deepcopy(self.kwargs) @@ -251,6 +269,7 @@ class AnalyzerController: kwargs["step"] = step kwargs["benchmark_step"] = benchmark_step kwargs["stage"] = stage + kwargs["rank"] = rank job_list = [] for dimension in [Interface.COMPUTATION]: @@ -261,7 +280,7 @@ class AnalyzerController: job_list.append((dimension, scope, interface, kwargs)) return job_list - def memory_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, benchmark_step=None): + def memory_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, benchmark_step=None, rank=None): # 任意单卡的内存分析 kwargs = copy.deepcopy(self.kwargs) @@ -271,6 +290,7 @@ class AnalyzerController: kwargs["benchmark_profiling_path"] = benchmark_profiling_path kwargs["step"] = step kwargs["benchmark_step"] = benchmark_step + kwargs["rank"] = rank for dimension in [Interface.MEMORY]: for scope in Interface.get_scope(dimension): @@ -302,12 +322,18 @@ class AnalyzerController: job_list = [] global_step_rank = self.slow_rank_analyzer.get_global_step_rank(SlowRankAnalyzer.FREE) - slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") or self.default_rank_id + + slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") + if slow_rank_id is not None: + info_msg = f"Maximum free for rank {slow_rank_id}" + else: + slow_rank_id = self.default_rank_id + info_msg = f"No slow rank with free time, analysis for default rank {slow_rank_id}" + fast_rank_id = global_step_rank.get("minimum", {}).get("rank_id") or self.default_rank_id slow_step = global_step_rank.get("maximum", {}).get("step") fast_step = global_step_rank.get("minimum", {}).get("step") - info_msg = f"Maximum free for rank {slow_rank_id}" if slow_step: info_msg += f" and step {slow_step}" logger.info(info_msg) @@ -399,14 +425,14 @@ class AnalyzerController: if not self._check_profiling_path_valid(profiling_path): error_msg = f"Got invalid argument '-d/--profiling_path' {profiling_path}, skip analysis" self._update_analysis_process_resp(pid, async_resp, error_msg=error_msg, - status_code=AsyncAnalysisStatus.FAILED_STATUS_CODE, + status_code=AsyncAnalysisStatus.BAD_REQUEST_STATUS_CODE, status=AsyncAnalysisStatus.FAILED) logger.error(error_msg) return if benchmark_profiling_path and not self._check_profiling_path_valid(benchmark_profiling_path): error_msg = f"Got invalid argument '-bp/--benchmark_profiling_path' {benchmark_profiling_path}, skip analysis" self._update_analysis_process_resp(pid, async_resp, error_msg=error_msg, - status_code=AsyncAnalysisStatus.FAILED_STATUS_CODE, + status_code=AsyncAnalysisStatus.BAD_REQUEST_STATUS_CODE, status=AsyncAnalysisStatus.FAILED) logger.error(error_msg) return @@ -473,8 +499,10 @@ class AnalyzerController: def _profiling_comparison(self, compare_profiling_list): job_list = [] - if not os.getenv(const.ENABLE_PROFILING_COMPARISON, False): - logger.info("Skip profiling comparison due to longer processing time, manually set env 'ENABLE_PROFILING_COMPARISON=True' to enable profiling comparison.") + disable_profiling_comparison = os.getenv(const.DISABLE_PROFILING_COMPARISON) + if disable_profiling_comparison is not None and disable_profiling_comparison.lower()=="true": + logger.info( + "Skip profiling comparison due to longer processing time due to env 'DISABLE_PROFILING_COMPARISON'") return job_list for index, _kwargs in enumerate(compare_profiling_list): @@ -654,13 +682,17 @@ class AnalyzerController: # 不区分stage,对所有卡取Min max进行分析 logger.info("Without pipeline parallel stage, steps and ranks to be analyzed are %s", json.dumps(global_step_rank)) - slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") or self.default_rank_id + slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") + if slow_rank_id: + info_msg = f"Maximum computation time for rank {slow_rank_id}" + else: + slow_rank_id = self.default_rank_id + info_msg = f"No slow rank with computation time, analysis for default rank {slow_rank_id}" slow_step = global_step_rank.get("maximum", {}).get("step") # 如果没有标杆profiling数据的rank id,说明没有快慢卡问题,直接对默认rank id进行分析,因此这里取值为None fast_rank_id = global_step_rank.get("minimum", {}).get("rank_id") fast_step = global_step_rank.get("minimum", {}).get("step") - info_msg = f"Maximum computation time for rank {slow_rank_id}" if slow_step is not None: info_msg += f" and step {slow_step}, " if fast_rank_id is not None: diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyzer.py b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py index 9653a25c19..259e5eb0c4 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyzer.py +++ b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py @@ -22,7 +22,7 @@ from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset -from profiler.advisor.utils.utils import safe_index_value +from profiler.advisor.utils.utils import safe_index_value, convert_to_int logger = logging.getLogger() @@ -51,6 +51,7 @@ class SlowLinkAnalyzer(BaseAnalyzer): self.result = OptimizeResult() self.bottelneck = '' self.suggestion = '' + self.format_datas = {} if self.rank_bw_dict is not None: self.format_datas = self.format_details() @@ -104,7 +105,7 @@ class SlowLinkAnalyzer(BaseAnalyzer): data_list = [] for step_rank, rank_bw in self.rank_bw_dict.items(): - step_rank_list = list(map(int, step_rank.split(constant.STEP_RANK_SEP))) + step_rank_list = list(map(convert_to_int, step_rank.split(constant.STEP_RANK_SEP))) value_list = [rank_bw.get(i, 0) for i in headers] data_list.append(step_rank_list + value_list) data_list.sort(key=lambda x: (x[0], x[1])) # 按rank_id排序 @@ -147,6 +148,9 @@ class SlowLinkAnalyzer(BaseAnalyzer): def get_global_step_rank(self, bindwidth_type): global_step_rank = {} + if not self.format_datas: + return global_step_rank + bindwidth_key_map = {self.RDMA: self.RDMA_BANDWIDTH, self.SDMA: self.SDMA_BANDWIDTH} if bindwidth_type not in bindwidth_key_map: @@ -188,4 +192,4 @@ class SlowLinkAnalyzer(BaseAnalyzer): return global_step_rank def get_priority(self): - pass \ No newline at end of file + pass diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py b/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py index efe32184b1..bb3a8fdbd5 100644 --- a/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py @@ -20,8 +20,7 @@ from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.dataset.cluster.cluster_dataset import ClusterStepTraceTimeDataset -from profiler.advisor.utils.utils import safe_index_value -from profiler.advisor.utils.utils import safe_division +from profiler.advisor.utils.utils import safe_index_value, safe_division, convert_to_int logger = logging.getLogger() @@ -114,7 +113,7 @@ class SlowRankAnalyzer(BaseAnalyzer): data_list = [] for key, value in self.step_trace_dict.items(): step, rank_id = key.split(constant.STEP_RANK_SEP) - data_list.append([int(step), int(rank_id)] + value) + data_list.append([convert_to_int(step), convert_to_int(rank_id)] + value) if step and step not in self._steps: self._steps.add(step) diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py index 1c6aadb156..049952931a 100644 --- a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py @@ -33,9 +33,10 @@ class AICoreFreqAnalyzer(BaseAnalyzer): add_render_list = kwargs.get("add_render_list", True) ai_core_freq_checker = AICoreFreqChecker() - ai_core_freq_checker.check_ai_core_freq(self.dataset, rank_id=kwargs.get("rank"), stage=kwargs.get("stage")) + ai_core_freq_checker.check_ai_core_freq(self.dataset, rank=kwargs.get("rank"), stage=kwargs.get("stage")) ai_core_freq_checker.make_record(self.result) - self.html = ai_core_freq_checker.make_render(self.html_render, add_render_list, priority=self.get_priority()) + self.html = ai_core_freq_checker.make_render(self.html_render, add_render_list, priority=self.get_priority(), + rank=kwargs.get("rank")) return self.result def get_priority(self): diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py index 05c5cd25ee..2fd49a22a3 100644 --- a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py @@ -23,10 +23,10 @@ class AICoreFreqChecker: self.decrease_freq_ops = [] self.headers = [] self.op_freq = None - self.rank_id = None + self.rank = None self.stage = None - def check_ai_core_freq(self, event_dataset: ComputationAnalysisDataset, rank_id=None, stage=None): + def check_ai_core_freq(self, event_dataset: ComputationAnalysisDataset, rank=None, stage=None): """ :Param event_dataset: dataset of timeline event """ @@ -35,7 +35,7 @@ class AICoreFreqChecker: "because no ai core frequency were recorded in trace_view.json") return - self.rank_id = rank_id + self.rank = rank self.stage = stage self.op_freq = event_dataset.op_freq for op_name, op_info in self.op_freq.items(): @@ -67,8 +67,8 @@ class AICoreFreqChecker: self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") - if self.rank_id: - self.desc = f"For rank {self.rank_id}, " + self.desc.lower() + if self.rank: + self.desc = f"For rank {self.rank}, " + self.desc.lower() self.suggestions = "Please check the temperature or max power of your machine." def make_record(self, result: OptimizeResult): @@ -79,8 +79,8 @@ class AICoreFreqChecker: return self.ai_core_freq_issues sheet_name = "AI Core Frequency" - if self.rank_id is not None: - sheet_name = f"rank {self.rank_id} AI Core Frequency".capitalize() + if self.rank is not None: + sheet_name = f"rank {self.rank} AI Core Frequency".capitalize() optimization_item = OptimizeItem(sheet_name, self.desc, [self.suggestions]) result.add(OptimizeRecord(optimization_item)) @@ -108,4 +108,5 @@ class AICoreFreqChecker: headers=self.headers, data=self.decrease_freq_ops[:self.SHOW_TOPK_OPS], add_render_list=add_render_list, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) diff --git a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py index 394ad74fd7..0c1b454cc8 100644 --- a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py +++ b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py @@ -157,7 +157,8 @@ class AicpuChecker(OperatorChecker): format_result=self.format_operator_result(record, constant.OPERATOR_LIST_UNLIMIT), add_render_list=add_render_list, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) def format_operator_result(self, record, limit): """ diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index 8b8e3fa9f6..cb6a824cb7 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -62,7 +62,8 @@ class BlockDimChecker(OperatorChecker): format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK), add_render_list=add_render_list, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) def _check_operator(self, op_info) -> bool: if op_info.task_type not in ["AI_CORE", "AI_VECTOR_CORE", "MIX_AIC"]: diff --git a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py index 2096e9ffaf..cc4e6f135c 100644 --- a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py +++ b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py @@ -54,4 +54,5 @@ class OperatorBoundChecker(OperatorChecker): format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK), add_render_list=add_render_list, - priority_background_color=priority) \ No newline at end of file + priority_background_color=priority, + rank=kwargs.get("rank")) \ No newline at end of file diff --git a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py index 9cd75a6e93..639bc994ea 100644 --- a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py +++ b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py @@ -12,8 +12,9 @@ logger = logging.getLogger() class DynamicShapeChecker(OperatorChecker): - ENABLE_COMPILED_SUGGESTION = "Optimize by enabling compiled operator, such as:\n" \ - "`torch_npu.npu.set_compile_mode(jit_compile=False)`\n" + ENABLE_COMPILED_SUGGESTION = "Please place the following code at the entrance of the python script to disable jit compile. " \ + "Code: `torch_npu.npu.set_compile_mode(jit_compile=False); " \ + "torch_npu.npu.config.allow_internal_format = False`" _SUGGESTION: List[str] = [ENABLE_COMPILED_SUGGESTION] _CHECKER = "dynamic shape operator" _PROBLEM = "Dynamic shape operator" @@ -28,13 +29,13 @@ class DynamicShapeChecker(OperatorChecker): def check(self, profiling_database) -> bool: return self.is_dynamic_shape(profiling_database) - def make_record(self, profiling_database, rank_id=None) -> OptimizeRecord: + def make_record(self, profiling_database, rank=None) -> OptimizeRecord: """ make record for what and how to optimize """ - if rank_id is not None: - self._PROBLEM = f"rank {rank_id} ".capitalize() + self._PROBLEM.lower() + if rank is not None: + self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() optimization_item = OptimizeItem( self._PROBLEM, self._description, @@ -56,7 +57,7 @@ class DynamicShapeChecker(OperatorChecker): release_suggestion = copy.deepcopy(suggestion) if release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION: release_suggestion += \ - f"for details please refer to link : LINK" + f"for details please refer to link : LINK" release_suggestion_list.append(release_suggestion.replace('\n', '
')) format_result = {"record": record.__dict__, "suggestion": '
'.join(release_suggestion_list)} return format_result @@ -68,4 +69,5 @@ class DynamicShapeChecker(OperatorChecker): template_name="operator_dynamic_shape.html", format_result=self.format_operator_result(record), add_render_list=add_render_list, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) diff --git a/profiler/advisor/analyzer/computation/operator_checker.py b/profiler/advisor/analyzer/computation/operator_checker.py index 397b9d507e..02b3e17f55 100644 --- a/profiler/advisor/analyzer/computation/operator_checker.py +++ b/profiler/advisor/analyzer/computation/operator_checker.py @@ -95,15 +95,15 @@ class OperatorChecker(VersionControl): return True return False - def make_record(self, profiling_data: ProfilingDataset, rank_id=None): + def make_record(self, profiling_data: ProfilingDataset, rank=None): """ Make record for what and how to optimize :param profiling_data: profiling data :return: optimize record """ - if rank_id is not None: - self._PROBLEM = f"rank {rank_id} ".capitalize() + self._PROBLEM.lower() + if rank is not None: + self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() task_duration_list = [float(op_info.get_attr("task_duration")) for op_info in self._op_list if hasattr(op_info, "get_attr")] @@ -181,14 +181,14 @@ class OperatorChecker(VersionControl): release_suggestion = copy.deepcopy(suggestion) if release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION: release_suggestion += \ - (f"for details please refer to link : LINK") + (f"for details please refer to link : LINK") elif release_suggestion == OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION: release_suggestion += \ (f"\nThe config file for MSLite AOE usage is as follows:\n" \ f"[ascend_context]\n" \ f"aoe_mode=\"operator tuning\"\n" \ f"--tune_ops_file={Config().tune_ops_file}\n" - f"\nFor details please refer to link : LINK") + f"\nFor details please refer to link : LINK") release_suggestion_list.append(release_suggestion.replace('\n', '
')) format_result = {"record": record.__dict__, "suggestion": fill('
'.join(release_suggestion_list), width=200), diff --git a/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py b/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py index 64a6c2ceba..b84b983c3f 100644 --- a/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py +++ b/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py @@ -59,7 +59,7 @@ class PPStageComputationAnalyzer(BaseAnalyzer): pass def _optimize(self, profiling_path, **kwargs): - stage_html_record = dict(stage=kwargs.get("stage"), rank_id=kwargs.get("rank"), step=kwargs.get("step")) + stage_html_record = dict(stage=kwargs.get("stage"), rank=kwargs.get("rank"), step=kwargs.get("step")) kwargs["add_render_list"] = False # stage 并行分析时,避免调用本身,即SupportedScopes.STAGE_COMPUTE diff --git a/profiler/advisor/analyzer/computation/profiling_analyzer.py b/profiler/advisor/analyzer/computation/profiling_analyzer.py index 20ebf1da73..a3e1b36faf 100644 --- a/profiler/advisor/analyzer/computation/profiling_analyzer.py +++ b/profiler/advisor/analyzer/computation/profiling_analyzer.py @@ -34,7 +34,7 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): """ profiling_data = self.get_first_data_by_key(self.dataset_list, ProfilingDataset.get_key()) checker = self.checker - rank_id = kwargs.get("rank") + rank = kwargs.get("rank") add_render_list = kwargs.get("add_render_list", True) @@ -42,16 +42,16 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): return self.result if checker.check(profiling_data): # add record - record = checker.make_record(profiling_data, rank_id) + record = checker.make_record(profiling_data, rank) self.html = checker.make_render(self.html_render, record, add_render_list, - priority=self.get_priority(checker)) + priority=self.get_priority(checker), rank=kwargs.get("rank")) self.result.add(record) # add details details = checker.get_details() if details: for i, detail in enumerate(details): - sheet_name = checker.get_name() if rank_id is None else \ - f"rank {rank_id} ".capitalize() + checker.get_name() + sheet_name = checker.get_name() if rank is None else \ + f"rank {rank} ".capitalize() + checker.get_name() if i == 0: # the first row is header self.result.add_detail(sheet_name, headers=detail) diff --git a/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py index 3d1a537c21..debbaa9eef 100644 --- a/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py +++ b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py @@ -27,7 +27,7 @@ class DataloaderAnalyzer(BaseAnalyzer): dataloader_checker = DataloaderChecker() dataloader_checker.check_slow_dataloader(self.dataset) dataloader_checker.make_record(self.result) - dataloader_checker.make_render(self.html_render, priority=self.get_priority()) + dataloader_checker.make_render(self.html_render, priority=self.get_priority(), rank=kwargs.get("rank")) return self.result def get_priority(self): diff --git a/profiler/advisor/analyzer/dataloader/dataloader_checker.py b/profiler/advisor/analyzer/dataloader/dataloader_checker.py index f392a0838a..376729a1b6 100644 --- a/profiler/advisor/analyzer/dataloader/dataloader_checker.py +++ b/profiler/advisor/analyzer/dataloader/dataloader_checker.py @@ -62,7 +62,8 @@ class DataloaderChecker: template_name="slow_dataloader.html", desc=self.desc, suggestions=self.suggestions, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) def _init_rule(self): dataloader_rule_path = os.path.join( diff --git a/profiler/advisor/analyzer/memory/memory_analyzer.py b/profiler/advisor/analyzer/memory/memory_analyzer.py index 097b6e1794..5f34b03f2c 100644 --- a/profiler/advisor/analyzer/memory/memory_analyzer.py +++ b/profiler/advisor/analyzer/memory/memory_analyzer.py @@ -25,7 +25,7 @@ class MemoryAnalyzer(BaseAnalyzer): memory_checker = MemoryOpsChecker() memory_checker.check_memory_ops(self.dataset) memory_checker.make_record(self.result) - memory_checker.make_render(self.html_render, priority=self.get_priority(memory_checker.max_mem_op_dur)) + memory_checker.make_render(self.html_render, priority=self.get_priority(memory_checker.max_mem_op_dur), rank=kwargs.get("rank")) return self.result def get_priority(self, max_mem_op_dur): diff --git a/profiler/advisor/analyzer/memory/memory_checker.py b/profiler/advisor/analyzer/memory/memory_checker.py index b906ffbc89..b66761d7a4 100644 --- a/profiler/advisor/analyzer/memory/memory_checker.py +++ b/profiler/advisor/analyzer/memory/memory_checker.py @@ -73,4 +73,5 @@ class MemoryOpsChecker: template_name="memory.html", desc=self.desc, suggestions=self.suggestions, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) diff --git a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py index 58b2c301b5..126fe30176 100644 --- a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py +++ b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py @@ -51,7 +51,7 @@ class OpDispatchAnalyzer(BaseAnalyzer): """ self.get_op_compile_info(self.dataset) self.make_record(self.result) - self.make_render(self.html_render) + self.make_render(self.html_render, rank=kwargs.get('rank')) return self.result def get_op_compile_info(self, event_dataset: ScheduleAnalysisDataset): @@ -106,7 +106,8 @@ class OpDispatchAnalyzer(BaseAnalyzer): template_name="operator_dispatch.html", issues=issues, optimizers=optimizations, - priority_background_color=self.get_priority()) + priority_background_color=self.get_priority(), + rank=kwargs.get("rank")) def get_priority(self): step_duration = getattr(self.dataset, "step_duration", None) diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py index 97f1d052a1..b40e258818 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -45,7 +45,7 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): logger.info("Finish timeline analysis") self.make_record() - self.make_render() + self.make_render(rank=kwargs.get("rank")) return self.result def find_fusion_ops(self, event_dataset, ops: str, npu_api: str, mode: str): @@ -180,7 +180,8 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): detail = [api_name, *stack] self.result.add_detail(sheet_name, detail=detail) - def make_render(self): + def make_render(self, **kwargs): + rank = kwargs.get("rank") format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) self.html_render.render_template(key="schedule", @@ -192,7 +193,8 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): with_stack_doc_url=Config().timeline_with_stack_doc_url, api_doc_url=Config().timeline_api_doc_url, result=format_result_for_html, - priority_background_color=self.get_priority()) + priority_background_color=self.get_priority(), + rank=rank) def query_stack(self, event_dataset): if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]): diff --git a/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py index 3d142819db..a504a21c70 100644 --- a/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py +++ b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py @@ -36,9 +36,9 @@ class GcAnalyzer(BaseAnalyzer): @BaseAnalyzer.check_data((ScheduleAnalysisDataset.get_key(),)) def optimize(self, **kwargs): gc_checker = GcChecker() - gc_checker.check_gc(self.timeline_event_dataset, rank_id=kwargs.get("rank_id"), stage=kwargs.get("stage")) + gc_checker.check_gc(self.timeline_event_dataset, rank=kwargs.get("rank"), stage=kwargs.get("stage")) gc_checker.make_record(self.result) - gc_checker.make_render(self.html_render, priority=self.get_priority()) + gc_checker.make_render(self.html_render, priority=self.get_priority(), rank=kwargs.get("rank")) return self.result def get_priority(self): diff --git a/profiler/advisor/analyzer/schedule/gc/gc_checker.py b/profiler/advisor/analyzer/schedule/gc/gc_checker.py index 1fbddf6557..37a225ef8e 100644 --- a/profiler/advisor/analyzer/schedule/gc/gc_checker.py +++ b/profiler/advisor/analyzer/schedule/gc/gc_checker.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +import math import os from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.utils.utils import convert_to_float, convert_to_int, safe_division +from profiler.advisor.common import constant as const from profiler.cluster_analyse.common_func.file_manager import FileManager -from profiler.advisor.utils.utils import convert_to_float, convert_to_int logger = logging.getLogger() @@ -28,9 +30,11 @@ class GcChecker: def __init__(self): self.stage = None - self.rank_id = None + self.rank = None self.optimization_item = [] self.gc_issues = False + self.gc_problem_with_count = "" + self.gc_problem_with_free = "" self.desc = "" self.suggestions = [] self.solutions = None @@ -42,15 +46,29 @@ class GcChecker: self.headers = ["timestamp", "duration(us)"] self._init_rule() - def check_gc(self, event_dataset: ScheduleAnalysisDataset, rank_id=None, stage=None): + def check_gc(self, event_dataset: ScheduleAnalysisDataset, rank=None, stage=None): """ :Param event_dataset: dataset of timeline event """ if not hasattr(event_dataset, "gc_events"): logger.debug("Skip gc checker, because no gc event found") return - self.rank_id = rank_id + self.rank = rank self.stage = stage + + # 当用户cann和pta版本不支持采集gc信息时,通过timeline中的free和cann层acl事件 综合判断是否可能存在free + if not event_dataset.gc_events: + acl_events = getattr(event_dataset, "acl_events", []) + large_free_events = getattr(event_dataset, "large_free_events", []) + # 如果acl_events为空,则没有采集cann信息,不基于free+acl events进行gc分析 + if acl_events and large_free_events: + free_event = self.get_free_events_include_gc(large_free_events, acl_events) + if not free_event: + return + self.desc = self.gc_problem_with_free.format(free_duration_time=free_event.dur) + + return + for gc_event in event_dataset.gc_events: if convert_to_float(gc_event.dur) >= self.gc_threshold: self.gc_issues = True @@ -59,7 +77,8 @@ class GcChecker: self.abnormal_gc_list.append([gc_event.ts, gc_event.dur]) self.abnormal_gc_duration = round(self.abnormal_gc_duration / 1000, 4) self.abnormal_gc_list.sort(key=lambda x: x[1], reverse=True) - self.desc = self.desc.format(gc_count=self.abnormal_gc_count, gc_total_time=self.abnormal_gc_duration) + self.desc = self.gc_problem_with_count.format(gc_count=self.abnormal_gc_count, + gc_total_time=self.abnormal_gc_duration) def make_record(self, result: OptimizeResult): """ @@ -68,23 +87,24 @@ class GcChecker: if not self.gc_issues: return - self.optimization_item.append(OptimizeItem("gc", self.desc, self.suggestions)) + self.optimization_item.append(OptimizeItem("GC", self.desc, self.suggestions)) for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) - if self.rank_id is not None: + if self.rank is not None: self.headers = ["Rank id"] + self.headers sub_table_name = "GcAnalysis" if not self.stage else f"Stage-{self.stage}: GcAnalysis" result.add_detail(sub_table_name, headers=self.headers) for row in self.abnormal_gc_list: - if self.rank_id is not None: - row = [self.rank_id] + row + if self.rank is not None: + row = [self.rank] + row result.add_detail(sub_table_name, detail=row) def make_render(self, html_render, **kwargs): if not self.gc_issues: return priority = kwargs.get("priority") + rank = kwargs.get("rank") show_num = min(self.gc_topk_num, self.abnormal_gc_count) html_render.render_template(key="schedule", template_dir="templates", @@ -94,7 +114,8 @@ class GcChecker: headers=self.headers, datas=self.abnormal_gc_list[:show_num], num=show_num, - priority_background_color=priority) + priority_background_color=priority, + rank=rank) def _init_rule(self): gc_rule_path = os.path.join( @@ -107,9 +128,62 @@ class GcChecker: self.gc_threshold = convert_to_float(gc_rule.get("gc_threshold", 0)) self.gc_topk_num = convert_to_int(gc_rule.get("top_num", 0)) - self.desc = gc_rule.get("problem", "") + self.gc_problem_with_count = gc_rule.get("gc_problem_with_count", "") + self.gc_problem_with_free = gc_rule.get("gc_problem_with_free", "") + self.max_acl_event_num_ratio = convert_to_float(gc_rule.get("max_acl_event_num_ratio")) + self.max_acl_event_time_ratio = convert_to_float(gc_rule.get("max_acl_event_time_ratio")) self.solutions = gc_rule.get("solutions", []) for solution in self.solutions: for key, val in solution.items(): self.suggestions.append(f"{key}, {val.get('desc')}") + + def get_free_events_include_gc(self, large_free_events, acl_events): + free_event_index, acl_event_index = 0, 0 + free_include_acl_events = {} + + while free_event_index < len(large_free_events) and acl_event_index < len(acl_events): + free_event = large_free_events[free_event_index] + free_event_name = f"{const.FREE}-{free_event_index}" + free_event_start_time = convert_to_float(free_event.ts) + free_event_end_time = free_event_start_time + convert_to_float(free_event.dur) + + while acl_event_index < len(acl_events): + acl_event = acl_events[acl_event_index] + acl_event_index += 1 + acl_event_start_time = convert_to_float(acl_event.ts) + acl_event_end_time = acl_event_start_time + convert_to_float(acl_event.dur) + + if acl_event_start_time < free_event_start_time: + continue + if acl_event_end_time > free_event_end_time: + break + + if free_event_name not in free_include_acl_events: + free_include_acl_events[free_event_name] = {} + + if "acl_event_count" not in free_include_acl_events[free_event_name]: + free_include_acl_events[free_event_name]["acl_event_count"] = 0.0 + free_include_acl_events[free_event_name]["acl_event_count"] += 1 + + if "acl_event_dur" not in free_include_acl_events[free_event_name]: + free_include_acl_events[free_event_name]["acl_event_dur"] = 0.0 + free_include_acl_events[free_event_name]["acl_event_dur"] += convert_to_float(acl_event.dur) + + free_event_index += 1 + + # 按free持续时间降序排列,优先判断持续时间最长的free + event_indexs = range(len(large_free_events)) + for index, free_event in sorted(zip(event_indexs, large_free_events), key=lambda x: x[1].dur, reverse=True): + + + free_event_name = f"{const.FREE}-{index}" + free_duration = convert_to_float(free_event.dur) + acl_event_dur = free_include_acl_events.get(free_event_name, {}).get("acl_event_dur", 0.0) + acl_event_count = free_include_acl_events.get(free_event_name, {}).get("acl_event_count", 0.0) + + if safe_division(acl_event_dur, free_duration) < self.max_acl_event_time_ratio and safe_division( + acl_event_count, free_duration) < self.max_acl_event_num_ratio: + self.gc_issues = True + return free_event + return None diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py index df8c22fa51..b123bc3cca 100644 --- a/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py @@ -25,7 +25,7 @@ class SyncBNAnalyzer(BaseAnalyzer): syncbn_checker = SyncBNChecker() syncbn_checker.check_syncbn(self.timeline_event_dataset) syncbn_checker.make_record(self.result) - syncbn_checker.make_render(self.html_render, priority=self.get_priority()) + syncbn_checker.make_render(self.html_render, priority=self.get_priority(), rank=kwargs.get("rank")) return self.result def get_priority(self): diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py index e83a154918..04556ee743 100644 --- a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py @@ -48,12 +48,14 @@ class SyncBNChecker: return priority = kwargs.get("priority") + rank = kwargs.get("rank") html_render.render_template(key="schedule", template_dir="templates", template_name="sync_batchnorm.html", desc=self.desc, solutions=self.solutions, - priority_background_color=priority) + priority_background_color=priority, + rank=rank) def _init_rule(self): syncbn_rule_path = os.path.join( diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py index 61ec7d1fa6..965c2bcf3a 100644 --- a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py @@ -1,11 +1,8 @@ import logging -from typing import List, Dict, Any - from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.result.result import OptimizeResult from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_checker import SynchronizeStreamChecker -from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset @@ -25,13 +22,12 @@ class SynchronizeStreamAnalyzer(BaseAnalyzer): @BaseAnalyzer.check_data((ScheduleAnalysisDataset.get_key(),)) def optimize(self, **kwargs): - synchronize_stream_checker = SynchronizeStreamChecker() synchronize_stream_checker.check_synchronize(self.timeline_event_dataset, kwargs.get("profiling_with_stack")) synchronize_stream_checker.make_record(self.result) - synchronize_stream_checker.make_render(self.html_render, priority=self.get_priority()) + synchronize_stream_checker.make_render(self.html_render, priority=self.get_priority(synchronize_stream_checker), + rank=kwargs.get("rank")) return self.result - - def get_priority(self): - return PriorityBackgroundColor.low \ No newline at end of file + def get_priority(self, synchronize_stream_checker): + return synchronize_stream_checker.priority diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py index 9136d611db..9f25c0c1a7 100644 --- a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py @@ -1,12 +1,13 @@ import logging +from profiler.advisor.analyzer.schedule.timeline_base_checker import TimelineBaseChecker from profiler.advisor.common import constant as const from profiler.advisor.config.config import Config from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.advisor.analyzer.schedule.timeline_base_checker import TimelineBaseChecker -from profiler.advisor.utils.utils import format_timeline_result +from profiler.advisor.utils.utils import format_timeline_result, safe_division logger = logging.getLogger() @@ -20,23 +21,37 @@ class SynchronizeStreamChecker(TimelineBaseChecker): self.desc = "" self.suggestions = [] self.solutions = [] - self.max_synchronize_num = None + self.max_synchronize_num = 0 + self.max_synchronize_num_ratio = 0 + self.step_synchronize_num = 0 + self.step_synchronize_num_ratio = 0 + self.priority = None def check_synchronize(self, event_dataset: ScheduleAnalysisDataset, profiling_with_stack=None): """ :Param event_dataset: dataset of timeline event """ if not hasattr(event_dataset, "synchronize_stream") or not getattr(event_dataset, "synchronize_stream"): - logger.debug("Skip synchronize stream checker, because no synchronize stream found") + logger.info("Skip synchronize stream checker, because no synchronize stream found") return - synchronize_num = event_dataset.synchronize_stream.total_count + self.step_synchronize_num = event_dataset.synchronize_stream.total_count + self._cal_synchronize_stream_num_ratio(event_dataset) + slow_synchronize_stream = event_dataset.synchronize_stream.slow_synchronize_stream total_slow_synchronize_time = sum((float(sync_stream.dur) for sync_stream in slow_synchronize_stream)) synchronize_stream_rule = event_dataset.synchronize_stream.rule self.max_synchronize_num = synchronize_stream_rule.get("max_synchronize_num") - self.synchronize_issues = synchronize_num >= self.max_synchronize_num and len(slow_synchronize_stream) > 0 + self.max_synchronize_num_ratio = synchronize_stream_rule.get("max_synchronize_num_ratio") + + is_reach_max_ratio_limit = self.step_synchronize_num_ratio >= self.max_synchronize_num_ratio + is_reach_max_num_limit = self.step_synchronize_num >= self.max_synchronize_num + is_reach_max_slow_num_limit = len(slow_synchronize_stream) > 0 + + self.priority = self.get_priority(is_reach_max_ratio_limit, is_reach_max_num_limit, is_reach_max_slow_num_limit) + self.synchronize_issues = is_reach_max_ratio_limit or is_reach_max_num_limit or is_reach_max_slow_num_limit + if not self.synchronize_issues: return @@ -47,7 +62,8 @@ class SynchronizeStreamChecker(TimelineBaseChecker): self.query_stack(event_dataset, profiling_with_stack) self.desc = synchronize_stream_rule.get("problem") - self.desc = self.desc.format(synchronize_num=synchronize_num, + self.desc = self.desc.format(synchronize_num=self.step_synchronize_num, + synchronize_aten_ratio=self.step_synchronize_num_ratio, slow_synchronize_num=len(slow_synchronize_stream), total_synchronize_stream_time=total_slow_synchronize_time) @@ -78,6 +94,7 @@ class SynchronizeStreamChecker(TimelineBaseChecker): if not self.synchronize_issues: return priority = kwargs.get("priority") + rank = kwargs.get("rank") format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) html_render.render_template(key="schedule", template_dir="templates", @@ -88,4 +105,17 @@ class SynchronizeStreamChecker(TimelineBaseChecker): with_stack_doc_url=Config().timeline_with_stack_doc_url, empty_stacks=self.empty_stacks, framework_black_list=self.framework_black_list, - priority_background_color=priority) + priority_background_color=priority, + rank=rank) + + def get_priority(self, is_reach_max_ratio_limit=None, is_reach_max_num_limit=None, + is_reach_max_slow_num_limit=None): + if is_reach_max_ratio_limit or is_reach_max_num_limit: + return PriorityBackgroundColor.high + if is_reach_max_slow_num_limit: + return PriorityBackgroundColor.low + + def _cal_synchronize_stream_num_ratio(self, event_dataset): + if event_dataset.aten: + self.step_synchronize_num_ratio = round(safe_division(self.step_synchronize_num, len(event_dataset.aten)), + 4) diff --git a/profiler/advisor/common/async_analysis_status.py b/profiler/advisor/common/async_analysis_status.py index f67ca235a9..36b41e0d55 100644 --- a/profiler/advisor/common/async_analysis_status.py +++ b/profiler/advisor/common/async_analysis_status.py @@ -3,5 +3,7 @@ class AsyncAnalysisStatus: SUCCESS = "success" ANALYZING = "analyzing" - FAILED_STATUS_CODE = 400 + BAD_REQUEST_STATUS_CODE = 400 + NOT_FOUND_STATUS_CODE = 404 + INNER_ERROR_STATUS_CODE = 500 NON_FAILED_STATUS_CODE = 200 diff --git a/profiler/advisor/common/constant.py b/profiler/advisor/common/constant.py index 8c539d74c5..3030bf96c3 100644 --- a/profiler/advisor/common/constant.py +++ b/profiler/advisor/common/constant.py @@ -27,6 +27,7 @@ OPTIMIZER_SEP = "#" OPTIMIZER_STEP = "step" ENQUEUE = "enqueue" TORCH_TO_NPU = "torch_to_npu" +FREE = "free" OP_COMPILE_NAME = "AscendCL@aclopCompileAndExecute" OP_COMPILE_ID = "aclopCompileAndExecute" SYNC_STREAM = "AscendCL@aclrtSynchronizeStream" @@ -147,4 +148,5 @@ MAX_READ_DB_FILE_BYTES = 8 * 1024 * 1024 * 1024 WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP WRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC -ENABLE_PROFILING_COMPARISON = "ENABLE_PROFILING_COMPARISON" \ No newline at end of file +DISABLE_PROFILING_COMPARISON = "DISABLE_PROFILING_COMPARISON" +FREE_DURATION_FOR_GC_ANALYSIS = "FREE_DURATION_FOR_GC_ANALYSIS" \ No newline at end of file diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py index e0a12fab6f..659601587e 100644 --- a/profiler/advisor/dataset/timeline_event_dataset.py +++ b/profiler/advisor/dataset/timeline_event_dataset.py @@ -23,7 +23,9 @@ from profiler.advisor.dataset.timeline_op_collector.timeline_op_collector import AclToNpuCollector, OpStackCollector, StepCollector, - GcCollector + GcCollector, + FreeEventsCollector, + AclEventsCollector ) logger = logging.getLogger() @@ -162,7 +164,9 @@ class ScheduleAnalysisDataset(BaseTimelineEventDataset): SyncBNCollector=SyncBNCollector(), AtenCollector=AtenCollector(), OptimizerCollector=OptimizerCollector(), - GcCollector=GcCollector() + GcCollector=GcCollector(), + FreeEventsCollector=FreeEventsCollector(), + AclEventsCollector=AclEventsCollector() ) def __init__(self, collection_path, data: dict, build_dataset=True, **kwargs) -> None: diff --git a/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py b/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py index 56e6165dd2..5ea349ad9c 100644 --- a/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py +++ b/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py @@ -374,3 +374,53 @@ class GcCollector(BaseOpCollector): def post_process(self, target_op_list, **kwargs): self.attribute_to_dataset["gc_events"] = self.op_list + + +class FreeEventsCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + @staticmethod + def _load_rule(): + sync_stream_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + "rules", + "gc.yaml") + + gc_rule = FileManager.read_yaml_file(sync_stream_rule_path) + return gc_rule + + def add_op(self, event): + if event.name.lower() == const.FREE: + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + gc_rule = self._load_rule() + if os.getenv(const.FREE_DURATION_FOR_GC_ANALYSIS): + max_free_threshold = convert_to_float(os.getenv(const.FREE_DURATION_FOR_GC_ANALYSIS)) + else: + max_free_threshold = gc_rule.get("max_free_threshold") + + large_free_events = [] + + for op in target_op_list: + if convert_to_float(op.dur) > max_free_threshold: + large_free_events.append(op) + + large_free_events.sort(key=lambda x: convert_to_float(x.ts)) + self.attribute_to_dataset["large_free_events"] = large_free_events + + +class AclEventsCollector(BaseOpCollector): + ACL_EVENT_PREFIX = "AscendCL@" + + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.startswith(self.ACL_EVENT_PREFIX): + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + target_op_list.sort(key=lambda x: convert_to_float(x.ts)) + self.attribute_to_dataset["acl_events"] = target_op_list diff --git a/profiler/advisor/display/html/templates/affinity_api.html b/profiler/advisor/display/html/templates/affinity_api.html index e9f3dd29c4..7cd3d7ad33 100644 --- a/profiler/advisor/display/html/templates/affinity_api.html +++ b/profiler/advisor/display/html/templates/affinity_api.html @@ -2,6 +2,9 @@

Affinity API Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} The analysis results of following affinity APIs are based on runtime env cann-{{ cann_version }} and diff --git a/profiler/advisor/display/html/templates/ai_core_frequency.html b/profiler/advisor/display/html/templates/ai_core_frequency.html index 9e5f34cefe..405460ac96 100644 --- a/profiler/advisor/display/html/templates/ai_core_frequency.html +++ b/profiler/advisor/display/html/templates/ai_core_frequency.html @@ -2,6 +2,9 @@

AI CORE Frequency Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} Issue: {{ desc }}
Suggestion: {{ suggestion }} diff --git a/profiler/advisor/display/html/templates/gc.html b/profiler/advisor/display/html/templates/gc.html index e6357c9221..205e1b3b9e 100644 --- a/profiler/advisor/display/html/templates/gc.html +++ b/profiler/advisor/display/html/templates/gc.html @@ -2,6 +2,9 @@

GC Analysis

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }} @@ -16,6 +19,7 @@ {% endfor %} {% endfor %}
+ {% if datas|safe %} The details of top {{ num }} garbage collection events are as follows:

@@ -33,5 +37,6 @@ {% endfor %}
+ {% endif %}
diff --git a/profiler/advisor/display/html/templates/memory.html b/profiler/advisor/display/html/templates/memory.html index c350701098..a3d75877b6 100644 --- a/profiler/advisor/display/html/templates/memory.html +++ b/profiler/advisor/display/html/templates/memory.html @@ -1,6 +1,9 @@

Memory Operator Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }} diff --git a/profiler/advisor/display/html/templates/operator_ai_cpu.html b/profiler/advisor/display/html/templates/operator_ai_cpu.html index 90f3bb93a1..79be0c9e9b 100644 --- a/profiler/advisor/display/html/templates/operator_ai_cpu.html +++ b/profiler/advisor/display/html/templates/operator_ai_cpu.html @@ -1,6 +1,9 @@

AICPU Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
diff --git a/profiler/advisor/display/html/templates/operator_block_dim.html b/profiler/advisor/display/html/templates/operator_block_dim.html index 8079db30a7..0f3e909cfa 100644 --- a/profiler/advisor/display/html/templates/operator_block_dim.html +++ b/profiler/advisor/display/html/templates/operator_block_dim.html @@ -1,6 +1,9 @@

Block Dim Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
Description
diff --git a/profiler/advisor/display/html/templates/operator_dispatch.html b/profiler/advisor/display/html/templates/operator_dispatch.html index 7aa69251eb..85fdb41319 100644 --- a/profiler/advisor/display/html/templates/operator_dispatch.html +++ b/profiler/advisor/display/html/templates/operator_dispatch.html @@ -2,7 +2,9 @@

Operator Dispatch Issues

- + {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
Description
diff --git a/profiler/advisor/display/html/templates/operator_dynamic_shape.html b/profiler/advisor/display/html/templates/operator_dynamic_shape.html index 9d4ca028bb..fea1a80fee 100644 --- a/profiler/advisor/display/html/templates/operator_dynamic_shape.html +++ b/profiler/advisor/display/html/templates/operator_dynamic_shape.html @@ -1,6 +1,9 @@

Operator Dynamic Shape Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
Description
diff --git a/profiler/advisor/display/html/templates/operator_no_bound.html b/profiler/advisor/display/html/templates/operator_no_bound.html index d0b8925cbf..d71e6f6b68 100644 --- a/profiler/advisor/display/html/templates/operator_no_bound.html +++ b/profiler/advisor/display/html/templates/operator_no_bound.html @@ -1,6 +1,9 @@

Operator No Bound Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
Description
diff --git a/profiler/advisor/display/html/templates/pp_stage_computation_analysis.html b/profiler/advisor/display/html/templates/pp_stage_computation_analysis.html index 577aeda9e4..189e6fadf8 100644 --- a/profiler/advisor/display/html/templates/pp_stage_computation_analysis.html +++ b/profiler/advisor/display/html/templates/pp_stage_computation_analysis.html @@ -6,7 +6,7 @@

{{ stage_html['stage']|safe }}

- Description: analysis for slow rank {{ stage_html['rank_id']|safe }} in current stage + Description: analysis for slow rank {{ stage_html['rank']|safe }} in current stage

{% for html in stage_html['html_list'] %} {{ html|safe }} diff --git a/profiler/advisor/display/html/templates/slow_dataloader.html b/profiler/advisor/display/html/templates/slow_dataloader.html index bf71a7085b..b9ce7a574a 100644 --- a/profiler/advisor/display/html/templates/slow_dataloader.html +++ b/profiler/advisor/display/html/templates/slow_dataloader.html @@ -1,6 +1,9 @@

Slow Dataloader Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }}
Description
diff --git a/profiler/advisor/display/html/templates/sync_batchnorm.html b/profiler/advisor/display/html/templates/sync_batchnorm.html index bb46c1f06d..402404c8a4 100644 --- a/profiler/advisor/display/html/templates/sync_batchnorm.html +++ b/profiler/advisor/display/html/templates/sync_batchnorm.html @@ -2,6 +2,9 @@

SyncBatchNorm Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }}
diff --git a/profiler/advisor/display/html/templates/synchronize_stream.html b/profiler/advisor/display/html/templates/synchronize_stream.html index 1832f9406d..a622ac685c 100644 --- a/profiler/advisor/display/html/templates/synchronize_stream.html +++ b/profiler/advisor/display/html/templates/synchronize_stream.html @@ -1,6 +1,9 @@

Synchronize Stream Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }}
diff --git a/profiler/advisor/rules/aicpu_rules.yaml b/profiler/advisor/rules/aicpu_rules.yaml index 58e6eef163..bdbb71fecc 100644 --- a/profiler/advisor/rules/aicpu_rules.yaml +++ b/profiler/advisor/rules/aicpu_rules.yaml @@ -85,19 +85,19 @@ ExampleGuideChecker: - IndexPutChecker: op_type: [index] url: *AICPU_DOC_URL - suggestion: 'Please modify source code followed by this LINK, try to replace index operator with equivalent operator.' + suggestion: 'Please modify source code followed by this LINK, try to replace index operator with equivalent operator.' - NonzeroChecker: op_type: [ indexput, indexputv2 ] url: *AICPU_DOC_URL - suggestion: 'Please modify source code followed by this LINK, try to replace indexput operator with equivalent operator.' + suggestion: 'Please modify source code followed by this LINK, try to replace indexput operator with equivalent operator.' - CastChecker: op_type: [ argmin ] url: *AICPU_DOC_URL - suggestion: 'Please update your cann-tookit to at least 7.0.RC1 version by this LINK.' + suggestion: 'Please update your cann-tookit to at least 7.0.RC1 version by this LINK.' - CastChecker: op_type: [ nonzero ] url: *AICPU_DOC_URL - suggestion: 'Please modify source code followed by this LINK, try to replace nonzero operator with equivalent operator.' \ No newline at end of file + suggestion: 'Please modify source code followed by this LINK, try to replace nonzero operator with equivalent operator.' \ No newline at end of file diff --git a/profiler/advisor/rules/environment_variable_info.yaml b/profiler/advisor/rules/environment_variable_info.yaml index b91f827ef4..09323e8e82 100644 --- a/profiler/advisor/rules/environment_variable_info.yaml +++ b/profiler/advisor/rules/environment_variable_info.yaml @@ -7,14 +7,14 @@ HCCL_RDAM_TC: In the DS field of IP datagram header, the rightmost 6 bits are DSCP, and leftmost 2 bits are 0.\n It should be set to DSCP * 4. Default value is 132, that is, DSCP is 33 (132=33*4)." suggest: "Please refer to https://support.huawei.com/enterprise/zh/doc/EDOC1100371278/5eeeed85?idPath=23710424" - suggest_html: "Please refer to LINK" + suggest_html: "Please refer to LINK" HCCL_RDMA_SL: desc: "Specify the priority of the RDMA NIC.\n The value must be the same as the PFC priority for the NIC.\n Otherwise, the performance may deteriorate.\n The value range is [0, 7], and default value is 4." suggest: "Please refer to https://support.huawei.com/enterprise/zh/doc/EDOC1100371278/5eeeed85?idPath=23710424" - suggest_html: "Please refer to LINK" + suggest_html: "Please refer to LINK" ACLNN_CACHE_LIMIT: desc: "Number of cached aclnn operators." suggest: "Setting a large number when alcnn and host bound, such as 'export ACLNN_CACHE_LIMIT=100000'" diff --git a/profiler/advisor/rules/gc.yaml b/profiler/advisor/rules/gc.yaml index fad9b35235..f896439cb6 100644 --- a/profiler/advisor/rules/gc.yaml +++ b/profiler/advisor/rules/gc.yaml @@ -1,7 +1,11 @@ -problem: "Abnormal garbage collection (GC) event is detected for {gc_count} times, and the total time is {gc_total_time} ms\n. +gc_problem_with_count: "Abnormal garbage collection (GC) event is detected for {gc_count} times, and the total time is {gc_total_time} ms\n. The GC operation is time-consuming and blocks the entire process. As a result, some steps in the model training process take a longer time than other steps." -gc_threshold: 1000 #us +gc_problem_with_free: "Nearly no host tasks within {free_duration_time} microseconds(us) of free time, which is likely caused by abnormal garbage collection (GC) of Python" +gc_threshold: 10 #us top_num: 10 +max_free_threshold: 200000 # us +max_acl_event_num_ratio: 0.0001 # max 10 events per 100 ms +max_acl_event_time_ratio: 0.01 # total time of acl events no larger than 0.01 * free duration solutions: - memory management: desc: "implement effective Python memory management; release memory promptly when not in use to avoid long-term retention; avoid circular references between objects." diff --git a/profiler/advisor/rules/synchronize.yaml b/profiler/advisor/rules/synchronize.yaml index 3bd518d003..9fef821182 100644 --- a/profiler/advisor/rules/synchronize.yaml +++ b/profiler/advisor/rules/synchronize.yaml @@ -1,5 +1,6 @@ -problem: "SynchronizeStream will reduce training efficiency. Found {synchronize_num} SynchronizeStream, {slow_synchronize_num} slow SynchronizeStream cost {total_synchronize_stream_time} us." -max_synchronize_num: 20 +problem: "SynchronizeStream will reduce training efficiency. Found {synchronize_num} SynchronizeStream, ratio to aten operators is {synchronize_aten_ratio}, {slow_synchronize_num} slow SynchronizeStream cost {total_synchronize_stream_time} us." +max_synchronize_num: 1000 +max_synchronize_num_ratio: 0.3 slow_synchronize_threshold: 10 #ms solutions: - disable ascend launch blocking: diff --git a/profiler/advisor/utils/utils.py b/profiler/advisor/utils/utils.py index 71d3dd1730..f0868af5d1 100644 --- a/profiler/advisor/utils/utils.py +++ b/profiler/advisor/utils/utils.py @@ -499,8 +499,8 @@ def safe_index(array, index, return_value_if_error=None): def convert_to_int(data: any) -> int: try: - int_value = int(data) + int_value = int(convert_to_float(data)) except ValueError: - logger.error(f"Can not convert %ss to float.", data) + logger.error(f"Can not convert %s to int.", data) return 0 return int_value -- Gitee