diff --git a/profiler/advisor/analyzer/analyzer_controller.py b/profiler/advisor/analyzer/analyzer_controller.py index 833289816d9e84fec494f998bb61d14eb36597ce..74b1ef7f300607d73195dacd9f9848dd02f80732 100644 --- a/profiler/advisor/analyzer/analyzer_controller.py +++ b/profiler/advisor/analyzer/analyzer_controller.py @@ -18,6 +18,7 @@ from profiler.advisor.analyzer.cluster.slow_link_analyzer import SlowLinkAnalyze from profiler.advisor.analyzer.computation.pp_stage_computation_analyzer import PPStageComputationAnalyzer from profiler.advisor.analyzer.overall.overall_summary_analyzer import OverallSummaryAnalyzer from profiler.advisor.config.config import Config +from profiler.advisor.common import constant as const from profiler.advisor.common.analyzer_scopes import SupportedScopes from profiler.advisor.common.async_analysis_status import AsyncAnalysisStatus from profiler.advisor.utils.utils import Timer, safe_index_value, safe_division, safe_index @@ -141,7 +142,7 @@ class AnalyzerController: self._do_analysis(dimensions, pid=pid, async_resp=resp, **kwargs) except Exception as e: - self._update_analysis_process_resp(pid, resp, status_code=AsyncAnalysisStatus.FAILED_STATUS_CODE, + self._update_analysis_process_resp(pid, resp, status_code=AsyncAnalysisStatus.INNER_ERROR_STATUS_CODE, status=AsyncAnalysisStatus.FAILED, error_msg=str(e)) logger.error(e) raise RuntimeError(e) @@ -157,7 +158,24 @@ class AnalyzerController: return async_analysis_process def get_response_by_pid(self, pid): - return self.analysis_process_resp.get(pid) + def _is_pid_exists(pid): + try: + psutil.Process(pid) + return True + except psutil.NoSuchProcess: + return False + + pid_not_exist_response = dict(id=pid, status_code=AsyncAnalysisStatus.NOT_FOUND_STATUS_CODE, + status=AsyncAnalysisStatus.FAILED, + error_msg="The advisor task id does not exist") + if pid not in self.analysis_process_resp: + return pid_not_exist_response + + response = self.analysis_process_resp.get(pid) + if response.get("status") not in [AsyncAnalysisStatus.FAILED, + AsyncAnalysisStatus.SUCCESS] and not _is_pid_exists(pid): + return pid_not_exist_response + return response def single_rank_analysis(self, profiling_path, benchmark_profiling_path=None): job_list = [] @@ -223,7 +241,7 @@ class AnalyzerController: overall_analyzer.optimize() def schedule_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, benchmark_step=None, - **kwargs): + rank=None, **kwargs): # 任意单卡的下发分析 kwargs = copy.deepcopy(self.kwargs) @@ -233,6 +251,7 @@ class AnalyzerController: kwargs["benchmark_profiling_path"] = benchmark_profiling_path kwargs["step"] = step kwargs["benchmark_step"] = benchmark_step + kwargs["rank"] = rank for dimension in [Interface.SCHEDULE]: for scope in Interface.get_scope(dimension): @@ -241,7 +260,7 @@ class AnalyzerController: return job_list def computation_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, - benchmark_step=None, stage=None, **kwargs): + benchmark_step=None, stage=None, rank=None, **kwargs): # 任意单卡的计算分析 kwargs = copy.deepcopy(self.kwargs) @@ -250,6 +269,7 @@ class AnalyzerController: kwargs["step"] = step kwargs["benchmark_step"] = benchmark_step kwargs["stage"] = stage + kwargs["rank"] = rank job_list = [] for dimension in [Interface.COMPUTATION]: @@ -260,7 +280,7 @@ class AnalyzerController: job_list.append((dimension, scope, interface, kwargs)) return job_list - def memory_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, benchmark_step=None): + def memory_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, benchmark_step=None, rank=None): # 任意单卡的内存分析 kwargs = copy.deepcopy(self.kwargs) @@ -270,6 +290,7 @@ class AnalyzerController: kwargs["benchmark_profiling_path"] = benchmark_profiling_path kwargs["step"] = step kwargs["benchmark_step"] = benchmark_step + kwargs["rank"] = rank for dimension in [Interface.MEMORY]: for scope in Interface.get_scope(dimension): @@ -301,12 +322,18 @@ class AnalyzerController: job_list = [] global_step_rank = self.slow_rank_analyzer.get_global_step_rank(SlowRankAnalyzer.FREE) - slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") or self.default_rank_id + + slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") + if slow_rank_id is not None: + info_msg = f"Maximum free for rank {slow_rank_id}" + else: + slow_rank_id = self.default_rank_id + info_msg = f"No slow rank with free time, analysis for default rank {slow_rank_id}" + fast_rank_id = global_step_rank.get("minimum", {}).get("rank_id") or self.default_rank_id slow_step = global_step_rank.get("maximum", {}).get("step") fast_step = global_step_rank.get("minimum", {}).get("step") - info_msg = f"Maximum free for rank {slow_rank_id}" if slow_step: info_msg += f" and step {slow_step}" logger.info(info_msg) @@ -398,14 +425,14 @@ class AnalyzerController: if not self._check_profiling_path_valid(profiling_path): error_msg = f"Got invalid argument '-d/--profiling_path' {profiling_path}, skip analysis" self._update_analysis_process_resp(pid, async_resp, error_msg=error_msg, - status_code=AsyncAnalysisStatus.FAILED_STATUS_CODE, + status_code=AsyncAnalysisStatus.BAD_REQUEST_STATUS_CODE, status=AsyncAnalysisStatus.FAILED) logger.error(error_msg) return if benchmark_profiling_path and not self._check_profiling_path_valid(benchmark_profiling_path): error_msg = f"Got invalid argument '-bp/--benchmark_profiling_path' {benchmark_profiling_path}, skip analysis" self._update_analysis_process_resp(pid, async_resp, error_msg=error_msg, - status_code=AsyncAnalysisStatus.FAILED_STATUS_CODE, + status_code=AsyncAnalysisStatus.BAD_REQUEST_STATUS_CODE, status=AsyncAnalysisStatus.FAILED) logger.error(error_msg) return @@ -472,6 +499,11 @@ class AnalyzerController: def _profiling_comparison(self, compare_profiling_list): job_list = [] + disable_profiling_comparison = os.getenv(const.DISABLE_PROFILING_COMPARISON) + if disable_profiling_comparison is not None and disable_profiling_comparison.lower()=="true": + logger.info( + "Skip profiling comparison due to longer processing time due to env 'DISABLE_PROFILING_COMPARISON'") + return job_list for index, _kwargs in enumerate(compare_profiling_list): kwargs = copy.deepcopy(self.kwargs) @@ -650,13 +682,17 @@ class AnalyzerController: # 不区分stage,对所有卡取Min max进行分析 logger.info("Without pipeline parallel stage, steps and ranks to be analyzed are %s", json.dumps(global_step_rank)) - slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") or self.default_rank_id + slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") + if slow_rank_id: + info_msg = f"Maximum computation time for rank {slow_rank_id}" + else: + slow_rank_id = self.default_rank_id + info_msg = f"No slow rank with computation time, analysis for default rank {slow_rank_id}" slow_step = global_step_rank.get("maximum", {}).get("step") # 如果没有标杆profiling数据的rank id,说明没有快慢卡问题,直接对默认rank id进行分析,因此这里取值为None fast_rank_id = global_step_rank.get("minimum", {}).get("rank_id") fast_step = global_step_rank.get("minimum", {}).get("step") - info_msg = f"Maximum computation time for rank {slow_rank_id}" if slow_step is not None: info_msg += f" and step {slow_step}, " if fast_rank_id is not None: diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyzer.py b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py index 9653a25c197e5c76f21863d9c9c905b6b2e0b3d5..259e5eb0c4255afc97aad83210b72a14b7285888 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyzer.py +++ b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py @@ -22,7 +22,7 @@ from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset -from profiler.advisor.utils.utils import safe_index_value +from profiler.advisor.utils.utils import safe_index_value, convert_to_int logger = logging.getLogger() @@ -51,6 +51,7 @@ class SlowLinkAnalyzer(BaseAnalyzer): self.result = OptimizeResult() self.bottelneck = '' self.suggestion = '' + self.format_datas = {} if self.rank_bw_dict is not None: self.format_datas = self.format_details() @@ -104,7 +105,7 @@ class SlowLinkAnalyzer(BaseAnalyzer): data_list = [] for step_rank, rank_bw in self.rank_bw_dict.items(): - step_rank_list = list(map(int, step_rank.split(constant.STEP_RANK_SEP))) + step_rank_list = list(map(convert_to_int, step_rank.split(constant.STEP_RANK_SEP))) value_list = [rank_bw.get(i, 0) for i in headers] data_list.append(step_rank_list + value_list) data_list.sort(key=lambda x: (x[0], x[1])) # 按rank_id排序 @@ -147,6 +148,9 @@ class SlowLinkAnalyzer(BaseAnalyzer): def get_global_step_rank(self, bindwidth_type): global_step_rank = {} + if not self.format_datas: + return global_step_rank + bindwidth_key_map = {self.RDMA: self.RDMA_BANDWIDTH, self.SDMA: self.SDMA_BANDWIDTH} if bindwidth_type not in bindwidth_key_map: @@ -188,4 +192,4 @@ class SlowLinkAnalyzer(BaseAnalyzer): return global_step_rank def get_priority(self): - pass \ No newline at end of file + pass diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py b/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py index efe32184b1ae1ebf025ade9ab65d474b9dc90672..bb3a8fdbd597a40b54a7274592d9efd7f82461b9 100644 --- a/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py @@ -20,8 +20,7 @@ from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.dataset.cluster.cluster_dataset import ClusterStepTraceTimeDataset -from profiler.advisor.utils.utils import safe_index_value -from profiler.advisor.utils.utils import safe_division +from profiler.advisor.utils.utils import safe_index_value, safe_division, convert_to_int logger = logging.getLogger() @@ -114,7 +113,7 @@ class SlowRankAnalyzer(BaseAnalyzer): data_list = [] for key, value in self.step_trace_dict.items(): step, rank_id = key.split(constant.STEP_RANK_SEP) - data_list.append([int(step), int(rank_id)] + value) + data_list.append([convert_to_int(step), convert_to_int(rank_id)] + value) if step and step not in self._steps: self._steps.add(step) diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py index 1c6aadb156a0439d2ae7572eb48fad7f82659f37..049952931a7007a2642e910f326ac59a3a648edd 100644 --- a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py @@ -33,9 +33,10 @@ class AICoreFreqAnalyzer(BaseAnalyzer): add_render_list = kwargs.get("add_render_list", True) ai_core_freq_checker = AICoreFreqChecker() - ai_core_freq_checker.check_ai_core_freq(self.dataset, rank_id=kwargs.get("rank"), stage=kwargs.get("stage")) + ai_core_freq_checker.check_ai_core_freq(self.dataset, rank=kwargs.get("rank"), stage=kwargs.get("stage")) ai_core_freq_checker.make_record(self.result) - self.html = ai_core_freq_checker.make_render(self.html_render, add_render_list, priority=self.get_priority()) + self.html = ai_core_freq_checker.make_render(self.html_render, add_render_list, priority=self.get_priority(), + rank=kwargs.get("rank")) return self.result def get_priority(self): diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py index 05c5cd25ee09637487dac4c9c464abda5936624a..2fd49a22a35c41f3ff19c715d36a7103fde2e540 100644 --- a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py @@ -23,10 +23,10 @@ class AICoreFreqChecker: self.decrease_freq_ops = [] self.headers = [] self.op_freq = None - self.rank_id = None + self.rank = None self.stage = None - def check_ai_core_freq(self, event_dataset: ComputationAnalysisDataset, rank_id=None, stage=None): + def check_ai_core_freq(self, event_dataset: ComputationAnalysisDataset, rank=None, stage=None): """ :Param event_dataset: dataset of timeline event """ @@ -35,7 +35,7 @@ class AICoreFreqChecker: "because no ai core frequency were recorded in trace_view.json") return - self.rank_id = rank_id + self.rank = rank self.stage = stage self.op_freq = event_dataset.op_freq for op_name, op_info in self.op_freq.items(): @@ -67,8 +67,8 @@ class AICoreFreqChecker: self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") - if self.rank_id: - self.desc = f"For rank {self.rank_id}, " + self.desc.lower() + if self.rank: + self.desc = f"For rank {self.rank}, " + self.desc.lower() self.suggestions = "Please check the temperature or max power of your machine." def make_record(self, result: OptimizeResult): @@ -79,8 +79,8 @@ class AICoreFreqChecker: return self.ai_core_freq_issues sheet_name = "AI Core Frequency" - if self.rank_id is not None: - sheet_name = f"rank {self.rank_id} AI Core Frequency".capitalize() + if self.rank is not None: + sheet_name = f"rank {self.rank} AI Core Frequency".capitalize() optimization_item = OptimizeItem(sheet_name, self.desc, [self.suggestions]) result.add(OptimizeRecord(optimization_item)) @@ -108,4 +108,5 @@ class AICoreFreqChecker: headers=self.headers, data=self.decrease_freq_ops[:self.SHOW_TOPK_OPS], add_render_list=add_render_list, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) diff --git a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py index 394ad74fd7dcb739caa1f69929646f98207b2aa8..0c1b454cc8fc62a1522843d90335da5fb6be5709 100644 --- a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py +++ b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py @@ -157,7 +157,8 @@ class AicpuChecker(OperatorChecker): format_result=self.format_operator_result(record, constant.OPERATOR_LIST_UNLIMIT), add_render_list=add_render_list, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) def format_operator_result(self, record, limit): """ diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index 8b8e3fa9f6c98f8cabc98de60197894d6a34c541..cb6a824cb7b768bf8b9a2387ec90ee309d521445 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -62,7 +62,8 @@ class BlockDimChecker(OperatorChecker): format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK), add_render_list=add_render_list, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) def _check_operator(self, op_info) -> bool: if op_info.task_type not in ["AI_CORE", "AI_VECTOR_CORE", "MIX_AIC"]: diff --git a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py index 2096e9ffaf2e5e041716dea381e2d99824fefe0f..cc4e6f135c85339faef69e1dad5782d12fd597bd 100644 --- a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py +++ b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py @@ -54,4 +54,5 @@ class OperatorBoundChecker(OperatorChecker): format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK), add_render_list=add_render_list, - priority_background_color=priority) \ No newline at end of file + priority_background_color=priority, + rank=kwargs.get("rank")) \ No newline at end of file diff --git a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py index 9cd75a6e9323c63686cdad1f7c5efc2a408f64d5..639bc994ea5cbf3d0b62aff29e7aa8a6a768f498 100644 --- a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py +++ b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py @@ -12,8 +12,9 @@ logger = logging.getLogger() class DynamicShapeChecker(OperatorChecker): - ENABLE_COMPILED_SUGGESTION = "Optimize by enabling compiled operator, such as:\n" \ - "`torch_npu.npu.set_compile_mode(jit_compile=False)`\n" + ENABLE_COMPILED_SUGGESTION = "Please place the following code at the entrance of the python script to disable jit compile. " \ + "Code: `torch_npu.npu.set_compile_mode(jit_compile=False); " \ + "torch_npu.npu.config.allow_internal_format = False`" _SUGGESTION: List[str] = [ENABLE_COMPILED_SUGGESTION] _CHECKER = "dynamic shape operator" _PROBLEM = "Dynamic shape operator" @@ -28,13 +29,13 @@ class DynamicShapeChecker(OperatorChecker): def check(self, profiling_database) -> bool: return self.is_dynamic_shape(profiling_database) - def make_record(self, profiling_database, rank_id=None) -> OptimizeRecord: + def make_record(self, profiling_database, rank=None) -> OptimizeRecord: """ make record for what and how to optimize """ - if rank_id is not None: - self._PROBLEM = f"rank {rank_id} ".capitalize() + self._PROBLEM.lower() + if rank is not None: + self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() optimization_item = OptimizeItem( self._PROBLEM, self._description, @@ -56,7 +57,7 @@ class DynamicShapeChecker(OperatorChecker): release_suggestion = copy.deepcopy(suggestion) if release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION: release_suggestion += \ - f"for details please refer to link : LINK" + f"for details please refer to link : LINK" release_suggestion_list.append(release_suggestion.replace('\n', '
')) format_result = {"record": record.__dict__, "suggestion": '
'.join(release_suggestion_list)} return format_result @@ -68,4 +69,5 @@ class DynamicShapeChecker(OperatorChecker): template_name="operator_dynamic_shape.html", format_result=self.format_operator_result(record), add_render_list=add_render_list, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) diff --git a/profiler/advisor/analyzer/computation/operator_checker.py b/profiler/advisor/analyzer/computation/operator_checker.py index 397b9d507ef3baf50efc4bae698a94c767bb8148..02b3e17f5517c43c9a7c28e6538fe88a633c2798 100644 --- a/profiler/advisor/analyzer/computation/operator_checker.py +++ b/profiler/advisor/analyzer/computation/operator_checker.py @@ -95,15 +95,15 @@ class OperatorChecker(VersionControl): return True return False - def make_record(self, profiling_data: ProfilingDataset, rank_id=None): + def make_record(self, profiling_data: ProfilingDataset, rank=None): """ Make record for what and how to optimize :param profiling_data: profiling data :return: optimize record """ - if rank_id is not None: - self._PROBLEM = f"rank {rank_id} ".capitalize() + self._PROBLEM.lower() + if rank is not None: + self._PROBLEM = f"rank {rank} ".capitalize() + self._PROBLEM.lower() task_duration_list = [float(op_info.get_attr("task_duration")) for op_info in self._op_list if hasattr(op_info, "get_attr")] @@ -181,14 +181,14 @@ class OperatorChecker(VersionControl): release_suggestion = copy.deepcopy(suggestion) if release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION: release_suggestion += \ - (f"for details please refer to link : LINK") + (f"for details please refer to link : LINK") elif release_suggestion == OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION: release_suggestion += \ (f"\nThe config file for MSLite AOE usage is as follows:\n" \ f"[ascend_context]\n" \ f"aoe_mode=\"operator tuning\"\n" \ f"--tune_ops_file={Config().tune_ops_file}\n" - f"\nFor details please refer to link : LINK") + f"\nFor details please refer to link : LINK") release_suggestion_list.append(release_suggestion.replace('\n', '
')) format_result = {"record": record.__dict__, "suggestion": fill('
'.join(release_suggestion_list), width=200), diff --git a/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py b/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py index 64a6c2ceba594ab3d9c34b17527c2119e13bca9b..b84b983c3f8da790bb2eb314f979ce79e47c3e5b 100644 --- a/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py +++ b/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py @@ -59,7 +59,7 @@ class PPStageComputationAnalyzer(BaseAnalyzer): pass def _optimize(self, profiling_path, **kwargs): - stage_html_record = dict(stage=kwargs.get("stage"), rank_id=kwargs.get("rank"), step=kwargs.get("step")) + stage_html_record = dict(stage=kwargs.get("stage"), rank=kwargs.get("rank"), step=kwargs.get("step")) kwargs["add_render_list"] = False # stage 并行分析时,避免调用本身,即SupportedScopes.STAGE_COMPUTE diff --git a/profiler/advisor/analyzer/computation/profiling_analyzer.py b/profiler/advisor/analyzer/computation/profiling_analyzer.py index 20ebf1da7332ed8639f694a8cd9df16fec4ab0f4..a3e1b36fafd3ec8bee0e608f267703c0971995af 100644 --- a/profiler/advisor/analyzer/computation/profiling_analyzer.py +++ b/profiler/advisor/analyzer/computation/profiling_analyzer.py @@ -34,7 +34,7 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): """ profiling_data = self.get_first_data_by_key(self.dataset_list, ProfilingDataset.get_key()) checker = self.checker - rank_id = kwargs.get("rank") + rank = kwargs.get("rank") add_render_list = kwargs.get("add_render_list", True) @@ -42,16 +42,16 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): return self.result if checker.check(profiling_data): # add record - record = checker.make_record(profiling_data, rank_id) + record = checker.make_record(profiling_data, rank) self.html = checker.make_render(self.html_render, record, add_render_list, - priority=self.get_priority(checker)) + priority=self.get_priority(checker), rank=kwargs.get("rank")) self.result.add(record) # add details details = checker.get_details() if details: for i, detail in enumerate(details): - sheet_name = checker.get_name() if rank_id is None else \ - f"rank {rank_id} ".capitalize() + checker.get_name() + sheet_name = checker.get_name() if rank is None else \ + f"rank {rank} ".capitalize() + checker.get_name() if i == 0: # the first row is header self.result.add_detail(sheet_name, headers=detail) diff --git a/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py index 3d1a537c211a3ba26133f31e23284844d681d6e4..debbaa9eef493780f7e0d4ac2143f4e6dcc22f2e 100644 --- a/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py +++ b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py @@ -27,7 +27,7 @@ class DataloaderAnalyzer(BaseAnalyzer): dataloader_checker = DataloaderChecker() dataloader_checker.check_slow_dataloader(self.dataset) dataloader_checker.make_record(self.result) - dataloader_checker.make_render(self.html_render, priority=self.get_priority()) + dataloader_checker.make_render(self.html_render, priority=self.get_priority(), rank=kwargs.get("rank")) return self.result def get_priority(self): diff --git a/profiler/advisor/analyzer/dataloader/dataloader_checker.py b/profiler/advisor/analyzer/dataloader/dataloader_checker.py index f392a0838ac03fd180c6f5201c7fc489f19a2ab7..376729a1b61cf838189ed86735bec56b7806a6b1 100644 --- a/profiler/advisor/analyzer/dataloader/dataloader_checker.py +++ b/profiler/advisor/analyzer/dataloader/dataloader_checker.py @@ -62,7 +62,8 @@ class DataloaderChecker: template_name="slow_dataloader.html", desc=self.desc, suggestions=self.suggestions, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) def _init_rule(self): dataloader_rule_path = os.path.join( diff --git a/profiler/advisor/analyzer/memory/memory_analyzer.py b/profiler/advisor/analyzer/memory/memory_analyzer.py index 097b6e17949b1afa14e3b5804795f1df593f6084..5f34b03f2cf74001c868c9caf9d8c84b6ff53630 100644 --- a/profiler/advisor/analyzer/memory/memory_analyzer.py +++ b/profiler/advisor/analyzer/memory/memory_analyzer.py @@ -25,7 +25,7 @@ class MemoryAnalyzer(BaseAnalyzer): memory_checker = MemoryOpsChecker() memory_checker.check_memory_ops(self.dataset) memory_checker.make_record(self.result) - memory_checker.make_render(self.html_render, priority=self.get_priority(memory_checker.max_mem_op_dur)) + memory_checker.make_render(self.html_render, priority=self.get_priority(memory_checker.max_mem_op_dur), rank=kwargs.get("rank")) return self.result def get_priority(self, max_mem_op_dur): diff --git a/profiler/advisor/analyzer/memory/memory_checker.py b/profiler/advisor/analyzer/memory/memory_checker.py index b906ffbc89531a64d8f72fa6f478824bc8bf9d2c..b66761d7a4876e5c85965937c753241e96a7d55b 100644 --- a/profiler/advisor/analyzer/memory/memory_checker.py +++ b/profiler/advisor/analyzer/memory/memory_checker.py @@ -73,4 +73,5 @@ class MemoryOpsChecker: template_name="memory.html", desc=self.desc, suggestions=self.suggestions, - priority_background_color=priority) + priority_background_color=priority, + rank=kwargs.get("rank")) diff --git a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py index 58b2c301b590e74c054ef997b1973e7a595bbc73..126fe30176cf6ca0f1d7d3557c360f95af7b20be 100644 --- a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py +++ b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py @@ -51,7 +51,7 @@ class OpDispatchAnalyzer(BaseAnalyzer): """ self.get_op_compile_info(self.dataset) self.make_record(self.result) - self.make_render(self.html_render) + self.make_render(self.html_render, rank=kwargs.get('rank')) return self.result def get_op_compile_info(self, event_dataset: ScheduleAnalysisDataset): @@ -106,7 +106,8 @@ class OpDispatchAnalyzer(BaseAnalyzer): template_name="operator_dispatch.html", issues=issues, optimizers=optimizations, - priority_background_color=self.get_priority()) + priority_background_color=self.get_priority(), + rank=kwargs.get("rank")) def get_priority(self): step_duration = getattr(self.dataset, "step_duration", None) diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py index 97f1d052a1e1d9223165e2465b6cbf0897f89069..b40e258818319d0c7428ce71a6d45bc9d1cc2026 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -45,7 +45,7 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): logger.info("Finish timeline analysis") self.make_record() - self.make_render() + self.make_render(rank=kwargs.get("rank")) return self.result def find_fusion_ops(self, event_dataset, ops: str, npu_api: str, mode: str): @@ -180,7 +180,8 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): detail = [api_name, *stack] self.result.add_detail(sheet_name, detail=detail) - def make_render(self): + def make_render(self, **kwargs): + rank = kwargs.get("rank") format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) self.html_render.render_template(key="schedule", @@ -192,7 +193,8 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): with_stack_doc_url=Config().timeline_with_stack_doc_url, api_doc_url=Config().timeline_api_doc_url, result=format_result_for_html, - priority_background_color=self.get_priority()) + priority_background_color=self.get_priority(), + rank=rank) def query_stack(self, event_dataset): if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]): diff --git a/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py index 3d142819db8222106a6bd58eb4341d43a8ca3b59..a504a21c70a839b9fec29b99f52275839ef9bf10 100644 --- a/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py +++ b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py @@ -36,9 +36,9 @@ class GcAnalyzer(BaseAnalyzer): @BaseAnalyzer.check_data((ScheduleAnalysisDataset.get_key(),)) def optimize(self, **kwargs): gc_checker = GcChecker() - gc_checker.check_gc(self.timeline_event_dataset, rank_id=kwargs.get("rank_id"), stage=kwargs.get("stage")) + gc_checker.check_gc(self.timeline_event_dataset, rank=kwargs.get("rank"), stage=kwargs.get("stage")) gc_checker.make_record(self.result) - gc_checker.make_render(self.html_render, priority=self.get_priority()) + gc_checker.make_render(self.html_render, priority=self.get_priority(), rank=kwargs.get("rank")) return self.result def get_priority(self): diff --git a/profiler/advisor/analyzer/schedule/gc/gc_checker.py b/profiler/advisor/analyzer/schedule/gc/gc_checker.py index 1fbddf655758d4a4d31fe6925f6240b085c6c975..37a225ef8e597dece5f72d9f12d70846bd707188 100644 --- a/profiler/advisor/analyzer/schedule/gc/gc_checker.py +++ b/profiler/advisor/analyzer/schedule/gc/gc_checker.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +import math import os from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.utils.utils import convert_to_float, convert_to_int, safe_division +from profiler.advisor.common import constant as const from profiler.cluster_analyse.common_func.file_manager import FileManager -from profiler.advisor.utils.utils import convert_to_float, convert_to_int logger = logging.getLogger() @@ -28,9 +30,11 @@ class GcChecker: def __init__(self): self.stage = None - self.rank_id = None + self.rank = None self.optimization_item = [] self.gc_issues = False + self.gc_problem_with_count = "" + self.gc_problem_with_free = "" self.desc = "" self.suggestions = [] self.solutions = None @@ -42,15 +46,29 @@ class GcChecker: self.headers = ["timestamp", "duration(us)"] self._init_rule() - def check_gc(self, event_dataset: ScheduleAnalysisDataset, rank_id=None, stage=None): + def check_gc(self, event_dataset: ScheduleAnalysisDataset, rank=None, stage=None): """ :Param event_dataset: dataset of timeline event """ if not hasattr(event_dataset, "gc_events"): logger.debug("Skip gc checker, because no gc event found") return - self.rank_id = rank_id + self.rank = rank self.stage = stage + + # 当用户cann和pta版本不支持采集gc信息时,通过timeline中的free和cann层acl事件 综合判断是否可能存在free + if not event_dataset.gc_events: + acl_events = getattr(event_dataset, "acl_events", []) + large_free_events = getattr(event_dataset, "large_free_events", []) + # 如果acl_events为空,则没有采集cann信息,不基于free+acl events进行gc分析 + if acl_events and large_free_events: + free_event = self.get_free_events_include_gc(large_free_events, acl_events) + if not free_event: + return + self.desc = self.gc_problem_with_free.format(free_duration_time=free_event.dur) + + return + for gc_event in event_dataset.gc_events: if convert_to_float(gc_event.dur) >= self.gc_threshold: self.gc_issues = True @@ -59,7 +77,8 @@ class GcChecker: self.abnormal_gc_list.append([gc_event.ts, gc_event.dur]) self.abnormal_gc_duration = round(self.abnormal_gc_duration / 1000, 4) self.abnormal_gc_list.sort(key=lambda x: x[1], reverse=True) - self.desc = self.desc.format(gc_count=self.abnormal_gc_count, gc_total_time=self.abnormal_gc_duration) + self.desc = self.gc_problem_with_count.format(gc_count=self.abnormal_gc_count, + gc_total_time=self.abnormal_gc_duration) def make_record(self, result: OptimizeResult): """ @@ -68,23 +87,24 @@ class GcChecker: if not self.gc_issues: return - self.optimization_item.append(OptimizeItem("gc", self.desc, self.suggestions)) + self.optimization_item.append(OptimizeItem("GC", self.desc, self.suggestions)) for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) - if self.rank_id is not None: + if self.rank is not None: self.headers = ["Rank id"] + self.headers sub_table_name = "GcAnalysis" if not self.stage else f"Stage-{self.stage}: GcAnalysis" result.add_detail(sub_table_name, headers=self.headers) for row in self.abnormal_gc_list: - if self.rank_id is not None: - row = [self.rank_id] + row + if self.rank is not None: + row = [self.rank] + row result.add_detail(sub_table_name, detail=row) def make_render(self, html_render, **kwargs): if not self.gc_issues: return priority = kwargs.get("priority") + rank = kwargs.get("rank") show_num = min(self.gc_topk_num, self.abnormal_gc_count) html_render.render_template(key="schedule", template_dir="templates", @@ -94,7 +114,8 @@ class GcChecker: headers=self.headers, datas=self.abnormal_gc_list[:show_num], num=show_num, - priority_background_color=priority) + priority_background_color=priority, + rank=rank) def _init_rule(self): gc_rule_path = os.path.join( @@ -107,9 +128,62 @@ class GcChecker: self.gc_threshold = convert_to_float(gc_rule.get("gc_threshold", 0)) self.gc_topk_num = convert_to_int(gc_rule.get("top_num", 0)) - self.desc = gc_rule.get("problem", "") + self.gc_problem_with_count = gc_rule.get("gc_problem_with_count", "") + self.gc_problem_with_free = gc_rule.get("gc_problem_with_free", "") + self.max_acl_event_num_ratio = convert_to_float(gc_rule.get("max_acl_event_num_ratio")) + self.max_acl_event_time_ratio = convert_to_float(gc_rule.get("max_acl_event_time_ratio")) self.solutions = gc_rule.get("solutions", []) for solution in self.solutions: for key, val in solution.items(): self.suggestions.append(f"{key}, {val.get('desc')}") + + def get_free_events_include_gc(self, large_free_events, acl_events): + free_event_index, acl_event_index = 0, 0 + free_include_acl_events = {} + + while free_event_index < len(large_free_events) and acl_event_index < len(acl_events): + free_event = large_free_events[free_event_index] + free_event_name = f"{const.FREE}-{free_event_index}" + free_event_start_time = convert_to_float(free_event.ts) + free_event_end_time = free_event_start_time + convert_to_float(free_event.dur) + + while acl_event_index < len(acl_events): + acl_event = acl_events[acl_event_index] + acl_event_index += 1 + acl_event_start_time = convert_to_float(acl_event.ts) + acl_event_end_time = acl_event_start_time + convert_to_float(acl_event.dur) + + if acl_event_start_time < free_event_start_time: + continue + if acl_event_end_time > free_event_end_time: + break + + if free_event_name not in free_include_acl_events: + free_include_acl_events[free_event_name] = {} + + if "acl_event_count" not in free_include_acl_events[free_event_name]: + free_include_acl_events[free_event_name]["acl_event_count"] = 0.0 + free_include_acl_events[free_event_name]["acl_event_count"] += 1 + + if "acl_event_dur" not in free_include_acl_events[free_event_name]: + free_include_acl_events[free_event_name]["acl_event_dur"] = 0.0 + free_include_acl_events[free_event_name]["acl_event_dur"] += convert_to_float(acl_event.dur) + + free_event_index += 1 + + # 按free持续时间降序排列,优先判断持续时间最长的free + event_indexs = range(len(large_free_events)) + for index, free_event in sorted(zip(event_indexs, large_free_events), key=lambda x: x[1].dur, reverse=True): + + + free_event_name = f"{const.FREE}-{index}" + free_duration = convert_to_float(free_event.dur) + acl_event_dur = free_include_acl_events.get(free_event_name, {}).get("acl_event_dur", 0.0) + acl_event_count = free_include_acl_events.get(free_event_name, {}).get("acl_event_count", 0.0) + + if safe_division(acl_event_dur, free_duration) < self.max_acl_event_time_ratio and safe_division( + acl_event_count, free_duration) < self.max_acl_event_num_ratio: + self.gc_issues = True + return free_event + return None diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py index df8c22fa5161d8f4315748cb629a3dd19b79e39a..b123bc3cca848de964802c0920c1bb0ee2c187d9 100644 --- a/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py @@ -25,7 +25,7 @@ class SyncBNAnalyzer(BaseAnalyzer): syncbn_checker = SyncBNChecker() syncbn_checker.check_syncbn(self.timeline_event_dataset) syncbn_checker.make_record(self.result) - syncbn_checker.make_render(self.html_render, priority=self.get_priority()) + syncbn_checker.make_render(self.html_render, priority=self.get_priority(), rank=kwargs.get("rank")) return self.result def get_priority(self): diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py index e83a1549184b2a48f5ddc25ae15f6cece34825c2..04556ee743a5b9812aaf3b5dcda8aafcef81e4dd 100644 --- a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py @@ -48,12 +48,14 @@ class SyncBNChecker: return priority = kwargs.get("priority") + rank = kwargs.get("rank") html_render.render_template(key="schedule", template_dir="templates", template_name="sync_batchnorm.html", desc=self.desc, solutions=self.solutions, - priority_background_color=priority) + priority_background_color=priority, + rank=rank) def _init_rule(self): syncbn_rule_path = os.path.join( diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py index 61ec7d1fa602f8359ce2bf9d1ae0297151588ef3..965c2bcf3a1e8710e0aaed1c66f684fad800961c 100644 --- a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py @@ -1,11 +1,8 @@ import logging -from typing import List, Dict, Any - from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.result.result import OptimizeResult from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_checker import SynchronizeStreamChecker -from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset @@ -25,13 +22,12 @@ class SynchronizeStreamAnalyzer(BaseAnalyzer): @BaseAnalyzer.check_data((ScheduleAnalysisDataset.get_key(),)) def optimize(self, **kwargs): - synchronize_stream_checker = SynchronizeStreamChecker() synchronize_stream_checker.check_synchronize(self.timeline_event_dataset, kwargs.get("profiling_with_stack")) synchronize_stream_checker.make_record(self.result) - synchronize_stream_checker.make_render(self.html_render, priority=self.get_priority()) + synchronize_stream_checker.make_render(self.html_render, priority=self.get_priority(synchronize_stream_checker), + rank=kwargs.get("rank")) return self.result - - def get_priority(self): - return PriorityBackgroundColor.low \ No newline at end of file + def get_priority(self, synchronize_stream_checker): + return synchronize_stream_checker.priority diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py index 9136d611db74f07ef7d8811507e5923fdee18dbd..9f25c0c1a7006adac4c77fae7ef796876cb6d0e2 100644 --- a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py @@ -1,12 +1,13 @@ import logging +from profiler.advisor.analyzer.schedule.timeline_base_checker import TimelineBaseChecker from profiler.advisor.common import constant as const from profiler.advisor.config.config import Config from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.advisor.analyzer.schedule.timeline_base_checker import TimelineBaseChecker -from profiler.advisor.utils.utils import format_timeline_result +from profiler.advisor.utils.utils import format_timeline_result, safe_division logger = logging.getLogger() @@ -20,23 +21,37 @@ class SynchronizeStreamChecker(TimelineBaseChecker): self.desc = "" self.suggestions = [] self.solutions = [] - self.max_synchronize_num = None + self.max_synchronize_num = 0 + self.max_synchronize_num_ratio = 0 + self.step_synchronize_num = 0 + self.step_synchronize_num_ratio = 0 + self.priority = None def check_synchronize(self, event_dataset: ScheduleAnalysisDataset, profiling_with_stack=None): """ :Param event_dataset: dataset of timeline event """ if not hasattr(event_dataset, "synchronize_stream") or not getattr(event_dataset, "synchronize_stream"): - logger.debug("Skip synchronize stream checker, because no synchronize stream found") + logger.info("Skip synchronize stream checker, because no synchronize stream found") return - synchronize_num = event_dataset.synchronize_stream.total_count + self.step_synchronize_num = event_dataset.synchronize_stream.total_count + self._cal_synchronize_stream_num_ratio(event_dataset) + slow_synchronize_stream = event_dataset.synchronize_stream.slow_synchronize_stream total_slow_synchronize_time = sum((float(sync_stream.dur) for sync_stream in slow_synchronize_stream)) synchronize_stream_rule = event_dataset.synchronize_stream.rule self.max_synchronize_num = synchronize_stream_rule.get("max_synchronize_num") - self.synchronize_issues = synchronize_num >= self.max_synchronize_num and len(slow_synchronize_stream) > 0 + self.max_synchronize_num_ratio = synchronize_stream_rule.get("max_synchronize_num_ratio") + + is_reach_max_ratio_limit = self.step_synchronize_num_ratio >= self.max_synchronize_num_ratio + is_reach_max_num_limit = self.step_synchronize_num >= self.max_synchronize_num + is_reach_max_slow_num_limit = len(slow_synchronize_stream) > 0 + + self.priority = self.get_priority(is_reach_max_ratio_limit, is_reach_max_num_limit, is_reach_max_slow_num_limit) + self.synchronize_issues = is_reach_max_ratio_limit or is_reach_max_num_limit or is_reach_max_slow_num_limit + if not self.synchronize_issues: return @@ -47,7 +62,8 @@ class SynchronizeStreamChecker(TimelineBaseChecker): self.query_stack(event_dataset, profiling_with_stack) self.desc = synchronize_stream_rule.get("problem") - self.desc = self.desc.format(synchronize_num=synchronize_num, + self.desc = self.desc.format(synchronize_num=self.step_synchronize_num, + synchronize_aten_ratio=self.step_synchronize_num_ratio, slow_synchronize_num=len(slow_synchronize_stream), total_synchronize_stream_time=total_slow_synchronize_time) @@ -78,6 +94,7 @@ class SynchronizeStreamChecker(TimelineBaseChecker): if not self.synchronize_issues: return priority = kwargs.get("priority") + rank = kwargs.get("rank") format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) html_render.render_template(key="schedule", template_dir="templates", @@ -88,4 +105,17 @@ class SynchronizeStreamChecker(TimelineBaseChecker): with_stack_doc_url=Config().timeline_with_stack_doc_url, empty_stacks=self.empty_stacks, framework_black_list=self.framework_black_list, - priority_background_color=priority) + priority_background_color=priority, + rank=rank) + + def get_priority(self, is_reach_max_ratio_limit=None, is_reach_max_num_limit=None, + is_reach_max_slow_num_limit=None): + if is_reach_max_ratio_limit or is_reach_max_num_limit: + return PriorityBackgroundColor.high + if is_reach_max_slow_num_limit: + return PriorityBackgroundColor.low + + def _cal_synchronize_stream_num_ratio(self, event_dataset): + if event_dataset.aten: + self.step_synchronize_num_ratio = round(safe_division(self.step_synchronize_num, len(event_dataset.aten)), + 4) diff --git a/profiler/advisor/common/async_analysis_status.py b/profiler/advisor/common/async_analysis_status.py index f67ca235a97c54cd107308a030a3b82d0eaf3352..36b41e0d55b0e1d4ae35f7ad68d23d3a9e7afe0d 100644 --- a/profiler/advisor/common/async_analysis_status.py +++ b/profiler/advisor/common/async_analysis_status.py @@ -3,5 +3,7 @@ class AsyncAnalysisStatus: SUCCESS = "success" ANALYZING = "analyzing" - FAILED_STATUS_CODE = 400 + BAD_REQUEST_STATUS_CODE = 400 + NOT_FOUND_STATUS_CODE = 404 + INNER_ERROR_STATUS_CODE = 500 NON_FAILED_STATUS_CODE = 200 diff --git a/profiler/advisor/common/constant.py b/profiler/advisor/common/constant.py index 647856d3cd7c0f2378707344b5ba3bebd9d813e8..3030bf96c335c9f1654ac3d3a2731dd3380e64f9 100644 --- a/profiler/advisor/common/constant.py +++ b/profiler/advisor/common/constant.py @@ -27,6 +27,7 @@ OPTIMIZER_SEP = "#" OPTIMIZER_STEP = "step" ENQUEUE = "enqueue" TORCH_TO_NPU = "torch_to_npu" +FREE = "free" OP_COMPILE_NAME = "AscendCL@aclopCompileAndExecute" OP_COMPILE_ID = "aclopCompileAndExecute" SYNC_STREAM = "AscendCL@aclrtSynchronizeStream" @@ -146,3 +147,6 @@ MAX_READ_DB_FILE_BYTES = 8 * 1024 * 1024 * 1024 WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP WRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC + +DISABLE_PROFILING_COMPARISON = "DISABLE_PROFILING_COMPARISON" +FREE_DURATION_FOR_GC_ANALYSIS = "FREE_DURATION_FOR_GC_ANALYSIS" \ No newline at end of file diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py index e0a12fab6fa9e3dd381839e94572d401088461e0..659601587e1926c7e98d3a6542febde414b6b9cb 100644 --- a/profiler/advisor/dataset/timeline_event_dataset.py +++ b/profiler/advisor/dataset/timeline_event_dataset.py @@ -23,7 +23,9 @@ from profiler.advisor.dataset.timeline_op_collector.timeline_op_collector import AclToNpuCollector, OpStackCollector, StepCollector, - GcCollector + GcCollector, + FreeEventsCollector, + AclEventsCollector ) logger = logging.getLogger() @@ -162,7 +164,9 @@ class ScheduleAnalysisDataset(BaseTimelineEventDataset): SyncBNCollector=SyncBNCollector(), AtenCollector=AtenCollector(), OptimizerCollector=OptimizerCollector(), - GcCollector=GcCollector() + GcCollector=GcCollector(), + FreeEventsCollector=FreeEventsCollector(), + AclEventsCollector=AclEventsCollector() ) def __init__(self, collection_path, data: dict, build_dataset=True, **kwargs) -> None: diff --git a/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py b/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py index 56e6165dd24aa4d7a8aafaab455793c6c8df8e13..5ea349ad9cea249158d061e8f4fee7361486fdfa 100644 --- a/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py +++ b/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py @@ -374,3 +374,53 @@ class GcCollector(BaseOpCollector): def post_process(self, target_op_list, **kwargs): self.attribute_to_dataset["gc_events"] = self.op_list + + +class FreeEventsCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + @staticmethod + def _load_rule(): + sync_stream_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + "rules", + "gc.yaml") + + gc_rule = FileManager.read_yaml_file(sync_stream_rule_path) + return gc_rule + + def add_op(self, event): + if event.name.lower() == const.FREE: + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + gc_rule = self._load_rule() + if os.getenv(const.FREE_DURATION_FOR_GC_ANALYSIS): + max_free_threshold = convert_to_float(os.getenv(const.FREE_DURATION_FOR_GC_ANALYSIS)) + else: + max_free_threshold = gc_rule.get("max_free_threshold") + + large_free_events = [] + + for op in target_op_list: + if convert_to_float(op.dur) > max_free_threshold: + large_free_events.append(op) + + large_free_events.sort(key=lambda x: convert_to_float(x.ts)) + self.attribute_to_dataset["large_free_events"] = large_free_events + + +class AclEventsCollector(BaseOpCollector): + ACL_EVENT_PREFIX = "AscendCL@" + + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.startswith(self.ACL_EVENT_PREFIX): + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + target_op_list.sort(key=lambda x: convert_to_float(x.ts)) + self.attribute_to_dataset["acl_events"] = target_op_list diff --git a/profiler/advisor/display/html/templates/affinity_api.html b/profiler/advisor/display/html/templates/affinity_api.html index e9f3dd29c433c6d2481fe755ab5426d42f94a50f..7cd3d7ad33d0220c7aba055721eddf049161a0d8 100644 --- a/profiler/advisor/display/html/templates/affinity_api.html +++ b/profiler/advisor/display/html/templates/affinity_api.html @@ -2,6 +2,9 @@

Affinity API Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} The analysis results of following affinity APIs are based on runtime env cann-{{ cann_version }} and diff --git a/profiler/advisor/display/html/templates/ai_core_frequency.html b/profiler/advisor/display/html/templates/ai_core_frequency.html index 9e5f34cefed3c3ae3ba176cc54c1ff5875bedcbb..405460ac9616740613bc337d705d617cc9de9287 100644 --- a/profiler/advisor/display/html/templates/ai_core_frequency.html +++ b/profiler/advisor/display/html/templates/ai_core_frequency.html @@ -2,6 +2,9 @@

AI CORE Frequency Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} Issue: {{ desc }}
Suggestion: {{ suggestion }} diff --git a/profiler/advisor/display/html/templates/gc.html b/profiler/advisor/display/html/templates/gc.html index e6357c92210012b58b2e6cf1f447a427543d545e..205e1b3b9ede3282189864f116a9c650b59626df 100644 --- a/profiler/advisor/display/html/templates/gc.html +++ b/profiler/advisor/display/html/templates/gc.html @@ -2,6 +2,9 @@

GC Analysis

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }} @@ -16,6 +19,7 @@ {% endfor %} {% endfor %}
+ {% if datas|safe %} The details of top {{ num }} garbage collection events are as follows:

@@ -33,5 +37,6 @@ {% endfor %}
+ {% endif %}
diff --git a/profiler/advisor/display/html/templates/memory.html b/profiler/advisor/display/html/templates/memory.html index c3507010985de5fbf4d23dd08af137abdaac112d..a3d75877b60ef3481a13572fbd6b0e2bb5eaf2a0 100644 --- a/profiler/advisor/display/html/templates/memory.html +++ b/profiler/advisor/display/html/templates/memory.html @@ -1,6 +1,9 @@

Memory Operator Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }} diff --git a/profiler/advisor/display/html/templates/operator_ai_cpu.html b/profiler/advisor/display/html/templates/operator_ai_cpu.html index 90f3bb93a1cb548236fe4b1b02a3a86b0058d546..79be0c9e9ba8539b02f007c8325a637f8dc01f44 100644 --- a/profiler/advisor/display/html/templates/operator_ai_cpu.html +++ b/profiler/advisor/display/html/templates/operator_ai_cpu.html @@ -1,6 +1,9 @@

AICPU Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
diff --git a/profiler/advisor/display/html/templates/operator_block_dim.html b/profiler/advisor/display/html/templates/operator_block_dim.html index 8079db30a7759155f7342307ecca1adf84fa2dc1..0f3e909cfa777590cb5fa432e11a5c00a44f461c 100644 --- a/profiler/advisor/display/html/templates/operator_block_dim.html +++ b/profiler/advisor/display/html/templates/operator_block_dim.html @@ -1,6 +1,9 @@

Block Dim Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
Description
diff --git a/profiler/advisor/display/html/templates/operator_dispatch.html b/profiler/advisor/display/html/templates/operator_dispatch.html index 7aa69251ebeca068f39378427ad3e2505b7c6742..85fdb41319d3a4e9b914cadd3982d347bc62471e 100644 --- a/profiler/advisor/display/html/templates/operator_dispatch.html +++ b/profiler/advisor/display/html/templates/operator_dispatch.html @@ -2,7 +2,9 @@

Operator Dispatch Issues

- + {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
Description
diff --git a/profiler/advisor/display/html/templates/operator_dynamic_shape.html b/profiler/advisor/display/html/templates/operator_dynamic_shape.html index 9d4ca028bbd5ad9cd776aa03b9468abf9b4f36e3..fea1a80fee59bf7f54de2567d9465425a362a43f 100644 --- a/profiler/advisor/display/html/templates/operator_dynamic_shape.html +++ b/profiler/advisor/display/html/templates/operator_dynamic_shape.html @@ -1,6 +1,9 @@

Operator Dynamic Shape Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
Description
diff --git a/profiler/advisor/display/html/templates/operator_no_bound.html b/profiler/advisor/display/html/templates/operator_no_bound.html index d0b8925cbf07b2ccd57767704c75bb3e99401ea4..d71e6f6b686053e7c50a10a26a3d8031b08f94d9 100644 --- a/profiler/advisor/display/html/templates/operator_no_bound.html +++ b/profiler/advisor/display/html/templates/operator_no_bound.html @@ -1,6 +1,9 @@

Operator No Bound Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %}
Description
diff --git a/profiler/advisor/display/html/templates/pp_stage_computation_analysis.html b/profiler/advisor/display/html/templates/pp_stage_computation_analysis.html index 577aeda9e4c094fc0d1ff6ecfcf69de4546cc240..189e6fadf863e5d1ec930690e9ef8b3012d15c51 100644 --- a/profiler/advisor/display/html/templates/pp_stage_computation_analysis.html +++ b/profiler/advisor/display/html/templates/pp_stage_computation_analysis.html @@ -6,7 +6,7 @@

{{ stage_html['stage']|safe }}

- Description: analysis for slow rank {{ stage_html['rank_id']|safe }} in current stage + Description: analysis for slow rank {{ stage_html['rank']|safe }} in current stage

{% for html in stage_html['html_list'] %} {{ html|safe }} diff --git a/profiler/advisor/display/html/templates/slow_dataloader.html b/profiler/advisor/display/html/templates/slow_dataloader.html index bf71a7085b70d80d04d76cfb1778029e5fdf9353..b9ce7a574ab2a838633cb7c5181cfecb737097c9 100644 --- a/profiler/advisor/display/html/templates/slow_dataloader.html +++ b/profiler/advisor/display/html/templates/slow_dataloader.html @@ -1,6 +1,9 @@

Slow Dataloader Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }}
Description
diff --git a/profiler/advisor/display/html/templates/sync_batchnorm.html b/profiler/advisor/display/html/templates/sync_batchnorm.html index bb46c1f06d15ed84b4ffa276d614a317c656cf22..402404c8a43706ec4a598300eec42c7d2b7767cc 100644 --- a/profiler/advisor/display/html/templates/sync_batchnorm.html +++ b/profiler/advisor/display/html/templates/sync_batchnorm.html @@ -2,6 +2,9 @@

SyncBatchNorm Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }}
diff --git a/profiler/advisor/display/html/templates/synchronize_stream.html b/profiler/advisor/display/html/templates/synchronize_stream.html index 1832f9406d3e234f80278f2065f461c2db4ae82b..a622ac685cd4cf8ec6f72ad1bf8101b168076e8a 100644 --- a/profiler/advisor/display/html/templates/synchronize_stream.html +++ b/profiler/advisor/display/html/templates/synchronize_stream.html @@ -1,6 +1,9 @@

Synchronize Stream Issues

+ {% if rank is not none %} + Analysis of rank {{ rank|safe }}. + {% endif %} {{ desc }}
diff --git a/profiler/advisor/interface/interface.py b/profiler/advisor/interface/interface.py index 55252d90d4739b9ef5392b51efd57b8dbd2a8302..d6d42e5c49b53144ab485b6273f6f9ec023c52cc 100644 --- a/profiler/advisor/interface/interface.py +++ b/profiler/advisor/interface/interface.py @@ -48,6 +48,7 @@ from profiler.advisor.analyzer.comparison.comparison_analyzer import ComparisonA logger = logging.getLogger() + class Interface: SCHEDULE = "schedule" COMPUTATION = "computation" diff --git a/profiler/advisor/rules/aicpu_rules.yaml b/profiler/advisor/rules/aicpu_rules.yaml index 58e6eef163204ea1b5efbb5148770948bd4afdad..bdbb71fecc7df9ec0ab9f82fec2fb8decc1b001c 100644 --- a/profiler/advisor/rules/aicpu_rules.yaml +++ b/profiler/advisor/rules/aicpu_rules.yaml @@ -85,19 +85,19 @@ ExampleGuideChecker: - IndexPutChecker: op_type: [index] url: *AICPU_DOC_URL - suggestion: 'Please modify source code followed by this LINK, try to replace index operator with equivalent operator.' + suggestion: 'Please modify source code followed by this LINK, try to replace index operator with equivalent operator.' - NonzeroChecker: op_type: [ indexput, indexputv2 ] url: *AICPU_DOC_URL - suggestion: 'Please modify source code followed by this LINK, try to replace indexput operator with equivalent operator.' + suggestion: 'Please modify source code followed by this LINK, try to replace indexput operator with equivalent operator.' - CastChecker: op_type: [ argmin ] url: *AICPU_DOC_URL - suggestion: 'Please update your cann-tookit to at least 7.0.RC1 version by this LINK.' + suggestion: 'Please update your cann-tookit to at least 7.0.RC1 version by this LINK.' - CastChecker: op_type: [ nonzero ] url: *AICPU_DOC_URL - suggestion: 'Please modify source code followed by this LINK, try to replace nonzero operator with equivalent operator.' \ No newline at end of file + suggestion: 'Please modify source code followed by this LINK, try to replace nonzero operator with equivalent operator.' \ No newline at end of file diff --git a/profiler/advisor/rules/environment_variable_info.yaml b/profiler/advisor/rules/environment_variable_info.yaml index b91f827ef47cb6c0894f321e617e787e138016f9..09323e8e829e8369c2dfa084ca5156e86e4bc417 100644 --- a/profiler/advisor/rules/environment_variable_info.yaml +++ b/profiler/advisor/rules/environment_variable_info.yaml @@ -7,14 +7,14 @@ HCCL_RDAM_TC: In the DS field of IP datagram header, the rightmost 6 bits are DSCP, and leftmost 2 bits are 0.\n It should be set to DSCP * 4. Default value is 132, that is, DSCP is 33 (132=33*4)." suggest: "Please refer to https://support.huawei.com/enterprise/zh/doc/EDOC1100371278/5eeeed85?idPath=23710424" - suggest_html: "Please refer to LINK" + suggest_html: "Please refer to LINK" HCCL_RDMA_SL: desc: "Specify the priority of the RDMA NIC.\n The value must be the same as the PFC priority for the NIC.\n Otherwise, the performance may deteriorate.\n The value range is [0, 7], and default value is 4." suggest: "Please refer to https://support.huawei.com/enterprise/zh/doc/EDOC1100371278/5eeeed85?idPath=23710424" - suggest_html: "Please refer to LINK" + suggest_html: "Please refer to LINK" ACLNN_CACHE_LIMIT: desc: "Number of cached aclnn operators." suggest: "Setting a large number when alcnn and host bound, such as 'export ACLNN_CACHE_LIMIT=100000'" diff --git a/profiler/advisor/rules/gc.yaml b/profiler/advisor/rules/gc.yaml index fad9b3523593a44c765a9a0e13dac82515626181..f896439cb657005aaf589efd43576547422efa12 100644 --- a/profiler/advisor/rules/gc.yaml +++ b/profiler/advisor/rules/gc.yaml @@ -1,7 +1,11 @@ -problem: "Abnormal garbage collection (GC) event is detected for {gc_count} times, and the total time is {gc_total_time} ms\n. +gc_problem_with_count: "Abnormal garbage collection (GC) event is detected for {gc_count} times, and the total time is {gc_total_time} ms\n. The GC operation is time-consuming and blocks the entire process. As a result, some steps in the model training process take a longer time than other steps." -gc_threshold: 1000 #us +gc_problem_with_free: "Nearly no host tasks within {free_duration_time} microseconds(us) of free time, which is likely caused by abnormal garbage collection (GC) of Python" +gc_threshold: 10 #us top_num: 10 +max_free_threshold: 200000 # us +max_acl_event_num_ratio: 0.0001 # max 10 events per 100 ms +max_acl_event_time_ratio: 0.01 # total time of acl events no larger than 0.01 * free duration solutions: - memory management: desc: "implement effective Python memory management; release memory promptly when not in use to avoid long-term retention; avoid circular references between objects." diff --git a/profiler/advisor/rules/synchronize.yaml b/profiler/advisor/rules/synchronize.yaml index 3bd518d003c598ddca54e53a359b957bce3c0bab..9fef8211828fa5855e990df3868216e81d98c7d5 100644 --- a/profiler/advisor/rules/synchronize.yaml +++ b/profiler/advisor/rules/synchronize.yaml @@ -1,5 +1,6 @@ -problem: "SynchronizeStream will reduce training efficiency. Found {synchronize_num} SynchronizeStream, {slow_synchronize_num} slow SynchronizeStream cost {total_synchronize_stream_time} us." -max_synchronize_num: 20 +problem: "SynchronizeStream will reduce training efficiency. Found {synchronize_num} SynchronizeStream, ratio to aten operators is {synchronize_aten_ratio}, {slow_synchronize_num} slow SynchronizeStream cost {total_synchronize_stream_time} us." +max_synchronize_num: 1000 +max_synchronize_num_ratio: 0.3 slow_synchronize_threshold: 10 #ms solutions: - disable ascend launch blocking: diff --git a/profiler/advisor/utils/utils.py b/profiler/advisor/utils/utils.py index 71d3dd1730f7c3d12f627332520c11e108d4611c..f0868af5d19897df0e19b97f42d2b9299aad6571 100644 --- a/profiler/advisor/utils/utils.py +++ b/profiler/advisor/utils/utils.py @@ -499,8 +499,8 @@ def safe_index(array, index, return_value_if_error=None): def convert_to_int(data: any) -> int: try: - int_value = int(data) + int_value = int(convert_to_float(data)) except ValueError: - logger.error(f"Can not convert %ss to float.", data) + logger.error(f"Can not convert %s to int.", data) return 0 return int_value