diff --git a/profiler/advisor/analyzer/analyzer_controller.py b/profiler/advisor/analyzer/analyzer_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..e8704542e0f7712cb1a5e4a5ee63c8e8246fa959 --- /dev/null +++ b/profiler/advisor/analyzer/analyzer_controller.py @@ -0,0 +1,557 @@ +import copy +import logging +import json +import sys +import os +import multiprocessing as mp +from pathlib import Path +from multiprocessing import Manager + +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "compare_tools")) +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "cluster_analyse")) + +from profiler.advisor.analyzer.cluster.slow_rank_analyzer import SlowRankAnalyzer +from profiler.advisor.analyzer.cluster.slow_link_analyzer import SlowLinkAnalyzer +from profiler.advisor.analyzer.computation.pp_stage_computation_analyzer import PPStageComputationAnalyzer +from profiler.advisor.config.config import Config +from profiler.advisor.common.analyzer_scopes import SupportedScopes +from profiler.advisor.common.async_analysis_status import AsyncAnalysisStatus +from profiler.advisor.dataset.cluster.cluster_dataset import ClusterDataset +from profiler.advisor.utils.utils import Timer, safe_index, safe_division +from profiler.advisor.interface.interface import Interface +from profiler.cluster_analyse.cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor +from profiler.prof_common.path_manager import PathManager + +logger = logging.getLogger() + + +class AnalyzerController: + CLUSTER_RANK_THRESHOLD = 2 + + def __init__(self): + self.dimensions = Interface.all_dimension + self.kwargs = {} + self.slow_rank_analyzer = None + self.slow_link_analyzer = None + self.cluster_local_data_map = {} + self.default_rank_id = None + self.rank_id_map = {} + self._is_cluster = False + self.analysis_process_resp = Manager().dict() + + @staticmethod + def _check_profiling_path_valid(profiling_path): + PathManager.input_path_common_check(profiling_path) + + if not Path(profiling_path).exists(): + logger.error("Profiling path is not existed. 
Invalid profiling path: %s", profiling_path) + return False + return True + + @staticmethod + def _get_step_rank_for_cluster_statistic_diff(target_cluster_statistic_data, benchmark_cluster_statistic_data, + headers, dimension, get_max=False): + if dimension not in headers: + logger.error("Error dimension %s for cluster statistics data, optionals are %s.", dimension, headers) + return None, None, None + + dimension_index = safe_index(headers, dimension) + diff_record = [] + # 对比目标profiling和benchmark profiling 每张卡的计算和下发和带宽,取计算、下发、带宽差异最大的卡进行下一步分析 + for target_row_data, benchmark_row_data in zip(target_cluster_statistic_data, benchmark_cluster_statistic_data): + target_data = safe_index(target_row_data, dimension_index) + benchmark_data = safe_index(benchmark_row_data, dimension_index) + if not isinstance(target_data, (int, float)) or not isinstance(benchmark_data, (int, float)): + continue + diff_record.append(target_data - benchmark_data) + + if SlowRankAnalyzer.compute_max_gap_ratio(diff_record, safe_division(sum(diff_record), len( + diff_record))) < SlowRankAnalyzer.RATIO_THRESHOLD: + return None, None, None + + value = max(diff_record) if get_max else min(diff_record) + value_index = safe_index(diff_record, value) + + step_value_index = safe_index(headers, "step") + rank_id_value_index = safe_index(headers, "rank_id") + step = safe_index(safe_index(target_cluster_statistic_data, value_index, []), step_value_index) + benchmark_step = safe_index(safe_index(benchmark_cluster_statistic_data, value_index, []), step_value_index) + target_rank_id = safe_index(safe_index(target_cluster_statistic_data, value_index, []), rank_id_value_index) + benchmark_rank_id = safe_index(safe_index(target_cluster_statistic_data, value_index, []), rank_id_value_index) + + if target_rank_id != benchmark_rank_id: + logger.error( + "Rank ids of target profiling must keep the same as benchmark profiling, skip cluster comparison") + return None, None, None + + return step, benchmark_step, target_rank_id + + def do_analysis(self, dimensions, **kwargs): + pid = os.getpid() + resp = {"id": pid} + try: + self._do_analysis(dimensions, pid=pid, resp=resp, **kwargs) + except Exception as e: + self._update_analysis_process_resp(pid, resp, status_code=AsyncAnalysisStatus.FAILED_STATUS_CODE, + status=AsyncAnalysisStatus.FAILED, error_msg=str(e)) + logger.error(e) + raise RuntimeError(e) + + def async_do_analysis(self, dimensions, **kwargs): + # 异步分析,用于部署服务,通过接口查询异步作业状态 + async_analysis_process = mp.Process(target=self.do_analysis, args=(dimensions,), kwargs=kwargs, + name="Async advisor performance analysis") + async_analysis_process.start() + return async_analysis_process + + def get_response_by_pid(self, pid): + return self.analysis_process_resp.get(pid) + + def single_rank_analysis(self, profiling_path, benchmark_profiling_path=None): + job_list = [] + + profiling_path = self._get_profiling_path_by_rank(profiling_path) + benchmark_profiling_path = self._get_profiling_path_by_rank(benchmark_profiling_path) + + # 单卡场景无集群分析 + for dim in [Interface.CLUSTER]: + if dim in self.dimensions: + self.dimensions.remove(dim) + + for dimension in self.dimensions: + dimension_analysis_func_name = f"{dimension}_analysis" + if not hasattr(self, dimension_analysis_func_name): + continue + logger.info("Start %s analysis", dimension) + job_list += getattr(self, dimension_analysis_func_name)(profiling_path) + + if benchmark_profiling_path: + # kernel/api 比对 + job_list += self._single_profiling_comparison(profiling_path, benchmark_profiling_path) 
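The asynchronous entry point above (`async_do_analysis`, `get_response_by_pid`) works because `analysis_process_resp` is a `multiprocessing.Manager().dict()` created in `__init__`: the dict proxy is shared with the spawned analysis process, so status updates written by the child are visible to the caller that later polls by pid. A minimal sketch of that pattern, with illustrative names (`StatusBoard`, `_work`) that are not part of this patch:

```python
import multiprocessing as mp
import os
from multiprocessing import Manager


class StatusBoard:
    """Toy illustration of sharing per-pid analysis status across processes."""

    def __init__(self):
        # Manager().dict() returns a proxy; updates made in the child process
        # are visible to the parent holding the same proxy.
        self.resp = Manager().dict()

    def _work(self):
        pid = os.getpid()
        self.resp[pid] = {"id": pid, "status": "analyzing"}
        # ... the real analysis would run here ...
        self.resp[pid] = {"id": pid, "status": "success"}

    def start_async(self):
        proc = mp.Process(target=self._work, name="Async advisor performance analysis")
        proc.start()
        return proc

    def query(self, pid):
        return self.resp.get(pid)


if __name__ == "__main__":
    board = StatusBoard()
    p = board.start_async()
    p.join()
    print(board.query(p.pid))  # {'id': <pid>, 'status': 'success'}
```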
+ else: + # 单卡性能拆解 + self.overall(profiling_path) + return job_list + + def cluster_analysis(self, profiling_path, benchmark_profiling_path=None): + job_list = [] + + # 单集群profiling分析:下发、通信、计算、显存/内存 + for dimension in self.dimensions: + dimension_analysis_func_name = f"cluster_{dimension}_analysis" + if not hasattr(self, dimension_analysis_func_name): + continue + logger.info("Start cluster %s analysis", dimension) + job_list += getattr(self, dimension_analysis_func_name)(profiling_path) + + if benchmark_profiling_path: + # 两个集群profiling比对分析 + job_list += self._cluster_profiling_comparison(profiling_path, benchmark_profiling_path) + else: + self.overall(profiling_path) + return job_list + + def overall(self, profiling_path): + from profiler.advisor.analyzer.overall.environment_variable_analyzer import EnvironmentVariabelAnalyzer + env_analyzer = EnvironmentVariabelAnalyzer(profiling_path) + env_analyzer.optimize() + + if self._is_cluster: + self.slow_rank_analyzer.optimize(template_key=Interface.OVERALL) + self.slow_link_analyzer.optimize(template_key=Interface.OVERALL) + else: + from profiler.advisor.analyzer.overall.overall_summary_analyzer import OverallSummaryAnalyzer + overall_analyzer = OverallSummaryAnalyzer(profiling_path) + overall_analyzer.optimize() + + def schedule_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, benchmark_step=None): + # 任意单卡的下发分析 + + kwargs = copy.deepcopy(self.kwargs) + job_list = [] + + kwargs["profiling_path"] = profiling_path + kwargs["benchmark_profiling_path"] = benchmark_profiling_path + kwargs["step"] = step + kwargs["benchmark_step"] = benchmark_step + + for dimension in [Interface.SCHEDULE]: + for scope in Interface.get_scope(dimension): + interface = Interface(**kwargs) + job_list.append((dimension, scope, interface, kwargs)) + return job_list + + def computation_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, + benchmark_step=None, stage=None): + # 任意单卡的计算分析 + + kwargs = copy.deepcopy(self.kwargs) + kwargs["profiling_path"] = profiling_path + kwargs["benchmark_profiling_path"] = benchmark_profiling_path + kwargs["step"] = step + kwargs["benchmark_step"] = benchmark_step + kwargs["stage"] = stage + job_list = [] + + for dimension in [Interface.COMPUTATION]: + for scope in Interface.get_scope(dimension): + if scope == SupportedScopes.STAGE_COMPUTE: + continue + interface = Interface(**kwargs) + job_list.append((dimension, scope, interface, kwargs)) + return job_list + + def memory_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, benchmark_step=None): + # 任意单卡的内存分析 + + kwargs = copy.deepcopy(self.kwargs) + job_list = [] + + kwargs["profiling_path"] = profiling_path + kwargs["benchmark_profiling_path"] = benchmark_profiling_path + kwargs["step"] = step + kwargs["benchmark_step"] = benchmark_step + + for dimension in [Interface.MEMORY]: + for scope in Interface.get_scope(dimension): + interface = Interface(**kwargs) + job_list.append((dimension, scope, interface, kwargs)) + return job_list + + def communication_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, + benchmark_step=None, bandwidth_type=None): + + job_list = [] + supported_trans_type = [SlowLinkAnalyzer.SDMA, SlowLinkAnalyzer.RDMA] + if bandwidth_type is not None and bandwidth_type not in supported_trans_type: + logger.error("Error transit type %s, optionals are %s", bandwidth_type, supported_trans_type) + return job_list + + bandwidth_type_list = [bandwidth_type] if bandwidth_type is not None 
else supported_trans_type + + for bandwidth_type in bandwidth_type_list: + job_list += getattr(self, f"_communication_{bandwidth_type.lower()}_analysis")(profiling_path, + benchmark_profiling_path, + step, benchmark_step) + + return job_list + + def cluster_schedule_analysis(self, profiling_path): + # 目标集群profiling数据下发分析,不包含两个集群profiling数据的比对分析 + + job_list = [] + global_step_rank = self.slow_rank_analyzer.get_global_step_rank(SlowRankAnalyzer.FREE) + slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") or self.default_rank_id + slow_step = global_step_rank.get("maximum", {}).get("step") + analysis_profiling_path = self._get_profiling_path_by_rank(profiling_path, slow_rank_id) + + info_msg = f"Maximum free for rank {slow_rank_id}" + if slow_step: + info_msg += f" and step {slow_step}" + logger.info(info_msg) + + job_list += self.schedule_analysis(analysis_profiling_path, step=slow_step) + return job_list + + def cluster_communication_analysis(self, profiling_path): + job_list = [] + + for dimension in [Interface.COMMUNICATION]: + for scope in Interface.get_scope(dimension): + analyzer_class = Interface.get_analyzer(dimension, scope) + if hasattr(analyzer_class, "requires_cluster_dataset") and getattr(analyzer_class, + "requires_cluster_dataset"): + + # 如果不依赖数据集,或者依赖的是ClusterDataset,则不用根据带宽确定需要分析的特定rank + kwargs = copy.deepcopy(self.kwargs) + kwargs["profiling_path"] = profiling_path + interface = Interface(**kwargs) + job_list.append((dimension, scope, interface, kwargs)) + else: + # 非ClusterDataset场景,需要根据带宽大小分析特定的rank + for bandwidth_type in [SlowLinkAnalyzer.SDMA, SlowLinkAnalyzer.RDMA]: + global_step_rank = self.slow_link_analyzer.get_global_step_rank(bandwidth_type) + # 获取带宽最小的卡进行分析 + target_rank_id = global_step_rank.get("minimum", {}).get("rank_id") or self.default_rank_id + step = global_step_rank.get("minimum", {}).get("step") + analysis_profiling_path = self._get_profiling_path_by_rank(profiling_path, target_rank_id) + + info_msg = f"Minimum {bandwidth_type} bandwidth for rank {target_rank_id} " + if step: + info_msg += f"and step {step}" + logger.info(info_msg) + + job_list += self.communication_analysis(analysis_profiling_path, step=step, + bandwidth_type=bandwidth_type) + + return job_list + + def cluster_computation_analysis(self, profiling_path): + # 目标集群profiling数据计算分析,不包含两个集群profiling数据的比对分析;如果有pp stage,则对不同stage进行计算分析 + + job_list = [] + global_step_rank = self.slow_rank_analyzer.get_global_step_rank(SlowRankAnalyzer.COMPUTE) + stage_step_rank = self.slow_rank_analyzer.get_stage_step_rank(SlowRankAnalyzer.COMPUTE) + + if stage_step_rank: + # 对不同pp stage取min max进行分析 + logger.info("Analysis steps and ranks of different pipeline parallel stages are %s", + json.dumps(stage_step_rank)) + + stages_profiling_path = [] + for stage, step_rank_info in stage_step_rank.items(): + rank_id = step_rank_info.get("maximum", {}).get("rank_id") + step = step_rank_info.get("maximum", {}).get("step") + + info_msg = f"For {stage}, slow rank is {rank_id}" + if step: + info_msg += f", step is {step}" + logger.info(info_msg) + + stages_profiling_path.append( + dict( + stage=stage, + rank_id=rank_id, + step=step, + profiling_path=self._get_profiling_path_by_rank(profiling_path, rank_id) + ) + ) + Interface.add_analyzer(Interface.COMPUTATION, SupportedScopes.STAGE_COMPUTE, PPStageComputationAnalyzer) + kwargs = {"stages_profiling_path": stages_profiling_path, "profiling_path": profiling_path} + + job_list.append((Interface.COMPUTATION, SupportedScopes.STAGE_COMPUTE, Interface(**kwargs), 
kwargs)) + else: + # 不区分stage,对所有卡取Min max进行分析 + logger.info("Without pipeline parallel stage, Global analysis steps and ranks is %s", + json.dumps(global_step_rank)) + slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") or self.default_rank_id + slow_step = global_step_rank.get("maximum", {}).get("step") + # 如果没有标杆profiling数据的rank id,说明没有快慢卡问题,直接对默认rank id进行分析,因此这里取值为None + fast_rank_id = global_step_rank.get("minimum", {}).get("rank_id") + fast_step = global_step_rank.get("minimum", {}).get("step") + + info_msg = f"Maximum computation time for rank {slow_rank_id}" + if slow_step: + info_msg += f" and step {slow_step}, " + if fast_rank_id: + info_msg += f"minimum computation time for rank {fast_rank_id}" + if fast_step: + info_msg += f" and step {fast_step}" + logger.info(info_msg) + + job_list += self.computation_analysis( + self._get_profiling_path_by_rank(profiling_path, slow_rank_id), + self._get_profiling_path_by_rank(profiling_path, fast_rank_id), + slow_step, + fast_step + ) + + return job_list + + def cluster_memory_analysis(self, profiling_path): + # 目标集群profiling数据内存分析,当前memory识别的两个算子,导致的问题都是大的free,因此选择FREE最慢的卡进行分析 + + job_list = [] + global_step_rank = self.slow_rank_analyzer.get_global_step_rank(SlowRankAnalyzer.FREE) + slow_rank_id = global_step_rank.get("maximum", {}).get("rank_id") or self.default_rank_id + slow_step = global_step_rank.get("maximum", {}).get("step") + analysis_profiling_path = self._get_profiling_path_by_rank(profiling_path, slow_rank_id) + + info_msg = f"Maximum free for rank {slow_rank_id} " + if slow_step: + info_msg += f"and step {slow_step}" + logger.info(info_msg) + + job_list += self.memory_analysis(analysis_profiling_path, step=slow_step) + return job_list + + def _do_analysis(self, dimensions, **kwargs): + self.dimensions = dimensions + self.kwargs = kwargs + result_list = [] + profiling_path = self.kwargs.get("profiling_path") + benchmark_profiling_path = self.kwargs.get("benchmark_profiling_path") + pid = self.kwargs.get("pid") + resp = self.kwargs.get("resp") + + self._update_analysis_process_resp(pid, resp, status_code=AsyncAnalysisStatus.NON_FAILED_STATUS_CODE, + status=AsyncAnalysisStatus.ANALYZING) + + if not self._check_profiling_path_valid(profiling_path): + error_msg = f"Got invalid argument '-d/--profiling_path' {profiling_path}, skip analysis" + self._update_analysis_process_resp(pid, resp, error_msg=error_msg, + status_code=AsyncAnalysisStatus.FAILED_STATUS_CODE, + status=AsyncAnalysisStatus.FAILED) + logger.error(error_msg) + return + if benchmark_profiling_path and not self._check_profiling_path_valid(benchmark_profiling_path): + error_msg = f"Got invalid argument '-bp/--benchmark_profiling_path' {benchmark_profiling_path}, skip analysis" + self._update_analysis_process_resp(pid, resp, error_msg=error_msg, + status_code=AsyncAnalysisStatus.FAILED_STATUS_CODE, + status=AsyncAnalysisStatus.FAILED) + logger.error(error_msg) + return + + self._is_cluster = self._is_cluster_profiling(profiling_path) + if not self._is_cluster: + job_list = self.single_rank_analysis(profiling_path, benchmark_profiling_path) + else: + job_list = self.cluster_analysis(profiling_path, benchmark_profiling_path) + + for i, (dimension, scope, interface, kwargs) in enumerate(job_list[::-1]): + result_list.append( + interface.get_result(dimension, scope, render_html=i == len(job_list) - 1, output_dict=False, + **kwargs) + ) + + for result in result_list[::-1]: + if result and hasattr(result, "show"): + result.show() + break + 
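Each `*_analysis` method above defers work by returning `(dimension, scope, interface, kwargs)` tuples; `_do_analysis` later walks the collected list and requests HTML rendering only for the final job, so a single consolidated report is produced. A stripped-down sketch of that deferred-dispatch pattern, using placeholder names (`Job`, `run_one_analyzer`) rather than the real `Interface.get_result` API:

```python
from typing import Any, Callable, Dict, List, Tuple

# A "job" is just the data needed to run one analyzer later.
Job = Tuple[str, str, Callable[..., Any], Dict[str, Any]]


def run_one_analyzer(dimension: str, scope: str, render_html: bool, **kwargs) -> str:
    # Stand-in for Interface.get_result(dimension, scope, render_html=..., **kwargs)
    return f"{dimension}/{scope} (render_html={render_html})"


def collect_jobs(dimensions: List[str], profiling_path: str) -> List[Job]:
    jobs: List[Job] = []
    for dimension in dimensions:
        for scope in ("scope_a", "scope_b"):  # placeholder scopes
            kwargs = {"profiling_path": profiling_path}
            jobs.append((dimension, scope, run_one_analyzer, kwargs))
    return jobs


def execute(jobs: List[Job]) -> List[str]:
    results = []
    # Iterate in reverse and render HTML exactly once, for the last job,
    # mirroring render_html=i == len(job_list) - 1 in _do_analysis.
    for i, (dimension, scope, func, kwargs) in enumerate(jobs[::-1]):
        results.append(func(dimension, scope, render_html=(i == len(jobs) - 1), **kwargs))
    return results


if __name__ == "__main__":
    print(execute(collect_jobs(["schedule", "computation"], "/path/to/prof")))
```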
self._get_analysis_success_resp(pid, resp) + + def _communication_rdma_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, + benchmark_step=None): + # 小包分析 + kwargs = copy.deepcopy(self.kwargs) + job_list = [] + + kwargs["profiling_path"] = profiling_path + kwargs["benchmark_profiling_path"] = benchmark_profiling_path + kwargs["step"] = step + kwargs["benchmark_step"] = benchmark_step + + for dimension in [Interface.COMMUNICATION]: + for scope in Interface.get_scope(dimension): + if scope != SupportedScopes.PACKET: + continue + interface = Interface(**kwargs) + job_list.append((dimension, scope, interface, kwargs)) + + return job_list + + def _communication_sdma_analysis(self, profiling_path, benchmark_profiling_path=None, step=None, + benchmark_step=None): + kwargs = copy.deepcopy(self.kwargs) + job_list = [] + return job_list + + def _single_profiling_comparison(self, profiling_path, benchmark_profiling_path, step=None, + benchmark_step=None): + # TODO 基于compare tools 对比计算下发 + kwargs = copy.deepcopy(self.kwargs) + return [] + + def _cluster_profiling_comparison(self, profiling_path, benchmark_profiling_path): + # 从计算、下发和通信三个维度对集群profiling数据进行对比 + + job_list = [] + benchmark_profiling_path = self._get_profiling_path_by_rank(benchmark_profiling_path) + benchmark_slow_rank_analyzer = SlowRankAnalyzer(benchmark_profiling_path) + benchmark_slow_link_analyzer = SlowLinkAnalyzer(benchmark_profiling_path) + + # 计算和下发分析 + job_list += self._cluster_data_comparison(profiling_path, + benchmark_profiling_path, + self.slow_rank_analyzer, + benchmark_slow_rank_analyzer, + get_max=True) + + # 通信分析 + job_list += self._cluster_data_comparison(profiling_path, + benchmark_profiling_path, + self.slow_link_analyzer, + benchmark_slow_link_analyzer, + get_max=False) + return job_list + + def _cluster_data_comparison(self, profiling_path, benchmark_profiling_path, target_cluster_analyzer, + benchmark_cluster_analyzer, get_max=False): + # #low rank/slow link结果逐行对比获取差值最大的rank和step进行单卡分析 + job_list = [] + + if isinstance(target_cluster_analyzer, SlowRankAnalyzer): + comparison_dims = [SlowRankAnalyzer.COMPUTE, SlowRankAnalyzer.FREE] + elif isinstance(target_cluster_analyzer, SlowLinkAnalyzer): + comparison_dims = [SlowLinkAnalyzer.SDMA, SlowLinkAnalyzer.RDMA] + else: + return job_list + + target_data = target_cluster_analyzer.format_datas.get("data", []) + benchmark_data = benchmark_cluster_analyzer.format_datas.get("data", []) + headers = benchmark_cluster_analyzer.format_datas.get("headers", []) + + if len(target_data) != len(benchmark_data): + logger.warning( + "The product of ranks and steps of Benchmark profiling is not equals to target profiling, skip cluster comparison.") + return job_list + + for dimension in comparison_dims: + step, benchmark_step, rank_id_for_comparison = AnalyzerController._get_step_rank_for_cluster_statistic_diff( + target_data, + benchmark_data, + headers, + dimension, + get_max=get_max + ) + rank_profiling_path = self._get_profiling_path_by_rank(profiling_path, rank_id_for_comparison) + rank_benchmark_profiling_path = self._get_profiling_path_by_rank( + benchmark_profiling_path, + rank_id_for_comparison + ) + + job_list += self._single_profiling_comparison( + rank_profiling_path, + rank_benchmark_profiling_path, + step, + benchmark_step + ) + return job_list + + def _is_cluster_profiling(self, profiling_path): + path_list = [os.path.join(profiling_path, dir_name) for dir_name in os.listdir(profiling_path)] + ascend_pt_dirs = [path for path in path_list if 
os.path.isdir(path) and path.endswith("ascend_pt")] + data_processor = PytorchDataPreprocessor(ascend_pt_dirs) + + self.cluster_local_data_map[profiling_path] = data_processor.get_data_map() + + if not self.cluster_local_data_map or not self.cluster_local_data_map.get(profiling_path): + return False + + self.default_rank_id = list(self.cluster_local_data_map[profiling_path].keys())[0] + + self.slow_rank_analyzer = SlowRankAnalyzer(profiling_path) + self.slow_link_analyzer = SlowLinkAnalyzer(profiling_path) + return len(self.cluster_local_data_map[profiling_path]) >= self.CLUSTER_RANK_THRESHOLD + + def _get_profiling_path_by_rank(self, profiling_path, rank_id=None): + + if not profiling_path: + return profiling_path + + return self._get_target_profiling_path_for_local(profiling_path, rank_id) + + def _get_target_profiling_path_for_local(self, profiling_path, rank_id): + rank_id_map = self.cluster_local_data_map.get(profiling_path, {}) + if rank_id is None or not rank_id_map: + return profiling_path + + if rank_id in rank_id_map: + return rank_id_map.get(rank_id) + + local_first_rank_id = sorted(list(map(int, rank_id_map.keys())))[0] + logger.warning("Target rank id %s does not exist in local profiling data %s, use rank %s for analysis", + rank_id, profiling_path, local_first_rank_id) + return rank_id_map.get(local_first_rank_id) + + def _update_analysis_process_resp(self, pid, resp, **kwargs): + if kwargs: + resp.update(kwargs) + self.analysis_process_resp[pid] = resp + + def _get_analysis_success_resp(self, pid, resp): + html_path = os.path.join(Config().work_path, f"mstt_advisor_{Timer().strftime}.html") + xlsx_path = os.path.join(Config().work_path, f"mstt_advisor_{Timer().strftime}.xlsx") + result_files = {"html": html_path, "xlsx": xlsx_path} + self._update_analysis_process_resp(pid, resp, status_code=AsyncAnalysisStatus.NON_FAILED_STATUS_CODE, + status=AsyncAnalysisStatus.SUCCESS, result_files=result_files) \ No newline at end of file diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py index 80368e1d60a14020637ba60bb41c5536dcf2e081..6347839b1f71c111dc89e0ffd4c369dc6775c6ec 100644 --- a/profiler/advisor/analyzer/base_analyzer.py +++ b/profiler/advisor/analyzer/base_analyzer.py @@ -22,12 +22,16 @@ from profiler.advisor.common.version_control import VersionControl from profiler.advisor.dataset.dataset import Dataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor +from profiler.advisor.utils.utils import safe_division logger = logging.getLogger() class BaseAnalyzer(VersionControl, metaclass=ABCMeta): _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION + ANALYZER_HIGH_PRIORITY_TIME_RATIO = 0.05 + ANALYZER_MEDIUM_PRIORITY_TIME_RATIO = 0.03 dataset_cls_list = [] @@ -43,6 +47,18 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): self.result = OptimizeResult() self.record_list: Dict[str, List] = {} + @staticmethod + def get_first_data_by_key(data, key) -> Union[Dataset, None]: + """ + get the first member from data with key + :param data: input data + :param key: data key + :return: the first dataset in dataset list + """ + if key in data and len(data[key]) > 0: + return data[key][0] + return None + @classmethod def check_data(cls, data_list: tuple): """ @@ -63,7 +79,7 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): return None logger.info("Enable analysis %s with 
%s", self.__class__.__name__, ",".join(data_list)) - return func(self) + return func(self, **kwargs) return wrapper @@ -73,6 +89,10 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): def optimize(self, **kwargs): pass + @abstractmethod + def get_priority(self): + pass + def init_dataset_list(self)->None: dataset_cls_list = self.dataset_cls_list if len(dataset_cls_list) == 0: @@ -91,14 +111,25 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): self.dataset_list[key] = [] self.dataset_list[key].append(dataset) - @staticmethod - def get_first_data_by_key(data, key) -> Union[Dataset, None]: - """ - get the first member from data with key - :param data: input data - :param key: data key - :return: the first dataset in dataset list - """ - if key in data and len(data[key]) > 0: - return data[key][0] - return None + def init_dataset_list(self) -> None: + dataset_cls_list = self.dataset_cls_list + if len(dataset_cls_list) == 0: + logger.warning(f"Analyzer: %s don't rely on any dataset!", self.__class__.__name__) + return + + for dataset_cls in dataset_cls_list: + if dataset_cls and callable(dataset_cls): + dataset = dataset_cls(collection_path=self.collection_path, data=self.dataset_list, **self.kwargs) + key = dataset_cls.get_key() + if key not in self.dataset_list: + self.dataset_list[key] = [] + self.dataset_list[key].append(dataset) + + def get_priority_by_time_ratio(self, dur, step_dur): + time_ratio = safe_division(dur, step_dur) + if time_ratio >= self.ANALYZER_HIGH_PRIORITY_TIME_RATIO: + return PriorityBackgroundColor.high + elif time_ratio >= self.ANALYZER_MEDIUM_PRIORITY_TIME_RATIO: + return PriorityBackgroundColor.medium + else: + return PriorityBackgroundColor.low diff --git a/profiler/advisor/analyzer/cluster/Communication_retransmission_analyzer.py b/profiler/advisor/analyzer/cluster/Communication_retransmission_analyzer.py index 3683ef1b44f8b6c571dd4d8fdce0d39882d342af..de1ec7ebee09b7f8392036895554f13753070671 100644 --- a/profiler/advisor/analyzer/cluster/Communication_retransmission_analyzer.py +++ b/profiler/advisor/analyzer/cluster/Communication_retransmission_analyzer.py @@ -17,6 +17,7 @@ import logging from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.result.result import OptimizeResult from profiler.advisor.analyzer.cluster.Communication_retransmission_checker import CommunicationRetransmissionChecker +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset @@ -44,3 +45,6 @@ class RDMARetransmissionAnalyzer(BaseAnalyzer): rdma_checker.make_record(self.result) self.html = rdma_checker.make_render(self.html_render, add_render_list) return self.result + + def get_priority(self): + return PriorityBackgroundColor.medium \ No newline at end of file diff --git a/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py b/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py index cc0f688e843cdc75681827e5599572d5dd42c3cc..4fa3c91c62d1abf7459a0f38141c38e89155329c 100644 --- a/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py +++ b/profiler/advisor/analyzer/cluster/Communication_retransmission_checker.py @@ -102,7 +102,7 @@ class CommunicationRetransmissionChecker: result.add_detail(sub_table_name, detail=row) def make_render(self, html_render, add_render_list=True): - return 
html_render.render_template(key="cluster", + return html_render.render_template(key="communication", template_dir="templates", template_name="communication_retransmission_analysis.html", desc=self.desc, diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyser.py b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py similarity index 55% rename from profiler/advisor/analyzer/cluster/slow_link_analyser.py rename to profiler/advisor/analyzer/cluster/slow_link_analyzer.py index 0b585cbc7c5f136b15cd9eb035ea2dac5caa9e4e..438d3f55ec471251dd4cb3cc50d9de587a7414ce 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_link_analyzer.py @@ -15,11 +15,16 @@ from collections import defaultdict from typing import Dict, List +import logging + from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset +from profiler.advisor.utils.utils import safe_index + +logger = logging.getLogger() class SlowLinkAnalyzer(BaseAnalyzer): @@ -34,7 +39,8 @@ class SlowLinkAnalyzer(BaseAnalyzer): TRANSIT_SIZE = "Transit Size(MB)" SDMA = "SDMA" RDMA = "RDMA" - SLOW_LINK_ANALYSIS = "slow_link_analysis" + SLOW_LINK_ANALYSIS = "slow link" + RATIO_THRESHOLD = 0.05 dataset_cls_list = [ClusterCommunicationDataset] def __init__(self, collection_path, n_processes: int = 1, **kwargs): @@ -45,18 +51,25 @@ class SlowLinkAnalyzer(BaseAnalyzer): self.result = OptimizeResult() self.bottelneck = '' self.suggestion = '' - self.format_datas = [] + if self.rank_bw_dict is not None: + self.format_datas = self.format_details() + + @staticmethod + def compute_max_gap_ratio(data: list, mean: float): + if mean == 0: + return 0 + else: + return (max(data) - min(data)) / mean def optimize(self, **kwargs): if self.rank_bw_dict is None: - print("Slow link analysis failed due to data loading failure. \ + logger.error("Slow link analysis failed due to data loading failure. \ Please check your cluster_analysis_output folder. 
\ If you are not concerned about this type of data, please ignore this message.") return self.result self.process() - self.format_datas = self.format_details() self.make_record() - self.make_render() + self.make_render(kwargs.get("template_key")) return self.result def process(self): @@ -69,7 +82,7 @@ class SlowLinkAnalyzer(BaseAnalyzer): if len(data_list) > 0: avg_bw = round(sum(data_list) / len(data_list), 3) else: - print("The slow link (identified bottleneck) cannot provide a bottleneck \ + logger.info("The slow link (identified bottleneck) cannot provide a bottleneck \ because the analysis data is missing bandwidth information.") return self.bottelneck += f'{link_type}: \n' \ @@ -88,14 +101,19 @@ class SlowLinkAnalyzer(BaseAnalyzer): details_dict = {} headers = list({k for rank_bw_value in self.rank_bw_dict.values() for k in rank_bw_value.keys()}) headers.sort() - data_list = [[rank_id] + [rank_bw.get(k, 0) for k in headers] for rank_id, rank_bw in self.rank_bw_dict.items()] - data_list.sort(key = lambda x: x[0]) # 按rank_id排序 - - details_dict["headers"] = ["rank_id"] + headers + + data_list = [] + for step_rank, rank_bw in self.rank_bw_dict.items(): + step_rank_list = list(map(int, step_rank.split(constant.STEP_RANK_SEP))) + value_list = [rank_bw.get(i, 0) for i in headers] + data_list.append(step_rank_list + value_list) + data_list.sort(key=lambda x: (x[0], x[1])) # 按rank_id排序 + + details_dict["headers"] = ["step", "rank_id"] + headers details_dict["data"] = data_list return details_dict - + def make_record(self): """ make record for what and how to optimize @@ -107,20 +125,67 @@ class SlowLinkAnalyzer(BaseAnalyzer): ) self.result.add(OptimizeRecord(optimization_item)) - for i, data in enumerate(self.format_datas["data"]): - self.result.add_detail(SlowLinkAnalyzer.SLOW_LINK_ANALYSIS, self.format_datas["headers"], data) + data_list = self.format_datas.get("data", []) + headers = self.format_datas.get("headers", []) + for data in data_list: + self.result.add_detail(SlowLinkAnalyzer.SLOW_LINK_ANALYSIS, headers, data) - def make_render(self): + def make_render(self, template_key="cluster"): result_for_html = { - "Description" : self.bottelneck, - "suggestion" : self.suggestion, - "details" : [self.format_datas] + "Description": self.bottelneck, + "suggestion": self.suggestion, + "details": [self.format_datas] } - self.html_render.render_template(key="cluster", + self.html_render.render_template(key=template_key, title=SlowLinkAnalyzer.SLOW_LINK_ANALYSIS, template_dir="templates", template_name="cluster_analysis.html", cann_version=self.cann_version, torch_version=self.torch_version, - result=result_for_html) \ No newline at end of file + result=result_for_html) + + def get_global_step_rank(self, bindwidth_type): + global_step_rank = {} + bindwidth_key_map = {self.RDMA: self.RDMA_BANDWIDTH, self.SDMA: self.SDMA_BANDWIDTH} + + if bindwidth_type not in bindwidth_key_map: + raise RuntimeError(f"Error bindwidth type {bindwidth_type}, optionals are {bindwidth_key_map.keys()}") + + headers = self.format_datas.get("headers") + + bindwidth_index = safe_index(headers, bindwidth_key_map.get(bindwidth_type)) + + if bindwidth_index is not None: + data_list = [tuple_list[bindwidth_index] for tuple_list in self.format_datas.get("data", [])] + max_bandwidth, min_bandwidth = max(data_list), min(data_list) + + if self.compute_max_gap_ratio(data_list, sum(data_list) / len( + data_list)) < self.RATIO_THRESHOLD: + return global_step_rank + + max_bandwidth_index = data_list.index(max_bandwidth) + 
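`get_global_step_rank` only nominates a fastest/slowest pair when the spread across ranks is meaningful: `compute_max_gap_ratio` computes `(max - min) / mean`, and the result is discarded when it falls below `RATIO_THRESHOLD` (0.05). A small worked example of that gate with made-up bandwidth numbers:

```python
def compute_max_gap_ratio(data, mean):
    # Mirrors SlowLinkAnalyzer / SlowRankAnalyzer.compute_max_gap_ratio
    return 0 if mean == 0 else (max(data) - min(data)) / mean


RATIO_THRESHOLD = 0.05

bandwidths = [18.2, 18.3, 18.1, 18.25]            # nearly identical -> no issue
ratio = compute_max_gap_ratio(bandwidths, sum(bandwidths) / len(bandwidths))
print(round(ratio, 4), ratio >= RATIO_THRESHOLD)   # 0.011 False -> skip this dimension

bandwidths = [18.2, 18.3, 9.7, 18.25]              # one slow link -> flagged
ratio = compute_max_gap_ratio(bandwidths, sum(bandwidths) / len(bandwidths))
print(round(ratio, 4), ratio >= RATIO_THRESHOLD)   # ~0.53 True -> report max/min rank
```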
min_bandwidth_index = data_list.index(min_bandwidth) + + rank_id_index = safe_index(headers, "rank_id") + step_index = safe_index(headers, "step") + + if rank_id_index is None: + return global_step_rank + + max_bandwidth_rank_id = self.format_datas.get("data")[max_bandwidth_index][rank_id_index] + min_bandwidth_rank_id = self.format_datas.get("data")[min_bandwidth_index][rank_id_index] + + if step_index is None: + max_bandwidth_step, min_bandwidth_step = constant.DEFAULT_STEP, constant.DEFAULT_STEP + else: + max_bandwidth_step = self.format_datas.get("data")[max_bandwidth_index][step_index] + min_bandwidth_step = self.format_datas.get("data")[min_bandwidth_index][step_index] + + global_step_rank["maximum"] = {"rank_id": max_bandwidth_rank_id, "step": max_bandwidth_step} + global_step_rank["minimum"] = {"rank_id": min_bandwidth_rank_id, "step": min_bandwidth_step} + + return global_step_rank + + def get_priority(self): + pass \ No newline at end of file diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py similarity index 42% rename from profiler/advisor/analyzer/cluster/slow_rank_analyser.py rename to profiler/advisor/analyzer/cluster/slow_rank_analyzer.py index f439b31f7736ee4777d5ef10bf968738a76ae1b3..a1971baf9d216298eb6e3c41d92b2d1382897292 100644 --- a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyzer.py @@ -13,43 +13,62 @@ # See the License for the specific language governing permissions and # limitations under the License. -from collections import defaultdict -from typing import Dict, List +import logging + from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.dataset.cluster.cluster_dataset import ClusterStepTraceTimeDataset +from profiler.advisor.utils.utils import safe_index + +logger = logging.getLogger() class SlowRankAnalyzer(BaseAnalyzer): - SLOW_RANK_ANALYSIS = "slow_rank_analysis" + SLOW_RANK_ANALYSIS = "slow rank" RANK = "rank" RATIO_THRESHOLD = 0.05 BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] dataset_cls_list = [ClusterStepTraceTimeDataset] + COMPUTE = "compute(us)" + FREE = "free(us)" + COMMUNICATION = "communication(us)" def __init__(self, collection_path, n_processes: int = 1, **kwargs): super().__init__(collection_path, n_processes, **kwargs) key = ClusterStepTraceTimeDataset.get_key() - self.step_trace_class = self.get_first_data_by_key(self.dataset_list, key) + self.step_trace_class = self.get_first_data_by_key(self.dataset_list, key) self.step_trace_dict = self.step_trace_class.get_data() + self.stages = self.step_trace_class.get_stages() self.result = OptimizeResult() self.bottelneck = '' self.suggestion = '' - self.format_datas = [] + self._steps = set() + if self.step_trace_dict is not None: + self.format_datas = self.format_details() + + @property + def steps(self): + return sorted(list(self._steps)) + + @staticmethod + def compute_max_gap_ratio(data: list, mean: float): + if mean == 0: + return 0 + else: + return (max(data) - min(data)) / mean def optimize(self, **kwargs): if self.step_trace_dict is None: - print("slow_rank 分析失败,原因是数据加载失败,请检查你的cluster_analysis_outpu文件夹 \ + logger.error("slow_rank 分析失败,原因是数据加载失败,请检查你的cluster_analysis_outpu文件夹 \ 如不关心这类数据请忽略") return self.result self.process() - self.format_datas 
= self.format_details() self.make_record() - self.make_render() + self.make_render(kwargs.get("template_key")) return self.result - + def process(self): total_time_list = [sum(data_tuple) for rank_id, data_tuple in self.step_trace_dict.items()] if total_time_list: @@ -57,6 +76,9 @@ class SlowRankAnalyzer(BaseAnalyzer): for i in range(len(self.BOTTLENECK_LIST)): self.produce_bottleneck(self.step_trace_dict, i, mean_total_time) + if not self.bottelneck: + self.bottelneck = "There is no slow rank issues" + def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float): data_list = [data_tuple[produce_type] for rank_id, data_tuple in step_dict.items()] max_ratio = self.compute_max_gap_ratio(data_list, mean_total_time) @@ -70,33 +92,41 @@ class SlowRankAnalyzer(BaseAnalyzer): """ make record for what and how to optimize """ + optimization_item = OptimizeItem( SlowRankAnalyzer.SLOW_RANK_ANALYSIS, self.bottelneck, self.suggestion ) self.result.add(OptimizeRecord(optimization_item)) - for i, data in enumerate(self.format_datas["data"]): - self.result.add_detail(SlowRankAnalyzer.SLOW_RANK_ANALYSIS, self.format_datas["headers"], data) + + data_list = self.format_datas.get("data", []) + headers = self.format_datas.get("headers", []) + for data in data_list: + self.result.add_detail(SlowRankAnalyzer.SLOW_RANK_ANALYSIS, headers, data) def format_details(self): details_dict = {} - headers = ["rank_id", "compute(us)", "communication(us)", "free(us)"] + headers = ["step", "rank_id", "compute(us)", "communication(us)", "free(us)"] data_list = [] - for key,value in self.step_trace_dict.items(): - data_list.append([key] + value) + for key, value in self.step_trace_dict.items(): + step, rank_id = key.split(constant.STEP_RANK_SEP) + data_list.append([int(step), int(rank_id)] + value) + if step and step not in self._steps: + self._steps.add(step) + details_dict["headers"] = headers - details_dict["data"] = data_list + details_dict["data"] = sorted(data_list, key=lambda x: (x[0], x[1])) return details_dict - def make_render(self): + def make_render(self, template_key="cluster"): result_for_html = { - "Description" : self.bottelneck, - "suggestion" : self.suggestion, - "details" : [self.format_datas] + "Description": self.bottelneck, + "suggestion": self.suggestion, + "details": [self.format_datas] } - self.html_render.render_template(key="cluster", + self.html_render.render_template(key=template_key, title=SlowRankAnalyzer.SLOW_RANK_ANALYSIS, template_dir="templates", template_name="cluster_analysis.html", @@ -104,9 +134,83 @@ class SlowRankAnalyzer(BaseAnalyzer): torch_version=self.torch_version, result=result_for_html) - @staticmethod - def compute_max_gap_ratio(data: list, mean: float): - if mean == 0: - return 0 + def get_global_step_rank(self, dimension): + global_step_rank = {} + + headers = self.format_datas.get("headers") + + dimension_index = safe_index(headers, dimension) + rank_id_index = safe_index(headers, "rank_id") + step_index = safe_index(headers, "step") + if dimension_index is None or rank_id_index is None: + return global_step_rank + + data_list = [tuple_list[dimension_index] for tuple_list in self.format_datas.get("data")] + max_time, min_time = max(data_list), min(data_list) + + if self.compute_max_gap_ratio(data_list, sum(data_list) / len( + data_list)) < self.RATIO_THRESHOLD: + return global_step_rank + max_time_index = data_list.index(max_time) + min_time_index = data_list.index(min_time) + + max_time_rank_id = 
self.format_datas.get("data")[max_time_index][rank_id_index] + min_time_rank_id = self.format_datas.get("data")[min_time_index][rank_id_index] + + if step_index is not None: + max_time_step = self.format_datas.get("data")[max_time_index][step_index] + min_time_step = self.format_datas.get("data")[min_time_index][step_index] else: - return (max(data) - min(data)) / mean + max_time_step, min_time_step = constant.DEFAULT_STEP, constant.DEFAULT_STEP + + global_step_rank["maximum"] = {"rank_id": max_time_rank_id, "step": max_time_step} + global_step_rank["minimum"] = {"rank_id": min_time_rank_id, "step": min_time_step} + + return global_step_rank + + def get_stage_step_rank(self, dimension): + stage_step_rank = {} + + headers = self.format_datas.get("headers") + dimension_index = safe_index(headers, dimension) + rank_id_index = safe_index(headers, "rank_id") + step_index = safe_index(headers, "step") + if dimension_index is None or rank_id_index is None: + return stage_step_rank + + rank_list = [tuple_list[rank_id_index] for tuple_list in self.format_datas.get("data")] + cost_time_list = [tuple_list[dimension_index] for tuple_list in self.format_datas.get("data")] + + if step_index is not None: + step_list = [tuple_list[step_index] for tuple_list in self.format_datas.get("data")] + else: + step_list = [constant.DEFAULT_STEP] * len(rank_list) + + for index, stage in enumerate(self.stages): + tmp_step_list, tmp_rank_list, tmp_time_list = [], [], [] + for step, rank_id, time in zip(step_list, rank_list, cost_time_list): + if rank_id not in stage: + continue + + tmp_step_list.append(step) + tmp_rank_list.append(rank_id) + tmp_time_list.append(time) + + if self.compute_max_gap_ratio(tmp_time_list, sum(tmp_time_list) / len( + tmp_time_list)) < self.RATIO_THRESHOLD: + continue + + max_time, min_time = max(tmp_time_list), min(tmp_time_list) + max_time_index, min_time_index = tmp_time_list.index(max_time), tmp_time_list.index(min_time) + + stage_key = f"stage-{index}" + stage_step_rank[stage_key] = {} + stage_step_rank[stage_key]["maximum"] = {"rank_id": tmp_rank_list[max_time_index], + "step": tmp_step_list[max_time_index]} + stage_step_rank[stage_key]["minimum"] = {"rank_id": tmp_rank_list[min_time_index], + "step": tmp_step_list[min_time_index]} + + return stage_step_rank + + def get_priority(self): + pass \ No newline at end of file diff --git a/profiler/advisor/analyzer/communication/base_communication_analyzer.py b/profiler/advisor/analyzer/communication/base_communication_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..95a830e47575380be04c1c8bf5731d2b87f69949 --- /dev/null +++ b/profiler/advisor/analyzer/communication/base_communication_analyzer.py @@ -0,0 +1,8 @@ +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer + + +class BaseCommunicationAnalyzer(BaseAnalyzer): + requires_cluster_dataset = True + + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) diff --git a/profiler/advisor/analyzer/communication/packet/__init__.py b/profiler/advisor/analyzer/communication/packet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/communication/packet_analyzer.py b/profiler/advisor/analyzer/communication/packet/packet_analyzer.py similarity index 74% rename from profiler/advisor/analyzer/communication/packet_analyzer.py rename to 
profiler/advisor/analyzer/communication/packet/packet_analyzer.py index 73e5bc2bc99bf3a2c7e11ef55ae279e8ddeb5ef5..e77ea7780a82a0ac9a00605628b7019b3c6c5220 100644 --- a/profiler/advisor/analyzer/communication/packet_analyzer.py +++ b/profiler/advisor/analyzer/communication/packet/packet_analyzer.py @@ -14,17 +14,19 @@ # limitations under the License. import logging -from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer -from profiler.advisor.result.result import OptimizeResult -from profiler.advisor.analyzer.communication.packet_checker import PacketChecker +from profiler.advisor.analyzer.communication.base_communication_analyzer import BaseCommunicationAnalyzer +from profiler.advisor.analyzer.communication.packet.packet_checker import PacketChecker +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.dataset.communication.communication_dataset import CommunicationDataset +from profiler.advisor.result.result import OptimizeResult logger = logging.getLogger() -class PacketAnalyzer(BaseAnalyzer): +class PacketAnalyzer(BaseCommunicationAnalyzer): dataset_cls_list = [CommunicationDataset] + requires_cluster_dataset = False def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: super().__init__(collection_path, n_processes, **kwargs) @@ -34,7 +36,7 @@ class PacketAnalyzer(BaseAnalyzer): self.html_render = HTMLRender() self.html = None - @BaseAnalyzer.check_data((CommunicationDataset.get_key(),)) + @BaseCommunicationAnalyzer.check_data((CommunicationDataset.get_key(),)) def optimize(self, **kwargs): add_render_list = kwargs.get("add_render_list", True) packet_checker = PacketChecker(**kwargs) @@ -42,5 +44,9 @@ class PacketAnalyzer(BaseAnalyzer): if not packet_checker.packet_issues: return self.result packet_checker.make_record(self.result) - self.html = packet_checker.make_render(self.html_render, add_render_list) + self.html = packet_checker.make_render(self.html_render, add_render_list, priority=self.get_priority()) return self.result + + def get_priority(self): + # 提升1% ~ 3% + return PriorityBackgroundColor.low diff --git a/profiler/advisor/analyzer/communication/packet_checker.py b/profiler/advisor/analyzer/communication/packet/packet_checker.py similarity index 96% rename from profiler/advisor/analyzer/communication/packet_checker.py rename to profiler/advisor/analyzer/communication/packet/packet_checker.py index 3d9ac81ffdb9cc049e6b82d01570f2f041d3ff68..d270667cd052a41f14e457c75485d927660c5d36 100644 --- a/profiler/advisor/analyzer/communication/packet_checker.py +++ b/profiler/advisor/analyzer/communication/packet/packet_checker.py @@ -116,19 +116,20 @@ class PacketChecker: result.add_detail(sub_table_name, headers=self.headers) result.add_detail(sub_table_name, detail=self.small_packet_detail) - def make_render(self, html_render, add_render_list=True): + def make_render(self, html_render, add_render_list=True, **kwargs): + priority = kwargs.get("priority") return html_render.render_template(key="communication", template_dir="templates", template_name="packet_analysis.html", desc=self.desc, solutions=self.solutions, headers=self.headers, - data=self.small_packet_detail - ) + data=self.small_packet_detail, + priority_background_color=priority) def _init_rule(self): syncbn_rule_path = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + 
os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), "rules", "packet.yaml" ) diff --git a/profiler/advisor/analyzer/communication/retransmission/__init__.py b/profiler/advisor/analyzer/communication/retransmission/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_analyzer.py b/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..78cade900731926f6be2303fd8b9ac6072df35f7 --- /dev/null +++ b/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_analyzer.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging + +from profiler.advisor.analyzer.communication.base_communication_analyzer import BaseCommunicationAnalyzer +from profiler.advisor.analyzer.communication.retransmission.communication_retransmission_checker import \ + CommunicationRetransmissionChecker +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset +from profiler.advisor.result.result import OptimizeResult + +logger = logging.getLogger() + + +class RDMARetransmissionAnalyzer(BaseCommunicationAnalyzer): + dataset_cls_list = [ClusterCommunicationDataset] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: + super().__init__(collection_path, n_processes, **kwargs) + key = ClusterCommunicationDataset.get_key() + self.dataset = self.get_first_data_by_key(self.dataset_list, key) + self.result = OptimizeResult() + self.html_render = HTMLRender() + self.html = None + + @BaseCommunicationAnalyzer.check_data((ClusterCommunicationDataset.get_key(),)) + def optimize(self, **kwargs): + add_render_list = kwargs.get("add_render_list", True) + rdma_checker = CommunicationRetransmissionChecker(**kwargs) + rdma_checker.check_retransmission(self.dataset) + if not rdma_checker.rdma_issues: + return self.result + rdma_checker.make_record(self.result) + self.html = rdma_checker.make_render(self.html_render, add_render_list, priority=self.get_priority()) + return self.result + + def get_priority(self): + # 单次重传最少4s,高优先级 + return PriorityBackgroundColor.high diff --git a/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py b/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..4431ccce4d3911600ed3b8e0375d42dcee77e9af --- /dev/null +++ b/profiler/advisor/analyzer/communication/retransmission/communication_retransmission_checker.py @@ -0,0 
+1,129 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import os +from typing import Dict, List +from collections import defaultdict +from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.cluster_analyse.common_func.file_manager import FileManager +from profiler.advisor.dataset.cluster.hccl_collection import HcclInfo + +logger = logging.getLogger() + + +class GroupStatistic: + def __init__(self, min_transmission_time): + self.retransmission_issue = False + self.abnormal_op_dict: Dict[str, List] = dict() + + def add_op(self, op_name: str, hccl_info: HcclInfo): + if self.abnormal_op_dict.get(op_name) is None: + self.abnormal_op_dict.setdefault(op_name, []) + self.abnormal_op_dict.get(op_name).append([hccl_info.group, op_name, hccl_info.step, hccl_info.rank, + hccl_info.get_rdma_transit_size(), + hccl_info.get_rdma_transmit_time(), hccl_info.get_rdma_bandwidth()]) + + +class CommunicationRetransmissionChecker: + def __init__(self, **kwargs): + self.rdma_issues = False + self.desc = "" + self.sdma_desc = "" + self.rdma_desc = "" + self.suggestions = [] + self.abnormal_group_count = 0 + self.abnormal_rdma_list = [] + self.step_id = kwargs.get("step") + self.stage = None + self.group_statistics = defaultdict(GroupStatistic) + self.headers = ["Communication group", "Op name", "Step id", "Rank id", "RDMA transmit size(MB)", + "RDMA transmit time(ms)", "RDMA bandwidth"] + self._init_rule() + + def check_possible_retransmission_occurrence(self, hccl_list: List[HcclInfo]): + min_elapse_time = min(hccl.elapse_time for hccl in hccl_list) + max_transit_time = max(hccl.rdma_info.get('Transit Time(ms)', 0) for hccl in hccl_list) + if min_elapse_time < self.min_retransmission_time: # 检测是否是卡间不同步问题,而不是重传 + return False + return max_transit_time > self.min_retransmission_time + + def check_retransmission(self, hccl_dataset: ClusterCommunicationDataset): + """ + :Param event_dataset: dataset of timeline event + """ + for group_name, hccl_group_dict in hccl_dataset.hccl_dict.items(): + for op_name, hccl_op_dict in hccl_group_dict.items(): + for step_id, hccl_list in hccl_op_dict.items(): + if self.step_id and step_id != self.step_id: # 传输指定step(self.step_id)情况下,非目标step跳过 + continue + if not self.check_possible_retransmission_occurrence(hccl_list): + continue + self.rdma_issues = True + if self.group_statistics.get(group_name) is None: + self.group_statistics.setdefault(group_name, GroupStatistic(self.min_retransmission_time)) + self.abnormal_group_count += 1 + for hccl_info in hccl_list: + if hccl_info.rdma_info.get('Transit Size(MB)', 0): + transit_time = hccl_info.rdma_info.get('Transit Time(ms)', 0) + if transit_time > self.min_retransmission_time: + self.group_statistics.get(group_name).add_op(op_name, hccl_info) + if self.rdma_issues: + 
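`check_possible_retransmission_occurrence` above applies a two-step test: if even the fastest rank finished below `min_retransmission_time`, the long tail is treated as rank desynchronization rather than retransmission; otherwise the group is flagged when any RDMA transit time exceeds the threshold. A condensed sketch of the same test over plain dictionaries (the 4000 ms threshold is an assumed value; the real one is read from `rdma_analysis.yaml`):

```python
MIN_RETRANSMISSION_TIME_MS = 4000.0  # assumed; the real threshold comes from rdma_analysis.yaml


def possible_retransmission(hccl_ops):
    """hccl_ops: list of dicts with 'elapse_time' and 'Transit Time(ms)' per rank."""
    min_elapse = min(op["elapse_time"] for op in hccl_ops)
    max_transit = max(op.get("Transit Time(ms)", 0) for op in hccl_ops)
    if min_elapse < MIN_RETRANSMISSION_TIME_MS:
        # Some rank finished quickly, so the long tail is likely desync, not retransmission.
        return False
    return max_transit > MIN_RETRANSMISSION_TIME_MS


ops = [
    {"elapse_time": 4500.0, "Transit Time(ms)": 4300.0},
    {"elapse_time": 4200.0, "Transit Time(ms)": 120.0},
]
print(possible_retransmission(ops))  # True: all ranks are slow and one transit exceeds the threshold
```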
self.desc = self.desc.format(group_count=self.abnormal_group_count) + for _, group_statistic in self.group_statistics.items(): + for _, op_list in group_statistic.abnormal_op_dict.items(): + for op in op_list: + self.abnormal_rdma_list.append(op) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + optimization_item = OptimizeItem("Communication retransmission analysis", self.desc, self.suggestions) + result.add(OptimizeRecord(optimization_item)) + + sub_table_name = "Comm Retransmission Analysis" if not self.stage else f"Stage-{self.stage}: Comm Retransmission Analysis" + result.add_detail(sub_table_name, headers=self.headers) + + for row in self.abnormal_rdma_list: + result.add_detail(sub_table_name, detail=row) + + def make_render(self, html_render, add_render_list=True, **kwargs): + priority = kwargs.get("priority") + return html_render.render_template(key="communication", + template_dir="templates", + template_name="communication_retransmission_analysis.html", + desc=self.desc, + solutions=self.solutions, + headers=self.headers, + data=self.abnormal_rdma_list, + priority_background_color=priority) + + def _init_rule(self): + syncbn_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))), + "rules", + "rdma_analysis.yaml" + ) + + syncbn_rule = FileManager.read_yaml_file(syncbn_rule_path) + self.desc = syncbn_rule.get("problem") + self.min_retransmission_time = syncbn_rule.get("min_retransmission_time") + + self.solutions = syncbn_rule.get("solutions") + for solution in self.solutions: + for key, val in solution.items(): + self.suggestions.append(f"{key}, {val.get('desc')}") diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py index 4f25deff7c0cdb415ccae6ab748304d4044c5eec..bc0841152768d71c600e3998110c22ef491e864a 100644 --- a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_analyzer.py @@ -3,34 +3,40 @@ import logging from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.result.result import OptimizeResult from profiler.advisor.analyzer.computation.ai_core_freq.ai_core_freq_checker import AICoreFreqChecker +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.display.html.render import HTMLRender -from profiler.advisor.dataset.ai_core_freq.ai_core_freq_dataset import AICoreFreqDataset +from profiler.advisor.dataset.timeline_event_dataset import ComputationAnalysisDataset +from profiler.advisor.dataset.profiling.device_info import DeviceInfoParser from profiler.advisor.config.config import Config logger = logging.getLogger() class AICoreFreqAnalyzer(BaseAnalyzer): - dataset_cls_list = [AICoreFreqDataset] + dataset_cls_list = [ComputationAnalysisDataset] def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: super().__init__(collection_path, n_processes, **kwargs) - key = AICoreFreqDataset.get_key() + key = ComputationAnalysisDataset.get_key() self.dataset = self.get_first_data_by_key(self.dataset_list, key) self.result = OptimizeResult() self.html_render = HTMLRender() self.html = None + info = DeviceInfoParser(collection_path) + info.parse_data() - @BaseAnalyzer.check_data((AICoreFreqDataset.get_key(),)) + 
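`_init_rule` above only assumes that `rules/rdma_analysis.yaml` yields a problem description (with a `{group_count}` placeholder), a `min_retransmission_time`, and a list of one-key solution mappings that each carry a `desc`. A plausible shape for the parsed rule, shown as the Python dict that `FileManager.read_yaml_file` might return — the wording and threshold are placeholders, not the shipped rule file:

```python
# Assumed shape of the dict returned by FileManager.read_yaml_file("rdma_analysis.yaml").
rule = {
    "problem": "Found RDMA retransmission in {group_count} communication groups.",
    "min_retransmission_time": 4000,  # ms
    "solutions": [
        {"check network": {"desc": "please check the network connection between nodes."}},
    ],
}

desc = rule.get("problem")
min_retransmission_time = rule.get("min_retransmission_time")
suggestions = []
for solution in rule.get("solutions"):
    for key, val in solution.items():
        # Matches the f"{key}, {val.get('desc')}" formatting in _init_rule.
        suggestions.append(f"{key}, {val.get('desc')}")
print(suggestions)  # ['check network, please check the network connection between nodes.']
```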
@BaseAnalyzer.check_data((ComputationAnalysisDataset.get_key(),)) def optimize(self, **kwargs): if not Config().get_config("aic_frequency"): logger.warning("Can not find ai core frequency in info.json*, please check data integrity.") return self.result + add_render_list = kwargs.get("add_render_list", True) ai_core_freq_checker = AICoreFreqChecker() - ai_core_freq_checker.check_ai_core_freq(self.dataset) - if not ai_core_freq_checker.ai_core_freq_issues: - return self.result + ai_core_freq_checker.check_ai_core_freq(self.dataset, rank_id=kwargs.get("rank_id"), stage=kwargs.get("stage")) ai_core_freq_checker.make_record(self.result) - self.html = ai_core_freq_checker.make_render(self.html_render, add_render_list) + self.html = ai_core_freq_checker.make_render(self.html_render, add_render_list, priority=self.get_priority()) return self.result + + def get_priority(self): + return PriorityBackgroundColor.high \ No newline at end of file diff --git a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py index 5bfa5adc4b5982f8bddf3ce6bbcbf870c68b32c6..2b3760eb9f65cb7e8ee65e28aa870fae67be1194 100644 --- a/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py +++ b/profiler/advisor/analyzer/computation/ai_core_freq/ai_core_freq_checker.py @@ -1,6 +1,6 @@ import logging -from profiler.advisor.dataset.ai_core_freq.ai_core_freq_dataset import AICoreFreqDataset +from profiler.advisor.dataset.timeline_event_dataset import ComputationAnalysisDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.config.config import Config @@ -26,7 +26,7 @@ class AICoreFreqChecker: self.rank_id = None self.stage = None - def check_ai_core_freq(self, event_dataset: AICoreFreqDataset, rank_id=None, stage=None): + def check_ai_core_freq(self, event_dataset: ComputationAnalysisDataset, rank_id=None, stage=None): """ :Param event_dataset: dataset of timeline event """ @@ -60,6 +60,8 @@ class AICoreFreqChecker: self.decrease_freq_ops.sort(key= lambda x: (x[self.TOTAL_DURATION_INDEX], x[self.DECREASE_FREQ_RATIO_INDEX]), reverse=True) + if not self.ai_core_freq_issues: + return self.desc = (f"{len(self.decrease_freq_ops)} operators are found during frequency reduction, and the reduction " f"ratio is larger than {self.DECREASE_FREQ_RATIO}.") @@ -71,22 +73,29 @@ class AICoreFreqChecker: """ make record for what and how to optimize """ - optimization_item = OptimizeItem("AI Core Frequency", self.desc, [self.suggestions]) + if not self.ai_core_freq_issues: + return self.ai_core_freq_issues + + sheet_name = "AI Core Frequency" + if self.rank_id is not None: + sheet_name = f"rank {self.rank_id} AI Core Frequency".capitalize() + + optimization_item = OptimizeItem(sheet_name, self.desc, [self.suggestions]) result.add(OptimizeRecord(optimization_item)) self.headers = ["Operator name", "Count", "Total duration(us)", "AI CORE frequency decreased ratio", "Average frequency", "Max frequency", "Min frequency"] - if self.rank_id: - self.headers = ["Rank id"] + self.headers - sub_table_name = "AI Core Frequency" if not self.stage else f"Stage-{self.stage}: AI Core Frequency" - result.add_detail(sub_table_name, headers=self.headers) + result.add_detail(sheet_name, headers=self.headers) for row in self.decrease_freq_ops: - if self.rank_id: - row = [self.rank_id] + row - result.add_detail(sub_table_name, detail=row) + 
result.add_detail(sheet_name, detail=row) + return True + + def make_render(self, html_render, add_render_list=True, **kwargs): + if not self.ai_core_freq_issues: + return self.ai_core_freq_issues - def make_render(self, html_render, add_render_list=True): + priority = kwargs.get("priority") if self.SHOW_TOPK_OPS: self.desc += f" Only show {self.SHOW_TOPK_OPS} operators here, see latest mstt_advisor.xlsx for details." return html_render.render_template(key="computation", @@ -96,4 +105,6 @@ class AICoreFreqChecker: suggestion=self.suggestions, headers=self.headers, data=self.decrease_freq_ops[:self.SHOW_TOPK_OPS], - add_render_list=add_render_list) + add_render_list=add_render_list, + priority_background_color=priority) + return True diff --git a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py index 0caede4b894e0dda15333e6d3a480fa943c66323..394ad74fd7dcb739caa1f69929646f98207b2aa8 100644 --- a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py +++ b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py @@ -8,7 +8,7 @@ from profiler.advisor.analyzer.schedule.fusion_ops.timeline_api_stack_checker im from profiler.advisor.common import constant from profiler.advisor.dataset.dataset import Dataset from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ComputationAnalysisDataset from profiler.cluster_analyse.common_func.file_manager import FileManager @@ -30,6 +30,8 @@ class AicpuChecker(OperatorChecker): self.aicpu_rules: Dict = {} self.aicpu_checker: Dict = {} self.load_aicpu_rules() + self.total_task_duration = 0.0 + self.aicpu_task_duration = 0.0 def _check_data(self, profiling_data: ProfilingDataset) -> bool: if not self._check_summary(profiling_data): @@ -88,7 +90,7 @@ class AicpuChecker(OperatorChecker): def get_opeartor_stack_info(api_stack_finder: OpStackFinder, op_name_list: list) -> list: data: Dict[str, Dataset] = {} - event_dataset = TimelineEventDataset(collection_path=profiling_data.collection_path, data=data, task_type=constant.AI_CPU) + event_dataset = ComputationAnalysisDataset(collection_path=profiling_data.collection_path, data=data, task_type=constant.AI_CPU) # disable multiprocessing, avoid cost time of enable new process for light task api_stack_finder.get_api_stack_by_op(event_dataset, op_name_list, constant.AI_CPU, @@ -96,14 +98,16 @@ class AicpuChecker(OperatorChecker): return api_stack_finder._stack_record self._op_list = [] - total_task_duration = 0.0 + max_task_duration = 0.0 for op_info in op_summary.op_list: + task_duration = float(op_info.task_duration) + if self._check_operator(op_info): self._op_list.append(op_info) + self.aicpu_task_duration += task_duration - task_duration = float(op_info.task_duration) - total_task_duration += task_duration + self.total_task_duration += task_duration max_task_duration = max(max_task_duration, task_duration) if (not self._op_list) or (max_task_duration < self._MIN_TASK_DURATION): return False @@ -145,11 +149,15 @@ class AicpuChecker(OperatorChecker): ",".join(double_type_ai_cpu_operator))) return True - def make_render(self, html_render, record): - html_render.render_template(key="computation", - template_dir="templates", - template_name="operator_ai_cpu.html", - format_result=self.format_operator_result(record, constant.OPERATOR_LIST_UNLIMIT)) + def make_render(self, 
html_render, record, add_render_list=True, **kwargs): + priority = kwargs.get("priority") + return html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_ai_cpu.html", + format_result=self.format_operator_result(record, + constant.OPERATOR_LIST_UNLIMIT), + add_render_list=add_render_list, + priority_background_color=priority) def format_operator_result(self, record, limit): """ diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index 7a873c65635fcc8f2ebb35c8d317de09d78da491..5b358ebaab2069479aa2221b346cc256e5920750 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -45,11 +45,15 @@ class BlockDimChecker(OperatorChecker): "task duration are as follows:\n" return True - def make_render(self, html_render, record): - html_render.render_template(key="computation", - template_dir="templates", - template_name="operator_block_dim.html", - format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK)) + def make_render(self, html_render, record, add_render_list=True, **kwargs): + priority = kwargs.get("priority") + return html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_block_dim.html", + format_result=self.format_operator_result(record, + constant.OPERATOR_OUT_TOPK), + add_render_list=add_render_list, + priority_background_color=priority) def _check_operator(self, op_info) -> bool: if op_info.task_type not in ["AI_CORE", "AI_VECTOR_CORE", "MIX_AIC"]: diff --git a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py index a22b380f974b14207d6d7be262cd49f0ba0fbe99..2096e9ffaf2e5e041716dea381e2d99824fefe0f 100644 --- a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py +++ b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py @@ -46,8 +46,12 @@ class OperatorBoundChecker(OperatorChecker): return False return True - def make_render(self, html_render, record): - html_render.render_template(key="computation", - template_dir="templates", - template_name="operator_no_bound.html", - format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK)) + def make_render(self, html_render, record, add_render_list=True, **kwargs): + priority = kwargs.get("priority") + return html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_no_bound.html", + format_result=self.format_operator_result(record, + constant.OPERATOR_OUT_TOPK), + add_render_list=add_render_list, + priority_background_color=priority) \ No newline at end of file diff --git a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py index 86d3bac4ff8cb163d23a6365307b855839b12a6a..2521b6e7e39c53465536d1516a29b0788891ca05 100644 --- a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py +++ b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py @@ -27,11 +27,13 @@ class DynamicShapeChecker(OperatorChecker): def check(self, profiling_database) -> bool: return self.is_dynamic_shape(profiling_database) - def make_record(self, profiling_database) -> OptimizeRecord: + def make_record(self, profiling_database, rank_id=None) -> OptimizeRecord: 
""" make record for what and how to optimize """ + if rank_id is not None: + self._PROBLEM = f"rank {rank_id} ".capitalize() + self._PROBLEM.lower() optimization_item = OptimizeItem( self._PROBLEM, self._description, @@ -58,8 +60,11 @@ class DynamicShapeChecker(OperatorChecker): format_result = {"record": record.__dict__, "suggestion": '
'.join(release_suggestion_list)} return format_result - def make_render(self, html_render, record): - html_render.render_template(key="computation", - template_dir="templates", - template_name="operator_dynamic_shape.html", - format_result=self.format_operator_result(record)) + def make_render(self, html_render, record, add_render_list=True, **kwargs): + priority = kwargs.get("priority") + return html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_dynamic_shape.html", + format_result=self.format_operator_result(record), + add_render_list=add_render_list, + priority_background_color=priority) diff --git a/profiler/advisor/analyzer/computation/operator_checker.py b/profiler/advisor/analyzer/computation/operator_checker.py index 64618b56a8df7f380277e99ae7ca47cd69d24648..e24eae1d0c55df98c16fb9a2085a010b7a1f4ed7 100644 --- a/profiler/advisor/analyzer/computation/operator_checker.py +++ b/profiler/advisor/analyzer/computation/operator_checker.py @@ -40,6 +40,23 @@ class OperatorChecker(VersionControl): self.cann_version = cann_version self._op_list: List[OpInfo] = [] + @staticmethod + def get_ratio(op_info: OpInfo, attr: str) -> float: + if not op_info.has_attr(attr): + return 0 + value = op_info.get_attr(attr) + if not value or value == "N/A": + return 0 + return float(value) + + @classmethod + def get_name(cls): + """ + get name of checker + :return: checker name + """ + return cls._PROBLEM + def check(self, profiling_data: ProfilingDataset) -> bool: """ check if any operator need optimize @@ -77,12 +94,16 @@ class OperatorChecker(VersionControl): return True return False - def make_record(self, profiling_data: ProfilingDataset): + def make_record(self, profiling_data: ProfilingDataset, rank_id=None): """ Make record for what and how to optimize :param profiling_data: profiling data :return: optimize record """ + + if rank_id is not None: + self._PROBLEM = f"rank {rank_id} ".capitalize() + self._PROBLEM.lower() + task_duration_list = [float(op_info.get_attr("task_duration")) for op_info in self._op_list if hasattr(op_info, "get_attr")] total_cost_time = sum(task_duration_list) @@ -239,14 +260,6 @@ class OperatorChecker(VersionControl): """Get node views.""" return [] - @classmethod - def get_name(cls): - """ - get name of checker - :return: checker name - """ - return cls._PROBLEM - def get_incomes(self) -> float: """get incomes""" incomes = 0.0 @@ -269,16 +282,7 @@ class OperatorChecker(VersionControl): logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "op summary") return False return True - - @staticmethod - def get_ratio(op_info: OpInfo, attr: str) -> float: - if not op_info.has_attr(attr): - return 0 - value = op_info.get_attr(attr) - if not value or value == "N/A": - return 0 - return float(value) - + def get_details(self) -> list: """ get details of operator to be optimized diff --git a/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py b/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..bc02b4c3e3dd222758f66214b0486d5ca64b0ed4 --- /dev/null +++ b/profiler/advisor/analyzer/computation/pp_stage_computation_analyzer.py @@ -0,0 +1,106 @@ +import logging +import os +from multiprocessing import Manager + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.common.analyzer_scopes import SupportedScopes +from profiler.advisor.display.html.render import HTMLRender +from 
profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor +from profiler.advisor.interface.interface import Interface +from profiler.advisor.utils.utils import ParallelJob +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.common import constant as const + +logger = logging.getLogger() + + +class PPStageComputationAnalyzer(BaseAnalyzer): + + def __init__(self, collection_path, **kwargs): + super().__init__(collection_path, **kwargs) + self.collection_path = collection_path + self._stages_rendered_html = Manager().list() + self._multiprocess_result = Manager().dict() + # the html render cannot be pickled for multiprocessing, so it is initialized inside optimize + self.html_render = None + self.result = None + + @staticmethod + def _get_valid_sheet_name(sheet_name, prefix): + if not sheet_name.lower().startswith(prefix.lower()): + sheet_name = f"{prefix} {sheet_name}" + return sheet_name + + def optimize(self, stages_profiling_path, **kwargs): + pp_stage_processes = min(int(os.getenv("PP_STAGE_ANALYSIS_PROCESSES", 0)), len(stages_profiling_path), + const.MAX_NUM_PROCESSES) + if pp_stage_processes <= 1: + for stage_profiling_path in stages_profiling_path: + self._optimize(**stage_profiling_path) + else: + logger.info("Start parallel analysis of pp stages, number of processes is %s", pp_stage_processes) + parallel_stage_analysis_job = ParallelJob(self._optimize, stages_profiling_path, + "Computation analysis of Pipeline parallel stages") + parallel_stage_analysis_job.start(pp_stage_processes) + self._merge_multiprocess_result() + + self.make_render() + self.html_render = HTMLRender() + return self.result + + def make_render(self): + HTMLRender().render_template(key="computation", + template_dir="templates", + template_name="pp_stage_computation_analysis.html", + stages_rendered_html=list(self._stages_rendered_html), + priority_background_color=PriorityBackgroundColor.high) + + def get_priority(self): + pass + + def _optimize(self, profiling_path, **kwargs): + stage_html_record = dict(stage=kwargs.get("stage"), rank_id=kwargs.get("rank_id"), step=kwargs.get("step")) + kwargs["add_render_list"] = False + + # when analyzing pp stages in parallel, skip this analyzer itself, i.e. SupportedScopes.STAGE_COMPUTE + scopes = Interface.get_scope(Interface.COMPUTATION) + stage_analyzer_list = [Interface.get_analyzer(Interface.COMPUTATION, scope) for scope in scopes if + scope != SupportedScopes.STAGE_COMPUTE] + + for analyzer_cls in stage_analyzer_list: + analyzer = analyzer_cls(collection_path=profiling_path, **kwargs) + result = analyzer.optimize(**kwargs) + if hasattr(result, "data") and result.data: + self.result = result + if hasattr(analyzer, "html") and analyzer.html: + if "html_list" not in stage_html_record: + stage_html_record["html_list"] = [] + stage_html_record["html_list"].append(analyzer.html) + self._stages_rendered_html.append(stage_html_record) + self._multiprocess_result[f"rank {kwargs.get('rank_id')}".capitalize()] = result.data + + def _merge_multiprocess_result(self): + self.result = OptimizeResult() + for key, result_data in self._multiprocess_result.items(): + problem_data = result_data.get("problems", {}).get("data", []) + if not problem_data: + continue + + for row in problem_data: + if len(row) < 3: + continue + issue_name, desc, suggestion = row[:3] + sheet_name = PPStageComputationAnalyzer._get_valid_sheet_name(issue_name, key) + optimization_item = OptimizeItem(sheet_name, desc, [suggestion]) + self.result.add(OptimizeRecord(optimization_item))
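+ # the "problems" summary has been merged above; drop it so only per-issue detail tables remain for the loop below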
+ del result_data["problems"] + + for issue_name, issue_details in result_data.items(): + headers = issue_details.get("headers", []) + data = issue_details.get("data", []) + sheet_name = PPStageComputationAnalyzer._get_valid_sheet_name(issue_name, key) + self.result.add_detail(sheet_name, headers=headers) + + for row in data: + self.result.add_detail(sheet_name, detail=row) diff --git a/profiler/advisor/analyzer/computation/profiling_analyzer.py b/profiler/advisor/analyzer/computation/profiling_analyzer.py index 2021bcd5765d1df7489f202b3453a83924fb28dc..b29373e876ca7cd14302a9d0074a64bf874004e1 100644 --- a/profiler/advisor/analyzer/computation/profiling_analyzer.py +++ b/profiler/advisor/analyzer/computation/profiling_analyzer.py @@ -8,6 +8,7 @@ from profiler.advisor.analyzer.computation.bound.block_dim_checker import BlockD from profiler.advisor.analyzer.computation.bound.operator_bound_checker import OperatorBoundChecker from profiler.advisor.analyzer.computation.op_compile.dynamic_shape_checker import DynamicShapeChecker from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.display.html.render import HTMLRender from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset @@ -22,6 +23,7 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): self.checker = OperatorChecker(self.cann_version) self.html_render = HTMLRender() self.result = OptimizeResult() + self.html = None @BaseAnalyzer.check_data((ProfilingDataset.get_key(),)) def optimize(self, **kwargs) -> OptimizeResult: @@ -32,22 +34,29 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): """ profiling_data = self.get_first_data_by_key(self.dataset_list, ProfilingDataset.get_key()) checker = self.checker + rank_id = kwargs.get("rank_id") + + add_render_list = kwargs.get("add_render_list", True) + if not checker.pre_check(profiling_data): return self.result if checker.check(profiling_data): # add record - record = checker.make_record(profiling_data) - checker.make_render(self.html_render, record) + record = checker.make_record(profiling_data, rank_id) + self.html = checker.make_render(self.html_render, record, add_render_list, + priority=self.get_priority(checker)) self.result.add(record) # add details details = checker.get_details() if details: for i, detail in enumerate(details): + sheet_name = checker.get_name() if rank_id is None else \ + f"rank {rank_id} ".capitalize() + checker.get_name() if i == 0: # the first row is header - self.result.add_detail(checker.get_name(), headers=detail) + self.result.add_detail(sheet_name, headers=detail) else: - self.result.add_detail(checker.get_name(), detail=detail) + self.result.add_detail(sheet_name, detail=detail) # add tune op list tune_op_list = checker.get_tune_op_list() if tune_op_list: @@ -55,11 +64,13 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): return self.result - def make_record(self): - pass + def get_priority(self, checker): + if "aicpu" not in checker.__class__.__name__.lower(): + return PriorityBackgroundColor.low - def make_render(self): - pass + aicpu_duration = getattr(checker, "aicpu_task_duration", 0.0) + total_duration = getattr(checker, "total_task_duration", 0.0) + return self.get_priority_by_time_ratio(aicpu_duration, total_duration) class DynamicShapeAnalyzer(ProfilingAnalyzer): diff --git a/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py index 
291c3a1f941cf1934c0c91b7603b6270ee66f3fb..3d1a537c211a3ba26133f31e23284844d681d6e4 100644 --- a/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py +++ b/profiler/advisor/analyzer/dataloader/dataloader_analyzer.py @@ -5,26 +5,30 @@ from typing import List, Dict, Any from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.result.result import OptimizeResult from profiler.advisor.analyzer.dataloader.dataloader_checker import DataloaderChecker +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.display.html.render import HTMLRender -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset logger = logging.getLogger() class DataloaderAnalyzer(BaseAnalyzer): - dataset_cls_list = [TimelineEventDataset] + dataset_cls_list = [ScheduleAnalysisDataset] def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: super().__init__(collection_path, n_processes, **kwargs) - key = TimelineEventDataset.get_key() + key = ScheduleAnalysisDataset.get_key() self.dataset = self.get_first_data_by_key(self.dataset_list, key) self.result = OptimizeResult() self.html_render = HTMLRender() - @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + @BaseAnalyzer.check_data((ScheduleAnalysisDataset.get_key(),)) def optimize(self, **kwargs): dataloader_checker = DataloaderChecker() dataloader_checker.check_slow_dataloader(self.dataset) dataloader_checker.make_record(self.result) - dataloader_checker.make_render(self.html_render) + dataloader_checker.make_render(self.html_render, priority=self.get_priority()) return self.result + + def get_priority(self): + return PriorityBackgroundColor.high diff --git a/profiler/advisor/analyzer/dataloader/dataloader_checker.py b/profiler/advisor/analyzer/dataloader/dataloader_checker.py index eb1886284ef5d508f911d0c353df4574fd4a8bd3..f392a0838ac03fd180c6f5201c7fc489f19a2ab7 100644 --- a/profiler/advisor/analyzer/dataloader/dataloader_checker.py +++ b/profiler/advisor/analyzer/dataloader/dataloader_checker.py @@ -3,7 +3,7 @@ import re import logging import yaml -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.cluster_analyse.common_func.file_manager import FileManager @@ -22,7 +22,7 @@ class DataloaderChecker: self.dataloader_duration_threshold = None self._init_rule() - def check_slow_dataloader(self, event_dataset: TimelineEventDataset): + def check_slow_dataloader(self, event_dataset: ScheduleAnalysisDataset): """ :Param event_dataset: dataset of timeline event """ @@ -32,7 +32,7 @@ class DataloaderChecker: return for event in event_dataset.dataloader: - dataloader_duration = float(event.dur) / 1000 + dataloader_duration = float(event.dur) if dataloader_duration < self.dataloader_duration_threshold: continue self.desc = self.desc.format(dataloader_duration=dataloader_duration, @@ -53,14 +53,16 @@ class DataloaderChecker: for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) - def make_render(self, html_render): + def make_render(self, html_render, **kwargs): if not self.dataloader_issues: return + priority = kwargs.get("priority") 
html_render.render_template(key="dataloader", template_dir="templates", template_name="slow_dataloader.html", desc=self.desc, - suggestions=self.suggestions) + suggestions=self.suggestions, + priority_background_color=priority) def _init_rule(self): dataloader_rule_path = os.path.join( diff --git a/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py b/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py index 326be83b8d49088b1563ccd8c08b68a4aa3001ef..e9dcd263d6c2875ece7e94409191a7f9ceee1b27 100644 --- a/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py +++ b/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py @@ -20,17 +20,22 @@ class FusionOPAnalyzer(BaseAnalyzer): super(FusionOPAnalyzer, self).__init__(collection_path, **kwargs) self.result = OptimizeResult() self.html_render = HTMLRender() - + self.html = None + @BaseAnalyzer.check_data((GraphDataset.get_key(),)) def optimize(self, **kwargs): """ :return: result """ - self._check(self.dataset_list.get("GraphDataset"), self.dataset_list.get("ProfilingDataset")) + self._check(self.dataset_list.get("GraphDataset"), self.dataset_list.get("ProfilingDataset"), + kwargs.get("add_render_list")) return self.result - def _check(self, graph_data: List[GraphDataset], - profiling_data: List[ProfilingDataset] = None) -> None: + def get_priority(self): + pass + + def _check(self, graph_data: List[GraphDataset], profiling_data: List[ProfilingDataset] = None, + add_render_list=True) -> None: if len(graph_data) == 0 or graph_data[0].is_empty(): return for _, rule in self.RULES.items(): @@ -40,10 +45,4 @@ class FusionOPAnalyzer(BaseAnalyzer): else: checker.find_fusion_matched_issues_with_times(graph_data, profiling_data) checker.make_record(self.result) - checker.make_render(self.html_render) - - def make_record(self): - pass - - def make_render(self): - pass + self.html = checker.make_render(self.html_render, add_render_list) \ No newline at end of file diff --git a/profiler/advisor/analyzer/memory/__init__.py b/profiler/advisor/analyzer/memory/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/memory/memory_analyzer.py b/profiler/advisor/analyzer/memory/memory_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..cd7b0a24223af942ffbb0fe8a68e6e5edcce5979 --- /dev/null +++ b/profiler/advisor/analyzer/memory/memory_analyzer.py @@ -0,0 +1,38 @@ +import logging + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.memory.memory_checker import MemoryOpsChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor + +logger = logging.getLogger() + + +class MemoryAnalyzer(BaseAnalyzer): + dataset_cls_list = [ScheduleAnalysisDataset] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: + super().__init__(collection_path, n_processes, **kwargs) + key = ScheduleAnalysisDataset.get_key() + self.dataset = self.get_first_data_by_key(self.dataset_list, key) + self.result = OptimizeResult() + self.html_render = HTMLRender() + + @BaseAnalyzer.check_data((ScheduleAnalysisDataset.get_key(),)) + def optimize(self, **kwargs): + memory_checker = MemoryOpsChecker() + 
memory_checker.check_memory_ops(self.dataset) + memory_checker.make_record(self.result) + memory_checker.make_render(self.html_render, priority=self.get_priority(memory_checker.max_mem_op_dur)) + return self.result + + def get_priority(self, max_mem_op_dur): + step_duration = getattr(self.dataset, "step_duration", None) + if step_duration is None: + return PriorityBackgroundColor.low + + return self.get_priority_by_time_ratio(max_mem_op_dur, step_duration) diff --git a/profiler/advisor/analyzer/memory/memory_checker.py b/profiler/advisor/analyzer/memory/memory_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..8dec295c9fe69af72b21c7628d3f930bc50f7a9c --- /dev/null +++ b/profiler/advisor/analyzer/memory/memory_checker.py @@ -0,0 +1,76 @@ +import os +import re +import logging +import yaml + +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset, MemCollector +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.cluster_analyse.common_func.file_manager import FileManager + +logger = logging.getLogger() + + +class MemoryOpsChecker: + + def __init__(self): + + self.memory_issues = False + self.optimization_item = [] + self.desc = "" + self.suggestions = [] + self.memory_ops_duration_threshold = None + self.max_mem_op_dur = 0 + + def check_memory_ops(self, event_dataset: ScheduleAnalysisDataset): + """ + :Param event_dataset: dataset of timeline event + """ + if not hasattr(event_dataset, "memory_ops") or not getattr(event_dataset, "memory_ops") or \ + not event_dataset.memory_ops.mem_op_info: + logger.debug("Skip slow memory ops checker, because no memory ops: %s", MemCollector.MEMORY_OP_NAME) + return + + rule = event_dataset.memory_ops.rule + max_dur_thres = rule.get("max_total_duration") + raw_problem = rule.get("problem") + + for memory_op_name, memory_op_info in event_dataset.memory_ops.mem_op_info.items(): + op_dur = memory_op_info.get("total_dur") + op_count = memory_op_info.get("count") + if op_dur < max_dur_thres: + continue + if op_dur > self.max_mem_op_dur: + self.max_mem_op_dur = op_dur + + self.memory_issues = True + self.desc += raw_problem.format(memory_op_num=op_count, memory_op_name=memory_op_name, + memory_op_dur=op_dur) + " " + for solution in rule.get("solutions", []): + if memory_op_name not in solution: + continue + suggestion = solution.get(memory_op_name, {}).get("desc") + + self.suggestions.append(f"{suggestion} to optimize memory operator {memory_op_name}") + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self.memory_issues: + return + + self.optimization_item.append(OptimizeItem("Memory", self.desc, self.suggestions)) + for optimization in self.optimization_item: + result.add(OptimizeRecord(optimization)) + + def make_render(self, html_render, **kwargs): + if not self.memory_issues: + return + priority = kwargs.get("priority") + html_render.render_template(key="memory", + template_dir="templates", + template_name="memory.html", + desc=self.desc, + suggestions=self.suggestions, + priority_background_color=priority) diff --git a/profiler/advisor/analyzer/overall/environment_variable_analyzer.py b/profiler/advisor/analyzer/overall/environment_variable_analyzer.py index 3daaa3460912620795294faa1266a34c858918e8..c4468c36d0eded6b36ae265e239d95e1fdf2dbbb 100644 --- a/profiler/advisor/analyzer/overall/environment_variable_analyzer.py +++ 
b/profiler/advisor/analyzer/overall/environment_variable_analyzer.py @@ -18,6 +18,7 @@ from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.prof_common.path_manager import PathManager from profiler.advisor.dataset.environment_variable_dataset import EnvironmentVariableDataset from profiler.advisor.analyzer.overall.environment_variable_checker import EnvironmentVariabelChecker +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor class EnvironmentVariabelAnalyzer(BaseAnalyzer): @@ -40,6 +41,9 @@ class EnvironmentVariabelAnalyzer(BaseAnalyzer): checker.make_render(self.html_render) return self.result + def get_priority(self): + return PriorityBackgroundColor.high + def make_record(self): pass diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index 8e93dbda77d4915e716af856114184324d1d8807..fe43072a89f382f5f3fff0941bff4ab041f7e986 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -23,7 +23,7 @@ from profiler.compare_tools.compare_interface.comparison_interface import Compar class OverallSummaryAnalyzer(BaseAnalyzer): - OVERALL_SUMMARY_ANALYZER = "overall_summary_analysis" + OVERALL_SUMMARY_ANALYZER = "overall summary" advice_map = { "Computing Time": "if you want more detailed advice please go to mstt_advisor_*.html", "Uncovered Communication Time": "if you want more detailed advice please go to mstt_advisor_*.html", @@ -233,6 +233,9 @@ class OverallSummaryAnalyzer(BaseAnalyzer): torch_version=self.torch_version, result=result_for_html) + def get_priority(self): + pass + def get_profile_path(collection_path): for root, dirs, files in os.walk(collection_path): diff --git a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py index 0e62a3ff0c8eebc0cf7b5b89953b8a0842df9c9d..58b2c301b590e74c054ef997b1973e7a595bbc73 100644 --- a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py +++ b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py @@ -16,26 +16,26 @@ # limitations under the License. 
import logging - from profiler.advisor.common import constant as const from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.result.result import OptimizeResult from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor logger = logging.getLogger() class OpDispatchAnalyzer(BaseAnalyzer): - dataset_cls_list = [TimelineEventDataset] + dataset_cls_list = [ScheduleAnalysisDataset] """ operator dispatch optimizer """ def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: super().__init__(collection_path, n_processes, **kwargs) - key = TimelineEventDataset.get_key() + key = ScheduleAnalysisDataset.get_key() self.dataset = self.get_first_data_by_key(self.dataset_list, key) self.result = OptimizeResult() self.html_render = HTMLRender() @@ -54,21 +54,21 @@ class OpDispatchAnalyzer(BaseAnalyzer): self.make_render(self.html_render) return self.result - def get_op_compile_info(self, event_dataset: TimelineEventDataset): - """ - :Param event_dataset: dataset of timeline event - """ - if hasattr(event_dataset, "ops_compile"): - self._op_compile = getattr(event_dataset, "ops_compile") - if not self._op_compile or self._op_compile.total_count < const.MAX_OP_COMPILE_NUM: - return + def get_op_compile_info(self, event_dataset: ScheduleAnalysisDataset): + """ + :Param event_dataset: dataset of timeline event + """ + if hasattr(event_dataset, "ops_compile"): + self._op_compile = getattr(event_dataset, "ops_compile") + if not self._op_compile or self._op_compile.total_count < const.MAX_OP_COMPILE_NUM: + return - self._issues_record.append(['operator dispatch', - const.OP_COMPILE_ID, - self._op_compile.total_count, - self._op_compile.total_time]) - else: - logger.debug("Skip operator compile checker, because no op_compile attr find.") + self._issues_record.append(['operator dispatch', + const.OP_COMPILE_ID, + self._op_compile.total_count, + self._op_compile.total_time]) + else: + logger.debug("Skip operator compile checker, because no op_compile attr find.") def make_record(self, result: OptimizeResult): """ @@ -77,8 +77,9 @@ class OpDispatchAnalyzer(BaseAnalyzer): if not self._op_compile or len(self._issues_record) <= 0: return desc = f"Found {self._op_compile.total_count} operator compile issues." - suggestion = (f"Please use `torch_npu.npu.set_compile_mode(jit_compile=False)` to disable jit compile " - f"in dynamic shape usage.") + suggestion = ("Please place the following code at the entrance of the python script to disable jit compile. 
" \ + "Code: `torch_npu.npu.set_compile_mode(jit_compile=False); " + "torch_npu.npu.config.allow_internal_format = False`") self.optimization_item.append(OptimizeItem("Operator dispatch", desc, [suggestion])) for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) @@ -87,7 +88,7 @@ class OpDispatchAnalyzer(BaseAnalyzer): for op_info in self._issues_record: result.add_detail('operator dispatch', detail=op_info) - def make_render(self, html_render): + def make_render(self, html_render, **kwargs): issues = [] optimizations = [] for optimization in self.optimization_item: @@ -97,11 +98,20 @@ class OpDispatchAnalyzer(BaseAnalyzer): )) for record in self._issues_record: issues.append(dict(issue=record[0], - op_name=record[1], - counts=record[2], - total_time=record[3])) + op_name=record[1], + counts=record[2], + total_time=record[3])) html_render.render_template(key="schedule", template_dir="templates", template_name="operator_dispatch.html", issues=issues, - optimizers=optimizations) + optimizers=optimizations, + priority_background_color=self.get_priority()) + + def get_priority(self): + step_duration = getattr(self.dataset, "step_duration", None) + op_compile_total_dur = getattr(self._op_compile, "total_time", None) + if step_duration is None or op_compile_total_dur is None: + return PriorityBackgroundColor.low + + return self.get_priority_by_time_ratio(op_compile_total_dur, step_duration) diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py index c1eb24b8e1e11ac167a7eb9333867167a57dd524..305d239949bb2803ee822143d3facb9dd2e00a66 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -8,25 +8,29 @@ from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.common import constant as const from profiler.advisor.common.analyzer_scopes import SupportedScopes from profiler.advisor.common.timeline.event import TimelineEvent -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.utils.utils import format_timeline_result from profiler.advisor.common.timeline.fusion_ops_db import init_timeline_ops_db +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor logger = logging.getLogger() class TimelineFusionOpsAnalyzer(BaseAnalyzer): - dataset_cls_list = [TimelineEventDataset] + dataset_cls_list = [ScheduleAnalysisDataset] def __init__(self, collection_path, n_processes: int = 1, **kwargs): super().__init__(collection_path, n_processes, **kwargs) self._matched_op_index = {} if self.n_processes <= 1 else multiprocessing.Manager().dict() self.matched_op_stacks = {} self.empty_stacks = True - key = TimelineEventDataset.get_key() + key = ScheduleAnalysisDataset.get_key() self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) + def get_priority(self): + return PriorityBackgroundColor.low + def optimize(self, **kwargs): for mode in [const.ATEN.lower(), const.OPTIMIZER.lower()]: @@ -154,8 +158,9 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): timeline_profiling_doc_url=const.TIMELINE_WITH_STACK_DOC_URL ) + sheet_name = "Affinity apis" optimization_item = OptimizeItem( - 
SupportedScopes.TIMELINE_FUSION_OPS, + sheet_name, desc, [suggestion] ) @@ -163,16 +168,16 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): self.result.add(OptimizeRecord(optimization_item)) record_title = ["Affinity API", "Code stacks", "Stack called counts"] - self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, headers=record_title) + self.result.add_detail(sheet_name, headers=record_title) for api_name, stacks_info in format_timeline_result(self.matched_op_stacks).items(): if not stacks_info: detail = [api_name, "null", "null"] - self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail) + self.result.add_detail(sheet_name, detail=detail) else: for stack in stacks_info: detail = [api_name, *stack] - self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail) + self.result.add_detail(sheet_name, detail=detail) def make_render(self): format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) @@ -185,7 +190,8 @@ class TimelineFusionOpsAnalyzer(BaseAnalyzer): empty_stacks=self.empty_stacks, with_stack_doc_url=const.TIMELINE_WITH_STACK_DOC_URL, api_doc_url=const.TIMELINE_API_DOC_URL, - result=format_result_for_html) + result=format_result_for_html, + priority_background_color=self.get_priority()) def query_stack(self, event_dataset): if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]): diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py b/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py index f684a4892111f113f6c502a010c9e14ccd43768a..92425910b219db7cfc105c2cbf979002d6026137 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py @@ -3,7 +3,7 @@ from typing import List from profiler.advisor.common import constant as const from profiler.advisor.common.timeline.event import TimelineEvent -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ComputationAnalysisDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.utils.utils import get_analyze_processes, ParallelJob @@ -21,7 +21,8 @@ class OpStackFinder: self.task_type = None self.matched_index = set() - def get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: List[str] = None, task_type: str = None, + def get_api_stack_by_op(self, event_dataset: ComputationAnalysisDataset, op_name: List[str] = None, + task_type: str = None, disable_multiprocess=False): """ :Param event_dataset: dataset of timeline event @@ -82,7 +83,13 @@ class OpStackFinder: for op_info in self._stack_record: result.add_detail('operator stacks', detail=op_info) - def _get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: str, task_type: str): + def query_stack(self, event_dataset: ComputationAnalysisDataset): + + if not event_dataset.dataset_len: + return + _ = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index) + + def _get_api_stack_by_op(self, event_dataset: ComputationAnalysisDataset, op_name: str, task_type: str): for _, src_op_event in event_dataset.ops_with_task_type.items(): op_task_type = src_op_event.get(const.TASK_TYPE) @@ -110,6 +117,7 @@ class OpStackFinder: task_id = src_op_event.task_id if not task_id: continue + 
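# remember which timeline event indices matched so query_stack can resolve their call stacks afterwards + 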
self.matched_index.add(dst_op_index) if dst_op_index not in self._task_id_record: self._task_id_record[dst_op_index] = [] @@ -122,7 +130,7 @@ class OpStackFinder: if not dst_op_event: return const.TIMELINE_BACKWARD_NO_STACK_CODE - return dst_op_event.get("dataset_index") + return int(dst_op_event.get("dataset_index")) def _query_index_by_acl_to_npu(self, acl_to_npu_event): if acl_to_npu_event: @@ -148,6 +156,7 @@ class OpStackFinder: return None event = TimelineEvent(event) stack = event.args.get(const.CALL_STACKS) + stack = stack if stack else const.NO_STACK_REASON_MAP.get(const.TIMELINE_BACKWARD_NO_STACK_CODE) for matched_op_info in self._task_id_record.get(index, []): self._stack_record.append([*matched_op_info, stack]) @@ -156,8 +165,3 @@ class OpStackFinder: self._stack_record.append([*matched_op_info, const.NO_STACK_REASON_MAP.get(const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE)]) return None - - def query_stack(self, event_dataset: TimelineEventDataset): - if not event_dataset.dataset_len: - return - _ = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index) diff --git a/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py index 4321793026bf41e9fe51b83d7ada66c1530beaec..950fe13a5b7be7689309db589cc47d43526d7d9d 100644 --- a/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py +++ b/profiler/advisor/analyzer/schedule/gc/gc_analyzer.py @@ -40,3 +40,6 @@ class GcAnalyzer(BaseAnalyzer): gc_checker.make_record(self.result) gc_checker.make_render(self.html_render) return self.result + + def get_priority(self): + return PriorityBackgroundColor.medium \ No newline at end of file diff --git a/profiler/advisor/analyzer/schedule/gc/gc_checker.py b/profiler/advisor/analyzer/schedule/gc/gc_checker.py index 05ef287609b4c7471e75f361a7a1b6c4be04758c..4e5a96118ebe93583a776c3072180391a9b3992b 100644 --- a/profiler/advisor/analyzer/schedule/gc/gc_checker.py +++ b/profiler/advisor/analyzer/schedule/gc/gc_checker.py @@ -81,9 +81,10 @@ class GcChecker: row = [self.rank_id] + row result.add_detail(sub_table_name, detail=row) - def make_render(self, html_render): + def make_render(self, html_render, **kwargs): if not self.gc_issues: return + priority = kwargs.get("priority") show_num = min(self.gc_topk_num, self.abnormal_gc_count) html_render.render_template(key="schedule", template_dir="templates", @@ -92,7 +93,8 @@ class GcChecker: solutions=self.solutions, headers=self.headers, datas=self.abnormal_gc_list[:show_num], - num=show_num) + num=show_num, + priority_background_color=priority) def _init_rule(self): gc_rule_path = os.path.join( diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py index 2786a784087bb449df3f7126e42f713fcf6a3cc6..df8c22fa5161d8f4315748cb629a3dd19b79e39a 100644 --- a/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_analyzer.py @@ -1,30 +1,32 @@ import logging -from typing import List, Dict, Any - from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.result.result import OptimizeResult from profiler.advisor.analyzer.schedule.syncbn.syncbn_checker import SyncBNChecker +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from profiler.advisor.display.html.render import HTMLRender -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from 
profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset logger = logging.getLogger() class SyncBNAnalyzer(BaseAnalyzer): - dataset_cls_list = [TimelineEventDataset] + dataset_cls_list = [ScheduleAnalysisDataset] def __init__(self, collection_path, **kwargs): super().__init__(collection_path, **kwargs) self.result = OptimizeResult() self.html_render = HTMLRender() - key = TimelineEventDataset.get_key() + key = ScheduleAnalysisDataset.get_key() self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) - @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + @BaseAnalyzer.check_data((ScheduleAnalysisDataset.get_key(),)) def optimize(self, **kwargs): syncbn_checker = SyncBNChecker() syncbn_checker.check_syncbn(self.timeline_event_dataset) syncbn_checker.make_record(self.result) - syncbn_checker.make_render(self.html_render) + syncbn_checker.make_render(self.html_render, priority=self.get_priority()) return self.result + + def get_priority(self): + return PriorityBackgroundColor.high \ No newline at end of file diff --git a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py index c0e10448f3f05736c0fb0518fbcf5729244b058b..e83a1549184b2a48f5ddc25ae15f6cece34825c2 100644 --- a/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py +++ b/profiler/advisor/analyzer/schedule/syncbn/syncbn_checker.py @@ -1,7 +1,7 @@ import logging import os -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.cluster_analyse.common_func.file_manager import FileManager @@ -20,7 +20,7 @@ class SyncBNChecker: self.max_syncbn_num = None self._init_rule() - def check_syncbn(self, event_dataset: TimelineEventDataset): + def check_syncbn(self, event_dataset: ScheduleAnalysisDataset): """ :Param event_dataset: dataset of timeline event """ @@ -43,14 +43,17 @@ class SyncBNChecker: for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) - def make_render(self, html_render): + def make_render(self, html_render, **kwargs): if not self.syncbn_issues: return + + priority = kwargs.get("priority") html_render.render_template(key="schedule", template_dir="templates", template_name="sync_batchnorm.html", desc=self.desc, - solutions=self.solutions) + solutions=self.solutions, + priority_background_color=priority) def _init_rule(self): syncbn_rule_path = os.path.join( diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py index d8906504c39141807d45ed1303f0672d6983a2ca..61ec7d1fa602f8359ce2bf9d1ae0297151588ef3 100644 --- a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_analyzer.py @@ -5,28 +5,33 @@ from typing import List, Dict, Any from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.result.result import OptimizeResult from profiler.advisor.analyzer.schedule.synchronize_stream.synchronize_stream_checker import SynchronizeStreamChecker +from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor from 
profiler.advisor.display.html.render import HTMLRender -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset logger = logging.getLogger() class SynchronizeStreamAnalyzer(BaseAnalyzer): - dataset_cls_list = [TimelineEventDataset] + dataset_cls_list = [ScheduleAnalysisDataset] def __init__(self, collection_path, **kwargs): super().__init__(collection_path, **kwargs) self.result = OptimizeResult() self.html_render = HTMLRender() - key = TimelineEventDataset.get_key() + key = ScheduleAnalysisDataset.get_key() self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) - @BaseAnalyzer.check_data((TimelineEventDataset.get_key(),)) + @BaseAnalyzer.check_data((ScheduleAnalysisDataset.get_key(),)) def optimize(self, **kwargs): synchronize_stream_checker = SynchronizeStreamChecker() synchronize_stream_checker.check_synchronize(self.timeline_event_dataset, kwargs.get("profiling_with_stack")) synchronize_stream_checker.make_record(self.result) - synchronize_stream_checker.make_render(self.html_render) + synchronize_stream_checker.make_render(self.html_render, priority=self.get_priority()) return self.result + + + def get_priority(self): + return PriorityBackgroundColor.low \ No newline at end of file diff --git a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py index 83ddd80a0f918d1e7e58e3081e9d15ee936128f2..7af46f766ebbeb33808a4aa943a4ea54166419b0 100644 --- a/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py +++ b/profiler/advisor/analyzer/schedule/synchronize_stream/synchronize_stream_checker.py @@ -1,7 +1,7 @@ import logging from profiler.advisor.common import constant as const -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord from profiler.advisor.analyzer.schedule.timeline_base_checker import TimelineBaseChecker @@ -21,7 +21,7 @@ class SynchronizeStreamChecker(TimelineBaseChecker): self.solutions = [] self.max_synchronize_num = None - def check_synchronize(self, event_dataset: TimelineEventDataset, profiling_with_stack=None): + def check_synchronize(self, event_dataset: ScheduleAnalysisDataset, profiling_with_stack=None): """ :Param event_dataset: dataset of timeline event """ @@ -73,10 +73,10 @@ class SynchronizeStreamChecker(TimelineBaseChecker): for optimization in self.optimization_item: result.add(OptimizeRecord(optimization)) - def make_render(self, html_render): + def make_render(self, html_render, **kwargs): if not self.synchronize_issues: return - + priority = kwargs.get("priority") format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) html_render.render_template(key="schedule", template_dir="templates", @@ -86,4 +86,5 @@ class SynchronizeStreamChecker(TimelineBaseChecker): result=format_result_for_html, with_stack_doc_url=const.TIMELINE_WITH_STACK_DOC_URL, empty_stacks=self.empty_stacks, - framework_black_list=self.framework_black_list) + framework_black_list=self.framework_black_list, + priority_background_color=priority) diff --git a/profiler/advisor/analyzer/schedule/timeline_base_checker.py 
b/profiler/advisor/analyzer/schedule/timeline_base_checker.py index 8bc69150263c11006979f64d12df1dde29a45f15..f481733d4ac2af03ae9622373fed8749c8dc5573 100644 --- a/profiler/advisor/analyzer/schedule/timeline_base_checker.py +++ b/profiler/advisor/analyzer/schedule/timeline_base_checker.py @@ -4,7 +4,7 @@ import logging from profiler.advisor.common import constant as const from profiler.advisor.common.timeline.event import TimelineEvent -from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.dataset.timeline_event_dataset import ScheduleAnalysisDataset from profiler.advisor.result.result import OptimizeResult logger = logging.getLogger() @@ -19,19 +19,11 @@ class TimelineBaseChecker(ABC): self.empty_stacks = True self.framework_black_list = False - @abstractmethod - def make_record(self, result: OptimizeResult): - pass - - @abstractmethod - def make_render(self, html_render): - pass - - def query_stack(self, event_dataset: TimelineEventDataset = None, profiling_with_stack: str = None): + def query_stack(self, event_dataset: ScheduleAnalysisDataset = None, profiling_with_stack: str = None): if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]): return - event_dataset = event_dataset if not profiling_with_stack else TimelineEventDataset( + event_dataset = event_dataset if not profiling_with_stack else ScheduleAnalysisDataset( collection_path=profiling_with_stack, data={}, _datasets={}, analysis_mode="fusion_ops", build_dataset=False) diff --git a/profiler/advisor/common/analyzer_scopes.py b/profiler/advisor/common/analyzer_scopes.py index 652e10b083730eb8c0bad5fc62d9e7247e92b4fe..72b8dd3df2786caa731de0178a5fa1af7ee84fbd 100644 --- a/profiler/advisor/common/analyzer_scopes.py +++ b/profiler/advisor/common/analyzer_scopes.py @@ -33,4 +33,6 @@ class SupportedScopes: SYNCBN = "syncbn" SYNCHRONIZE_STREAM = "synchronize_stream" FREQ_ANALYSIS = "freq_analysis" + MEMORY = "memory" + STAGE_COMPUTE = "stage_compute" GC_ANALYSIS = "gc_analysis" diff --git a/profiler/advisor/common/async_analysis_status.py b/profiler/advisor/common/async_analysis_status.py new file mode 100644 index 0000000000000000000000000000000000000000..f67ca235a97c54cd107308a030a3b82d0eaf3352 --- /dev/null +++ b/profiler/advisor/common/async_analysis_status.py @@ -0,0 +1,7 @@ +class AsyncAnalysisStatus: + FAILED = "failed" + SUCCESS = "success" + ANALYZING = "analyzing" + + FAILED_STATUS_CODE = 400 + NON_FAILED_STATUS_CODE = 200 diff --git a/profiler/advisor/common/constant.py b/profiler/advisor/common/constant.py index c97cfbfd11e27a3d83ea2f9a25ea7870899bcfd1..298e94fc18c171ff9b6e84aba035db782af4809d 100644 --- a/profiler/advisor/common/constant.py +++ b/profiler/advisor/common/constant.py @@ -33,6 +33,7 @@ TASK_TYPE = "Task Type" CPU_OP = "cpu_op" AI_CORE = "AI_CORE" AI_CPU = "AI_CPU" +MIX_AIC = "MIX_AIC" CALL_STACKS = "Call stack" INPUT_DIMS = "Input Dims" OP_SEP = "-" @@ -48,8 +49,7 @@ NO_STACK_REASON_MAP = { TIMELINE_BACKWARD_NO_STACK_CODE: "Backward broadcast, without call stacks in profiling.", TIMELINE_ACL_TO_NPU_NO_STACK_CODE: "Incoming flow is 'acl_to_npu', without call stacks in profiling." 
} -TIMELINE_API_DOC_URL = "https://gitee.com/ascend/mstt/blob/master/profiler/advisor/doc/"\ - "Samples%20of%20Fused%20Operator%20API%20Replacement.md" +TIMELINE_API_DOC_URL = "https://gitee.com/ascend/mstt/blob/master/profiler/advisor/doc/Samples%20of%20Fused%20Operator%20API%20Replacement.md" AFFINITY_TRAINING_API = "Affinity training api" TIMELINE_WITH_STACK_DOC_URL = "https://www.hiascend.com/document/detail/zh/canncommercial/" \ "70RC1/modeldevpt/ptmigr/AImpug_0067.html" @@ -124,20 +124,6 @@ MAX_RETRIES = 3 TIMEOUT = 3 ADVISOR_RULE_PATH = "ADVISOR_RULE_PATH" -# Copyright (c) 2024, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. CLOUD_RULE_PATH = "rules/cloud/" DEFAULT_RULE_PATH = "./rules/" @@ -156,7 +142,17 @@ COMMUNICATION_JSON = "communication.json" BOTTLENECK = "bottleneck" DATA = "data" - +ADVISOR_ANALYSIS_OUTPUT_DIR = "advisor_analysis_result" +DEFAULT_PROCESSES = 8 +CLUSTER_ANALYSIS_FILE_PATTERN = [r'profiler_info_\d+\.json', "step_trace_time.csv", "communication.json", + "communication_matrix.json"] +ANALYSIS_OUTPUT_PATH = "ANALYSIS_OUTPUT_PATH" +DEFAULT_RANK_FOR_PROFILING_ANALYSIS = 0 +PROFILER_INFO_FILE_PATTERN = r"profiler_info_(\d+)\.json" +DISABLE_STREAMINIG_READER = "DISABLE_STREAMINIG_READER" FRAMEWORK_STACK_BLACK_LIST = ["torch", "torch_npu", "megatron", "deepspeed"] DISABLE_STREAMING_READER = "DISABLE_STREAMING_READER" -MAX_FILE_SIZE = 10**10 +MAX_FILE_SIZE = 10 ** 10 +MAX_NUM_PROCESSES = 4 +DEFAULT_STEP = "-1" +STEP_RANK_SEP = "_" diff --git a/profiler/advisor/dataset/cluster/cluster_dataset.py b/profiler/advisor/dataset/cluster/cluster_dataset.py index 445d4c87ed546563ec3b23c156bb3817c6a1e2fc..e268b4092f63a15e5849cac10a3cdf78ffaf959a 100644 --- a/profiler/advisor/dataset/cluster/cluster_dataset.py +++ b/profiler/advisor/dataset/cluster/cluster_dataset.py @@ -15,6 +15,7 @@ import logging import os +import re from profiler.advisor.dataset.dataset import Dataset from profiler.advisor.utils.utils import singleton @@ -81,9 +82,11 @@ class ClusterDataset(Dataset): @singleton class ClusterStepTraceTimeDataset(ClusterDataset): RANK = "rank" + STAGE = "stage" def __init__(self, collection_path: str, data: dict, **kwargs): self._step_dict = defaultdict() + self._stages = [] super().__init__(collection_path, data, **kwargs) def _parse(self): @@ -101,14 +104,31 @@ class ClusterStepTraceTimeDataset(ClusterDataset): step_dict = defaultdict(lambda: [0, 0, 0]) for step_bean in step_data: if step_bean.type == self.RANK: - step_dict[step_bean.index][0] += step_bean.compute - step_dict[step_bean.index][1] += step_bean.communication - step_dict[step_bean.index][2] += step_bean.free + step_rank_record = [] + step = str(step_bean.step).replace(" ", "") or str(const.DEFAULT_STEP) + rank = str(step_bean.index).replace(" ", "") + if step: + step_rank_record.append(step) + if rank: + step_rank_record.append(rank) + + step_rank_index = const.STEP_RANK_SEP.join(step_rank_record) + step_dict[step_rank_index][0] += step_bean.compute + 
step_dict[step_rank_index][1] += step_bean.communication + step_dict[step_rank_index][2] += step_bean.free + if step_bean.type == self.STAGE: + stage = sorted(list(map(int, re.findall(r'\d+', step_bean.stage)))) + if stage in self._stages: + continue + self._stages.append(stage) return step_dict def get_data(self): return self._step_dict + def get_stages(self): + return sorted(self._stages) + @singleton class ClusterCommunicationDataset(ClusterDataset): @@ -158,7 +178,7 @@ class ClusterCommunicationDataset(ClusterDataset): self.hccl_dict.setdefault(comm_group, defaultdict(lambda: defaultdict(list))) for step, step_dict in group_dict.items(): for op, op_dict in step_dict.items(): - self.compute_bandwidth(op_dict) + self.compute_bandwidth(step.lower().lstrip("step") or str(const.DEFAULT_STEP), op_dict) self.process_hccl_info(comm_group, step, op, op_dict) def process_hccl_info(self, group, step, op, op_dict): @@ -175,7 +195,7 @@ class ClusterCommunicationDataset(ClusterDataset): msg = "[ERROR] Cluster_communication.json has invalid structure." raise ValueError(msg) from e - def compute_bandwidth(self, op_dict: dict): + def compute_bandwidth(self, step, op_dict: dict): for rank_id, rank_dict in op_dict.items(): try: rank = int(rank_id) @@ -184,17 +204,17 @@ class ClusterCommunicationDataset(ClusterDataset): raise ValueError(msg) from e for comm_type, bw_dict in rank_dict.get(self.COMMUNICATION_BANDWIDTH_INFO, {}).items(): if comm_type == self.SDMA: - self.rank_bw_dict[rank][self.SDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE) - self.rank_bw_dict[rank][self.SDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME) + self.rank_bw_dict[f"{step}{const.STEP_RANK_SEP}{rank}"][self.SDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE) + self.rank_bw_dict[f"{step}{const.STEP_RANK_SEP}{rank}"][self.SDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME) if comm_type == self.RDMA: - self.rank_bw_dict[rank][self.RDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE) - self.rank_bw_dict[rank][self.RDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME) - - for rank, rank_dict in self.rank_bw_dict.items(): - self.rank_bw_dict[rank][self.RDMA_BANDWIDTH] = self.compute_ratio( - self.rank_bw_dict[rank][self.RDMA_SIZE_MB], self.rank_bw_dict[rank][self.RDMA_TIME_MS]) - self.rank_bw_dict[rank][self.SDMA_BANDWIDTH] = self.compute_ratio( - self.rank_bw_dict[rank][self.SDMA_SIZE_MB], self.rank_bw_dict[rank][self.SDMA_TIME_MS]) + self.rank_bw_dict[f"{step}{const.STEP_RANK_SEP}{rank}"][self.RDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE) + self.rank_bw_dict[f"{step}{const.STEP_RANK_SEP}{rank}"][self.RDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME) + + for step_rank in self.rank_bw_dict.keys(): + self.rank_bw_dict[step_rank][self.RDMA_BANDWIDTH] = self.compute_ratio( + self.rank_bw_dict[step_rank][self.RDMA_SIZE_MB], self.rank_bw_dict[step_rank][self.RDMA_TIME_MS]) + self.rank_bw_dict[step_rank][self.SDMA_BANDWIDTH] = self.compute_ratio( + self.rank_bw_dict[step_rank][self.SDMA_SIZE_MB], self.rank_bw_dict[step_rank][self.SDMA_TIME_MS]) def get_data(self): return self.rank_bw_dict diff --git a/profiler/advisor/dataset/cluster/cluster_step_trace_time_bean.py b/profiler/advisor/dataset/cluster/cluster_step_trace_time_bean.py index b108fc77a3f3408d48c79ce6b542f98427d88b0b..8ae0e55f2a5fbc05304fd95809e9b69220dfd3e5 100644 --- a/profiler/advisor/dataset/cluster/cluster_step_trace_time_bean.py +++ b/profiler/advisor/dataset/cluster/cluster_step_trace_time_bean.py @@ -65,3 +65,6 @@ class ClusterStepTraceTimeBean: msg = "[ERROR] Cluster step trace time.csv has 
invalid value in column 'Free'." raise ValueError(msg) from e + @property + def stage(self) -> int: + return self._data.get(self.INDEX) diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py index c76314641032968a19090d604d53558ff91cae92..44bbc141d7f9f9abc89ae1517d28dd78dce8fc1d 100644 --- a/profiler/advisor/dataset/timeline_event_dataset.py +++ b/profiler/advisor/dataset/timeline_event_dataset.py @@ -1,101 +1,47 @@ +import inspect import logging -import os -from typing import List, Any import traceback +from collections import OrderedDict import ijson from tqdm import tqdm from profiler.advisor.common import constant as const from profiler.advisor.common.timeline.event import TimelineEvent -from profiler.advisor.utils.utils import get_file_path_from_directory, check_path_valid, singleton -from profiler.cluster_analyse.common_func.file_manager import FileManager +from profiler.advisor.utils.utils import get_file_path_from_directory, check_path_valid, singleton, convert_to_float +from profiler.advisor.dataset.timeline_op_collector.timeline_op_collector import ( + OpCompileCollector, + SynchronizeStreamCollector, + MemCollector, + DataloaderCollector, + SyncBNCollector, + AtenCollector, + OptimizerCollector, + FrequencyCollector, + SpecificTaskTypeOpCollector, + TorchToNpuCollector, + AclToNpuCollector, + OpStackCollector, + StepCollector, + GcCollector +) logger = logging.getLogger() -class OpCompileCollector: - def __init__(self): - self._total_op_compile_counter = 0 - self._total_op_compile_time = 0.0 +class BaseTimelineEventDataset: + PROFILER_STEP_PREFIX = "ProfilerStep" - @property - def total_time(self): - return self._total_op_compile_time - - @property - def total_count(self): - return self._total_op_compile_counter - - def is_empty(self): - return self._total_op_compile_counter == 0 - - def update(self, event: TimelineEvent): - self._total_op_compile_time += float(event.dur) - self._total_op_compile_counter += 1 - - def unset(self): - self._total_op_compile_counter = 0 - self._total_op_compile_time = 0.0 - - -class SynchronizeStreamCollector: - - def __init__(self): - self._synchronize_stream_count = 0 - self._slow_synchronize_stream = [] - self.rule = SynchronizeStreamCollector._load_rule() - - @property - def total_count(self): - return self._synchronize_stream_count - - @property - def slow_synchronize_stream(self): - return self._slow_synchronize_stream - - @staticmethod - def _load_rule(): - sync_stream_rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "rules", - "synchronize.yaml") - - sync_stream_rule = FileManager.read_yaml_file(sync_stream_rule_path) - return sync_stream_rule - - def update_sync_stream_count(self): - self._synchronize_stream_count += 1 - - def append_slow_sync_stream(self, event): - if float(event.dur) / 1000 >= self.rule.get("slow_synchronize_threshold", 10): - self._slow_synchronize_stream.append(event) - - def unset(self): - self._synchronize_stream_count = 0 - self._slow_synchronize_stream = [] - - -@singleton -class TimelineEventDataset: + collector_map = {} def __init__(self, collection_path, data: dict, build_dataset=True, **kwargs) -> None: - self._ops_with_task_type = {} - self._ops_with_stack = {} - self._ops_compile = OpCompileCollector() - self._torch_to_npu = {} - self._acl_to_npu = set() - self._aten: List[Any] = [] - self._optimizer: List[Any] = [] - self._dataloader: List[Any] = [] - self._sync_batchnorm: List[Any] = [] - self._gc: List[Any] = 
[] - self._synchronize_stream = SynchronizeStreamCollector() self.timeline_dir = collection_path + self.profiler_step = [] self.timeline_data_list = get_file_path_from_directory(collection_path, lambda file: file.endswith("trace_view.json")) self.dataset_len = None - self.analysis_mode = kwargs.get("analysis_mode") - self.task_type = kwargs.get("task_type") - + self.step = kwargs.get("step") + self.step_duration = None if not build_dataset: return @@ -105,59 +51,6 @@ class TimelineEventDataset: data[key] = [] data[key].append(self) - if self.analysis_mode in ["op_stack", "all"]: - self._task_op_names = list(set([event_key.split("-")[0] for event_key in self._ops_with_task_type.keys()])) - - self._post_process() - - @property - def ops_with_stack(self): - return self._ops_with_stack - - @property - def ops_compile(self): - return self._ops_compile - - @property - def torch_to_npu(self): - return self._torch_to_npu - - @property - def acl_to_npu(self): - return self._acl_to_npu - - @property - def ops_with_task_type(self): - return self._ops_with_task_type - - @property - def task_op_names(self): - return self._task_op_names - - @property - def optimizer(self): - return self._optimizer - - @property - def aten(self): - return self._aten - - @property - def dataloader(self): - return self._dataloader - - @property - def sync_batchnorm(self): - return self._sync_batchnorm - - @property - def gc_events(self): - return self._gc - - @property - def synchronize_stream(self): - return self._synchronize_stream - @classmethod def get_key(cls): """ @@ -166,6 +59,23 @@ class TimelineEventDataset: """ return cls.__module__.rsplit('.', maxsplit=1)[-1] + def get_post_process_kwargs(self, func_name): + kwargs = {} + if func_name == FrequencyCollector.__name__: + ops_with_task_type = getattr(self, "ops_with_task_type", {}).values() + kwargs["ai_core_ops"] = [op for op in ops_with_task_type if + op.get(const.TASK_TYPE) in [const.AI_CORE, const.MIX_AIC]] + return kwargs + + def add_event(self, index, event): + event["dataset_index"] = index + if not isinstance(event, TimelineEvent): + event = TimelineEvent(event) + + for _, collector in self.collector_map.items(): + collector.add_op(event) + return True + def parse(self): if len(self.timeline_data_list) == 0: @@ -173,10 +83,10 @@ class TimelineEventDataset: return False if len(self.timeline_data_list) > 1: - logger.warning("Found multiple trace_view.json in %s, load the file of device 0 for analysis.", + logger.warning("Found multiple trace_view.json in %s, load the file of device 0 for analysis .", self.timeline_dir) - result = self.parse_data_with_generator(self._add_event) + result = self.parse_data_with_generator(self.add_event) if not self.dataset_len: self.dataset_len = len(result) @@ -202,137 +112,100 @@ class TimelineEventDataset: timeline_data_path) return result - def _add_ops_with_task_type(self, event): - key = f"{event.name}-{event.ts}" - self._ops_with_task_type[key] = TimelineEvent( - { - const.TASK_TYPE: event.args.get(const.TASK_TYPE), - "task_id": event.args.get("Task Id"), - "tid": event.tid, - "name": event.name, - "ts": str(event.ts) - } - ) - - def _add_ops_with_stack(self, event): - self._ops_with_stack[str(event.ts)] = TimelineEvent({"name": event.name, "dataset_index": event.dataset_index}) - - def _add_torch_to_npu(self, event): - key = f"{event.ph}-{event.id}" - self._torch_to_npu[key] = TimelineEvent({"tid": event.tid, "ts": str(event.ts)}) - - def _add_acl_to_npu(self, event): - # op with task type equals to ai_cpu which derived 
from acl_to_npu do not have stacks - self._acl_to_npu.add(str(event.ts)) - - def _add_op_compile(self, event: TimelineEvent): - if event.name == const.OP_COMPILE_NAME or event.args.get("id") == const.OP_COMPILE_ID: - self._ops_compile.update(event) - - def _add_gc(self, event: TimelineEvent): - if event.get("cat") and event.get("cat").lower() == 'gc': - self._gc.append(event) - - def _add_optimizer(self, event: TimelineEvent): - self._optimizer.append(TimelineEvent({"name": event.name, "dataset_index": event.dataset_index})) - - def _add_aten(self, event: TimelineEvent): - self._aten.append(TimelineEvent({ - "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur - })) - - def _add_dataloader(self, event: TimelineEvent): - if "dataloader" in event.name.lower(): - self._dataloader.append(TimelineEvent({ - "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur, - "stack": event.args.get("Call stack") - })) - - def _add_sync_batchnorm(self, event: TimelineEvent): - if event.name.lower() == "syncbatchnorm": - self._sync_batchnorm.append(TimelineEvent({ - "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur - })) - - def _add_synchronize(self, event: TimelineEvent): - if event.name.startswith(const.SYNC_STREAM): - self._synchronize.append(TimelineEvent({ - "name": event.name, "ts": event.ts, "dur": event.dur - })) - - def _add_specific_operator(self, event): - # for analysis of operator aclOpCompile, enable jit_compILE=False - self._add_op_compile(event) - # for analysis of slow dataloader.__next__ - self._add_dataloader(event) - # for analysis of syncBatchNorm operator, prompt users to replace source code of torch_npu's syncbn - self._add_sync_batchnorm(event) - # for analysis of GcAnalyzer - self._add_gc(event) - - def _add_event(self, index, event): - event["dataset_index"] = index - if not isinstance(event, TimelineEvent): - event = TimelineEvent(event) - - self._add_specific_operator(event) - - if self.analysis_mode == "fusion_ops": - self._add_event_for_fusion_ops(event) - elif self.analysis_mode == "op_stack": - self._add_event_for_op_stack(event) + def _get_target_ops_by_step(self, op_list): + target_ops = [] + if not self.profiler_step: + return op_list + if not self.step or f"ProfilerStep#{self.step}" not in [event.name for event in self.profiler_step]: + target_ops = op_list + if self.profiler_step: + self.step_duration = convert_to_float(self.profiler_step[-1].dur) else: - self._add_event_for_fusion_ops(event) - self._add_event_for_op_stack(event) - return True - - def _add_event_for_fusion_ops(self, event): - if event.name.lower().startswith(f"{const.ATEN}{const.ATEN_SEP}") or event.name.lower().startswith( - f"{const.NPU}{const.ATEN_SEP}"): - self._add_aten(event) - return - - # 检查cann层同步操作,根据时间窗口索引到host侧的aten算子并给出堆栈 - if event.name.startswith(const.SYNC_STREAM): - self._add_aten(event) + for step_event in self.profiler_step: + if step_event.name != f"ProfilerStep#{self.step}": + continue + self.step_duration = convert_to_float(step_event.dur) + for op_event in op_list: + if step_event.ts_include(op_event): + target_ops.append(op_event) + target_ops.sort(key=lambda x: convert_to_float(x.ts)) + return target_ops + + def _collector_post_process(self): + # 按step过滤collector中的算子,并将过滤后的算子设置为当前dataset的property,与原始TimelineEventDataset的property保持一致 + for collector_name, collector in self.collector_map.items(): + logger.debug("Start post process for operator collector: %s", 
collector_name) + if collector.require_filter_by_step: + logger.debug("Operator Collector %s requires filter ops by step %s", collector_name, self.step) + target_op_list = self._get_target_ops_by_step(collector.op_list) + else: + logger.debug("Operator Collector %s use operators of all step for analysis", collector_name) + target_op_list = collector.op_list - if event.name.startswith(f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}"): - self._add_optimizer(event) - return + logger.debug("Source number of ops is %s, number of ops after filtered by rank is %s", + len(collector.op_list), len(target_op_list)) - def _add_event_for_op_stack(self, event): - if event.name.lower() == const.TORCH_TO_NPU: - self._add_torch_to_npu(event) - return + collector_kwargs = self.get_post_process_kwargs(collector_name) + collector.post_process(target_op_list, **collector_kwargs) + for property_name, property_value in collector.attribute_to_dataset.items(): + setattr(self, property_name, property_value) - if event.args.get(const.CALL_STACKS): - self._add_ops_with_stack(event) - return - if event.args.get(const.TASK_TYPE) and event.args.get(const.TASK_TYPE) in [const.AI_CORE, const.AI_CPU]: - self._add_ops_with_task_type(event) - return +@singleton +class ScheduleAnalysisDataset(BaseTimelineEventDataset): + collector_map = OrderedDict( + StepCollector=StepCollector(), + MemCollector=MemCollector(), + OpCompileCollector=OpCompileCollector(), + SynchronizeStreamCollector=SynchronizeStreamCollector(), + DataloaderCollector=DataloaderCollector(), + SyncBNCollector=SyncBNCollector(), + AtenCollector=AtenCollector(), + OptimizerCollector=OptimizerCollector(), + GcCollector=GcCollector() + ) - if event.name and event.ts and event.name == const.ACL_TO_NPU: - self._add_acl_to_npu(event) - return + def __init__(self, collection_path, data: dict, build_dataset=True, **kwargs) -> None: + super().__init__(collection_path, data, build_dataset, **kwargs) + self.aten = None + self.synchronize_stream = None + self._collector_post_process() + self._post_process() def _post_process(self): # eliminate sub aten operator of the first level aten operator by 'ts' and 'dur', # keep the first level aten operator contiguous formated_atens = [] - for event in sorted(self._aten, key=lambda x: x.get("ts", -1)): + if not hasattr(self, "aten") or not hasattr(self, "synchronize_stream"): + return + + for event in sorted(self.aten, key=lambda x: x.get("ts", -1)): if event.name.startswith(const.ATEN): if not formated_atens or not formated_atens[-1].ts_include(event): formated_atens.append(event) elif event.name.startswith(const.SYNC_STREAM): - self._synchronize_stream.update_sync_stream_count() - if formated_atens[-1].ts_include(event): + self.synchronize_stream.update_sync_stream_count() + if formated_atens and formated_atens[-1].ts_include(event): # 使用aten算子的索引,用于查询堆栈 event["dataset_index"] = formated_atens[-1].get("dataset_index") - self._synchronize_stream.append_slow_sync_stream(event) + self.synchronize_stream.append_slow_sync_stream(event) else: continue - self._aten = formated_atens + self.aten = formated_atens + + +class ComputationAnalysisDataset(BaseTimelineEventDataset): + collector_map = OrderedDict( + StepCollector=StepCollector(), + SpecificTaskTypeOpCollector=SpecificTaskTypeOpCollector(), + TorchToNpuCollector=TorchToNpuCollector(), + AclToNpuCollector=AclToNpuCollector(), + OpStackCollector=OpStackCollector(), + FrequencyCollector=FrequencyCollector(), + ) + + def __init__(self, collection_path, data: dict, 
build_dataset=True, **kwargs) -> None: + super().__init__(collection_path, data, build_dataset, **kwargs) + self._collector_post_process() diff --git a/profiler/advisor/dataset/timeline_op_collector/__init__.py b/profiler/advisor/dataset/timeline_op_collector/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py b/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py new file mode 100644 index 0000000000000000000000000000000000000000..56e6165dd24aa4d7a8aafaab455793c6c8df8e13 --- /dev/null +++ b/profiler/advisor/dataset/timeline_op_collector/timeline_op_collector.py @@ -0,0 +1,376 @@ +import logging +import math +import os +from abc import abstractmethod, ABCMeta + +from profiler.advisor.common import constant as const +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.utils.utils import convert_to_float +from profiler.cluster_analyse.common_func.file_manager import FileManager + +logger = logging.getLogger() + + +class BaseOpCollector(metaclass=ABCMeta): + + def __init__(self): + self.attribute_to_dataset = {} + self.op_list = [] + self.require_filter_by_step = True + + @abstractmethod + def add_op(self): + """ add timeline event into self.op_list, and then will filter event in self.op_list by specific step + """ + pass + + @abstractmethod + def post_process(self): + """ convert self.op_list to required format like dict, set and so on and then record the final object into + self.attribute_to_dataset which used to set property of timeline event dataset + """ + pass + + +class StepCollector(BaseOpCollector): + KEY_WORD = "ProfilerStep" + + def __init__(self): + super().__init__() + self.require_filter_by_step = False + + def add_op(self, event): + if event.name.startswith(self.KEY_WORD): + self.op_list.append(event) + + def post_process(self, *args, **kwargs): + self.attribute_to_dataset["profiler_step"] = self.op_list + + +class OpCompileCollector(BaseOpCollector): + def __init__(self): + super().__init__() + self._total_op_compile_counter = 0 + self._total_op_compile_time = 0.0 + + @property + def total_time(self): + return self._total_op_compile_time + + @property + def total_count(self): + return self._total_op_compile_counter + + def is_empty(self): + return self._total_op_compile_counter == 0 + + def update(self, event: TimelineEvent): + self._total_op_compile_time += float(event.dur) + self._total_op_compile_counter += 1 + + def unset(self): + self._total_op_compile_counter = 0 + self._total_op_compile_time = 0.0 + + def add_op(self, event): + if event.name == const.OP_COMPILE_NAME or event.args.get("id") == const.OP_COMPILE_ID: + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + for op in target_op_list: + self.update(op) + + self.attribute_to_dataset["ops_compile"] = self + + +class SynchronizeStreamCollector(BaseOpCollector): + + def __init__(self): + super().__init__() + self._synchronize_stream_count = 0 + self._slow_synchronize_stream = [] + self.rule = SynchronizeStreamCollector._load_rule() + + @property + def total_count(self): + return self._synchronize_stream_count + + @property + def slow_synchronize_stream(self): + return self._slow_synchronize_stream + + @staticmethod + def _load_rule(): + sync_stream_rule_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + "rules", + 
"synchronize.yaml") + + sync_stream_rule = FileManager.read_yaml_file(sync_stream_rule_path) + return sync_stream_rule + + def update_sync_stream_count(self): + self._synchronize_stream_count += 1 + + def append_slow_sync_stream(self, event): + if float(event.dur) / 1000 >= self.rule.get("slow_synchronize_threshold", 10): + self._slow_synchronize_stream.append(event) + + def unset(self): + self._synchronize_stream_count = 0 + self._slow_synchronize_stream = [] + + def add_op(self, event): + return self.op_list + + def post_process(self, *args, **kwargs): + self.attribute_to_dataset["synchronize_stream"] = self + + +class MemCollector(BaseOpCollector): + MEMORY_OP_NAME = ["AscendCL@aclMallocMemInner", "AscendCL@aclrtFreePhysical"] + + def __init__(self): + super().__init__() + self.mem_op_info = {} + self.rule = self._load_rule() + + @staticmethod + def _load_rule(): + memory_rule_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + "rules", + "memory.yaml") + + memory_rule = FileManager.read_yaml_file(memory_rule_path) + return memory_rule + + def add_op(self, event): + if event.name not in self.MEMORY_OP_NAME: + return + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + for op in target_op_list: + if op.name not in self.mem_op_info: + self.mem_op_info[op.name] = dict(count=0, total_dur=0) + self.mem_op_info[op.name]["count"] += 1 + self.mem_op_info[op.name]["total_dur"] += float(op.dur) + + self.attribute_to_dataset["memory_ops"] = self + + +class DataloaderCollector(BaseOpCollector): + key_word = "dataloader" + + def __init__(self): + super().__init__() + + def add_op(self, event): + if self.key_word in event.name.lower(): + self.op_list.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur, + "stack": event.args.get("Call stack") + })) + + def post_process(self, *args, **kwargs): + self.attribute_to_dataset["dataloader"] = self.op_list + + +class SyncBNCollector(BaseOpCollector): + key_word = "syncbatchnorm" + + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.lower() == self.key_word: + self.op_list.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur + })) + + def post_process(self, target_op_list, **kwargs): + self.attribute_to_dataset["sync_batchnorm"] = target_op_list + + +class AtenCollector(BaseOpCollector): + + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.lower().startswith(f"{const.ATEN}{const.ATEN_SEP}") or event.name.lower().startswith( + f"{const.NPU}{const.ATEN_SEP}"): + self._add_aten(event) + return + + # 检查cann层同步操作,根据时间窗口索引到host侧的aten算子并给出堆栈 + if event.name.startswith(const.SYNC_STREAM): + self._add_aten(event) + + def post_process(self, target_op_list, **kwargs): + self.attribute_to_dataset["aten"] = target_op_list + + def _add_aten(self, event: TimelineEvent): + self.op_list.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur + })) + + +class OptimizerCollector(BaseOpCollector): + + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.startswith(f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}"): + self.op_list.append(TimelineEvent( + {"name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur})) + + def post_process(self, target_op_list, 
**kwargs): + self.attribute_to_dataset["optimizer"] = target_op_list + + +class FrequencyCollector(BaseOpCollector): + KEY_WORD = "AI Core Freq" + + def __init__(self): + super().__init__() + self._previous_freq_index = -1 + + @staticmethod + def get_op_frequency(ai_core_ops, ai_core_freq): + ai_core_freq.sort(key=lambda x: float(x.ts)) + op_freq_record = {} + + op_index, freq_index = 0, 0 + while op_index < len(ai_core_ops) and freq_index < len(ai_core_freq): + op_event = ai_core_ops[op_index] + op_end_time = convert_to_float(op_event.ts) + convert_to_float(op_event.dur) + op_freq_list = [] + while freq_index < len(ai_core_freq): + freq_event = ai_core_freq[freq_index] + if convert_to_float(freq_event.end) < op_end_time: + op_freq_list.append(convert_to_float(freq_event.args.MHz)) + freq_index += 1 + continue + elif convert_to_float(freq_event.ts) < op_end_time: + if op_event.name not in op_freq_record: + op_freq_record[op_event.name] = {"count": 0, "dur": 0, "freq_list": []} + op_freq_record[op_event.name]["count"] += 1 + op_freq_record[op_event.name]["dur"] += convert_to_float(op_event.dur) + op_freq_list.append(convert_to_float(freq_event.args.MHz)) + op_freq_record[op_event.name]["freq_list"].append(min(op_freq_list)) + break + else: + break + + op_index += 1 + return op_freq_record + + def add_op(self, event): + if event.name == self.KEY_WORD: + if self._previous_freq_index != -1: + self.op_list[self._previous_freq_index]["end"] = event.get("ts", float(math.inf)) + self._previous_freq_index += 1 + event.setdefault("end", float(math.inf)) + self.op_list.append(event) + + def post_process(self, target_op_list, **kwargs): + ai_core_ops = kwargs.get("ai_core_ops", []) + if not ai_core_ops: + return + ai_core_ops.sort(key=lambda x: float(x.ts)) + op_freq = FrequencyCollector.get_op_frequency(ai_core_ops, target_op_list) + self.attribute_to_dataset["op_freq"] = op_freq + + +class SpecificTaskTypeOpCollector(BaseOpCollector): + + def __init__(self, op_type_list=None): + super().__init__() + self.op_type_list = op_type_list if op_type_list else [const.AI_CPU, const.AI_CORE, const.MIX_AIC] + + def add_op(self, event): + if event.args.get(const.TASK_TYPE) and event.args.get(const.TASK_TYPE) in self.op_type_list: + self.op_list.append( + TimelineEvent( + { + const.TASK_TYPE: event.args.get(const.TASK_TYPE), + "task_id": event.args.get("Task Id"), + "tid": event.tid, + "name": event.name, + "ts": str(event.ts), + "dur": str(event.dur) + } + ) + ) + + def post_process(self, target_op_list, **kwargs): + op_map = dict() + for op in target_op_list: + key = f"{op.name}-{op.ts}" + op_map[key] = op + + self.attribute_to_dataset["ops_with_task_type"] = op_map + self.attribute_to_dataset["task_op_names"] = list( + set([event_key.split("-")[0] for event_key in op_map.keys()])) + + +class TorchToNpuCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name.lower() == const.TORCH_TO_NPU: + self.op_list.append(TimelineEvent({"tid": event.tid, "ts": str(event.ts), "ph": event.ph, "id": event.id})) + + def post_process(self, target_op_list, **kwargs): + op_map = dict() + for op in target_op_list: + key = f"{op.ph}-{op.id}" + op_map[key] = op + + self.attribute_to_dataset["torch_to_npu"] = op_map + + +class AclToNpuCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.name and event.ts and event.name == const.ACL_TO_NPU: + self.op_list.append(TimelineEvent({"ts": event.ts})) + + def 
post_process(self, target_op_list, **kwargs): + op_record = set(str(op.ts) for op in target_op_list) + self.attribute_to_dataset["acl_to_npu"] = op_record + + +class OpStackCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.args.get(const.CALL_STACKS): + self.op_list.append( + TimelineEvent({"name": event.name, "dataset_index": event.dataset_index, "ts": event.ts})) + + def post_process(self, target_op_list, **kwargs): + op_map = dict() + for op in target_op_list: + op_map[str(op.ts)] = op + + self.attribute_to_dataset["ops_with_stack"] = op_map + + +class GcCollector(BaseOpCollector): + def __init__(self): + super().__init__() + + def add_op(self, event): + if event.cat and isinstance(event.cat, str) and event.cat.lower() == "gc": + self.op_list.append(TimelineEvent( + {"name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur})) + + def post_process(self, target_op_list, **kwargs): + self.attribute_to_dataset["gc_events"] = self.op_list diff --git a/profiler/advisor/display/html/priority_background_color.py b/profiler/advisor/display/html/priority_background_color.py new file mode 100644 index 0000000000000000000000000000000000000000..7da61a093099bc6e9fedf40367176d500b04b185 --- /dev/null +++ b/profiler/advisor/display/html/priority_background_color.py @@ -0,0 +1,4 @@ +class PriorityBackgroundColor: + high = "#B5495B" + medium = "#fcaf17" + low = "#65c294" diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py index 3984fa8f34f0858a7281c9b51caaa43a170baf86..0c1882f133e5ec8a94c8a1ee80d9c53f7bda989b 100644 --- a/profiler/advisor/display/html/render.py +++ b/profiler/advisor/display/html/render.py @@ -1,7 +1,7 @@ import os import logging from typing import List, Dict -from collections import defaultdict +from collections import defaultdict, OrderedDict from jinja2 import Environment, FileSystemLoader from profiler.advisor.common import constant @@ -14,31 +14,72 @@ logger = logging.getLogger() @singleton class HTMLRender: + SUPPORTED_KEYS = ["main", "overall", "comparison", "computation", "schedule", "communication", "dataloader", + "memory"] + PERFORMANCE_PROBLEM_ANALYSIS = "performance_problem_analysis" + def __init__(self): self.html = "" self.render_list = defaultdict(list) def render_html(self, template_dir: str = "templates", template_name: str = "main.html", template_header=constant.DEFAULT_TEMPLATE_HEADER): - self.html = self.render_template("main", template_dir, template_name, render_list=self.render_list, + + # 确保overall 和 comparison 在 performance problem analysis 之前 + sorted_render_htmls = OrderedDict() + for key in ["overall", "comparison"]: + if key in self.render_list: + sorted_render_htmls[key] = self.render_list.get(key) + for key, html in self.render_list.items(): + if key in sorted_render_htmls: + continue + sorted_render_htmls[key] = html + + self.html = self.render_template("main", template_dir, template_name, render_list=sorted_render_htmls, template_header=template_header) - def render_template(self, key: str, template_dir: str, template_name: str, **kwargs): + def get_rendered_html(self, key: str, template_dir: str, template_name: str, **kwargs): + if key not in self.SUPPORTED_KEYS: + error_msg = f"Error render template key {key}, optionals are {self.SUPPORTED_KEYS}" + logger.error(error_msg) + raise Exception(error_msg) + if not os.path.isabs(template_dir): template_dir = os.path.join(os.path.dirname(__file__), template_dir) env = 
Environment(loader=FileSystemLoader(template_dir), autoescape=True) template = env.get_template(template_name) + if "priority" not in kwargs: + kwargs["priority"] = "low priority" rendered_html = template.render(**kwargs) - self.render_list[key].append(rendered_html) + return rendered_html + + def render_template(self, key: str, template_dir: str, template_name: str, **kwargs): + rendered_html = self.get_rendered_html(key, template_dir, template_name, **kwargs) + + if not kwargs.get("add_render_list", True): + return rendered_html + + if key in ["main", "overall", "comparison"]: + if key not in self.render_list: + self.render_list[key] = [] + self.render_list[key].append(rendered_html) + else: + if self.PERFORMANCE_PROBLEM_ANALYSIS not in self.render_list: + self.render_list[self.PERFORMANCE_PROBLEM_ANALYSIS] = {} + if key not in self.render_list[self.PERFORMANCE_PROBLEM_ANALYSIS]: + self.render_list[self.PERFORMANCE_PROBLEM_ANALYSIS][key] = [] + self.render_list[self.PERFORMANCE_PROBLEM_ANALYSIS][key].append(rendered_html) + return rendered_html def save_to_file(self, save_path: str): + save_path = os.path.join(Config().work_path, save_path) if not save_path.endswith(".html"): logger.error("Skip save html file because file name must endswith `.html`, " "but got %s.", os.path.basename(save_path)) return safe_write(self.html, save_path) - logger.info("Save suggestion to %s.", os.path.join(Config().work_path, save_path)) + logger.info("Save suggestion to %s.", save_path) diff --git a/profiler/advisor/display/html/templates/affinity_api.html b/profiler/advisor/display/html/templates/affinity_api.html index 4d12c3e37536392d122f85fc6ef3a4fcc123ef77..e9f3dd29c433c6d2481fe755ab5426d42f94a50f 100644 --- a/profiler/advisor/display/html/templates/affinity_api.html +++ b/profiler/advisor/display/html/templates/affinity_api.html @@ -1,11 +1,11 @@ {% if result|length > 0 %}
 [markup lost in extraction: this hunk removes the "Affinity API Issues" heading block and re-adds it
  with the same visible text, and re-tags the runtime-environment lines "The analysis results of
  following affinity APIs are based on runtime env cann-{{ cann_version }} and torch-{{ torch_version }}";
  only HTML attributes change, presumably to apply the new priority background color to the card heading.]
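For reviewers tracing how the priority colour reaches templates such as this one: the analyzer picks a PriorityBackgroundColor, passes it to its checker's make_render, and the checker forwards it to HTMLRender.render_template as priority_background_color. A minimal sketch of that pattern follows; DemoChecker and demo.html are hypothetical stand-ins, while the imports and the render_template signature are taken from this patch.

```python
from profiler.advisor.display.html.priority_background_color import PriorityBackgroundColor
from profiler.advisor.display.html.render import HTMLRender


class DemoChecker:
    """Hypothetical checker illustrating how a priority colour is forwarded to a template."""

    def make_render(self, html_render: HTMLRender, **kwargs):
        # The analyzer calls make_render(..., priority=self.get_priority()); fall back to low.
        priority = kwargs.get("priority", PriorityBackgroundColor.low)
        html_render.render_template(key="schedule",              # must be one of HTMLRender.SUPPORTED_KEYS
                                    template_dir="templates",
                                    template_name="demo.html",   # hypothetical template file
                                    priority_background_color=priority)


# Usage, mirroring SynchronizeStreamAnalyzer.optimize():
#   checker.make_render(html_render, priority=analyzer.get_priority())
```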
@@ -13,7 +13,7 @@ Suggestion: These APIs have no code stack. If parameter 'with_stack=False' was set while profiling, please refer to Ascend PyTorch Profiler to set - 'with_stack=True'. Otherwise, ignore following affinity APIs due to backward broadcast lack of stack. + 'with_stack=True'. Otherwise, ignore following affinity APIs due to backward broadcast lack of stack. {% endif %} {% for api_name, stacks in result.items() %} diff --git a/profiler/advisor/display/html/templates/ai_core_frequency.html b/profiler/advisor/display/html/templates/ai_core_frequency.html index d04514203733b445ecb6ce2b69435ce5a86e353d..9e5f34cefed3c3ae3ba176cc54c1ff5875bedcbb 100644 --- a/profiler/advisor/display/html/templates/ai_core_frequency.html +++ b/profiler/advisor/display/html/templates/ai_core_frequency.html @@ -1,6 +1,6 @@ {% if data|length > 0 %}
 [markup lost in extraction: the "AI CORE Frequency Issues" heading block is removed and re-added with
  identical visible text (an attribute-only change, presumably the priority background color styling);
  the following context line "Issue: {{ desc }}" is unchanged.]
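The data behind this frequency card now comes from the collector pipeline added in timeline_op_collector.py: every trace event is offered to each collector through add_op, the kept events are optionally filtered to the selected ProfilerStep, and post_process publishes the result through attribute_to_dataset, which _collector_post_process turns into dataset attributes. A minimal custom collector under those assumptions might look as follows; the class name KernelLaunchCollector, the "LaunchKernel" keyword, and the kernel_launches attribute are illustrative, not part of this patch.

```python
from profiler.advisor.common.timeline.event import TimelineEvent
from profiler.advisor.dataset.timeline_op_collector.timeline_op_collector import BaseOpCollector


class KernelLaunchCollector(BaseOpCollector):
    """Illustrative collector: keeps events whose name contains a keyword."""

    KEY_WORD = "LaunchKernel"  # hypothetical keyword, not taken from this patch

    def add_op(self, event):
        # Called once per event while the dataset streams trace_view.json.
        if self.KEY_WORD in event.name:
            self.op_list.append(TimelineEvent({"name": event.name, "ts": event.ts, "dur": event.dur}))

    def post_process(self, target_op_list, **kwargs):
        # target_op_list is op_list after the per-step filter (require_filter_by_step defaults to True).
        # Anything stored here becomes a dataset attribute, e.g. dataset.kernel_launches.
        self.attribute_to_dataset["kernel_launches"] = target_op_list
```

To take effect, such a collector would be listed in a dataset's collector_map, alongside StepCollector in ScheduleAnalysisDataset or ComputationAnalysisDataset.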
diff --git a/profiler/advisor/display/html/templates/main.html b/profiler/advisor/display/html/templates/main.html index 3727125b419547fc6a9ac9743eab34e1e1b76256..61c52d1db906ed8754f17458eaaffcd5adfc4fe3 100644 --- a/profiler/advisor/display/html/templates/main.html +++ b/profiler/advisor/display/html/templates/main.html @@ -137,10 +137,21 @@

Performance Optimization Suggestions

+[markup lost in extraction: an added legend block whose visible text is "Optimization Priority:"
+ followed by three colour swatches labelled "High", "Medium" and "Low".]
 {% for key, renders in render_list.items() %}
-{% if key == 'operator'%}
+{% if key != 'performance_problem_analysis' %}
-  [removed card whose heading text is "computation"; markup lost]
+  [added card whose heading text is "{{ key }}"; markup lost]
   {% for render in renders %}
   {{render|safe}}
@@ -148,14 +159,25 @@
 {% else %}
-  [removed card whose heading text is "{{ key }}"; markup lost]
+  [added card whose heading text is "performance problem analysis"; markup lost]
-  {% for render in renders %}
-  {{render|safe}}
-  {% endfor %}
+  {% for sub_key, sub_renders in renders.items() %}
+    [added sub-card whose heading text is "{{ sub_key }}"; markup lost]
+    {% for render in sub_renders %}
+    {{render|safe}}
+    {% endfor %}
+  {% endfor %}
 {% endif %}
 {% endfor %}
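Taken together with the HTMLRender changes earlier in this patch, the loop above iterates a two-level structure: "overall" and "comparison" keep flat lists (and are moved to the front by render_html), while every other key is grouped under "performance_problem_analysis". A rough illustration of that structure, with placeholder HTML fragments, follows.

```python
# Illustrative shape of render_list after several render_template() calls
# (the fragment strings are placeholders, not real rendered output).
render_list = {
    "overall": ["<overall card html>"],
    "comparison": ["<comparison card html>"],
    "performance_problem_analysis": {
        "computation": ["<ai core frequency html>", "<operator html>"],
        "schedule": ["<synchronize stream html>"],
        "memory": ["<memory html>"],
    },
}

# main.html renders one card per top-level key except "performance_problem_analysis",
# then a single "performance problem analysis" card whose sub-cards are the nested keys.
```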