diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py index 5f4bd3202cd2071088f25564a7d4b14144a34826..e0e17320b3309ed24cfc7f45d6b09f73501be7da 100644 --- a/profiler/advisor/analyzer/base_analyzer.py +++ b/profiler/advisor/analyzer/base_analyzer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import logging from functools import wraps from typing import Dict, List, Union diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyser.py b/profiler/advisor/analyzer/cluster/slow_link_analyser.py index 846b79a50f31abb8445a0e5c2e82aaaf3c8ee23d..0b1c295b3db6d126783dc234359eb370698c8e78 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_link_analyser.py @@ -19,7 +19,7 @@ from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataSet +from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset class SlowLinkAnalyzer(BaseAnalyzer): @@ -35,11 +35,11 @@ class SlowLinkAnalyzer(BaseAnalyzer): SDMA = "SDMA" RDMA = "RDMA" SLOW_LINK_ANALYSIS = "slow_link_analysis" - dataset_cls_list = [ClusterCommunicationDataSet] + dataset_cls_list = [ClusterCommunicationDataset] def __init__(self, collection_path, n_processes: int = 1, **kwargs): super().__init__(collection_path, n_processes, **kwargs) - key = ClusterCommunicationDataSet.get_key() + key = ClusterCommunicationDataset.get_key() self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key) self.rank_bw_dict = self.communication_data_class.get_data() self.result = OptimizeResult() @@ -49,8 +49,9 @@ class SlowLinkAnalyzer(BaseAnalyzer): def optimize(self, **kwargs): if self.rank_bw_dict is None: - print("slow_link 分析失败,原因是数据加载失败,请检查你的cluster_analysis_outpu文件夹, \ - 如不关心这类数据请忽略") + print("Slow link analysis failed due to data loading failure. \ + Please check your cluster_analysis_output folder. \ + If you are not concerned about this type of data, please ignore this message.") return self.result self.process() self.format_datas = self.format_details() @@ -65,8 +66,12 @@ class SlowLinkAnalyzer(BaseAnalyzer): def produce_bottleneck(self, link_type: str): data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()] - avg_bw = round(sum(data_list) / len(data_list), 3) - if avg_bw == 0: + if len(data_list) > 0: + avg_bw = round(sum(data_list) / len(data_list), 3) + else: + avg_bw = 0 + print("The slow link (identified bottleneck) cannot provide a bottleneck \ + because the analysis data is missing bandwidth information.") return self.bottelneck += f'{link_type}: \n' \ f' The average is {avg_bw}, \n' \ diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py index 4215b514a215a2a350571746ff9cb90c3c9956eb..f63abe51bd5388c7a6011c0adb88f9f837d46efd 100644 --- a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py @@ -29,6 +29,13 @@ class SlowRankAnalyzer(BaseAnalyzer): BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] dataset_cls_list = [ClusterStepTraceTimeDataSet] + @staticmethod + def compute_max_gap_ratio(data: list, mean: float): + if mean == 0: + return 0 + else: + return (max(data) - min(data)) / mean + def __init__(self, collection_path, n_processes: int = 1, **kwargs): super().__init__(collection_path, n_processes, **kwargs) key = ClusterStepTraceTimeDataSet.get_key() @@ -103,10 +110,3 @@ class SlowRankAnalyzer(BaseAnalyzer): cann_version=self.cann_version, torch_version=self.torch_version, result=result_for_html) - - @staticmethod - def compute_max_gap_ratio(data: list, mean: float): - if mean == 0: - return 0 - else: - return (max(data) - min(data)) / mean diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index a7d7ddd93c70e59dc0d10318fdac06fdc581f70c..7a873c65635fcc8f2ebb35c8d317de09d78da491 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -1,5 +1,4 @@ import logging - from typing import List from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker diff --git a/profiler/advisor/common/timeline/event.py b/profiler/advisor/common/timeline/event.py index 6001ac88722e5a77daba1c960e8ccfd6894889e6..68b65fbc2eb0ffabc8210f2617a6d2abb3c72ddf 100644 --- a/profiler/advisor/common/timeline/event.py +++ b/profiler/advisor/common/timeline/event.py @@ -1,3 +1,4 @@ +from decimal import Decimal class AdvisorDict(dict): def __getstate__(self): return self.__dict__ @@ -19,5 +20,5 @@ class TimelineEvent(AdvisorDict): def ts_include(self, event): - return float(self.ts) <= float(event.ts) and float(self.ts) + float(self.dur) >= float(event.ts) + float( + return Decimal(self.ts) <= Decimal(event.ts) and Decimal(self.ts) + Decimal(self.dur) >= Decimal(event.ts) + Decimal( event.dur) \ No newline at end of file diff --git a/profiler/advisor/dataset/cluster/cluster_dataset.py b/profiler/advisor/dataset/cluster/cluster_dataset.py index 09fda2d4dcf2df2f05abb0007befb5c5c36ef824..654e7e8706a70d4689db4ee6a4e405a549807106 100644 --- a/profiler/advisor/dataset/cluster/cluster_dataset.py +++ b/profiler/advisor/dataset/cluster/cluster_dataset.py @@ -25,9 +25,9 @@ class ClusterDataset(Dataset): """ for file in os.listdir(self.collection_path): if file == 'cluster_analysis_output': - print("[INFO]Cluster has been analyzed " + logger.info("[INFO]Cluster has been analyzed " "because of the existence of cluster analysis output directory.") - print("[INFO]Skip Cluster analyze backend.") + logger.info("[INFO]Skip Cluster analyze backend.") return True return False @@ -77,10 +77,10 @@ class ClusterStepTraceTimeDataSet(ClusterDataset): print("捕获到异常:", e) self._step_dict = None return False - self._step_dict = self.formate_data(step_data) + self._step_dict = self.format_data(step_data) return True - def formate_data(self, step_data: list): + def format_data(self, step_data: list): step_dict = defaultdict(lambda: [0, 0, 0]) for step_bean in step_data: if step_bean.type == self.RANK: @@ -94,7 +94,7 @@ class ClusterStepTraceTimeDataSet(ClusterDataset): @singleton -class ClusterCommunicationDataSet(ClusterDataset): +class ClusterCommunicationDataset(ClusterDataset): RDMA_TIME_MS = "RDMA time(ms)" RDMA_SIZE_MB = "RDMA size(mb)" SDMA_TIME_MS = "SDMA time(ms)" diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py index 94b6fdfef78c044e37e24772699ed7ea67b0da30..34fbd89771a197a974c8ed858e34c38a9fe71a4a 100644 --- a/profiler/advisor/dataset/timeline_event_dataset.py +++ b/profiler/advisor/dataset/timeline_event_dataset.py @@ -9,6 +9,8 @@ from profiler.advisor.common import constant as const from profiler.advisor.common.timeline.event import TimelineEvent from profiler.advisor.utils.utils import get_file_path_from_directory from profiler.advisor.utils.utils import singleton +from profiler.cluster_analyse.common_func.file_manager import FileManager + logger = logging.getLogger() @@ -121,13 +123,13 @@ class TimelineEventDataset(Dataset): def parse_data_with_generator(self, func): result = [] try: - with open(self.timeline_data_list[0], "r") as f: - for i, event in tqdm(enumerate(ijson.items(f, "item")), - leave=False, ncols=100, desc="Building dataset for timeline analysis", - total=self.dataset_len): - func_res = func(index=i, event=event) - if func_res is not None: - result.append(func_res) + json_content = FileManager.read_json_file(self.timeline_data_list[0]) + for i, event in tqdm(enumerate(json_content), leave=False, ncols=100, + desc="Building dataset for timeline analysis", + total=self.dataset_len): + func_res = func(index=i, event=event) + if func_res: + result.append(func_res) except Exception as e: logger.warning("Error %s while parsing file %s, continue to timeline analysis", e, self.timeline_data_list[0]) diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py index 8ea7c9e0fc22c7da71a673e399fcfc231fbf1453..79c116845f159df1758c1945df5278d0cd237e32 100644 --- a/profiler/advisor/display/html/render.py +++ b/profiler/advisor/display/html/render.py @@ -1,12 +1,14 @@ import os import logging from typing import List, Dict +from collections import defaultdict from jinja2 import Environment, FileSystemLoader from profiler.advisor.common import constant from profiler.advisor.config.config import Config from profiler.advisor.utils.utils import singleton, safe_write +from profiler.cluster_analyse.common_func.file_manager import FileManager logger = logging.getLogger() @@ -15,7 +17,7 @@ logger = logging.getLogger() class HTMLRender: def __init__(self): self.html = "" - self.render_list: Dict[str, List] = {} + self.render_list = defaultdict(list) def render_html(self, template_dir: str = "templates", template_name: str = "main.html", template_header=constant.DEFAULT_TEMPLATE_HEADER): @@ -30,8 +32,6 @@ class HTMLRender: autoescape=True) template = env.get_template(template_name) rendered_html = template.render(**kwargs) - if key not in self.render_list: - self.render_list[key] = [] self.render_list[key].append(rendered_html) return rendered_html