diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py
index 5f4bd3202cd2071088f25564a7d4b14144a34826..e0e17320b3309ed24cfc7f45d6b09f73501be7da 100644
--- a/profiler/advisor/analyzer/base_analyzer.py
+++ b/profiler/advisor/analyzer/base_analyzer.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import logging
 from functools import wraps
 from typing import Dict, List, Union
diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyser.py b/profiler/advisor/analyzer/cluster/slow_link_analyser.py
index 846b79a50f31abb8445a0e5c2e82aaaf3c8ee23d..0914cff8f745de09cfb76f364b096b20979d65e2 100644
--- a/profiler/advisor/analyzer/cluster/slow_link_analyser.py
+++ b/profiler/advisor/analyzer/cluster/slow_link_analyser.py
@@ -19,7 +19,7 @@ from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
 from profiler.advisor.common import constant
 from profiler.advisor.result.result import OptimizeResult
 from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
-from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataSet
+from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset
 
 
 class SlowLinkAnalyzer(BaseAnalyzer):
@@ -35,11 +35,11 @@ class SlowLinkAnalyzer(BaseAnalyzer):
     SDMA = "SDMA"
     RDMA = "RDMA"
     SLOW_LINK_ANALYSIS = "slow_link_analysis"
-    dataset_cls_list = [ClusterCommunicationDataSet]
+    dataset_cls_list = [ClusterCommunicationDataset]
 
     def __init__(self, collection_path, n_processes: int = 1, **kwargs):
         super().__init__(collection_path, n_processes, **kwargs)
-        key = ClusterCommunicationDataSet.get_key()
+        key = ClusterCommunicationDataset.get_key()
         self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key)
         self.rank_bw_dict = self.communication_data_class.get_data()
         self.result = OptimizeResult()
@@ -49,8 +49,9 @@ class SlowLinkAnalyzer(BaseAnalyzer):
 
     def optimize(self, **kwargs):
         if self.rank_bw_dict is None:
-            print("slow_link 分析失败,原因是数据加载失败,请检查你的cluster_analysis_outpu文件夹, \
-                  如不关心这类数据请忽略")
+            print("Slow link analysis failed due to data loading failure. \
+                  Please check your cluster_analysis_output folder. \
+                  If you are not concerned about this type of data, please ignore this message.")
             return self.result
         self.process()
         self.format_datas = self.format_details()
@@ -65,8 +66,11 @@
 
     def produce_bottleneck(self, link_type: str):
         data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()]
-        avg_bw = round(sum(data_list) / len(data_list), 3)
-        if avg_bw == 0:
+        if len(data_list) > 0:
+            avg_bw = round(sum(data_list) / len(data_list), 3)
+        else:
+            print("The slow link (identified bottleneck) cannot provide a bottleneck \
+                  because the analysis data is missing bandwidth information.")
             return
         self.bottelneck += f'{link_type}: \n' \
                            f'    The average is {avg_bw}, \n' \
diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py
index aa0ddad5078252d61bf92b2be10f33dc56f85ab4..e331093003ca4a76bec11aa512cd3c487929335c 100644
--- a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py
+++ b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py
@@ -29,6 +29,13 @@
     BOTTLENECK_LIST = ['Computing', 'Communication', "Free"]
     dataset_cls_list = [ClusterStepTraceTimeDataSet]
 
+    @staticmethod
+    def compute_max_gap_ratio(data: list, mean: float):
+        if mean == 0:
+            return 0
+        else:
+            return (max(data) - min(data)) / mean
+
     def __init__(self, collection_path, n_processes: int = 1, **kwargs):
         super().__init__(collection_path, n_processes, **kwargs)
         key = ClusterStepTraceTimeDataSet.get_key()
@@ -104,9 +111,4 @@
             torch_version=self.torch_version,
             result=result_for_html)
 
-    @staticmethod
-    def compute_max_gap_ratio(data: list, mean: float):
-        if mean == 0:
-            return 0
-        else:
-            return (max(data) - min(data)) / mean
+
diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py
index a7d7ddd93c70e59dc0d10318fdac06fdc581f70c..7a873c65635fcc8f2ebb35c8d317de09d78da491 100644
--- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py
+++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py
@@ -1,5 +1,4 @@
 import logging
-
 from typing import List
 
 from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
diff --git a/profiler/advisor/common/timeline/event.py b/profiler/advisor/common/timeline/event.py
index 6001ac88722e5a77daba1c960e8ccfd6894889e6..6edcffd8ffda2edcc3a367623a08cffeb7ca0fcc 100644
--- a/profiler/advisor/common/timeline/event.py
+++ b/profiler/advisor/common/timeline/event.py
@@ -1,3 +1,6 @@
+from decimal import Decimal
+
+
 class AdvisorDict(dict):
     def __getstate__(self):
         return self.__dict__
@@ -19,5 +22,5 @@
 
 class TimelineEvent(AdvisorDict):
     def ts_include(self, event):
-        return float(self.ts) <= float(event.ts) and float(self.ts) + float(self.dur) >= float(event.ts) + float(
-            event.dur)
\ No newline at end of file
+        return Decimal(self.ts) <= Decimal(event.ts) and \
+               Decimal(self.ts) + Decimal(self.dur) >= Decimal(event.ts) + Decimal(event.dur)
diff --git a/profiler/advisor/dataset/cluster/cluster_dataset.py b/profiler/advisor/dataset/cluster/cluster_dataset.py
index 09fda2d4dcf2df2f05abb0007befb5c5c36ef824..e8d325d82e6fcc242e1b58fe3736c715cdad5ac8 100644
--- a/profiler/advisor/dataset/cluster/cluster_dataset.py
+++ b/profiler/advisor/dataset/cluster/cluster_dataset.py
@@ -25,9 +25,9 @@ class ClusterDataset(Dataset):
         """
         for file in os.listdir(self.collection_path):
             if file == 'cluster_analysis_output':
-                print("[INFO]Cluster has been analyzed "
-                      "because of the existence of cluster analysis output directory.")
-                print("[INFO]Skip Cluster analyze backend.")
+                logger.info("[INFO]Cluster has been analyzed "
+                            "because of the existence of cluster analysis output directory.")
+                logger.info("[INFO]Skip Cluster analyze backend.")
                 return True
         return False
 
@@ -77,10 +77,10 @@
             print("捕获到异常:", e)
             self._step_dict = None
             return False
-        self._step_dict = self.formate_data(step_data)
+        self._step_dict = self.format_data(step_data)
         return True
 
-    def formate_data(self, step_data: list):
+    def format_data(self, step_data: list):
         step_dict = defaultdict(lambda: [0, 0, 0])
         for step_bean in step_data:
             if step_bean.type == self.RANK:
@@ -94,7 +94,7 @@
 
 
 @singleton
-class ClusterCommunicationDataSet(ClusterDataset):
+class ClusterCommunicationDataset(ClusterDataset):
     RDMA_TIME_MS = "RDMA time(ms)"
     RDMA_SIZE_MB = "RDMA size(mb)"
     SDMA_TIME_MS = "SDMA time(ms)"
diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py
index 94b6fdfef78c044e37e24772699ed7ea67b0da30..d3889e4458fad8b34b5d811d152e255638999294 100644
--- a/profiler/advisor/dataset/timeline_event_dataset.py
+++ b/profiler/advisor/dataset/timeline_event_dataset.py
@@ -9,6 +9,7 @@ from profiler.advisor.common import constant as const
 from profiler.advisor.common.timeline.event import TimelineEvent
 from profiler.advisor.utils.utils import get_file_path_from_directory
 from profiler.advisor.utils.utils import singleton
+from profiler.cluster_analyse.common_func.file_manager import FileManager
 
 logger = logging.getLogger()
 
@@ -121,13 +122,13 @@
     def parse_data_with_generator(self, func):
         result = []
         try:
-            with open(self.timeline_data_list[0], "r") as f:
-                for i, event in tqdm(enumerate(ijson.items(f, "item")),
-                                     leave=False, ncols=100, desc="Building dataset for timeline analysis",
-                                     total=self.dataset_len):
-                    func_res = func(index=i, event=event)
-                    if func_res is not None:
-                        result.append(func_res)
+            json_content = FileManager.read_json_file(self.timeline_data_list[0])
+            for i, event in tqdm(enumerate(json_content), leave=False, ncols=100,
+                                 desc="Building dataset for timeline analysis",
+                                 total=self.dataset_len):
+                func_res = func(index=i, event=event)
+                if func_res:
+                    result.append(func_res)
         except Exception as e:
             logger.warning("Error %s while parsing file %s, continue to timeline analysis",
                            e, self.timeline_data_list[0])
diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py
index 8ea7c9e0fc22c7da71a673e399fcfc231fbf1453..3984fa8f34f0858a7281c9b51caaa43a170baf86 100644
--- a/profiler/advisor/display/html/render.py
+++ b/profiler/advisor/display/html/render.py
@@ -1,6 +1,7 @@
 import os
 import logging
 from typing import List, Dict
+from collections import defaultdict
 
 from jinja2 import Environment, FileSystemLoader
 from profiler.advisor.common import constant
@@ -15,7 +16,7 @@ logger = logging.getLogger()
 class HTMLRender:
     def __init__(self):
         self.html = ""
-        self.render_list: Dict[str, List] = {}
+        self.render_list = defaultdict(list)
 
     def render_html(self, template_dir: str = "templates", template_name: str = "main.html",
                     template_header=constant.DEFAULT_TEMPLATE_HEADER):
@@ -30,8 +31,6 @@
                           autoescape=True)
         template = env.get_template(template_name)
         rendered_html = template.render(**kwargs)
-        if key not in self.render_list:
-            self.render_list[key] = []
         self.render_list[key].append(rendered_html)
 
         return rendered_html