From 8252eb91f6b23d952d6859380c9ca806239b0b18 Mon Sep 17 00:00:00 2001 From: kongdeshuo <1670690897@qq.com> Date: Sat, 29 Jun 2024 18:08:51 +0800 Subject: [PATCH] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81=E6=89=B9?= =?UTF-8?q?=E9=87=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/advisor/analyzer/base_analyzer.py | 14 ++++++++++++++ .../analyzer/cluster/slow_link_analyser.py | 19 ++++++++++++------- .../analyzer/cluster/slow_rank_analyser.py | 14 +++++++------- .../computation/bound/block_dim_checker.py | 1 - profiler/advisor/common/timeline/event.py | 3 ++- .../dataset/cluster/cluster_dataset.py | 10 +++++----- .../advisor/dataset/timeline_event_dataset.py | 16 +++++++++------- profiler/advisor/display/html/render.py | 6 +++--- 8 files changed, 52 insertions(+), 31 deletions(-) diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py index 5f4bd3202cd..e0e17320b33 100644 --- a/profiler/advisor/analyzer/base_analyzer.py +++ b/profiler/advisor/analyzer/base_analyzer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import logging from functools import wraps from typing import Dict, List, Union diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyser.py b/profiler/advisor/analyzer/cluster/slow_link_analyser.py index 846b79a50f3..0b1c295b3db 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_link_analyser.py @@ -19,7 +19,7 @@ from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer from profiler.advisor.common import constant from profiler.advisor.result.result import OptimizeResult from profiler.advisor.result.item import OptimizeItem, OptimizeRecord -from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataSet +from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset class SlowLinkAnalyzer(BaseAnalyzer): @@ -35,11 +35,11 @@ class SlowLinkAnalyzer(BaseAnalyzer): SDMA = "SDMA" RDMA = "RDMA" SLOW_LINK_ANALYSIS = "slow_link_analysis" - dataset_cls_list = [ClusterCommunicationDataSet] + dataset_cls_list = [ClusterCommunicationDataset] def __init__(self, collection_path, n_processes: int = 1, **kwargs): super().__init__(collection_path, n_processes, **kwargs) - key = ClusterCommunicationDataSet.get_key() + key = ClusterCommunicationDataset.get_key() self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key) self.rank_bw_dict = self.communication_data_class.get_data() self.result = OptimizeResult() @@ -49,8 +49,9 @@ class SlowLinkAnalyzer(BaseAnalyzer): def optimize(self, **kwargs): if self.rank_bw_dict is None: - print("slow_link 分析失败,原因是数据加载失败,请检查你的cluster_analysis_outpu文件夹, \ - 如不关心这类数据请忽略") + print("Slow link analysis failed due to data loading failure. \ + Please check your cluster_analysis_output folder. \ + If you are not concerned about this type of data, please ignore this message.") return self.result self.process() self.format_datas = self.format_details() @@ -65,8 +66,12 @@ class SlowLinkAnalyzer(BaseAnalyzer): def produce_bottleneck(self, link_type: str): data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()] - avg_bw = round(sum(data_list) / len(data_list), 3) - if avg_bw == 0: + if len(data_list) > 0: + avg_bw = round(sum(data_list) / len(data_list), 3) + else: + avg_bw = 0 + print("The slow link (identified bottleneck) cannot provide a bottleneck \ + because the analysis data is missing bandwidth information.") return self.bottelneck += f'{link_type}: \n' \ f' The average is {avg_bw}, \n' \ diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py index 4215b514a21..f63abe51bd5 100644 --- a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py @@ -29,6 +29,13 @@ class SlowRankAnalyzer(BaseAnalyzer): BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] dataset_cls_list = [ClusterStepTraceTimeDataSet] + @staticmethod + def compute_max_gap_ratio(data: list, mean: float): + if mean == 0: + return 0 + else: + return (max(data) - min(data)) / mean + def __init__(self, collection_path, n_processes: int = 1, **kwargs): super().__init__(collection_path, n_processes, **kwargs) key = ClusterStepTraceTimeDataSet.get_key() @@ -103,10 +110,3 @@ class SlowRankAnalyzer(BaseAnalyzer): cann_version=self.cann_version, torch_version=self.torch_version, result=result_for_html) - - @staticmethod - def compute_max_gap_ratio(data: list, mean: float): - if mean == 0: - return 0 - else: - return (max(data) - min(data)) / mean diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index a7d7ddd93c7..7a873c65635 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -1,5 +1,4 @@ import logging - from typing import List from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker diff --git a/profiler/advisor/common/timeline/event.py b/profiler/advisor/common/timeline/event.py index 6001ac88722..68b65fbc2eb 100644 --- a/profiler/advisor/common/timeline/event.py +++ b/profiler/advisor/common/timeline/event.py @@ -1,3 +1,4 @@ +from decimal import Decimal class AdvisorDict(dict): def __getstate__(self): return self.__dict__ @@ -19,5 +20,5 @@ class TimelineEvent(AdvisorDict): def ts_include(self, event): - return float(self.ts) <= float(event.ts) and float(self.ts) + float(self.dur) >= float(event.ts) + float( + return Decimal(self.ts) <= Decimal(event.ts) and Decimal(self.ts) + Decimal(self.dur) >= Decimal(event.ts) + Decimal( event.dur) \ No newline at end of file diff --git a/profiler/advisor/dataset/cluster/cluster_dataset.py b/profiler/advisor/dataset/cluster/cluster_dataset.py index 09fda2d4dcf..654e7e8706a 100644 --- a/profiler/advisor/dataset/cluster/cluster_dataset.py +++ b/profiler/advisor/dataset/cluster/cluster_dataset.py @@ -25,9 +25,9 @@ class ClusterDataset(Dataset): """ for file in os.listdir(self.collection_path): if file == 'cluster_analysis_output': - print("[INFO]Cluster has been analyzed " + logger.info("[INFO]Cluster has been analyzed " "because of the existence of cluster analysis output directory.") - print("[INFO]Skip Cluster analyze backend.") + logger.info("[INFO]Skip Cluster analyze backend.") return True return False @@ -77,10 +77,10 @@ class ClusterStepTraceTimeDataSet(ClusterDataset): print("捕获到异常:", e) self._step_dict = None return False - self._step_dict = self.formate_data(step_data) + self._step_dict = self.format_data(step_data) return True - def formate_data(self, step_data: list): + def format_data(self, step_data: list): step_dict = defaultdict(lambda: [0, 0, 0]) for step_bean in step_data: if step_bean.type == self.RANK: @@ -94,7 +94,7 @@ class ClusterStepTraceTimeDataSet(ClusterDataset): @singleton -class ClusterCommunicationDataSet(ClusterDataset): +class ClusterCommunicationDataset(ClusterDataset): RDMA_TIME_MS = "RDMA time(ms)" RDMA_SIZE_MB = "RDMA size(mb)" SDMA_TIME_MS = "SDMA time(ms)" diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py index 94b6fdfef78..34fbd89771a 100644 --- a/profiler/advisor/dataset/timeline_event_dataset.py +++ b/profiler/advisor/dataset/timeline_event_dataset.py @@ -9,6 +9,8 @@ from profiler.advisor.common import constant as const from profiler.advisor.common.timeline.event import TimelineEvent from profiler.advisor.utils.utils import get_file_path_from_directory from profiler.advisor.utils.utils import singleton +from profiler.cluster_analyse.common_func.file_manager import FileManager + logger = logging.getLogger() @@ -121,13 +123,13 @@ class TimelineEventDataset(Dataset): def parse_data_with_generator(self, func): result = [] try: - with open(self.timeline_data_list[0], "r") as f: - for i, event in tqdm(enumerate(ijson.items(f, "item")), - leave=False, ncols=100, desc="Building dataset for timeline analysis", - total=self.dataset_len): - func_res = func(index=i, event=event) - if func_res is not None: - result.append(func_res) + json_content = FileManager.read_json_file(self.timeline_data_list[0]) + for i, event in tqdm(enumerate(json_content), leave=False, ncols=100, + desc="Building dataset for timeline analysis", + total=self.dataset_len): + func_res = func(index=i, event=event) + if func_res: + result.append(func_res) except Exception as e: logger.warning("Error %s while parsing file %s, continue to timeline analysis", e, self.timeline_data_list[0]) diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py index 8ea7c9e0fc2..79c116845f1 100644 --- a/profiler/advisor/display/html/render.py +++ b/profiler/advisor/display/html/render.py @@ -1,12 +1,14 @@ import os import logging from typing import List, Dict +from collections import defaultdict from jinja2 import Environment, FileSystemLoader from profiler.advisor.common import constant from profiler.advisor.config.config import Config from profiler.advisor.utils.utils import singleton, safe_write +from profiler.cluster_analyse.common_func.file_manager import FileManager logger = logging.getLogger() @@ -15,7 +17,7 @@ logger = logging.getLogger() class HTMLRender: def __init__(self): self.html = "" - self.render_list: Dict[str, List] = {} + self.render_list = defaultdict(list) def render_html(self, template_dir: str = "templates", template_name: str = "main.html", template_header=constant.DEFAULT_TEMPLATE_HEADER): @@ -30,8 +32,6 @@ class HTMLRender: autoescape=True) template = env.get_template(template_name) rendered_html = template.render(**kwargs) - if key not in self.render_list: - self.render_list[key] = [] self.render_list[key].append(rendered_html) return rendered_html -- Gitee