From 8252eb91f6b23d952d6859380c9ca806239b0b18 Mon Sep 17 00:00:00 2001
From: kongdeshuo <1670690897@qq.com>
Date: Sat, 29 Jun 2024 18:08:51 +0800
Subject: [PATCH] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81=E6=89=B9?=
 =?UTF-8?q?=E9=87=8F=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 profiler/advisor/analyzer/base_analyzer.py    | 14 ++++++++++++++
 .../analyzer/cluster/slow_link_analyser.py    | 19 ++++++++++++-------
 .../analyzer/cluster/slow_rank_analyser.py    | 14 +++++++-------
 .../computation/bound/block_dim_checker.py    |  1 -
 profiler/advisor/common/timeline/event.py     |  3 ++-
 .../dataset/cluster/cluster_dataset.py        | 10 +++++-----
 .../advisor/dataset/timeline_event_dataset.py | 16 +++++++++-------
 profiler/advisor/display/html/render.py       |  6 +++---
 8 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py
index 5f4bd3202cd..e0e17320b33 100644
--- a/profiler/advisor/analyzer/base_analyzer.py
+++ b/profiler/advisor/analyzer/base_analyzer.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import logging
 from functools import wraps
 from typing import Dict, List, Union
diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyser.py b/profiler/advisor/analyzer/cluster/slow_link_analyser.py
index 846b79a50f3..0b1c295b3db 100644
--- a/profiler/advisor/analyzer/cluster/slow_link_analyser.py
+++ b/profiler/advisor/analyzer/cluster/slow_link_analyser.py
@@ -19,7 +19,7 @@ from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
 from profiler.advisor.common import constant
 from profiler.advisor.result.result import OptimizeResult
 from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
-from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataSet
+from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataset
 
 
 class SlowLinkAnalyzer(BaseAnalyzer):
@@ -35,11 +35,11 @@ class SlowLinkAnalyzer(BaseAnalyzer):
     SDMA = "SDMA"
     RDMA = "RDMA"
     SLOW_LINK_ANALYSIS = "slow_link_analysis"
-    dataset_cls_list = [ClusterCommunicationDataSet]
+    dataset_cls_list = [ClusterCommunicationDataset]
 
     def __init__(self, collection_path, n_processes: int = 1, **kwargs):
         super().__init__(collection_path, n_processes, **kwargs)
-        key = ClusterCommunicationDataSet.get_key()
+        key = ClusterCommunicationDataset.get_key()
         self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key)
         self.rank_bw_dict = self.communication_data_class.get_data()
         self.result = OptimizeResult()
@@ -49,8 +49,9 @@ class SlowLinkAnalyzer(BaseAnalyzer):
 
     def optimize(self, **kwargs):
         if self.rank_bw_dict is None:
-            print("slow_link 分析失败，原因是数据加载失败，请检查你的cluster_analysis_outpu文件夹, \
-                   如不关心这类数据请忽略")
+            print("Slow link analysis failed due to data loading failure. "
+                  "Please check your cluster_analysis_output folder. "
+                  "If you are not concerned about this type of data, please ignore this message.")
             return self.result
         self.process()
         self.format_datas = self.format_details()
@@ -65,8 +66,12 @@ class SlowLinkAnalyzer(BaseAnalyzer):
 
     def produce_bottleneck(self, link_type: str):
         data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()]
-        avg_bw = round(sum(data_list) / len(data_list), 3)
-        if avg_bw == 0:
+        if len(data_list) > 0:
+            avg_bw = round(sum(data_list) / len(data_list), 3)
+        else:
+            # No rank data at all: averaging would divide by zero, so report and bail out.
+            print("The slow link (identified bottleneck) cannot provide a bottleneck "
+                  "because the analysis data is missing bandwidth information.")
             return
         self.bottelneck += f'{link_type}: \n' \
                            f'    The average is {avg_bw}, \n' \
diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py
index 4215b514a21..f63abe51bd5 100644
--- a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py
+++ b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py
@@ -29,6 +29,13 @@ class SlowRankAnalyzer(BaseAnalyzer):
     BOTTLENECK_LIST = ['Computing', 'Communication', "Free"]
     dataset_cls_list = [ClusterStepTraceTimeDataSet]
 
+    @staticmethod
+    def compute_max_gap_ratio(data: list, mean: float):
+        """Return the spread of ``data`` (max - min) relative to ``mean``."""
+        # A zero mean would divide by zero, so report no gap; data is not inspected in that case.
+        gap_ratio = 0 if mean == 0 else (max(data) - min(data)) / mean
+        return gap_ratio
+
     def __init__(self, collection_path, n_processes: int = 1, **kwargs):
         super().__init__(collection_path, n_processes, **kwargs)
         key = ClusterStepTraceTimeDataSet.get_key()
@@ -103,10 +110,3 @@ class SlowRankAnalyzer(BaseAnalyzer):
                                          cann_version=self.cann_version,
                                          torch_version=self.torch_version,
                                          result=result_for_html)
-
-    @staticmethod
-    def compute_max_gap_ratio(data: list, mean: float):
-        if mean == 0:
-            return 0
-        else:
-            return (max(data) - min(data)) / mean
diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py
index a7d7ddd93c7..7a873c65635 100644
--- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py
+++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py
@@ -1,5 +1,4 @@
 import logging
-
 from typing import List
 
 from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
diff --git a/profiler/advisor/common/timeline/event.py b/profiler/advisor/common/timeline/event.py
index 6001ac88722..68b65fbc2eb 100644
--- a/profiler/advisor/common/timeline/event.py
+++ b/profiler/advisor/common/timeline/event.py
@@ -1,3 +1,4 @@
+from decimal import Decimal
 class AdvisorDict(dict):
     def __getstate__(self):
         return self.__dict__
@@ -19,5 +20,5 @@ class TimelineEvent(AdvisorDict):
 
     def ts_include(self, event):
 
-        return float(self.ts) <= float(event.ts) and float(self.ts) + float(self.dur) >= float(event.ts) + float(
+        return Decimal(self.ts) <= Decimal(event.ts) and Decimal(self.ts) + Decimal(self.dur) >= Decimal(event.ts) + Decimal(
             event.dur)
\ No newline at end of file
diff --git a/profiler/advisor/dataset/cluster/cluster_dataset.py b/profiler/advisor/dataset/cluster/cluster_dataset.py
index 09fda2d4dcf..654e7e8706a 100644
--- a/profiler/advisor/dataset/cluster/cluster_dataset.py
+++ b/profiler/advisor/dataset/cluster/cluster_dataset.py
@@ -25,9 +25,9 @@ class ClusterDataset(Dataset):
         """
         for file in os.listdir(self.collection_path):
             if file == 'cluster_analysis_output':
-                print("[INFO]Cluster has been analyzed "
+                logger.info("[INFO]Cluster has been analyzed "
                       "because of the existence of cluster analysis output directory.")
-                print("[INFO]Skip Cluster analyze backend.")
+                logger.info("[INFO]Skip Cluster analyze backend.")
                 return True
         return False
 
@@ -77,10 +77,10 @@ class ClusterStepTraceTimeDataSet(ClusterDataset):
             print("捕获到异常：", e)
             self._step_dict = None
             return False
-        self._step_dict = self.formate_data(step_data)
+        self._step_dict = self.format_data(step_data)
         return True
 
-    def formate_data(self, step_data: list):
+    def format_data(self, step_data: list):
         step_dict = defaultdict(lambda: [0, 0, 0])
         for step_bean in step_data:
             if step_bean.type == self.RANK:
@@ -94,7 +94,7 @@ class ClusterStepTraceTimeDataSet(ClusterDataset):
 
 
 @singleton
-class ClusterCommunicationDataSet(ClusterDataset):
+class ClusterCommunicationDataset(ClusterDataset):
     RDMA_TIME_MS = "RDMA time(ms)"
     RDMA_SIZE_MB = "RDMA size(mb)"
     SDMA_TIME_MS = "SDMA time(ms)"
diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py
index 94b6fdfef78..34fbd89771a 100644
--- a/profiler/advisor/dataset/timeline_event_dataset.py
+++ b/profiler/advisor/dataset/timeline_event_dataset.py
@@ -9,6 +9,8 @@ from profiler.advisor.common import constant as const
 from profiler.advisor.common.timeline.event import TimelineEvent
 from profiler.advisor.utils.utils import get_file_path_from_directory
 from profiler.advisor.utils.utils import singleton
+from profiler.cluster_analyse.common_func.file_manager import FileManager
+
 
 logger = logging.getLogger()
 
@@ -121,13 +123,13 @@ class TimelineEventDataset(Dataset):
     def parse_data_with_generator(self, func):
         result = []
         try:
-            with open(self.timeline_data_list[0], "r") as f:
-                for i, event in tqdm(enumerate(ijson.items(f, "item")),
-                                     leave=False, ncols=100, desc="Building dataset for timeline analysis",
-                                     total=self.dataset_len):
-                    func_res = func(index=i, event=event)
-                    if func_res is not None:
-                        result.append(func_res)
+            json_content = FileManager.read_json_file(self.timeline_data_list[0])
+            for i, event in tqdm(enumerate(json_content), leave=False, ncols=100,
+                                 desc="Building dataset for timeline analysis", total=self.dataset_len):
+                func_res = func(index=i, event=event)
+                # Only None means "no result"; falsy values such as 0 or {} are still kept.
+                if func_res is not None:
+                    result.append(func_res)
         except Exception as e:
             logger.warning("Error %s while parsing file %s, continue to timeline analysis", e,
                            self.timeline_data_list[0])
diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py
index 8ea7c9e0fc2..79c116845f1 100644
--- a/profiler/advisor/display/html/render.py
+++ b/profiler/advisor/display/html/render.py
@@ -1,12 +1,14 @@
 import os
 import logging
 from typing import List, Dict
+from collections import defaultdict
 
 from jinja2 import Environment, FileSystemLoader
 from profiler.advisor.common import constant
 
 from profiler.advisor.config.config import Config
 from profiler.advisor.utils.utils import singleton, safe_write
+from profiler.cluster_analyse.common_func.file_manager import FileManager
 
 logger = logging.getLogger()
 
@@ -15,7 +17,7 @@ logger = logging.getLogger()
 class HTMLRender:
     def __init__(self):
         self.html = ""
-        self.render_list: Dict[str, List] = {}
+        self.render_list: Dict[str, List] = defaultdict(list)
 
     def render_html(self, template_dir: str = "templates", template_name: str = "main.html",
                     template_header=constant.DEFAULT_TEMPLATE_HEADER):
@@ -30,8 +32,6 @@ class HTMLRender:
                           autoescape=True)
         template = env.get_template(template_name)
         rendered_html = template.render(**kwargs)
-        if key not in self.render_list:
-            self.render_list[key] = []
         self.render_list[key].append(rendered_html)
         return rendered_html
 
-- 
Gitee