diff --git a/profiler/compare_tools/compare_backend/utils/constant.py b/profiler/compare_tools/compare_backend/utils/constant.py index 1b77b214c85f6733e36298e119e43a778fd7969f..e2854692ae3218c873171b75878e3e69203effa2 100644 --- a/profiler/compare_tools/compare_backend/utils/constant.py +++ b/profiler/compare_tools/compare_backend/utils/constant.py @@ -74,7 +74,7 @@ class Constant(object): MEMORY_LIST = "memory_list" COMMUNICATION_DICT = "comm_dict" - #compare type + # compare type OVERALL_COMPARE = "overall" BWD_LIST = ["bwd", "backward", "back"] diff --git a/profiler/module_visualization/__init__.py b/profiler/module_visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/module_visualization/graph/__init__.py b/profiler/module_visualization/graph/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/module_visualization/graph/prof_node.py b/profiler/module_visualization/graph/prof_node.py new file mode 100644 index 0000000000000000000000000000000000000000..cfcdabbb991d2abb86f31e5a5866e788cf9a3c6e --- /dev/null +++ b/profiler/module_visualization/graph/prof_node.py @@ -0,0 +1,90 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ProfNode(BaseNode):
    """Graph node pairing a host-side trace event with the device kernels attributed to it.

    Durations are reported as "total" (the node itself) and "self" (total minus
    the totals of the direct children). The root node has no event duration of
    its own, so its totals are the sum over its children.
    """
    MODULE_TYPE = 1

    def __init__(self, event: TraceEventBean, parent_node=None):
        super().__init__(event, parent_node)
        # Kernels attributed to this node; filled via update_kernel_total_list().
        self._kernel_total_list = []

    @property
    def node_id(self):
        """Identifier of the node, taken from the wrapped event's unique id."""
        return self._event.unique_id

    @property
    def total_kernels(self):
        """List of kernels collected for this node so far."""
        return self._kernel_total_list

    @property
    def host_total_dur(self):
        """Host duration: the event's own ``dur``, or the children's sum for the root."""
        if not self.is_root_node:
            return self._event.dur
        return sum(child.host_total_dur for child in self.child_nodes)

    @property
    def host_self_dur(self):
        """Host duration not covered by direct children."""
        own_total = self.host_total_dur
        children_total = sum(child.host_total_dur for child in self.child_nodes)
        return own_total - children_total

    @property
    def device_total_dur(self):
        """Device duration: sum of attributed kernel durations, or the children's sum for the root."""
        if not self.is_root_node:
            return sum(kernel.dur for kernel in self._kernel_total_list)
        return sum(child.device_total_dur for child in self.child_nodes)

    @property
    def device_self_dur(self):
        """Device duration not covered by direct children."""
        own_total = self.device_total_dur
        children_total = sum(child.device_total_dur for child in self.child_nodes)
        return own_total - children_total

    @property
    def input_data(self) -> dict:
        """Return the non-empty "Input Dims" / "Input type" entries found in the event args."""
        collected = {}
        for key in ("Input Dims", "Input type"):
            value = self._event.args.get(key)
            if value:
                collected[key] = value
        return collected

    @property
    def data(self):
        """Return the input data plus the four durations, each rounded to two decimals."""
        durations = (
            ("Host Self Duration(us)", self.host_self_dur),
            ("Host Total Duration(us)", self.host_total_dur),
            ("Device Self Duration(us)", self.device_self_dur),
            ("Device Total Duration(us)", self.device_total_dur),
        )
        payload = {"Input Data": self.input_data}
        for label, value in durations:
            payload[label] = round(value, 2)
        return payload

    @property
    def info(self):
        """Return the export record of this node: id, type, data, parent id and child ids."""
        parent = self.parent_node
        # A missing parent is exported as the literal string "None".
        upnode = parent.node_id if parent else "None"
        subnodes = [child.node_id for child in self.child_nodes]
        return {"id": self.node_id,
                "node_type": self.MODULE_TYPE,
                "data": self.data,
                "upnode": upnode,
                "subnodes": subnodes}

    @property
    def is_root_node(self):
        """True when this node carries the reserved root id."""
        return self.node_id == Constant.NPU_ROOT_ID

    def update_child_nodes(self, node):
        """Append ``node`` to this node's children."""
        self._child_nodes.append(node)

    def update_kernel_total_list(self, kernel_list: list):
        """Add every kernel in ``kernel_list`` to this node's kernel list."""
        self._kernel_total_list += kernel_list
class FwdModuleNode(BaseNode):
    """Forward-module tree node that also records the backward ops linked to it."""

    def __init__(self, event: TraceEventBean, parent_node=None):
        super().__init__(event, parent_node)
        # Backward operators associated with this forward module.
        self._bwd_op_list = []

    @property
    def bwd_op_list(self):
        """List of backward ops recorded for this module."""
        return self._bwd_op_list

    def update_bwd_op(self, bwd_op_list: list):
        """Append every op in ``bwd_op_list`` to the recorded backward ops."""
        self._bwd_op_list += bwd_op_list
class ProfGraphBuilder:
    """Build the module/operator tree (with attributed kernels) from profiling data."""

    def __init__(self, prof_data_path: str):
        self._prof_data_path = prof_data_path
        # Filled by build_graph() with the dict returned by ProfDataPreProcess.run().
        self._prof_data = {}

    @classmethod
    def _create_event_bean_from_ops(cls, op_list: list, name: str) -> TraceEventBean:
        """Create a synthetic event named ``name`` that spans all ops in ``op_list``.

        ``op_list`` must be non-empty (``min``/``max`` raise ``ValueError`` otherwise);
        both callers in this class only pass non-empty lists.
        """
        min_start = min(op.start_time for op in op_list)
        max_end = max(op.end_time for op in op_list)
        # The span of the backward operators is used as the span of the backward
        # module; the -1 / +2 padding makes the module strictly contain its ops.
        return TraceEventBean({"ts": min_start - 1, "dur": float(max_end - min_start) + 2, "name": name})

    @classmethod
    def _trans_flow_to_dict(cls, flow_events: dict, end_events: list) -> dict:
        """Map each flow's start timestamp to the end-side events it points at.

        Args:
            flow_events: flow id -> {"start": bean, "end": bean}; flows missing
                either side are skipped.
            end_events: candidate events, matched to a flow's end point by equal
                ``start_time``.

        Returns:
            dict mapping flow start time -> list of matched end events.
        """
        end_event_dict = {event.start_time: event for event in end_events}
        result_data = {}
        for flow in flow_events.values():
            start_point = flow.get("start")
            end_point = flow.get("end")
            if not start_point or not end_point:
                continue
            end_event = end_event_dict.get(end_point.start_time)
            if end_event:
                result_data.setdefault(start_point.start_time, []).append(end_event)
        return result_data

    def build_graph(self):
        """Parse the profiling data and return every graph node, root node last."""
        self._prof_data = ProfDataPreProcess(self._prof_data_path).run()
        all_data = [*self._prof_data.get(Constant.MODULE_EVENT, []),
                    *self.find_bwd_module(),
                    *self._prof_data.get(Constant.CPU_OP_EVENT, [])]
        all_data.sort(key=lambda x: x.start_time)
        # Give every event a unique id "<name>_<k>", k counting occurrences of that name.
        name_dict = {}
        for event in all_data:
            order_id = name_dict.get(event.name, 0)
            event.set_id(f"{event.name}_{order_id}")
            name_dict[event.name] = order_id + 1
        root_node = TreeBuilder.build_tree(all_data, ProfNode, TraceEventBean({}, Constant.NPU_ROOT_ID))
        kernel_flow_dict = self._trans_flow_to_dict(self._prof_data.get(Constant.TORCH_TO_NPU_FLOW, {}),
                                                    self._prof_data.get(Constant.KERNEL_EVENT, []))
        for start_time, kernels in kernel_flow_dict.items():
            # Walk down the tree: every node whose range contains the launch time
            # gets the kernels added to its total list.
            matched_node = root_node.binary_search(start_time)
            while matched_node != Constant.INVALID_RETURN:
                matched_node.update_kernel_total_list(kernels)
                matched_node = matched_node.binary_search(start_time)
        all_data = root_node.find_all_child_nodes()
        all_data.append(root_node)
        return all_data

    def find_bwd_module(self) -> list:
        """Create synthetic module events that describe the backward pass.

        Two kinds of events are produced:
          * one "nn.Module: BACKWARD" event per consecutive run of ops on the
            backward thread, including a run that reaches the end of the op list;
          * one "<forward module name> [BACKWARD]" event per forward module that
            has backward ops linked to it through the fwd->bwd flows.

        Returns:
            list of TraceEventBean; empty when flows, modules or cpu ops are
            missing, or when no distinct backward thread can be identified.
        """
        bwd_module_list = []
        fwdbwd_flow = self._prof_data.get(Constant.FWD_BWD_FLOW, {})
        module_list = self._prof_data.get(Constant.MODULE_EVENT, [])
        cpu_op_list = self._prof_data.get(Constant.CPU_OP_EVENT, [])
        if not fwdbwd_flow or not module_list or not cpu_op_list:
            return bwd_module_list
        fwd_tid = module_list[0].tid
        bwd_tid = fwd_tid
        # The thread of the first available flow end point is taken as the backward thread.
        for end_point in (flow.get("end") for flow in fwdbwd_flow.values()):
            if end_point:
                bwd_tid = end_point.tid
                break
        if fwd_tid == bwd_tid:
            return bwd_module_list
        # Wrap each run of backward ops into one module named "nn.Module: BACKWARD"
        # (build_graph later turns that into ids such as "nn.Module: BACKWARD_0").
        cpu_op_list.sort(key=lambda x: x.start_time)
        bwd_op_list = []
        for op in cpu_op_list:
            if op.tid == bwd_tid:
                bwd_op_list.append(op)
            elif bwd_op_list:
                # First non-backward op after a backward run closes that run.
                bwd_module_list.append(self._create_event_bean_from_ops(bwd_op_list, "nn.Module: BACKWARD"))
                bwd_op_list = []
        if bwd_op_list:
            # A backward run that lasts until the final op has no closing op;
            # flush it so it is not silently dropped.
            bwd_module_list.append(self._create_event_bean_from_ops(bwd_op_list, "nn.Module: BACKWARD"))

        # Match forward modules through the flow links to build the overall
        # backward module hierarchy.
        root_node = TreeBuilder.build_tree(module_list, FwdModuleNode, TraceEventBean({}))
        fwdbwd_flow_dict = self._trans_flow_to_dict(fwdbwd_flow, cpu_op_list)
        for start_time, end_events in fwdbwd_flow_dict.items():
            matched_node = root_node.binary_search(start_time)
            while matched_node != Constant.INVALID_RETURN:
                matched_node.update_bwd_op(end_events)
                matched_node = matched_node.binary_search(start_time)
        for module_node in root_node.find_all_child_nodes():
            if module_node.bwd_op_list:
                bwd_module_list.append(
                    self._create_event_bean_from_ops(module_node.bwd_op_list, f"{module_node.name} [BACKWARD]"))
        return bwd_module_list
class ProfGraphExport:
    """Entry point that exports the profiling module graph to a JSON ``.vis`` file."""

    @staticmethod
    def export_to_json(prof_data_path: str, output_path: str):
        """Build the graph for ``prof_data_path`` and write it into ``output_path``.

        The output file is named ``prof_graph_json_<UTC timestamp, ms precision>.vis``.
        ``RuntimeError`` raised by validation, parsing or writing is logged and
        not propagated.

        Args:
            prof_data_path: profiling data location handed to ProfGraphBuilder.
            output_path: directory for the result; created if it does not exist.
        """
        # Local import: `timezone` is not among the names this file imports at the top.
        from datetime import timezone

        logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
        try:
            PathManager.input_path_common_check(prof_data_path)
            PathManager.check_input_directory_path(output_path)
            PathManager.make_dir_safety(output_path)
            all_nodes = ProfGraphBuilder(prof_data_path).build_graph()
            result_data = {"root": Constant.NPU_ROOT_ID, "node": {}}
            for node in all_nodes:
                result_data["node"][node.node_id] = node.info
            # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
            # "now" formats to the same string. [:-3] trims microseconds to milliseconds.
            timestamp = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S%f")[:-3]
            file_name = "prof_graph_json_{}.vis".format(timestamp)
            FileReader.write_json_file(output_path, result_data, file_name, format_json=True)
        except RuntimeError as err:
            logging.error(err)
class ProfDataPreProcess:
    """Locate a trace_view.json file and sort its events into categories."""

    def __init__(self, prof_data_path: str):
        self._prof_data_path = prof_data_path
        self._trace_path = ""
        # pid of the NPU process; discovered in _check_trace_data().
        self._kernel_pid = None
        self._result_data = {
            Constant.CPU_OP_EVENT: [],
            Constant.MODULE_EVENT: [],
            Constant.KERNEL_EVENT: [],
            Constant.TORCH_TO_NPU_FLOW: {},
            Constant.FWD_BWD_FLOW: {},
        }

    def run(self) -> dict:
        """Resolve the trace file, parse it, validate the result and return it."""
        self._check_trace_path()
        self._parse_trace_events()
        self._check_result_data()
        return self._result_data

    def _check_trace_path(self):
        """Set ``_trace_path`` from ``_prof_data_path``.

        A file path must end in ``.json``. A directory is searched for
        ``trace_view.json``, preferring its ``ASCEND_PROFILER_OUTPUT`` sub-folder
        when that exists.

        Raises:
            RuntimeError: wrong file suffix, or no trace_view.json found.
        """
        source = self._prof_data_path
        if os.path.isfile(source):
            suffix = os.path.splitext(os.path.basename(source))[1]
            if suffix != ".json":
                raise RuntimeError(f"Invalid profiling path suffix: {self._prof_data_path}. "
                                   f"You should input in a json file path, such as trace_view.json.")
            self._trace_path = source
            return

        candidate_dir = os.path.join(source, "ASCEND_PROFILER_OUTPUT")
        if not os.path.isdir(candidate_dir):
            candidate_dir = source
        json_path = os.path.join(candidate_dir, "trace_view.json")
        if os.path.isfile(json_path):
            self._trace_path = json_path
            return
        raise RuntimeError(f"Invalid profiling path: {self._prof_data_path}. The data path should be the "
                           f"folder that ends with the ascend_pt collected by the Ascend PyTorch Profiler.")

    def _parse_trace_events(self):
        """Read the trace file and distribute every event into ``_result_data``.

        The branch order matters: optimizer events are stored as modules before
        the generic cpu-op check, and step events among cpu ops are dropped.
        """
        trace_data = FileReader.read_json_file(self._trace_path)
        self._check_trace_data(trace_data)
        result = self._result_data
        for raw_event in trace_data:
            bean = TraceEventBean(raw_event)
            if bean.is_optimizer():
                result[Constant.MODULE_EVENT].append(bean)
            elif bean.is_cpu_op():
                if not bean.is_step():
                    result[Constant.CPU_OP_EVENT].append(bean)
            elif bean.is_nn_module():
                result[Constant.MODULE_EVENT].append(bean)
            elif bean.is_torch_to_npu():
                # Any flow event that is not a start is stored as the end point.
                side = "start" if bean.is_flow_start() else "end"
                result[Constant.TORCH_TO_NPU_FLOW].setdefault(bean.id, {})[side] = bean
            elif bean.is_fwd_bwd_flow():
                side = "start" if bean.is_flow_start() else "end"
                result[Constant.FWD_BWD_FLOW].setdefault(bean.id, {})[side] = bean
            elif bean.is_kernel_event(self._kernel_pid):
                result[Constant.KERNEL_EVENT].append(bean)

    def _check_trace_data(self, trace_data):
        """Validate the loaded trace and record the NPU process pid.

        Raises:
            RuntimeError: ``trace_data`` is not a list, or no NPU process
                metadata event yields a pid.
        """
        if not isinstance(trace_data, list):
            raise RuntimeError(f"Invalid profiling data path, this feature only supports performance data "
                               f"collected by Ascend PyTorch Profiler.")
        for raw_event in trace_data:
            candidate = TraceEventBean(raw_event)
            if not candidate.is_npu_process():
                continue
            # Only the first NPU process event is used.
            self._kernel_pid = candidate.pid
            break
        if self._kernel_pid is None:
            raise RuntimeError(
                f"There is no operator on the NPU side for this data, please check whether the NPU switch is enabled.")

    def _check_result_data(self):
        """Ensure both cpu ops and modules were collected.

        Raises:
            RuntimeError: either category is empty.
        """
        has_cpu_ops = bool(self._result_data.get(Constant.CPU_OP_EVENT))
        if not has_cpu_ops:
            raise RuntimeError(
                f"This data does not have any aten operator, please make sure to enable the CPU switch.")
        has_modules = bool(self._result_data.get(Constant.MODULE_EVENT))
        if not has_modules:
            raise RuntimeError(
                f"This data does not collect any modules, please make sure to turn on the with_stack switch.")
class BaseNode:
    """Generic tree node wrapping one trace event.

    Children are appended through update_child_nodes(); binary_search() assumes
    they are ordered by ascending ``start_time``.
    """

    def __init__(self, event: TraceEventBean, parent_node=None):
        self._event = event
        self._parent_node = parent_node
        self._child_nodes = []

    @property
    def parent_node(self):
        """Parent node, or None when no parent was given."""
        return self._parent_node

    @property
    def child_nodes(self):
        """List of direct children in insertion order."""
        return self._child_nodes

    @property
    def name(self):
        """Name of the wrapped event."""
        return self._event.name

    @property
    def start_time(self) -> Decimal:
        """Start time of the wrapped event."""
        return self._event.start_time

    @property
    def end_time(self) -> Decimal:
        """End time of the wrapped event."""
        return self._event.end_time

    def update_child_nodes(self, node):
        """Append ``node`` to the direct children."""
        self._child_nodes.append(node)

    def binary_search(self, ts_time):
        """Find the direct child whose time range strictly contains ``ts_time``.

        Returns:
            The matching child node, or ``Constant.INVALID_RETURN`` when there
            are no children or ``ts_time`` is not strictly inside the candidate's
            (start_time, end_time) interval.
        """
        if not self.child_nodes:
            return Constant.INVALID_RETURN
        right = len(self.child_nodes) - 1
        left = 0
        # Narrow down to the last child whose start_time <= ts_time. The upper
        # midpoint (ceil) guarantees progress when `left = mid` is taken.
        while right > left:
            mid = left + ceil((right - left) / 2)
            if ts_time >= self.child_nodes[mid].start_time:
                left = mid
            else:
                right = mid - 1
        candidate = self.child_nodes[left]
        if candidate.start_time < ts_time < candidate.end_time:
            return candidate
        return Constant.INVALID_RETURN

    def find_all_child_nodes(self) -> list:
        """Return every descendant of this node in breadth-first order (self excluded)."""
        # Local import: a plain deque replaces queue.Queue, whose per-operation
        # locking is only needed for inter-thread hand-off.
        from collections import deque

        result_data = []
        pending = deque(self.child_nodes)
        while pending:
            tree_node = pending.popleft()
            result_data.append(tree_node)
            pending.extend(tree_node.child_nodes)
        return result_data
class FileReader:
    """JSON read/write helpers with path and size checks."""
    DATA_FILE_AUTHORITY = 0o640
    DATA_DIR_AUTHORITY = 0o750

    @classmethod
    def read_json_file(cls, file_path: str) -> any:
        """Load and return the JSON content of ``file_path``.

        Returns:
            The parsed JSON value; an empty list when the file is empty.

        Raises:
            FileNotFoundError: ``file_path`` is not a regular file.
            RuntimeError: the path check fails, the file exceeds
                ``Constant.MAX_FILE_SIZE_5_GB``, or reading/parsing fails.
        """
        PathManager.check_path_readable(file_path)
        if not os.path.isfile(file_path):
            raise FileNotFoundError("File not exists.")
        file_size = os.path.getsize(file_path)
        if file_size <= 0:
            return []
        if file_size > Constant.MAX_FILE_SIZE_5_GB:
            msg = f"The file({file_path}) size exceeds the preset max value, failed to read the file."
            raise RuntimeError(msg)
        try:
            with open(file_path, "rt") as file:
                json_data = json.load(file)
        except Exception as e:
            msg = f"Can't read file: {file_path}"
            raise RuntimeError(msg) from e
        return json_data

    @classmethod
    def write_json_file(cls, output_path: str, data: dict, file_name: str, format_json: bool = False) -> None:
        """Serialize ``data`` as JSON into ``output_path/file_name``.

        Nothing is written when ``data`` is empty. The file is created with
        mode ``DATA_FILE_AUTHORITY`` and any existing content is replaced.

        Args:
            output_path: existing, writable directory.
            data: JSON-serializable content.
            file_name: name of the file inside ``output_path``.
            format_json: indent the output by 4 spaces when True.

        Raises:
            RuntimeError: the directory check fails or the file cannot be written.
        """
        if not data:
            return
        output_file = os.path.join(output_path, file_name)
        PathManager.check_path_writeable(output_path)
        # O_TRUNC: without it, overwriting a longer existing file would leave
        # stale bytes after the new content and produce invalid JSON.
        flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        try:
            with os.fdopen(os.open(output_file, flags, cls.DATA_FILE_AUTHORITY), 'w') as file:
                indent = 4 if format_json else None
                file.write(json.dumps(data, indent=indent))
        except Exception as e:
            raise RuntimeError(f"Can't create the file: {output_path}") from e
class PathManager:
    """Path validation and guarded filesystem helpers; failures are raised as RuntimeError."""
    MAX_PATH_LENGTH = 4096
    MAX_FILE_NAME_LENGTH = 255
    DATA_FILE_AUTHORITY = 0o640
    DATA_DIR_AUTHORITY = 0o750
    WINDOWS = "windows"

    @classmethod
    def check_input_directory_path(cls, path: str):
        """
        Function Description:
            validate a path that is expected to be a directory; the path is allowed
            to not exist yet, so existence is not verified
        Parameter:
            path: the path to check, absolute or relative
        Exception Description:
            RuntimeError when the common checks fail or the path is an existing file
        """
        cls.input_path_common_check(path)
        if not os.path.isfile(path):
            return
        raise RuntimeError(f"Invalid input path which is a file path: {os.path.basename(path)}")

    @classmethod
    def check_input_file_path(cls, path: str):
        """
        Function Description:
            validate a path that is expected to be a file; the path is allowed
            to not exist yet, so existence is not verified
        Parameter:
            path: the file path to check, absolute or relative
        Exception Description:
            RuntimeError when the common checks fail or the path is an existing directory
        """
        cls.input_path_common_check(path)
        if not os.path.isdir(path):
            return
        raise RuntimeError(f"Invalid input path which is a directory path: {os.path.basename(path)}")

    @classmethod
    def check_path_length(cls, path: str):
        """Raise RuntimeError when the path, or any component split on "/" or "\\", is too long."""
        if len(path) > cls.MAX_PATH_LENGTH:
            raise RuntimeError("Length of input path exceeds the limit.")
        components = (name for segment in path.split("/") for name in segment.split("\\"))
        if any(len(name) > cls.MAX_FILE_NAME_LENGTH for name in components):
            raise RuntimeError("Length of input path exceeds the limit.")

    @classmethod
    def input_path_common_check(cls, path: str):
        """Check length, reject soft links and restrict the path to an allowed character set.

        On Windows the allowed set additionally contains ":", "\\" and the
        characters in the range U+4E00..U+9FA5.
        """
        cls.check_path_length(path)

        if os.path.islink(path):
            raise RuntimeError(f"Invalid input path which is a soft link.")

        on_windows = platform.system().lower() == cls.WINDOWS
        if not on_windows:
            pattern = r'(\.|/|_|-|\s|[~0-9a-zA-Z])+'
        else:
            pattern = r'(\.|:|\\|/|_|-|\s|[~0-9a-zA-Z\u4e00-\u9fa5])+'
        if re.fullmatch(pattern, path) is None:
            raise RuntimeError(f"Invalid input path.")

    @classmethod
    def check_path_owner_consistent(cls, path: str):
        """
        Function Description:
            check that the path exists and, except on Windows, that it is owned by the
            current process user; on a mismatch the user is asked interactively
            whether to continue
        Parameter:
            path: the path to check
        Exception Description:
            RuntimeError when the path does not exist or the user does not answer "y"
        """
        if not os.path.exists(path):
            raise RuntimeError(f"Invalid path: {os.path.basename(path)}")
        if platform.system().lower() == cls.WINDOWS:
            return
        if os.stat(path).st_uid == os.getuid():
            return
        answer = input("The path does not belong to you, do you want to continue? [y/n]")
        if answer.lower() != "y":
            raise RuntimeError("The user choose not to continue.")

    @classmethod
    def _check_path_access(cls, path, access_mode):
        """Shared owner / soft-link / os.access check behind the readable and writeable checks."""
        cls.check_path_owner_consistent(path)
        if os.path.islink(path):
            raise RuntimeError(f"Invalid path which is a soft link.")
        if not os.access(path, access_mode):
            raise RuntimeError(f"The path permission check failed: {os.path.basename(path)}")

    @classmethod
    def check_path_writeable(cls, path):
        """
        Function Description:
            check whether the path is writable
        Parameter:
            path: the path to check
        Exception Description:
            RuntimeError when the owner check fails, the path is a soft link or is not writable
        """
        cls._check_path_access(path, os.W_OK)

    @classmethod
    def check_path_readable(cls, path):
        """
        Function Description:
            check whether the path is readable
        Parameter:
            path: the path to check
        Exception Description:
            RuntimeError when the owner check fails, the path is a soft link or is not readable
        """
        cls._check_path_access(path, os.R_OK)

    @classmethod
    def remove_path_safety(cls, path: str):
        """Remove the tree at ``path`` with shutil.rmtree; soft links are rejected, missing paths ignored."""
        failure = f"Failed to remove path: {os.path.basename(path)}"
        if os.path.islink(path):
            raise RuntimeError(failure)
        if not os.path.exists(path):
            return
        try:
            shutil.rmtree(path)
        except Exception as err:
            raise RuntimeError(failure) from err

    @classmethod
    def make_dir_safety(cls, path: str):
        """Create ``path`` (and parents) with mode DATA_DIR_AUTHORITY unless it already exists."""
        failure = f"Failed to make directory: {os.path.basename(path)}"
        if os.path.islink(path):
            raise RuntimeError(failure)
        if os.path.exists(path):
            return
        try:
            os.makedirs(path, mode=cls.DATA_DIR_AUTHORITY)
        except Exception as err:
            raise RuntimeError(failure) from err

    @classmethod
    def create_file_safety(cls, path: str):
        """Create an empty file at ``path`` with mode DATA_FILE_AUTHORITY unless it already exists."""
        failure = f"Failed to create file: {os.path.basename(path)}"
        if os.path.islink(path):
            raise RuntimeError(failure)
        if os.path.exists(path):
            return
        try:
            descriptor = os.open(path, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY)
            os.close(descriptor)
        except Exception as err:
            raise RuntimeError(failure) from err

    @classmethod
    def get_realpath(cls, path: str) -> str:
        """Return os.path.realpath(path); a path that is itself a soft link is rejected."""
        if os.path.islink(path):
            raise RuntimeError(f"Invalid input path which is a soft link.")
        return os.path.realpath(path)
class TraceEventBean(AnalyzeDict):
    """One trace event with timing helpers and category predicates.

    Event fields (``ts``, ``dur``, ``cat``, ``ph``, ``pid``, ``name``, ``args``)
    are read as attributes; NOTE(review): presumably these are provided by
    AnalyzeDict from the ``data`` dict - confirm against that class.
    """

    def __init__(self, data: dict, unique_id: int = None):
        super().__init__(data)
        self._id = unique_id

    @property
    def unique_id(self):
        """Identifier given at construction or through set_id()."""
        return self._id

    @property
    def start_time(self) -> Decimal:
        """Event timestamp ``ts`` converted with convert_to_decimal."""
        return convert_to_decimal(self.ts)

    @property
    def end_time(self) -> Decimal:
        """Start time plus the converted event duration ``dur``."""
        duration = convert_to_decimal(self.dur)
        return self.start_time + duration

    def set_id(self, name_id):
        """Replace the unique id with ``name_id``."""
        self._id = name_id

    def is_cpu_op(self):
        """True for events of category "cpu_op"."""
        return self.cat == "cpu_op"

    def is_optimizer(self):
        """True for "cpu_op" events whose name starts with "optimizer" (case-insensitive)."""
        if self.cat != "cpu_op":
            return False
        return self.name.lower().startswith("optimizer")

    def is_nn_module(self):
        """True for "python_function" events whose name starts with "nn.module" (case-insensitive)."""
        if self.cat != "python_function":
            return False
        return self.name.lower().startswith("nn.module")

    def is_step(self):
        """True when the name starts with "profilerstep#" (case-insensitive)."""
        lowered_name = self.name.lower()
        return lowered_name.startswith("profilerstep#")

    def is_torch_to_npu(self):
        """True for events of category "async_npu"."""
        return self.cat == "async_npu"

    def is_fwd_bwd_flow(self):
        """True for events of category "fwdbwd"."""
        return self.cat == "fwdbwd"

    def is_flow_start(self):
        """True when the event phase is "s"."""
        return self.ph == "s"

    def is_flow_end(self):
        """True when the event phase is "f"."""
        return self.ph == "f"

    def is_kernel_event(self, kernel_pid):
        """True for phase "X" events whose pid equals ``kernel_pid``."""
        return self.ph == "X" and self.pid == kernel_pid

    def is_npu_process(self):
        """True for the "process_name" metadata event (phase "M") named "Ascend Hardware"."""
        if self.ph != "M" or self.name != "process_name":
            return False
        return self.args.get("name", "") == "Ascend Hardware"
class TreeBuilder:
    """Build a containment tree from events ordered by start time."""

    @classmethod
    def build_tree(cls, event_list: list, node_class: any, root_bean: any):
        """Nest ``event_list`` under a new root node and return that root.

        ``event_list`` is sorted in place by ``start_time``. Each event becomes a
        child of the most recently created node that it does not start after;
        if no such node remains, it is attached to the root.

        Args:
            event_list: events exposing ``start_time``.
            node_class: callable ``node_class(event, parent_node)`` producing
                nodes with ``end_time``, ``parent_node`` and ``update_child_nodes``.
            root_bean: event wrapped by the root node.
        """
        root_node = node_class(root_bean)
        event_list.sort(key=lambda x: x.start_time)
        current = root_node
        for event in event_list:
            # Climb until a node that may contain the event is reached; the
            # root is never left.
            while current and current != root_node and event.start_time > current.end_time:
                current = current.parent_node
            if not current:
                continue
            child = node_class(event, current)
            current.update_child_nodes(child)
            current = child
        return root_node
def convert_to_decimal(data: any) -> Decimal:
    """Convert ``data`` to ``Decimal``, falling back to ``Decimal(0)`` on failure.

    Args:
        data: any value accepted by the ``Decimal`` constructor.

    Returns:
        The converted value. When the conversion fails an error is logged and
        ``Decimal(0)`` is returned, so the result can always be combined with
        other ``Decimal`` values (a float fallback would raise ``TypeError`` in
        ``Decimal + float`` arithmetic).
    """
    try:
        return Decimal(data)
    except (ArithmeticError, TypeError, ValueError):
        # decimal.InvalidOperation (malformed text) derives from ArithmeticError;
        # unsupported types raise TypeError and malformed tuples ValueError.
        logging.error('Invalid profiling data which failed to convert data to decimal.')
        return Decimal(0)