diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py index f71a9d4c545fed7138c88eb87e91bf19d83609b1..2d1613c47b12063b32dd9c46edaaf794b17c2053 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py @@ -1,8 +1,9 @@ import os +from concurrent.futures import ThreadPoolExecutor import torch from ..common.utils import Const, check_switch_valid, generate_compare_script, check_is_npu, print_error_log, \ CompareException, print_warn_log -from ..dump.dump import DumpUtil, acc_cmp_dump, write_to_disk, get_pkl_file_path, reset_module_count, GLOBAL_THREAD_POOL +from ..dump.dump import DumpUtil, acc_cmp_dump, write_to_disk, get_pkl_file_path, reset_module_count from ..dump.utils import set_dump_path, set_dump_switch_print_info, generate_dump_path_str, \ set_dump_switch_config, set_backward_input from ..overflow_check.utils import OverFlowUtil @@ -108,6 +109,7 @@ class PrecisionDebugger: register_hook_core(instance.hook_func, instance.model) instance.first_start = False DumpUtil.dump_switch = "ON" + DumpUtil.dump_thread_pool = ThreadPoolExecutor() OverFlowUtil.overflow_check_switch = "ON" dump_path_str = generate_dump_path_str() set_dump_switch_print_info("ON", DumpUtil.dump_switch_mode, dump_path_str) @@ -130,8 +132,8 @@ class PrecisionDebugger: dump_path_str = generate_dump_path_str() set_dump_switch_print_info("OFF", DumpUtil.dump_switch_mode, dump_path_str) write_to_disk() - if DumpUtil.is_single_rank: - GLOBAL_THREAD_POOL.shutdown(wait=True) + if DumpUtil.is_single_rank and DumpUtil.dump_thread_pool: + DumpUtil.dump_thread_pool.shutdown(wait=True) if check_is_npu() and DumpUtil.dump_switch_mode in [Const.ALL, Const.API_STACK, Const.LIST, Const.RANGE, Const.API_LIST]: generate_compare_script(DumpUtil.dump_data_dir, get_pkl_file_path(), DumpUtil.dump_switch_mode) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py index a6b769ff2a41955aa6363f08d086a091335bf5f1..ea61be86364504885a3db240e60ddd60d360564b 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py @@ -47,7 +47,6 @@ pkl_name = "" rank = os.getpid() + 100000 multi_output_apis = ["_sort_", "npu_flash_attention"] module_count = {} -GLOBAL_THREAD_POOL = ThreadPoolExecutor() class APIList(list): @@ -186,12 +185,12 @@ def dump_data(prefix, data_info): def thread_dump_data(prefix, data_info): - GLOBAL_THREAD_POOL.submit(dump_data, prefix, data_info) + DumpUtil.dump_thread_pool.submit(dump_data, prefix, data_info) def dump_data_by_rank_count(dump_step, prefix, data_info): print_info_log(f"ptdbg is analyzing rank{rank} api: {prefix}" + " " * 10, end='\r') - if DumpUtil.is_single_rank: + if DumpUtil.is_single_rank and DumpUtil.dump_thread_pool: thread_dump_data(prefix, data_info) else: dump_data(prefix, data_info) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index bb0ae82c4110d568e31cd21a261f5875548c8672..bea18501aefaba4916cba7ad3b7e455d0c29033c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ 
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -87,6 +87,7 @@ class DumpUtil(object): need_replicate = False summary_mode = "all" is_single_rank = None + dump_thread_pool = None @staticmethod diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py index ed28dbe5d6712a76e5d0fc249a24911cf3e50b63..b40a7b60a7bad1178faf4e122f807cd2ba76e738 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_distributed.py @@ -19,7 +19,7 @@ import os import torch.distributed as dist import yaml - +from functools import wraps from .hook_module import HOOKModule from ..common.utils import torch_device_guard, Const from ..common.file_check_util import FileOpen @@ -60,9 +60,12 @@ class DistributedOPTemplate(HOOKModule): def wrap_distributed_op(op_name, hook): + + @wraps(DistributedOPTemplate) def distributed_op_template(*args, **kwargs): return DistributedOPTemplate(op_name, hook)(*args, **kwargs) - + + distributed_op_template.__name__ = op_name return distributed_op_template diff --git a/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py b/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py index 2b6e5270f278276521b20eae225b0c004a77a2f7..336bef7dd8553eb82586d52260443a7d01e84ab0 100644 --- a/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py +++ b/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py @@ -15,11 +15,13 @@ from common_func_advisor.constant import Constant from advice_factory.advice_factory import AdviceFactory from compute_advice.npu_fused_advice import NpuFusedAdvice +from compute_advice.npu_slow_advice import NpuSlowAdvice class ComputeAdviceFactory(AdviceFactory): ADVICE_LIB = { Constant.NPU_FUSED: NpuFusedAdvice, + Constant.NPU_SLOW: NpuSlowAdvice, } def __init__(self, collection_path: str): diff --git a/profiler/advisor/advisor_backend/common_func_advisor/constant.py b/profiler/advisor/advisor_backend/common_func_advisor/constant.py index 34879db9f2c078854aab6cfe658fc46865b885df..46a7fb24c2dade75c157f18118f29233eb924b88 100644 --- a/profiler/advisor/advisor_backend/common_func_advisor/constant.py +++ b/profiler/advisor/advisor_backend/common_func_advisor/constant.py @@ -15,11 +15,104 @@ from enum import Enum +class CsvTitle: + MODEL_NAME = "Model Name" + MODEL_ID = "Model ID" + TASK_ID = "Task ID" + STREAM_ID = "Stream ID" + INFER_ID = "Infer ID" + TASK_START_TIME = "Task Start Time(us)" + TASK_WAIT_TIME = "Task Wait Time(us)" + BLOCK_DIM = "Block Dim" + MIX_BLOCK_DIM = "Mix Block Dim" + HF32_ELIGIBLE = "HF32 Eligible" + INPUT_SHAPES = "Input Shapes" + INPUT_DATA_TYPES = "Input Data Types" + INPUT_FORMATS = "Input Formats" + OUTPUT_SHAPES = "Output Shapes" + OUTPUT_DATA_TYPES = "Output Data Types" + OUTPUT_FORMATS = "Output Formats" + CONTEXT_ID = "Context ID" + AICORE_TIME = "aicore_time(us)" + AIC_TOTAL_CYCLES = "aic_total_cycles" + AIC_MAC_TIME = "aic_mac_time(us)" + AIC_MAC_RATIO = "aic_mac_ratio" + AIC_SCALAR_TIME = "aic_scalar_time(us)" + AIC_SCALAR_RATIO = "aic_scalar_ratio" + AIC_MTE1_TIME = "aic_mte1_time(us)" + AIC_MTE1_RATIO = "aic_mte1_ratio" + AIC_MTE2_TIME = "aic_mte2_time(us)" + AIC_MTE2_RATIO = "aic_mte2_ratio" + AIC_FIXPIPE_TIME = "aic_fixpipe_time(us)" + AIC_FIXPIPE_RATIO = "aic_fixpipe_ratio" + 
AIC_ICACHE_MISS_RATE = "aic_icache_miss_rate" + AIV_TIME = "aiv_time(us)" + AIV_TOTAL_CYCLES = "aiv_total_cycles" + AIV_VEC_TIME = "aiv_vec_time(us)" + AIV_VEC_RATIO = "aiv_vec_ratio" + AIV_SCALAR_TIME = "aiv_scalar_time(us)" + AIV_SCALAR_RATIO = "aiv_scalar_ratio" + AIV_MTE2_TIME = "aiv_mte2_time(us)" + AIV_MTE2_RATIO = "aiv_mte2_ratio" + AIV_MTE3_TIME = "aiv_mte3_time(us)" + AIV_MTE3_RATIO = "aiv_mte3_ratio" + AIV_ICACHE_MISS_RATE = "aiv_icache_miss_rate" + CUBE_UTILIZATION = "cube_utilization( %)" + TASK_DURATION_SUM = "Task Duration Sum(us)" + TASK_DURATION_MEAN = "Task Duration Mean(us)" + TASK_DURATION_STD = "Task Duration Std(us)" + TASK_DURATION_RATIO = "Task Duration Ratio(100%)" + SIZE = "size(MB)" + THROUGHPUT = "throughput(GB/s)" + COLOR = "color" + GAP = "Gap(us)" + DURATION_SUM = "Duration Sum(us)" + COUNT = "Count" + MAX_DURATION = "Max Duration(us)" + MIN_DURATION = "Min Duration(us)" + AVG_DURATION = "Avg Duration(us)" + DURATION_RATIO = "Duration Ratio" + INDEX = "Index" + + +# CsvTitleV1 inherits from CsvTitle and matches the old csv headers +class CsvTitleV1(CsvTitle): + OP_NAME = "Op Name" + OP_TYPE = "OP Type" + TASK_TYPE = "Task Type" + TASK_DURATION = "Task Duration(us)" + + +# CsvTitleV2 inherits from CsvTitle and matches the new csv headers +class CsvTitleV2(CsvTitle): + OP_NAME = "Name" + OP_TYPE = "Type" + TASK_TYPE = "Accelerator Core" + TASK_DURATION = "Duration(us)" + + class Constant: + DTYPE_SIZE_MAP = {"int8": 1, "uint8": 1, + "int16": 2, "uint16": 2, + "int32": 4, "uint32": 4, + "int64": 8, "uint64": 8, + "float16": 2, + "bfloat16": 2, + "bf16": 2, + "dt_bf16": 2, + "float32": 4, + "float": 4, + "float64": 8, + "complex64": 8, + "complex128": 16, + "bool": 1} + TP_THRESHOLD = 1150 MAX_INPUT_MODE_LEN = 30 MAX_INPUT_ADVICE_LEN = 30 SMALL_OP_DUR_RATIO = 0.2 SMALL_OP_NUM_RATIO = 0.2 + BYTE_UNIT_TRANS = 1024 + UNIT_TRANS = 1000 # mode list COMPUTE = "compute" @@ -35,6 +128,7 @@ class Constant: # compute NPU_FUSED = "npu_fused" + NPU_SLOW = "npu_slow" # timeline OPTIM = "optimizer" @@ -108,3 +202,24 @@ class Constant: ("Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast"): "torch_npu.npu_scaled_masked_softmax", ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul"): "torch_npu.npu_rotary_mul", ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "torch_npu.npu_rms_norm"} + TITLE = CsvTitleV2 + + @classmethod + def update_title(cls): + cls.TITLE = CsvTitleV1 + + +class CoreType: + AIV = "AI_VECTOR_CORE" + AIC = "AI_CORE" + AICPU = "AI_CPU" + MIX_AIV = "MIX_AIV" + MIX_AIC = "MIX_AIC" + HCCL = "HCCL" + + +class PerfColor(Enum): + WHITE = 0 + GREEN = 1 + YELLOW = 2 + RED = 3 diff --git a/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py b/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py index 08ef02876561001b9721e365c9aa6934057674de..8171f06ee235fc02da715044b4d310087c36c102 100644 --- a/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py +++ b/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
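For orientation, the `CsvTitleV1`/`CsvTitleV2` pair above lets the rest of the advisor address columns through `Constant.TITLE` regardless of which csv generation produced the data; `Constant.update_title()` flips the default (`CsvTitleV2`) back to the old headers. A minimal sketch of that indirection, using tiny stand-in classes (the real switch is triggered from `OpPerf.__init__` when it finds an "OP Type" key in a row):

```python
# Sketch only: stand-ins for the CsvTitle classes defined above.
class CsvTitleV1:
    TASK_DURATION = "Task Duration(us)"  # old kernel_details.csv header

class CsvTitleV2:
    TASK_DURATION = "Duration(us)"       # new kernel_details.csv header

def pick_title(row: dict):
    # Mirrors the check in OpPerf.__init__: old csv rows carry "OP Type".
    return CsvTitleV1 if "OP Type" in row else CsvTitleV2

row = {"Type": "Slice", "Duration(us)": 21.2}
print(row[pick_title(row).TASK_DURATION])  # -> 21.2
```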
- +import os from abc import abstractmethod from dataclasses import dataclass from dataclasses import field from typing import Dict from typing import List +import pandas as pd + from common_func.file_manager import FileManager @@ -89,9 +91,34 @@ class TraceViewJson: self.cann_dur_events: Dict[str, DurationEvent] = dict() self.ascend_hardware_dur_events: Dict[str, DurationEvent] = dict() self.torch_2_npu_flow_events: Dict[str, FlowEvent] = dict() - traces = FileManager.read_json_file(path) self._load_obj(traces) + + def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str: + if ts_col not in data.columns.tolist(): + print("[ERROR] No {} col found in data columns.".format(ts_col)) + return "" + row = data.loc[index_id] + timestamp = row[ts_col] + flow_event = self.get_torch_2_npu_flow_event(timestamp) + if not flow_event.valid(): + print("[ERROR] Get flow event failed for pattern {}.".format(row['pattern'])) + return "" + flow_event_s_key = flow_event.s_point_ts + python_dur_events = self.get_python_dur_events_contain_ts(flow_event_s_key) + if not python_dur_events: + print("[ERROR] No python dur event found for pattern {}.".format(row['pattern'])) + return "" + # keep compatibility between the old and new call stack formats + if python_dur_events[0].args.get("Call stack"): + # old format + call_stack_list = python_dur_events[0].args.get("Call stack").split(";") + else: + python_dur_events.sort(key=lambda e: e.ts) + # new format + call_stack_list = [event.name for event in python_dur_events if event.cat == "python_function"] + call_stack = "\n".join(call_stack_list) + return call_stack def get_torch_2_npu_flow_event(self, end_time) -> FlowEvent: if not self.torch_2_npu_flow_events or not self.torch_2_npu_flow_events.get(end_time): diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py b/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py index 5411610a7f4229c6f01c04e352d380f3a2864784..c85c14d618ceda199c9c376abc27a3581eed97b8 100644 --- a/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py +++ b/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py @@ -28,18 +28,10 @@ class CSVAnalyzer: def process(self): df = pd.read_csv(self._path, dtype={"Start Time(us)": str}) - - - pool = multiprocessing.Pool(multiprocessing.cpu_count()) - # 数据预解析 - result = pool.map(self.update_op_row, df.iterrows()) - pool.close() - - preparse_df = pd.DataFrame(result) # check whether there are operators that can be fused - op_type_list = preparse_df["Type"].tolist() - duration_list = preparse_df["Duration(us)"].tolist() - start_times = preparse_df["Start Time(us)"].tolist() + op_type_list = df["Type"].tolist() + duration_list = df["Duration(us)"].tolist() + start_times = df["Start Time(us)"].tolist() # strip the trailing \t separator start_times = [start_time[:-1] for start_time in start_times] result_list = [] @@ -50,10 +42,6 @@ class CSVAnalyzer: "index", "first_timestamp"] return data_frame - @staticmethod - def update_op_row(row): - return OpPerfFactory.build(row[1]).update() - @staticmethod def find_all_sub_lists(op_type_list, duration_list, start_times, expect_sub_list): # create an empty dict to store each sub-list with its occurrence count and start positions diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py b/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py index 2442807fd10b7942177990d2283ad34c369659bd..7bcbed5a75807b57a55787c743cfaaff55a68589 100644 --- a/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py +++ b/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py @@ -12,19 +12,29 
@@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import functools from typing import Dict + from common_func_advisor.constant import Constant +from common_func_advisor.constant import CoreType +from common_func_advisor.constant import PerfColor class OpPerfFactory: @classmethod def build(cls, op_row: Dict): - return OpPerf(op_row) + if op_row.get(Constant.TITLE.TASK_TYPE) == CoreType.AIV: + return VecOpPerf(op_row) + elif op_row.get(Constant.TITLE.TASK_TYPE) == CoreType.AIC: + return CubeOpPerf(op_row) + else: + return OpPerf(op_row) class OpPerf: def __init__(self, op_row: Dict): + if "OP Type" in op_row.keys(): + Constant.update_title() self.row = op_row self.model_name = op_row.get("Model Name") self.model_id = op_row.get("Model ID") @@ -75,6 +85,112 @@ class OpPerf: self.aiv_mte3_ratio = op_row.get("aiv_mte3_ratio") self.aiv_icache_miss_rate = op_row.get("aiv_icache_miss_rate") self.cube_utilization = op_row.get("cube_utilization( %)") + + @staticmethod + def get_dtype_size(dtype_str: str): + return Constant.DTYPE_SIZE_MAP.get(dtype_str.lower(), 0) + + @staticmethod + def get_element_count(shape: list): + return functools.reduce(lambda x, y: int(x) * int(y), shape) + + @staticmethod + def shape_to_tuple(shape_str: str) -> tuple: + if not isinstance(shape_str, str): + return tuple() + shape_str = shape_str.strip('"') + split_shape = shape_str.strip(';') + if not split_shape: + return tuple() + pairs = split_shape.split(';') + shape_result = [] + for pair in pairs: + pair = pair.strip(";") + elements = pair.split(',') + elements = tuple(int(element) if "" != element else 0 for element in elements) + shape_result.append(elements) + return tuple(shape_result) + + @staticmethod + def dtype_to_tuple(dtypes_str: str) -> tuple: + if not isinstance(dtypes_str, str): + return tuple() + dtypes_str = dtypes_str.strip('"') + split_dtypes = dtypes_str.strip(';') + if not split_dtypes: + return tuple() + pairs = split_dtypes.split(';') + return tuple(pairs) + + def get_mac_ratio(self): + return self.aic_mac_ratio + + def get_size(self, shapes_str, dtypes_str): + shapes = self.shape_to_tuple(shapes_str) + dtypes = self.dtype_to_tuple(dtypes_str) + if len(shapes) > len(dtypes): + print("[ERROR] The size of shape is greater than that of dtypes.") + return 0 + if len(shapes) < len(dtypes): + shapes = list(shapes) + shapes.extend([(1,)] * (len(dtypes) - len(shapes))) + all_size = 0 + for index, shape in enumerate(shapes): + element_count = self.get_element_count(shape) + dtype_size = self.get_dtype_size(dtypes[index]) + all_size += element_count * dtype_size + return all_size + + def get_calc_size(self): + # input and output bytes (MB) + if not self.input_shapes or not self.output_shapes: + print("[ERROR] There is no tensor data, do not assess vector op performance.") + return 0 + input_size = self.get_size(self.input_shapes, self.input_data_types) + output_size = self.get_size(self.output_shapes, self.output_data_types) + return (input_size + output_size) / (Constant.BYTE_UNIT_TRANS * Constant.BYTE_UNIT_TRANS) + + def get_throughput(self): + # throughput(GB/s) + if not self.task_duration or abs(self.task_duration) < 1e-6: + print("[ERROR] There is no task_duration, do not assess vector op performance.") + return 0 + return self.row[Constant.TITLE.SIZE] / Constant.BYTE_UNIT_TRANS / self.task_duration * Constant.UNIT_TRANS * Constant.UNIT_TRANS + + def get_perf_color(self): + return 
PerfColor.WHITE def update(self): + self.row[Constant.TITLE.SIZE] = self.get_calc_size() + self.row[Constant.TITLE.THROUGHPUT] = self.get_throughput() + self.row[Constant.TITLE.COLOR] = self.get_perf_color().name return self.row + + +class VecOpPerf(OpPerf): + def get_perf_color(self) -> PerfColor: + throughput = self.row[Constant.TITLE.THROUGHPUT] + op_duration = self.task_duration + tp_threshold = Constant.TP_THRESHOLD + if throughput == 0: + return PerfColor.WHITE + if throughput < tp_threshold / 2 and op_duration > 20: + return PerfColor.RED + elif tp_threshold / 2 <= throughput < tp_threshold: + return PerfColor.YELLOW + else: + return PerfColor.GREEN + + +class CubeOpPerf(OpPerf): + def get_perf_color(self) -> PerfColor: + aic_mac_ratio = self.get_mac_ratio() + if not aic_mac_ratio: + print("[WARNING] There is no aic_mac_ratio, do not assess cube op performance.") + return PerfColor.WHITE + elif aic_mac_ratio < 0.6: + return PerfColor.RED + elif 0.6 <= aic_mac_ratio < 0.8: + return PerfColor.YELLOW + else: + return PerfColor.GREEN diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py b/profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py new file mode 100644 index 0000000000000000000000000000000000000000..caff1c792c2171c33a4dd876b0741d6c215c5766 --- /dev/null +++ b/profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py @@ -0,0 +1,82 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
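The two `OpPerf` subclasses above encode the rating rules: vector ops are judged by achieved throughput against `Constant.TP_THRESHOLD` (1150 GB/s) plus a 20 us duration gate, cube ops by `aic_mac_ratio` bands at 0.6 and 0.8. A self-contained sketch of the vector rule, with made-up inputs:

```python
from enum import Enum

class PerfColor(Enum):
    WHITE = 0
    GREEN = 1
    YELLOW = 2
    RED = 3

TP_THRESHOLD = 1150  # GB/s, as Constant.TP_THRESHOLD above

def rate_vec_op(throughput: float, duration_us: float) -> PerfColor:
    # Mirrors VecOpPerf.get_perf_color above.
    if throughput == 0:
        return PerfColor.WHITE
    if throughput < TP_THRESHOLD / 2 and duration_us > 20:
        return PerfColor.RED
    if TP_THRESHOLD / 2 <= throughput < TP_THRESHOLD:
        return PerfColor.YELLOW
    return PerfColor.GREEN

print(rate_vec_op(2.16, 21.2).name)   # RED: far below threshold and long-running
print(rate_vec_op(1200.0, 5.0).name)  # GREEN: at or above threshold
```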
+from abc import ABC +import multiprocessing + +import pandas as pd + +from compute_advice.compute_advice_base import ComputeAdviceBase +from compute_advice.npu_fused.op_perf import OpPerfFactory +from common_func_advisor.constant import Constant +from common_func_advisor.constant import PerfColor +from advisor_backend.common_func_advisor.trace_view_json import TraceViewJson + + +class NpuSlowAdvice(ComputeAdviceBase, ABC): + OP_PERF_SHEET = "op_perf" + + def __init__(self, collection_path: str): + super().__init__(collection_path) + self.kernel_details_path = "" + self.data = pd.DataFrame() + + @staticmethod + def save_to_excel(data: pd.DataFrame, file_path: str) -> None: + writer = pd.ExcelWriter(file_path, engine="xlsxwriter", mode="w") + data.index.name = Constant.TITLE.INDEX + data.to_excel(writer, index=True, sheet_name=NpuSlowAdvice.OP_PERF_SHEET) + NpuSlowAdvice.color_sheet(data, writer.book, writer.sheets[NpuSlowAdvice.OP_PERF_SHEET]) + writer.sheets[NpuSlowAdvice.OP_PERF_SHEET].freeze_panes = "A2" + writer.close() + + @staticmethod + def color_sheet(data: pd.DataFrame, workbook, worksheet): + color_rgb = { + PerfColor.GREEN.name: workbook.add_format({'bg_color': '#C6EFCE'}), + PerfColor.YELLOW.name: workbook.add_format({'bg_color': '#FFEB9C'}), + PerfColor.RED.name: workbook.add_format({'bg_color': '#FFC7CE'}), + } + for row in data.iterrows(): + color = row[1][Constant.TITLE.COLOR] + fill_format = color_rgb.get(color) + if not fill_format: + continue + worksheet.set_row(row[0] + 1, None, fill_format) + + @staticmethod + def update_op_row(row: tuple): + return OpPerfFactory.build(row[1]).update() + + def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str: + if not self.has_callstack(): + print("There is no call stack info, please set 'with_stack=True'") + return "" + trace_json = TraceViewJson(self.trace_view_path) + return trace_json.get_call_stack(data, index_id, ts_col) + + def run(self): + if not self.path_check(): + return self.data + self.process() + return self.data + + def process(self): + self.data = pd.read_csv(self.kernel_details_path, dtype={"Start Time(us)": str}) + # strip the trailing \t separator + self.data["Start Time(us)"] = self.data["Start Time(us)"].apply(lambda x: x[:-1]) + pool = multiprocessing.Pool(multiprocessing.cpu_count()) + result = pool.map(self.update_op_row, self.data.iterrows()) + pool.close() + self.data = pd.DataFrame(result) diff --git a/profiler/advisor/compute_perf_analysis.ipynb b/profiler/advisor/compute_perf_analysis.ipynb index 27c9caf37bf43871f319a9418294953f54f9cafd..e7a663130c8da335129513a5ca1a99cf28fe48b7 100644 --- a/profiler/advisor/compute_perf_analysis.ipynb +++ b/profiler/advisor/compute_perf_analysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2024-02-21T09:19:13.937531900Z", @@ -11,6 +11,7 @@ }, "outputs": [], "source": [ + "import os\n", "import pandas as pd\n", "\n", "from advisor_backend.interface import Interface\n", @@ -24,15 +25,18 @@ "# Operator tuning analysis\n", "## 1. Preparing data for operator analysis\n", "The operator analysis tool currently supports ascend_pt directories generated by the Ascend PyTorch Profiler\n", - "## 2. 算子分析解决的问题\n", + "## 2. Fused operator analysis\n", "Detects small operators in the model that can be fused and gives optimization advice.\n", "\n", "\"更多融合算子信息,请查阅 https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/700alpha003/processormodel/hardwaredesc_0001.html" + "\"For more information on fused operators, see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/700alpha003/processormodel/hardwaredesc_0001.html\n", + "\n", + "## 3. Slow operator analysis\n",
+ "Detects compute operators with abnormal performance in the model" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2024-02-22T08:41:17.455567500Z", @@ -44,18 +48,75 @@ "name": "stdout", "output_type": "stream", "text": [ - "[INFO] Start to analyse the target file: C:\\data\\ascend_pt\\ASCEND_PROFILER_OUTPUT\\kernel_details.csv\n", - " pattern_name pattern len count duration sum(us) op durations(us) index\n", - "18 torch_npu.npu_swiglu (Slice, Slice, Swish, Mul) 4 1 12.56 [3.14, 3.14, 3.14, 3.14] [0]\n", + "[INFO] Start to analyse the target file: D:\\work\\ascend_pt\\ASCEND_PROFILER_OUTPUT\\kernel_details.csv\n" + ] + }, + { + "data": { + "text/html": [ + "
[rendered text/html table omitted: tag-stripped DataFrame markup, same pattern_name/pattern/len/count/duration rows as the text/plain output below]
" + ], + "text/plain": [ + " pattern_name pattern len count duration sum(us) op durations(us) index\n", + "18 torch_npu.npu_swiglu (Slice, Slice, Swish, Mul) 4 1 27.53 [21.2, 0.05, 3.14, 3.14] [0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\n", "\n", - "The computing time of fusable op is 12.56 ms.\n", + "The computing time of fusable op is 27.53 ms.\n", "\n", "\n", "Advice 0:\n", "Replace [Slice, Slice, Swish, Mul] with torch_npu.npu_swiglu. This pattern first happened in: \n", - "torch/nn/modules/module.py(1513): _call_impl\n", - "profiler_main.py(116):forward\n" + "/root/torch/module.py\n", + "/root/test/slice.py(116)\n" ] } ], @@ -66,7 +127,7 @@ "data = interface.get_data('compute', 'npu_fused')\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.width', 900)\n", - "print(data['data'].iloc[:, :-2])\n", + "display(data['data'].iloc[:, :-2])\n", "print('\\n')\n", "print(data['bottleneck'])\n", "print('\\n')\n", @@ -75,21 +136,217 @@ }, { "cell_type": "code", - "outputs": [], + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] Start to analyse the target file: D:\\work\\ascend_pt\\ASCEND_PROFILER_OUTPUT\\kernel_details.csv\n" + ] + }, + { + "data": { + "text/html": [ + "
[rendered text/html table omitted: tag-stripped DataFrame markup, same kernel_details rows as the text/plain output below]
" ], "text/plain": [ " Step Id Model ID Task ID Stream ID Name Type Accelerator Core Start Time(us) Duration(us) Wait Time(us) Block Dim Mix Block Dim Input Shapes Input Data Types Input Formats Output Shapes Output Data Types Output Formats Context ID aicore_time(us) aic_total_cycles aic_mac_ratio aic_mac_int8_ratio aic_cube_fops aic_vector_fops aiv_time(us) aiv_total_cycles aiv_vec_fp32_ratio aiv_vec_fp16_ratio aiv_vec_int32_ratio aiv_vec_misc_ratio aiv_cube_fops aiv_vector_fops size(MB) throughput(GB/s) color\n", "0 1 4294967295 1265 16 Slice1 Slice AI_VECTOR_CORE 1699529623106750 21.20 261.56 9 0 4,1025 INT64 FORMAT_ND 4,1025 INT32 FORMAT_ND NaN 0.0 0.0 0.0 0.0 0.0 0.0 1.77 29508.0 0.0 0.0 0.0062 0.0 0.0 5856.0 0.046921 2.161371 RED\n", "4 1 4294967295 1265 16 Add1 Add AI_CORE 1699529623106754 3.14 261.56 9 0 4,1025 INT64 FORMAT_ND 4,1025 INT32 FORMAT_ND NaN 2.3 28888.0 0.2 0.1 0.1 0.7 0.00 0.0 0.0 0.0 0.0000 0.0 0.0 0.0 0.046921 14.592698 RED" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# identify slow operators\n", "from advisor_backend.compute_advice.npu_slow_advice import NpuSlowAdvice\n", "\n", "npu_slow_advice = NpuSlowAdvice(compute_path)\n", "data = interface.get_data('compute', 'npu_slow')\n", "slow_op_data = data[data[\"color\"] == \"RED\"]\n", "display(slow_op_data)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(compute_path, \"slow_op.xlsx\"))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "call stack: \n", "/root/torch/module.py\n", "/root/test/slice.py(116)\n" ] } ], "source": [ "# call stack of the slow operator\n", "call_stack = npu_slow_advice.get_call_stack(data, index_id=0, ts_col=\"Start Time(us)\")\n", "print(\"call stack: \")\n", "print(call_stack)" ] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -101,7 +358,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/profiler/cluster_analyse/analysis/analysis_facade.py b/profiler/cluster_analyse/analysis/analysis_facade.py index b383a704df27d18e0191b2b251efd9de61beee55..0b870bbaafa6483bf2cfde49971d79106c07aa23 100644 --- a/profiler/cluster_analyse/analysis/analysis_facade.py +++ b/profiler/cluster_analyse/analysis/analysis_facade.py @@ -14,14 +14,13 @@ # limitations under the License. 
from multiprocessing import Process -from common_func.constant import Constant -from analysis.communication_analysis import CommunicationAnalysis +from analysis.communication.comm_analysis_generator import CommunicationAnalysisGenerator +from analysis.communication_matrix.comm_matrix_generator import CommMatrixAnalysisGenerator from analysis.step_trace_time_analysis import StepTraceTimeAnalysis -from analysis.communication_analysis import CommMatrixAnalysis class AnalysisFacade: - analysis_module = {CommunicationAnalysis, StepTraceTimeAnalysis, CommMatrixAnalysis} + analysis_module = {CommunicationAnalysisGenerator, StepTraceTimeAnalysis, CommMatrixAnalysisGenerator} def __init__(self, params: dict): self.params = params diff --git a/profiler/cluster_analyse/analysis/base_analysis_json.py b/profiler/cluster_analyse/analysis/base_analysis_json.py new file mode 100644 index 0000000000000000000000000000000000000000..3df54b0ae2a742f966d8714ea5b850b0999091a7 --- /dev/null +++ b/profiler/cluster_analyse/analysis/base_analysis_json.py @@ -0,0 +1,64 @@ +from abc import abstractmethod +from common_func.constant import Constant +from common_func.file_manager import FileManager + + +class BaseAnalysisJson: + + def __init__(self, param: dict): + self.collection_path = param.get(Constant.COLLECTION_PATH) + self.data_map = param.get(Constant.DATA_MAP) + self.communication_ops = [] + self.collective_group_dict = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) + self.comm_ops_struct = {} + + @staticmethod + def compute_ratio(dividend: float, divisor: float): + if abs(divisor) < Constant.EPS: + return 0 + else: + return round(dividend / divisor, 4) + + @staticmethod + def check_add_op(op_name: str): + """ + Compatible with both versions: decide whether this op's info should be accumulated + """ + stat_list = ["middle", "top", "bottom", "total"] + total = "total" + for stat_name in stat_list: + if stat_name in op_name: + if stat_name != total: + return False + return True + + @abstractmethod + def run(self): + pass + + def dump_data(self): + if not self.comm_ops_struct: + print("[WARNING] There is no final comm ops data generated") + return + output_comm_data = {} + for key in self.comm_ops_struct: + output_comm_data[str(key)] = self.comm_ops_struct.get(key) + FileManager.create_json_file(self.collection_path, output_comm_data, self.SAVED_JSON) + + def split_op_by_group(self): + for single_op in self.communication_ops: + if single_op.get(Constant.COMM_OP_TYPE) == Constant.P2P: + rank_tup = Constant.P2P + else: + rank_tup = tuple(self.collective_group_dict.get(single_op.get(Constant.GROUP_NAME), [])) + rank_id = single_op.get(Constant.RANK_ID, 'N/A') + step_id = single_op.get(Constant.STEP_ID, 'N/A') + op_name = single_op.get(Constant.COMM_OP_NAME, 'N/A') + op_info = single_op.get(Constant.COMM_OP_INFO) + self.comm_ops_struct.setdefault(rank_tup, {}).setdefault(step_id, {}).\ + setdefault(op_name, {}).setdefault(rank_id, op_info) + + def combine_ops_total_info(self): + for rank_tup, group_dict in self.comm_ops_struct.items(): + for step_id, communication_ops in group_dict.items(): + self.compute_total_info(communication_ops) diff --git a/profiler/cluster_analyse/analysis/communication/__init__.py b/profiler/cluster_analyse/analysis/communication/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py b/profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py new file mode 
100644 index 0000000000000000000000000000000000000000..4b737b5da4f35a0d8363de5869d9e269d85859fa --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py @@ -0,0 +1,17 @@ +from analysis.communication.communication_analysis_db import CommunicationAnalysisDB +from analysis.communication.communication_analysis_json import CommunicationAnalysisJson +from common_func.constant import Constant + + +class CommunicationAnalysisGenerator: + + GROUP_MAP = { + Constant.DB: CommunicationAnalysisDB, + Constant.TEXT: CommunicationAnalysisJson + } + + def __init__(self, params: dict): + self.generator = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) + + def run(self): + self.generator.run() diff --git a/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py b/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py new file mode 100644 index 0000000000000000000000000000000000000000..3559a9a28f52c42c0e62e7a53191a5276ab7354d --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py @@ -0,0 +1,165 @@ +import os + +from analysis.base_analysis_json import BaseAnalysisJson +from common_func.db_manager import DBManager +from common_func.constant import Constant +from common_func.table_constant import TableConstant + + +class CommunicationAnalysisDB: + COMMUNICATION_BANDWIDTH_TABLE = "ClusterCommAnalyzerBandwidth" + COMMUNICATION_TIME_TABLE = "ClusterCommAnalyzerTime" + TIME_EXTENSION = "time" + RANK_BAND_TYPE = "{}-{}" + + def __init__(self, params: any): + self.collection_path = params.get(Constant.COLLECTION_PATH) + self.communication_time_info = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.COMMUNICATION_TIME_INFO) + self.communication_bandwidth_info = params.get(Constant.COMM_DATA_DICT, {}).get( + Constant.COMMUNICATION_BANDWIDTH_INFO) + self.collective_group_dict = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) + self.comm_time_struct = {} + self.comm_bandwidth_struct = {} + self.res_comm_time = [] + self.res_comm_bandwidth = [] + + def run(self): + if not self.communication_time_info and not self.communication_bandwidth_info: + return + self.split_and_add_rank_set(self.communication_time_info, self.comm_time_struct) + self.split_and_add_rank_set(self.communication_bandwidth_info, self.comm_bandwidth_struct) + self.compute_total_info() + self.dump_data() + + def dump_data(self): + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.COMMUNICATION_TIME_TABLE, self.COMMUNICATION_BANDWIDTH_TABLE) + res_time, res_bandwidth = [], [] + conn, cursor = DBManager.create_connect_db(result_db) + for data in self.res_comm_time: + res_time.append([data[TableConstant.RANK_SET], data[TableConstant.STEP], data[TableConstant.RANK_ID], + data[TableConstant.HCCL_OP_NAME], data[TableConstant.GROUP_NAME], + data[TableConstant.START_TIMESTAMP], data[TableConstant.ELAPSED_TIME], + data[TableConstant.TRANSIT_TIME], data[TableConstant.WAIT_TIME], + data[TableConstant.SYNCHRONIZATION_TIME], data[TableConstant.IDLE_TIME], + data[TableConstant.SYNCHRONIZATION_TIME_RATIO], data[TableConstant.WAIT_TIME_RATIO]]) + if res_time: + sql = "insert into {} values ({value})".format(self.COMMUNICATION_TIME_TABLE, + value="?," * (len(res_time[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res_time) + for data in 
self.res_comm_bandwidth: + res_bandwidth.append([data[TableConstant.RANK_SET], data[TableConstant.STEP], data[TableConstant.RANK_ID], + data[TableConstant.HCCL_OP_NAME], data[TableConstant.GROUP_NAME], + data[TableConstant.TRANSPORT_TYPE], data[TableConstant.TRANSIT_SIZE], + data[TableConstant.TRANSIT_TIME], data[TableConstant.BANDWIDTH], + data[TableConstant.LARGE_PACKET_RATIO], data[TableConstant.PACKAGE_SIZE], + data[TableConstant.COUNT], data[TableConstant.TOTAL_DURATION]]) + if res_bandwidth: + sql = "insert into {} values ({value})".format(self.COMMUNICATION_BANDWIDTH_TABLE, + value="?," * (len(res_bandwidth[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res_bandwidth) + DBManager.destroy_db_connect(conn, cursor) + + def split_and_add_rank_set(self, data_list, res_dict): + for data in data_list: + if data[TableConstant.TYPE] == Constant.P2P: + rank_tuple = Constant.P2P + else: + rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME])) + res_dict.setdefault(rank_tuple, {}).setdefault(data[TableConstant.STEP], []).append(data) + + def compute_total_info(self): + for rank_tuple, op_dict in self.comm_time_struct.items(): + if rank_tuple != Constant.P2P: + for step, data_list in op_dict.items(): + self.compute_rank_set_total_time_info(data_list, rank_tuple) + else: + rank_set = set() + for step, data_list in op_dict.items(): + rank_set.update(data[TableConstant.RANK_ID] for data in data_list) + for step, data_list in op_dict.items(): + self.compute_rank_set_total_time_info(data_list, rank_set, True) + for rank_tuple, op_dict in self.comm_bandwidth_struct.items(): + for step, data_list in op_dict.items(): + if rank_tuple != Constant.P2P: + self.compute_rank_set_total_bandwidth_info(data_list, rank_tuple) + else: + self.compute_rank_set_total_bandwidth_info(data_list, rank_tuple, True) + + def compute_rank_set_total_bandwidth_info(self, data_list, rank_tuple, is_p2p=False): + if not data_list: + return + data_dict = {} + rank_tuple = "(" + ",".join(str(i) for i in rank_tuple) + ")" if not is_p2p else Constant.P2P + for data in data_list: + data[TableConstant.RANK_SET] = rank_tuple + rank_band_type = self.RANK_BAND_TYPE.format(data[TableConstant.RANK_ID], + data[TableConstant.TRANSPORT_TYPE]) + data_dict.setdefault(rank_band_type, []).append(data) + self.res_comm_bandwidth.append(data) + for rank_band_type, bandwidth_list in data_dict.items(): + package_set = set() + for data in bandwidth_list: + package_set.add(data[TableConstant.PACKAGE_SIZE]) + for package in package_set: + total_comm_bandwidth_info = dict() + for data in bandwidth_list: + self.compute_bandwidth(total_comm_bandwidth_info, data, package) + bandwidth = BaseAnalysisJson.compute_ratio(total_comm_bandwidth_info.get(TableConstant.TRANSIT_SIZE), + total_comm_bandwidth_info.get(TableConstant.TRANSIT_TIME)) + total_comm_bandwidth_info[TableConstant.BANDWIDTH] = bandwidth + total_comm_bandwidth_info[TableConstant.PACKAGE_SIZE] = package + total_comm_bandwidth_info[TableConstant.HCCL_OP_NAME] = Constant.TOTAL_OP_INFO + total_comm_bandwidth_info[TableConstant.GROUP_NAME] = "" + total_comm_bandwidth_info[TableConstant.LARGE_PACKET_RATIO] = 0.0 + self.res_comm_bandwidth.append(total_comm_bandwidth_info) + + def compute_bandwidth(self, res_dict, data_dict, package): + for key in data_dict.keys(): + if key in [TableConstant.TRANSIT_TIME, TableConstant.TRANSIT_SIZE]: + if key not in res_dict.keys(): + res_dict[key] = 0.0 + res_dict[key] += data_dict[key] + elif key in [TableConstant.COUNT, 
TableConstant.TOTAL_DURATION]: + if data_dict[TableConstant.PACKAGE_SIZE] == package: + if key not in res_dict.keys(): + res_dict[key] = 0.0 + res_dict[key] += data_dict[key] + else: + res_dict[key] = data_dict[key] + + def compute_time(self, res_dict, data_dict, dict_key): + if dict_key.endswith(self.TIME_EXTENSION): + if dict_key not in res_dict.keys(): + res_dict[dict_key] = 0.0 + res_dict[dict_key] += data_dict[dict_key] + else: + res_dict[dict_key] = data_dict[dict_key] + + def compute_rank_set_total_time_info(self, data_list: list, rank_tuple: any, is_p2p: bool = False): + if not data_list: + return + rank_set = "(" + ",".join(str(i) for i in rank_tuple) + ")" if not is_p2p else Constant.P2P + for rank_id in rank_tuple: + total_comm_time_info = dict() + for data in data_list: + if data[TableConstant.RANK_ID] == rank_id: + data[TableConstant.RANK_SET] = rank_set + data[TableConstant.SYNCHRONIZATION_TIME_RATIO] = 0.0 + data[TableConstant.WAIT_TIME_RATIO] = 0.0 + for key, value in data.items(): + self.compute_time(total_comm_time_info, data, key) + syn_ratio = BaseAnalysisJson.compute_ratio(total_comm_time_info.get(TableConstant.SYNCHRONIZATION_TIME), + total_comm_time_info.get(TableConstant.SYNCHRONIZATION_TIME) + + total_comm_time_info.get(TableConstant.TRANSIT_TIME)) + wait_time_ratio = BaseAnalysisJson.compute_ratio(total_comm_time_info.get(TableConstant.WAIT_TIME), + total_comm_time_info.get(TableConstant.WAIT_TIME) + + total_comm_time_info.get(TableConstant.TRANSIT_TIME)) + total_comm_time_info[TableConstant.HCCL_OP_NAME] = Constant.TOTAL_OP_INFO + total_comm_time_info[TableConstant.GROUP_NAME] = "" + total_comm_time_info[TableConstant.START_TIMESTAMP] = 0.0 + total_comm_time_info[TableConstant.WAIT_TIME_RATIO] = wait_time_ratio + total_comm_time_info[TableConstant.SYNCHRONIZATION_TIME_RATIO] = syn_ratio + self.res_comm_time.append(total_comm_time_info) + self.res_comm_time.extend(data_list) diff --git a/profiler/cluster_analyse/analysis/communication_analysis.py b/profiler/cluster_analyse/analysis/communication/communication_analysis_json.py similarity index 37% rename from profiler/cluster_analyse/analysis/communication_analysis.py rename to profiler/cluster_analyse/analysis/communication/communication_analysis_json.py index 88ac073a9cc899ecfb32378a8aca662de2bfe879..7fa680fe56ceb93cb6e277843107fb458b1f8807 100644 --- a/profiler/cluster_analyse/analysis/communication_analysis.py +++ b/profiler/cluster_analyse/analysis/communication/communication_analysis_json.py @@ -1,74 +1,10 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
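The per-rank totals above reduce to one formula: a time component divided by itself plus transit time, guarded against a near-zero divisor. A standalone illustration with made-up numbers; `EPS` stands in for `Constant.EPS`:

```python
EPS = 1e-9  # stand-in for Constant.EPS

def compute_ratio(dividend: float, divisor: float) -> float:
    # Same guard and rounding as BaseAnalysisJson.compute_ratio above.
    return 0 if abs(divisor) < EPS else round(dividend / divisor, 4)

transit, wait, sync = 30.0, 10.0, 6.0  # made-up times
print(compute_ratio(wait, wait + transit))  # 0.25   -> wait_time_ratio
print(compute_ratio(sync, sync + transit))  # 0.1667 -> synchronization_time_ratio
```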
- from collections import defaultdict -from abc import abstractmethod +from analysis.base_analysis_json import BaseAnalysisJson from common_func.constant import Constant -from common_func.file_manager import FileManager - -class BaseCommAnalysis: - def __init__(self, param: dict): - self.collection_path = param.get(Constant.COLLECTION_PATH) - self.data_map = param.get(Constant.DATA_MAP) - self.communication_ops = [] - self.collective_group_dict = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) - self.comm_ops_struct = {} - - @staticmethod - def compute_ratio(dividend: float, divisor: float): - if abs(divisor) < Constant.EPS: - return 0 - else: - return round(dividend / divisor, 4) - - @abstractmethod - def run(self): - pass - - def dump_data(self): - if not self.comm_ops_struct: - print("[WARNING] There is no final comm ops data generated") - return - output_comm_data = {} - for key in self.comm_ops_struct: - output_comm_data[str(key)] = self.comm_ops_struct.get(key) - FileManager.create_json_file(self.collection_path, output_comm_data, self.SAVED_JSON) - - def split_op_by_group(self): - for single_op in self.communication_ops: - if single_op.get(Constant.COMM_OP_TYPE) == Constant.P2P: - rank_tup = Constant.P2P - else: - rank_tup = tuple(self.collective_group_dict.get(single_op.get(Constant.GROUP_NAME), [])) - rank_id = single_op.get(Constant.RANK_ID, 'N/A') - step_id = single_op.get(Constant.STEP_ID, 'N/A') - op_name = single_op.get(Constant.COMM_OP_NAME, 'N/A') - op_info = single_op.get(Constant.COMM_OP_INFO) - self.comm_ops_struct.setdefault(rank_tup, {}).setdefault(step_id, {}).\ - setdefault(op_name, {}).setdefault(rank_id, op_info) - - def combine_ops_total_info(self): - for rank_tup, group_dict in self.comm_ops_struct.items(): - for step_id, communication_ops in group_dict.items(): - self.compute_total_info(communication_ops) - - -class CommunicationAnalysis(BaseCommAnalysis): +class CommunicationAnalysisJson(BaseAnalysisJson): SAVED_JSON = "cluster_communication.json" def __init__(self, param: dict): @@ -144,100 +80,3 @@ class CommunicationAnalysis(BaseCommAnalysis): bandwidth_dict[Constant.BANDWIDTH_GB_S] = \ self.compute_ratio(bandwidth_dict.get(Constant.TRANSIT_SIZE_MB, 0), bandwidth_dict.get(Constant.TRANSIT_TIME_MS, 0)) - - -class CommMatrixAnalysis(BaseCommAnalysis): - SAVED_JSON = "cluster_communication_matrix.json" - STAT_LIST = ['middle', 'top', 'bottom', 'total'] - TOTAL = 'total' - - def __init__(self, param: dict): - super().__init__(param) - self.communication_ops = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS) - - @staticmethod - def combine_link(link_info_dict: dict, single_link_dict: dict): - link_info_dict[Constant.TRANSPORT_TYPE] = single_link_dict.get(Constant.TRANSPORT_TYPE) - link_info_dict[Constant.OP_NAME] = single_link_dict.get(Constant.OP_NAME, '') - link_info_dict[Constant.TRANSIT_TIME_MS] += single_link_dict.get(Constant.TRANSIT_TIME_MS, 0) - link_info_dict[Constant.TRANSIT_SIZE_MB] += single_link_dict.get(Constant.TRANSIT_SIZE_MB, 0) - - def run(self): - if not self.communication_ops: - return - self.split_op_by_group() - self.combine_ops_total_info() - self.dump_data() - - def compute_total_info(self, step_dict: dict): - self.merge_same_links(step_dict) - self.combine_link_info(step_dict) - - def merge_same_links(self, step_dict: dict): - def process_link_key(): - for link_key in rank_dict: - if '-' not in link_key: - print(f"[WARNING] {op_name} has an invalid link key {link_key}!") - break - src_rank = 
link_key.split('-')[0] - dst_rank = link_key.split('-')[1] - if src_rank == dst_rank: - if src_rank not in project_local_global_rank_map: - project_local_global_rank_map[src_rank] = rank_id - elif project_local_global_rank_map.get(src_rank) != rank_id: - print(f"[WARNING] In the same communication group, local ranks projecting to global ranks repeat!") - self.combine_link(link_info[link_key], rank_dict[link_key]) - - def convert_local_to_global_rank(): - tmp_link = {} - for link_key, link_dict in link_info.items(): - src_rank = link_key.split('-')[0] - dst_rank = link_key.split('-')[1] - src_rank = project_local_global_rank_map[src_rank] \ - if src_rank in project_local_global_rank_map else src_rank - dst_rank = project_local_global_rank_map[dst_rank] \ - if dst_rank in project_local_global_rank_map else dst_rank - link_dict[Constant.BANDWIDTH_GB_S] = \ - self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), - link_dict.get(Constant.TRANSIT_TIME_MS, 0)) - tmp_link[f"{src_rank}-{dst_rank}"] = link_dict - return tmp_link - - project_local_global_rank_map = dict() - for op_name, op_dict in step_dict.items(): - link_info = defaultdict(lambda: { - Constant.TRANSPORT_TYPE: '', - Constant.TRANSIT_TIME_MS: 0, - Constant.TRANSIT_SIZE_MB: 0, - Constant.OP_NAME: '' - }) - for rank_id, rank_dict in op_dict.items(): - process_link_key() - step_dict[op_name] = convert_local_to_global_rank() - - def combine_link_info(self, step_dict: dict): - total_op_info = defaultdict(lambda: { - Constant.TRANSPORT_TYPE: '', - Constant.TRANSIT_TIME_MS: 0, - Constant.TRANSIT_SIZE_MB: 0, - Constant.OP_NAME: '' - }) - for op_name, op_dict in step_dict.items(): - if self.check_add_op(op_name): - for link_key, link_dict in op_dict.items(): - self.combine_link(total_op_info[link_key], link_dict) - for link_key, link_dict in total_op_info.items(): - link_dict[Constant.BANDWIDTH_GB_S] = \ - self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), - link_dict.get(Constant.TRANSIT_TIME_MS, 0)) - step_dict[Constant.TOTAL_OP_INFO] = total_op_info - - def check_add_op(self: any, op_name: str): - """ - 兼容2个版本,判断是否需要将此算子信息相加 - """ - for stat_name in self.STAT_LIST: - if stat_name in op_name: - if stat_name != self.TOTAL: - return False - return True diff --git a/profiler/cluster_analyse/analysis/communication_matrix/__init__.py b/profiler/cluster_analyse/analysis/communication_matrix/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py new file mode 100644 index 0000000000000000000000000000000000000000..df58fcecff82be946fb95378b283db904af0b1d7 --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py @@ -0,0 +1,133 @@ +import os + +from analysis.base_analysis_json import BaseAnalysisJson +from common_func.db_manager import DBManager +from common_func.constant import Constant +from common_func.table_constant import TableConstant + + +class CommMatrixAnalysisDB: + COMMUNICATION_MATRIX_TABLE = "ClusterCommAnalyzerMatrix" + + def __init__(self, params: any): + self.collection_path = params.get(Constant.COLLECTION_PATH) + self.matrix_info = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS) + self.collective_group_dict = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) + self.comm_matrix_struct = {} + 
self.res_comm_matrix = [] + + def run(self): + if not self.matrix_info: + return + self.set_rank_tuple() + self.combine_total_matrix_info() + self.dump_data() + + def dump_data(self): + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.COMMUNICATION_MATRIX_TABLE) + conn, cursor = DBManager.create_connect_db(result_db) + res = [] + for data in self.res_comm_matrix: + op_name = data.get(TableConstant.OPNAME) if data.get(TableConstant.OPNAME) is not None else "" + res.append([data[TableConstant.RANK_SET], data[TableConstant.STEP], data[TableConstant.HCCL_OP_NAME], + data[TableConstant.GROUP_NAME], data[TableConstant.SRC_RANK], data[TableConstant.DST_RANK], + data[TableConstant.TRANSIT_SIZE], data[TableConstant.TRANSIT_TIME], + data[TableConstant.BANDWIDTH], data[TableConstant.TRANSPORT_TYPE], op_name]) + if res: + sql = "insert into {} values ({value})".format(self.COMMUNICATION_MATRIX_TABLE, + value="?," * (len(res[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res) + DBManager.destroy_db_connect(conn, cursor) + + def combine_total_matrix_info(self): + for rank_tuple, group_dict in self.comm_matrix_struct.items(): + if rank_tuple != Constant.P2P: + rank_tuple = "(" + ",".join(str(i) for i in rank_tuple) + ")" + for step, step_dict in group_dict.items(): + self.merge_same_info(step_dict, rank_tuple) + self.combine_total_info(step_dict) + + def combine_total_info(self, step_dict: dict): + link_key_set = set() + for op_name, matrix_dict in step_dict.items(): + self.res_comm_matrix.extend(matrix_dict.values()) + if BaseAnalysisJson.check_add_op(op_name): + for key in matrix_dict.keys(): + link_key_set.add(key) + for link_key in link_key_set: + total_matrix_info = dict() + total_matrix_info[TableConstant.TRANSIT_SIZE] = 0.0 + total_matrix_info[TableConstant.TRANSIT_TIME] = 0.0 + for op_name, matrix_dict in step_dict.items(): + if link_key in matrix_dict.keys() and BaseAnalysisJson.check_add_op(op_name): + total_matrix_info[TableConstant.RANK_SET] = matrix_dict[link_key][TableConstant.RANK_SET] + self.combine_link_info(total_matrix_info, matrix_dict[link_key]) + bandwidth = BaseAnalysisJson.compute_ratio(total_matrix_info[TableConstant.TRANSIT_SIZE], + total_matrix_info[TableConstant.TRANSIT_TIME]) + total_matrix_info[TableConstant.HCCL_OP_NAME] = Constant.TOTAL_OP_INFO + total_matrix_info[TableConstant.GROUP_NAME] = "" + total_matrix_info[TableConstant.BANDWIDTH] = bandwidth + self.res_comm_matrix.append(total_matrix_info) + + def combine_link_info(self, link_info, data: dict): + for col in data.keys(): + if col in [TableConstant.TRANSIT_TIME, TableConstant.TRANSIT_SIZE]: + link_info[col] += data[col] + else: + link_info[col] = data[col] + + def merge_same_info(self, step_dict: dict, rank_tuple): + def process_matrix(): + for data in op_list: + if data[TableConstant.SRC_RANK] == data[TableConstant.DST_RANK]: + if data[TableConstant.SRC_RANK] not in local_global_rank_map: + local_global_rank_map[data[TableConstant.SRC_RANK]] = data[TableConstant.RANK_ID] + elif local_global_rank_map[data[TableConstant.SRC_RANK]] != data[TableConstant.RANK_ID]: + print(f"[WARNING] In the same communication group, local ranks projecting to global ranks " + f"repeat!") + if (link_key.split('-')[0] == data[TableConstant.SRC_RANK] and + link_key.split('-')[1] == data[TableConstant.DST_RANK]): + self.combine_link_info(matrix_info, data) + 
new_matrix_list[link_key] = matrix_info + + def convert_local_to_global_rank(): + res_dict = dict() + for key, new_matrix in new_matrix_list.items(): + src_rank = new_matrix[TableConstant.SRC_RANK] + dst_rank = new_matrix[TableConstant.DST_RANK] + src_rank = local_global_rank_map[src_rank] if src_rank in local_global_rank_map else src_rank + dst_rank = local_global_rank_map[dst_rank] if dst_rank in local_global_rank_map else dst_rank + bandwidth = BaseAnalysisJson.compute_ratio(new_matrix[TableConstant.TRANSIT_SIZE], + new_matrix[TableConstant.TRANSIT_TIME]) + key = f"{src_rank}-{dst_rank}" + new_matrix[TableConstant.SRC_RANK] = src_rank + new_matrix[TableConstant.DST_RANK] = dst_rank + new_matrix[TableConstant.BANDWIDTH] = bandwidth + res_dict[key] = new_matrix + return res_dict + + local_global_rank_map = dict() + for op_name, op_list in step_dict.items(): + new_matrix_list = {} + link_key_set = set() + for op_data in op_list: + link_key_set.add(op_data[TableConstant.SRC_RANK] + "-" + op_data[TableConstant.DST_RANK]) + for link_key in link_key_set: + matrix_info = dict() + matrix_info[TableConstant.RANK_SET] = rank_tuple + matrix_info[TableConstant.TRANSIT_SIZE] = 0.0 + matrix_info[TableConstant.TRANSIT_TIME] = 0.0 + process_matrix() + step_dict[op_name] = convert_local_to_global_rank() + + def set_rank_tuple(self): + for data in self.matrix_info: + op_name = data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME] + if data[TableConstant.STEP] == Constant.P2P: + rank_tuple = Constant.P2P + else: + rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME])) + self.comm_matrix_struct.setdefault(rank_tuple, {}).setdefault(data[TableConstant.STEP], {}). \ + setdefault(op_name, []).append(data) diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py new file mode 100644 index 0000000000000000000000000000000000000000..7baca7e9283e7471b91be99d1b4b8ac828a80fe2 --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py @@ -0,0 +1,90 @@ +from collections import defaultdict + +from analysis.base_analysis_json import BaseAnalysisJson +from common_func.constant import Constant + + +class CommMatrixAnalysisJson(BaseAnalysisJson): + SAVED_JSON = "cluster_communication_matrix.json" + + def __init__(self, param: dict): + super().__init__(param) + self.communication_ops = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS) + + @staticmethod + def combine_link(link_info_dict: dict, single_link_dict: dict): + link_info_dict[Constant.TRANSPORT_TYPE] = single_link_dict.get(Constant.TRANSPORT_TYPE) + link_info_dict[Constant.OP_NAME] = single_link_dict.get(Constant.OP_NAME, '') + link_info_dict[Constant.TRANSIT_TIME_MS] += single_link_dict.get(Constant.TRANSIT_TIME_MS, 0) + link_info_dict[Constant.TRANSIT_SIZE_MB] += single_link_dict.get(Constant.TRANSIT_SIZE_MB, 0) + + def run(self): + if not self.communication_ops: + return + self.split_op_by_group() + self.combine_ops_total_info() + self.dump_data() + + def compute_total_info(self, step_dict: dict): + self.merge_same_links(step_dict) + self.combine_link_info(step_dict) + + def merge_same_links(self, step_dict: dict): + def process_link_key(): + for link_key in rank_dict: + if '-' not in link_key: + print(f"[WARNING] {op_name} has an invalid link key {link_key}!") + break + src_rank = link_key.split('-')[0] + dst_rank = 
link_key.split('-')[1] + if src_rank == dst_rank: + if src_rank not in project_local_global_rank_map: + project_local_global_rank_map[src_rank] = rank_id + elif project_local_global_rank_map.get(src_rank) != rank_id: + print(f"[WARNING] In the same communication group, local ranks projecting to global ranks " + f"repeat!") + self.combine_link(link_info[link_key], rank_dict[link_key]) + + def convert_local_to_global_rank(): + tmp_link = {} + for link_key, link_dict in link_info.items(): + src_rank = link_key.split('-')[0] + dst_rank = link_key.split('-')[1] + src_rank = project_local_global_rank_map[src_rank] \ + if src_rank in project_local_global_rank_map else src_rank + dst_rank = project_local_global_rank_map[dst_rank] \ + if dst_rank in project_local_global_rank_map else dst_rank + link_dict[Constant.BANDWIDTH_GB_S] = \ + self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), + link_dict.get(Constant.TRANSIT_TIME_MS, 0)) + tmp_link[f"{src_rank}-{dst_rank}"] = link_dict + return tmp_link + + project_local_global_rank_map = dict() + for op_name, op_dict in step_dict.items(): + link_info = defaultdict(lambda: { + Constant.TRANSPORT_TYPE: '', + Constant.TRANSIT_TIME_MS: 0, + Constant.TRANSIT_SIZE_MB: 0, + Constant.OP_NAME: '' + }) + for rank_id, rank_dict in op_dict.items(): + process_link_key() + step_dict[op_name] = convert_local_to_global_rank() + + def combine_link_info(self, step_dict: dict): + total_op_info = defaultdict(lambda: { + Constant.TRANSPORT_TYPE: '', + Constant.TRANSIT_TIME_MS: 0, + Constant.TRANSIT_SIZE_MB: 0, + Constant.OP_NAME: '' + }) + for op_name, op_dict in step_dict.items(): + if self.check_add_op(op_name): + for link_key, link_dict in op_dict.items(): + self.combine_link(total_op_info[link_key], link_dict) + for link_key, link_dict in total_op_info.items(): + link_dict[Constant.BANDWIDTH_GB_S] = \ + self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), + link_dict.get(Constant.TRANSIT_TIME_MS, 0)) + step_dict[Constant.TOTAL_OP_INFO] = total_op_info diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..03a1826955dd33e8bdd43e91a359f0915f2121c4 --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py @@ -0,0 +1,17 @@ +from analysis.communication_matrix.comm_matrix_analysis_db import CommMatrixAnalysisDB +from analysis.communication_matrix.comm_matrix_analysis_json import CommMatrixAnalysisJson +from common_func.constant import Constant + + +class CommMatrixAnalysisGenerator: + + GROUP_MAP = { + Constant.DB: CommMatrixAnalysisDB, + Constant.TEXT: CommMatrixAnalysisJson + } + + def __init__(self, params: dict): + self.generator = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) + + def run(self): + self.generator.run() diff --git a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py index d24a7f1fe635e62c0857e276578463539a61ee76..20a71df3c57437e5f278ebe450c8811b26bbe3ef 100644 --- a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py +++ b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py @@ -14,8 +14,8 @@ # limitations under the License. 
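
# Editor's illustrative sketch (not part of the patch): both matrix analyses above derive the
# bandwidth column as transit_size / transit_time. BaseAnalysisJson.compute_ratio is assumed to
# guard the division with the EPS constant defined in Constant; the helper below mirrors that
# assumed behavior with hypothetical names.
def compute_ratio_sketch(transit_size: float, transit_time: float, eps: float = 1e-15) -> float:
    # Treat an effectively-zero denominator as "no measurable transfer" rather than raising.
    if abs(transit_time) < eps:
        return 0.0
    return round(transit_size / transit_time, 4)

# Example: 100 (size units) moved in 8 (time units) -> 12.5; zero time -> 0.0.
assert compute_ratio_sketch(100.0, 8.0) == 12.5
assert compute_ratio_sketch(100.0, 0.0) == 0.0
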
 import os
-from collections import defaultdict
+from common_func.db_manager import DBManager
 from common_func.constant import Constant
 from common_func.file_manager import FileManager
 from prof_bean.step_trace_time_bean import StepTraceTimeBean
@@ -23,6 +23,7 @@ from prof_bean.step_trace_time_bean import StepTraceTimeBean
 
 class StepTraceTimeAnalysis:
     CLUSTER_TRACE_TIME_CSV = "cluster_step_trace_time.csv"
+    CLUSTER_TRACE_TIME_TABLE = "ClusterStepTraceTime"
 
     def __init__(self, param: dict):
         self.collection_path = param.get(Constant.COLLECTION_PATH)
@@ -30,6 +31,7 @@ class StepTraceTimeAnalysis:
         self.communication_group = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COMMUNICATION_GROUP)
         self.step_time_dict = {}
         self.step_data_list = []
+        self.data_type = param.get(Constant.DATA_TYPE)
 
     @staticmethod
     def get_max_data_row(data_group_list: list):
@@ -51,21 +53,44 @@
     def dump_data(self):
         if not self.step_data_list:
             print("[WARNING] Can't get step time info!")
+            return
-        headers = self.get_headers()
-        FileManager.create_csv_file(self.collection_path, self.step_data_list, self.CLUSTER_TRACE_TIME_CSV, headers)
+        if self.data_type == Constant.TEXT:
+            headers = self.get_headers()
+            FileManager.create_csv_file(self.collection_path, self.step_data_list, self.CLUSTER_TRACE_TIME_CSV, headers)
+        else:
+            output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
+            result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER)
+            DBManager.create_tables(result_db, self.CLUSTER_TRACE_TIME_TABLE)
+            conn, cursor = DBManager.create_connect_db(result_db)
+            sql = "insert into {} values ({value})".format(self.CLUSTER_TRACE_TIME_TABLE,
+                                                           value="?," * (len(self.step_data_list[0]) - 1) + "?")
+            DBManager.executemany_sql(conn, sql, self.step_data_list)
+            DBManager.destroy_db_connect(conn, cursor)
 
     def load_step_trace_time_data(self):
         for rank_id, profiling_dir_path in self.data_map.items():
-            step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV)
-            if step_time_file:
-                self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean)
+            if self.data_type == Constant.TEXT:
+                step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV)
+                if os.path.exists(step_time_file):
+                    self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean)
+            else:
+                step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT,
+                                              Constant.DB_COMMUNICATION_ANALYZER)
+                if os.path.exists(step_time_file) and DBManager.check_tables_in_db(step_time_file, Constant.TABLE_STEP_TRACE):
+                    conn, cursor = DBManager.create_connect_db(step_time_file)
+                    sql = "select * from {0}".format(Constant.TABLE_STEP_TRACE)
+                    data = DBManager.fetch_all_data(cursor, sql, is_dict=False)
+                    self.step_time_dict[rank_id] = data
+                    DBManager.destroy_db_connect(conn, cursor)
             if not self.step_time_dict.get(rank_id):
                 print(f"[WARNING] Rank {rank_id} does not have a valid step_trace_time.json.")
 
     def analyze_step_time(self):
         for rank_id, data_bean_list in self.step_time_dict.items():
             for data_bean in data_bean_list:
-                self.step_data_list.append([data_bean.step, Constant.RANK, rank_id] + data_bean.row)
+                if self.data_type == Constant.TEXT:
+                    self.step_data_list.append([data_bean.step, Constant.RANK, rank_id] + data_bean.row)
+                else:
+                    self.step_data_list.append([data_bean[0], Constant.RANK, rank_id] + list(data_bean[1:]))
         stage_list = self.communication_group.get(Constant.P2P)
         if not stage_list:
             return
@@ 
-80,7 +105,11 @@ class StepTraceTimeAnalysis: step_group_dict.setdefault(key, []).append(data_list[3:]) for key, data_group_list in step_group_dict.items(): - self.step_data_list.append([key[0], Constant.STAGE, key[1]] + self.get_max_data_row(data_group_list)) + if self.data_type == Constant.TEXT: + self.step_data_list.append([key[0], Constant.STAGE, key[1]] + self.get_max_data_row(data_group_list)) + else: + index = "(" + ",".join(str(i) for i in key[1]) + ")" + self.step_data_list.append([key[0], Constant.STAGE, index] + self.get_max_data_row(data_group_list)) def get_headers(self): if self.step_time_dict: diff --git a/profiler/cluster_analyse/cluster_analysis.py b/profiler/cluster_analyse/cluster_analysis.py index e07cac170300650bbf735f7e302b33377dd30a5e..68eae526fb05479bc8b93f3bfc51037df221dc25 100644 --- a/profiler/cluster_analyse/cluster_analysis.py +++ b/profiler/cluster_analyse/cluster_analysis.py @@ -14,6 +14,7 @@ # limitations under the License. import argparse +import glob import os from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor @@ -28,6 +29,8 @@ from analysis.analysis_facade import AnalysisFacade class Interface: ASCEND_PT = "ascend_pt" ASCEND_MS = "ascend_ms" + DB_RESULT_INFO = "*.db" + ALL_RESULT_INFO = "*.*" def __init__(self, params: dict): self.collection_path = PathManager.get_realpath(params.get(Constant.COLLECTION_PATH)) @@ -38,6 +41,25 @@ class Interface: self.communication_ops = [] self.matrix_ops = [] + def check_db_or_other_files(self, data_map: dict) -> tuple: + type_db_count = 0 + type_text_count = 0 + for _, folder_path in data_map.items(): + folder_path = os.path.join(folder_path, Constant.SINGLE_OUTPUT) + db_files = glob.glob(os.path.join(folder_path, self.DB_RESULT_INFO)) + all_files = glob.glob(os.path.join(folder_path, self.ALL_RESULT_INFO)) + if all_files and db_files and len(all_files) != len(db_files): + return False, None + if db_files: + type_db_count += 1 + else: + type_text_count += 1 + if type_db_count == len(data_map): + return True, Constant.DB + if type_text_count == len(data_map): + return True, Constant.TEXT + return False, None + def allocate_prof_data(self): ascend_pt_dirs = [] ascend_ms_dirs = [] @@ -51,7 +73,7 @@ class Interface: ms_data_map = MindsporeDataPreprocessor(ascend_ms_dirs).get_data_map() if pt_data_map and ms_data_map: print("[ERROR] Can not analyze pytorch and mindspore meantime.") - return[] + return [] return pt_data_map if pt_data_map else ms_data_map def run(self): @@ -62,10 +84,15 @@ class Interface: if not data_map: print("[WARNING] Can not get rank info or profiling data.") return + is_valid, data_type = self.check_db_or_other_files(data_map) + if not is_valid: + print("[WARNING] The current folder contains both DB and other files. 
Please check.") + return params = { Constant.COLLECTION_PATH: self.collection_path, Constant.DATA_MAP: data_map, - Constant.ANALYSIS_MODE: self.analysis_mode + Constant.ANALYSIS_MODE: self.analysis_mode, + Constant.DATA_TYPE: data_type } comm_data_dict = CommunicationGroupGenerator(params).generate() params[Constant.COMM_DATA_DICT] = comm_data_dict diff --git a/profiler/cluster_analyse/common_func/constant.py b/profiler/cluster_analyse/common_func/constant.py index e426a9d22567ae9e70411f709c1c09ce02cbdeca..71caee40db8b58ff263ad5d7311e797684883f3d 100644 --- a/profiler/cluster_analyse/common_func/constant.py +++ b/profiler/cluster_analyse/common_func/constant.py @@ -30,6 +30,7 @@ class Constant(object): MAX_JSON_SIZE = 1024 * 1024 * 1024 * 10 MAX_CSV_SIZE = 1024 * 1024 * 1024 * 5 MAX_PATH_LENGTH = 4096 + MAX_READ_DB_FILE_BYTES = 1024 * 1024 * 1024 * 8 # communication P2P = "p2p" @@ -66,11 +67,12 @@ class Constant(object): COMMUNICATION_GROUP = "communication_group" TRANSPORT_TYPE = "Transport Type" COMM_DATA_DICT = "comm_data_dict" + DATA_TYPE = "data_type" ANALYSIS_MODE = "analysis_mode" # step time - RANK = 'rank' - STAGE = 'stage' + RANK = "rank" + STAGE = "stage" # epsilon EPS = 1e-15 @@ -78,3 +80,17 @@ class Constant(object): # file suffix JSON_SUFFIX = ".json" CSV_SUFFIX = ".csv" + + # result files type + TEXT = "text" + DB = "db" + + # db name + DB_COMMUNICATION_ANALYZER = "analysis.db" + DB_CLUSTER_COMMUNICATION_ANALYZER = "cluster_analysis.db" + + # db tables + TABLE_COMM_ANALYZER_BANDWIDTH = "CommAnalyzerBandwidth" + TABLE_COMM_ANALYZER_TIME = "CommAnalyzerTime" + TABLE_COMM_ANALYZER_MATRIX = "CommAnalyzerMatrix" + TABLE_STEP_TRACE = "StepTraceTime" diff --git a/profiler/cluster_analyse/common_func/db_manager.py b/profiler/cluster_analyse/common_func/db_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..f19bc15dc83821d6d65cf3f39713565da8293989 --- /dev/null +++ b/profiler/cluster_analyse/common_func/db_manager.py @@ -0,0 +1,206 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sqlite3 + +from common_func.constant import Constant +from common_func.empty_class import EmptyClass +from common_func.file_manager import check_db_path_valid +from common_func.tables_config import TablesConfig + + +class DBManager: + """ + class to manage DB operation + """ + FETCH_SIZE = 10000 + INSERT_SIZE = 10000 + MAX_ROW_COUNT = 100000000 + + @staticmethod + def create_connect_db(db_path: str) -> tuple: + """ + create and connect database + """ + if check_db_path_valid(db_path): + try: + conn = sqlite3.connect(db_path) + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return EmptyClass("empty conn"), EmptyClass("empty curs") + try: + if isinstance(conn, sqlite3.Connection): + curs = conn.cursor() + os.chmod(db_path, Constant.FILE_AUTHORITY) + return conn, curs + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return EmptyClass("empty conn"), EmptyClass("empty curs") + return EmptyClass("empty conn"), EmptyClass("empty curs") + + @staticmethod + def destroy_db_connect(conn: any, curs: any) -> None: + """ + destroy db connection + """ + try: + if isinstance(curs, sqlite3.Cursor): + curs.close() + except sqlite3.Error as err: + print(f"[ERROR] {err}") + try: + if isinstance(conn, sqlite3.Connection): + conn.close() + except sqlite3.Error as err: + print(f"[ERROR] {err}") + + @staticmethod + def judge_table_exists(curs: any, table_name: str) -> any: + """ + judge table exists + """ + if not isinstance(curs, sqlite3.Cursor): + return False + try: + curs.execute("select count(*) from sqlite_master where type='table' and name=?", (table_name,)) + return curs.fetchone()[0] + except sqlite3.Error as err: + print("[ERROR] {}".format(err)) + return False + + @staticmethod + def sql_generate_table(table_map: str): + header_with_type_begin = "(" + header_with_type_end = ")" + header_with_type_list = [] + if table_map in TablesConfig.DATA: + items = TablesConfig.DATA[table_map] + for item in items: + if item[0] == "index": + header_with_type_list.append('"' + item[0] + '" ' + item[1].split(",")[0]) + else: + header_with_type_list.append(item[0] + ' ' + item[1].split(",")[0]) + header_with_type_begin += ",".join(header_with_type_list) + header_with_type_begin += header_with_type_end + return header_with_type_begin + return "" + + @classmethod + def check_tables_in_db(cls, db_path: any, *tables: any) -> bool: + if check_db_path_valid(db_path, True): + conn, curs = cls.create_connect_db(db_path) + if not (conn and curs): + return False + res = True + for table in tables: + if not cls.judge_table_exists(curs, table): + res = False + break + cls.destroy_db_connect(conn, curs) + return res + return False + + @classmethod + def create_tables(cls, db_path: any, *tables: any) -> bool: + conn, curs = cls.create_connect_db(db_path) + for table_name in tables: + if not cls.judge_table_exists(curs, table_name): + table_map = "{0}Map".format(table_name) + header_with_type = cls.sql_generate_table(table_map) + sql = "CREATE TABLE IF NOT EXISTS " + table_name + header_with_type + cls.execute_sql(conn, sql) + + @staticmethod + def execute_sql(conn: any, sql: str, params: any = None) -> bool: + """ + execute sql + """ + try: + if isinstance(conn, sqlite3.Connection): + if params: + conn.cursor().execute(sql, params) + else: + conn.cursor().execute(sql) + conn.commit() + return True + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return False + print("[ERROR] conn is invalid param") + return False + + @staticmethod + def executemany_sql(conn: any, sql: str, params: 
any) -> bool:
+        """
+        execute sql with a batch of params at once
+        """
+        try:
+            if isinstance(conn, sqlite3.Connection):
+                conn.cursor().executemany(sql, params)
+                conn.commit()
+                return True
+        except sqlite3.Error as err:
+            print(f"[ERROR] {err}")
+            return False
+        print("[ERROR] conn is an invalid param")
+        return False
+
+    @classmethod
+    def fetch_all_data(cls: any, curs: any, sql: str, param: tuple = None, is_dict: bool = True) -> list:
+        """
+        fetch all data from db, FETCH_SIZE rows at a time
+        """
+        if not isinstance(curs, sqlite3.Cursor):
+            return []
+        data = []
+        try:
+            if param:
+                res = curs.execute(sql, param)
+            else:
+                res = curs.execute(sql)
+        except sqlite3.Error as err:
+            print(f"[ERROR] {err}")
+            curs.row_factory = None
+            return []
+        try:
+            description = res.description
+            while True:
+                res = curs.fetchmany(cls.FETCH_SIZE)
+                if is_dict:
+                    data += CustomizedDictFactory.generate_dict_from_db(res, description)
+                else:
+                    data += res
+                if len(data) > cls.MAX_ROW_COUNT:
+                    print("[WARNING] The record count in the table exceeds the limit!")
+                if len(res) < cls.FETCH_SIZE:
+                    break
+            return data
+        except sqlite3.Error as err:
+            print(f"[ERROR] {err}")
+            return []
+        finally:
+            curs.row_factory = None
+
+
+class CustomizedDictFactory:
+    @staticmethod
+    def generate_dict_from_db(data_result: any, description: any) -> any:
+        description_set = [i[0] for i in description]
+        res = []
+        for data in data_result:
+            data_dict = dict(zip(description_set, data))
+            res.append(data_dict)
+        return res
diff --git a/profiler/cluster_analyse/common_func/empty_class.py b/profiler/cluster_analyse/common_func/empty_class.py
new file mode 100644
index 0000000000000000000000000000000000000000..df100d156fa064cca4514260db0b2e843e217d09
--- /dev/null
+++ b/profiler/cluster_analyse/common_func/empty_class.py
@@ -0,0 +1,20 @@
+class EmptyClass:
+
+    def __init__(self: any, info: str = "") -> None:
+        self._info = info
+
+    @classmethod
+    def __bool__(cls: any) -> bool:
+        return False
+
+    @classmethod
+    def __str__(cls: any) -> str:
+        return ""
+
+    @property
+    def info(self: any) -> str:
+        return self._info
+
+    @staticmethod
+    def is_empty() -> bool:
+        return True
diff --git a/profiler/cluster_analyse/common_func/file_manager.py b/profiler/cluster_analyse/common_func/file_manager.py
index 3853c806f92de1d8da14e32105fcc869789a9a40..28ecbeaaf16ec5461660f414df03728b36b521d7 100644
--- a/profiler/cluster_analyse/common_func/file_manager.py
+++ b/profiler/cluster_analyse/common_func/file_manager.py
@@ -115,3 +115,13 @@ class FileManager:
         file_size = os.path.getsize(file_path)
         if file_size > limit_size:
             raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.")
+
+
+def check_db_path_valid(path: str, is_create: bool = False, max_size: int = Constant.MAX_READ_DB_FILE_BYTES) -> bool:
+    if os.path.islink(path):
+        print(f'[ERROR] The db file path: {path} is a link. Please check the path')
+        return False
+    if not is_create and os.path.exists(path) and os.path.getsize(path) > max_size:
+        print(f'[ERROR] The db file: {path} is too large to read. 
Please check the file') + return False + return True diff --git a/profiler/cluster_analyse/common_func/table_constant.py b/profiler/cluster_analyse/common_func/table_constant.py new file mode 100644 index 0000000000000000000000000000000000000000..de6d47e97e5683493905de5353a9978195e87b70 --- /dev/null +++ b/profiler/cluster_analyse/common_func/table_constant.py @@ -0,0 +1,27 @@ +class TableConstant: + + RANK_SET = "rank_set" + STEP = "step" + RANK_ID = "rank_id" + TYPE = "type" + HCCL_OP_NAME = "hccl_op_name" + GROUP_NAME = "group_name" + START_TIMESTAMP = "start_timestamp" + ELAPSED_TIME = "elapse_time" + TRANSIT_TIME = "transit_time" + WAIT_TIME = "wait_time" + SYNCHRONIZATION_TIME = "synchronization_time" + IDLE_TIME = "idle_time" + SYNCHRONIZATION_TIME_RATIO = "synchronization_time_ratio" + WAIT_TIME_RATIO = "wait_time_ratio" + BAND_TYPE = "band_type" + TRANSIT_SIZE = "transit_size" + BANDWIDTH = "bandwidth" + LARGE_PACKET_RATIO = "large_packet_ratio" + PACKAGE_SIZE = "package_size" + COUNT = "count" + TOTAL_DURATION = "total_duration" + SRC_RANK = "src_rank" + DST_RANK = "dst_rank" + TRANSPORT_TYPE = "transport_type" + OPNAME = "op_name" diff --git a/profiler/cluster_analyse/common_func/tables_config.py b/profiler/cluster_analyse/common_func/tables_config.py new file mode 100644 index 0000000000000000000000000000000000000000..75ee41c058f7435ecc267cdc000ecd5a86e9339a --- /dev/null +++ b/profiler/cluster_analyse/common_func/tables_config.py @@ -0,0 +1,63 @@ +class TablesConfig: + DATA = { + "ClusterCommAnalyzerTimeMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("rank_id", "INTEGER, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("start_timestamp", "NUMERIC, null"), + ("elapsed_time", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("wait_time", "NUMERIC, null"), + ("synchronization_time", "NUMERIC, null"), + ("idle_time", "NUMERIC, null"), + ("synchronization_time_ratio", "NUMERIC, null"), + ("wait_time_ratio", "NUMERIC, null") + ], + "CommunicationGroupMap": [ + ("type", "TEXT, null"), + ("rank_set", "TEXT, null") + ], + "ClusterCommAnalyzerBandwidthMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("rank_id", "INTEGER, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("band_type", "TEXT, null"), + ("transit_size", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("bandwidth", "NUMERIC, null"), + ("large_packet_ratio", "NUMERIC, null"), + ("package_size", "NUMERIC, null"), + ("count", "NUMERIC, null"), + ("total_duration", "NUMERIC, null") + ], + "ClusterCommAnalyzerMatrixMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("src_rank", "TEXT, null"), + ("dst_rank", "TEXT, null"), + ("transit_size", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("bandwidth", "NUMERIC, null"), + ("transport_type", "TEXT, null"), + ("op_name", "TEXT, null") + ], + "ClusterStepTraceTimeMap": [ + ("step", "TEXT, null"), + ("type", "TEXT, null"), + ("index", "TEXT, null"), + ("computing", "NUMERIC, null"), + ("communication_not_overlapped", "NUMERIC, null"), + ("overlapped", "NUMERIC, null"), + ("communication", "NUMERIC, null"), + ("free", "NUMERIC, null"), + ("stage", "NUMERIC, null"), + ("bubble", "NUMERIC, null"), + ("communication_not_overlapped_and_exclude_receive", "NUMERIC, null") + ] + } diff --git a/profiler/cluster_analyse/communication_group/base_communication_group.py 
b/profiler/cluster_analyse/communication_group/base_communication_group.py new file mode 100644 index 0000000000000000000000000000000000000000..515c77c93acf8cd7019747e2362ae103ba8fa528 --- /dev/null +++ b/profiler/cluster_analyse/communication_group/base_communication_group.py @@ -0,0 +1,138 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from abc import abstractmethod +from collections import defaultdict +from copy import deepcopy +from multiprocessing import Pool + +from common_func.constant import Constant + + +class BaseCommunicationGroup: + def __init__(self, params: dict): + self.collection_path = params.get(Constant.COLLECTION_PATH) + self.data_map = params.get(Constant.DATA_MAP) + self.data_type = params.get(Constant.DATA_TYPE) + self.analysis_mode = params.get(Constant.ANALYSIS_MODE) + self.rank_comm_dir_dict = {} + self.p2p_link = [] + self.collective_group_dict = defaultdict(set) + self.p2p_comm_group = [] + self.communication_group = {} + + def load_communication_data(self): + comm_op_dirs = [] + for rank_id, profiling_dir_path in self.data_map.items(): + if self.data_type == Constant.TEXT: + comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_JSON) + matrix_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_MATRIX_JSON) + else: + comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.DB_COMMUNICATION_ANALYZER) + matrix_dir = comm_dir + if comm_dir and matrix_dir: + comm_op_dirs.append((rank_id, comm_dir, matrix_dir)) + else: + print( + f"[WARNING] Rank {rank_id} does not have a valid communication.json or communication_matrix.json.") + with Pool() as p: + self.rank_comm_dir_dict = p.map(self.read_communication_func, comm_op_dirs) + + def set_p2p_groups(self): + self.p2p_link = sorted(self.p2p_link, key=lambda x: min(x)) + while self.p2p_link: + union_set = deepcopy(self.p2p_link[0]) + rm_list = [self.p2p_link[0]] + for idx, link_rank_set_x in enumerate(self.p2p_link[1:]): + if UnionFind.is_connected(link_rank_set_x, union_set): + union_set = union_set.union(link_rank_set_x) + rm_list.append(link_rank_set_x) + self.p2p_comm_group.append(union_set) + self.p2p_link = [element for element in self.p2p_link if element not in rm_list] + + def generate_collective_communication_group(self): + self.communication_group[Constant.COLLECTIVE] = \ + [list(group) for group_name, group in self.collective_group_dict.items()] + + def generate_p2p_communication_group(self): + stage_group = {} + for group_name, rank_set in self.collective_group_dict.items(): + if not self.whether_valid_comm_group(rank_set): + continue + unioned_set = set() + remove_key = [] + for first_rank, stage in stage_group.items(): + if UnionFind.is_connected(rank_set, stage): + unioned_set = UnionFind.union(rank_set, stage, unioned_set) + remove_key.append(first_rank) + if unioned_set: + for key in remove_key: + del stage_group[key] + 
stage_group[min(unioned_set)] = unioned_set
+            else:
+                stage_group[min(rank_set)] = rank_set
+        first_rank_sort_list = sorted([first_rank for first_rank in stage_group])
+        self.communication_group[Constant.P2P] = \
+            [list(stage_group.get(first_rank, {})) for first_rank in first_rank_sort_list]
+
+    def whether_valid_comm_group(self, rank_set: set):
+        """
+        When deciding which communication group should be used to infer stage info, a group is ignored if:
+            1. it includes more than one rank from any single p2p group
+        """
+        for p2p_rank_set in self.p2p_comm_group:
+            if len(rank_set.intersection(p2p_rank_set)) > 1:
+                return False
+        return True
+
+    @abstractmethod
+    def read_communication_func(self, params: tuple):
+        pass
+
+    @abstractmethod
+    def analyze_communication_data(self):
+        pass
+
+    @abstractmethod
+    def dump_data(self):
+        pass
+
+    def generate(self):
+        self.load_communication_data()
+        self.analyze_communication_data()
+        self.set_p2p_groups()
+        self.generate_collective_communication_group()
+        self.generate_p2p_communication_group()
+        return self.dump_data()
+
+
+class UnionFind(object):
+    """Disjoint Set Union"""
+
+    @classmethod
+    def union(cls, first_set: set, second_set: set, third_set: set):
+        """return the union of the three given sets"""
+        return first_set | second_set | third_set
+
+    @classmethod
+    def is_connected(cls, first_set: set, second_set: set):
+        """
+        check whether first_set and second_set are connected (share at least one rank)
+        """
+        if first_set & second_set:
+            return True
+        else:
+            return False
diff --git a/profiler/cluster_analyse/communication_group/communication_db_group.py b/profiler/cluster_analyse/communication_group/communication_db_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0cd7215e5ed39ef4d2866d32d396a7015654831
--- /dev/null
+++ b/profiler/cluster_analyse/communication_group/communication_db_group.py
@@ -0,0 +1,106 @@
+import os
+
+from common_func.db_manager import DBManager
+from common_func.constant import Constant
+from common_func.table_constant import TableConstant
+from communication_group.base_communication_group import BaseCommunicationGroup
+
+
+class CommunicationDBGroup(BaseCommunicationGroup):
+    COMMUNICATION_GROUP_TABLE = "CommunicationGroup"
+
+    def __init__(self, params: dict):
+        super().__init__(params)
+        self.communication_bandwidth_info = []
+        self.communication_time_info = []
+        self.matrix_info = []
+
+    def read_communication_func(self, params: tuple):
+        if len(params) < 3:
+            return -1, ({}, {}, {})
+        rank_id = params[0]
+        db_path = params[1]
+        time_data = {}
+        bandwidth_data = {}
+        matrix_data = {}
+        if DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_TIME,
+                                        Constant.TABLE_COMM_ANALYZER_BANDWIDTH,
+                                        Constant.TABLE_COMM_ANALYZER_MATRIX):
+            conn, cursor = DBManager.create_connect_db(db_path)
+            time_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_TIME)
+            bandwidth_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_BANDWIDTH)
+            matrix_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_MATRIX)
+            if self.analysis_mode in ["all", "communication_time"]:
+                time_data = DBManager.fetch_all_data(cursor, time_info_sql)
+                bandwidth_data = DBManager.fetch_all_data(cursor, bandwidth_info_sql)
+            if self.analysis_mode in ["all", "communication_matrix"]:
+                matrix_data = DBManager.fetch_all_data(cursor, matrix_info_sql)
+            DBManager.destroy_db_connect(conn, cursor)
+        return rank_id, (self.data_group_by_step(time_data), self.data_group_by_step(bandwidth_data),
+                         
self.data_group_by_step(matrix_data))
+
+    @staticmethod
+    def data_group_by_step(data: any) -> any:
+        res = {}
+        for item in data:
+            res.setdefault(item[TableConstant.STEP], []).append(item)
+        return res
+
+    def dump_data(self):
+        output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
+        result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER)
+        DBManager.create_tables(result_db, self.COMMUNICATION_GROUP_TABLE)
+        res = []
+        conn, cursor = DBManager.create_connect_db(result_db)
+        for data_type, data_list in self.communication_group.items():
+            for data in data_list:
+                rank_set = "(" + ",".join(str(i) for i in data) + ")"
+                data = [data_type, rank_set]
+                res.append(data)
+        if res:
+            sql = "insert into {} values ({value})".format(self.COMMUNICATION_GROUP_TABLE,
+                                                           value="?," * (len(res[0]) - 1) + "?")
+            DBManager.executemany_sql(conn, sql, res)
+        DBManager.destroy_db_connect(conn, cursor)
+        comm_data_dict = {
+            Constant.COLLECTIVE_GROUP: self.collective_group_dict,
+            Constant.COMMUNICATION_TIME_INFO: self.communication_time_info,
+            Constant.COMMUNICATION_BANDWIDTH_INFO: self.communication_bandwidth_info,
+            Constant.MATRIX_OPS: self.matrix_info,
+            Constant.COMMUNICATION_GROUP: self.communication_group
+        }
+        return comm_data_dict
+
+    def analyze_communication_data(self):
+        for rank_id, data_tuple in self.rank_comm_dir_dict:
+            time_data, bandwidth_data, matrix_data = data_tuple[0], data_tuple[1], data_tuple[2]
+            for step, data_list in time_data.items():
+                for data in data_list:
+                    self.compute_collective_group(data, rank_id, self.communication_time_info)
+                for data in bandwidth_data.get(step, []):
+                    self.compute_collective_group(data, rank_id, self.communication_bandwidth_info)
+            for step, data_list in matrix_data.items():
+                self.add_p2p_and_rank(rank_id, step, matrix_data)
+                for data in data_list:
+                    self.compute_collective_group(data, rank_id, self.matrix_info)
+
+    def compute_collective_group(self, data, rank_id, res_list):
+        if data[TableConstant.TYPE] == Constant.COLLECTIVE:
+            self.collective_group_dict[data[TableConstant.GROUP_NAME]].add(rank_id)
+        data[TableConstant.RANK_ID] = rank_id
+        res_list.append(data)
+
+    def add_p2p_and_rank(self, rank_id: int, step: str, data_dict: dict):
+        data_list = data_dict[step]
+        if not data_list:
+            print(f"[WARNING] rank {rank_id} {step} does not have communication matrix ops data")
+            return
+        for data in data_list:
+            if data[TableConstant.TYPE] != Constant.COLLECTIVE and data[TableConstant.TYPE] != Constant.P2P:
+                print(f"[WARNING] Unknown communication operator type!")
+                continue
+            if data[TableConstant.TYPE] == Constant.P2P:
+                if data[TableConstant.SRC_RANK] != data[TableConstant.DST_RANK]:
+                    rank_set = {data[TableConstant.SRC_RANK], data[TableConstant.DST_RANK]}
+                    if rank_set not in self.p2p_link:
+                        self.p2p_link.append(rank_set)
diff --git a/profiler/cluster_analyse/communication_group/communication_group_generator.py b/profiler/cluster_analyse/communication_group/communication_group_generator.py
index 4963bf95399fea29edf31be324a49801e7f485d1..3dca90454b608fe3ffb1c365854c2aa3950b6cee 100644
--- a/profiler/cluster_analyse/communication_group/communication_group_generator.py
+++ b/profiler/cluster_analyse/communication_group/communication_group_generator.py
@@ -13,211 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
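
# Sketch of the dispatch pattern this refactor introduces (stand-in classes, hypothetical
# names): the generator maps the detected data type to a processor class and delegates,
# instead of branching on the file format inside one monolithic implementation.
class _JsonProcessorSketch:
    def __init__(self, params: dict):
        self.params = params

    def generate(self):
        return {"source": "text", **self.params}


class _DbProcessorSketch(_JsonProcessorSketch):
    def generate(self):
        return {"source": "db", **self.params}


_GROUP_MAP_SKETCH = {"text": _JsonProcessorSketch, "db": _DbProcessorSketch}

def make_processor(params: dict):
    processor_cls = _GROUP_MAP_SKETCH.get(params.get("data_type"))
    if processor_cls is None:  # the real GROUP_MAP assumes a validated data_type
        raise ValueError(f"unsupported data_type: {params.get('data_type')}")
    return processor_cls(params)

# Example: routing by data_type mirrors CommunicationGroupGenerator.GROUP_MAP below.
assert make_processor({"data_type": "db"}).generate()["source"] == "db"
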
-import os -from copy import deepcopy -from multiprocessing import Pool -from collections import defaultdict from common_func.constant import Constant -from common_func.file_manager import FileManager +from communication_group.communication_db_group import CommunicationDBGroup +from communication_group.communication_json_group import CommunicationJsonGroup class CommunicationGroupGenerator: - COMMUNICATION_GROUP_JSON = "communication_group.json" + + GROUP_MAP = { + Constant.DB: CommunicationDBGroup, + Constant.TEXT: CommunicationJsonGroup + } def __init__(self, params: dict): - self.collection_path = params.get(Constant.COLLECTION_PATH) - self.data_map = params.get(Constant.DATA_MAP) - self.analysis_mode = params.get(Constant.ANALYSIS_MODE) - self.communication_group = {} - self.collective_group_dict = defaultdict(set) - self.p2p_group_dict = defaultdict(list) - self.rank_comm_dir_dict = {} - self.communication_ops = [] - self.p2p_comm_group = [] - self.p2p_link = [] - self.matrix_ops = [] + self.processor = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) def generate(self): - self.load_communication_json() - self.analyze_communication_ops() - self.set_p2p_groups() - self.generate_collective_communication_group() - self.generate_p2p_communication_group() - FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON) - comm_data_dict = { - Constant.COLLECTIVE_GROUP: self.collective_group_dict, - Constant.COMMUNICATION_OPS: self.communication_ops, - Constant.MATRIX_OPS: self.matrix_ops, - Constant.COMMUNICATION_GROUP: self.communication_group - } - return comm_data_dict - - def analyze_communication_ops(self): - for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict: - for step_id, step_id_dict in rank_id_comm_dict.items(): - if not isinstance(step_id_dict, dict): - print(f"[WARNING] rank{rank_id}'s communication.json has a wrong data struct.") - continue - self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) - for comm_op_type, comm_op_dict in step_id_dict.items(): - self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict) - - for step_id, step_id_dict in rank_id_matrix_dict.items(): - if not isinstance(step_id_dict, dict): - print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.") - continue - self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict) - self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) - - def read_comm_json_func(self: any, params: tuple): - if len(params) < 3: - return -1, {}, {} - rank_id = params[0] - comm_json_path = params[1] - matrix_json_path = params[2] - comm_data = {} - matrix_data = {} - if os.path.exists(comm_json_path) and self.analysis_mode in ['all', 'communication_time']: - comm_data = FileManager.read_json_file(comm_json_path) - if os.path.exists(matrix_json_path) and self.analysis_mode in ['all', 'communication_matrix']: - matrix_data = FileManager.read_json_file(matrix_json_path) - return rank_id, comm_data, matrix_data - - def load_communication_json(self): - comm_op_dirs = [] - for rank_id, profiling_dir_path in self.data_map.items(): - comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_JSON) - matrix_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_MATRIX_JSON) - if comm_dir and matrix_dir: - comm_op_dirs.append((rank_id, comm_dir, matrix_dir)) - else: - print(f"[WARNING] Rank {rank_id} does not have a valid 
communication.json or communication_matrix.json.") - with Pool() as p: - self.rank_comm_dir_dict = p.map(self.read_comm_json_func, comm_op_dirs) - - def generate_collective_communication_group(self): - self.communication_group[Constant.COLLECTIVE] = \ - [list(group) for group_name, group in self.collective_group_dict.items()] - - def whether_valid_comm_group(self, rank_set: set): - """ - while distinguish which communication group should be used to infer stage info, these group should be ignored: - 1. group can not include more than 1 rank in every single p2p group - """ - for p2p_rank_set in self.p2p_comm_group: - if len(rank_set.intersection(p2p_rank_set)) > 1: - return False - return True - - def generate_p2p_communication_group(self): - stage_group = {} - for group_name, rank_set in self.collective_group_dict.items(): - if not self.whether_valid_comm_group(rank_set): - continue - unioned_set = set() - remove_key = [] - for first_rank, stage in stage_group.items(): - if UnionFind.is_connected(rank_set, stage): - unioned_set = UnionFind.union(rank_set, stage, unioned_set) - remove_key.append(first_rank) - if unioned_set: - for key in remove_key: - del stage_group[key] - stage_group[min(unioned_set)] = unioned_set - else: - stage_group[min(rank_set)] = rank_set - first_rank_sort_list = sorted([first_rank for first_rank in stage_group]) - self.communication_group[Constant.P2P] = \ - [list(stage_group.get(first_rank, {})) for first_rank in first_rank_sort_list] - - def set_p2p_groups(self): - self.p2p_link = sorted(self.p2p_link, key=lambda x: min(x)) - while self.p2p_link: - union_set = deepcopy(self.p2p_link[0]) - rm_list = [self.p2p_link[0]] - for idx, link_rank_set_x in enumerate(self.p2p_link[1:]): - if UnionFind.is_connected(link_rank_set_x, union_set): - union_set = union_set.union(link_rank_set_x) - rm_list.append(link_rank_set_x) - self.p2p_comm_group.append(union_set) - self.p2p_link = [element for element in self.p2p_link if element not in rm_list] - - def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): - ops = rank_id_matrix_dict.get(step_id, {}) - self.add_matrix_ops(rank_id, step_id, ops) - if not ops: - print(f"[WARNING] rank{rank_id} {step_id} do not have communication matrix ops data.") - return - p2p_ops = ops.get(Constant.P2P, {}) - for op_name, link_dict in p2p_ops.items(): - self.append_p2p_link(op_name, link_dict) - - def append_p2p_link(self, op_name, link_dict): - for link in link_dict: - if '-' not in link: - print(f"[WARNING] {op_name} has an invalid link key {link}!") - break - src_rank = int(link.split('-')[0]) - dst_rank = int(link.split('-')[1]) - if src_rank != dst_rank: - rank_set = set([src_rank, dst_rank]) - if rank_set in self.p2p_link: - continue - self.p2p_link.append(rank_set) - - def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict): - for comm_op in comm_op_dict: - if comm_op.startswith('Total'): - continue - group_name = comm_op.split('@')[-1] - self.collective_group_dict[group_name].add(rank_id) - - def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict): - for comm_op in comm_op_dict: - if comm_op.startswith('Total'): - continue - group_name = comm_op.split('@')[-1] - self.communication_ops.append({ - Constant.RANK_ID: rank_id, - Constant.STEP_ID: step_id, - Constant.COMM_OP_TYPE: comm_op_type, - Constant.COMM_OP_NAME: comm_op, - Constant.GROUP_NAME: group_name, - Constant.COMM_OP_INFO: comm_op_dict.get(comm_op) - }) - - def add_matrix_ops(self, rank_id: int, 
step_id: str, step_id_dict: dict): - for comm_op_type, comm_dict in step_id_dict.items(): - if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P: - print(f"[WARNING] Unknown communication operators type!") - continue - for op_name, op_link_info in comm_dict.items(): - if op_name.startswith('Total'): - continue - group_name = op_name.split('@')[-1] - self.matrix_ops.append({ - Constant.RANK_ID: rank_id, - Constant.STEP_ID: step_id, - Constant.COMM_OP_TYPE: comm_op_type, - Constant.COMM_OP_NAME: op_name, - Constant.GROUP_NAME: group_name, - Constant.COMM_OP_INFO: op_link_info - }) - - -class UnionFind(object): - """Disjoint Set Union""" - @classmethod - def union(cls, p: set, q: set, o: set): - """make p and q the same set""" - return p | q | o - - @classmethod - def is_connected(cls, p: set, q: set): - """ - check whether set p and set q are connected - """ - if p & q: - return True - else: - return False + return self.processor.generate() diff --git a/profiler/cluster_analyse/communication_group/communication_json_group.py b/profiler/cluster_analyse/communication_group/communication_json_group.py new file mode 100644 index 0000000000000000000000000000000000000000..da6e6c1fe4f699af49ad198df41afb80e34e8772 --- /dev/null +++ b/profiler/cluster_analyse/communication_group/communication_json_group.py @@ -0,0 +1,132 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
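
# Editor's sketch (assumed semantics) of the p2p grouping used by set_p2p_groups above:
# links whose rank sets intersect are folded into one connected component, which is what
# the UnionFind helpers express. group_p2p_links is a hypothetical standalone equivalent.
def group_p2p_links(links: list) -> list:
    groups = []
    for link in sorted(links, key=min):
        merged = set(link)
        disjoint = []
        for group in groups:
            if group & merged:
                merged |= group  # connected: absorb the existing component
            else:
                disjoint.append(group)
        groups = disjoint + [merged]
    return groups

# Example: links 0-1 and 1-2 collapse into one stage candidate; 4-5 stays separate.
assert group_p2p_links([{0, 1}, {1, 2}, {4, 5}]) == [{0, 1, 2}, {4, 5}]
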
+ +import os + +from common_func.constant import Constant +from common_func.file_manager import FileManager +from communication_group.base_communication_group import BaseCommunicationGroup + + +class CommunicationJsonGroup(BaseCommunicationGroup): + COMMUNICATION_GROUP_JSON = "communication_group.json" + + def __init__(self, params: dict): + super().__init__(params) + self.communication_ops = [] + self.matrix_ops = [] + + def dump_data(self): + FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON) + comm_data_dict = { + Constant.COLLECTIVE_GROUP: self.collective_group_dict, + Constant.COMMUNICATION_OPS: self.communication_ops, + Constant.MATRIX_OPS: self.matrix_ops, + Constant.COMMUNICATION_GROUP: self.communication_group + } + return comm_data_dict + + def analyze_communication_data(self): + for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict: + for step_id, step_id_dict in rank_id_comm_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] rank{rank_id}'s communication.json has a wrong data struct.") + continue + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) + for comm_op_type, comm_op_dict in step_id_dict.items(): + self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict) + + for step_id, step_id_dict in rank_id_matrix_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.") + continue + self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict) + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) + + def read_communication_func(self: any, params: tuple): + if len(params) < 3: + return -1, {}, {} + rank_id = params[0] + comm_json_path = params[1] + matrix_json_path = params[2] + comm_data = {} + matrix_data = {} + if os.path.exists(comm_json_path) and self.analysis_mode in ["all", "communication_time"]: + comm_data = FileManager.read_json_file(comm_json_path) + if os.path.exists(matrix_json_path) and self.analysis_mode in ["all", "communication_matrix"]: + matrix_data = FileManager.read_json_file(matrix_json_path) + return rank_id, comm_data, matrix_data + + def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): + ops = rank_id_matrix_dict.get(step_id, {}) + self.add_matrix_ops(rank_id, step_id, ops) + if not ops: + print(f"[WARNING] rank{rank_id} {step_id} do not have communication matrix ops data.") + return + p2p_ops = ops.get(Constant.P2P, {}) + for op_name, link_dict in p2p_ops.items(): + self.append_p2p_link(op_name, link_dict) + + def append_p2p_link(self, op_name, link_dict): + for link in link_dict: + if '-' not in link: + print(f"[WARNING] {op_name} has an invalid link key {link}!") + break + src_rank = int(link.split('-')[0]) + dst_rank = int(link.split('-')[1]) + if src_rank != dst_rank: + rank_set = set([src_rank, dst_rank]) + if rank_set in self.p2p_link: + continue + self.p2p_link.append(rank_set) + + def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.collective_group_dict[group_name].add(rank_id) + + def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.communication_ops.append({ + 
Constant.RANK_ID: rank_id, + Constant.STEP_ID: step_id, + Constant.COMM_OP_TYPE: comm_op_type, + Constant.COMM_OP_NAME: comm_op, + Constant.GROUP_NAME: group_name, + Constant.COMM_OP_INFO: comm_op_dict.get(comm_op) + }) + + def add_matrix_ops(self, rank_id: int, step_id: str, step_id_dict: dict): + for comm_op_type, comm_dict in step_id_dict.items(): + if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P: + print(f"[WARNING] Unknown communication operators type!") + continue + for op_name, op_link_info in comm_dict.items(): + if op_name.startswith('Total'): + continue + group_name = op_name.split('@')[-1] + self.matrix_ops.append({ + Constant.RANK_ID: rank_id, + Constant.STEP_ID: step_id, + Constant.COMM_OP_TYPE: comm_op_type, + Constant.COMM_OP_NAME: op_name, + Constant.GROUP_NAME: group_name, + Constant.COMM_OP_INFO: op_link_info + }) diff --git a/profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py b/profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..e09108f3cbe3744068daf6c5316dc318aea53177 --- /dev/null +++ b/profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py @@ -0,0 +1,45 @@ +from collections import OrderedDict + +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.common_func import update_order_id + + +class ModuleStatisticComparator(BaseComparator): + def __init__(self, origin_data: list, bean: any): + super().__init__(origin_data, bean) + + def _compare(self): + if not self._origin_data: + return + base_module_dict, comparison_module_dict = self._group_by_module_name() + for module_name, base_data in base_module_dict.items(): + comparison_data = comparison_module_dict.pop(module_name, []) + self._rows.extend(self._bean(module_name, base_data, comparison_data).rows) + for module_name, comparison_data in comparison_module_dict.items(): + self._rows.extend(self._bean(module_name, [], comparison_data).rows) + update_order_id(self._rows) + + def _group_by_module_name(self): + base_module_dict, comparison_module_dict = OrderedDict(), OrderedDict() + base_all_data = [data for data in self._origin_data if data[0]] # index 0 for base module + base_all_data.sort(key=lambda x: x[0].start_time) + base_none_data = [data for data in self._origin_data if not data[0]] # index 0 for base module + base_none_data.sort(key=lambda x: x[1].start_time) + index = 0 + for base_module, comparison_module in base_all_data: + base_module_dict.setdefault(base_module.module_name, []).append(base_module) + if not comparison_module: + continue + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + if module.start_time < comparison_module.start_time: + comparison_module_dict.setdefault(module.module_name, []).append(module) + index += 1 + else: + break + comparison_module_dict.setdefault(comparison_module.module_name, []).append(comparison_module) + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + comparison_module_dict.setdefault(module.module_name, []).append(module) + index += 1 + return base_module_dict, comparison_module_dict diff --git a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py index 
bfc631c66c86f061b10445e117e9f947d7ebdbc5..7ad66c0fa7d8f42c6078bd1d04f2164ebda0e9d8 100644
--- a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py
+++ b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py
@@ -37,9 +37,12 @@ class OverallPerformanceComparator(BaseComparator):
             self._headers.append('Mem Usage')
             base_col.append(f'{base_profiling_info.memory_used:.2f}G')
             comp_col.append(f'{comp_profiling_info.memory_used:.2f}G')
-        self._headers.extend(['Uncovered Communication Time'])
-        base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s'])
-        comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s'])
+        self._headers.extend(['Uncovered Communication Time(Wait Time)'])
+        if base_profiling_info.wait_time:
+            base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s({base_profiling_info.wait_time:.3f}s)'])
+        else:
+            base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s( / )'])
+        comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s({comp_profiling_info.wait_time:.3f}s)'])
         if base_profiling_info.sdma_time or comp_profiling_info.sdma_time:
             self._headers.append('SDMA Time(Num)')
             base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})')
diff --git a/profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py b/profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py
new file mode 100644
index 0000000000000000000000000000000000000000..97fc98bdd354e1ebe1fbb3fc44def4eaf3059235
--- /dev/null
+++ b/profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py
@@ -0,0 +1,98 @@
+import re
+
+from compare_backend.utils.common_func import calculate_diff_ratio
+from compare_backend.utils.constant import Constant
+from compare_backend.utils.excel_config import ExcelConfig
+
+
+class ModuleStatisticBean:
+    TABLE_NAME = Constant.MODULE_TOP_TABLE
+    HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME)
+    OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME)
+
+    def __init__(self, name: str, base_data: list, comparison_data: list):
+        self._module_name = name.replace("nn.Module:", "")
+        pattern = re.compile('_[0-9]+$')
+        self._module_class = pattern.sub('', name.split("/")[-1])
+        self._module_level = name.count("/")
+        self._base_info = ModuleStatisticInfo(base_data)
+        self._comparison_info = ModuleStatisticInfo(comparison_data)
+
+    @property
+    def rows(self):
+        rows = [self.get_total_row()]
+        rows.extend(self.get_detail_rows())
+        return rows
+
+    @staticmethod
+    def _get_kernel_detail_rows(base_kernel_dict, com_kernel_dict):
+        base_kernel_detals = ""
+        com_kernel_details = ""
+        for kernel_name, base_dur_list in base_kernel_dict.items():
+            base_dur = "%.3f" % sum(base_dur_list)
+            base_kernel_detals += f"{kernel_name}, [number: {len(base_dur_list)}], [duration_ms: {base_dur}]\n"
+        for kernel_name, com_dur_list in com_kernel_dict.items():
+            com_dur = "%.3f" % sum(com_dur_list)
+            com_kernel_details += f"{kernel_name}, [number: {len(com_dur_list)}], [duration_ms: {com_dur}]\n"
+        return [base_kernel_detals, com_kernel_details]
+
+    def get_total_row(self):
+        total_diff, total_ratio = calculate_diff_ratio(self._base_info.device_total_dur_ms,
+                                                       self._comparison_info.device_total_dur_ms)
+        self_diff, _ = calculate_diff_ratio(self._base_info.device_self_dur_ms,
+                                            self._comparison_info.device_self_dur_ms)
+        row = [None, self._module_class, self._module_level, self._module_name, "[ TOTAL ]", None,
+               
self._base_info.device_self_dur_ms, self._base_info.number, self._base_info.device_total_dur_ms, + None, self._comparison_info.device_self_dur_ms, self._comparison_info.number, + self._comparison_info.device_total_dur_ms, total_diff, self_diff, + total_ratio, self._base_info.call_stack, self._comparison_info.call_stack] + return row + + def get_detail_rows(self): + rows = [] + for op_name, base_dur_dict in self._base_info.api_dict.items(): + base_dur_list = base_dur_dict.get("total", []) + com_dur_dict = self._comparison_info.api_dict.pop(op_name, {}) + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = self._get_kernel_detail_rows(base_dur_dict.get("detail", {}), + com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(sum(base_dur_list), sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, + sum(base_dur_list), len(base_dur_list), None, com_kernel_details, sum(com_dur_list), + len(com_dur_list), None, None, self_diff, self_ratio, None, None] + rows.append(row) + + for op_name, com_dur_dict in self._comparison_info.api_dict.items(): + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = self._get_kernel_detail_rows({}, com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(0, sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, 0, 0, + None, com_kernel_details, sum(com_dur_list), len(com_dur_list), None, None, self_diff, + self_ratio, None, None] + rows.append(row) + return rows + + +class ModuleStatisticInfo: + def __init__(self, data_list: list): + self._data_list = data_list + self.device_self_dur_ms = 0 + self.device_total_dur_ms = 0 + self.call_stack = "" + self.number = len(data_list) + self.api_dict = {} + self._get_info() + + def _get_info(self): + if self._data_list: + self.call_stack = self._data_list[0].call_stack + for module in self._data_list: + self.device_self_dur_ms += module.device_self_dur / Constant.US_TO_MS + self.device_total_dur_ms += module.device_total_dur / Constant.US_TO_MS + for torch_op in module.toy_layer_api_list: + self.api_dict.setdefault(torch_op.name, {}).setdefault("total", []).append( + torch_op.device_dur / Constant.US_TO_MS) + for kernel in torch_op.kernel_list: + self.api_dict.setdefault(torch_op.name, {}).setdefault("detail", {}).setdefault(kernel.kernel_name, + []).append( + kernel.device_dur / Constant.US_TO_MS) diff --git a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py index 6ce91ba53c8f2a9286319f35f76b62773743bc49..b3491cfb1393025f5a0f148070f048c103aadd64 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py +++ b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py @@ -187,6 +187,21 @@ class TraceEventBean: return True return False + def is_python_function(self): + return self.lower_cat == "python_function" + + def is_optimizer(self): + return self.lower_name.startswith("optimizer") + + def is_fwdbwd(self): + return self.lower_cat == "fwdbwd" + + def is_step_profiler(self): + return self.name.find("ProfilerStep#") != -1 + + def reset_name(self, name): + self._name = name + def init(self): if isinstance(self._event, dict): self._pid = self._event.get("pid", 0) diff --git 
a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py index 9184c790b7ea59246b602442a13e7e533d921bc8..b100e7ba9877d6bf4c316b590afc3a37a9346070 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py +++ b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py @@ -18,6 +18,7 @@ class ProfilingInfo: self.fa_num_bwd = 0 self.compute_time = 0.0 self.communication_not_overlapped = 0.0 + self.wait_time = 0.0 self.memory_used = 0.0 self.e2e_time = 0.0 self.sdma_time = 0.0 @@ -33,6 +34,7 @@ class ProfilingInfo: self.vec_time = self.vec_time / 10 ** 6 self.compute_time = self.compute_time / 10 ** 6 self.communication_not_overlapped = self.communication_not_overlapped / 10 ** 6 + self.wait_time = self.wait_time / 10 ** 6 self.e2e_time = self.e2e_time / 10 ** 6 self.sdma_time = self.sdma_time / 10 ** 6 self.scheduling_time = self.scheduling_time / 10 ** 6 @@ -84,6 +86,9 @@ class ProfilingInfo: def update_comm_not_overlap(self, time: float): self.communication_not_overlapped += time + def update_comm_not_overlap_wait_time(self, time: float): + self.wait_time = time + def set_memory_used(self, memory: float): self.memory_used = memory diff --git a/profiler/compare_tools/compare_backend/data_prepare/__init__.py b/profiler/compare_tools/compare_backend/data_prepare/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py b/profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..6d45b98dd700117d01d8f55a6a8de66983f25f8a --- /dev/null +++ b/profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py @@ -0,0 +1,97 @@ +from queue import Queue + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult +from compare_backend.utils.constant import Constant +from compare_backend.utils.module_node import ModuleNode +from compare_backend.utils.tree_builder import TreeBuilder + + +class ModuleDataPrepare: + def __init__(self, profiling_data: ProfilingResult): + self.profiling_data = profiling_data + self._nn_module_list = [] + self._call_function = [] + for event in profiling_data.python_function_data: + if event.lower_name.startswith("nn.module:"): + self._nn_module_list.append(event) + else: + self._call_function.append(event) + self._bwd_dict = {} + self._bwd_pid = self._get_bwd_pid() + + @staticmethod + def update_module_node_info(fwd_root_node, bwd_root_node, func_root_node): + queue = Queue() + queue.put(fwd_root_node) + queue.put(bwd_root_node) + while not queue.empty(): + module_node = queue.get() + module_node.update_torch_op_kernel_list() + call_function = func_root_node.find_module_call(module_node.start_time) + if call_function: + module_node.reset_call_stack(call_function.call_stack) + for sub_module_node in module_node.child_nodes: + queue.put(sub_module_node) + + def build_module_tree(self): + if not self._nn_module_list: + return [None, None] + self._dispatch_torch_op() + event_list = [TraceEventBean({"ts": ts}) for ts in self.profiling_data.kernel_dict.keys()] + self._nn_module_list.extend(event_list) + root_node = TreeBuilder.build_module_tree(self._nn_module_list, self.profiling_data.kernel_dict) + 
func_root_node = TreeBuilder.build_module_tree(self._call_function, {}) + bwd_module_list = self.get_bwd_module(root_node) + if bwd_module_list: + bwd_module_list.extend(event_list) + bwd_root_node = TreeBuilder.build_module_tree(bwd_module_list, self.profiling_data.kernel_dict) + self.match_torch_op(root_node, bwd_root_node) + self.update_module_node_info(root_node, bwd_root_node, func_root_node) + return [root_node, bwd_root_node] + + def get_bwd_module(self, root_node: ModuleNode): + bwd_module_list = [] + for flow in self.profiling_data.fwdbwd_dict.values(): + start_point = flow.get("start") + end_point = flow.get("end") + if not start_point or not end_point: + continue + end_event = self._bwd_dict.get(end_point.start_time) + if not end_event: + continue + call_module = root_node.find_module_call(start_point.start_time) + if call_module: + end_event.reset_name(f"[ BACKWARD ]{call_module.module_name}") + bwd_module_list.append(end_event) + return bwd_module_list + + def match_torch_op(self, fwd_root_node, bwd_root_node): + torch_op_list = sorted(self.profiling_data.torch_op_data, key=lambda x: x.start_time) + for torch_op in torch_op_list: + if torch_op.is_optimizer(): + continue + if torch_op.is_step_profiler(): + continue + matched_module = fwd_root_node.find_module_call(torch_op.start_time) + if matched_module: + matched_module.find_torch_op_call(torch_op) + continue + matched_module = bwd_root_node.find_module_call(torch_op.start_time) + if matched_module: + matched_module.find_torch_op_call(torch_op) + + def _dispatch_torch_op(self): + for torch_op in self.profiling_data.torch_op_data: + if torch_op.is_optimizer(): + self._nn_module_list.append(torch_op) + continue + if torch_op.pid == self._bwd_pid: + self._bwd_dict[torch_op.start_time] = torch_op + + def _get_bwd_pid(self): + for flow in self.profiling_data.fwdbwd_dict.values(): + end_point = flow.get("end") + if end_point: + return end_point.pid + return Constant.INVALID_VALUE diff --git a/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..fdce23c6ab4ff7f9f6f7d6bc1442063c57cb6098 --- /dev/null +++ b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py @@ -0,0 +1,19 @@ +from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult +from compare_backend.utils.tree_builder import TreeBuilder + + +class OperatorDataPrepare: + def __init__(self, profiling_data: ProfilingResult): + self.profiling_data = profiling_data + + def get_top_layer_ops(self) -> any: + root_node = TreeBuilder.build_tree(self.profiling_data.torch_op_data, self.profiling_data.kernel_dict, + self.profiling_data.memory_list) + level1_child_nodes = root_node.child_nodes + result_data = [] + for level1_node in level1_child_nodes: + if level1_node.is_step_profiler(): + result_data.extend(level1_node.child_nodes) + else: + result_data.append(level1_node) + return result_data diff --git a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py index 72ce3ba86893b08ffdd8deff5c586731db4b84f5..677b0d18c2cea7e72ccdc78b2a5d2b750276e0ed 100644 --- a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py +++ b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py @@ -1,23 +1,26 @@ import os from 
collections import deque from datetime import datetime - -import numpy as np +from queue import Queue from compare_backend.comparator.communication_comparator import CommunicationComparator +from compare_backend.comparator.module_statistic_comparator import ModuleStatisticComparator from compare_backend.comparator.operator_comparator import OperatorComparator from compare_backend.comparator.operator_statistic_comparator import OperatorStatisticComparator from compare_backend.compare_bean.communication_bean import CommunicationBean from compare_backend.compare_bean.memory_compare_bean import MemoryCompareBean from compare_backend.compare_bean.memory_statistic_bean import MemoryStatisticBean +from compare_backend.compare_bean.module_statistic_bean import ModuleStatisticBean from compare_backend.compare_bean.operator_compare_bean import OperatorCompareBean from compare_backend.compare_bean.operator_statistic_bean import OperatorStatisticBean +from compare_backend.data_prepare.module_data_prepare import ModuleDataPrepare +from compare_backend.data_prepare.operator_data_prepare import OperatorDataPrepare from compare_backend.generator.base_generator import BaseGenerator -from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult +from compare_backend.utils.common_func import longest_common_subsequence_matching from compare_backend.utils.constant import Constant +from compare_backend.utils.module_node import ModuleNode from compare_backend.utils.name_function import NameFunction from compare_backend.utils.torch_op_node import TorchOpNode -from compare_backend.utils.tree_builder import TreeBuilder from compare_backend.view.excel_view import ExcelView @@ -46,6 +49,8 @@ class DetailPerformanceGenerator(BaseGenerator): comparator_list = [] if self._args.enable_operator_compare or self._args.enable_memory_compare: op_compare_result = self.match_torch_op() + if self._args.enable_operator_compare: + module_compare_result = self.match_nn_module() if self._args.enable_communication_compare: communication_data = { @@ -56,87 +61,24 @@ class DetailPerformanceGenerator(BaseGenerator): if self._args.enable_operator_compare: comparator_list.append(OperatorComparator(op_compare_result, OperatorCompareBean)) comparator_list.append(OperatorStatisticComparator(op_compare_result, OperatorStatisticBean)) - + if module_compare_result: + comparator_list.append(ModuleStatisticComparator(module_compare_result, ModuleStatisticBean)) if self._args.enable_memory_compare: comparator_list.append(OperatorComparator(op_compare_result, MemoryCompareBean)) comparator_list.append(OperatorStatisticComparator(op_compare_result, MemoryStatisticBean)) return comparator_list def match_torch_op(self) -> list: - base_ops = self._get_top_layer_ops(self._profiling_data_dict.get(Constant.BASE_DATA)) - comparison_ops = self._get_top_layer_ops(self._profiling_data_dict.get(Constant.COMPARISON_DATA)) + base_ops = OperatorDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).get_top_layer_ops() + comparison_ops = OperatorDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).get_top_layer_ops() if not base_ops and not comparison_ops: return [] name_func = NameFunction(self._args).get_name_func() - compare_result_data = self._matching_op(base_ops, comparison_ops, name_func) + op_compare_result = longest_common_subsequence_matching(base_ops, comparison_ops, name_func) if self._args.max_kernel_num is not None: - compare_result_data = self._drill_down(compare_result_data, name_func) - return 
compare_result_data - - @classmethod - def _matching_op(cls, base_ops: list, comparison_ops: list, name_func: any) -> list: - if not comparison_ops: - result_data = [None] * len(base_ops) - for index, value in enumerate(base_ops): - result_data[index] = [value, None] - return result_data - - result_data = [] - comparison_len, base_len = len(comparison_ops), len(base_ops) - dp = [[0] * (base_len + 1) for _ in range(comparison_len + 1)] - for comparison_index in range(1, comparison_len + 1): - for base_index in range(1, base_len + 1): - if name_func(base_ops[base_index - 1]) == name_func( - comparison_ops[comparison_index - 1]): - dp[comparison_index][base_index] = dp[comparison_index - 1][base_index - 1] + 1 - else: - dp[comparison_index][base_index] = max(dp[comparison_index][base_index - 1], - dp[comparison_index - 1][base_index]) - matched_op = [] - comparison_index, base_index = comparison_len, base_len - while comparison_index > 0 and base_index > 0: - if name_func(base_ops[base_index - 1]) == name_func( - comparison_ops[comparison_index - 1]): - matched_op.append([comparison_index - 1, base_index - 1]) - comparison_index -= 1 - base_index -= 1 - continue - if dp[comparison_index][base_index - 1] > dp[comparison_index - 1][base_index]: - base_index -= 1 - else: - comparison_index -= 1 - if not matched_op: - matched_base_index_list = [] - else: - matched_op.reverse() - matched_op = np.array(matched_op) - matched_base_index_list = list(matched_op[:, 1]) - curr_comparison_index = 0 - for base_index, base_api_node in enumerate(base_ops): - if base_index not in matched_base_index_list: - result_data.append([base_api_node, None]) - continue - matched_comparison_index = matched_op[matched_base_index_list.index(base_index), 0] - for comparison_index in range(curr_comparison_index, matched_comparison_index): - result_data.append([None, comparison_ops[comparison_index]]) - result_data.append([base_api_node, comparison_ops[matched_comparison_index]]) - curr_comparison_index = matched_comparison_index + 1 - if curr_comparison_index < len(comparison_ops): - for comparison_index in range(curr_comparison_index, len(comparison_ops)): - result_data.append([None, comparison_ops[comparison_index]]) - return result_data - - def _get_top_layer_ops(self, profiling_data: ProfilingResult) -> any: - root_node = TreeBuilder.build_tree(profiling_data.torch_op_data, profiling_data.kernel_dict, - profiling_data.memory_list) - level1_child_nodes = root_node.child_nodes - result_data = [] - for level1_node in level1_child_nodes: - if level1_node.is_step_profiler(): - result_data.extend(level1_node.child_nodes) - else: - result_data.append(level1_node) - return result_data + op_compare_result = self._drill_down(op_compare_result, name_func) + return op_compare_result def _drill_down(self, compare_result_data: list, name_func: any) -> list: drill_down_result = [] @@ -152,9 +94,41 @@ class DetailPerformanceGenerator(BaseGenerator): if max(base_op.kernel_num, comparison_op.kernel_num) <= self._args.max_kernel_num: drill_down_result.append(match_data) continue - match_list = self._matching_op(base_op.child_nodes, comparison_op.child_nodes, name_func) + match_list = longest_common_subsequence_matching(base_op.child_nodes, comparison_op.child_nodes, name_func) match_list.reverse() for data in match_list: op_deque.append(data) return drill_down_result + + def match_nn_module(self) -> list: + module_compare_result = [] + base_root_node = ModuleDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).build_module_tree() 
+ comparison_root_node = ModuleDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).build_module_tree() + for index, base_node in enumerate(base_root_node): + comparison_node = comparison_root_node[index] if index < len(comparison_root_node) else None + if not base_node or not comparison_node: + continue + module_compare_result.extend(self._matching_all_modules(base_node, comparison_node)) + return module_compare_result + + def _matching_all_modules(self, base_node: ModuleNode, comparison_node: ModuleNode): + all_matched_modules = [] + matched_queue = Queue() + matched_queue.put([base_node, comparison_node]) + while not matched_queue.empty(): + matched_base_node, matched_comparison_node = matched_queue.get() + matched_node_list = self._matching_common_subsequence(matched_base_node, matched_comparison_node) + all_matched_modules.extend(matched_node_list) + for matched_node in matched_node_list: + matched_queue.put(matched_node) + return all_matched_modules + + def _matching_common_subsequence(self, base_node: ModuleNode, comparison_node: ModuleNode): + base_modules = base_node.child_nodes if base_node else [] + comparison_modules = comparison_node.child_nodes if comparison_node else [] + if not base_modules and not comparison_modules: + return [] + name_func = NameFunction(self._args).get_module_name + return longest_common_subsequence_matching(base_modules, comparison_modules, name_func) diff --git a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py index 4c0b51272b0bbf71f6632a7b28005bae2298d056..3adcaa99d698bd4421ae27963c2e46df726733aa 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py @@ -18,11 +18,19 @@ class ProfilingResult: self.memory_list = [] self.communication_dict = {} self.overall_metrics = ProfilingInfo(profiling_type) + self.python_function_data = [] + self.fwdbwd_dict = {} def update_torch_op_data(self, event: TraceEventBean): event.is_torch_op = True self.torch_op_data.append(event) + def update_python_function_data(self, event: TraceEventBean): + self.python_function_data.append(event) + + def update_fwdbwd_data(self, flow_type: str, event: TraceEventBean): + self.fwdbwd_dict.setdefault(event.id, {})[flow_type] = event + def update_kernel_dict(self, start_time: Decimal, kernel_event: TraceEventBean): self.kernel_dict.setdefault(start_time, []).append(KernelEvent(kernel_event, self._profiling_type)) @@ -53,6 +61,7 @@ class BaseProfilingParser(ABC): self._result_data = ProfilingResult(self._profiling_type) self._memory_events = [] self._flow_dict = {} + self._fwdbwd_dict = {} self._all_kernels = {} self._comm_task_list = [] self._comm_list = [] @@ -134,6 +143,21 @@ class BaseProfilingParser(ABC): return True return False + def _picking_python_function_event(self, event: TraceEventBean): + if event.is_python_function(): + self._result_data.update_python_function_data(event) + return True + return False + + def _picking_fwdbwd_flow_event(self, event: TraceEventBean): + if event.is_fwdbwd(): + if event.is_flow_start(): + self._result_data.update_fwdbwd_data("start", event) + elif event.is_flow_end(): + self._result_data.update_fwdbwd_data("end", event) + return True + return False + def _update_kernel_dict(self): if self._profiling_type == Constant.NPU: for comm in self._comm_list: diff --git 
a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py index 2ad2e1a557fad7095bea642892c64f32363182e9..923854bdf73ad4d60f8b48b7eb3c83d526113b82 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py @@ -9,7 +9,7 @@ from compare_backend.utils.constant import Constant class GPUProfilingParser(BaseProfilingParser): CUBE_MARK = 'gemm' - FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel']] + FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel'], ['attention', 'kernel']] SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)'] FLOW_CAT = ("async_gpu", "async_cpu_to_gpu", "ac2g", "async") TORCH_OP_CAT = ("cpu_op", "user_annotation", "cuda_runtime", "operator") @@ -136,6 +136,9 @@ class GPUProfilingParser(BaseProfilingParser): func_set.add(self._picking_torch_op_event) if self._enable_communication_compare: func_set.add(self._picking_kernel_event) + if self._enable_operator_compare: + func_set.add(self._picking_python_function_event) + func_set.add(self._picking_fwdbwd_flow_event) if self._enable_operator_compare or self._args.max_kernel_num: func_set.add(self._picking_kernel_event) func_set.add(self._picking_flow_event) diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index f872e52a5314a40dbc2e0d4ff7868e875986b809..1de55d693f123bc205520488d7f8e5cc0492e4b0 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -41,6 +41,9 @@ class NPUProfilingParser(BaseProfilingParser): if self._enable_operator_compare or self._args.max_kernel_num: func_list.add(self._picking_kernel_event) func_list.add(self._picking_flow_event) + if self._enable_operator_compare: + func_list.add(self._picking_python_function_event) + func_list.add(self._picking_fwdbwd_flow_event) if self._enable_memory_compare: func_list.add(self._picking_task_queue_data) if self._enable_communication_compare: @@ -48,6 +51,7 @@ class NPUProfilingParser(BaseProfilingParser): if self._enable_profiling_compare: func_list.add(self._picking_overlap_analysis_data) func_list.add(self._picking_kernel_event) + func_list.add(self._picking_hccl_event) return list(func_list) def _update_memory_list(self): @@ -98,10 +102,69 @@ class NPUProfilingParser(BaseProfilingParser): self.__parse_kernel_csv() self.__add_sdma_time() self.__add_overlap_analysis_time() + self._picking_notify_wait_event_and_not_overlap_event() + self.__add_overlap_wait_time() self._result_data.overall_metrics.calculate_other_time() self._result_data.overall_metrics.calculate_schedule_time() self._result_data.overall_metrics.trans_time_to_s() + def _picking_notify_wait_event_and_not_overlap_event(self): + self.notify_event_cache = [] + self._not_overlaped_commu_event = [] + for event in self._comm_task_list: + if event.name == 'Notify_Wait' and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_CHECK' \ + and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_ACK': + self.notify_event_cache.append(event) + for event in self._overlap_analysis: + if event.is_comm_not_overlap(): + self._not_overlaped_commu_event.append(event) + self._not_overlaped_commu_event.sort(key=lambda x: x.start_time) + + def 
__add_overlap_wait_time(self):
+        # Group the cached Notify_Wait events by plane (thread) id
+        notify_wait_event_dict = dict()
+        for notify_event in self.notify_event_cache:
+            if notify_event.tid in notify_wait_event_dict:
+                notify_wait_event_dict[notify_event.tid].append(notify_event)
+            else:
+                notify_wait_event_dict[notify_event.tid] = [notify_event]
+        # For each non-overlapped communication event, sum the Notify_Wait time that falls
+        # inside its window on every plane, then count the slowest plane towards the total
+        total_time = 0
+        for commu_event in self._not_overlaped_commu_event:
+            wait_time_list = []
+            commu_event_start_time = float(commu_event.start_time)
+            commu_event_end_time = float(commu_event.start_time) + commu_event.dur
+
+            for plane_id, events in notify_wait_event_dict.items():
+                wait_time = 0
+                idx = 0
+                for notify_event in events:
+                    notify_event_start_time = float(notify_event.start_time)
+                    notify_event_end_time = float(notify_event.start_time) + notify_event.dur
+                    if notify_event_start_time < commu_event_start_time and notify_event_end_time > \
+                            commu_event_end_time:
+                        # the notify event covers the whole window; cap the wait at the window length
+                        wait_time = commu_event_end_time - commu_event_start_time
+                        break
+                    elif notify_event_start_time < commu_event_start_time <= notify_event_end_time <= \
+                            commu_event_end_time:
+                        # the notify event straddles the left edge of the window
+                        wait_time += notify_event_end_time - commu_event_start_time
+                        idx += 1
+                    elif commu_event_start_time <= notify_event_start_time <= commu_event_end_time < \
+                            notify_event_end_time:
+                        # the notify event straddles the right edge of the window
+                        wait_time += commu_event_end_time - notify_event_start_time
+                        break
+                    elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= \
+                            commu_event_end_time:
+                        # the notify event lies entirely inside the window
+                        wait_time += notify_event_end_time - notify_event_start_time
+                        idx += 1
+                    elif notify_event_end_time < commu_event_start_time:
+                        # the notify event ends before the window starts; drop it
+                        idx += 1
+                    else:
+                        # the notify event starts after the window ends; stop scanning this plane
+                        break
+
+                wait_time_list.append(wait_time)
+                # consumed notify events are sliced off so later windows do not recount them
+                notify_wait_event_dict[plane_id] = notify_wait_event_dict[plane_id][idx:]
+            total_time += max(wait_time_list)
+        self._result_data.overall_metrics.update_comm_not_overlap_wait_time(total_time)
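The five branches above reduce to clipping each Notify_Wait interval to the communication window; a minimal standalone sketch of that equivalence:

```python
# Sketch: the per-plane wait accumulated above equals the summed intersection of
# each Notify_Wait interval with the communication window, capped by the window.
def clipped_overlap(comm_start, comm_end, notify_start, notify_end):
    # length of the intersection of [notify_start, notify_end] and [comm_start, comm_end]
    return max(0.0, min(comm_end, notify_end) - max(comm_start, notify_start))

waits = [(5, 12), (14, 16), (19, 30)]  # Notify_Wait intervals on one plane
total = sum(clipped_overlap(10, 20, s, e) for s, e in waits)  # comm window [10, 20]
assert total == 5  # 2 + 2 + 1
```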
+    def _picking_hccl_event(self, event: TraceEventBean): if event.pid != self._hccl_pid or not event.is_x_mode(): return False @@ -235,7 +298,7 @@ class NPUProfilingParser(BaseProfilingParser): sdma_dict.setdefault(stream_id, []).append(event.dur) elif event.is_compute_event(): ai_core_stream.add(stream_id) - compute_stream = event_wait_stream & ai_core_stream + compute_stream = event_wait_stream & ai_core_stream if event_wait_stream else ai_core_stream for stream in compute_stream: dur_list = sdma_dict.get(stream, []) self._result_data.overall_metrics.update_sdma_info(sum(dur_list), len(dur_list)) diff --git a/profiler/compare_tools/compare_backend/utils/common_func.py b/profiler/compare_tools/compare_backend/utils/common_func.py index 26584626cd1786d32d4e7f5fcaef1a09d8726852..f8f0af97b17467e71ea0426b3ce4d9724f02bb2e 100644 --- a/profiler/compare_tools/compare_backend/utils/common_func.py +++ b/profiler/compare_tools/compare_backend/utils/common_func.py @@ -1,5 +1,7 @@ from decimal import Decimal +import numpy as np def calculate_diff_ratio(base_value: float, comparison_value: float): if not base_value and not comparison_value: @@ -31,3 +33,56 @@ def convert_to_decimal(data: any) -> Decimal: print('[ERROR] Invalid profiling data which failed to convert data to decimal.') return 0.0 return decimal_value
+
+
+def longest_common_subsequence_matching(base_ops: list, comparison_ops: list, name_func: any) -> list:
+    if not comparison_ops:
+        result_data = [None] * len(base_ops)
+        for index, value in enumerate(base_ops):
+            result_data[index] = [value, None]
+        return result_data
+
+    result_data = []
+    comparison_len, base_len = len(comparison_ops), len(base_ops)
+    # dp[i][j]: length of the LCS of the first i comparison ops and the first j base ops
+    dp = [[0] * (base_len + 1) for _ in range(comparison_len + 1)]
+    for comparison_index in range(1, comparison_len + 1):
+        for base_index in range(1, base_len + 1):
+            if name_func(base_ops[base_index - 1]) == name_func(comparison_ops[comparison_index - 1]):
+                dp[comparison_index][base_index] = dp[comparison_index - 1][base_index - 1] + 1
+            else:
+                dp[comparison_index][base_index] = max(dp[comparison_index][base_index - 1],
+                                                       dp[comparison_index - 1][base_index])
+    # walk the dp table back to recover the matched (comparison, base) index pairs
+    matched_op = []
+    comparison_index, base_index = comparison_len, base_len
+    while comparison_index > 0 and base_index > 0:
+        if name_func(base_ops[base_index - 1]) == name_func(comparison_ops[comparison_index - 1]):
+            matched_op.append([comparison_index - 1, base_index - 1])
+            comparison_index -= 1
+            base_index -= 1
+            continue
+        if dp[comparison_index][base_index - 1] > dp[comparison_index - 1][base_index]:
+            base_index -= 1
+        else:
+            comparison_index -= 1
+    if not matched_op:
+        matched_base_index_list = []
+    else:
+        matched_op.reverse()
+        matched_op = np.array(matched_op)
+        matched_base_index_list = list(matched_op[:, 1])
+    # emit rows in base order, interleaving unmatched comparison ops as [None, op]
+    curr_comparison_index = 0
+    for base_index, base_api_node in enumerate(base_ops):
+        if base_index not in matched_base_index_list:
+            result_data.append([base_api_node, None])
+            continue
+        matched_comparison_index = matched_op[matched_base_index_list.index(base_index), 0]
+        for comparison_index in range(curr_comparison_index, matched_comparison_index):
+            result_data.append([None, comparison_ops[comparison_index]])
+        result_data.append([base_api_node, comparison_ops[matched_comparison_index]])
+        curr_comparison_index = matched_comparison_index + 1
+    if curr_comparison_index < len(comparison_ops):
+        for comparison_index in range(curr_comparison_index, len(comparison_ops)):
+            result_data.append([None, comparison_ops[comparison_index]])
+    return result_data
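A minimal illustration of the matching behaviour (plain strings stand in for the `TorchOpNode`/`ModuleNode` objects that real callers pass, with an identity name function):

```python
from compare_backend.utils.common_func import longest_common_subsequence_matching

base_ops = ["conv2d", "relu", "add"]
comparison_ops = ["conv2d", "mul", "add"]
result = longest_common_subsequence_matching(base_ops, comparison_ops, lambda op: op)
# Matched ops are paired row by row; unmatched ops get None on the other side:
# [["conv2d", "conv2d"], ["relu", None], [None, "mul"], ["add", "add"]]
```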
Duration(us)" + MODULE_CLASS = "Module Class" + MODULE_NAME = "Module Name" + HOST_SELF_TIME = "Host Self Time(ms)" + HOST_TOTAL_TIME = "Host Total Time(ms)" + DEVICE_SELF_TIME = "Device Self Time(ms)" + DEVICE_TOTAL_TIME = "Device Total Time(ms)" + DIFF_SELF_TIME = "Device Self Time Diff" + DIFF_TOTAL_RATIO = "Total Diff Ratio" + DIFF_TOTAL_TIME = "Device Total Time Diff" + NUMBER = "Number" + MODULE_LEVEL = "Module Level" + BASE_CALL_STACK = "Base Call Stack" + COMPARISON_CALL_STACK = "Comparison Call Stack" HEADERS = { Constant.OPERATOR_TABLE: [ @@ -118,9 +135,29 @@ class ExcelConfig(object): {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, {"name": DIFF_DUR, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20} + ], + Constant.MODULE_TOP_TABLE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": MODULE_CLASS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": MODULE_LEVEL, "type": CellFormatType.DEFAULT, "width": 15}, + {"name": MODULE_NAME, "type": CellFormatType.DEFAULT, "width": 35}, + {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15}, + {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}, + {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30} ] } OVERHEAD = {Constant.OPERATOR_TABLE: ["B1:F1", "G1:K1"], Constant.MEMORY_TABLE: ["B1:F1", "G1:K1"], Constant.COMMUNICATION_TABLE: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_TABLE: ["C1:D1", "E1:F1"], - Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"]} + Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"], Constant.MODULE_TOP_TABLE: ["F1:I1", "J1:M1"]} diff --git a/profiler/compare_tools/compare_backend/utils/module_node.py b/profiler/compare_tools/compare_backend/utils/module_node.py new file mode 100644 index 0000000000000000000000000000000000000000..0c9331b2cdec4445b7b86d3222b2eb50ac0bcdc5 --- /dev/null +++ b/profiler/compare_tools/compare_backend/utils/module_node.py @@ -0,0 +1,160 @@ +from math import ceil + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.utils.torch_op_node import TorchOpNode + + +class ModuleNode: + ts = "ts" + kernels = "kernels" + + def __init__(self, event: TraceEventBean, parent_node=None): + self._event = event + self._parent_node = parent_node + self._child_nodes = [] + self._module_name = f"{parent_node.module_name}/{event.name}" if parent_node else event.name + self._kernel_self_list = [] + self._kernel_total_list = [] + self._call_stack = f"{parent_node.call_stack};\n{event.name}" if parent_node and parent_node.call_stack \ + else event.name + 
self._root_torch_op_node = TorchOpNode() + self._cur_torch_op_node = self._root_torch_op_node + + @property + def module_name(self): + return self._module_name + + @property + def name(self): + return self._event.name + + @property + def parent_node(self): + return self._parent_node + + @property + def child_nodes(self): + return self._child_nodes + + @property + def dur(self): + return self._event.dur + + @property + def start_time(self): + return self._event.start_time + + @property + def end_time(self): + return self._event.end_time + + @property + def host_self_dur(self): + return self.dur - sum([node.dur for node in self.child_nodes]) + + @property + def device_self_dur(self): + dur = 0 + for kernel_dict in self._kernel_self_list: + kernel_list = kernel_dict.get(self.kernels, []) + dur += sum([kernel.device_dur for kernel in kernel_list]) + return dur + + @property + def device_total_dur(self): + dur = 0 + for kernel_dict in self._kernel_total_list: + kernel_list = kernel_dict.get(self.kernels, []) + dur += sum([kernel.device_dur for kernel in kernel_list]) + return dur + + @property + def kernel_details(self): + kernel_details = "" + for kernel_dict in self._kernel_self_list: + kernel_list = kernel_dict.get(self.kernels, []) + for kernel in kernel_list: + kernel_details += kernel.kernel_details + return kernel_details + + @property + def toy_layer_api_list(self): + return self._root_torch_op_node.child_nodes + + @property + def call_stack(self): + return self._call_stack + + @staticmethod + def _binary_search(ts_time, parent_node): + if not parent_node.child_nodes: + return None + right = len(parent_node.child_nodes) - 1 + left = 0 + while right > left: + mid = left + ceil((right - left) / 2) + if ts_time >= parent_node.child_nodes[mid].start_time: + left = mid + else: + right = mid - 1 + if parent_node.child_nodes[left].start_time < ts_time < parent_node.child_nodes[left].end_time: + return parent_node.child_nodes[left] + return None + + def reset_call_stack(self, call_stack): + self._call_stack = call_stack + + def update_child_nodes(self, node): + self._child_nodes.append(node) + + def update_kernel_list(self, ts, kernel_list: list): + self._update_kernel_self_list(ts, kernel_list) + node = self + while node.parent_node: + node._update_kernel_total_list(ts, kernel_list) + node = node.parent_node + + def _update_kernel_self_list(self, ts, kernel_list: list): + self._kernel_self_list.append({self.ts: ts, self.kernels: kernel_list}) + + def _update_kernel_total_list(self, ts, kernel_list: list): + self._kernel_total_list.append({self.ts: ts, self.kernels: kernel_list}) + + def find_module_call(self, ts_time): + call_module = self._binary_search(ts_time, self) + while call_module: + module = self._binary_search(ts_time, call_module) + if not module: + return call_module + call_module = module + return call_module + + def find_torch_op_call(self, event): + while self._cur_torch_op_node: + if self._cur_torch_op_node != self._root_torch_op_node and \ + event.start_time > self._cur_torch_op_node.end_time: + self._cur_torch_op_node = self._cur_torch_op_node.parent + continue + tree_node = TorchOpNode(event, self._cur_torch_op_node) + self._cur_torch_op_node.add_child_node(tree_node) + self._cur_torch_op_node = tree_node + break + + def update_torch_op_kernel_list(self): + top_node_list = self._root_torch_op_node.child_nodes + if not top_node_list: + return + top_node_list.sort(key=lambda x: x.start_time) + cur_index = 0 + self._kernel_self_list.sort(key=lambda x: x.get(self.ts, 0)) + for 
kernel_dict in self._kernel_self_list:
+            ts = kernel_dict.get(self.ts, 0)
+            kernel_list = kernel_dict.get(self.kernels, [])
+            while cur_index < len(top_node_list):
+                if ts > top_node_list[cur_index].end_time:
+                    cur_index += 1
+                    continue
+                if ts < top_node_list[cur_index].start_time:
+                    break
+                top_node_list[cur_index].update_kernel_list(kernel_list)
+                break
diff --git a/profiler/compare_tools/compare_backend/utils/name_function.py b/profiler/compare_tools/compare_backend/utils/name_function.py index d83f9e4291c9c1afbcbc1e398741d2bdbedd8df8..cd79e8a03fa7a970ce97ad59f14fae12766f096b 100644 --- a/profiler/compare_tools/compare_backend/utils/name_function.py +++ b/profiler/compare_tools/compare_backend/utils/name_function.py @@ -1,3 +1,4 @@ +from compare_backend.utils.module_node import ModuleNode from compare_backend.utils.torch_op_node import TorchOpNode @@ -41,3 +42,11 @@ class NameFunction: input_shape = ';\r\n'.join(data) return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{input_shape}' return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{op_node.input_shape}'
+
+    def get_module_name(self, module: ModuleNode) -> str:
+        if not self.args.op_name_map:
+            return module.module_name
+        module_name = module.module_name
+        for old_name, new_name in self.args.op_name_map.items():
+            module_name = module_name.replace(old_name, new_name)
+        return module_name
diff --git a/profiler/compare_tools/compare_backend/utils/torch_op_node.py b/profiler/compare_tools/compare_backend/utils/torch_op_node.py index 45b9299ba0a23fcc0072546f73cec125890d2e21..690c46cd51c1e2991b0bfaf44e9af431cdad5151 100644 --- a/profiler/compare_tools/compare_backend/utils/torch_op_node.py +++ b/profiler/compare_tools/compare_backend/utils/torch_op_node.py @@ -60,6 +60,10 @@ class TorchOpNode: def memory_allocated(self): return self._memory_allocated_list
+    @property
+    def device_dur(self):
+        return sum([kernel.device_dur for kernel in self._kernel_list])
+
 def add_child_node(self, child_node): self._child_nodes.append(child_node) @@ -73,11 +77,16 @@ class TorchOpNode: cur_node._kernel_num += kernel_num cur_node = cur_node._parent_node
+    def update_kernel_list(self, kernel_list: list):
+        if not kernel_list:
+            return
+        self._kernel_list.extend(kernel_list)
+
 def set_memory_allocated(self, memory_allocated: MemoryEvent): self._memory_allocated_list.append(memory_allocated) def is_step_profiler(self) -> bool: - return self.name.find("ProfilerStep#") != -1 + return self._event.is_step_profiler() def get_op_info(self) -> list: return [self.name, self.input_shape, self.input_type, self.call_stack] diff --git a/profiler/compare_tools/compare_backend/utils/tree_builder.py b/profiler/compare_tools/compare_backend/utils/tree_builder.py index f621453d1a5a2281425a01e93b3f89b012f35b88..34c1fe1a1f4046d1e60af107f5ee74484424174a 100644 --- a/profiler/compare_tools/compare_backend/utils/tree_builder.py +++ b/profiler/compare_tools/compare_backend/utils/tree_builder.py @@ -1,5 +1,7 @@ from queue import Queue +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.utils.module_node import ModuleNode from compare_backend.utils.torch_op_node import TorchOpNode @@ -7,10 +9,12 @@ class TreeBuilder: @classmethod def build_tree(cls, event_list: list, kernel_dict: dict, memory_list: list) -> TorchOpNode: root_node = TorchOpNode() - event_list.extend(memory_list) - event_list.sort(key=lambda x: x.start_time) + all_event_list = [] + all_event_list.extend(event_list) + all_event_list.extend(memory_list) +
all_event_list.sort(key=lambda x: x.start_time) last_node = root_node - for event in event_list: + for event in all_event_list: while last_node: if last_node != root_node and event.start_time > last_node.end_time: last_node = last_node.parent @@ -53,3 +57,26 @@ class TreeBuilder: for child_node in tree_node.child_nodes: node_queue.put(child_node) return result_list + + @classmethod + def build_module_tree(cls, event_list: list, kernel_dict: dict): + root_node = ModuleNode(TraceEventBean({})) + event_list.sort(key=lambda x: x.start_time) + last_node = root_node + for event in event_list: + while last_node: + if last_node != root_node and event.start_time > last_node.end_time: + last_node = last_node.parent_node + continue + if event.is_x_mode(): + tree_node = ModuleNode(event, last_node) + last_node.update_child_nodes(tree_node) + last_node = tree_node + break + if last_node == root_node: + break + kernel_list = kernel_dict.get(event.start_time, []) + if kernel_list: + last_node.update_kernel_list(event.start_time, kernel_list) + break + return root_node diff --git a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py index c5e56c2f8b9a7ae0c1d1a596dbe81e3541f6ce73..7a33168da377ae77ab64fff0886e09eef065b4e2 100644 --- a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py +++ b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py @@ -23,20 +23,28 @@ class WorkSheetCreator: self._write_data() def _write_headers(self): - header_format = self._work_book.add_format(CellFormatType.BLUE_BOLD) + base_header_format = self._work_book.add_format(CellFormatType.GREEN_BOLD) + com_header_format = self._work_book.add_format(CellFormatType.YELLOW_BOLD) + com_index_range = [-1, -1] overhead = self._data.get("overhead", []) if overhead: base_path = f"Base Profiling: {self._args.base_profiling_path}" - self._work_sheet.merge_range(overhead[0], base_path, header_format) + self._work_sheet.merge_range(overhead[0], base_path, base_header_format) + com_index_range = [self._col_ids.index(overhead[1].split(":")[0][0]), + self._col_ids.index(overhead[1].split(":")[1][0])] comparison_path = f"Comparison Profiling: {self._args.comparison_profiling_path}" - self._work_sheet.merge_range(overhead[1], comparison_path, header_format) + self._work_sheet.merge_range(overhead[1], comparison_path, com_header_format) self._row_id += 2 for index, header in enumerate(self._data.get("headers")): + if index in range(com_index_range[0], com_index_range[1] + 1): + header_format = com_header_format + else: + header_format = base_header_format col_id = self._col_ids[index] self._work_sheet.set_column(f"{col_id}:{col_id}", header.get("width")) self._work_sheet.write(f"{col_id}{self._row_id}", header.get("name"), header_format) self._field_format[index] = self._work_book.add_format(header.get("type")) - if header.get("name") == ExcelConfig.DIFF_RATIO: + if header.get("name") in (ExcelConfig.DIFF_RATIO, ExcelConfig.DIFF_TOTAL_RATIO): self._diff_ratio_index = index self._row_id += 1 diff --git a/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py b/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py new file mode 100644 index 0000000000000000000000000000000000000000..8830d495992cfcd2c26024863f8b644d5b4c6902 --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py @@ -0,0 +1,223 @@ +import json +import os +import shutil +import stat 
+import csv
+import unittest
+
+from advisor_backend.interface import Interface
+from advisor_backend.compute_advice.npu_slow_advice import NpuSlowAdvice
+
+
+class TestNpuSlowAdvice(unittest.TestCase):
+    ASCEND_PT_DIR = "./ascend_pt"
+    OUTPUT_DIR = "./ascend_pt/ASCEND_PROFILER_OUTPUT"
+    interface = None
+    err_interface = None
+
+    def tearDown(self):
+        if os.path.exists(TestNpuSlowAdvice.ASCEND_PT_DIR):
+            shutil.rmtree(TestNpuSlowAdvice.ASCEND_PT_DIR)
+
+    def setUp(self):
+        if os.path.exists(TestNpuSlowAdvice.ASCEND_PT_DIR):
+            shutil.rmtree(TestNpuSlowAdvice.ASCEND_PT_DIR)
+        if not os.path.exists(TestNpuSlowAdvice.ASCEND_PT_DIR):
+            os.makedirs(TestNpuSlowAdvice.ASCEND_PT_DIR)
+        if not os.path.exists(TestNpuSlowAdvice.OUTPUT_DIR):
+            os.makedirs(TestNpuSlowAdvice.OUTPUT_DIR)
+
+    @classmethod
+    def get_basic_trace_view(cls):
+        # Python pid
+        py_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 1, "args": {"name": "Python"}}
+        # Ascend pid
+        ascend_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 4, "args": {"name": "Ascend Hardware"}}
+        # CANN pid
+        cann_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 5, "args": {"name": "CANN"}}
+        # ascend hardware ops
+        ah_event1 = {"ph": "X", "name": "Slice1", "ts": "1699529623106750", "dur": 100, "tid": 3, "pid": 4, "args": {}}
+        ah_event2 = {"ph": "X", "name": "Slice2", "ts": "1699529623106751", "dur": 80, "tid": 3, "pid": 4, "args": {}}
+        # flow event
+        flow_event_s = {"ph": "s", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "200", "args": {}}
+        flow_event_e = {"ph": "f", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "1699529623106750", "args": {}}
+        return [py_pid_data, ascend_pid_data, cann_pid_data, ah_event1, ah_event2, flow_event_s, flow_event_e]
+
+    @classmethod
+    def create_profiler_info_json(cls):
+        info = {
+            "config": {
+                "common_config": {
+                    "with_stack": True,
+                    "activities": ["ProfilerActivity.CPU", "ProfilerActivity.NPU"]
+                }
+            }
+        }
+        with os.fdopen(os.open(f"{TestNpuSlowAdvice.ASCEND_PT_DIR}/profiler_info_0.json",
+                               os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp:
+            fp.write(json.dumps(info))
+
+    @classmethod
+    def create_old_version_trace_view(cls):
+        basic_info = cls.get_basic_trace_view()
+
+        # python ops
+        py_event1 = {"ph": "X", "cat": "python_function", "name": "aten::slice", "ts": "200", "dur": 100, "tid": 2,
+                     "pid": 1,
+                     "args": {"Call stack": "/root/test/slice.py(116);\r\n/root/torch/module.py"}}
+        py_event2 = {"ph": "X", "cat": "python_function", "name": "slice", "ts": "199", "dur": 200, "tid": 2, "pid": 1,
+                     "args": {"Call stack": "/root/test/slice.py(116);\r\n/root/torch/module.py"}}
+        raw_data = [
+            *basic_info, py_event1, py_event2
+        ]
+
+        with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/trace_view.json",
+                               os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp:
+            fp.write(json.dumps(raw_data))
+
+    @classmethod
+    def create_new_version_trace_view(cls):
+        basic_info = cls.get_basic_trace_view()
+        # python ops
+        py_event1 = {"ph": "X", "name": "aten::slice", "ts": "200", "dur": 100, "tid": 2, "pid": 1, "args": {}}
+        py_event2 = {"ph": "X", "name": "slice", "ts": "199", "dur": 105, "tid": 2, "pid": 1, "args": {}}
+        py_event3 = {"ph": "X", "cat": "python_function", "name": "/root/test/slice.py(116)", "ts": "198", "dur": 120,
+                     "tid": 2, "pid": 1,
+                     "args": {}}
+        py_event4 = {"ph": "X", "cat": "python_function", "name": "/root/torch/module.py", "ts": "197", "dur": 150,
+                     "tid": 2, "pid": 1, "args": {}}
+
+        raw_data = [
+
*basic_info, py_event1, py_event2, py_event3, py_event4 + ] + + with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/trace_view.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(raw_data)) + + @classmethod + def create_kernel_details(cls): + # create csv files + csv_header = ['Step Id', 'Model ID', 'Task ID', 'Stream ID', 'Name', 'Type', 'Accelerator Core', + 'Start Time(us)', + 'Duration(us)', 'Wait Time(us)', 'Block Dim', 'Mix Block Dim', 'Input Shapes', 'Input Data Types', + 'Input Formats', 'Output Shapes', 'Output Data Types', 'Output Formats', 'Context ID', + 'aicore_time(us)', + 'aic_total_cycles', 'aic_mac_ratio', 'aic_mac_int8_ratio', 'aic_cube_fops', + 'aic_vector_fops', + 'aiv_time(us)', 'aiv_total_cycles', 'aiv_vec_fp32_ratio', 'aiv_vec_fp16_ratio', + 'aiv_vec_int32_ratio', + 'aiv_vec_misc_ratio', 'aiv_cube_fops', 'aiv_vector_fops'] + # RED: size=0.0492 MB, throughput=2.32 GB/s, task_duration=21.2us + csv_row1 = [1, 4294967295, 1265, 16, 'Slice1', 'Slice', 'AI_VECTOR_CORE', "1699529623106750\t", 21.2, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 0, 0, 0, 0, 0, 0, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # YELLOW: size=0.0492 MB, throughput=984 GB/s, task_duration=0.05us + csv_row2 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'AI_VECTOR_CORE', "1699529623106751\t", 0.05, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 0, 0, 0, 0, 0, 0, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # WHITE: AI_CPU + csv_row3 = [1, 4294967295, 1265, 16, 'Swish1', 'Swish', 'AI_CPU', "1699529623106752\t", 3.14, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] + # GREEN: size=0.0492 MB, throughput=15.67 GB/s, task_duration = 3.14us + csv_row4 = [1, 4294967295, 1265, 16, 'Mul1', 'Mul', 'AI_VECTOR_CORE', "1699529623106753\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 0, 0, 0, 0, 0, 0, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # RED: aic_mac_ratio=0.2 + csv_row5 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.2, 0.1, 0.1, 0.7, + 0, 0, 0, 0, 0, 0, 0, 0] + # GREEN: aic_mac_ratio=0.85 + csv_row6 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 38888, 0.85, 0.1, 0.1, 0.7, + 0, 0, 0, 0, 0, 0, 0, 0] + # YELLOW: aic_mac_ratio=0.64 + csv_row7 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 48888, 0.64, 0.1, 0.1, 0.7, + 0, 0, 0, 0, 0, 0, 0, 0] + # WHITE: MIX_AIC + csv_row8 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'MIX_AIC', "1699529623106751\t", 0.05, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.4, 0.1, 0.1, 0.7, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # WHITE: MIX_AIV + csv_row9 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'MIX_AIV', "1699529623106751\t", 0.05, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.4, 0.1, 0.1, 0.7, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + 
with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/kernel_details.csv", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + csv_writer = csv.writer(fp) + csv_writer.writerow(csv_header) + csv_writer.writerow(csv_row1) + csv_writer.writerow(csv_row2) + csv_writer.writerow(csv_row3) + csv_writer.writerow(csv_row4) + csv_writer.writerow(csv_row5) + csv_writer.writerow(csv_row6) + csv_writer.writerow(csv_row7) + csv_writer.writerow(csv_row8) + csv_writer.writerow(csv_row9) + + def test_run_should_return_empty_when_ascend_pt_path_not_exist(self): + interface = Interface("") + data = interface.get_data('compute', 'npu_slow') + self.assertEqual(0, len(data)) + + def test_run_should_return_empty_when_there_is_no_kernel_details(self): + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + self.assertEqual(0, len(data)) + + def test_run_should_return_7_data_without_call_stack_when_json_not_exist(self): + self.create_kernel_details() + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") + self.assertEqual(9, len(data)) + self.assertEqual("", call_stack) + + def test_run_should_return_7_data_with_call_stack_when_new_trace_view_exists(self): + self.create_profiler_info_json() + self.create_kernel_details() + self.create_new_version_trace_view() + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + slow_op_data = data[data["color"] == "RED"] + NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(self.ASCEND_PT_DIR, "slow_op.xlsx")) + call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") + self.assertEqual(9, len(data)) + self.assertEqual(2, len(slow_op_data)) + print(call_stack) + call_stack_res = "/root/torch/module.py\n" \ + "/root/test/slice.py(116)" + self.assertEqual(call_stack_res, call_stack) + + def test_run_should_return_7_data_with_call_stack_when_old_trace_view_exists(self): + self.create_profiler_info_json() + self.create_kernel_details() + self.create_old_version_trace_view() + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + slow_op_data = data[data["color"] == "RED"] + NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(self.ASCEND_PT_DIR, "slow_op.xlsx")) + call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") + self.assertEqual(9, len(data)) + self.assertEqual(2, len(slow_op_data)) + print(call_stack) + call_stack_res = "/root/test/slice.py(116)\n\r\n" \ + "/root/torch/module.py" + self.assertEqual(call_stack_res, call_stack) diff --git a/sample/README.md b/sample/README.md index 167b1a01cbd87c75eb6a6479a39fc198360a402f..6bd55a2f83422b2f0c8424c9687a38f1698aa6fb 100644 --- a/sample/README.md +++ b/sample/README.md @@ -5,12 +5,61 @@ 如果考虑商用集成,推荐使用CANN软件包中的AscendC样例工程,比如:ascendc_kernel_cmake目录。本项目中的工程就是基于其进行简化仅用于快速验证。 +说明:该sample目录中,每个最小目录就是一个完整的样例工程。这些样例工程本身可能以为依赖的不同存在差异。 + ## 依赖说明 安装CANN包,并使能环境变量,并确保```ASCEND_HOME_PATH```生效,可以在CANN包安装目录下使能: ``` source set_env.sh ``` +## 目录介绍 +整体目录结构如下: +``` +- sample + |- build # 编译并运行所有样例内容(建议按需使用,此处命令可以参考 + |- normal_sample # 纯C/C++的AscendC单算子极简工程,可配合msdebug和msprof工具 + |- cube_only # 仅含aic的AscendC单算子极简工程 + |- mix # mix算子的AscendC单算子极简工程 + |- vec_only # 仅含aiv的AscendC单算子极简工程 + |- pytorch_adapter # 适配pytorch的AscendC单算子极简工程,可配合msdebug和msprof工具 + |- 
jit_compile      # jit mode, compiled at run time
+    |- with_setuptools # built into a wheel package and installed
+  |- sanitizer_sample  # faulty samples for use with the mssanitizer tool
+    |- racecheck       # samples containing race conditions
+    |- xx              # other faulty samples
+```
+
+If you are interested in adapting custom operators to the pytorch framework, see [here](./pytorch_adapter/README.md)
+
+
+## Operator Debugging: msdebug
+To debug on device with msdebug, additional changes are required, as follows:
+1. Build stage: in the ```Makefile``` under ```sample\normal_sample\vec_only```, change the following:
+
+   Enhance debug information and enlarge the stack:
+   ```
+   COMPILER_FLAG := -xcce -O2 -std=c++17
+   change to:
+   COMPILER_FLAG := -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true
+   ```
+
+2. Run stage:
+```
+msdebug ./*.fatbin
+```
+
+## Memory Checking: sanitizer
+1. Build stage: add the ```--cce-enable-sanitizer -g``` options when compiling and the ```--cce-enable-sanitizer``` option when linking (already added in the Makefile of this sample), then run:
+```
+make
+```
+
+2. Run stage:
+```
+mssanitizer ./*.fatbin  # runs memcheck by default
+```
+
+
 ## Operator Tuning The operator tuning tools support tuning operators both on device and in simulation; the following walks through the tool commands using the operator in vec_only as an example @@ -84,30 +133,3 @@ source set_env.sh └── trace.json  # pipeline view of all cores of the operator ``` 4. For more metrics, see the operator development tools user guide. - -## Operator Debugging msdebug -To debug on device with msdebug, additional changes are required, as follows: -1. Build stage: in the ```Makefile``` under ```sample\normal_sample\vec_only```, change the following: -    + Enhance debug information and enlarge the stack: -    ``` -    COMPILER_FLAG := -xcce -O2 -std=c++17 -    change to: -    COMPILER_FLAG := -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true - -## Memory Checking sanitizer -### The sanitizer_sample directory - -This directory holds the sanitizer samples, covering race checking and memory checking. - -#### The Racecheck directory - -Racecheck contains the race-checking samples. - -raw_error_kernel.cpp is a sample with a read-after-write race on UB and a write-after-read race on GM. - - -Run stage: - -``` -/usr/local/Ascend/ascend-toolkit/latest/tools/mssanitizer/bin/mssanitizer --tool=racecheck ./raw_error.fatbin -``` \ No newline at end of file diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a2b1ba63570058ac954a121f4b14b396f5dace81 --- /dev/null +++ b/sample/pytorch_adapter/README.md @@ -0,0 +1,53 @@
+# Adapting a custom operator to the pytorch framework
+
+## Overview
+Ascend offers many ways to hook operators into frameworks; this sample introduces the simplest one. Each directory is an independent, ready-to-use project.
+
+## Dependencies
+As with the usual pytorch operator integration, the device must have a correct pytorch version installed (a matching torch_npu version is also required).
+
+pytorch can be installed via pip; for torch_npu releases see [here](https://gitee.com/ascend/pytorch/releases) and pick the torch_npu version that matches your pytorch.
+
+## Project layout
+The overall project directory is as follows:
+```
+- pytorch_adapter
+  |- jit_compile       # just-in-time compilation approach
+    |- add_adapter.cpp # adapts the operator to the pytorch framework via the operator's dynamic library interface
+    |- add_kernel.cpp  # the Ascend operator implementation, exposing a host-side dynamic library interface
+    |- main.py         # python entry point that ties everything together
+    |- Makefile        # build script producing the host-side dynamic library of the Ascend operator
+  |- with_setuptools   # wheel package approach
+    |- add_adapter.cpp
+    |- add_kernel.cpp
+    |- Makefile
+    |- setup.py        # setuptools entry point; builds and packages a wheel
+    |- test.py         # entry point for testing the wheel's functionality
+```
+
+## Usage
+
+### The jit_compile project
+Run the following command; the python module is generated on the fly during execution and then used:
+```
+python main.py
+```
+
+### The setuptools project
+The with_setuptools project can be built into an installable wheel package, convenient for multi-machine deployment.
+
+
+1. Build the package with the following command (setuptools supports several modes, e.g. build, install; not all are shown here):
+```
+python setup.py bdist_wheel  # builds the wheel package into the dist directory
+```
+
+2. Install the package from the ```dist``` directory with pip.
+
+3. Run the test script
+```
+python test.py
+```
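+
+For reference, a minimal usage sketch once the wheel is installed (the module name `ascend_custom_add` below is illustrative, use the name your setup.py actually exports; the bound function `my_add` follows this sample's adapter):
+```
+import torch
+import torch_npu  # registers the NPU device
+import ascend_custom_add  # illustrative module name
+
+# the sample kernel uses static tiling for 8 * 2048 fp16 elements
+x = torch.rand([8, 2048], dtype=torch.float16).npu()
+y = torch.rand([8, 2048], dtype=torch.float16).npu()
+z = ascend_custom_add.my_add(x, y)  # computes x + y on the NPU
+print(z.cpu())
+```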
+## Notes
+1. This sample uses static tiling. With dynamic tiling, you can analyze the Tensor shapes in add_adapter.cpp and choose a suitable tiling. (This step is mandatory in the flow; it may just live in a different place. In aclnn, for example, it is done inside the interface implementation; here we could likewise wrap add_custom_do and build the tiling in.) \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_compile/Makefile b/sample/pytorch_adapter/jit_compile/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ec9115f377a578677470b89f365583dfcf246515 --- /dev/null +++ b/sample/pytorch_adapter/jit_compile/Makefile @@ -0,0 +1,20 @@
+# Location of the CANN toolkit, simplified mainly from ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake
+ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
+
+COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec  # see the CMAKE_C_COMPILER setting in device_config.cmake
+COMPILER_FLAG := -xcce -O2 -std=c++17
+DYNAMIC_LIB_FLAG := -fPIC -shared
+DAV_FLAG := --cce-aicore-arch=dav-c220-vec
+ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include  # simplified from the device_intf.cmake configuration
+
+all: build
+
+build: libcustom_kernels.so
+
+# to extend later, append the cpp files of additional kernels here
+libcustom_kernels.so: add_kernel.cpp
+	$(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^
+
+.PHONY: clean
+clean:
+	rm *.so \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_compile/add_adapter.cpp b/sample/pytorch_adapter/jit_compile/add_adapter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c65e60ec596fe8b5627e06f678549b5f2f05660 --- /dev/null +++ b/sample/pytorch_adapter/jit_compile/add_adapter.cpp @@ -0,0 +1,128 @@
+#include <torch/extension.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
+
+// Forward implementation registered for the NPU device
+at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
+{
+    // create the output memory
+    at::Tensor result = at::Tensor(self);
+    // Translate the pytorch structures into data types and structures CANN understands
+    // 1. (Important) Analyze the tensor shapes and choose a suitable tiling (for simplicity this operator fixes the tiling, so the result is only correct for specific shapes)
+    // 2. Convert data types and formats -- no format handling is needed here, use the data directly
+    auto stream = c10_npu::getCurrentNPUStream().stream(false);
+    auto x = self.storage().data();
+    auto y = other.storage().data();
+    auto z = result.storage().data();
+
+    uint32_t blockDim = 8;
+    auto callback = [stream, blockDim, x, y, z]() -> int {
+        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
+        return 0;  // the operator's execution result could be fetched here in some way; not implemented yet
+    };
+    // dispatch the operator
+    at_npu::native::OpCommand cmd;
+    cmd.Name("my_add").SetCustomHandler(callback).Run();
+    return result;
+}
+
+// Backward implementation registered for the NPU device
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
+{
+    at::Tensor result = at::Tensor(self);  // create the output memory
+
+    return {result, result};
+}
+
+// Forward implementation registered for the Meta device
+at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
+{
+    return empty_like(self);
+}
+
+// Backward implementation registered for the Meta device
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
+{
+    auto result = empty_like(self);
+    return std::make_tuple(result, result);
+}
+
+// Look up the per-device implementations registered on this op
+at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
+{
+    static auto op =
+        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
+    return op.call(self, other);
+}
+// Look up the per-device implementations registered on this op
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
+{
+    static auto op = torch::Dispatcher::singleton()
+                         .findSchemaOrThrow("myaten::my_add_backward", "")
+                         .typed<decltype(my_add_backward_impl)>();
+    return op.call(self);
+}
+
+// Register the my_add and my_add_backward schemas in the myaten namespace
+TORCH_LIBRARY(myaten, m)
+{
+    m.def("my_add(Tensor self, Tensor other) -> Tensor");
+    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
+}
+
+// Bind forward and backward by subclassing torch::autograd::Function
+class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
+public:
+    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        return my_add_impl(self, other);
+    }
+
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
+    {
+        auto grad_output = grad_outputs[0];
+        auto result = my_add_backward_impl(grad_output);
+        return {std::get<0>(result), std::get<1>(result)};
+    }
+};
+
+at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
+{
+    return MyAddFunction::apply(self, other);
+}
+
+// Bind the NPU autograd implementation to the op
+// For pytorch versions below 2.1, AutogradPrivateUse1 must be changed to AutogradXLA
+TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_autograd);
+}
+
+// Register forward and backward for the NPU device
+// On pytorch 2.1 and above the NPU device key is PrivateUse1; below 2.1 it is XLA, so change PrivateUse1 to XLA there
+TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_npu);
+    m.impl("my_add_backward", &my_add_backward_impl_npu);
+}
+
+// Register forward and backward for the Meta device
+TORCH_LIBRARY_IMPL(myaten, Meta, m)
+{
+    m.impl("my_add", &my_add_impl_meta);
+    m.impl("my_add_backward", &my_add_backward_impl_meta);
+}
+
+// Bind the C++ interface to python via pybind
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("my_add", &my_add_impl_autograd, "x + y");
+}
diff --git a/sample/pytorch_adapter/jit_compile/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9 --- /dev/null +++ b/sample/pytorch_adapter/jit_compile/add_kernel.cpp @@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ *
+ * Function : z = x + y
+ * This sample is a very basic sample that implements vector add on Ascend platform.
diff --git a/sample/pytorch_adapter/jit_compile/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ *
+ * Function : z = x + y
+ * This sample is a very basic sample that implements vector add on the Ascend platform.
+ * In this sample:
+ * Length of x / y / z is 8*2048.
+ * Num of vector cores used in the sample is 8.
+ * Length for each core to compute is 2048.
+ * Tiles for each core is 8; with double buffering, each loop iteration handles 2048/8/2 = 128 elements.
+ *
+ */
+#include "kernel_operator.h"
+using namespace AscendC;
+constexpr int32_t TOTAL_LENGTH = 8 * 2048;                            // total length of data
+constexpr int32_t USE_CORE_NUM = 8;                                   // num of cores used
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;         // length computed by each core
+constexpr int32_t TILE_NUM = 8;                                       // split data into 8 tiles on each core
+constexpr int32_t BUFFER_NUM = 2;                                     // tensor num for each queue
+constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // each tile is split into 2 parts, due to double buffering
+
+class KernelAdd {
+public:
+    __aicore__ inline KernelAdd()
+    {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+    {
+        // get the start index of the current core, core-level parallelism
+        xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        // pipe allocates memory to the queues, the unit is bytes
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+    }
+    __aicore__ inline void Process()
+    {
+        // the loop count needs to be doubled, due to double buffering
+        constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
+        // tiling strategy, pipeline parallelism
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        // alloc tensors from queue memory
+        LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+        LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+        // copy the progress-th tile from the global tensor to the local tensor
+        DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH);
+        DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
+        // enqueue the input tensors to the VECIN queue
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    {
+        // dequeue the input tensors from the VECIN queue
+        LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+        LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+        LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+        // call the Add instruction for computation
+        Add(zLocal, xLocal, yLocal, TILE_LENGTH);
+        // enqueue the output tensor to the VECOUT queue
+        outQueueZ.EnQue<half>(zLocal);
+        // free the input tensors for reuse
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        // dequeue the output tensor from the VECOUT queue
+        LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+        // copy the progress-th tile from the local tensor to the global tensor
+        DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH);
+        // free the output tensor for reuse
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    TPipe pipe;
+    // create queues for the inputs; the queue depth equals the buffer num
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
+    // create a queue for the output; the queue depth equals the buffer num
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
+    GlobalTensor<half> xGm, yGm, zGm;
+};
+// implementation of the kernel function
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+{
+    KernelAdd op;
+    op.Init(x, y, z);
+    op.Process();
+}
+
+// Wrap the kernel function so an ordinary host compiler can resolve this symbol
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z)
+{
+    add_custom<<<blockDim, nullptr, stream>>>(x, y, z);
+}
\ No newline at end of file
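One fragile spot worth calling out: the adapter hard-codes `blockDim = 8`, which must stay equal to `USE_CORE_NUM` above, or the per-core offsets no longer cover the data. A tiny shared header would keep the two in sync; the sketch below is a hypothetical illustration (the header name and namespace are invented, not files in this change).

```cpp
// add_config.h -- hypothetical shared header, not part of this sample.
#pragma once
#include <cstdint>

namespace my_add_config {
constexpr uint32_t kUseCoreNum = 8;         // single source of truth for the launch blockDim
constexpr int32_t kTotalLength = 8 * 2048;  // must match the shape the Python tests allocate
}
```

add_kernel.cpp would then derive USE_CORE_NUM and TOTAL_LENGTH from these constants, and the adapter would pass my_add_config::kUseCoreNum as blockDim.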
diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..847a51f1c4787dcf353759d1115f352c1c760353
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/main.py
@@ -0,0 +1,70 @@
+import os
+import subprocess
+import torch
+import torch_npu
+import torch.utils.cpp_extension
+from torch_npu.testing.testcase import TestCase, run_tests
+
+PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
+CUR_PATH = os.path.abspath(os.path.dirname(__file__))
+
+
+def compile_kernels():
+    # PyTorch has no build extension for Ascend device code, so invoke make manually here
+    subprocess.run("make", check=True)
+
+
+def compile_host():
+    extra_ldflags = []
+    extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib")
+    extra_ldflags.append("-ltorch_npu")
+    extra_ldflags.append(f"-L{CUR_PATH}/")
+    extra_ldflags.append("-lcustom_kernels")
+    extra_include_paths = []
+    extra_include_paths.append("./")
+    extra_include_paths.append(os.path.join(PYTORCH_NPU_INSTALL_PATH, "include"))
+    extra_include_paths.append(os.path.join(
+        PYTORCH_NPU_INSTALL_PATH, "include", "third_party", "acl", "inc"))
+
+    module = torch.utils.cpp_extension.load(
+        name="jit_extension",
+        sources=[
+            "add_adapter.cpp"
+        ],
+        extra_include_paths=extra_include_paths,
+        extra_ldflags=extra_ldflags,
+        verbose=True)
+    return module
+
+
+class TestCustomAdd(TestCase):
+    def test_add(self):
+        module = compile_host()
+        # add is elementwise, but the kernel is statically tiled and always
+        # processes 8*2048 elements (see the kernel implementation), so the
+        # test shape must match it exactly
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float16)
+        y = torch.rand(length, device='cpu', dtype=torch.float16)
+
+        x_npu = x.npu()
+        y_npu = y.npu()
+        x_npu.requires_grad = True
+        y_npu.requires_grad = True
+        output = module.my_add(x_npu, y_npu)
+        # verify the backward path
+        output.backward(output)
+
+        x.requires_grad = True
+        y.requires_grad = True
+        cpuout = torch.add(x, y)
+        cpuout.backward(cpuout)
+
+        self.assertRtolEqual(output, cpuout)
+        self.assertRtolEqual(x_npu.grad, x.grad)
+        self.assertRtolEqual(y_npu.grad, y.grad)
+
+
+if __name__ == '__main__':
+    compile_kernels()
+    run_tests()
diff --git a/sample/pytorch_adapter/with_setuptools/Makefile b/sample/pytorch_adapter/with_setuptools/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ec9115f377a578677470b89f365583dfcf246515
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/Makefile
@@ -0,0 +1,20 @@
+# Location of the CANN toolkit; simplified from ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake
+ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
+
+COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # see the CMAKE_C_COMPILER setting in device_config.cmake
+COMPILER_FLAG := -xcce -O2 -std=c++17
+DYNAMIC_LIB_FLAG := -fPIC -shared
+DAV_FLAG := --cce-aicore-arch=dav-c220-vec
+ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # simplified from the device_intf.cmake configuration
+
+all: build
+
+build: libcustom_kernels.so
+
+# To extend this sample, append additional kernel .cpp files here
+libcustom_kernels.so: add_kernel.cpp
+	$(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^
+
+.PHONY: clean
+clean:
+	rm -f *.so
\ No newline at end of file
diff --git a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c65e60ec596fe8b5627e06f678549b5f2f05660
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
@@ -0,0 +1,128 @@
+#include <torch/extension.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
+
+// Forward implementation registered for the NPU device
+at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
+{
+    // Allocate the output memory
+    at::Tensor result = at::empty_like(self);
+    // Translate the PyTorch structures into data types and layouts CANN understands:
+    // 1. (Important) analyze the tensor shapes and pick a suitable tiling (for
+    //    simplicity this operator hard-codes its tiling, so results are only
+    //    correct for the matching shape)
+    // 2. Convert data type and format -- nothing to convert here, used as-is
+    auto stream = c10_npu::getCurrentNPUStream().stream(false);
+    auto x = self.storage().data();
+    auto y = other.storage().data();
+    auto z = result.storage().data();
+
+    uint32_t blockDim = 8;
+    auto callback = [stream, blockDim, x, y, z]() -> int {
+        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
+        return 0; // the kernel's execution status could be surfaced here; not implemented yet
+    };
+    // Dispatch the operator
+    at_npu::native::OpCommand cmd;
+    cmd.Name("my_add").SetCustomHandler(callback).Run();
+    return result;
+}
+
+// Backward implementation registered for the NPU device
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
+{
+    // For z = x + y both input grads equal the incoming grad, so it is reused
+    at::Tensor result = at::Tensor(self);
+
+    return {result, result};
+}
+
+// Forward implementation registered for the Meta device
+at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
+{
+    return empty_like(self);
+}
+
+// Backward implementation registered for the Meta device
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
+{
+    auto result = empty_like(self);
+    return std::make_tuple(result, result);
+}
+
+// Look up the implementation registered for this op on the current device
+at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
+{
+    static auto op =
+        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
+    return op.call(self, other);
+}
+// Look up the implementation registered for this op on the current device
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
+{
+    static auto op = torch::Dispatcher::singleton()
+                         .findSchemaOrThrow("myaten::my_add_backward", "")
+                         .typed<decltype(my_add_backward_impl)>();
+    return op.call(self);
+}
+
+// Register the my_add and my_add_backward schemas in the myaten namespace
+TORCH_LIBRARY(myaten, m)
+{
+    m.def("my_add(Tensor self, Tensor other) -> Tensor");
+    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
+}
+
+// Bind forward and backward together by subclassing torch::autograd::Function
+class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
+public:
+    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        return my_add_impl(self, other);
+    }
+
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
+    {
+        auto grad_output = grad_outputs[0];
+        auto result = my_add_backward_impl(grad_output);
+        return {std::get<0>(result), std::get<1>(result)};
+    }
+};
+
+at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
+{
+    return MyAddFunction::apply(self, other);
+}
+
+// Bind the NPU autograd implementation to the op
+// For PyTorch versions below 2.1, replace AutogradPrivateUse1 with AutogradXLA
+TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_autograd);
+}
+
+// Register forward and backward for the NPU device
+// PyTorch 2.1 and later name the NPU device PrivateUse1; versions below 2.1
+// used XLA, so replace PrivateUse1 with XLA on older versions
+TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_npu);
+    m.impl("my_add_backward", &my_add_backward_impl_npu);
+}
+
+// Register forward and backward for the Meta device
+TORCH_LIBRARY_IMPL(myaten, Meta, m)
+{
+    m.impl("my_add", &my_add_impl_meta);
+    m.impl("my_add_backward", &my_add_backward_impl_meta);
+}
+
+// Expose the C++ entry point to Python via pybind
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("my_add", &my_add_impl_autograd, "x + y");
+}
diff --git a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ *
+ * Function : z = x + y
+ * This sample is a very basic sample that implements vector add on the Ascend platform.
+ * In this sample:
+ * Length of x / y / z is 8*2048.
+ * Num of vector cores used in the sample is 8.
+ * Length for each core to compute is 2048.
+ * Tiles for each core is 8; with double buffering, each loop iteration handles 2048/8/2 = 128 elements.
+ *
+ */
+#include "kernel_operator.h"
+using namespace AscendC;
+constexpr int32_t TOTAL_LENGTH = 8 * 2048;                            // total length of data
+constexpr int32_t USE_CORE_NUM = 8;                                   // num of cores used
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;         // length computed by each core
+constexpr int32_t TILE_NUM = 8;                                       // split data into 8 tiles on each core
+constexpr int32_t BUFFER_NUM = 2;                                     // tensor num for each queue
+constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // each tile is split into 2 parts, due to double buffering
+
+class KernelAdd {
+public:
+    __aicore__ inline KernelAdd()
+    {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+    {
+        // get the start index of the current core, core-level parallelism
+        xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        // pipe allocates memory to the queues, the unit is bytes
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+    }
+    __aicore__ inline void Process()
+    {
+        // the loop count needs to be doubled, due to double buffering
+        constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
+        // tiling strategy, pipeline parallelism
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        // alloc tensors from queue memory
+        LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+        LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+        // copy the progress-th tile from the global tensor to the local tensor
+        DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH);
+        DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
+        // enqueue the input tensors to the VECIN queue
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    {
+        // dequeue the input tensors from the VECIN queue
+        LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+        LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+        LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+        // call the Add instruction for computation
+        Add(zLocal, xLocal, yLocal, TILE_LENGTH);
+        // enqueue the output tensor to the VECOUT queue
+        outQueueZ.EnQue<half>(zLocal);
+        // free the input tensors for reuse
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        // dequeue the output tensor from the VECOUT queue
+        LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+        // copy the progress-th tile from the local tensor to the global tensor
+        DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH);
+        // free the output tensor for reuse
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    TPipe pipe;
+    // create queues for the inputs; the queue depth equals the buffer num
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
+    // create a queue for the output; the queue depth equals the buffer num
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
+    GlobalTensor<half> xGm, yGm, zGm;
+};
+// implementation of the kernel function
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+{
+    KernelAdd op;
+    op.Init(x, y, z);
+    op.Process();
+}
+
+// Wrap the kernel function so an ordinary host compiler can resolve this symbol
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z)
+{
+    add_custom<<<blockDim, nullptr, stream>>>(x, y, z);
+}
\ No newline at end of file
diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ab1d3c78c7866b4bd53d9531bf0674c8b2987e
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/setup.py
@@ -0,0 +1,51 @@
+import os
+import subprocess
+import torch
+import torch_npu
+from setuptools import setup, find_packages
+from torch.utils.cpp_extension import BuildExtension
+from torch_npu.utils.cpp_extension import NpuExtension
+
+PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
+CUR_PATH = os.path.abspath(os.path.dirname(__file__))
+
+
+def compile_kernels():
+    # PyTorch has no build extension for Ascend device code, so invoke make manually here
+    subprocess.run("make", check=True)
+    return "libcustom_kernels.so"  # the library name produced by make
+
+
+def compile_adapter():
+    ext = NpuExtension(
+        name="ascend_custom_kernels_lib",  # name of the importable library
+        # add any further cpp files that take part in the build here
+        sources=[f"{CUR_PATH}/add_adapter.cpp"],
+        extra_compile_args=[
+            '-I' + os.path.join(PYTORCH_NPU_INSTALL_PATH,
+                                "include", "third_party", "acl", "inc"),
+        ],
+        library_dirs=[f"{CUR_PATH}"],  # library search paths for the build, like g++'s -L option
+        libraries=["custom_kernels"],  # libraries to link against, like the -l option
+    )
+    return [ext]
+
+
+if __name__ == "__main__":
+    # build the library containing the kernel, shipped as a .so
+    kernel_so = compile_kernels()
+
+    # build the PyTorch adapter-layer library so the framework can load it
+    exts = compile_adapter()
+
+    # package everything into a wheel
+    setup(
+        name="ascend_custom_kernels",  # package name
+        version='1.0',
+        keywords='ascend_custom_kernels',
+        ext_modules=exts,
+        packages=find_packages(),
+        cmdclass={"build_ext": BuildExtension},
+        data_files=[(".", [kernel_so])],
+        include_package_data=True,
+    )
diff --git a/sample/pytorch_adapter/with_setuptools/test.py b/sample/pytorch_adapter/with_setuptools/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..896eef2c0fbb1a113377fb7dc770f45fd99832f4
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/test.py
@@ -0,0 +1,34 @@
+import torch
+import torch_npu
+import ascend_custom_kernels_lib
+from torch_npu.testing.testcase import TestCase, run_tests
+
+
+class TestCustomAdd(TestCase):
+    def test_add(self):
+        # add is elementwise, but the kernel is statically tiled and always
+        # processes 8*2048 elements (see the kernel implementation), so the
+        # test shape must match it exactly
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float16)
+        y = torch.rand(length, device='cpu', dtype=torch.float16)
+
+        x_npu = x.npu()
+        y_npu = y.npu()
+        x_npu.requires_grad = True
+        y_npu.requires_grad = True
+        output = ascend_custom_kernels_lib.my_add(x_npu, y_npu)
+        # verify the backward path
+        output.backward(output)
+
+        x.requires_grad = True
+        y.requires_grad = True
+        cpuout = torch.add(x, y)
+        cpuout.backward(cpuout)
+
+        self.assertRtolEqual(output, cpuout)
+        self.assertRtolEqual(x_npu.grad, x.grad)
+        self.assertRtolEqual(y_npu.grad, y.grad)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/sample/third_party/lib/libruntime.so.aarch64 b/sample/third_party/lib/libruntime.so.aarch64
deleted file mode 100644
index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime.so.aarch64 and /dev/null differ
diff --git a/sample/third_party/lib/libruntime.so.x86 b/sample/third_party/lib/libruntime.so.x86
deleted file mode 100644
index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime.so.x86 and /dev/null differ
diff --git a/sample/third_party/lib/libruntime_camodel.so.aarch64 b/sample/third_party/lib/libruntime_camodel.so.aarch64
deleted file mode 100644
index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime_camodel.so.aarch64 and /dev/null differ
diff --git a/sample/third_party/lib/libruntime_camodel.so.x86 b/sample/third_party/lib/libruntime_camodel.so.x86
deleted file mode 100644
index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime_camodel.so.x86 and /dev/null differ