From ae5ff82d0f854286c6461ce380d75ea35b227a2e Mon Sep 17 00:00:00 2001
From: jiangmianjiao
Date: Wed, 17 May 2023 09:54:49 +0800
Subject: [PATCH] [Change description] memory view feature development
 [Modified by] jiangmianjiao 30036454
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tb_plugin/fe/src/components/Kernel.tsx    |   2 +-
 .../tb_plugin/torch_tb_profiler/plugin.py     |  24 ++--
 .../torch_tb_profiler/profiler/data.py        |  14 ++
 .../profiler/run_generator.py                 | 129 ++++++++++++++++--
 .../tb_plugin/torch_tb_profiler/run.py        |   5 +
 5 files changed, 155 insertions(+), 19 deletions(-)

diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx
index aecb86b3ea3..89c2f5b2c52 100644
--- a/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx
+++ b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx
@@ -275,7 +275,7 @@ export const Kernel: React.FC<IProps> = (props) => {
               value={searchOpName}
               onChange={onSearchOpChanged}
               type="search"
-              label="Search by Step ID"
+              label="Search by Step Id"
             />
           ) :
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py
index 798fc5fb84f..25d7ede912a 100644
--- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py
@@ -291,17 +291,22 @@ class TorchProfilerPlugin(base_plugin.TBPlugin):
             start_ts = int(start_ts)
         if end_ts is not None:
             end_ts = int(end_ts)
-
-        return self.respond_as_json(
-            profile.get_memory_stats(start_ts=start_ts, end_ts=end_ts, memory_metric=memory_metric), True)
+        if profile.device_target == 'Ascend':
+            return None
+        else:
+            return self.respond_as_json(
+                profile.get_memory_stats(start_ts=start_ts, end_ts=end_ts, memory_metric=memory_metric), True)

     @wrappers.Request.application
     def memory_curve_route(self, request: werkzeug.Request):
         profile = self._get_profile_for_request(request)
         time_metric = request.args.get('time_metric', 'ms')
         memory_metric = request.args.get('memory_metric', 'MB')
-        return self.respond_as_json(
-            profile.get_memory_curve(time_metric=time_metric, memory_metric=memory_metric), True)
+        if profile.device_target == 'Ascend':
+            return self.respond_as_json(profile.memory_all_curve, True)
+        else:
+            return self.respond_as_json(
+                profile.get_memory_curve(time_metric=time_metric, memory_metric=memory_metric), True)

     @wrappers.Request.application
     def memory_events_route(self, request: werkzeug.Request):
@@ -315,9 +320,12 @@ class TorchProfilerPlugin(base_plugin.TBPlugin):
         if end_ts is not None:
             end_ts = int(end_ts)

-        return self.respond_as_json(
-            profile.get_memory_events(start_ts, end_ts, time_metric=time_metric,
-                                      memory_metric=memory_metric), True)
+        if profile.device_target == 'Ascend':
+            return self.respond_as_json(profile.memory_events, True)
+        else:
+            return self.respond_as_json(
+                profile.get_memory_events(start_ts, end_ts, time_metric=time_metric,
+                                          memory_metric=memory_metric), True)

     @wrappers.Request.application
     def module_route(self, request: werkzeug.Request):
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py
index 4637527cda3..224c626e8ac 100644
--- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py
@@ -75,6 +75,7 @@ class RunProfileData(object):
         self.steps_costs = None
         self.steps_names = None
         self.avg_costs = None
+        self.has_memory: bool = False

         # GPU parser
         self.gpu_metrics_parser: GPUMetricsParser = None
@@ -100,6 +101,10 @@ class RunProfileData(object):
         # recommendation based on analysis result.
         self.recommendations = []

+        # npu memory data
+        self.memory_form_path: str = None
+        self.memory_line_path: str = None
+
     @staticmethod
     def parse_gpu(worker, span, path, cache_dir):
         trace_path, trace_json = RunProfileData._preprocess_file(path, cache_dir, 'GPU')
@@ -114,6 +119,8 @@ class RunProfileData(object):
         trace_path = path
         has_trace = False
         has_kernel = False
+        has_memory_line = False
+        has_memory_form = False
         for file in io.listdir(path):
             if utils.is_npu_trace_path(file):
                 has_trace = True
@@ -129,7 +136,14 @@ class RunProfileData(object):
             if str(file) == 'kernel_details.csv':
                 has_kernel = True
                 profile.kernel_file_path = io.join(path, file)
+            if str(file) == 'memory_view_line_chart.csv':
+                has_memory_line = True
+                profile.memory_line_path = io.join(path, file)
+            if str(file) == 'memory_view_form.csv':
+                has_memory_form = True
+                profile.memory_form_path = io.join(path, file)
         profile.has_kernel = has_kernel
+        profile.has_memory = has_memory_form and has_memory_line
         return profile

     @staticmethod
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py
index 9107947a5ea..0045a9cfe20 100644
--- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py
@@ -1,7 +1,7 @@
 # -------------------------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # --------------------------------------------------------------------------
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 from typing import Dict, Iterable, List

 import csv
@@ -11,6 +11,7 @@ from .data import DistributedRunProfileData, RunProfileData
 from .module_op import aggegate_module_view, aggegate_pl_module_view
 from .op_agg import KernelAggByNameOp, OperatorAgg
 from .overall_parser import ProfileRole
+from ..utils import Canonicalizer

 logger = utils.get_logger()

@@ -78,6 +79,14 @@ class RunGenerator(object):
             profile_run.views.append(consts.MEMORY_VIEW)
             profile_run.memory_snapshot = self.profile_data.memory_snapshot

+        profile_run.device_target = self.device_target
+        if self.device_target == 'Ascend':
+            if self.profile_data.has_memory:
+                profile_run.views.append(consts.MEMORY_VIEW)
+                profile_run.memory_div_curve = None
+                profile_run.memory_all_curve = self._get_memory_all_curve()
+                profile_run.memory_events = self._get_memory_event()
+
         profile_run.module_stats = aggegate_module_view(self.profile_data.tid2tree, self.profile_data.events)
         profile_run.pl_module_stats = aggegate_pl_module_view(self.profile_data.tid2tree, self.profile_data.events)
         if profile_run.is_pytorch_lightning and profile_run.pl_module_stats:
@@ -87,6 +96,105 @@ class RunGenerator(object):

         return profile_run

+    def _get_memory_event(self):
+        display_columns = ('Operator', 'Size(KB)', 'Allocation Time(us)', 'Release Time(us)', 'Duration(us)')
+        path = self.profile_data.memory_form_path
+        display_datas = defaultdict(list)
+        devices_type = []
+        table = {
+            'metadata': {
+                'title': 'Memory Events',
+                'default_device': 'all',
+            },
+            'columns': [],
+            'rows': {}
+        }
+        datas = self._get_csv_data(path)
+        for idx, column in enumerate(datas[0]):
+            if column == 'Device Type':
+                self.device_type_form_idx = idx
+            if column in display_columns:
+                if column == 'Operator':
+                    table['columns'].append({'name': column, 'type': 'string'})
+                else:
+                    table['columns'].append({'name': column, 'type': 'number'})
+        for ls in datas[1:]:
+            device_type = ls[self.device_type_form_idx]
+            nums = [ls[1], float(ls[2]), float(ls[3])]
+            if ls[4]:
+                nums.append(float(ls[4]))
+            if ls[5]:
+                nums.append(round(float(ls[5]), 2))
+            display_datas[device_type].append(nums)
+        table['rows'] = display_datas
+        for name in display_datas:
+            devices_type.append(name)
+        table['metadata'].update({'default_device': devices_type[0]})
+        return table
+
+    def _get_memory_all_curve(self):
+        time_metric: str = 'us'
+        memory_metric: str = 'KB'
+        cano = Canonicalizer(time_metric, memory_metric)
+        pta_and_ge_data, pta_or_ge_data = self._handle_memory_data()
+        devices_type, peaks = self._get_peaks_and_devices_type()
+        result = {
+            'metadata': {
+                'default_device': devices_type[0],
+                'devices': devices_type,
+                'peaks': peaks,
+                'totals': {},
+                'first_ts': 0,
+                'time_metric': cano.time_metric,
+                'memory_metric': cano.memory_metric,
+                'time_factor': cano.time_factor,
+                'memory_factor': cano.memory_factor,
+            },
+            'columns': [
+                {'name': f'Time ({cano.time_metric})', 'type': 'number', 'tooltip': 'Time since profiler starts.'},
+                {'name': f'Allocated ({cano.memory_metric})', 'type': 'number', 'tooltip': 'Total memory in use.'},
+                {'name': f'Reserved ({cano.memory_metric})', 'type': 'number',
+                 'tooltip': 'Total reserved memory by allocator, both used and unused.'},
+            ],
+            'rows': pta_and_ge_data,
+        }
+        return result
+
+    def _get_peaks_and_devices_type(self):
+        devices_type = []
+        peaks = {}
+        pta_and_ge_data, pta_or_ge_data = self._handle_memory_data()
+        for name in pta_and_ge_data:
+            devices_type.append(name)
+            max_reserved = 0
+            for array_value in pta_and_ge_data.get(name):
+                max_reserved = max(array_value[2], max_reserved)
+            peaks[name] = 'Peak Memory Usage: {:.1f}'.format(max_reserved)
+        return devices_type, peaks
+
+    def _handle_memory_data(self):
+        pta_and_ge_data = defaultdict(list)
+        pta_or_ge_data = {}
+        path = self.profile_data.memory_line_path
+        datas = self._get_csv_data(path)
+        for idx, column in enumerate(datas[0]):
+            if column == 'Tag':
+                self.tag_type_idx = idx
+            if column == 'Device Type':
+                self.device_type_idx = idx
+            if column == 'Timestamp(us)':
+                self.time_idx = idx
+            if column == 'Total Reserved(KB)':
+                self.reserved_idx = idx
+            if column == 'Total Allocated(KB)':
+                self.allocated_idx = idx
+        for ls in datas[1:]:
+            temp: list = [float(ls[self.time_idx]), float(ls[self.reserved_idx]), float(ls[self.allocated_idx])]
+            device_type = ls[self.device_type_idx]
+            pta_and_ge_data[device_type].append(temp)
+            pta_or_ge_data.setdefault(device_type, {}).setdefault(ls[self.tag_type_idx], []).append(temp)
+        return pta_and_ge_data, pta_or_ge_data
+
     def _generate_overview(self):
         def build_part_time_str(part_cost: float, part_name: str):
             format_str = ('<div class="visualization-tooltip" style="white-space: nowrap;">'
@@ -447,7 +555,7 @@ class RunGenerator(object):
         return datas

     def _generate_kernel_table_npu(self):
-        display_columns = ('Step ID', 'Name', 'Type', 'Accelerator Core', 'Start Time', 'Duration(us)', 'Wait Time(us)',
+        display_columns = ('Step Id', 'Name', 'Type', 'Accelerator Core', 'Start Time', 'Duration(us)', 'Wait Time(us)',
                            'Block Dim', 'Input Shapes', 'Input Data Types', 'Input Formats', 'Output Shapes',
                            'Output Data Types', 'Output Formats')
         display_idxs = []
@@ -478,7 +586,8 @@ class RunGenerator(object):
                   enumerate(datas) if idx != 0]
         return result

-    def _get_csv_data(self, path: str):
+    @staticmethod
+    def _get_csv_data(path: str):
         if path is None:
             return
         datas = []
@@ -587,7 +696,7 @@ class DistributedRunGenerator(object):
             for used_device in data.used_devices:
                 gpu_info = RunGenerator._get_gpu_info(data.device_props, used_device)
                 if gpu_info is not None:
-                    result[node][process_id]['GPU'+str(used_device)] = gpu_info
+                    result[node][process_id]['GPU' + str(used_device)] = gpu_info

         if result:
             for k, v in result.items():
@@ -622,7 +731,7 @@ class DistributedRunGenerator(object):
                 ]
                 steps_to_overlap['all'][data.worker] = [
                     sum(x) for x in zip(steps_to_overlap['all'][data.worker], steps_to_overlap[step_name][data.worker])]
-            steps_to_overlap['all'][data.worker] = [x/step_number for x in steps_to_overlap['all'][data.worker]]
+            steps_to_overlap['all'][data.worker] = [x / step_number for x in steps_to_overlap['all'][data.worker]]
         for k, v in steps_to_overlap.items():
             steps_to_overlap[k] = OrderedDict(sorted(v.items()))
         result['data'] = steps_to_overlap
@@ -644,11 +753,11 @@ class DistributedRunGenerator(object):
             for step, comm_stats in data.step_comm_stats.items():
                 steps_to_wait.setdefault(step, OrderedDict())[data.worker] = [
                     comm_stats[1],
-                    comm_stats[0]-comm_stats[1]
+                    comm_stats[0] - comm_stats[1]
                 ]
                 steps_to_wait['all'][data.worker] = [
                     sum(x) for x in zip(steps_to_wait['all'][data.worker], steps_to_wait[step][data.worker])]
-            steps_to_wait['all'][data.worker] = [x/step_number for x in steps_to_wait['all'][data.worker]]
+            steps_to_wait['all'][data.worker] = [x / step_number for x in steps_to_wait['all'][data.worker]]
         for k, v in steps_to_wait.items():
             steps_to_wait[k] = OrderedDict(sorted(v.items()))

@@ -680,11 +789,11 @@ class DistributedRunGenerator(object):
                     op,
                     stats[0],
                     stats[1],
-                    round(stats[1]/stats[0]),
+                    round(stats[1] / stats[0]),
                     stats[2],
-                    round(stats[2]/stats[0]),
+                    round(stats[2] / stats[0]),
                     stats[3],
-                    round(stats[3]/stats[0])
+                    round(stats[3] / stats[0])
                 ]
                 table['rows'].append(row)
             workers_to_comm_ops[data.worker] = table
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py
index f148e4ccbf1..7d4e23503f0 100644
--- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py
@@ -139,6 +139,11 @@ class RunProfile(object):

         self.module_stats: Optional[List(Stats)] = None
         self.pl_module_stats: Optional[List(Stats)] = None
+        self.device_target = None
+
+        self.memory_all_curve = None
+        self.memory_div_curve = None
+        self.memory_events = None

     def append_gpu_metrics(self, raw_data: bytes):
         counter_json_str = ', {}'.format(', '.join(self.gpu_metrics))
-- 
Gitee