From ae9a3d9cd135259f73f0e2e3edcdee4c917b4cb0 Mon Sep 17 00:00:00 2001 From: wuyulong11 Date: Thu, 27 Apr 2023 16:39:20 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=E3=80=91=20=E5=85=BC=E5=AE=B9GPU=E6=95=B0=E6=8D=AE=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=20=E3=80=90=E4=BF=AE=E6=94=B9=E4=BA=BA=E3=80=91=20wuy?= =?UTF-8?q?ulong=2030031080?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tb_plugin/fe/src/api/generated/api.ts | 3 +- .../tb_plugin/fe/src/components/Kernel.tsx | 57 ++++++--- .../fe/src/components/TooltipDescriptions.ts | 4 +- .../profiling/tb_plugin/test/test_diffrun.py | 2 +- .../tb_plugin/torch_tb_profiler/consts.py | 2 +- .../tb_plugin/torch_tb_profiler/plugin.py | 29 +++-- .../torch_tb_profiler/profiler/data.py | 27 ++-- .../torch_tb_profiler/profiler/loader.py | 63 +++++++--- .../profiler/run_generator.py | 118 ++++++++++++++++-- .../tb_plugin/torch_tb_profiler/run.py | 3 +- .../tb_plugin/torch_tb_profiler/utils.py | 4 +- 11 files changed, 242 insertions(+), 70 deletions(-) diff --git a/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts index b33fe1b6a84..37a5eb3127d 100644 --- a/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts +++ b/tb_plugins/profiling/tb_plugin/fe/src/api/generated/api.ts @@ -465,7 +465,8 @@ export interface KernelGraph { * @type {Graph} * @memberof KernelGraph */ - total: Graph + total: Graph, + device_target: string } /** * diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx index ea2fa63fe49..296f8aebec8 100644 --- a/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx @@ -29,7 +29,8 @@ import { DataLoading } from './DataLoading' import { makeChartHeaderRenderer, useTooltipCommonStyles } from './helpers' import { GPUKernelTotalTimeTooltip, - TensorCoresPieChartTooltip + TensorCoresPieChartTooltip, + TensorCoresPieChartTooltipAscend } from './TooltipDescriptions' export interface IProps { @@ -74,11 +75,12 @@ export const Kernel: React.FC = (props) => { const [kernelTable, setKernelTable] = React.useState( undefined ) - const [groupBy, setGroupBy] = React.useState(KernelGroupBy.KernelNameAndOpName) + const [groupBy, setGroupBy] = React.useState(KernelGroupBy.Kernel) const [searchKernelName, setSearchKernelName] = React.useState('') const [searchOpName, setSearchOpName] = React.useState('') const [sortColumn, setSortColumn] = React.useState('') - const [hasStep, setHasStep] = React.useState(false); + const [hasStep, setHasStep] = React.useState(false) + const [deviceTarget, setDeviceTarget] = React.useState('GPU') const [topText, actualTop, useTop, setTopText, setUseTop] = useTopN({ defaultUseTop: UseTop.Use, @@ -111,6 +113,8 @@ export const Kernel: React.FC = (props) => { .kernelGet(run, worker, span, KernelGroupBy.Kernel) .then((resp) => { setKernelGraph(resp.total) + setDeviceTarget(resp.device_target) + setGroupBy(resp.device_target === 'Ascend' ? KernelGroupBy.KernelNameAndOpName : KernelGroupBy.Kernel) }) }, [run, worker, span]) @@ -123,7 +127,7 @@ export const Kernel: React.FC = (props) => { const [searchedKernelTable] = useSearch(searchKernelName, 'name', kernelTable) const [searchedOpTable] = useSearch( searchOpName, - 'step id', + deviceTarget === 'Ascend' ? 
'step id' : 'operator', searchedKernelTable ) @@ -157,12 +161,17 @@ export const Kernel: React.FC = (props) => { ) const TensorCoresTitle = React.useMemo( - () => + () => deviceTarget === 'Ascend' ? chartHeaderRenderer( 'AI Cores Utilization', + TensorCoresPieChartTooltipAscend + ) + : + chartHeaderRenderer( + 'Tensor Cores Utilization', TensorCoresPieChartTooltip ), - [chartHeaderRenderer] + [chartHeaderRenderer, deviceTarget] ) return ( @@ -240,10 +249,10 @@ export const Kernel: React.FC = (props) => { onChange={onGroupByChanged} > - Statistic + {deviceTarget === 'Ascend' ? 'Statistic' : 'Kernel Properties + Op Name'} - All + {deviceTarget === 'Ascend' ? 'All' : 'Kernel Name'} @@ -258,16 +267,28 @@ export const Kernel: React.FC = (props) => { label="Search by Name" /> - {groupBy === KernelGroupBy.Kernel && hasStep && - - - + {deviceTarget === 'Ascend' ? + (groupBy === KernelGroupBy.Kernel && hasStep && + + + ) + : + (groupBy === KernelGroupBy.KernelNameAndOpName && + + + ) } diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts b/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts index f3fe25ff1a2..596e918eb82 100644 --- a/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts +++ b/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts @@ -21,7 +21,9 @@ export const HostTotalTimeTooltip = `The accumulated time spent on Host, includi export const GPUKernelTotalTimeTooltip = `The accumulated time of all calls of this kernel.` -export const TensorCoresPieChartTooltip = `The accumulated time of all kernels using or not using AI Cores.` +export const TensorCoresPieChartTooltip = `The accumulated time of all kernels using or not using Tensor Cores.` + +export const TensorCoresPieChartTooltipAscend = `The accumulated time of all kernels using or not using AI Cores.` export const DistributedGpuInfoTableTooltip = `Information about GPU hardware used during the run.` diff --git a/tb_plugins/profiling/tb_plugin/test/test_diffrun.py b/tb_plugins/profiling/tb_plugin/test/test_diffrun.py index f1bebbfbfa4..06fbf398a7d 100644 --- a/tb_plugins/profiling/tb_plugin/test/test_diffrun.py +++ b/tb_plugins/profiling/tb_plugin/test/test_diffrun.py @@ -9,7 +9,7 @@ from torch_tb_profiler.utils import timing def load_profile(worker, span, path): - return RunProfileData.parse(worker, span, path, '.') + return RunProfileData.parse_gpu(worker, span, path, '.') class TestDiffRun(unittest.TestCase): diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py index 0a54ae5687c..486e5f05bf5 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py @@ -12,7 +12,7 @@ WORKER_PATTERN = re.compile(r"""^(.*?) 
# worker name (?:\.gz)?$""", re.X) # optional .gz extension TRACE_PATTERN = re.compile(r"""^trace_view\.json(\.gz)?$""") -WORKER_SPAN_PATTERN = re.compile(r"""([^\\]*)_(\d+)$""") +WORKER_SPAN_PATTERN = re.compile(r"""([^\\]*)_(\d+)_ascend_pt$""") NODE_PROCESS_PATTERN = re.compile(r"""^(.*)_(\d+)""") MONITOR_RUN_REFRESH_INTERNAL_IN_SECONDS = 10 diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py index c5dcfb535fc..7fa352261de 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/plugin.py @@ -438,9 +438,10 @@ class TorchProfilerPlugin(base_plugin.TBPlugin): # Assume no deletion on run directories, trigger async load if find a new run for run_dir in run_dirs: has_dir = True - if run_dir not in touched: - touched.add(run_dir) - logger.info('Find run directory %s', run_dir) + run_device = f'{run_dir["name"]}_{run_dir["device_target"]}' + if run_device not in touched: + touched.add(run_device) + logger.info('Find run directory %s', run_dir['name']) # Use threading to avoid UI stall and reduce data parsing time t = threading.Thread(target=self._load_run, args=(run_dir,)) t.start() @@ -472,7 +473,14 @@ class TorchProfilerPlugin(base_plugin.TBPlugin): def _get_run_dirs(self): """Scan logdir, find PyTorch Profiler run directories. - A directory is considered to be a run if it satisfies the following two conditions: + A directory is considered to be a gpu run if it contains 1 or more *.pt.trace.json[.gz]. + E.g. there are 2 runs: run1, run2 + /run1 + /[worker1].pt.trace.json.gz + /[worker2].pt.trace.json.gz + /run2 + /[worker1].pt.trace.json + A directory is considered to be an ascend run if it satisfies the following two conditions: 1.At least one subdirectory with the name in this format: {worker_span}. 2.The subdirectory in condition 1 has a 'ASCEND_PROFILER_OUTPUT' subdirectory which contains a 'trace_view.json(.gz)' file or a 'kernel_details.csv' file. 
@@ -490,21 +498,26 @@ class TorchProfilerPlugin(base_plugin.TBPlugin): /ASCEND_PROFILER_OUTPUT /kernel_details.csv """ - for root, subdirs, _ in io.walk(self.logdir): + for root, subdirs, files in io.walk(self.logdir): for subdir in subdirs: if str(subdir) == 'ASCEND_PROFILER_OUTPUT': match = consts.WORKER_SPAN_PATTERN.match(io.basename(root)) if match is not None: run_name = io.abspath(io.join(root, '..')) - yield run_name + yield {'name': run_name, 'device_target': 'Ascend'} break + for file in files: + if utils.is_gpu_chrome_trace_file(file): + yield {'name': root, 'device_target': 'GPU'} + break + def _load_run(self, run_dir): try: - name = self._get_run_name(run_dir) + name = self._get_run_name(run_dir['name']) logger.info('Load run %s', name) # Currently, assume run data is immutable, so just load once - loader = RunLoader(name, run_dir, self._cache) + loader = RunLoader(name, run_dir['name'], self._cache, run_dir['device_target']) run = loader.load() logger.info('Run %s loaded', name) self._queue.put(run) diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py index 01864509c2a..4637527cda3 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py @@ -67,8 +67,8 @@ class RunProfileData(object): self.use_ddp: bool = False self.comm_lib = None self.has_runtime: bool = False - self.has_kernel: bool = False - self.has_trace: bool = False + self.has_kernel: bool = True + self.has_trace: bool = True self.has_communication: bool = False self.has_memcpy_or_memset: bool = False self.role_ranges = None @@ -101,15 +101,24 @@ class RunProfileData(object): self.recommendations = [] @staticmethod - def parse(worker, span, path, cache_dir): + def parse_gpu(worker, span, path, cache_dir): + trace_path, trace_json = RunProfileData._preprocess_file(path, cache_dir, 'GPU') + + profile = RunProfileData.from_json(worker, span, trace_json) + profile.trace_file_path = trace_path + return profile + + @staticmethod + def parse_npu(worker, span, path, cache_dir): trace_json = {} trace_path = path has_trace = False + has_kernel = False for file in io.listdir(path): - if utils.is_trace_path(file): + if utils.is_npu_trace_path(file): has_trace = True trace_file = io.join(path, file) - trace_path, trace_json = RunProfileData._preprocess_file(trace_file, cache_dir) + trace_path, trace_json = RunProfileData._preprocess_file(trace_file, cache_dir, 'Ascend') break profile = RunProfileData.from_json(worker, span, trace_json) @@ -118,8 +127,9 @@ class RunProfileData(object): for file in io.listdir(path): if str(file) == 'kernel_details.csv': - profile.has_kernel = True + has_kernel = True profile.kernel_file_path = io.join(path, file) + profile.has_kernel = has_kernel return profile @staticmethod @@ -131,7 +141,7 @@ class RunProfileData(object): return profile @staticmethod - def _preprocess_file(trace_path, cache_dir): + def _preprocess_file(trace_path, cache_dir, device_target): if not io.exists(trace_path): raise FileNotFoundError(trace_path) @@ -157,7 +167,8 @@ class RunProfileData(object): json_reencode = True # work-around to remove the 'Record Window End' events to avoid the huge end timestamp - trace_json = {'traceEvents': trace_json} + if device_target == 'Ascend': + trace_json = {'traceEvents': trace_json} event_list = trace_json['traceEvents'] end_index = None start_index = None diff --git 
a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py index ac1501aa535..5a42ddaecc8 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py @@ -19,27 +19,45 @@ logger = utils.get_logger() class RunLoader(object): - def __init__(self, name, run_dir, caches: io.Cache): + def __init__(self, name, run_dir, caches: io.Cache, device_target="GPU"): self.run_name = name self.run_dir = run_dir self.caches = caches self.queue = Queue() + self.device_target = device_target def load(self): workers = [] spans_by_workers = defaultdict(list) - for path in io.listdir(self.run_dir): - if io.isdir(io.join(self.run_dir, path)) and utils.is_worker_span_dir(path): - data_path = io.join(self.run_dir, path, 'ASCEND_PROFILER_OUTPUT') - for file in io.listdir(data_path): - if utils.is_trace_path(file) or str(file) == 'kernel_details.csv': - match = consts.WORKER_SPAN_PATTERN.match(path) - worker = match.group(1) - span = match.group(2) - if span is not None: - bisect.insort(spans_by_workers[worker], span) - workers.append((worker, span, io.join(path, 'ASCEND_PROFILER_OUTPUT'))) - break + if self.device_target == 'Ascend': + for path in io.listdir(self.run_dir): + if io.isdir(io.join(self.run_dir, path)) and utils.is_worker_span_dir(path): + data_path = io.join(self.run_dir, path, 'ASCEND_PROFILER_OUTPUT') + for file in io.listdir(data_path): + if utils.is_npu_trace_path(file) or str(file) == 'kernel_details.csv': + match = consts.WORKER_SPAN_PATTERN.match(path) + worker = match.group(1) + span = match.group(2) + if span is not None: + bisect.insort(spans_by_workers[worker], span) + workers.append((worker, span, io.join(path, 'ASCEND_PROFILER_OUTPUT'))) + break + else: + for path in io.listdir(self.run_dir): + if io.isdir(io.join(self.run_dir, path)): + continue + match = consts.WORKER_PATTERN.match(path) + if not match: + continue + + worker = match.group(1) + span = match.group(2) + if span is not None: + # remove the starting dot (.) 
+ span = span[1:] + bisect.insort(spans_by_workers[worker], span) + + workers.append((worker, span, path)) span_index_map = {} for worker, span_array in spans_by_workers.items(): @@ -53,8 +71,8 @@ class RunLoader(object): p.start() logger.info('started all processing') - distributed_run = Run(self.run_name, self.run_dir) - run = Run(self.run_name, self.run_dir) + distributed_run = Run(self.run_name, self.run_dir, self.device_target) + run = Run(self.run_name, self.run_dir, self.device_target) num_items = len(workers) while num_items > 0: item: Tuple[RunProfile, DistributedRunProfileData] = self.queue.get() @@ -82,11 +100,14 @@ class RunLoader(object): try: logger.debug('Parse trace, run_dir=%s, data_dir=%s', self.run_dir, path) local_file = self.caches.get_remote_cache(io.join(self.run_dir, path)) - data = RunProfileData.parse(worker, span, local_file, self.caches.cache_dir) + if self.device_target == 'Ascend': + data = RunProfileData.parse_npu(worker, span, local_file, self.caches.cache_dir) + else: + data = RunProfileData.parse_gpu(worker, span, local_file, self.caches.cache_dir) if data.trace_file_path != local_file: self.caches.add_file(local_file, data.trace_file_path) - generator = RunGenerator(worker, span, data) + generator = RunGenerator(worker, span, data, self.device_target) profile = generator.generate_run_profile() dist_data = DistributedRunProfileData(data) @@ -96,8 +117,12 @@ class RunLoader(object): logger.warning('tb_plugin receive keyboard interrupt signal, process %d will exit' % (os.getpid())) sys.exit(1) except Exception as ex: - logger.warning('Failed to parse profile data for Run %s on %s_%s. Exception=%s', - self.run_name, worker, span_name, ex, exc_info=True) + if self.device_target == 'Ascend': + worker_name = f'{worker}_{span_name}' + else: + worker_name = worker + logger.warning('Failed to parse profile data for Run %s on %s. 
Exception=%s', + self.run_name, worker_name, ex, exc_info=True) self.queue.put((None, None)) logger.debug('finishing process data') diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py index b4c7680214b..f8d54d678ec 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/run_generator.py @@ -16,12 +16,13 @@ logger = utils.get_logger() class RunGenerator(object): - def __init__(self, worker, span, profile_data: RunProfileData): + def __init__(self, worker, span, profile_data: RunProfileData, device_target="GPU"): self.worker = worker self.span = span self.profile_data = profile_data self.statistic_data = {} self.accelerator_data = {} + self.device_target = device_target def generate_run_profile(self): profile_run = RunProfile(self.worker, self.span) @@ -45,10 +46,16 @@ class RunGenerator(object): if self.profile_data.has_kernel: profile_run.views.append(consts.KERNEL_VIEW) - profile_run.kernel_table = self._generate_kernel_table() - profile_run.kernel_op_table = self._generate_kernel_op_table() - profile_run.kernel_pie = self._generate_kernel_pie() - profile_run.tc_pie = self._generate_tc_pie() + if self.device_target == 'Ascend': + profile_run.kernel_table = self._generate_kernel_table_npu() + profile_run.kernel_op_table = self._generate_kernel_op_table_npu() + profile_run.kernel_pie = self._generate_kernel_pie_npu() + profile_run.tc_pie = self._generate_tc_pie_npu() + else: + profile_run.kernel_table = self._generate_kernel_table_gpu() + profile_run.kernel_op_table = self._generate_kernel_op_table_gpu() + profile_run.kernel_pie = self._generate_kernel_pie_gpu() + profile_run.tc_pie = self._generate_tc_pie_gpu() if self.profile_data.has_trace: profile_run.views.append(consts.TRACE_VIEW) @@ -316,7 +323,98 @@ class RunGenerator(object): result[k] = self._generate_op_table(v, group_by_input_shape, True) return result - def _generate_kernel_op_table(self): + def _generate_kernel_op_table_gpu(self): + table = {} + result = { + 'metadata': { + 'sort': 'Total Duration (us)' + }, + 'data': table + } + table['columns'] = [{'type': 'string', 'name': 'Name'}, + {'type': 'string', 'name': 'Operator'}, + {'type': 'string', 'name': 'Grid'}, + {'type': 'string', 'name': 'Block'}, + {'type': 'number', 'name': 'Register Per Thread'}, + {'type': 'number', 'name': 'Shared Memory'}, + {'type': 'string', 'name': 'Kernel Uses Tensor Cores', + 'tooltip': consts.TOOLTIP_KERNEL_USES_TC}, + {'type': 'string', 'name': 'Op is Tensor Cores eligible', + 'tooltip': consts.TOOLTIP_KERNEL_OP_TC_ELIGIBLE}] + col_names = ['Calls', 'Total Duration (us)', 'Mean Duration (us)', 'Max Duration (us)', 'Min Duration (us)'] + for column in col_names: + table['columns'].append({'type': 'number', 'name': column}) + gpu_metrics_columns = self.profile_data.gpu_metrics_parser.get_gpu_metrics_columns() + table['columns'].extend(gpu_metrics_columns) + + table['rows'] = [] + kernel_list: List[KernelAggByNameOp] = sorted( + self.profile_data.kernel_list_groupby_name_op, key=lambda x: x.total_duration, reverse=True) + for agg_by_name_op in kernel_list: + kernel_op_row = [agg_by_name_op.name, agg_by_name_op.op_name, + str(agg_by_name_op.grid), str(agg_by_name_op.block), + str(agg_by_name_op.regs_per_thread or '0'), str(agg_by_name_op.shared_memory or '0'), + 'Yes' if agg_by_name_op.tc_used else 'No', + 'Yes' if 
agg_by_name_op.op_tc_eligible else 'No', + agg_by_name_op.calls, + agg_by_name_op.total_duration, round(agg_by_name_op.avg_duration), + agg_by_name_op.max_duration, agg_by_name_op.min_duration] + if self.profile_data.gpu_metrics_parser.has_blocks_per_sm: + kernel_op_row.append(round(agg_by_name_op.avg_blocks_per_sm, 2)) + if self.profile_data.gpu_metrics_parser.has_occupancy: + kernel_op_row.append(round(agg_by_name_op.avg_occupancy, 2)) + table['rows'].append(kernel_op_row) + return result + + def _generate_kernel_pie_gpu(self): + pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} + for _id, (name, row) in enumerate(self.profile_data.kernel_stat.iterrows()): + pie['rows'].append([name, row['sum']]) + data = {'total': pie, 'device_target': self.device_target} + return data + + def _generate_kernel_table_gpu(self): + table = {} + result = { + 'metadata': { + 'sort': 'Total Duration (us)' + }, + 'data': table + } + table['columns'] = [{'type': 'string', 'name': 'Name'}, + {'type': 'string', 'name': 'Tensor Cores Used', + 'tooltip': consts.TOOLTIP_KERNEL_USES_TC}] + columns = ['count', 'sum', 'mean', 'max', 'min'] + round_digits = [0, 0, 0, 0, 0] + if self.profile_data.gpu_metrics_parser.has_blocks_per_sm: + columns.append('blocks_per_sm') + round_digits.append(2) + if self.profile_data.gpu_metrics_parser.has_occupancy: + columns.append('occupancy') + round_digits.append(2) + col_names = ['Calls', 'Total Duration (us)', 'Mean Duration (us)', 'Max Duration (us)', 'Min Duration (us)'] + for column in col_names: + table['columns'].append({'type': 'number', 'name': column}) + gpu_metrics_columns = self.profile_data.gpu_metrics_parser.get_gpu_metrics_columns() + table['columns'].extend(gpu_metrics_columns) + + table['rows'] = [] + for _id, (name, row) in enumerate(self.profile_data.kernel_stat.iterrows()): + kernel_row = [name, 'Yes' if row['tc_used'] else 'No'] + for i, column in enumerate(columns): + kernel_row.append(round(row[column]) if round_digits[i] == 0 + else round(row[column], round_digits[i])) + table['rows'].append(kernel_row) + return result + + def _generate_tc_pie_gpu(self): + pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} + pie['rows'].append(['Using Tensor Cores', self.profile_data.tc_used_ratio]) + pie['rows'].append(['Not Using Tensor Cores', 1.0 - self.profile_data.tc_used_ratio]) + data = {'total': pie} + return data + + def _generate_kernel_op_table_npu(self): table = {} result = { 'metadata': { @@ -338,7 +436,7 @@ class RunGenerator(object): table['rows'].append(temp) return result - def _generate_kernel_pie(self): + def _generate_kernel_pie_npu(self): pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} with open(self.profile_data.kernel_file_path, encoding="utf-8") as f: reader = csv.DictReader(f) @@ -347,10 +445,10 @@ class RunGenerator(object): data.append(row.get('Name')) data.append(float(row.get('Duration(us)'))) pie['rows'].append(data) - datas = {'total': pie} + datas = {'total': pie, 'device_target': self.device_target} return datas - def _generate_kernel_table(self): + def _generate_kernel_table_npu(self): display_columns = ('Step ID', 'Name', 'Type', 'Accelerator Core', 'Start Time', 'Duration(us)', 'Wait Time(us)', 'Block Dim', 'Input Shapes', 'Input Data Types', 'Input Formats', 'Output Shapes', 'Output Data Types', 'Output Formats') @@ -388,7 +486,7 @@ class RunGenerator(object): datas.append(row) 
return datas - def _generate_tc_pie(self): + def _generate_tc_pie_npu(self): pie = {'columns': [{'type': 'string', 'name': 'name'}, {'type': 'number', 'name': 'value'}], 'rows': []} for key, val in self.accelerator_data.items(): pie['rows'].append(['Using ' + key.replace('_', ' '), val]) diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py index b86ea679874..3cc7ef7fd56 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/run.py @@ -19,11 +19,12 @@ class Run(object): May contain profiling results from multiple workers. E.g. distributed scenario. """ - def __init__(self, name, run_dir): + def __init__(self, name, run_dir, device_target='GPU'): self.name = name self.run_dir = run_dir self.profiles: Dict[Tuple[str, str], RunProfile] = {} self.span_view = {} + self.device_target = device_target @property def workers(self): diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py index 392e8bd61d6..7bd6e42bfa4 100644 --- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py +++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py @@ -29,7 +29,7 @@ def get_logger(): return logger -def is_chrome_trace_file(path): +def is_gpu_chrome_trace_file(path): return consts.WORKER_PATTERN.match(path) @@ -37,7 +37,7 @@ def is_worker_span_dir(path): return consts.WORKER_SPAN_PATTERN.match(path) -def is_trace_path(path): +def is_npu_trace_path(path): return consts.TRACE_PATTERN.match(path) -- Gitee
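
Editorial note (appended after the patch, not part of it): the run-directory classification that `_get_run_dirs` describes in the docstring above can be illustrated with a small standalone sketch. This is only an approximation for readers: it uses `os.walk`/`os.listdir` instead of the plugin's `io` abstraction, and the GPU filename check below only tests the `*.pt.trace.json[.gz]` suffix rather than the full `consts.WORKER_PATTERN`. The Ascend pattern is copied from the patched `consts.WORKER_SPAN_PATTERN`.

    import os
    import re

    # Ascend worker-span directory name, mirroring consts.WORKER_SPAN_PATTERN in this patch.
    WORKER_SPAN_PATTERN = re.compile(r'([^\\]*)_(\d+)_ascend_pt$')
    # Simplified GPU chrome-trace check: only the *.pt.trace.json[.gz] suffix is tested here.
    GPU_TRACE_SUFFIX = re.compile(r'\.pt\.trace\.json(\.gz)?$')


    def classify_run_dirs(logdir):
        """Yield {'name': ..., 'device_target': 'GPU' | 'Ascend'} for every run found."""
        for root, subdirs, files in os.walk(logdir):
            # Ascend run: a '{worker}_{span}_ascend_pt' directory whose ASCEND_PROFILER_OUTPUT
            # subdirectory holds trace_view.json(.gz) or kernel_details.csv; the run name is
            # the parent directory, as in the plugin's io.join(root, '..').
            if 'ASCEND_PROFILER_OUTPUT' in subdirs and WORKER_SPAN_PATTERN.match(os.path.basename(root)):
                output_files = os.listdir(os.path.join(root, 'ASCEND_PROFILER_OUTPUT'))
                if any(f == 'kernel_details.csv' or f.startswith('trace_view.json') for f in output_files):
                    yield {'name': os.path.dirname(root), 'device_target': 'Ascend'}
            # GPU run: the directory itself contains one or more *.pt.trace.json[.gz] files.
            if any(GPU_TRACE_SUFFIX.search(f) for f in files):
                yield {'name': root, 'device_target': 'GPU'}


    if __name__ == '__main__':
        # './logdir' is a placeholder path; a run with several Ascend worker dirs may be
        # yielded more than once here, just as in the plugin, where the caller deduplicates
        # via the 'touched' set keyed on name + device_target.
        for run in classify_run_dirs('./logdir'):
            print(run['name'], run['device_target'])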
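
A second illustrative note: the kernel-view payload built by the `_generate_kernel_pie_gpu`/`_generate_kernel_pie_npu` methods (and the `KernelGraph` interface in `api.ts` above) now carries `device_target` next to `total`. The hedged sketch below shows how a consumer such as `Kernel.tsx` derives its defaults from that field; the `group_by` strings mirror the `KernelGroupBy` enum member names used in the diff, not necessarily the exact wire values.

    def pick_kernel_view_defaults(kernel_graph):
        """kernel_graph: the dict returned by _generate_kernel_pie_*, e.g.
        {'total': {...}, 'device_target': 'GPU'}."""
        if kernel_graph.get('device_target') == 'Ascend':
            # Ascend: default grouping KernelNameAndOpName (shown as 'Statistic' / 'All'),
            # pie chart titled 'AI Cores Utilization' with the Ascend-specific tooltip.
            return {'group_by': 'KernelNameAndOpName', 'chart_title': 'AI Cores Utilization'}
        # GPU: default grouping Kernel, pie chart titled 'Tensor Cores Utilization'.
        return {'group_by': 'Kernel', 'chart_title': 'Tensor Cores Utilization'}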