From b3f81c59442ccc7c210f3816ef410fefe760ef62 Mon Sep 17 00:00:00 2001
From: wuyulong11
Date: Tue, 18 Apr 2023 14:12:37 +0800
Subject: [PATCH] [Change description] Support Trace View and Kernel View for
 the NPU version; upstream part of the changes. [Modified by] wuyulong
 30031080
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tb_plugins/profiling/tb_plugin/fe/src/app.tsx |  2 +-
 .../tb_plugin/fe/src/components/Kernel.tsx    | 23 +++++++----
 .../fe/src/components/TooltipDescriptions.ts  |  2 +-
 tb_plugins/profiling/tb_plugin/setup.py       |  2 +-
 .../tb_plugin/torch_tb_profiler/consts.py     |  3 ++
 .../torch_tb_profiler/profiler/data.py        | 41 ++++++++++++++-----
 .../torch_tb_profiler/profiler/loader.py      | 37 ++++++++---------
 .../tb_plugin/torch_tb_profiler/utils.py      |  8 ++++
 8 files changed, 75 insertions(+), 43 deletions(-)

diff --git a/tb_plugins/profiling/tb_plugin/fe/src/app.tsx b/tb_plugins/profiling/tb_plugin/fe/src/app.tsx
index 012e39dbcc1..a42d31c85f0 100644
--- a/tb_plugins/profiling/tb_plugin/fe/src/app.tsx
+++ b/tb_plugins/profiling/tb_plugin/fe/src/app.tsx
@@ -55,7 +55,7 @@ export enum Views {
 const ViewNames = {
   [Views.Overview]: Views.Overview,
   [Views.Operator]: Views.Operator,
-  [Views.Kernel]: 'GPU Kernel',
+  [Views.Kernel]: 'Kernel',
   [Views.Trace]: Views.Trace,
   [Views.Distributed]: Views.Distributed,
   [Views.Memory]: Views.Memory,
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx
index 0ad17faebcf..ea2fa63fe49 100644
--- a/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx
+++ b/tb_plugins/profiling/tb_plugin/fe/src/components/Kernel.tsx
@@ -74,10 +74,11 @@ export const Kernel: React.FC<IProps> = (props) => {
   const [kernelTable, setKernelTable] = React.useState<Graph | undefined>(
     undefined
   )
-  const [groupBy, setGroupBy] = React.useState(KernelGroupBy.Kernel)
+  const [groupBy, setGroupBy] = React.useState(KernelGroupBy.KernelNameAndOpName)
   const [searchKernelName, setSearchKernelName] = React.useState('')
   const [searchOpName, setSearchOpName] = React.useState('')
   const [sortColumn, setSortColumn] = React.useState('')
+  const [hasStep, setHasStep] = React.useState(false);

   const [topText, actualTop, useTop, setTopText, setUseTop] = useTopN({
     defaultUseTop: UseTop.Use,
@@ -98,6 +99,10 @@ export const Kernel: React.FC<IProps> = (props) => {
     api.defaultApi.kernelTableGet(run, worker, span, groupBy).then((resp) => {
       setSortColumn(resp.metadata.sort)
       setKernelTable(resp.data)
+      const nameColumnIdx = resp.data.columns.findIndex(
+        (c) => c.name.toLowerCase() === 'step id'
+      )
+      setHasStep(nameColumnIdx > -1)
     })
   }, [run, worker, span, groupBy])

@@ -118,7 +123,7 @@ export const Kernel: React.FC<IProps> = (props) => {
   const [searchedKernelTable] = useSearch(searchKernelName, 'name', kernelTable)
   const [searchedOpTable] = useSearch(
     searchOpName,
-    'operator',
+    'step id',
     searchedKernelTable
   )

@@ -154,7 +159,7 @@ export const Kernel: React.FC<IProps> = (props) => {
   const TensorCoresTitle = React.useMemo(
     () => chartHeaderRenderer(
-        'Tensor Cores Utilization',
+        'AI Cores Utilization',
         TensorCoresPieChartTooltip
       ),
     [chartHeaderRenderer]
   )
@@ -235,10 +240,10 @@ export const Kernel: React.FC<IProps> = (props) => {
             onChange={onGroupByChanged}
           >
-            Kernel Properties + Op Name
+            Statistic
-            Kernel Name
+            All
@@ -250,20 +255,20 @@ export const Kernel: React.FC<IProps> = (props) => {
           value={searchKernelName}
           onChange={onSearchKernelChanged}
           type="search"
-          label="Search by Kernel Name"
+          label="Search by Name"
         />
-        {groupBy === KernelGroupBy.KernelNameAndOpName && (
+        {groupBy === KernelGroupBy.Kernel && hasStep &&
-        )}
+        }
diff --git a/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts b/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts
index 4fd0d92d270..f3fe25ff1a2 100644
--- a/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts
+++ b/tb_plugins/profiling/tb_plugin/fe/src/components/TooltipDescriptions.ts
@@ -21,7 +21,7 @@ export const HostTotalTimeTooltip = `The accumulated time spent on Host, includi

 export const GPUKernelTotalTimeTooltip = `The accumulated time of all calls of this kernel.`

-export const TensorCoresPieChartTooltip = `The accumulated time of all kernels using or not using Tensor Cores.`
+export const TensorCoresPieChartTooltip = `The accumulated time of all kernels using or not using AI Cores.`

 export const DistributedGpuInfoTableTooltip = `Information about GPU hardware used during the run.`

diff --git a/tb_plugins/profiling/tb_plugin/setup.py b/tb_plugins/profiling/tb_plugin/setup.py
index b32dc154a28..89033843f0c 100644
--- a/tb_plugins/profiling/tb_plugin/setup.py
+++ b/tb_plugins/profiling/tb_plugin/setup.py
@@ -63,7 +63,7 @@ class build_fe(setuptools.Command):


 setuptools.setup(
-    name="torch_tb_profiler",
+    name="torch-tb-profiler-npu",
     version=get_version(os.path.join('torch_tb_profiler', '__init__.py')),
     description="PyTorch Profiler TensorBoard Plugin",
     long_description="PyTorch Profiler TensorBoard Plugin : \
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py
index df881b05aba..f558aec48e1 100644
--- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/consts.py
@@ -11,6 +11,9 @@ WORKER_PATTERN = re.compile(r"""^(.*?) # worker name
     \.pt\.trace\.json  # the ending suffix
     (?:\.gz)?$""", re.X)  # optional .gz extension

+TRACE_PATTERN = re.compile(r"""^trace_view\.json(\.gz)?$""")
+WORKER_SPAN_PATTERN = re.compile(r"""([^\\]*_\d+)_(\d+)$""")
+
 NODE_PROCESS_PATTERN = re.compile(r"""^(.*)_(\d+)""")
 MONITOR_RUN_REFRESH_INTERNAL_IN_SECONDS = 10
 MAX_GPU_PER_NODE = 64
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py
index 3f9e296d5b7..fd553c74115 100644
--- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/data.py
@@ -40,21 +40,24 @@ class RunProfileData(object):
         self.profiler_start_ts = float('inf')
         self.events: List[BaseEvent] = []

-        trace_body = trace_json['traceEvents']
+        trace_body = trace_json.get('traceEvents', None)
         fwd_bwd_events = []
-        for data in trace_body:
-            if data.get('cat') == 'forward_backward':
-                fwd_bwd_events.append(data)
-            else:
-                event = trace.create_event(data, self.is_pytorch_lightning)
-                if event is not None:
-                    self.profiler_start_ts = min(self.profiler_start_ts, event.ts)
-                    self.events.append(event)
+        if trace_body is not None:
+            for data in trace_body:
+                if data.get('cat') == 'forward_backward':
+                    fwd_bwd_events.append(data)
+                else:
+                    event = trace.create_event(data, self.is_pytorch_lightning)
+                    if event is not None:
+                        self.profiler_start_ts = min(self.profiler_start_ts, event.ts)
+                        self.events.append(event)
         self.events.sort(key=lambda e: e.ts)
         self.forward_backward_events = trace.create_association_events(fwd_bwd_events)

         self.trace_file_path: str = None
+        self.kernel_file_path: str = None
+        self.kernel_statistic_path: str = None

         # Event Parser results
         self.tid2tree: Dict[int, OperatorNode] = None
@@ -65,6 +68,7 @@ class RunProfileData(object):
         self.comm_lib = None
         self.has_runtime: bool = False
         self.has_kernel: bool = False
+        self.has_trace: bool = False
         self.has_communication: bool = False
         self.has_memcpy_or_memset: bool = False
         self.role_ranges = None
@@ -97,11 +101,25 @@ class RunProfileData(object):
         self.recommendations = []

     @staticmethod
-    def parse(worker, span, path, cache_dir):
-        trace_path, trace_json = RunProfileData._preprocess_file(path, cache_dir)
+    def parse(run_name, worker, span_name, span, path, cache_dir):
+        trace_json = {}
+        trace_path = path
+        has_trace = False
+        for file in io.listdir(path):
+            if utils.is_trace_path(file):
+                has_trace = True
+                trace_file = io.join(path, file)
+                trace_path, trace_json = RunProfileData._preprocess_file(trace_file, cache_dir)
+                break

         profile = RunProfileData.from_json(worker, span, trace_json)
         profile.trace_file_path = trace_path
+        profile.has_trace = has_trace
+
+        for file in io.listdir(path):
+            if str(file) == 'kernel_details.csv':
+                profile.has_kernel = True
+                profile.kernel_file_path = io.join(path, file)
         return profile

     @staticmethod
@@ -139,6 +157,7 @@ class RunProfileData(object):
             json_reencode = True

         # work-around to remove the 'Record Window End' events to avoid the huge end timestamp
+        trace_json = {'traceEvents': trace_json}
         event_list = trace_json['traceEvents']
         end_index = None
         start_index = None
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py
index 424e62d7df8..8d8e93171b6 100644
--- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/profiler/loader.py
@@ -29,20 +29,17 @@ class RunLoader(object):
         workers = []
         spans_by_workers = defaultdict(list)
         for path in io.listdir(self.run_dir):
-            if io.isdir(io.join(self.run_dir, path)):
-                continue
-            match = consts.WORKER_PATTERN.match(path)
-            if not match:
-                continue
-
-            worker = match.group(1)
-            span = match.group(2)
-            if span is not None:
-                # remove the starting dot (.)
-                span = span[1:]
-                bisect.insort(spans_by_workers[worker], span)
-
-            workers.append((worker, span, path))
+            if io.isdir(io.join(self.run_dir, path)) and utils.is_worker_span_dir(path):
+                data_path = io.join(self.run_dir, path, 'ASCEND_PROFILER_OUTPUT')
+                for file in io.listdir(data_path):
+                    if utils.is_trace_path(file) or str(file) == 'kernel_details.csv':
+                        match = consts.WORKER_SPAN_PATTERN.match(path)
+                        worker = match.group(1)
+                        span = match.group(2)
+                        if span is not None:
+                            bisect.insort(spans_by_workers[worker], span)
+                        workers.append((worker, span, io.join(path, 'ASCEND_PROFILER_OUTPUT')))
+                        break

         span_index_map = {}
         for worker, span_array in spans_by_workers.items():
@@ -52,7 +49,7 @@ class RunLoader(object):
         for worker, span, path in workers:
             # convert the span timestamp to the index.
             span_index = None if span is None else span_index_map[(worker, span)]
-            p = Process(target=self._process_data, args=(worker, span_index, path))
+            p = Process(target=self._process_data, args=(worker, span, span_index, path))
             p.start()
         logger.info('started all processing')

@@ -78,14 +75,14 @@ class RunLoader(object):
         # for no daemon process, no need to join them since it will automatically join
         return run

-    def _process_data(self, worker, span, path):
+    def _process_data(self, worker, span_name, span, path):
         import absl.logging
         absl.logging.use_absl_handler()

         try:
-            logger.debug('Parse trace, run_dir=%s, worker=%s', self.run_dir, path)
+            logger.debug('Parse trace, run_dir=%s, data_dir=%s', self.run_dir, path)
             local_file = self.caches.get_remote_cache(io.join(self.run_dir, path))
-            data = RunProfileData.parse(worker, span, local_file, self.caches.cache_dir)
+            data = RunProfileData.parse(self.run_name, worker, span_name, span, local_file, self.caches.cache_dir)
             if data.trace_file_path != local_file:
                 self.caches.add_file(local_file, data.trace_file_path)

@@ -99,8 +96,8 @@ class RunLoader(object):
             logger.warning('tb_plugin receive keyboard interrupt signal, process %d will exit' % (os.getpid()))
             sys.exit(1)
         except Exception as ex:
-            logger.warning('Failed to parse profile data for Run %s on %s. Exception=%s',
-                           self.run_name, worker, ex, exc_info=True)
+            logger.warning('Failed to parse profile data for Run %s on %s_%s. Exception=%s',
+                           self.run_name, worker, span_name, ex, exc_info=True)
         self.queue.put((None, None))
         logger.debug('finishing process data')
diff --git a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py
index c40ce9befc3..392e8bd61d6 100644
--- a/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py
+++ b/tb_plugins/profiling/tb_plugin/torch_tb_profiler/utils.py
@@ -33,6 +33,14 @@ def is_chrome_trace_file(path):
     return consts.WORKER_PATTERN.match(path)


+def is_worker_span_dir(path):
+    return consts.WORKER_SPAN_PATTERN.match(path)
+
+
+def is_trace_path(path):
+    return consts.TRACE_PATTERN.match(path)
+
+
 def href(text, url):
     """"return html formatted hyperlink string
--
Gitee
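
For illustration, a minimal standalone sketch of the directory- and
file-matching convention this patch introduces. The two regexes are
copied from the consts.py hunk above; the directory name
'localhost_1234_1681798357' is hypothetical and only shows the expected
shape, a name ending in two '_<digits>' groups, of which the trailing
timestamp becomes the span and the remainder the worker name.

    import re

    # Patterns added to consts.py by this patch.
    TRACE_PATTERN = re.compile(r"""^trace_view\.json(\.gz)?$""")
    WORKER_SPAN_PATTERN = re.compile(r"""([^\\]*_\d+)_(\d+)$""")

    def match_worker_span(dir_name):
        # Split a result directory name into (worker, span); return None
        # when the name does not end with two '_<digits>' groups.
        match = WORKER_SPAN_PATTERN.match(dir_name)
        if not match:
            return None
        return match.group(1), match.group(2)

    print(match_worker_span('localhost_1234_1681798357'))
    # -> ('localhost_1234', '1681798357')
    print(bool(TRACE_PATTERN.match('trace_view.json.gz')))  # True
    print(match_worker_span('not_a_result_dir'))            # None

Directories that match are then searched for 'trace_view.json(.gz)' or
'kernel_details.csv' under their ASCEND_PROFILER_OUTPUT sub-directory,
which replaces the old '*.pt.trace.json' file-name matching.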