diff --git a/torch_npu/npu/profiler.py b/torch_npu/npu/profiler.py index d84510f320a55fbba5981a3dc23a2a4c077d7353..1c2f88a17ccddb2ca3faab7f279f2d75ec9a6058 100644 --- a/torch_npu/npu/profiler.py +++ b/torch_npu/npu/profiler.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import stat import itertools from typing import Any from collections import defaultdict, namedtuple @@ -49,6 +51,9 @@ __all__ = ["profile", "record_function", ] if not torch_npu._C._profiler_init(): raise RuntimeError("proflier initialization failed") +file_flags = os.O_WRONLY | os.O_CREAT +file_modes = stat.S_IWUSR | stat.S_IRUSR + class DeviceType(Enum): CPU = 0, CUDA = 1, # CUDA. @@ -240,8 +245,7 @@ class EventList(list): Args: path (str): Path where the trace will be written. """ - import os - with open(path, 'w') as f: + with os.fdopen(os.open(path, file_flags, file_modes), 'w') as f: chrome_events = [] next_id = 0 # Use file IO over using json.dump since JSON dumping is very slow and @@ -265,39 +269,8 @@ class EventList(list): evt.thread if not evt.is_remote else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "', - ) - ) - if self._use_cuda: - for k in evt.kernels: - # 's' and 'f' draw Flow arrows from - # the CPU launch to the GPU kernel - f.write('{"name": "%s", ' - '"ph": "s", ' - '"ts": %s, ' - '"tid": %s, ' - '"pid": "CPU functions", ' - '"id": %s, ' - '"cat": "cpu_to_cuda", ' - '"args": {}}, ' % (evt.trace_name, evt.time_range.start, - evt.thread, next_id)) - f.write('{"name": "%s", ' - '"ph": "f", ' - '"ts": %s, ' - '"tid": %s, ' - '"pid": "CUDA functions", ' - '"id": %s, ' - '"cat": "cpu_to_cuda", ' - '"args": {}}, ' % (k.name, k.interval.start, k.device, next_id)) - f.write('{"name": "%s", ' - '"ph": "X", ' - '"ts": %s, ' - '"dur": %s, ' - '"tid": %s, ' - '"pid": "CUDA functions", ' - '"args": {}}, ' % (k.name, k.interval.start, - k.interval.elapsed_us(), k.device)) - next_id += 1 - elif self._use_npu: + )) + if self._use_npu: for k in evt.kernels: # 's' and 'f' draw Flow arrows from # the CPU launch to the GPU kernel @@ -342,7 +315,7 @@ class EventList(list): if metric not in self.supported_export_stacks_metrics(): raise ValueError("metric should be one of: " + str(self.supported_export_stacks_metrics())) translate_table = str.maketrans(" ;\t\n", "____") - with open(path, 'w') as f: + with os.fdopen(os.open(path, file_flags, file_modes), 'w') as f: for evt in self: if evt.stack and len(evt.stack) > 0: metric_value = getattr(evt, metric) @@ -536,7 +509,7 @@ class profile(object): def __enter__(self): if not self.enabled: - return + raise RuntimeError("profiler is not enable") if self.entered: raise RuntimeError("profiler context manager is not reentrant") self.entered = True @@ -550,7 +523,7 @@ class profile(object): def __exit__(self, exc_type, exc_val, exc_tb): if not self.enabled: - return + raise RuntimeError("profiler is not enable") records = torch_npu._C._profiler._disable_profiler_legacy() parsed_results = parse_legacy_records(records) self.function_events = EventList( @@ -588,7 +561,7 @@ class profile(object): def export_chrome_trace(self, path): self._check_finish() if self.kineto_results is not None: - self.kineto_results.save(path) + raise RuntimeError("not support for kineto.") else: assert self.function_events is not None return self.function_events.export_chrome_trace(path) @@ -963,7 +936,8 @@ class FunctionEvent(FormattedTimesMixin): return ( ''.format( + 'cpu_memory_usage={} cuda_memory_usage={} npu_memory_usage={} is_async={} \ + is_remote={} seq_nr={} is_legacy={}>'.format( self.id, self.name, self.device_type, @@ -1142,115 +1116,6 @@ def rewrite_name(name, with_wildcard=False): name = "ProfilerStep*" return name -# Parsing of kineto profiler events -def parse_kineto_results(result): - # result.events() has most of the events - PyTorch op-level and device-level events - # result.legacy_events() has events not yet ported to kineto - # (e.g. start/stop marks, tensor memory allocator events) - - # First, find __start_profile mark to get the absolute time of the start of the trace; - # save memory allocation records - start_record = None - mem_records = [] - for record in itertools.chain(*result.legacy_events()): - if record.kind() == 'mark' and record.name() == '__start_profile': - assert start_record is None - start_record = record - if record.kind() == 'memory_alloc': - mem_records.append([record, False]) - assert start_record is not None, "Invalid profiler output, __start_profile is missing" - - # Create and return FunctionEvent list - function_events = [] - cuda_corr_map: Dict[int, List[FunctionEvent]] = {} - for kineto_event in result.events(): - if filter_name(kineto_event.name()): - continue - rel_start_us = kineto_event.start_us() - start_record.start_us() - rel_end_us = rel_start_us + kineto_event.duration_us() - abs_end_us = kineto_event.start_us() + kineto_event.duration_us() - - cpu_memory_usage = 0 - cuda_memory_usage = 0 - if kineto_event.device_type() == DeviceType.CPU: - # find the corresponding memory allocation events - for mem_record in mem_records: - if (mem_record[0].start_us() >= kineto_event.start_us() and - mem_record[0].start_us() <= abs_end_us): - cpu_memory_usage += mem_record[0].cpu_memory_usage() - cuda_memory_usage += mem_record[0].cuda_memory_usage() - mem_record[1] = True - - is_async = kineto_event.start_thread_id() != kineto_event.end_thread_id() - fe = FunctionEvent( - id_event=kineto_event.correlation_id(), - name=rewrite_name(name=kineto_event.name(), with_wildcard=True), - trace_name=rewrite_name(name=kineto_event.name(), with_wildcard=False), - thread=kineto_event.start_thread_id(), - start_us=rel_start_us, - end_us=rel_end_us, - fwd_thread=kineto_event.fwd_thread_id(), - input_shapes=kineto_event.shapes(), - stack=[entry for entry in kineto_event.stack() if filter_stack_entry(entry)], - scope=kineto_event.scope(), - cpu_memory_usage=cpu_memory_usage, - cuda_memory_usage=cuda_memory_usage, - is_async=is_async, - sequence_nr=kineto_event.sequence_nr(), - device_type=kineto_event.device_type(), - device_index=kineto_event.device_index(), - flops=kineto_event.flops(), - ) - function_events.append(fe) - corr_id = kineto_event.linked_correlation_id() - if corr_id > 0: - if corr_id not in cuda_corr_map: - cuda_corr_map[corr_id] = [] - cuda_corr_map[corr_id].append(fe) - - # associate CUDA kernels and CUDA runtime (CPU) with CPU events - for fe in function_events: - if (fe.device_type == DeviceType.CPU and not fe.is_async and - fe.id in cuda_corr_map): - for f_evt in cuda_corr_map[fe.id]: - if f_evt.device_type == DeviceType.CUDA: - fe.append_kernel( - f_evt.name, - f_evt.device_index, - f_evt.time_range.start, - f_evt.time_range.end) - elif f_evt.device_type == DeviceType.CPU: - # make sure that 'thread' of a CPU Kineto (e.g. CUDA Runtime) event is associated - # with the 'thread' of the corresponding linked PyTorch event to properly track - # parents and children - f_evt.thread = fe.thread - - - # output top-level memory events - for mem_record in mem_records: - if not mem_record[1]: - fe = FunctionEvent( - id_event=mem_record[0].handle(), - name="[memory]", - trace_name=None, # not outputting in the trace - thread=mem_record[0].thread_id(), - start_us=mem_record[0].start_us(), - end_us=mem_record[0].start_us(), # no duration - fwd_thread=mem_record[0].fwd_thread_id(), - input_shapes=[], - stack=[], - scope=mem_record[0].scope(), - cpu_memory_usage=mem_record[0].cpu_memory_usage(), - cuda_memory_usage=mem_record[0].cuda_memory_usage(), - is_async=False, - sequence_nr=-1, - device_type=DeviceType.CPU, - device_index=0, - ) - function_events.append(fe) - - function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) - return function_events # Parsing of legacy profiler events def parse_legacy_records(thread_records): @@ -1277,25 +1142,17 @@ def parse_legacy_records(thread_records): def adjusted_time(device_record, device_records_map): assert device_record.device() != -1 assert start_record is not None - if device_record.has_cuda(): - cuda_time_0 = device_records_map[(device_record.node_id(), device_record.device())] - return cuda_time_0.cuda_elapsed_us(device_record) + start_record.cpu_elapsed_us(cuda_time_0) - elif device_record.has_npu(): + if device_record.has_npu(): npu_time_0 = device_records_map[(device_record.node_id(), device_record.device())] - return npu_time_0.npu_elapsed_us(device_record) + start_record.cpu_elapsed_us(npu_time_0) + return npu_time_0.npu_elapsed_us(device_record) + start_record.cpu_elapsed_us(npu_time_0) + else: + raise RuntimeError("device_record only support for npu") # '__start_profile' is not guaranteed to be first, so we must find it here for record in itertools.chain(*thread_records): name = record.name() if start_record is None and name == '__start_profile': start_record = record - elif '__cuda_start_event' in name: - # N.B.: Each CUDA device has its own __cuda_start_event. - assert record.device() != -1 - # key for cuda_records is (node_id, device) in case of multiple nodes - # having the same device - cuda_records[(record.node_id(), record.device())] = record - profiler_type = DeviceType.CUDA elif '__npu_start_event' in name: # N.B.: Each NPU device has its own __npu_start_event. assert record.device() != -1 @@ -1315,122 +1172,120 @@ def parse_legacy_records(thread_records): range_starts = {} filtered_handles = set() - prev_record = None - for record in thread_record_list: - record_key = get_record_key(record) - if (filter_name(record.name()) or - record_key in filtered_handles): - filtered_handles.add(record_key) - continue - - if record.kind() == 'push': - # workaround to reduce double logging from operator - # wrappers and redispatch - if prev_record is not None: - duplicate = ( - prev_record.name() == record.name() - and prev_record.kind() == record.kind() - and prev_record.node_id() == record.node_id() - ) - if duplicate: - filtered_handles.add(record_key) - continue - - range_starts[record_key] = record - cpu_memory_allocs[record_key] = 0 - cuda_memory_allocs[record_key] = 0 - npu_memory_allocs[record_key] = 0 - elif record.kind() == 'pop': - assert ( - record_key in range_starts - ), """Expected record with key {} to exist in range_starts. - This means that the pop event did not have a corresponding push.""".format( - record_key - ) - - start = range_starts[record_key] - - cpu_memory_usage = cpu_memory_allocs[record_key] - cuda_memory_usage = cuda_memory_allocs[record_key] - npu_memory_usage = npu_memory_allocs[record_key] - is_async = start.thread_id() != record.thread_id() - is_remote_event = record.is_remote() - start_flops = start.flops() - - fe = FunctionEvent( - id_event=record.handle(), - node_id=record.node_id(), - name=rewrite_name(name=start.name(), with_wildcard=True), - trace_name=rewrite_name(name=start.name(), with_wildcard=False), - thread=start.thread_id(), - start_us=start_record.cpu_elapsed_us(start), - end_us=start_record.cpu_elapsed_us(record), - fwd_thread=start.fwd_thread_id(), - input_shapes=start.shapes(), - stack=[entry for entry in start.stack() if filter_stack_entry(entry)], - scope=start.scope(), - cpu_memory_usage=cpu_memory_usage, - cuda_memory_usage=cuda_memory_usage, - npu_memory_usage=npu_memory_usage, - is_async=is_async, - is_remote=is_remote_event, - sequence_nr=start.sequence_nr(), - device_type=DeviceType.CPU, - is_legacy=True, - flops=start_flops, - ) - # note: async events have only cpu total time - if not is_async and start.has_cuda(): - cuda_start = adjusted_time(start, cuda_records) - cuda_end = adjusted_time(record, cuda_records) - if (cuda_end - cuda_start) > 0: - fe.append_kernel( - start.name(), - start.device(), - cuda_start, - cuda_end) - elif not is_async and start.has_npu(): - npu_start = adjusted_time(start, npu_records) - npu_end = adjusted_time(record, npu_records) - if (npu_end - npu_start) > 0: - fe.append_kernel( - start.name(), - start.device(), - npu_start, - npu_end) - functions.append(fe) - del range_starts[record_key] - del cpu_memory_allocs[record_key] - del cuda_memory_allocs[record_key] - del npu_memory_allocs[record_key] - elif record.kind() == 'memory_alloc': - num_open_handles_cpu = len(cpu_memory_allocs) - num_open_handles_cuda = len(cuda_memory_allocs) - assert num_open_handles_cpu == num_open_handles_cuda - for handle in cpu_memory_allocs.keys(): - cpu_memory_allocs[handle] += record.cpu_memory_usage() - for handle in cuda_memory_allocs.keys(): - cuda_memory_allocs[handle] += record.cuda_memory_usage() - for handle in npu_memory_allocs.keys(): - npu_memory_allocs[handle] += record.npu_memory_usage() - if num_open_handles_cpu == 0: - # output event as a top-level memory event - fe = FunctionEvent( - id_event=0, - name="[memory]", - trace_name=None, - thread=0, - start_us=0, - end_us=0, - stack=[], - cpu_memory_usage=record.cpu_memory_usage(), - cuda_memory_usage=record.cuda_memory_usage(), - npu_memory_usage=record.npu_memory_usage(), - is_legacy=True, - ) - functions.append(fe) - prev_record = record + def process_record(functions): + prev_record = None + for record in thread_record_list: + record_key = get_record_key(record) + if (filter_name(record.name()) or + record_key in filtered_handles): + filtered_handles.add(record_key) + continue + if record.kind() == 'push': + def process_push_record(prev_record): + # workaround to reduce double logging from operator + # wrappers and redispatch + if prev_record is not None: + duplicate = ( + prev_record.name() == record.name() + and prev_record.kind() == record.kind() + and prev_record.node_id() == record.node_id() + ) + if duplicate: + filtered_handles.add(record_key) + return + + range_starts[record_key] = record + cpu_memory_allocs[record_key] = 0 + cuda_memory_allocs[record_key] = 0 + npu_memory_allocs[record_key] = 0 + process_push_record(prev_record) + elif record.kind() == 'pop': + def process_op_record(): + assert ( + record_key in range_starts + ), """Expected record with key {} to exist in range_starts. + This means that the pop event did not have a corresponding push.""".format( + record_key + ) + start = range_starts[record_key] + + cpu_memory_usage = cpu_memory_allocs[record_key] + cuda_memory_usage = cuda_memory_allocs[record_key] + npu_memory_usage = npu_memory_allocs[record_key] + is_async = start.thread_id() != record.thread_id() + is_remote_event = record.is_remote() + start_flops = start.flops() + + fe = FunctionEvent( + id_event=record.handle(), + node_id=record.node_id(), + name=rewrite_name(name=start.name(), with_wildcard=True), + trace_name=rewrite_name(name=start.name(), with_wildcard=False), + thread=start.thread_id(), + start_us=start_record.cpu_elapsed_us(start), + end_us=start_record.cpu_elapsed_us(record), + fwd_thread=start.fwd_thread_id(), + input_shapes=start.shapes(), + stack=[entry for entry in start.stack() if filter_stack_entry(entry)], + scope=start.scope(), + cpu_memory_usage=cpu_memory_usage, + cuda_memory_usage=cuda_memory_usage, + npu_memory_usage=npu_memory_usage, + is_async=is_async, + is_remote=is_remote_event, + sequence_nr=start.sequence_nr(), + device_type=DeviceType.CPU, + is_legacy=True, + flops=start_flops, + ) + # note: async events have only cpu total time + if not is_async and start.has_npu(): + npu_start = adjusted_time(start, npu_records) + npu_end = adjusted_time(record, npu_records) + if (npu_end - npu_start) > 0: + fe.append_kernel( + start.name(), + start.device(), + npu_start, + npu_end) + functions.append(fe) + del range_starts[record_key] + del cpu_memory_allocs[record_key] + del cuda_memory_allocs[record_key] + del npu_memory_allocs[record_key] + process_op_record() + elif record.kind() == 'memory_alloc': + def process_memory_alloc_record(): + num_open_handles_cpu = len(cpu_memory_allocs) + num_open_handles_cuda = len(cuda_memory_allocs) + assert num_open_handles_cpu == num_open_handles_cuda + for handle in cpu_memory_allocs.keys(): + cpu_memory_allocs[handle] += record.cpu_memory_usage() + for handle in cuda_memory_allocs.keys(): + cuda_memory_allocs[handle] += record.cuda_memory_usage() + for handle in npu_memory_allocs.keys(): + npu_memory_allocs[handle] += record.npu_memory_usage() + if num_open_handles_cpu == 0: + # output event as a top-level memory event + fe = FunctionEvent( + id_event=0, + name="[memory]", + trace_name=None, + thread=0, + start_us=0, + end_us=0, + stack=[], + cpu_memory_usage=record.cpu_memory_usage(), + cuda_memory_usage=record.cuda_memory_usage(), + npu_memory_usage=record.npu_memory_usage(), + is_legacy=True, + ) + functions.append(fe) + process_memory_alloc_record() + prev_record = record + return functions + functions = process_record(functions) # Sort functions by start time then by end time ascending. # This ensures that--in the case of nested events which # have the same start time (which may happen due to the @@ -1477,9 +1332,9 @@ def build_table( """Prints a summary of events (which can be a list of FunctionEvent or FunctionEventAvg).""" if len(events) == 0: return "" - - has_cuda_time = any([event.self_cuda_time_total > 0 for event in events]) and use_cuda - has_cuda_mem = any([event.self_cuda_memory_usage > 0 for event in events]) and use_cuda + if use_cuda: + raise RuntimeError("not support for CUDA.") + has_cuda_time = False has_npu_time = any([event.self_npu_time_total > 0 for event in events]) and use_npu has_npu_mem = any([event.self_npu_memory_usage > 0 for event in events]) and use_npu has_input_shapes = any( @@ -1489,72 +1344,52 @@ def build_table( events = EventList(sorted( events, key=lambda evt: getattr(evt, sort_by), reverse=True ), use_cuda=has_cuda_time, use_npu=has_npu_time, profile_memory=profile_memory, with_flops=with_flops) + def get_column_width(): + MAX_NAME_COLUMN_WIDTH = 55 + name_column_width = max([len(evt.key) for evt in events]) + 4 + name_column_width = min(name_column_width, MAX_NAME_COLUMN_WIDTH) - MAX_NAME_COLUMN_WIDTH = 55 - name_column_width = max([len(evt.key) for evt in events]) + 4 - name_column_width = min(name_column_width, MAX_NAME_COLUMN_WIDTH) - - DEFAULT_COLUMN_WIDTH = 12 - shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4 - shapes_column_width = min(shapes_column_width, 45) - - flops_column_width = DEFAULT_COLUMN_WIDTH + DEFAULT_COLUMN_WIDTH = 12 + shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4 + shapes_column_width = min(shapes_column_width, 45) + MAX_NAME_COLUMN_WIDTH, DEFAULT_COLUMN_WIDTH, name_column_width, shapes_column_width = get_column_width() src_column_width = None - stacks = [] - for evt in events: - if evt.stack is not None and len(evt.stack) > 0: - stacks.append(evt.stack) - has_stack = len(stacks) > 0 - if has_stack: - src_column_width = max([max([len(entry) for entry in stack]) for stack in stacks]) + 4 - src_column_width = min(src_column_width, max_src_column_width) - - headers = [ - 'Name', - 'Self CPU %', - 'Self CPU', - 'CPU total %', - 'CPU total', - 'CPU time avg', - ] - if has_cuda_time: - headers.extend([ - 'Self CUDA', - 'Self CUDA %', - 'CUDA total', - 'CUDA time avg', - ]) - if has_npu_time: - headers.extend([ - 'Self NPU', - 'Self NPU %', - 'NPU total', - 'NPU time avg', - ]) - if profile_memory: - headers.extend([ - 'CPU Mem', - 'Self CPU Mem', - ]) - if has_cuda_mem: + def get_headers(): + headers = [ + 'Name', + 'Self CPU %', + 'Self CPU', + 'CPU total %', + 'CPU total', + 'CPU time avg', + ] + if has_npu_time: headers.extend([ - 'CUDA Mem', - 'Self CUDA Mem', + 'Self NPU', + 'Self NPU %', + 'NPU total', + 'NPU time avg', ]) - if has_npu_mem: + if profile_memory: headers.extend([ - 'NPU Mem', - 'Self NPU Mem', + 'CPU Mem', + 'Self CPU Mem', ]) - headers.append( - '# of Calls' - ) - # Only append Node ID if any event has a valid (>= 0) Node ID - append_node_id = any([evt.node_id != -1 for evt in events]) - if append_node_id: - headers.append('Node ID') - + if has_npu_mem: + headers.extend([ + 'NPU Mem', + 'Self NPU Mem', + ]) + headers.append( + '# of Calls' + ) + # Only append Node ID if any event has a valid (>= 0) Node ID + append_node_id = any([evt.node_id != -1 for evt in events]) + if append_node_id: + headers.append('Node ID') + return headers, append_node_id + headers, append_node_id = get_headers() # Have to use a list because nonlocal is Py3 only... SPACING_SIZE = 2 row_format_lst = [""] @@ -1589,28 +1424,6 @@ def build_table( headers.append('Input Shapes') add_column(shapes_column_width) - if has_stack: - headers.append('Source Location') - add_column(src_column_width, text_dir='<') - - if with_flops: - # Auto-scaling of flops header - US_IN_SECOND = 1000.0 * 1000.0 # cpu_time_total is in us - raw_flops = [] - for evt in events: - if evt.flops > 0: - if evt.cuda_time_total != 0: - evt.flops = float(evt.flops) / evt.cuda_time_total * US_IN_SECOND - else: - evt.flops = float(evt.flops) / evt.cpu_time_total * US_IN_SECOND - raw_flops.append(evt.flops) - if len(raw_flops) != 0: - (flops_scale, flops_header) = auto_scale_flops(min(raw_flops)) - headers.append(flops_header) - add_column(flops_column_width) - else: - with_flops = False # can't find any valid flops - row_format = row_format_lst[0] header_sep = header_sep_lst[0] line_length = line_length_lst[0] @@ -1622,22 +1435,23 @@ def build_table( def append(s): result.append(s) result.append('\n') # Yes, newline after the end as well - - sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events]) - sum_self_cuda_time_total = 0 - sum_self_npu_time_total = 0 - for evt in events: - if evt.device_type == DeviceType.CPU: - # in legacy profiler, kernel info is stored in cpu events - if evt.is_legacy: - sum_self_cuda_time_total += evt.self_cuda_time_total + def get_time_total(): + sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events]) + sum_self_cuda_time_total = 0 + sum_self_npu_time_total = 0 + for evt in events: + if evt.device_type == DeviceType.CPU: + # in legacy profiler, kernel info is stored in cpu events + if evt.is_legacy: + sum_self_cuda_time_total += evt.self_cuda_time_total + sum_self_npu_time_total += evt.self_npu_time_total + elif evt.device_type == DeviceType.NPU: + # in kineto profiler, there're events with the correct device type (e.g. CUDA) sum_self_npu_time_total += evt.self_npu_time_total - elif evt.device_type == DeviceType.CUDA: - # in kineto profiler, there're events with the correct device type (e.g. CUDA) - sum_self_cuda_time_total += evt.self_cuda_time_total - elif evt.device_type == DeviceType.NPU: - # in kineto profiler, there're events with the correct device type (e.g. CUDA) - sum_self_npu_time_total += evt.self_npu_time_total + else: + raise RuntimeError("only support for NPU or CPU") + return sum_self_cpu_time_total, sum_self_npu_time_total + sum_self_cpu_time_total, sum_self_npu_time_total = get_time_total() # Actual printing if header is not None: @@ -1651,104 +1465,62 @@ def build_table( append(header_sep) - def trim_path(path, src_column_width): - if len(path) > src_column_width: - offset = len(path) - src_column_width - path = path[offset:] - if len(path) > 3: - path = "..." + path[3:] - return path - event_limit = 0 - for evt in events: - if event_limit == row_limit: - break - if top_level_events_only and evt.cpu_parent is not None: - continue - else: - event_limit += 1 - name = evt.key - if len(name) >= MAX_NAME_COLUMN_WIDTH - 3: - name = name[:(MAX_NAME_COLUMN_WIDTH - 3)] + "..." - row_values = [ - name, - # Self CPU total %, 0 for async events. - format_time_share(evt.self_cpu_time_total, - sum_self_cpu_time_total), - evt.self_cpu_time_total_str, # Self CPU total - # CPU total %, 0 for async events. - format_time_share(evt.cpu_time_total, sum_self_cpu_time_total) if not evt.is_async else 0, - evt.cpu_time_total_str, # CPU total - evt.cpu_time_str, # CPU time avg - ] - if has_cuda_time: - row_values.extend([ - evt.self_cuda_time_total_str, - # CUDA time total % - format_time_share(evt.self_cuda_time_total, sum_self_cuda_time_total), - evt.cuda_time_total_str, - evt.cuda_time_str, # Cuda time avg - ]) - if has_npu_time: - row_values.extend([ - evt.self_npu_time_total_str, - # NPU time total % - format_time_share(evt.self_npu_time_total, sum_self_npu_time_total), - evt.npu_time_total_str, - evt.npu_time_str, # Npu time avg - ]) - if profile_memory: - row_values.extend([ - # CPU Mem Total - format_memory(evt.cpu_memory_usage), - # Self CPU Mem Total - format_memory(evt.self_cpu_memory_usage), - ]) - if has_cuda_mem: + def get_result(): + for evt in events: + if event_limit == row_limit: + break + if top_level_events_only and evt.cpu_parent is not None: + continue + else: + event_limit += 1 + name = evt.key + if len(name) >= MAX_NAME_COLUMN_WIDTH - 3: + name = name[:(MAX_NAME_COLUMN_WIDTH - 3)] + "..." + row_values = [ + name, + # Self CPU total %, 0 for async events. + format_time_share(evt.self_cpu_time_total, + sum_self_cpu_time_total), + evt.self_cpu_time_total_str, # Self CPU total + # CPU total %, 0 for async events. + format_time_share(evt.cpu_time_total, sum_self_cpu_time_total) if not evt.is_async else 0, + evt.cpu_time_total_str, # CPU total + evt.cpu_time_str, # CPU time avg + ] + if has_npu_time: row_values.extend([ - # CUDA Mem Total - format_memory(evt.cuda_memory_usage), - # Self CUDA Mem Total - format_memory(evt.self_cuda_memory_usage), + evt.self_npu_time_total_str, + # NPU time total % + format_time_share(evt.self_npu_time_total, sum_self_npu_time_total), + evt.npu_time_total_str, + evt.npu_time_str, # Npu time avg ]) - if has_npu_mem: + if profile_memory: row_values.extend([ - # NPU Mem Total - format_memory(evt.npu_memory_usage), - # Self NPU Mem Total - format_memory(evt.self_npu_memory_usage), + # CPU Mem Total + format_memory(evt.cpu_memory_usage), + # Self CPU Mem Total + format_memory(evt.self_cpu_memory_usage), ]) - row_values.append( - evt.count, # Number of calls - ) - - if append_node_id: - row_values.append(evt.node_id) - if has_input_shapes: - row_values.append(str(evt.input_shapes)[:shapes_column_width]) - if with_flops: - if evt.flops <= 0.0: - row_values.append("--") - else: - row_values.append('{0:8.3f}'.format(evt.flops * flops_scale)) - if has_stack: - src_field = "" - if len(evt.stack) > 0: - src_field = trim_path(evt.stack[0], src_column_width) - row_values.append(src_field) - append(row_format.format(*row_values)) - - if has_stack: - empty_headers = [""] * (len(headers) - 1) - for entry in evt.stack[1:MAX_STACK_ENTRY]: - append(row_format.format(*(empty_headers + [trim_path(entry, src_column_width)]))) - empty_headers.append("") - append(row_format.format(*empty_headers)) + if has_npu_mem: + row_values.extend([ + # NPU Mem Total + format_memory(evt.npu_memory_usage), + # Self NPU Mem Total + format_memory(evt.self_npu_memory_usage), + ]) + row_values.append( + evt.count, # Number of calls + ) + if append_node_id: + row_values.append(evt.node_id) + if has_input_shapes: + row_values.append(str(evt.input_shapes)[:shapes_column_width]) + get_result() append(header_sep) append("Self CPU time total: {}".format(format_time(sum_self_cpu_time_total))) - if has_cuda_time: - append("Self CUDA time total: {}".format(format_time(sum_self_cuda_time_total))) if has_npu_time: append("Self NPU time total: {}".format(format_time(sum_self_npu_time_total))) return ''.join(result)