From 3467a8babe572cd8ed3a6f1a8b0e74e40c4d4103 Mon Sep 17 00:00:00 2001
From: l30036321
Date: Wed, 3 Apr 2024 16:30:29 +0800
Subject: [PATCH 1/2] Performance Optimization

---
 .../src/python/ptdbg_ascend/dump/dump.py      | 111 ++++++++++++++----
 1 file changed, 87 insertions(+), 24 deletions(-)

diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 2e49a9743b..54f91e3772 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -74,9 +74,38 @@ class APIList(list):
             self.pkl_mode_changed = True
         self.clear()
 
+    def process_data(self, data, chunks):
+        data_stack = torch._C._VariableFunctionsClass.stack(
+            [summary[1] for summary in data], dim=0).cpu()
+        ubind_data = torch._C._VariableFunctionsClass.unbind(data_stack, dim=0)
+        for i, tensor_stat in enumerate(ubind_data):
+            index = data[i][0]
+            stat_data = [
+                stat.item() for stat in torch._C._VariableFunctionsClass.chunk(tensor_stat, chunks=chunks, dim=0)
+            ]
+            self[index][5] = stat_data
+
+    def process(self):
+        float_data = []
+        not_float_data = []
+        for index, data in enumerate(self):
+            if len(data) != Const.SUMMARY_COLUMN_NUM:
+                continue
+            if isinstance(data[5], torch.Tensor) and data[5].device.type != "cpu":
+                if data[5].numel() == 3:
+                    not_float_data.append((index, data[5]))
+                if data[5].numel() == 4:
+                    float_data.append((index, data[5]))
+
+        if float_data:
+            self.process_data(float_data, 4)
+        if not_float_data:
+            self.process_data(not_float_data, 3)
+
     def append(self, data):
         list.append(self, data)
         if len(self) >= APIList.threshold:
+            self.process()
             self.flush()
 
 
@@ -101,16 +130,30 @@ def get_not_float_tensor_info(data):
         tensor_max = []
         tensor_min = []
         tensor_mean = []
+        tensor_stat = [tensor_max, tensor_min, tensor_mean, CompareConst.NAN]
     elif len(data.shape) == 0:
-        item = data.float().item()
-        tensor_max = item
-        tensor_min = item
-        tensor_mean = item
+        if data.device.type == "cpu":
+            item = data.item()
+            tensor_stat = [item, item, item, CompareConst.NAN]
+        else:
+            item = torch._C._VariableFunctionsClass.unsqueeze(data, 0)
+            tensor_stat = torch._C._VariableFunctionsClass.cat([item, item, item], dim=0)
     else:
-        tensor_max = torch._C._VariableFunctionsClass.max(data).float().item()
-        tensor_min = torch._C._VariableFunctionsClass.min(data).float().item()
-        tensor_mean = torch._C._VariableFunctionsClass.mean(data.float()).float().item()
-    return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean, CompareConst.NAN)
+        if data.device.type == "cpu":
+            tensor_max = torch._C._VariableFunctionsClass.max(data).float().item()
+            tensor_min = torch._C._VariableFunctionsClass.min(data).float().item()
+            tensor_mean = torch._C._VariableFunctionsClass.mean(data.float()).float().item()
+            tensor_stat = [tensor_max, tensor_min, tensor_mean, CompareConst.NAN]
+        else:
+            tensor_max = torch._C._VariableFunctionsClass.unsqueeze(
+                torch._C._VariableFunctionsClass.max(data), 0)
+            tensor_min = torch._C._VariableFunctionsClass.unsqueeze(
+                torch._C._VariableFunctionsClass.min(data), 0)
+            tensor_mean = torch._C._VariableFunctionsClass.unsqueeze(
+                torch._C._VariableFunctionsClass.mean(data.float()), 0)
+            tensor_stat = torch._C._VariableFunctionsClass.cat(
+                [tensor_max, tensor_min, tensor_mean], dim=0)
+    return get_tensor_data_info(data, tensor_stat)
 
 
 def get_scalar_data_info(data):
@@ -121,24 +164,37 @@ def get_float_tensor_info(data):
     if DumpUtil.summary_mode == "md5":
         return DataInfo([], [], str(data.dtype), tuple(data.shape), get_md5_for_tensor(data))
-    tensor_max = torch._C._VariableFunctionsClass.max(data).float().item()
-    tensor_min = torch._C._VariableFunctionsClass.min(data).float().item()
-    tensor_mean = torch._C._VariableFunctionsClass.mean(data).float().item()
-    tensor_norm = torch._C._VariableFunctionsClass.norm(data).float().item()
-    return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean, tensor_norm)
+
+    if data.device.type == "cpu":
+        tensor_max = torch._C._VariableFunctionsClass.max(data).item()
+        tensor_min = torch._C._VariableFunctionsClass.min(data).item()
+        tensor_mean = torch._C._VariableFunctionsClass.mean(data).item()
+        tensor_norm = torch._C._VariableFunctionsClass.norm(data).item()
+        tensor_stat = [tensor_max, tensor_min, tensor_mean, tensor_norm]
+    else:
+        tensor_max = torch._C._VariableFunctionsClass.unsqueeze(
+            torch._C._VariableFunctionsClass.max(data), 0)
+        tensor_min = torch._C._VariableFunctionsClass.unsqueeze(
+            torch._C._VariableFunctionsClass.min(data), 0)
+        tensor_mean = torch._C._VariableFunctionsClass.unsqueeze(
+            torch._C._VariableFunctionsClass.mean(data), 0)
+        tensor_norm = torch._C._VariableFunctionsClass.unsqueeze(
+            torch._C._VariableFunctionsClass.norm(data), 0)
+        tensor_stat = torch._C._VariableFunctionsClass.cat(
+            [tensor_max, tensor_min, tensor_mean, tensor_norm], dim=0)
+    return get_tensor_data_info(data, tensor_stat)
 
-def get_tensor_data_info(data, *tensor_args):
-    summary_data = []
-    summary_data.extend([*tensor_args])
+
+def get_tensor_data_info(data, tensor_stat):
     if DumpUtil.summary_mode == "all":
         saved_tensor = data.contiguous().cpu().detach()
         if data.dtype == torch.bfloat16:
             saved_numpy = saved_tensor.to(torch.float32).numpy()
         else:
             saved_numpy = saved_tensor.numpy()
-        return DataInfo(saved_numpy, summary_data, str(data.dtype), tuple(data.shape))
-    return DataInfo([], summary_data, str(data.dtype), tuple(data.shape))
+        return DataInfo(saved_numpy, tensor_stat, str(data.dtype), tuple(data.shape))
+    return DataInfo([], tensor_stat, str(data.dtype), tuple(data.shape))
 
 
 def dump_tensor(x, prefix, dump_step):
@@ -199,17 +255,23 @@ def dump_data_by_rank_count(dump_step, prefix, data_info):
 
 
 def dump_stack_info(name_template):
+    if Const.BACKWARD in name_template:
+        return
     if check_inplace_op(name_template) and Const.PRE_FORWARD in name_template:
         return
     stack_str = []
     try:
-        for (_, path, line, func, code, _) in inspect.stack()[4:]:
-            if code:
-                stack_line = [path, str(line), func, code[0].strip() if code else code]
-            else:
-                stack_line = [path, str(line), func, code]
-            stack_str.append(stack_line)
+        current_frame = inspect.currentframe()
+        for _ in range(6):
+            current_frame = current_frame.f_back
+
+        while current_frame:
+            frame_info = inspect.getframeinfo(current_frame)
+            path, line, func, code = frame_info.filename, frame_info.lineno, frame_info.function, frame_info.code_context
+            code_str = code[0].strip() if code else code
+            stack_str.append([path, line, func, code_str])
+            current_frame = current_frame.f_back
     except Exception as e:
         print_warn_log("Dump stack info failed, error: {}".format(e))
         stack_str.append('')
@@ -442,6 +504,7 @@ def acc_cmp_dump(name, **kwargs):
 
 
 def write_to_disk():
+    api_list.process()
     api_list.flush()
 
 
-- 
Gitee

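[Editorial note, not part of the patch series]
PATCH 1/2 speeds up summary collection by deferring the per-statistic .item() calls for
tensors that live on an accelerator: get_float_tensor_info and get_not_float_tensor_info now
return the max/min/mean(/norm) values as a single on-device tensor, and APIList.process()
later stacks all pending stat tensors, moves them to the host with one .cpu() call, and only
then converts them to Python numbers. The sketch below illustrates the idea with the public
torch API; the helper names are invented for illustration, and the patch itself calls
torch._C._VariableFunctionsClass directly, presumably so the dump hooks installed on the
public torch functions are not re-entered.

    # Illustrative sketch only; collect_stats/flush_stats do not exist in ptdbg_ascend.
    import torch

    def collect_stats(tensors):
        # Keep each tensor's statistics on its own device; no host sync yet.
        rows = []
        for t in tensors:
            tf = t.float()
            rows.append(torch.stack([tf.max(), tf.min(), tf.mean()]))
        return rows

    def flush_stats(pending):
        # One stack and one device-to-host copy for the whole batch, instead of a
        # blocking .item() call per statistic per API invocation.
        batch = torch.stack(pending, dim=0).cpu()
        return [[s.item() for s in torch.unbind(row)] for row in torch.unbind(batch, dim=0)]

In this patch the float path carries four statistics (max, min, mean, norm) while the
non-float path carries three, which is why process() still groups pending entries by
numel() before stacking them.
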
From 9ff4acee7a9271f034ccff9ec71eb57ec7e510dd Mon Sep 17 00:00:00 2001
From: l30036321
Date: Mon, 8 Apr 2024 10:00:13 +0800
Subject: [PATCH 2/2] fix ut

---
 .../src/python/ptdbg_ascend/dump/dump.py      | 97 ++++++++-----------
 .../ptdbg_ascend/test/ut/test_dump.py         |  3 +-
 2 files changed, 44 insertions(+), 56 deletions(-)

diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 54f91e3772..7812453e93 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -74,33 +74,27 @@ class APIList(list):
             self.pkl_mode_changed = True
         self.clear()
 
-    def process_data(self, data, chunks):
-        data_stack = torch._C._VariableFunctionsClass.stack(
-            [summary[1] for summary in data], dim=0).cpu()
+    def process_data(self, data):
+        data_stack = torch._C._VariableFunctionsClass.stack([summary[1] for summary in data], dim=0).cpu()
         ubind_data = torch._C._VariableFunctionsClass.unbind(data_stack, dim=0)
         for i, tensor_stat in enumerate(ubind_data):
             index = data[i][0]
             stat_data = [
-                stat.item() for stat in torch._C._VariableFunctionsClass.chunk(tensor_stat, chunks=chunks, dim=0)
+                stat.item() for stat in torch._C._VariableFunctionsClass.unbind(tensor_stat)
             ]
             self[index][5] = stat_data
 
     def process(self):
-        float_data = []
-        not_float_data = []
+        data_summary = []
         for index, data in enumerate(self):
             if len(data) != Const.SUMMARY_COLUMN_NUM:
                 continue
-            if isinstance(data[5], torch.Tensor) and data[5].device.type != "cpu":
-                if data[5].numel() == 3:
-                    not_float_data.append((index, data[5]))
-                if data[5].numel() == 4:
-                    float_data.append((index, data[5]))
-
-        if float_data:
-            self.process_data(float_data, 4)
-        if not_float_data:
-            self.process_data(not_float_data, 3)
+            tensor_stat = data[5]
+            if isinstance(tensor_stat, torch.Tensor) and tensor_stat.device.type != "cpu":
+                data_summary.append((index, tensor_stat))
+
+        if data_summary:
+            self.process_data(data_summary)
 
     def append(self, data):
         list.append(self, data)
@@ -130,29 +124,30 @@ def get_not_float_tensor_info(data):
         tensor_max = []
         tensor_min = []
         tensor_mean = []
-        tensor_stat = [tensor_max, tensor_min, tensor_mean, CompareConst.NAN]
+        tensor_norm = []
+        tensor_stat = [tensor_max, tensor_min, tensor_mean, tensor_norm]
     elif len(data.shape) == 0:
         if data.device.type == "cpu":
             item = data.item()
-            tensor_stat = [item, item, item, CompareConst.NAN]
+            tensor_stat = [item, item, item, item]
         else:
-            item = torch._C._VariableFunctionsClass.unsqueeze(data, 0)
-            tensor_stat = torch._C._VariableFunctionsClass.cat([item, item, item], dim=0)
+            tensor_stat = torch._C._VariableFunctionsClass.stack([data, data, data, data])
     else:
         if data.device.type == "cpu":
-            tensor_max = torch._C._VariableFunctionsClass.max(data).float().item()
-            tensor_min = torch._C._VariableFunctionsClass.min(data).float().item()
-            tensor_mean = torch._C._VariableFunctionsClass.mean(data.float()).float().item()
-            tensor_stat = [tensor_max, tensor_min, tensor_mean, CompareConst.NAN]
+            tensor_max = torch._C._VariableFunctionsClass.max(data).item()
+            tensor_min = torch._C._VariableFunctionsClass.min(data).item()
+            tensor_mean = torch._C._VariableFunctionsClass.mean(data.float()).item()
+            tensor_norm = torch._C._VariableFunctionsClass.norm(data.float()).item()
+            tensor_stat = [tensor_max, tensor_min, tensor_mean, tensor_norm]
        else:
-            tensor_max = torch._C._VariableFunctionsClass.unsqueeze(
-                torch._C._VariableFunctionsClass.max(data), 0)
-            tensor_min = torch._C._VariableFunctionsClass.unsqueeze(
-                torch._C._VariableFunctionsClass.min(data), 0)
-            tensor_mean = torch._C._VariableFunctionsClass.unsqueeze(
-                torch._C._VariableFunctionsClass.mean(data.float()), 0)
-            tensor_stat = torch._C._VariableFunctionsClass.cat(
-                [tensor_max, tensor_min, tensor_mean], dim=0)
+            tensor_stat = torch._C._VariableFunctionsClass.stack(
+                [
+                    torch._C._VariableFunctionsClass.max(data),
+                    torch._C._VariableFunctionsClass.min(data),
+                    torch._C._VariableFunctionsClass.mean(data.float()),
+                    torch._C._VariableFunctionsClass.norm(data.float())
+                ]
+            )
     return get_tensor_data_info(data, tensor_stat)
 
 
@@ -172,16 +167,14 @@ def get_float_tensor_info(data):
         tensor_norm = torch._C._VariableFunctionsClass.norm(data).item()
         tensor_stat = [tensor_max, tensor_min, tensor_mean, tensor_norm]
     else:
-        tensor_max = torch._C._VariableFunctionsClass.unsqueeze(
-            torch._C._VariableFunctionsClass.max(data), 0)
-        tensor_min = torch._C._VariableFunctionsClass.unsqueeze(
-            torch._C._VariableFunctionsClass.min(data), 0)
-        tensor_mean = torch._C._VariableFunctionsClass.unsqueeze(
-            torch._C._VariableFunctionsClass.mean(data), 0)
-        tensor_norm = torch._C._VariableFunctionsClass.unsqueeze(
-            torch._C._VariableFunctionsClass.norm(data), 0)
-        tensor_stat = torch._C._VariableFunctionsClass.cat(
-            [tensor_max, tensor_min, tensor_mean, tensor_norm], dim=0)
+        tensor_stat = torch._C._VariableFunctionsClass.stack(
+            [
+                torch._C._VariableFunctionsClass.max(data),
+                torch._C._VariableFunctionsClass.min(data),
+                torch._C._VariableFunctionsClass.mean(data),
+                torch._C._VariableFunctionsClass.norm(data)
+            ]
+        )
     return get_tensor_data_info(data, tensor_stat)
 
 
@@ -255,23 +248,17 @@ def dump_data_by_rank_count(dump_step, prefix, data_info):
 
 
 def dump_stack_info(name_template):
-    if Const.BACKWARD in name_template:
-        return
     if check_inplace_op(name_template) and Const.PRE_FORWARD in name_template:
         return
     stack_str = []
     try:
-        current_frame = inspect.currentframe()
-        for _ in range(6):
-            current_frame = current_frame.f_back
-
-        while current_frame:
-            frame_info = inspect.getframeinfo(current_frame)
-            path, line, func, code = frame_info.filename, frame_info.lineno, frame_info.function, frame_info.code_context
-            code_str = code[0].strip() if code else code
-            stack_str.append([path, line, func, code_str])
-            current_frame = current_frame.f_back
+        for (_, path, line, func, code, _) in inspect.stack()[4:]:
+            if code:
+                stack_line = [path, str(line), func, code[0].strip() if code else code]
+            else:
+                stack_line = [path, str(line), func, code]
+            stack_str.append(stack_line)
     except Exception as e:
         print_warn_log("Dump stack info failed, error: {}".format(e))
         stack_str.append('')
@@ -283,7 +270,7 @@ def dump_stack_info(name_template):
         api_list.append([prefix, stack_str])
     else:
         api_list.append([prefix, stack_str])
-    
+
 
 def dump_api_tensor(dump_step, in_feat, name_template, out_feat):
     if check_inplace_op(name_template):
diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_dump.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_dump.py
index 9673c292ba..fa3be6f6ae 100644
--- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_dump.py
@@ -37,7 +37,8 @@ class TestDump(unittest.TestCase):
         tensor_max = 3.0
         tensor_min = 1.0
         tensor_mean = 2.0
-        data_info = get_tensor_data_info(self.tensor, tensor_max, tensor_min, tensor_mean)
+        tensor_stat = [tensor_max, tensor_min, tensor_mean]
+        data_info = get_tensor_data_info(self.tensor, tensor_stat)
         self.assertEqual(data_info.save_data.tolist(), self.tensor.numpy().tolist())
         self.assertEqual(data_info.summary_data, [tensor_max, tensor_min, tensor_mean])
         self.assertEqual(data_info.dtype, 'torch.float32')
-- 
Gitee
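
[Editorial note, not part of the patch series]
PATCH 2/2 makes the batching uniform: both the float and the non-float paths now produce a
four-element stat vector (max, min, mean, norm), so APIList.process() can stack every
pending entry in a single call and unbind the result, instead of grouping entries by whether
they hold three or four values. It also reverts the dump_stack_info() frame walk introduced
in PATCH 1/2 back to inspect.stack()[4:], and updates the unit test to pass the statistics
to get_tensor_data_info as one list. A minimal sketch of the uniform layout, with invented
names and assuming any numeric dtype:

    import torch

    def stat_row(t):
        # Four statistics per tensor, regardless of dtype, kept on the tensor's device.
        tf = t.float()
        return torch.stack([tf.max(), tf.min(), tf.mean(), tf.norm()])

    pending = [stat_row(t) for t in (torch.randn(5), torch.arange(6).reshape(2, 3))]
    batch = torch.stack(pending, dim=0).cpu()   # shape (N, 4): one host copy for the batch
    stats = [[v.item() for v in torch.unbind(row)] for row in torch.unbind(batch, dim=0)]

Because every row has the same width, one torch.stack/torch.unbind round trip converts the
whole batch at once.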