From a7ea1e99bae925746c699be0049b7775ddbc927e Mon Sep 17 00:00:00 2001 From: hu guoheng Date: Mon, 23 Oct 2023 20:07:29 +0800 Subject: [PATCH] fix bug --- .../ptdbg_ascend/online_dispatch/dispatch.py | 13 +++- .../online_dispatch/dump_compare.py | 8 +- ...t_torch_ops.yaml => torch_ops_config.yaml} | 76 ++++++++++++------- .../ptdbg_ascend/online_dispatch/utils.py | 12 ++- 4 files changed, 76 insertions(+), 33 deletions(-) rename debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/{unsupport_torch_ops.yaml => torch_ops_config.yaml} (44%) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py index 6b3c98c33e8..5c34bbe62c6 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py @@ -59,9 +59,12 @@ class PtdbgDispatch(TorchDispatchMode): Path(self.root_npu_path).mkdir(mode=0o750, parents=True, exist_ok=True) self.aten_ops_blacklist = [] - yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "unsupport_torch_ops.yaml") + self.npu_adjust_autogard = [] + yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "torch_ops_config.yaml") with open(yaml_path, 'r') as f: - self.aten_ops_blacklist = yaml.safe_load(f).get('aten') + yaml_file = yaml.safe_load(f) + self.aten_ops_blacklist = yaml_file.get('aten_ops_blacklist') + self.npu_adjust_autogard = yaml_file.get('npu_adjust_autogard') self.process_num = process_num self.lock = None @@ -152,6 +155,10 @@ class PtdbgDispatch(TorchDispatchMode): save_csv(self.all_summery, self.call_stack_list, self.csv_path) + def enable_autogard(self, aten_api): + if aten_api in self.npu_adjust_autogard: + torch._C._dispatch_tls_set_dispatch_key_excluded(torch._C.DispatchKey.AutogradFunctionality, False) + def __torch_dispatch__(self, func, types, args=(), kwargs=None): if not is_npu: logger_error("Please confirm you run environment installed torch_npu!") @@ -159,6 +166,8 @@ class PtdbgDispatch(TorchDispatchMode): aten_api = func.__name__.split(".")[0] aten_api_overload_name = func.__name__.split(".")[1] + self.enable_autogard(aten_api) + if aten_api in self.aten_ops_blacklist: npu_out = func(*args, **kwargs) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py index 0dc7555f5a7..710d4947c33 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py @@ -44,11 +44,13 @@ class TimeStatistics: def __enter__(self): if self.debug: self.time = datetime.now() + logger_debug(f'Time[{self.tag}]-ENTER: Dev[{self.device}], Pid[{os.getpid()}], Fun[{self.fun}], ' \ + f'Id[{self.index}]') def __exit__(self, exc_type, exc_val, exc_tb): if self.debug: cost_time = datetime.now() - self.time - time_cost = f'Time[{self.tag}]: Dev[{self.device}], Pid[{os.getpid()}], Fun[{self.fun}], ' \ + time_cost = f'Time[{self.tag}]-EXIT: Dev[{self.device}], Pid[{os.getpid()}], Fun[{self.fun}], ' \ f'Id[{self.index}], time[{cost_time}]' hot_time_cost = "Hotspot " + time_cost @@ -61,8 +63,8 @@ class TimeStatistics: def get_compare_result(npu_data, cpu_data): # Do not modify the original data, output delay dump if isinstance(npu_data, torch.Tensor): - npu_npy = npu_data.numpy() - cpu_npy = cpu_data.numpy() + npu_npy = npu_data.detach().numpy() + cpu_npy = cpu_data.detach().numpy() # Do not check dtype, there maybe type cast if npu_npy.size == 0 or cpu_npy.size == 0: return "unsupported", 0, 0, "This is empty data, can not compare." diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/unsupport_torch_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/torch_ops_config.yaml similarity index 44% rename from debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/unsupport_torch_ops.yaml rename to debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/torch_ops_config.yaml index b660e46c118..29261716817 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/unsupport_torch_ops.yaml +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/torch_ops_config.yaml @@ -1,26 +1,50 @@ -aten: - - _local_scalar_dense - - _to_copy - - _unsafe_view - - clone - - contiguous - - copy_ - - detach - - empty - - index_put_ - - lift_fresh - - max_pool2d_with_indices_backward # shape unmatch - - mul_ - - native_batch_norm_backward - - ones - - ones_like - - permute - - scalar_tensor - - select - - to - - transpose - - unbind - - view - - zero_ - - zeros - - zeros_like \ No newline at end of file +aten_ops_blacklist: + - _cudnn_rnn + - _local_scalar_dense + - _pin_memory + - _to_copy + - _unsafe_view + - clone + - contiguous + - copy_ + - cudnn_batch_norm + - cudnn_batch_norm_backward + - detach + - empty + - index_put_ + - lift_fresh + - max_pool2d_with_indices_backward # shape unmatch + - native_batch_norm_backward + - new_empty + - new_empty_strided + - new_full + - new_ones + - new_zeros + - ones + - ones_like + - permute + - rand + - rand_like + - randint + - randint_like + - randn + - randn_like + - randperm + - scalar_tensor + - select + - to + - transpose + - unbind + - view + - zero + - zero_ + - zeros + - zeros_like + +npu_adjust_autogard: + - adaptive_avg_pool2d + - batch_norm + - log_softmax + - nll_loss + - to + \ No newline at end of file diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/utils.py index fb3fc6dc8d0..5cb8e0f3e5a 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/utils.py @@ -4,6 +4,14 @@ import logging import psutil import torch import numpy as np + +try: + import torch_npu +except ImportError: + pta_cpu_device = None +else: + pta_cpu_device = torch.device("cpu") + from ..common.utils import print_error_log, CompareConst cpu_device = torch._C.device("cpu") @@ -72,10 +80,10 @@ def data_to_cpu(data, deep, data_cpu): global cpu_device list_cpu = [] if isinstance(data, torch.Tensor): - if data.device == cpu_device: + if data.device == cpu_device or data.device == pta_cpu_device: tensor_copy = data.clone().detach() else: - tensor_copy = data.cpu() + tensor_copy = data.cpu().detach() if tensor_copy.dtype in [torch.float16, torch.half]: tensor_copy = tensor_copy.float() -- Gitee