diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py
index 328b25b88851a4548f1f9455923ad5eed4dd4a00..0fd1ea879161c873d5f2023ccd676b1f182ec72f 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dispatch.py
@@ -60,9 +60,14 @@ class PtdbgDispatch(TorchDispatchMode):
         Path(self.root_npu_path).mkdir(mode=0o750, parents=True, exist_ok=True)
 
         self.aten_ops_blacklist = []
-        yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "unsupport_torch_ops.yaml")
+        self.npu_adjust_autogard = []
+        yaml_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "torch_ops_config.yaml")
         with FileOpen(yaml_path, 'r') as f:
-            self.aten_ops_blacklist = yaml.safe_load(f).get('aten')
+            # safe_load() yields None for an empty file and .get() yields None for a missing
+            # key; fall back to empty containers so the later `in` checks cannot raise.
+            yaml_file = yaml.safe_load(f) or {}
+            self.aten_ops_blacklist = yaml_file.get('aten_ops_blacklist') or []
+            self.npu_adjust_autogard = yaml_file.get('npu_adjust_autogard') or []
 
         self.process_num = process_num
         self.lock = None
@@ -153,6 +158,15 @@ class PtdbgDispatch(TorchDispatchMode):
 
         save_csv(self.all_summery, self.call_stack_list, self.csv_path)
 
+    def enable_autogard(self, aten_api):
+        """Clear the thread-local exclusion of the AutogradFunctionality dispatch key
+        when ``aten_api`` is listed in ``self.npu_adjust_autogard``; otherwise do nothing.
+
+        Note: this method does not restore the exclusion afterwards.
+        """
+        if aten_api in self.npu_adjust_autogard:
+            torch._C._dispatch_tls_set_dispatch_key_excluded(torch._C.DispatchKey.AutogradFunctionality, False)
+
     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         if not is_npu:
             logger_error("Please confirm you run environment installed torch_npu!")
@@ -160,6 +174,8 @@ class PtdbgDispatch(TorchDispatchMode):
         aten_api = func.__name__.split(".")[0]
         aten_api_overload_name = func.__name__.split(".")[1]
 
+        self.enable_autogard(aten_api)
+
         if aten_api in self.aten_ops_blacklist:
             npu_out = func(*args, **kwargs)
 
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py
index a4740fef8843e22d647888dd86bef3b6629256b1..75131f4f52acb63d67996ffcf3516e8996bfbb80 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/dump_compare.py
@@ -45,11 +45,13 @@ class TimeStatistics:
     def __enter__(self):
         if self.debug:
             self.time = datetime.now()
+            logger_debug(f'Time[{self.tag}]-ENTER: Dev[{self.device}], Pid[{os.getpid()}], Fun[{self.fun}], '
+                         f'Id[{self.index}]')
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         if self.debug:
             cost_time = datetime.now() - self.time
-            time_cost = f'Time[{self.tag}]: Dev[{self.device}], Pid[{os.getpid()}], Fun[{self.fun}], ' \
+            time_cost = f'Time[{self.tag}]-EXIT: Dev[{self.device}], Pid[{os.getpid()}], Fun[{self.fun}], ' \
                         f'Id[{self.index}], time[{cost_time}]'
             hot_time_cost = "Hotspot " + time_cost
 
@@ -62,8 +64,8 @@ class TimeStatistics:
 def get_compare_result(npu_data, cpu_data):
     # Do not modify the original data, output delay dump
     if isinstance(npu_data, torch.Tensor):
-        npu_npy = npu_data.numpy()
-        cpu_npy = cpu_data.numpy()
+        npu_npy = npu_data.detach().numpy()
+        cpu_npy = cpu_data.detach().numpy()
         # Do not check dtype, there maybe type cast
         if npu_npy.size == 0 or cpu_npy.size == 0:
             return "unsupported", 0, 0, "This is empty data, can not compare."
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/unsupport_torch_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/torch_ops_config.yaml
similarity index 44%
rename from debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/unsupport_torch_ops.yaml
rename to debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/torch_ops_config.yaml
index b660e46c118ae6d242223e8c3e6ee46d9ef2a3dc..2926171681754b133be6cee7dfd2edec89b80ebe 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/unsupport_torch_ops.yaml
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/torch_ops_config.yaml
@@ -1,26 +1,51 @@
-aten:
-  - _local_scalar_dense
-  - _to_copy
-  - _unsafe_view
-  - clone
-  - contiguous
-  - copy_
-  - detach
-  - empty
-  - index_put_
-  - lift_fresh
-  - max_pool2d_with_indices_backward # shape unmatch
-  - mul_
-  - native_batch_norm_backward
-  - ones
-  - ones_like
-  - permute
-  - scalar_tensor
-  - select
-  - to
-  - transpose
-  - unbind
-  - view
-  - zero_
-  - zeros
-  - zeros_like
\ No newline at end of file
+# aten op names that PtdbgDispatch.__torch_dispatch__ checks and then calls directly via func(*args, **kwargs).
+aten_ops_blacklist:
+  - _cudnn_rnn
+  - _local_scalar_dense
+  - _pin_memory
+  - _to_copy
+  - _unsafe_view
+  - clone
+  - contiguous
+  - copy_
+  - cudnn_batch_norm
+  - cudnn_batch_norm_backward
+  - detach
+  - empty
+  - index_put_
+  - lift_fresh
+  - max_pool2d_with_indices_backward # shape mismatch
+  - native_batch_norm_backward
+  - new_empty
+  - new_empty_strided
+  - new_full
+  - new_ones
+  - new_zeros
+  - ones
+  - ones_like
+  - permute
+  - rand
+  - rand_like
+  - randint
+  - randint_like
+  - randn
+  - randn_like
+  - randperm
+  - scalar_tensor
+  - select
+  - to
+  - transpose
+  - unbind
+  - view
+  - zero
+  - zero_
+  - zeros
+  - zeros_like
+
+# aten op names for which PtdbgDispatch.enable_autogard clears the AutogradFunctionality dispatch-key exclusion.
+npu_adjust_autogard:
+  - adaptive_avg_pool2d
+  - batch_norm
+  - log_softmax
+  - nll_loss
+  - to
diff --git
a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/utils.py
index fb3fc6dc8d04f2f1f4f98702c148bfb3b8a995e2..5cb8e0f3e5a55eff672fe0737337560ed3038668 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/utils.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/online_dispatch/utils.py
@@ -4,6 +4,16 @@ import logging
 import psutil
 import torch
 import numpy as np
+
+# torch_npu is optional: when it cannot be imported pta_cpu_device stays None, so the
+# device comparison in data_to_cpu can never match it.
+try:
+    import torch_npu
+except ImportError:
+    pta_cpu_device = None
+else:
+    pta_cpu_device = torch.device("cpu")
+
 from ..common.utils import print_error_log, CompareConst
 
 cpu_device = torch._C.device("cpu")
@@ -72,10 +82,11 @@ def data_to_cpu(data, deep, data_cpu):
     global cpu_device
     list_cpu = []
     if isinstance(data, torch.Tensor):
-        if data.device == cpu_device:
+        if data.device == cpu_device or data.device == pta_cpu_device:
             tensor_copy = data.clone().detach()
         else:
-            tensor_copy = data.cpu()
+            # Detach first so the device copy itself is not tracked by autograd.
+            tensor_copy = data.detach().cpu()
         if tensor_copy.dtype in [torch.float16, torch.half]:
             tensor_copy = tensor_copy.float()
 