diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py index 6e63b7cf884869f6b69bbcc4b130613436b2e60e..430a7a9fa6b57591b3c1db7f0d4c89cafe8c9cc1 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -102,6 +102,9 @@ class Const: ASCEND_WORK_PATH = "ASCEND_WORK_PATH" DUMP_DIR = "dump_data" + ENV_ENABLE = "1" + ENV_DISABLE = "0" + MAX_SEED_VALUE = 2**32 - 1 INPLACE_LIST = ["broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter"] @@ -206,6 +209,15 @@ class DumpException(CompareException): pass +class OverflowConst: + """ + Class for Overflow + """ + OVERFLOW_DEBUG_MODE_ENABLE = "OVERFLOW_DEBUG_MODE_ENABLE" + OVERFLOW_ORIGINAL_MODE = 0 + OVERFLOW_DEBUG_MODE = 1 + + def make_dump_path_if_not_exists(dump_path): if not os.path.exists(dump_path): try: diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py index 8f67a41a40779a20116a73fbc597d631f13e6b12..eb8a9784371ad9ea6dcdc5baf514964d9ddde2d4 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py @@ -28,7 +28,7 @@ from .wrap_functional import remove_dropout from ..common.utils import check_file_or_directory_path, print_error_log, CompareException, Const, \ print_info_log, print_warn_log, get_process_rank, torch_without_guard_version from ..dump.utils import make_dump_dirs, DumpUtil -from ..overflow_check.utils import OverFlowUtil +from ..overflow_check.utils import OverFlowUtil, clear_overflow_npu torch_version_above_2 = torch.__version__.split('+')[0] > '2.0' @@ -97,7 +97,7 @@ def add_clear_overflow(func, pid): return func(*args, **kwargs) nonlocal first_module if first_module: - torch_npu._C._clear_overflow_npu() + clear_overflow_npu() first_module = False return func(*args, **kwargs) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml index 6e484273681c54fd8cef2b7c2e85cfad4f877752..fc83b67e85547d60cb375a5b5b9058633ba526dc 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml @@ -1030,8 +1030,6 @@ torch_npu: - npu_scatter - npu_layer_norm_eval - npu_alloc_float_status - - npu_get_float_status - - npu_clear_float_status - npu_confusion_transpose - npu_bmmV2 - fast_gelu diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py index dc43c2c3bee01e036c3d6363dcbe4db8a085f590..5b227a8c14a2309b782c2e06a04e0a52ade22749 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py @@ -12,7 +12,7 @@ else: from ..common.utils import print_warn_log, get_time, print_info_log from ..dump.dump import forward_init_status, forward_acl_dump -from .utils import OverFlowUtil, dump_overflow +from .utils import OverFlowUtil, dump_overflow, check_overflow_npu, clear_overflow_npu from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo from ..dump import dump @@ -122,7 +122,7 @@ def overflow_check(name, **kwargs): check_feat = out_feat module.has_overflow = check_data_overflow(check_feat) else: - module.has_overflow = torch_npu._C._check_overflow_npu() + module.has_overflow = check_overflow_npu() if not module.has_overflow: if hasattr(module, 'input_args'): del module.input_args @@ -149,7 +149,7 @@ def overflow_check(name, **kwargs): acl_dump(module, module_name) dump.write_to_disk() # clear overflow flag for the next check - torch_npu._C._clear_overflow_npu() + clear_overflow_npu() if not OverFlowUtil.check_overflow_dump_times(overflow_nums): for key in forward_api_info: write_api_info_json(forward_api_info[key]) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py index 6af477753d09e45ab5bf30e993e9112d7db7ca10..ebd9f8c755bb79d5ee9f18f9e07f924693401681 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py @@ -1,6 +1,14 @@ +import os import torch -from ..common.utils import Const, check_switch_valid, check_inplace_op +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +from ..common.utils import Const, check_switch_valid, check_inplace_op, OverflowConst from ..dump.dump import dump_stack_info, get_scalar_data_info, dump_data, \ get_not_float_tensor_info, get_float_tensor_info from ..dump.utils import DumpUtil, make_dump_data_dir @@ -78,3 +86,28 @@ def _dump_tensor_completely(x, prefix): if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): data_info = get_scalar_data_info(x) dump_data(dump_flag, prefix, data_info) + + +def overflow_debug_mode_enalbe(): + overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE) + return overflow_mode == Const.ENV_ENABLE + + +def check_overflow_npu(): + if overflow_debug_mode_enalbe(): + float_status = torch.zeros(8).npu() + result = torch_npu.npu_get_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + if (result.cpu()[0] != 0): + return True + else: + return False + else: + return torch_npu._C._check_overflow_npu() + + +def clear_overflow_npu(): + if overflow_debug_mode_enalbe(): + float_status = torch.zeros(8).npu() + torch_npu.npu_clear_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + else: + torch_npu._C._clear_overflow_npu() \ No newline at end of file