From 2f3a00f1e6b05183735aed2fb73a3c52d5c01316 Mon Sep 17 00:00:00 2001 From: hu guoheng Date: Wed, 8 Nov 2023 11:38:59 +0800 Subject: [PATCH 1/3] add over --- ...0\203\275\350\257\264\346\230\216_v4.1.md" | 1 + .../src/python/ptdbg_ascend/common/utils.py | 12 +++++++ .../ptdbg_ascend/hook_module/register_hook.py | 4 +-- .../hook_module/support_wrap_ops.yaml | 2 -- .../overflow_check/overflow_check.py | 6 ++-- .../ptdbg_ascend/overflow_check/utils.py | 35 ++++++++++++++++++- 6 files changed, 52 insertions(+), 8 deletions(-) diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" index 393b6c241d..89ab305463 100644 --- "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" @@ -419,6 +419,7 @@ set_dump_switch("ON", mode="api_list", api_list=["distributed"]) * dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 * 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 +* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行抛错EZ3003 ## debugger方式dump和溢出检测(推荐) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py index f2361a90b7..0f34846d18 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -101,6 +101,9 @@ class Const: ASCEND_WORK_PATH = "ASCEND_WORK_PATH" DUMP_DIR = "dump_data" + ENV_ENABLE = "1" + ENV_DISABLE = "0" + MAX_SEED_VALUE = 2**32 - 1 @@ -203,6 +206,15 @@ class DumpException(CompareException): pass +class OverflowConst: + """ + Class for Overflow + """ + OVERFLOW_DEBUG_MODE_ENABLE = "OVERFLOW_DEBUG_MODE_ENABLE" + OVERFLOW_ORIGINAL_MODE = 0 + OVERFLOW_DEBUG_MODE = 1 + + def make_dump_path_if_not_exists(dump_path): if not os.path.exists(dump_path): try: diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py index 8f67a41a40..eb8a978437 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py @@ -28,7 +28,7 @@ from .wrap_functional import remove_dropout from ..common.utils import check_file_or_directory_path, print_error_log, CompareException, Const, \ print_info_log, print_warn_log, get_process_rank, torch_without_guard_version from ..dump.utils import make_dump_dirs, DumpUtil -from ..overflow_check.utils import OverFlowUtil +from ..overflow_check.utils import OverFlowUtil, clear_overflow_npu torch_version_above_2 = torch.__version__.split('+')[0] > '2.0' @@ -97,7 +97,7 @@ def add_clear_overflow(func, pid): return func(*args, **kwargs) nonlocal first_module if first_module: - torch_npu._C._clear_overflow_npu() + clear_overflow_npu() first_module = False return func(*args, **kwargs) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml index 6e48427368..fc83b67e85 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml @@ -1030,8 +1030,6 @@ torch_npu: - npu_scatter - npu_layer_norm_eval - npu_alloc_float_status - - npu_get_float_status - - npu_clear_float_status - npu_confusion_transpose - npu_bmmV2 - fast_gelu diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py index dc43c2c3be..5b227a8c14 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py @@ -12,7 +12,7 @@ else: from ..common.utils import print_warn_log, get_time, print_info_log from ..dump.dump import forward_init_status, forward_acl_dump -from .utils import OverFlowUtil, dump_overflow +from .utils import OverFlowUtil, dump_overflow, check_overflow_npu, clear_overflow_npu from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo from ..dump import dump @@ -122,7 +122,7 @@ def overflow_check(name, **kwargs): check_feat = out_feat module.has_overflow = check_data_overflow(check_feat) else: - module.has_overflow = torch_npu._C._check_overflow_npu() + module.has_overflow = check_overflow_npu() if not module.has_overflow: if hasattr(module, 'input_args'): del module.input_args @@ -149,7 +149,7 @@ def overflow_check(name, **kwargs): acl_dump(module, module_name) dump.write_to_disk() # clear overflow flag for the next check - torch_npu._C._clear_overflow_npu() + clear_overflow_npu() if not OverFlowUtil.check_overflow_dump_times(overflow_nums): for key in forward_api_info: write_api_info_json(forward_api_info[key]) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py index 1a26c5ac04..691e3f100e 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py @@ -1,6 +1,14 @@ +import os import torch -from ..common.utils import Const, check_switch_valid +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +from ..common.utils import Const, check_switch_valid, OverflowConst from ..dump.dump import dump_stack_info, get_scalar_data_info, dump_data, \ get_not_float_tensor_info, get_float_tensor_info from ..dump.utils import DumpUtil, make_dump_data_dir @@ -71,3 +79,28 @@ def _dump_tensor_completely(x, prefix, dump_file_name): if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): data_info = get_scalar_data_info(x) dump_data(dump_file_name, dump_flag, prefix, data_info) + + +def overflow_debug_mode_enalbe(): + overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE) + return overflow_mode == Const.ENV_ENABLE + + +def check_overflow_npu(): + if overflow_debug_mode_enalbe(): + float_status = torch.zeros(8).npu() + result = torch_npu.npu_get_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + if (result.cpu()[0] != 0): + return True + else: + return False + else: + return torch_npu._C._check_overflow_npu() + + +def clear_overflow_npu(): + if overflow_debug_mode_enalbe(): + float_status = torch.zeros(8).npu() + torch_npu.npu_clear_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + else: + torch_npu._C._clear_overflow_npu() -- Gitee From 5ecab1b06ad6cc84a3ff69fd81cc634ead9e771c Mon Sep 17 00:00:00 2001 From: huguoheng Date: Wed, 15 Nov 2023 02:26:39 +0000 Subject: [PATCH 2/3] =?UTF-8?q?update=20debug/accuracy=5Ftools/ptdbg=5Fasc?= =?UTF-8?q?end/doc/ptdbg=5Fascend=E7=B2=BE=E5=BA=A6=E5=B7=A5=E5=85=B7?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E8=AF=B4=E6=98=8E=5Fv4.1.md.=20=E8=B5=84?= =?UTF-8?q?=E6=96=99=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: huguoheng --- ...267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" index f9de865ddd..308da7a8e4 100644 --- "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" @@ -419,7 +419,7 @@ set_dump_switch("ON", mode="api_list", api_list=["distributed"]) * dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 * 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 -* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行抛错EZ3003 +* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行报错EZ3003。 ## debugger方式dump和溢出检测(推荐) -- Gitee From 9ca56e18905ceb2ef23d5c297ff519abe8637120 Mon Sep 17 00:00:00 2001 From: huguoheng Date: Thu, 16 Nov 2023 01:47:24 +0000 Subject: [PATCH 3/3] =?UTF-8?q?update=20debug/accuracy=5Ftools/ptdbg=5Fasc?= =?UTF-8?q?end/doc/ptdbg=5Fascend=E7=B2=BE=E5=BA=A6=E5=B7=A5=E5=85=B7?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E8=AF=B4=E6=98=8E=5Fv4.1.md.=20=E5=9B=9E?= =?UTF-8?q?=E9=80=80=E8=B5=84=E6=96=99=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: huguoheng --- ...\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" | 1 - 1 file changed, 1 deletion(-) diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" index 308da7a8e4..7ccf9439f1 100644 --- "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.1.md" @@ -419,7 +419,6 @@ set_dump_switch("ON", mode="api_list", api_list=["distributed"]) * dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 * 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 -* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行报错EZ3003。 ## debugger方式dump和溢出检测(推荐) -- Gitee