diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py index 1792ac5f05c026b7fa153cd995ad3b0aee89340c..926612aca171ea1bc2586cdd3544b1bdb6bfc846 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py @@ -30,6 +30,7 @@ from tensorflow.contrib.mixed_precision.python import loss_scale_optimizer as ls from npu_bridge.hccl import hccl_ops from npu_bridge.estimator.npu.npu_loss_scale_manager import FixedLossScaleManager from npu_bridge.helper import helper +from npu_bridge.estimator.npu import npu_plugin gen_npu_ops = helper.get_gen_ops() @@ -51,6 +52,9 @@ class NPULossScaleOptimizer(lso.LossScaleOptimizer): def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients. See base class `tf.compat.v1.train.Optimizer`.""" + if npu_plugin.is_inf_nan_enabled(): + return super().apply_gradients(grads_and_vars, global_step, name) + if self._enable_overflow_check(): with tf.name_scope(self._name): grads = [] diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py index 502404e54cb9fd98a6e41d3363d1e6c98edc8836..06a39ad489400c92cc3f6a31d5ac44f89f9158d8 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py @@ -225,3 +225,8 @@ def get_rdma_cache(data_type, shape, name="rdma_w"): def set_device_sat_mode(mode): tf_adapter.SetDeviceSatMode(mode) + + +def is_inf_nan_enabled(): + """device sat mode: 1 is INF-NAN""" + return tf_adapter.GetDeviceSatMode() == 1 diff --git a/tf_adapter_2.x/python/npu_device/__init__.py b/tf_adapter_2.x/python/npu_device/__init__.py index 1089a95ec75bd00c43c89c0dc07ff3d82d33f00c..4ef1ed208139fd7caf02f49d810c0fee998b0266 100644 --- a/tf_adapter_2.x/python/npu_device/__init__.py +++ 
b/tf_adapter_2.x/python/npu_device/__init__.py @@ -22,7 +22,8 @@ from npu_device.npu_device import global_options from npu_device.npu_device import set_npu_loop_size from npu_device.npu_device import npu_run_context from npu_device.npu_device import set_device_sat_mode from npu_device.npu_device import get_device_sat_mode +from npu_device.npu_device import is_inf_nan_enabled from npu_device.utils.scope import keep_dtype_scope from npu_device.utils.scope import npu_recompute_scope diff --git a/tf_adapter_2.x/python/npu_device/npu_device.py b/tf_adapter_2.x/python/npu_device/npu_device.py index 84110192a1ee3acd2f2c8ad49709ce4f452bfbc7..b0036738f8299905ba5f052c6c28f5eceeed1ae1 100644 --- a/tf_adapter_2.x/python/npu_device/npu_device.py +++ b/tf_adapter_2.x/python/npu_device/npu_device.py @@ -68,8 +68,13 @@ def set_device_sat_mode(mode): _npu_device_backends.SetDeviceSatMode(mode) def get_device_sat_mode(): return _npu_device_backends.GetDeviceSatMode() + + +def is_inf_nan_enabled(): + """device sat mode: 1 is INF-NAN""" + return _npu_device_backends.GetDeviceSatMode() == 1 _global_options = None diff --git a/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py b/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py index 96fa83b0b1e2b6e9aa5d2c7ea94e71c56a1f3969..4915ce9e05b5fb87b6ddcac122f1eb25a458de2c 100644 --- a/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py +++ b/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py @@ -30,6 +30,7 @@ from tensorflow.python.keras.mixed_precision.loss_scale_optimizer import _op_in_ from npu_device.npu_device import global_npu_ctx from npu_device import gen_npu_ops from npu_device.npu_device import npu_compat_function +from npu_device.npu_device import is_inf_nan_enabled from npu_device.distribute.hccl import all_reduce @@ -106,7 +107,7 @@ class NpuLossScaleOptimizer(tf.keras.mixed_precision.LossScaleOptimizer): grads_and_vars, 
name=None): """Apply gradients on variables""" - if global_npu_ctx() is None: - super().apply_gradients(grads_and_vars, name) + if global_npu_ctx() is None or is_inf_nan_enabled(): + return super().apply_gradients(grads_and_vars, name) grads_and_vars = tuple(grads_and_vars) # grads_and_vars origin type is zip and can only be iter once @@ -179,4 +180,4 @@ class NpuExperimentalLossScaleOptimizer(tf.keras.mixed_precision.experimental.Lo self._last_step_finite.assign(should_apply_grads) maybe_apply_op = smart_cond.smart_cond(should_apply_grads, apply_fn, do_not_apply_fn) - return tf.group(maybe_apply_op, loss_scale_update_op) \ No newline at end of file + return tf.group(maybe_apply_op, loss_scale_update_op) diff --git a/tf_adapter_2.x/tests/st/adapter2_st.py b/tf_adapter_2.x/tests/st/adapter2_st.py index 03663ac704bf1ef0195b8e05a01ea9c7eb66b628..34eb90b1422cf980261b68bcce69dd711ecc06ce 100644 --- a/tf_adapter_2.x/tests/st/adapter2_st.py +++ b/tf_adapter_2.x/tests/st/adapter2_st.py @@ -24,7 +24,7 @@ os.environ['ASCEND_OPP_PATH'] = 'non-existed-path' import npu_device from npu_device.npu_device import stupid_repeat from npu_device.npu_device import set_device_sat_mode -from npu_device.npu_device import get_device_sat_mode +from npu_device.npu_device import is_inf_nan_enabled import unittest import tensorflow as tf from tensorflow.python.eager import context @@ -75,9 +75,9 @@ def foo_cpu_add_(v): class Adapter2St(unittest.TestCase): def test_set_device_sat_mode(self): set_device_sat_mode(2) - self.assertTrue(get_device_sat_mode(), -1); + self.assertFalse(is_inf_nan_enabled()) set_device_sat_mode(1) - self.assertTrue(get_device_sat_mode(), 1); + self.assertTrue(is_inf_nan_enabled())