From 9bf3de6013c106a2449febb5fa3d185de092f598 Mon Sep 17 00:00:00 2001 From: CLAY-panjw <1330286576@qq.com> Date: Wed, 13 Jul 2022 10:54:26 +0800 Subject: [PATCH] modify tf graph for overflow detection --- .../ops/aicore/npu_mixed_precesion_ops.cc | 5 +-- .../estimator/npu/npu_loss_scale_optimizer.py | 35 ++++++++++--------- .../npu_bridge/estimator/npu/npu_optimizer.py | 20 +++++------ .../testcase/n_p_u_get_float_statusV2_test.cc | 8 ++--- .../testcase/n_p_u_get_float_statusV2_test.cc | 10 ++---- .../optimizer/npu_loss_scale_optimizer.py | 17 +++++---- 6 files changed, 43 insertions(+), 52 deletions(-) diff --git a/tf_adapter/ops/aicore/npu_mixed_precesion_ops.cc b/tf_adapter/ops/aicore/npu_mixed_precesion_ops.cc index 2882a6c8a..8a1af5d50 100644 --- a/tf_adapter/ops/aicore/npu_mixed_precesion_ops.cc +++ b/tf_adapter/ops/aicore/npu_mixed_precesion_ops.cc @@ -55,10 +55,7 @@ REGISTER_OP("NpuGetFloatStatus") .SetIsStateful(); REGISTER_OP("NpuGetFloatStatusV2") - .Input("addr: N * T") - .Output("data: T") - .Attr("N: int >= 0") - .Attr("T: {float}") + .Output("data: int32") .SetIsStateful() .SetShapeFn([](InferenceContext *c) { std::vector output_dims; diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py index e5be96867..1792ac5f0 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_loss_scale_optimizer.py @@ -46,28 +46,32 @@ class NPULossScaleOptimizer(lso.LossScaleOptimizer): self._name = "NPULossScaleOptimizer{}".format(type(optimizer).__name__) super(NPULossScaleOptimizer, self).__init__(opt=opt, loss_scale_manager=loss_scale_manager) + def __getattr__(self, attr): + return getattr(self._opt, attr) + def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients. 
See base class `tf.compat.v1.train.Optimizer`.""" if self._enable_overflow_check(): with tf.name_scope(self._name): - self._float_status = gen_npu_ops.npu_alloc_float_status() - grads = [] - for (g, _) in grads_and_vars: - if g is not None: - grads.append(g) - with tf.get_default_graph().control_dependencies(grads): - local_float_status = gen_npu_ops.npu_get_float_status(self._float_status) - cleared_float_status = gen_npu_ops.npu_clear_float_status(local_float_status) + grads = [] + for (g, _) in grads_and_vars: + if g is not None: + grads.append(g) + with tf.get_default_graph().control_dependencies(grads): + local_float_status = gen_npu_ops.npu_get_float_status_v2() + with tf.get_default_graph().control_dependencies([local_float_status]): + cleared_float_status = gen_npu_ops.npu_clear_float_status_v2() if self._is_distributed: - with tf.get_default_graph().control_dependencies([local_float_status]): - aggregated_float_status = hccl_ops.allreduce([self._float_status], "sum", fusion=0) - is_overall_finite = math_ops.reduce_all(tf.equal(aggregated_float_status, - cleared_float_status), + aggregated_float_status = hccl_ops.allreduce([local_float_status], "sum", fusion=0) + with tf.get_default_graph().control_dependencies([cleared_float_status]): + op = tf.equal(aggregated_float_status, 0) + is_overall_finite = math_ops.reduce_all(op, name="overflow_status_reduce_all") else: - is_overall_finite = math_ops.reduce_all(tf.equal(self._float_status, - cleared_float_status), + with tf.get_default_graph().control_dependencies([cleared_float_status]): + op_ = tf.equal(0, local_float_status) + is_overall_finite = math_ops.reduce_all(op_, name="overflow_status_reduce_all") else: is_overall_finite = tf.constant(True, dtype=tf.bool) @@ -88,6 +92,3 @@ class NPULossScaleOptimizer(lso.LossScaleOptimizer): issubclass(type(self._loss_scale_manager), FixedLossScaleManager): return self._loss_scale_manager.get_enable_overflow_check() return True - - def __getattr__(self, attr): - return 
getattr(self._opt, attr) diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py index ca401f5ad..caae21f62 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_optimizer.py @@ -169,7 +169,6 @@ class NPUOptimizer(tf.train.Optimizer): loss_scale = self._loss_scale_manager.get_loss_scale() loss_value = loss() if callable(loss) else loss scaled_loss = loss_value * math_ops.cast(loss_scale, loss_value.dtype.base_dtype) - self._float_status = gen_npu_ops.npu_alloc_float_status() else: scaled_loss = loss @@ -223,17 +222,18 @@ class NPUOptimizer(tf.train.Optimizer): def _reduce_all(self, grads): with tf.get_default_graph().control_dependencies([grad for grad in grads if grad is not None]): - local_float_status = gen_npu_ops.npu_get_float_status(self._float_status) - cleared_float_status = gen_npu_ops.npu_clear_float_status(local_float_status) - + local_float_status = gen_npu_ops.npu_get_float_status_v2() + with tf.get_default_graph().control_dependencies([local_float_status]): + cleared_float_status = gen_npu_ops.npu_clear_float_status_v2() if self._is_distributed: - with tf.get_default_graph().control_dependencies([local_float_status]): - aggregated_float_status = hccl_ops.allreduce([self._float_status], "sum", fusion=0) - self._is_overall_finite = math_ops.reduce_all(tf.equal(aggregated_float_status, - cleared_float_status)) + aggregated_float_status = hccl_ops.allreduce([local_float_status], "sum", fusion=0) + with tf.get_default_graph().control_dependencies([cleared_float_status]): + op = tf.equal(aggregated_float_status, 0) + self._is_overall_finite = math_ops.reduce_all(op) else: - self._is_overall_finite = math_ops.reduce_all(tf.equal(self._float_status, - cleared_float_status)) + with tf.get_default_graph().control_dependencies([cleared_float_status]): + op_ = tf.equal(0, local_float_status) + 
self._is_overall_finite = math_ops.reduce_all(op_) def get_slot(self, *args, **kwargs): """Calls this same method on the underlying optimizer.""" diff --git a/tf_adapter/tests/st/kernels/testcase/n_p_u_get_float_statusV2_test.cc b/tf_adapter/tests/st/kernels/testcase/n_p_u_get_float_statusV2_test.cc index a81805d05..2391c755d 100644 --- a/tf_adapter/tests/st/kernels/testcase/n_p_u_get_float_statusV2_test.cc +++ b/tf_adapter/tests/st/kernels/testcase/n_p_u_get_float_statusV2_test.cc @@ -28,7 +28,7 @@ FakeInputFunctor FakeInputStub(DataType dt) { } TEST(NPUGetFloatStatusV2OpTest, TestNPUGetFloatStatusV2) { - DataTypeSlice input_types({DT_FLOAT}); + DataTypeSlice input_types({}); MemoryTypeSlice input_memory_types; DataTypeSlice output_types({DT_FLOAT}); MemoryTypeSlice output_memory_types; @@ -75,12 +75,8 @@ TEST(NPUGetFloatStatusV2OpTest, TestNPUGetFloatStatusV2OShapeInference) { std::vector src_list; src_list.emplace_back("in0", 0, DT_FLOAT); TF_CHECK_OK(NodeDefBuilder("dummy", &op_def) - .Input(src_list) - .Attr("T", DT_FLOAT) - .Attr("N", 1) .Finalize(&def)); - shape_inference::InferenceContext c(0, &def, op_def, {TShape({8})}, {}, {}, {}); - std::vector input_shapes; + shape_inference::InferenceContext c(0, &def, op_def, {TShape({})}, {}, {}, {}); TF_CHECK_OK(reg->shape_inference_fn(&c)); ASSERT_EQ("[8]", c.DebugString(c.output(0))); } diff --git a/tf_adapter/tests/ut/kernels/testcase/n_p_u_get_float_statusV2_test.cc b/tf_adapter/tests/ut/kernels/testcase/n_p_u_get_float_statusV2_test.cc index a81805d05..be9d3df02 100644 --- a/tf_adapter/tests/ut/kernels/testcase/n_p_u_get_float_statusV2_test.cc +++ b/tf_adapter/tests/ut/kernels/testcase/n_p_u_get_float_statusV2_test.cc @@ -28,7 +28,7 @@ FakeInputFunctor FakeInputStub(DataType dt) { } TEST(NPUGetFloatStatusV2OpTest, TestNPUGetFloatStatusV2) { - DataTypeSlice input_types({DT_FLOAT}); + DataTypeSlice input_types({}); MemoryTypeSlice input_memory_types; DataTypeSlice output_types({DT_FLOAT}); MemoryTypeSlice 
output_memory_types; @@ -72,15 +72,9 @@ TEST(NPUGetFloatStatusV2OpTest, TestNPUGetFloatStatusV2OShapeInference) { TF_CHECK_OK(OpRegistry::Global()->LookUp("NpuGetFloatStatusV2", ®)); OpDef op_def = reg->op_def; NodeDef def; - std::vector src_list; - src_list.emplace_back("in0", 0, DT_FLOAT); TF_CHECK_OK(NodeDefBuilder("dummy", &op_def) - .Input(src_list) - .Attr("T", DT_FLOAT) - .Attr("N", 1) .Finalize(&def)); - shape_inference::InferenceContext c(0, &def, op_def, {TShape({8})}, {}, {}, {}); - std::vector input_shapes; + shape_inference::InferenceContext c(0, &def, op_def, {TShape({0})}, {}, {}, {}); TF_CHECK_OK(reg->shape_inference_fn(&c)); ASSERT_EQ("[8]", c.DebugString(c.output(0))); } diff --git a/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py b/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py index f1410ae24..227ff1582 100644 --- a/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py +++ b/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py @@ -40,14 +40,17 @@ def _npu_finite_status_after_executed(executed_ops): with ops.get_default_graph()._attr_scope( {"_npu_loss_scale": attr_value_pb2.AttrValue(b=True)}): with tf.control_dependencies([v for v in executed_ops if v is not None]): - current_status = gen_npu_ops.npu_alloc_float_status() - assign_float_status = gen_npu_ops.npu_get_float_status(current_status) - finite_status = gen_npu_ops.npu_clear_float_status(assign_float_status) + assign_float_status = gen_npu_ops.npu_get_float_status_v2() + with tf.control_dependencies([assign_float_status]): + finite_status = gen_npu_ops.npu_clear_float_status_v2() if global_npu_ctx() and global_npu_ctx().workers_num > 1: - with tf.control_dependencies([assign_float_status]): - reduced_status = all_reduce(current_status, 'sum', fusion=0) - return tf.reduce_all(tf.equal(reduced_status, finite_status)) - return tf.reduce_all(tf.equal(current_status, finite_status)) + 
reduced_status = all_reduce([assign_float_status], 'sum', fusion=0) + with tf.control_dependencies([finite_status]): + op = tf.equal(reduced_status, 0) + return tf.reduce_all(op) + with tf.control_dependencies([finite_status]): + op_ = tf.equal(0, [assign_float_status]) + return tf.reduce_all(op_) def _npu_compat_loss_scale_update(m, grads): -- Gitee