From 6d3a9ab74bf8f3f1e3fd6dffd6aea334d8336392 Mon Sep 17 00:00:00 2001 From: xuepeng Date: Mon, 20 Feb 2023 20:55:26 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0npu=20stage=20scope=E7=9A=84?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../npu_bridge/estimator/npu/npu_scope.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py index 206a80f34..d9ec87927 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py @@ -63,6 +63,37 @@ def keep_dtype_scope(): yield +# npu_stage_scope provides a graph-building space, and all nodes under this space form a 'Stage'. +# And then the graph will be divided into different execution units according to different 'Stage'. +# Stage's sequential execution is equivalent to graph execution. +# When the graph is executed multiple times, there is a possibility of parallelism between stages of different Steps. +# * input: +# stage: stage id of current scope, all nodes under this scope will have attr '_stage_level' with value 'stage' +# * Usage example: +# We want to perform a unique calculation on the input data first, and then sum, just like +# def my_model(x): +# ux, _ = tf.unique(x) +# sum = tf.reduce_sum(ux) +# The executed prof data is as follows: +# ┌─────────────────┐ ┌─────────────────────────┐ +# │ Unique @AICPU │ │ ReduceSum @AICORE │ +# └─────────────────┘ └─────────────────────────┘ +# We can optimize it at execution time through npu stage scope, and with 'iterations_per_loop' = '2' +# def my_model(x): +# with npu_stage_scope(0): +# ux, _ = tf.unique(x) +# sum = tf.reduce_sum(ux) +# Then the executed prof data is as follows: +# ┌─────────────────┐ ┌─────────────────────────┐ +# │ Unique @AICPU │ │ ReduceSum @AICORE │ +# 
└─────────────────┘ └─────────────────────────┘ +# ┌─────────────────┐ ┌─────────────────────────┐ +# │ Unique @AICPU │ │ ReduceSum @AICORE │ +# └─────────────────┘ └─────────────────────────┘ +# * Constraints: +# 1- The 'iterations_per_loop' config must be set to a value greater than 1 during pipeline execution +# 2- Performance gains are possible only when different computing resources are used between different stages +# 3- Communication under the same communication domain is not allowed between different Stages @contextlib.contextmanager def npu_stage_scope(stage): """ -- Gitee