From 6d3a9ab74bf8f3f1e3fd6dffd6aea334d8336392 Mon Sep 17 00:00:00 2001
From: xuepeng <xuepeng4@huawei.com>
Date: Mon, 20 Feb 2023 20:55:26 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0npu=20stage=20scope=E7=9A=84?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../npu_bridge/estimator/npu/npu_scope.py     | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py
index 206a80f34..d9ec87927 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py
@@ -63,6 +63,37 @@ def keep_dtype_scope():
         yield
 
 
+# npu_stage_scope provides a graph-building scope, and all nodes under this scope form a 'Stage'.
+# The graph is then divided into different execution units according to the 'Stage' of each node.
+# Executing the stages sequentially is equivalent to executing the whole graph.
+# When the graph is executed multiple times, stages that belong to different steps may run in parallel.
+# * input:
+#       stage: stage id of the current scope; all nodes under this scope will have attr '_stage_level' with value 'stage'
+# * Usage example:
+# We want to apply a unique operation to the input data first and then sum the result, like this:
+# def my_model(x):
+#     ux, _ = tf.unique(x)
+#     sum = tf.reduce_sum(ux)
+# The profiling data of the execution is as follows:
+# ┌─────────────────┐ ┌─────────────────────────┐
+# │  Unique @AICPU  │ │    ReduceSum @AICORE    │
+# └─────────────────┘ └─────────────────────────┘
+# We can optimize its execution with npu_stage_scope, together with 'iterations_per_loop' = '2':
+# def my_model(x):
+#     with npu_stage_scope(0):
+#         ux, _ = tf.unique(x)
+#     sum = tf.reduce_sum(ux)
+# Then the profiling data of the execution is as follows:
+# ┌─────────────────┐ ┌─────────────────────────┐
+# │  Unique @AICPU  │ │    ReduceSum @AICORE    │
+# └─────────────────┘ └─────────────────────────┘
+#                    ┌─────────────────┐         ┌─────────────────────────┐
+#                    │  Unique @AICPU  │         │    ReduceSum @AICORE    │
+#                    └─────────────────┘         └─────────────────────────┘
+# * Constraints:
+# 1- The 'iterations_per_loop' config must be set to a value greater than 1 for pipeline execution.
+# 2- Performance gains are possible only when different stages use different computing resources.
+# 3- Communication under the same communication domain is not allowed between different Stages.
 @contextlib.contextmanager
 def npu_stage_scope(stage):
     """
-- 
Gitee