diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py
index 206a80f342e69459a29ff796652fecb841b0d9ff..d9ec87927ee56a7d595e0a4378f5628088c09311 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_scope.py
@@ -63,6 +63,37 @@ def keep_dtype_scope():
         yield
 
 
+# npu_stage_scope provides a graph-building space, and all nodes under this space form a 'Stage'.
+# And then the graph will be divided into different execution units according to different 'Stage'
+# Stage's sequential execution is equivalent to graph execution，
+# When the graph is executed multiple times, there is a possibility of parallelism between stages of different Steps，
+# * input:
+#       stage: stage id of current scope, all nodes under this scope will have attr '_stage_level' with value 'stage'
+# * Usage example：
+# We want to perform a unique calculation on the input data first, and then sum， just like
+# def my_model(x):
+#     ux, _ = tf.unique(x)
+#     sum = tf.reduce_sum(ux)
+# The executed prof data is as follows：
+# ┌─────────────────┐ ┌─────────────────────────┐
+# │  Unique @AICPU  │ │    ReduceSum @AICORE    │
+# └─────────────────┘ └─────────────────────────┘
+# We can optimize it at execution time through npu stage scope，and with 'iterations_per_loop' = '2'
+# def my_model(x):
+#     with npu_stage_scope(0):
+#        ux, _ = tf.unique(x)
+#     sum = tf.reduce_sum(ux)
+# Then the executed prof data is as follows：
+# ┌─────────────────┐ ┌─────────────────────────┐
+# │  Unique @AICPU  │ │    ReduceSum @AICORE    │
+# └─────────────────┘ └─────────────────────────┘
+#                    ┌─────────────────┐         ┌─────────────────────────┐
+#                    │  Unique @AICPU  │         │    ReduceSum @AICORE    │
+#                    └─────────────────┘         └─────────────────────────┘
+# * Constraints：
+# 1- The 'iterations_per_loop' config must be configured to be greater than 1 during pipeline execution
+# 2- Performance gains are possible only when different computing resources are used between different stages
+# 3- Communication under the same communication domain is not allowed between different Stages
 @contextlib.contextmanager
 def npu_stage_scope(stage):
     """