diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh
index b61e09e1f5edf1d7c69a446cdf753e0f4574b72f..fa47899d73b2a79aa1350b619393cd7c410e6871 100644
--- a/tf_adapter/interface_spec/api_npu_config.pyh
+++ b/tf_adapter/interface_spec/api_npu_config.pyh
@@ -20,7 +20,8 @@ class NPURunConfig(run_config_lib.RunConfig):
                 customize_dtypes=None, op_debug_config=None, memory_config=None, experimental_config=None,
                 topo_sorting_mode=None, aoe_config_file=None, insert_op_file=None, stream_sync_timeout=-1,
                 event_sync_timeout=-1, external_weight=False, es_cluster_config=None, deterministic=0,
-                frozen_variable=False, variable_placement="Device", jit_compile="auto", precision_mode_v2=None):
+                frozen_variable=False, variable_placement="Device", jit_compile="auto", precision_mode_v2=None,
+                ac_parallel_enable=None):
 
 class ProfilingConfig():
     def __init__(self, enable_profiling=False, profiling_options=None):
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
index 3cd4f55b2ff752d00c53ec64ad1be0ba040aa14b..f672d6f337c47d696d70a014b888fa9d1d05fa96 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
@@ -63,7 +63,7 @@ class NPURunConfig(run_config_lib.RunConfig):
                  graph_run_mode=1,
                  op_debug_level=None,
                  enable_scope_fusion_passes=None,
-                 enable_exception_dump=0,
+                 enable_exception_dump=2,
                  op_select_implmode=None,
                  optypelist_for_implmode=None,
                  dynamic_input_config=None,
@@ -110,7 +110,8 @@ class NPURunConfig(run_config_lib.RunConfig):
                  frozen_variable=False,
                  variable_placement="Device",
                  jit_compile="auto",
-                 precision_mode_v2=None
+                 precision_mode_v2=None,
+                 ac_parallel_enable=None
                  ):
         """
         Constructs a NPUConfig.
@@ -144,6 +145,7 @@ class NPURunConfig(run_config_lib.RunConfig):
         dump_config: The dump configuration.
         stream_max_parallel_num: Specify the degree of parallelism of the AICPU / AICORE engine
                                  to achieve parallel execution between AICPU / AICORE operators.
+        ac_parallel_enable: Enable engines such as AICPU to run in parallel with other engines in dynamic shape graphs.
         op_select_implmode: Selecting whether the operator is implemented with high_precision
                             or high_performance or high_precision_for_all or high_performance_for_all.
         optypelist_for_implmode: Operator list.
@@ -211,6 +213,7 @@ class NPURunConfig(run_config_lib.RunConfig):
 
         self._dump_config = self._get_dump_config(dump_config)
         self._stream_max_parallel_num = stream_max_parallel_num
+        self._ac_parallel_enable = ac_parallel_enable
 
         self.horovod_mode = self._get_horovod_mode(horovod_mode)
         util.check_nonnegative_integer(graph_run_mode, "graph_run_mode")
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
index f66fd572212d3cd2348548758a61b9aa5b3001d3..c32e72bfaf40b1d8e67eac067a024238003b73dd 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
@@ -631,6 +631,15 @@ class NPUEstimator(estimator_lib.Estimator):
         if config._stream_max_parallel_num is not None:
             custom_op.parameter_map["stream_max_parallel_num"].s = tf.compat.as_bytes(config._stream_max_parallel_num)
 
+    def __load_ac_parallel_enable_config(self, config, custom_op):
+        """Load ac_parallel_enable config and add it to the custom optimizer's parameter_map.
+        Args:
+            config: NPURunConfig; its _ac_parallel_enable is forwarded only when it is not None.
+            custom_op: Custom optimizer whose parameter_map["ac_parallel_enable"].s receives the value as bytes.
+        """
+        if config._ac_parallel_enable is not None:
+            custom_op.parameter_map["ac_parallel_enable"].s = tf.compat.as_bytes(config._ac_parallel_enable)
+
     def __load_ps_mode_config(self, custom_op):
         """Load stream_max_parallel_num config ,and add to custom_optimizers
         Args:
@@ -809,6 +818,8 @@ class NPUEstimator(estimator_lib.Estimator):
         # add stream_max_parallel to custom_op
         self.__load_stream_max_config(config, custom_op)
 
+        self.__load_ac_parallel_enable_config(config, custom_op)
+
         self.__load_ps_mode_config(custom_op)
 
         self._load_op_performance_config(config, custom_op)
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py
index 11623f0326c7d2b95e6328d94d6c3a60b586edcf..60055b205ff620919569b71cf6ce51cd470c6764 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_plugin.py
@@ -63,7 +63,7 @@ def npu_resource_init(graph_run_mode=1,
                       profiling_options=None,
                       precision_mode=None,
                       enable_scope_fusion_passes=None,
-                      enable_exception_dump=0,
+                      enable_exception_dump=2,
                       aoe_mode=None,
                       work_path=None,
                       op_compiler_cache_mode=None,
diff --git a/tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt b/tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt
index 3fe48905e8cc7190c2c3dc1ddbc2cb2327ebb604..3af7fa793b547a6b466dfc67e80fc918b2d0fcbb 100644
--- a/tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt
+++ b/tf_adapter/tests/st/optimizers/pbtxt/om_test_build_geop.pbtxt
@@ -128,6 +128,12 @@ node {
       s: "1"
     }
   }
+  attr {
+    key: "_ac_parallel_enable"
+    value {
+      s: "0"
+    }
+  }
   attr {
     key: "_is_tailing_optimization"
     value {
diff --git a/tf_adapter/tests/ut/optimizers/pbtxt/om_test_build_geop.pbtxt b/tf_adapter/tests/ut/optimizers/pbtxt/om_test_build_geop.pbtxt
index 3fe48905e8cc7190c2c3dc1ddbc2cb2327ebb604..3af7fa793b547a6b466dfc67e80fc918b2d0fcbb 100644
--- a/tf_adapter/tests/ut/optimizers/pbtxt/om_test_build_geop.pbtxt
+++ b/tf_adapter/tests/ut/optimizers/pbtxt/om_test_build_geop.pbtxt
@@ -128,6 +128,12 @@ node {
       s: "1"
     }
   }
+  attr {
+    key: "_ac_parallel_enable"
+    value {
+      s: "0"
+    }
+  }
   attr {
     key: "_is_tailing_optimization"
     value {
diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc
index 75389659fbf951bb9ebc031a5aedf91695f5dd07..eeb2eefd0ba2b81dcc98c53ead0e799f07052013 100644
--- a/tf_adapter/util/ge_plugin.cc
+++ b/tf_adapter/util/ge_plugin.cc
@@ -76,6 +76,7 @@ void SetOptionNameMap(json &option_name_map) {
   option_name_map.emplace(ge::OP_COMPILER_CACHE_MODE, "op_compiler_cache_mode");
   option_name_map.emplace(ge::OP_COMPILER_CACHE_DIR, "op_compiler_cache_dir");
   option_name_map.emplace(ge::STREAM_MAX_PARALLEL_NUM, "stream_max_parallel_num");
+  option_name_map.emplace(ge::AC_PARALLEL_ENABLE, "ac_parallel_enable");
   option_name_map.emplace(ge::HCOM_PARALLEL, "hcom_parallel");
   option_name_map.emplace(ge::HCOM_MULTI_MODE, "hcom_multi_mode");
   option_name_map.emplace(ge::OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION, "is_tailing_optimization");
diff --git a/tf_adapter/util/mbuf_allocator.cc b/tf_adapter/util/mbuf_allocator.cc
index 5999454575e372cd90f76c0875d3b14a4e3bbf2c..089482c6d10a18be734bee39175c6e72f52056b1 100644
--- a/tf_adapter/util/mbuf_allocator.cc
+++ b/tf_adapter/util/mbuf_allocator.cc
@@ -158,7 +158,7 @@ public:
     }
 
 private:
-    bool IsBuffTypeNomal(void *ptr) {
+    bool IsBuffTypeNomal(const void *ptr) {
       BuffTypeInfo buff_type_info{};
       uint32_t len = static_cast<uint32_t>(sizeof(BuffTypeInfo));
       (void) rtBuffGetInfo(RT_BUFF_GET_MBUF_BUILD_INFO, &ptr, sizeof(void *), &buff_type_info, &len);
diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc
index fe9f1c3b480f44b87748151fa8c304bd874879ec..56657a77a46c9421a6a589ad861896fb19184a0b 100644
--- a/tf_adapter/util/npu_attrs.cc
+++ b/tf_adapter/util/npu_attrs.cc
@@ -436,6 +436,7 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
   std::string dump_debug_mode = "all";
   std::string dump_layer;
   std::string stream_max_parallel_num;
+  std::string ac_parallel_enable;
   std::string npuOptimizer;
   std::string is_tailing_optimization = "0";
   std::string op_select_implmode;
@@ -501,6 +502,7 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
       }
     }
     (void) ctx->GetAttr("_stream_max_parallel_num", &stream_max_parallel_num);
+    (void) ctx->GetAttr("_ac_parallel_enable", &ac_parallel_enable);
     (void) ctx->GetAttr("_is_tailing_optimization", &is_tailing_optimization);
     (void) ctx->GetAttr("_op_select_implmode", &op_select_implmode);
     (void) ctx->GetAttr("_optypelist_for_implmode", &optypelist_for_implmode);
@@ -542,6 +544,7 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
   sess_options["ge.exec.variable_acc"] = variable_format_optimize;
   sess_options[ge::HCOM_PARALLEL] = hcom_parallel;
   sess_options[ge::STREAM_MAX_PARALLEL_NUM] = stream_max_parallel_num;
+  sess_options[ge::AC_PARALLEL_ENABLE] = ac_parallel_enable;
   if (!graph_memory_max_size.empty()) {
     sess_options[ge::GRAPH_MEMORY_MAX_SIZE] = graph_memory_max_size;
   }
@@ -619,7 +622,7 @@ std::map<std::string, std::string> NpuAttrs::GetInitOptions(const OpKernelConstr
   std::string graph_run_mode = "1";
   std::string op_debug_level;
   std::string enable_scope_fusion_passes;
-  std::string enable_exception_dump;
+  std::string enable_exception_dump = "2";
   std::string op_compiler_cache_mode;
   std::string op_compiler_cache_dir;
   std::string debug_dir;
@@ -1098,6 +1101,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   std::string dump_data = "tensor";
   std::string dump_layer;
   std::string stream_max_parallel_num;
+  std::string ac_parallel_enable;
   std::string soc_config;
 
   std::string is_tailing_optimization = "0";
@@ -1112,7 +1116,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   std::string graph_run_mode = "1";
   std::string op_debug_level;
   std::string enable_scope_fusion_passes;
-  std::string enable_exception_dump;
+  std::string enable_exception_dump = "2";
   std::string op_select_implmode;
   std::string optypelist_for_implmode;
   std::string input_shape;
@@ -1187,6 +1191,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   auto dump_layer_value = attrs.Find("_dump_layer");
   auto dump_debug_mode_value = attrs.Find("_dump_debug_mode");
   auto stream_max_parallel_num_value = attrs.Find("_stream_max_parallel_num");
+  auto ac_parallel_enable_value = attrs.Find("_ac_parallel_enable");
   auto soc_config_value = attrs.Find("_soc_config");
   auto graph_slice_value = attrs.Find("_graph_slice");
 
@@ -1348,6 +1353,9 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
     if (stream_max_parallel_num_value != nullptr) {
       stream_max_parallel_num = stream_max_parallel_num_value->s();
     }
+    if (ac_parallel_enable_value != nullptr) {
+      ac_parallel_enable = ac_parallel_enable_value->s();
+    }
     if (graph_slice_value != nullptr) {
       graph_slice_mode = graph_slice_value->s();
     }
@@ -1544,6 +1552,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   all_options["variable_format_optimize"] = variable_format_optimize;
   all_options["hcom_parallel"] = hcom_parallel;
   all_options["stream_max_parallel_num"] = stream_max_parallel_num;
+  all_options["ac_parallel_enable"] = ac_parallel_enable;
   if (!graph_memory_max_size.empty()) {
     all_options["graph_memory_max_size"] = graph_memory_max_size;
   }
@@ -1683,6 +1692,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   std::string dump_data = "tensor";
   std::string dump_layer;
   std::string stream_max_parallel_num;
+  std::string ac_parallel_enable;
   std::string soc_config;
   std::string hccl_timeout;
   std::string HCCL_algorithm;
@@ -1719,7 +1729,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   std::string local_device_list;
   bool in_out_pair_flag = true;
   std::string in_out_pair;
-  int64_t enable_exception_dump = 0L;
+  int64_t enable_exception_dump = 2L;
   std::string op_select_implmode;
   std::string optypelist_for_implmode;
   std::string input_shape;
@@ -1832,6 +1842,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
       if (params.count("stream_max_parallel_num") > 0) {
         stream_max_parallel_num = params.at("stream_max_parallel_num").s();
       }
+      if (params.count("ac_parallel_enable") > 0) {
+        ac_parallel_enable = params.at("ac_parallel_enable").s();
+      }
 
       if (params.count("is_tailing_optimization") > 0) {
         is_tailing_optimization = params.at("is_tailing_optimization").b();
@@ -2287,6 +2300,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   sess_options["graph_slice"] = graph_slice_mode;
   sess_options["hcom_parallel"] = std::to_string(static_cast<int32_t>(hcom_parallel));
   sess_options["stream_max_parallel_num"] = stream_max_parallel_num;
+  sess_options["ac_parallel_enable"] = ac_parallel_enable;
   if (!graph_memory_max_size.empty()) {
     sess_options["graph_memory_max_size"] = graph_memory_max_size;
   }
diff --git a/tf_adapter/util/session_manager.cc b/tf_adapter/util/session_manager.cc
index a9fb0481f4da09e83bf4ef3176a8805703277eee..b5b28037731c7a1dcb391d65b5764caf428f4125 100644
--- a/tf_adapter/util/session_manager.cc
+++ b/tf_adapter/util/session_manager.cc
@@ -107,6 +107,8 @@ void SessionManager::PrintGeSessionOptions(std::map<std::string, std::string> &s
 
   // stream max parallel num
   ADP_LOG(INFO) << "[GEOP] stream_max_parallel_num :" << sess_options[ge::STREAM_MAX_PARALLEL_NUM];
+  // ac parallel enable
+  ADP_LOG(INFO) << "[GEOP] ac_parallel_enable :" << sess_options[ge::AC_PARALLEL_ENABLE];
 
   // graph memory configuration
   if (!sess_options[ge::GRAPH_MEMORY_MAX_SIZE].empty()) {
diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
index ce930125ed1425140725d9fbf229470dba22496c..3aa311d512a96afef7d75df7d2ec12a89de14e3d 100644
--- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
+++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
@@ -73,6 +73,7 @@ const std::map<std::string, std::string> kConfigurableOptions = {
   {"op_compiler_cache_mode", ge::OP_COMPILER_CACHE_MODE},
   {"op_compiler_cache_dir", ge::OP_COMPILER_CACHE_DIR},
   {"stream_max_parallel_num", ge::STREAM_MAX_PARALLEL_NUM},
+  {"ac_parallel_enable", ge::AC_PARALLEL_ENABLE},
   {"hcom_parallel", ge::HCOM_PARALLEL},
   {"hcom_multi_mode", ge::HCOM_MULTI_MODE},
   {"is_tailing_optimization", ge::OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION},
diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py
index 2015bab1de4ee2748f6990ef9bd9f5b9d9d79e5b..11522a749f7b9224b0dbfc44d6cd1d458f006a1c 100644
--- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py
+++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py
@@ -45,6 +45,7 @@ class NpuConfig(NpuBaseConfig):
         self.op_compiler_cache_mode = OptionValue('enable', ['enable', 'disable', 'force'])
         self.op_compiler_cache_dir = OptionValue(None, None)
         self.stream_max_parallel_num = OptionValue(None, None)
+        self.ac_parallel_enable = OptionValue(None, ['0', '1'])
         self.hcom_parallel = OptionValue(True, [True, False])
         self.hcom_multi_mode = OptionValue(None, None)
         self.is_tailing_optimization = OptionValue(False, [True, False])
@@ -52,7 +53,7 @@ class NpuConfig(NpuBaseConfig):
         self.op_debug_config = OptionValue(None, None)
         self.debug_dir = OptionValue(None, None)
         self.modify_mixlist = OptionValue(None, None)
-        self.enable_exception_dump = OptionValue(0, [0, 1])
+        self.enable_exception_dump = OptionValue(2, [0, 1, 2])
         self.dump_config = NpuDumpConfig()
         self.aoe_config = NpuAoeConfig()
         self.profiling_config = NpuProfilingConfig()
diff --git a/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py b/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py
index 4915ce9e05b5fb87b6ddcac122f1eb25a458de2c..45b7b9397f9ef2658e8b63a438b941fbdcf16e81 100644
--- a/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py
+++ b/tf_adapter_2.x/python/npu_device/train/optimizer/npu_loss_scale_optimizer.py
@@ -108,7 +108,7 @@ class NpuLossScaleOptimizer(tf.keras.mixed_precision.LossScaleOptimizer):
                         name=None):
         """Apply gradients on variables"""
         if global_npu_ctx() is None or is_inf_nan_enabled():
-            super().apply_gradients(grads_and_vars, name)
+            return super().apply_gradients(grads_and_vars, name)
 
         grads_and_vars = tuple(grads_and_vars)  # grads_and_vars origin type is zip and can only be iter once
         grads = [g for g, _ in grads_and_vars]
@@ -159,8 +159,8 @@ class NpuExperimentalLossScaleOptimizer(tf.keras.mixed_precision.experimental.Lo
                         grads_and_vars,
                         name=None):
         """Apply gradients on variables"""
-        if global_npu_ctx() is None:
-            super().apply_gradients(grads_and_vars, name)
+        if global_npu_ctx() is None or is_inf_nan_enabled():
+            return super().apply_gradients(grads_and_vars, name)
 
         grads_and_vars = tuple(grads_and_vars)  # grads_and_vars origin type is zip and can only be iter once
         grads = [g for g, _ in grads_and_vars]
diff --git a/tf_adapter_2.x/tests/stub/include/stub/defines.h b/tf_adapter_2.x/tests/stub/include/stub/defines.h
index 1e35a910185186b67e7e441e63aa7a0e8c2ac75f..b5fbeb0e0b42f4060c49c3a5ccaa5b2177c1eac8 100644
--- a/tf_adapter_2.x/tests/stub/include/stub/defines.h
+++ b/tf_adapter_2.x/tests/stub/include/stub/defines.h
@@ -168,6 +168,10 @@ const std::string GE_FE_FLAG = "ge.feFlag";
 // this option is to obtain stream max parallel num
 const std::string STREAM_MAX_PARALLEL_NUM = "ge.streamMaxParallelNum";
 
+// Configures whether engines such as AICPU run in parallel with other engines in dynamic shape graphs.
+// Its value should be "0" or "1"; the default value is "0".
+const std::string AC_PARALLEL_ENABLE = "ac_parallel_enable";
+
 // congigure outputDatatype to setting net output type
 const std::string OUTPUT_DATATYPE = "ge.outputDatatype";