diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh
index 3d36a2245917975cbbf36ea6e05c3a8c427e2f18..4bb2268e23d97b02b9d9673debd40382ab8dd0fe 100644
--- a/tf_adapter/interface_spec/api_npu_config.pyh
+++ b/tf_adapter/interface_spec/api_npu_config.pyh
@@ -23,7 +23,8 @@ class NPURunConfig(run_config_lib.RunConfig):
                  frozen_variable=False, variable_placement="Device", jit_compile="auto", precision_mode_v2=None,
                  ac_parallel_enable=None, quant_dumpable=None, input_fusion_size=131072, compile_dynamic_mode=None,
                  graph_max_parallel_model_num=1, export_compile_stat=1, aicore_num=None,
-                 oo_constant_folding=True, input_batch_cpy=False, shape_generalization_mode="STRICT", all_tensor_not_empty=False):
+                 oo_constant_folding=True, input_batch_cpy=False, shape_generalization_mode="STRICT", all_tensor_not_empty=False,
+                 auto_multistream_parallel_mode=None):

 class ProfilingConfig():
     def __init__(self, enable_profiling=False, profiling_options=None):
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
index a38214b557b8e8e8d3c91e480729456ef3757b0f..4523153abe5a8ca45cf8e24149dac28510b2ae03 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
@@ -120,7 +120,8 @@ class NPURunConfig(run_config_lib.RunConfig):
                  oo_constant_folding=True,
                  input_batch_cpy=False,
                  shape_generalization_mode="STRICT",
-                 all_tensor_not_empty=False
+                 all_tensor_not_empty=False,
+                 auto_multistream_parallel_mode=None
                  ):
         """
         Constructs a NPUConfig.
@@ -197,6 +198,7 @@ class NPURunConfig(run_config_lib.RunConfig):
             FULL: full generalization;
             ADAPTIVE: generalizes the varying axes.
         all_tensor_not_empty: default is: False.
+        auto_multistream_parallel_mode: default is None; cv: cube vector parallel.
         """

         # Check iterations_per_loop.
@@ -299,6 +301,7 @@ class NPURunConfig(run_config_lib.RunConfig):
         self._input_batch_cpy = input_batch_cpy
         self._shape_generalization_mode = shape_generalization_mode
         self._all_tensor_not_empty = all_tensor_not_empty
+        self._auto_multistream_parallel_mode = auto_multistream_parallel_mode
         super(NPURunConfig, self).__init__(
             model_dir=model_dir,
             tf_random_seed=tf_random_seed,
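The two Python-side hunks above expose the new `auto_multistream_parallel_mode` argument on the TF1 `NPURunConfig` (default `None`; `"cv"` selects cube/vector parallelism). A minimal usage sketch under those assumptions; the `model_dir` path is illustrative only, everything else keeps its default:

```python
from npu_bridge.estimator.npu.npu_config import NPURunConfig

# Sketch: enable cube/vector multistream parallelism via the new option.
# All other NPURunConfig arguments keep their defaults.
run_config = NPURunConfig(
    model_dir="/tmp/npu_model",            # illustrative path
    auto_multistream_parallel_mode="cv",   # new in this patch; None leaves it off
)
```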
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
index 52763a15a213fbad5ccbc485e17c6d92ecb5e0b6..5b8f23af227751370b096ad334c4e797c4883c79 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
@@ -853,6 +853,9 @@ class NPUEstimator(estimator_lib.Estimator):
                 config._shape_generalization_mode)
         if config._all_tensor_not_empty is not None:
             custom_op.parameter_map["all_tensor_not_empty"].b = config._all_tensor_not_empty
+        if config._auto_multistream_parallel_mode is not None:
+            custom_op.parameter_map["auto_multistream_parallel_mode"].s = tf.compat.as_bytes(
+                config._auto_multistream_parallel_mode)

         self.__load_session_device_id(config, custom_op)
         self.__load_modify_mixlist(config, custom_op)
diff --git a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc
index fd47759aeaf99525953a9a372c7129f819c08c5a..9c60554e78f76930e6631e8e93ca162a3260ee4c 100644
--- a/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc
+++ b/tf_adapter/tests/st/util/testcase/npu_attrs_test.cc
@@ -533,5 +533,41 @@ TEST_F(NpuAttrTest, GetAllAttrOptions_all_tensor_not_empty) {
   EXPECT_NE(all_options.find("ge.exec.allTensorNotEmpty"), all_options.cend());
 }

+TEST_F(NpuAttrTest, SetNpuOptimizerAttr_auto_multistream_parallel_mode) {
+  GraphOptimizationPassOptions options;
+  SessionOptions session_options;
+  session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_do_function_inlining(true);
+  auto *custom_config =
+      session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers();
+  custom_config->set_name("NpuOptimizer");
+  options.session_options = &session_options;
+
+  AttrValue auto_multistream_parallel_mode = AttrValue();
+  auto_multistream_parallel_mode.set_s("cv");
+  (*custom_config->mutable_parameter_map())["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+
+  AttrValue jit_compile = AttrValue();
+  jit_compile.set_s("2");
+  (*custom_config->mutable_parameter_map())["jit_compile"] = jit_compile;
+  Status s = NpuAttrs::SetNpuOptimizerAttr(options, reinterpret_cast<Node *>(1));
+  EXPECT_EQ(s.ok(), false);
+}
+
+TEST_F(NpuAttrTest, GetAllAttrOptions_auto_multistream_parallel_mode) {
+  AttrValueMap attr_map;
+
+  AttrValue npu_optimizer = AttrValue();
+  npu_optimizer.set_s("NpuOptimizer");
+  attr_map["_NpuOptimizer"] = npu_optimizer;
+
+  AttrValue auto_multistream_parallel_mode = AttrValue();
+  auto_multistream_parallel_mode.set_s("cv");
+  attr_map["_auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+
+  AttrSlice attrs(&attr_map);
+  const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs);
+  EXPECT_NE(all_options.find("ge.autoMultistreamParallelMode"), all_options.cend());
+}
+
 }
 }  // end tensorflow
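As the estimator hunk shows, `NPUEstimator` writes the option into the `NpuOptimizer` `parameter_map` only when it is not `None`, and the ST test builds the same structure by hand. For a plain TF1 session the equivalent wiring looks roughly like this (a sketch mirroring the test setup above, not an excerpt from the adapter docs):

```python
import tensorflow as tf

config = tf.ConfigProto()
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
# String-valued option, serialized exactly as NPUEstimator does it.
custom_op.parameter_map["auto_multistream_parallel_mode"].s = tf.compat.as_bytes("cv")

with tf.Session(config=config) as sess:
    pass  # build and run the graph as usual
```

Note that the negative test pairs the new option with an invalid `jit_compile="2"`, so `SetNpuOptimizerAttr` is expected to fail; the positive coverage comes from the `GetAllAttrOptions` case.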
diff --git a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
index 006141158805f89fe9b63d320456f2237ca52830..8ecc3801df1d7805bc6e0e7838726b38a408ce19 100644
--- a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
+++ b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
@@ -671,5 +671,41 @@ TEST_F(NpuAttrTest, GetAllAttrOptions_all_tensor_not_empty) {
   const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs);
   EXPECT_NE(all_options.find("ge.exec.allTensorNotEmpty"), all_options.cend());
 }
+
+TEST_F(NpuAttrTest, SetNpuOptimizerAttr_auto_multistream_parallel_mode) {
+  GraphOptimizationPassOptions options;
+  SessionOptions session_options;
+  session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_do_function_inlining(true);
+  auto *custom_config =
+      session_options.config.mutable_graph_options()->mutable_rewrite_options()->add_custom_optimizers();
+  custom_config->set_name("NpuOptimizer");
+  options.session_options = &session_options;
+
+  AttrValue auto_multistream_parallel_mode = AttrValue();
+  auto_multistream_parallel_mode.set_s("cv");
+  (*custom_config->mutable_parameter_map())["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+
+  AttrValue jit_compile = AttrValue();
+  jit_compile.set_s("2");
+  (*custom_config->mutable_parameter_map())["jit_compile"] = jit_compile;
+  Status s = NpuAttrs::SetNpuOptimizerAttr(options, reinterpret_cast<Node *>(1));
+  EXPECT_EQ(s.ok(), false);
+}
+
+TEST_F(NpuAttrTest, GetAllAttrOptions_auto_multistream_parallel_mode) {
+  AttrValueMap attr_map;
+
+  AttrValue npu_optimizer = AttrValue();
+  npu_optimizer.set_s("NpuOptimizer");
+  attr_map["_NpuOptimizer"] = npu_optimizer;
+
+  AttrValue auto_multistream_parallel_mode = AttrValue();
+  auto_multistream_parallel_mode.set_s("cv");
+  attr_map["_auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+
+  AttrSlice attrs(&attr_map);
+  const auto &all_options = NpuAttrs::GetAllAttrOptions(attrs);
+  EXPECT_NE(all_options.find("ge.autoMultistreamParallelMode"), all_options.cend());
+}
 }
 }  // end tensorflow
diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc
index 62ced776af7d686be43f69242377ab2ac945148c..2ae110fd7a70a701f21fa2cc7446b757ad6caed9 100644
--- a/tf_adapter/util/ge_plugin.cc
+++ b/tf_adapter/util/ge_plugin.cc
@@ -130,6 +130,7 @@ void SetOptionNameMap(json &option_name_map) {
   option_name_map.emplace(ge::AICORE_NUM, "aicore_num");
   option_name_map.emplace("ge.inputBatchCpy", "input_batch_cpy");
   option_name_map.emplace(ge::OPTION_ALL_TENSOR_NOT_EMPTY, "all_tensor_not_empty");
+  option_name_map.emplace("ge.autoMultistreamParallelMode", "auto_multistream_parallel_mode");
 }
 }  // namespace
diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc
index bfbb0063a8d00991c4810722efba9f25b7818346..7a2942ff59543a3c3b775369ecfd991ad53ffee4 100644
--- a/tf_adapter/util/npu_attrs.cc
+++ b/tf_adapter/util/npu_attrs.cc
@@ -508,6 +508,7 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
   std::string jit_compile;
   std::string aicore_num;
   std::string all_tensor_not_empty;
+  std::string auto_multistream_parallel_mode;
   if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) {
     (void) ctx->GetAttr("_variable_format_optimize", &variable_format_optimize);
     (void) ctx->GetAttr("_hcom_parallel", &hcom_parallel);
@@ -584,6 +585,7 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
     (void) ctx->GetAttr("_input_batch_cpy", &input_batch_cpy);
     (void) ctx->GetAttr("_aicore_num", &aicore_num);
     (void) ctx->GetAttr("_all_tensor_not_empty", &all_tensor_not_empty);
+    (void) ctx->GetAttr("_auto_multistream_parallel_mode", &auto_multistream_parallel_mode);
   }

   // session options
@@ -654,6 +656,8 @@ std::map<std::string, std::string> NpuAttrs::GetSessOptions(const OpKernelConstr
   sess_options["input_batch_cpy"] = input_batch_cpy;
   sess_options[ge::OPTION_ALL_TENSOR_NOT_EMPTY] = all_tensor_not_empty;
   sess_options["all_tensor_not_empty"] = all_tensor_not_empty;
+  sess_options["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+  sess_options["ge.autoMultistreamParallelMode"] = auto_multistream_parallel_mode;
   SetForbiddenClosePassOn(sess_options);
   sess_options["aicore_num"] = aicore_num;
   sess_options["ge.aicoreNum"] = aicore_num;
@@ -1268,6 +1272,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   std::string input_batch_cpy;
   std::string shape_generalization_mode = "STRICT";
   std::string all_tensor_not_empty;
+  std::string auto_multistream_parallel_mode;

   auto NpuOptimizer_value = attrs.Find("_NpuOptimizer");
   auto enable_data_pre_proc_value = attrs.Find("_enable_data_pre_proc");
@@ -1369,6 +1374,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   auto input_batch_cpy_value = attrs.Find("_input_batch_cpy");
   auto shape_generalization_mode_value = attrs.Find("_shape_generalization_mode");
   auto all_tensor_not_empty_value = attrs.Find("_all_tensor_not_empty");
+  auto auto_multistream_parallel_mode_value = attrs.Find("_auto_multistream_parallel_mode");
   if (NpuOptimizer_value != nullptr) {
     do_npu_optimizer = "1";
     if (enable_data_pre_proc_value != nullptr) {
@@ -1692,6 +1698,9 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
     if (all_tensor_not_empty_value != nullptr) {
       all_tensor_not_empty = all_tensor_not_empty_value->s();
     }
+    if (auto_multistream_parallel_mode_value != nullptr) {
+      auto_multistream_parallel_mode = auto_multistream_parallel_mode_value->s();
+    }
   }

   all_options["variable_format_optimize"] = variable_format_optimize;
@@ -1809,6 +1818,8 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   all_options["ge.aicoreNum"] = aicore_num;
   all_options[ge::OPTION_ALL_TENSOR_NOT_EMPTY] = all_tensor_not_empty;
   all_options["all_tensor_not_empty"] = all_tensor_not_empty;
+  all_options["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+  all_options["ge.autoMultistreamParallelMode"] = auto_multistream_parallel_mode;
   if (!oo_constant_folding.empty()) {
     all_options["oo_constant_folding"] = oo_constant_folding;
     all_options["ge.oo.constantFolding"] = oo_constant_folding;
@@ -1946,6 +1957,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   bool input_batch_cpy = false;
   std::string shape_generalization_mode = "STRICT";
   bool all_tensor_not_empty = false;
+  std::string auto_multistream_parallel_mode;
   const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options();
   for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) {
     if (custom_optimizer.name() == "NpuOptimizer") {
@@ -2517,6 +2529,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
       if (params.count("all_tensor_not_empty") > 0) {
         all_tensor_not_empty = params.at("all_tensor_not_empty").b();
       }
+      if (params.count("auto_multistream_parallel_mode") > 0) {
+        auto_multistream_parallel_mode = params.at("auto_multistream_parallel_mode").s();
+      }
       // input_batch_cpy
       if (params.count("input_batch_cpy") > 0) {
         input_batch_cpy = params.at("input_batch_cpy").b();
@@ -2618,6 +2633,8 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   graph_options["input_shape"] = input_shape;
   graph_options["dynamic_dims"] = dynamic_dims;
   graph_options["dynamic_node_type"] = std::to_string(dynamic_node_type);
+  sess_options["auto_multistream_parallel_mode"] = auto_multistream_parallel_mode;
+  sess_options["ge.autoMultistreamParallelMode"] = auto_multistream_parallel_mode;
   init_options_["profiling_mode"] = std::to_string(static_cast<int>(profiling_mode));
   init_options_[ge::OPTION_EXEC_PROFILING_MODE] = std::to_string(static_cast<int>(profiling_mode));
   init_options_["profiling_options"] = profiling_options;
diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
index 553569f0dd134abeabd3b50d26bb2bce77d94e8f..2671a05795a60610450629d031485f5d64104d9f 100644
--- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
+++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp
@@ -151,7 +151,8 @@ const std::map<std::string, std::string> kSessionConfigOptions = {
   {"graph_slice", "ge.graphSliceMode"},
   {"input_fusion_size", "ge.exec.input_fusion_size"},
   {"compile_dynamic_mode", "ge.compile_dynamic_mode"},
-  {"all_tensor_not_empty", ge::OPTION_ALL_TENSOR_NOT_EMPTY}
+  {"all_tensor_not_empty", ge::OPTION_ALL_TENSOR_NOT_EMPTY},
+  {"auto_multistream_parallel_mode", "ge.autoMultistreamParallelMode"}
 };
 }  // namespace
diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py
index 3293e3c32419120bea37ea200e610eec41d00978..b18bf56d9ac7f425ce7e7de028f85b708312b028 100644
--- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py
+++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py
@@ -85,5 +85,6 @@ class NpuConfig(NpuBaseConfig):
         self.input_batch_cpy = OptionValue(False, [True, False])
         self.shape_generalization_mode = OptionValue("STRICT", ["STRICT", "FULL", "ADAPTIVE"])
         self.all_tensor_not_empty = OptionValue(False, [True, False])
+        self.auto_multistream_parallel_mode = OptionValue(None, ['cv'])

         super(NpuConfig, self).__init__()
diff --git a/tf_adapter_2.x/tests/stub/include/stub/defines.h b/tf_adapter_2.x/tests/stub/include/stub/defines.h
index 45dc1da2a165308d6ec69b165bba9a5b5f40ef23..c29da28aa2e0bac75b6aa83f249002bcaa637383 100644
--- a/tf_adapter_2.x/tests/stub/include/stub/defines.h
+++ b/tf_adapter_2.x/tests/stub/include/stub/defines.h
@@ -66,6 +66,7 @@ const char *const OPTION_EXEC_LOGICAL_DEVICE_ID = "ge.exec.logicalDeviceId";
 const char *const OPTION_EXEC_MODEL_DEPLOY_MODE = "ge.exec.modelDeployMode";
 const char *const OPTION_EXEC_MODEL_DEPLOY_DEVICELIST = "ge.exec.modelDeployDevicelist";
 const char *const OPTION_ALL_TENSOR_NOT_EMPTY = "ge.exec.allTensorNotEmpty";
+const char *const OPTION_AUTO_MULTISTREAM_PARALLEL_MODE = "ge.autoMultistreamParallelMode";

 // Option key: memory init
 const char *const GRAPH_MEMORY_MAX_SIZE = "ge.graphMemoryMaxSize";
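On the tf_adapter_2.x side, the option surfaces through `NpuConfig` and is forwarded to GE as `ge.autoMultistreamParallelMode` via `kSessionConfigOptions`. A minimal TF2 sketch, assuming the usual `npu_device` entry points (`global_options()` and `open()`, which are not shown in this diff):

```python
import npu_device

# Sketch: set the option before opening the device. The OptionValue declared
# in npu_config.py only accepts 'cv' (or the default None, meaning "off").
npu_device.global_options().auto_multistream_parallel_mode = "cv"
npu_device.open().as_default()
```

Setting any value outside the declared list should be rejected by the `OptionValue` check rather than silently passed through to GE.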