From 1eaa2b174e2097b1a55bc81d6491a8019749b8da Mon Sep 17 00:00:00 2001
From: htwang <onehaitao@foxmail.com>
Date: Tue, 25 Feb 2025 10:04:49 +0800
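Subject: [PATCH 1/4] Load pretrained models from local files only

Pass local_files_only=True to the from_pretrained() calls in the Sana
patch and the HunyuanVideo text-encoder converter, and force
config["local_files_only"] = True in TextEncoder._init_text_encoder so
these code paths only read checkpoints that are already on disk.

Also in this change:
  * add a "security configuration" argument group with the
    --trust-remote-files flag (default: False);
  * rename the "mm_logging" argument group title to "logging";
  * re-wrap the existing from_pretrained() calls in the InternVL and
    LLaVA converters and strip trailing whitespace (no functional change);
  * add a "mindspeed" symlink pointing to ../MindSpeed-3f09d67/mindspeed/.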
---
 checkpoint/internvl2_hf_to_mm.py              | 10 ++++---
 checkpoint/utils.py                           |  8 +++---
 examples/diffusers/sana/patch_sana.py         |  3 ++-
 examples/hunyuanvideo/convert_ckpt_to_mm.py   | 23 ++++++++--------
 .../internvl2.5_convert_to_mm_ckpt.py         | 26 ++++++++++---------
 .../internvl2/internvl2_convert_to_mm_ckpt.py |  6 +++--
 examples/llava1.5/vicuna_converter.py         | 10 ++++---
 mindspeed                                     |  1 +
 mindspeed_mm/arguments.py                     | 18 ++++++++++---
 .../models/text_encoder/text_encoder.py       | 18 ++++++-------
 mindspeed_mm/models/text_encoder/tokenizer.py |  6 ++---
 .../test_text_encoder_processor.py            |  2 +-
 12 files changed, 77 insertions(+), 54 deletions(-)
 create mode 120000 mindspeed

diff --git a/checkpoint/internvl2_hf_to_mm.py b/checkpoint/internvl2_hf_to_mm.py
index 62f037aa..35f25e6e 100644
--- a/checkpoint/internvl2_hf_to_mm.py
+++ b/checkpoint/internvl2_hf_to_mm.py
@@ -17,8 +17,10 @@ from checkpoint.utils import ConvertVppMMConfig
 
 def load_from_hf(load_dir, trust_remote_code):
     # Load Huggingface model.
-    hf_model = AutoModelForCausalLM.from_pretrained(load_dir, device_map='cpu', trust_remote_code=trust_remote_code,
-                                                    local_files_only=True)
+    hf_model = AutoModelForCausalLM.from_pretrained(
+        load_dir, device_map='cpu',
+        trust_remote_code=trust_remote_code,
+        local_files_only=True)
     print(hf_model)
 
     return hf_model
@@ -274,7 +276,7 @@ def main(convert_config: ConvertVppMMConfig):
 
     pp_size = parallel_config.pp_size
     vp_size = parallel_config.vpp_size
-    
+
     pp_split = merge_pp_index(
         vit_pipeline_num_layers=parallel_config.vit_pp_layers,
         llm_pipeline_num_layers=parallel_config.llm_pp_layers
@@ -288,7 +290,7 @@ def main(convert_config: ConvertVppMMConfig):
     if len(remains) > 0:
         print(remains)
         raise RuntimeWarning("There are some weights ungrouped.")
-    
+
     for rank, pipeline_state_dict in enumerate(pipeline_state_dicts):
         print(20 * '#', f'stage {rank}', 20 * '#')
         for key, value in pipeline_state_dict.items():
diff --git a/checkpoint/utils.py b/checkpoint/utils.py
index ee702b84..b995beaa 100644
--- a/checkpoint/utils.py
+++ b/checkpoint/utils.py
@@ -42,7 +42,7 @@ class ParallelConfig(BaseModel):
         if len(self.vit_pp_layers) < 1:
             raise ValueError("pp layers长度至少为1")
         return self
-    
+
 
 class VppParallelConfig(BaseModel):
     """权模型切分配置，包括tp的size，以及pp切分时vit和llm在pp域每张卡上切分的层数"""
@@ -59,7 +59,7 @@ class VppParallelConfig(BaseModel):
     @computed_field
     def pp_size(self) -> PositiveInt:
         return len(self.llm_pp_layers[0])
-    
+
     @computed_field
     def vpp_size(self) -> PositiveInt:
         return len(self.llm_pp_layers)
@@ -71,7 +71,7 @@ class VppParallelConfig(BaseModel):
         if len(self.vit_pp_layers) < 1:
             raise ValueError("pp layers长度至少为1")
         return self
-    
+
     @model_validator(mode='after')
     def validate_vpp_layers(self) -> "VppParallelConfig":
         pp_size = self.pp_size
@@ -147,7 +147,7 @@ class ConvertResplitConfig(BaseModel):
         if sum(self.source_parallel_config.llm_pp_layers) != sum(self.target_parallel_config.llm_pp_layers):
             raise ValueError("llm pp layers not equal!")
         return self
-    
+
 
 # BaseModel/dataclasses注意要在field的下一行添加描述说明
 class ConvertVppMMConfig(BaseModel):
diff --git a/examples/diffusers/sana/patch_sana.py b/examples/diffusers/sana/patch_sana.py
index 7f943b60..362cd5fc 100644
--- a/examples/diffusers/sana/patch_sana.py
+++ b/examples/diffusers/sana/patch_sana.py
@@ -72,7 +72,8 @@ def create_load_model_hook(
                     raise ValueError(f"unexpected save model: {model.__class__}")
         else:
             transformer_ = SanaTransformer2DModel.from_pretrained(
-                args.pretrained_model_name_or_path, subfolder="transformer"
+                args.pretrained_model_name_or_path, subfolder="transformer",
+                local_files_only=True
             )
 
         # Make sure the trainable params are in float32. This is again needed since the base models
diff --git a/examples/hunyuanvideo/convert_ckpt_to_mm.py b/examples/hunyuanvideo/convert_ckpt_to_mm.py
index c12ee1b8..2824b2bf 100644
--- a/examples/hunyuanvideo/convert_ckpt_to_mm.py
+++ b/examples/hunyuanvideo/convert_ckpt_to_mm.py
@@ -40,7 +40,8 @@ def preprocess_text_encoder_tokenizer(source_dir, save_dir):
     model = LlavaForConditionalGeneration.from_pretrained(
         source_dir,
         torch_dtype=torch.float16,
-        low_cpu_mem_usage=True
+        low_cpu_mem_usage=True,
+        local_files_only=True
     )
     model.language_model.save_pretrained(save_dir)
     processor.tokenizer.save_pretrained(save_dir)
@@ -122,7 +123,7 @@ def get_tp_split_layer_names(
             f"single_blocks.{index}.linear1.weight",
             f"single_blocks.{index}.linear1.bias",
         ]
-    
+
     return (
         column_parallel_linears,
         row_parallel_linears,
@@ -165,7 +166,7 @@ def split_by_tp(
             new_state_dict[name] = torch.chunk(state_dict[name], tp_size, dim=0)[tp_rank]
         for name in row_parallel_linears:
             new_state_dict[name] = torch.chunk(state_dict[name], tp_size, dim=1)[tp_rank]
-        
+
         for name in qkv_fused_projs:
             wq, wk, wv = torch.chunk(state_dict[name], 3, dim=0)
             wq = torch.chunk(wq, tp_size, dim=0)[tp_rank]
@@ -210,7 +211,7 @@ def merge_by_tp(
 
     if tp_size == 1:
         return state_dicts
-    
+
     merged_state_dict = copy.deepcopy(state_dicts[0])
     (
         column_parallel_linears,
@@ -234,7 +235,7 @@ def merge_by_tp(
             [state_dicts[tp_rank][name] for tp_rank in range(tp_size)],
             dim=1
         )
-    
+
     for name in qkv_fused_projs:
         wq = torch.cat(
             [torch.chunk(state_dicts[tp_rank][name], 3, dim=0)[0] for tp_rank in range(tp_size)],
@@ -270,7 +271,7 @@ def merge_by_tp(
             dim=0
         )
         merged_state_dict[name] = torch.cat([wq, wk, wv, wmlp], dim=0)
-    
+
     for name in x_mlp_fused_row_parallel_linear:
         wx = torch.cat(
             [state_dicts[tp_rank][name][:, :hidden_size // tp_size] for tp_rank in range(tp_size)],
@@ -281,7 +282,7 @@ def merge_by_tp(
             dim=1,
         )
         merged_state_dict[name] = torch.cat([wx, wmlp], dim=1)
-    
+
     return merged_state_dict
 
 
@@ -301,7 +302,7 @@ def load_state_dicts_by_tp(load_dir: str, tp_size: int = 2) -> List[Dict[str, An
         state_dict_path = os.path.join(load_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt")
         tp_state_dicts.append(torch.load(state_dict_path)['model'])
 
-    return tp_state_dicts    
+    return tp_state_dicts
 
 
 def save(state_dicts: List[Dict], save_dir: str, latest_checkpointed_iteration="release"):
@@ -316,7 +317,7 @@ def save(state_dicts: List[Dict], save_dir: str, latest_checkpointed_iteration="
         directory = 'release'
     else:
         directory = 'iter_{:07d}'.format(latest_checkpointed_iteration)
-    
+
     for tp_rank, state_dict in enumerate(state_dicts):
         os.makedirs(os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}"))
         save_path = os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt")
@@ -331,7 +332,7 @@ def get_args():
     parser.add_argument("--source_path", type=str, default="./transformers/mp_rank_00/model_states.pt", help="Source path of checkpoint")
     parser.add_argument("--target_path", type=str, default="./ckpt/hunyuanvideo/", help="Save path of MM checkpoint")
     parser.add_argument("--tp_size", type=int, default=2, help="Tensor model parallel world size")
-    parser.add_argument("--mode", type=str, default="split", choices=["split", "merge"], 
+    parser.add_argument("--mode", type=str, default="split", choices=["split", "merge"],
         help="Split mode is used to split the pretrained weights according to tp_size before training, \
         and Merge mode is used to merge weights based on tp_size after training is completed")
 
@@ -345,7 +346,7 @@ if __name__ == "__main__":
     if args.module == "text_encoder":
         preprocess_text_encoder_tokenizer(args.source_path, args.target_path)
     else:
-        if args.mode == "split":    
+        if args.mode == "split":
             source_state_dict = torch.load(args.source_path, map_location='cpu')['module']
             state_dict = replace_state_dict(source_state_dict, convert_mapping=DIT_CONVERT_MAPPING)
             state_dicts = split_by_tp(
diff --git a/examples/internvl2.5/internvl2.5_convert_to_mm_ckpt.py b/examples/internvl2.5/internvl2.5_convert_to_mm_ckpt.py
index a7be244a..31b5113d 100644
--- a/examples/internvl2.5/internvl2.5_convert_to_mm_ckpt.py
+++ b/examples/internvl2.5/internvl2.5_convert_to_mm_ckpt.py
@@ -3,7 +3,7 @@ import os
 from copy import deepcopy
 from dataclasses import dataclass
 import stat
-import re 
+import re
 
 import torch
 from transformers import AutoModelForCausalLM, AutoConfig
@@ -78,8 +78,10 @@ model_config_dict = {
 
 def load_from_hf(load_dir, trust_remote_code):
     # Load Huggingface model.
-    hf_model = AutoModelForCausalLM.from_pretrained(load_dir, device_map='cpu', trust_remote_code=trust_remote_code,
-                                                    local_files_only=True)
+    hf_model = AutoModelForCausalLM.from_pretrained(
+        load_dir, device_map='cpu',
+        trust_remote_code=trust_remote_code,
+        local_files_only=True)
     print(hf_model)
     config = AutoConfig.from_pretrained(load_dir, trust_remote_code=trust_remote_code)
     global llm_arch
@@ -187,7 +189,7 @@ def convert_hg_to_mm(_state_dict, model_config, num_key_value_heads):
                 new_key = new_key.replace('post_attention_layernorm', 'pre_mlp_layernorm')
                 new_key = new_key.replace('gate_proj', 'linear_fc1_gate')
                 new_key = new_key.replace('up_proj', 'linear_fc1_up')
-                new_key = new_key.replace('down_proj', 'linear_fc2')                
+                new_key = new_key.replace('down_proj', 'linear_fc2')
                 new_key = new_key.replace('model.norm', 'decoder.final_layernorm')
                 new_key = new_key.replace('model.embed_tokens', 'embedding.word_embeddings')
 
@@ -222,10 +224,10 @@ def convert_hg_to_mm(_state_dict, model_config, num_key_value_heads):
                 wv = new_dict[v_name]
             else:
                 raise AssertionError(f'Missing key {v_name}')
-            
+
             q_chunks = torch.chunk(wq, num_key_value_heads, dim=0)
             k_chunks = torch.chunk(wk, num_key_value_heads, dim=0)
-            v_chunks = torch.chunk(wv, num_key_value_heads, dim=0)            
+            v_chunks = torch.chunk(wv, num_key_value_heads, dim=0)
             all_chunks = []
             for j in range(num_key_value_heads):
                 all_chunks.append(q_chunks[j])
@@ -260,10 +262,10 @@ def convert_hg_to_mm(_state_dict, model_config, num_key_value_heads):
                 wv = new_dict[v_name]
             else:
                 raise AssertionError(f'Missing key {v_name}')
-            
+
             q_chunks = torch.chunk(wq, num_key_value_heads, dim=0)
             k_chunks = torch.chunk(wk, num_key_value_heads, dim=0)
-            v_chunks = torch.chunk(wv, num_key_value_heads, dim=0)            
+            v_chunks = torch.chunk(wv, num_key_value_heads, dim=0)
             all_chunks = []
             for j in range(num_key_value_heads):
                 all_chunks.append(q_chunks[j])
@@ -276,7 +278,7 @@ def convert_hg_to_mm(_state_dict, model_config, num_key_value_heads):
             if k_name in new_dict:
                 new_dict.pop(k_name)
             if v_name in new_dict:
-                new_dict.pop(v_name)            
+                new_dict.pop(v_name)
 
 
     # 合并mlp的gate和up权重
@@ -433,17 +435,17 @@ if __name__ == '__main__':
     model_config = get_model_config(
         args.model_size, args.vpp)
     pp_split = merge_pp_index(model_config)
-    
+
     for key, value in state_dict.items():
         print(key, value.shape)
     state_dict = convert_hg_to_mm(state_dict, model_config, num_key_value_heads)
     pipeline_state_dicts, remains = split_model_by_pipeline(state_dict, pp_split)
-    
+
     if len(remains) > 0:
         print(remains)
         raise RuntimeWarning("There are some weights ungrouped.")
 
-    
+
     for rank, pipeline_state_dict in enumerate(pipeline_state_dicts):
         print(20 * '#', f'stage {rank}', 20 * '#')
         for key, value in pipeline_state_dict.items():
diff --git a/examples/internvl2/internvl2_convert_to_mm_ckpt.py b/examples/internvl2/internvl2_convert_to_mm_ckpt.py
index 6c6379d2..b2346ab2 100644
--- a/examples/internvl2/internvl2_convert_to_mm_ckpt.py
+++ b/examples/internvl2/internvl2_convert_to_mm_ckpt.py
@@ -13,8 +13,10 @@ llm_arch = ''
 
 def load_from_hf(load_dir, trust_remote_code):
     # Load Huggingface model.
-    hf_model = AutoModelForCausalLM.from_pretrained(load_dir, device_map='cpu', trust_remote_code=trust_remote_code,
-                                                    local_files_only=True)
+    hf_model = AutoModelForCausalLM.from_pretrained(
+        load_dir, device_map='cpu',
+        trust_remote_code=trust_remote_code,
+        local_files_only=True)
     print(hf_model)
     config = AutoConfig.from_pretrained(load_dir, trust_remote_code=trust_remote_code)
     global llm_arch
diff --git a/examples/llava1.5/vicuna_converter.py b/examples/llava1.5/vicuna_converter.py
index ab95b13e..04727556 100644
--- a/examples/llava1.5/vicuna_converter.py
+++ b/examples/llava1.5/vicuna_converter.py
@@ -7,11 +7,13 @@ from transformers import AutoModelForCausalLM, AutoConfig
 
 def load_from_hf(load_dir, trust_remote_code):
     # Load Huggingface model.
-    hf_model = AutoModelForCausalLM.from_pretrained(load_dir, device_map='cpu', trust_remote_code=trust_remote_code,
-                                                    torch_dtype=torch.bfloat16, local_files_only=True)
+    hf_model = AutoModelForCausalLM.from_pretrained(
+        load_dir, device_map='cpu',
+        trust_remote_code=trust_remote_code,
+        torch_dtype=torch.bfloat16, local_files_only=True)
     print(hf_model)
     config = AutoConfig.from_pretrained(load_dir, trust_remote_code=trust_remote_code)
-    
+
     return hf_model, config
 
 
@@ -26,7 +28,7 @@ def merge_qkv(wq, wk, wv, ng=32):
         qkv[j * d : j * d + dq, :] = wq[j * dq : (j + 1) * dq, :]
         qkv[j * d + dq : j * d + dq + dkv, :] = wk[j * dkv : (j + 1) * dkv, :]
         qkv[j * d + dq + dkv : j * d + dq + dkv * 2, :] = wv[j * dkv : (j + 1) * dkv, :]
-    
+
     return qkv
 
 
diff --git a/mindspeed b/mindspeed
new file mode 120000
index 00000000..22f84c16
--- /dev/null
+++ b/mindspeed
@@ -0,0 +1 @@
+../MindSpeed-3f09d67/mindspeed/
\ No newline at end of file
diff --git a/mindspeed_mm/arguments.py b/mindspeed_mm/arguments.py
index 51d17339..8a280412 100644
--- a/mindspeed_mm/arguments.py
+++ b/mindspeed_mm/arguments.py
@@ -34,6 +34,7 @@ def process_args(parser):
     parser = _add_network_size_args(parser)
     parser = _add_dummy_optimizer_args(parser)
     parser = _add_logging_args(parser)
+    parser = _add_security_args(parser)
     return parser
 
 
@@ -115,11 +116,22 @@ def _add_dummy_optimizer_args(parser):
 
 
 def _add_logging_args(parser):
-    group = parser.add_argument_group(title='mm_logging')
+    group = parser.add_argument_group(title='logging')
 
     group.add_argument('--log-tps',
                        action='store_true',
                        default=False,
                        help='calculate and log average tokens per sample')
-    
-    return parser
\ No newline at end of file
+
+    return parser
+
+
+def _add_security_args(parser):
+    group = parser.add_argument_group(title='security configuration')
+
+    group.add_argument('--trust-remote-files',
+                       action='store_true',
+                       default=False,
+                       help='Whether or not to allow for custom models defined on the Hub in their own modeling files.')
+
+    return parser
diff --git a/mindspeed_mm/models/text_encoder/text_encoder.py b/mindspeed_mm/models/text_encoder/text_encoder.py
index b6c150de..10d26bd5 100644
--- a/mindspeed_mm/models/text_encoder/text_encoder.py
+++ b/mindspeed_mm/models/text_encoder/text_encoder.py
@@ -25,18 +25,18 @@ class TextEncoder(nn.Module):
                 "backend": type-str, "hf" or "om",
                 "model_id": type-str, "AutoModel" or other automodel name,
                 "dtype": type-str, dtype of text encoder
-                
+
                 (2) args for automodel.from_pretrained() of transformers or openmind
                 "pretrained_model_name_or_path": type-str, local path or hub path,
                 "local_files_only": type-bool,
                 ...
             }
-        - If `config` is a list of dictionaries, each dictionary in the list will be used to instantiate a separate Text Encoder Model instance, 
+        - If `config` is a list of dictionaries, each dictionary in the list will be used to instantiate a separate Text Encoder Model instance,
             effectively allowing the creation of multiple Text Encoder based on different configurations.
     """
     def __init__(self, config):
         super().__init__()
-        
+
         if isinstance(config, list) or isinstance(config, tuple):
             self.text_encoders = nn.ModuleList()
             for config_i in config:
@@ -59,7 +59,7 @@ class TextEncoder(nn.Module):
         else:
             outputs = self._single_encode(self.text_encoders, input_ids, mask)
         return outputs
-    
+
     def _single_encode(self, text_encoder, input_ids, attention_mask, **kwargs):
         *BN, L = input_ids.shape
         input_ids = input_ids.to(text_encoder.device).view(-1, L)
@@ -90,16 +90,16 @@ class TextEncoder(nn.Module):
                 )
                 * emb
             )
-        
+
         if text_encoder.output_key in ["last_hidden_state", "hidden_states"]:
             emb = emb.view(*BN, L, -1)
         elif text_encoder.output_key in ["pooler_output"]:
             emb = emb.view(*BN, -1)
         else:
             raise NotImplementedError(f"Text encoder output_key: {text_encoder.output_key} is not implenmented! ")
-        
-        return emb            
-    
+
+        return emb
+
     def _init_text_encoder(self, config):
         if not isinstance(config, dict):
             config = config.to_dict()
@@ -116,7 +116,7 @@ class TextEncoder(nn.Module):
             self.automodel_name = TEXT_ENCODER_MAPPING[model_id]
         config["pretrained_model_name_or_path"] = config.pop("from_pretrained")
         config["torch_dtype"] = get_dtype(config.pop("dtype"))
-
+        config["local_files_only"] = True
         # Only huggingface backend is supported, OpenMind backend will be supported soon.
         module = importlib.import_module("transformers")
         automodel = getattr(module, self.automodel_name)
diff --git a/mindspeed_mm/models/text_encoder/tokenizer.py b/mindspeed_mm/models/text_encoder/tokenizer.py
index 3a875419..6600ed6c 100644
--- a/mindspeed_mm/models/text_encoder/tokenizer.py
+++ b/mindspeed_mm/models/text_encoder/tokenizer.py
@@ -20,7 +20,7 @@ class Tokenizer:
                 "local_files_only": type-bool,
                 ...
             }
-        - If `config` is a list of dictionaries, each dictionary in the list will be used to instantiate a separate Tokenizer instance, 
+        - If `config` is a list of dictionaries, each dictionary in the list will be used to instantiate a separate Tokenizer instance,
             effectively allowing the creation of multiple tokenizers based on different configurations.
     """
 
@@ -32,11 +32,11 @@ class Tokenizer:
                 tokenizer_i = self._init_tokenizer(module, config_i)
                 self.tokenizers.append(tokenizer_i)
         else:
-            self.tokenizers = self._init_tokenizer(module, config)           
+            self.tokenizers = self._init_tokenizer(module, config)
 
     def get_tokenizer(self):
         return self.tokenizers
-    
+
     def _init_tokenizer(self, module, config):
         if not isinstance(config, dict):
             config = config.to_dict()
diff --git a/tests/ut/models/text_encoder/test_text_encoder_processor.py b/tests/ut/models/text_encoder/test_text_encoder_processor.py
index fd194215..2ae206c6 100644
--- a/tests/ut/models/text_encoder/test_text_encoder_processor.py
+++ b/tests/ut/models/text_encoder/test_text_encoder_processor.py
@@ -26,7 +26,7 @@ class TestTextEncoder:
         text_encoder_dict = {
                 "hub_backend": "hf",
                 "model_id": "T5",
-                "dtype": "bf16", 
+                "dtype": "bf16",
                 "from_pretrained": T5_MODEL_PATH,
         }
         tokenizer_dict = {
-- 
Gitee


From 3f86bc843b3b4002e42986a4b32aa6df1213b4b1 Mon Sep 17 00:00:00 2001
From: htwang <onehaitao@foxmail.com>
Date: Tue, 25 Feb 2025 10:57:25 +0800
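Subject: [PATCH 2/4] Control trust_remote_code via the --trust-remote-code flag

Drop the per-file "trust_remote_code" entries from the InternVL2 and
InternVL2.5 JSON configs and pass --trust-remote-code in the matching
finetune, inference and evaluation launch scripts instead.

evaluate_vlm.py now wraps mm_extra_args_provider with
extra_args_provider_decorator from mindspeed_mm.arguments when calling
initialize_megatron().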
---
 evaluate_vlm.py                                         | 3 ++-
 examples/internvl2.5/data_4B.json                       | 1 -
 examples/internvl2.5/data_78B.json                      | 1 -
 examples/internvl2.5/finetune_internvl2.5_4B.sh         | 3 ++-
 examples/internvl2.5/finetune_internvl2.5_78B.sh        | 3 ++-
 examples/internvl2.5/inference_4B.json                  | 3 +--
 examples/internvl2.5/inference_internvl.sh              | 1 +
 examples/internvl2/data_26B.json                        | 3 +--
 examples/internvl2/data_2B.json                         | 1 -
 examples/internvl2/data_76B.json                        | 1 -
 examples/internvl2/data_8B.json                         | 1 -
 examples/internvl2/evaluate_internvl2_8B.json           | 1 -
 examples/internvl2/evaluate_internvl2_8B.sh             | 1 +
 examples/internvl2/finetune_internvl2_26B.sh            | 1 +
 examples/internvl2/finetune_internvl2_2B.sh             | 1 +
 examples/internvl2/finetune_internvl2_8B.sh             | 1 +
 examples/internvl2/finetune_internvl2_8B_vpp.sh         | 1 +
 examples/internvl2/inference_2B.json                    | 3 +--
 examples/internvl2/inference_8B.json                    | 3 +--
 examples/internvl2/inference_internvl.sh                | 1 +
 inference_qihoo.py                                      | 3 ++-
 inference_vlm.py                                        | 3 ++-
 mindspeed_mm/arguments.py                               | 6 ++++--
 mindspeed_mm/data/datasets/audio_dataset.py             | 5 +++--
 mindspeed_mm/models/text_encoder/text_encoder.py        | 2 ++
 mindspeed_mm/models/text_encoder/tokenizer.py           | 3 +++
 tests/st/run_configs/finetune_internvl2_8B/data_8B.json | 1 -
 tests/st/shell_scripts/finetune_internvl2_8B.sh         | 1 +
 28 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/evaluate_vlm.py b/evaluate_vlm.py
index 0d412758..38fd0d74 100644
--- a/evaluate_vlm.py
+++ b/evaluate_vlm.py
@@ -3,13 +3,14 @@ from megatron.training import get_args
 from megatron.training.initialize import initialize_megatron
 from mindspeed_mm.configs.config import merge_mm_args
 from mindspeed_mm.configs.config import mm_extra_args_provider
+from mindspeed_mm.arguments import extra_args_provider_decorator
 from mindspeed_mm.tasks.evaluation.eval_datasets import eval_dataset_dict
 from mindspeed_mm.tasks.evaluation.eval_impl import eval_impl_dict, eval_pipeline_dict
 from mindspeed_mm.tasks.evaluation.eval_prompt import eval_model_prompt_dict
 
 
 def main():
-    initialize_megatron(extra_args_provider=mm_extra_args_provider)
+    initialize_megatron(extra_args_provider=extra_args_provider_decorator(mm_extra_args_provider))
     args = get_args()
     merge_mm_args(args)
     args = args.mm.model
diff --git a/examples/internvl2.5/data_4B.json b/examples/internvl2.5/data_4B.json
index f7b0e8e7..a134628c 100644
--- a/examples/internvl2.5/data_4B.json
+++ b/examples/internvl2.5/data_4B.json
@@ -24,7 +24,6 @@
             "from_pretrained": "OpenGVLab/InternVL2_5-4B",
             "model_max_length": 4096,
             "add_eos_token": false,
-            "trust_remote_code": true,
             "use_fast": false
         },
         "use_text_processer": true,
diff --git a/examples/internvl2.5/data_78B.json b/examples/internvl2.5/data_78B.json
index bd69b111..4cd0145d 100644
--- a/examples/internvl2.5/data_78B.json
+++ b/examples/internvl2.5/data_78B.json
@@ -24,7 +24,6 @@
             "from_pretrained": "OpenGVLab/InternVL2_5-78B",
             "model_max_length": 4096,
             "add_eos_token": false,
-            "trust_remote_code": true,
             "use_fast": false
         },
         "use_text_processer": true,
diff --git a/examples/internvl2.5/finetune_internvl2.5_4B.sh b/examples/internvl2.5/finetune_internvl2.5_4B.sh
index 2256b696..aa4abd96 100644
--- a/examples/internvl2.5/finetune_internvl2.5_4B.sh
+++ b/examples/internvl2.5/finetune_internvl2.5_4B.sh
@@ -79,7 +79,8 @@ GPT_ARGS="
     --load $LOAD_PATH \
     --variable-seq-lengths \
     --normalization RMSNorm \
-    --num-workers 4
+    --num-workers 4 \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
diff --git a/examples/internvl2.5/finetune_internvl2.5_78B.sh b/examples/internvl2.5/finetune_internvl2.5_78B.sh
index 2453d4c7..d4283d05 100644
--- a/examples/internvl2.5/finetune_internvl2.5_78B.sh
+++ b/examples/internvl2.5/finetune_internvl2.5_78B.sh
@@ -88,7 +88,8 @@ GPT_ARGS="
     --load $LOAD_PATH \
     --variable-seq-lengths \
     --normalization RMSNorm \
-    --num-workers 4
+    --num-workers 4 \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
diff --git a/examples/internvl2.5/inference_4B.json b/examples/internvl2.5/inference_4B.json
index 9d4988bc..e0f07a0a 100644
--- a/examples/internvl2.5/inference_4B.json
+++ b/examples/internvl2.5/inference_4B.json
@@ -1,4 +1,4 @@
-{   
+{
     "infer_data_type": "image",
     "file_path": "./examples/internvl2.5/view.jpg",
     "prompts": "Please describe the image shortly.",
@@ -102,7 +102,6 @@
         "from_pretrained": "OpenGVLab/InternVL2_5-4B",
         "model_max_length": 4096,
         "add_eos_token": false,
-        "trust_remote_code": true,
         "use_fast": false
     },
     "generation_config":{
diff --git a/examples/internvl2.5/inference_internvl.sh b/examples/internvl2.5/inference_internvl.sh
index 45fa37c3..0240ddf4 100644
--- a/examples/internvl2.5/inference_internvl.sh
+++ b/examples/internvl2.5/inference_internvl.sh
@@ -53,6 +53,7 @@ GPT_ARGS="
     --no-masked-softmax-fusion \
     --use-distributed-optimizer \
     --bf16 \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
diff --git a/examples/internvl2/data_26B.json b/examples/internvl2/data_26B.json
index 2f450d76..1c2f2a59 100644
--- a/examples/internvl2/data_26B.json
+++ b/examples/internvl2/data_26B.json
@@ -14,7 +14,7 @@
                     {"trans_type": "Resize", "param": {"size": [448, 448], "interpolation": "BICUBIC"}},
                     {"trans_type": "ToTensor"},
                     {"trans_type": "norm_fun", "param": {"mean":[0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225]}}
-                ]   
+                ]
             }
         },
         "tokenizer_config": {
@@ -23,7 +23,6 @@
             "from_pretrained": "OpenGVLab/InternVL2-26B",
             "model_max_length": 4096,
             "add_eos_token": false,
-            "trust_remote_code": true,
             "use_fast": false
         },
         "use_text_processer": true,
diff --git a/examples/internvl2/data_2B.json b/examples/internvl2/data_2B.json
index 25a649cc..7f6cd0b1 100644
--- a/examples/internvl2/data_2B.json
+++ b/examples/internvl2/data_2B.json
@@ -24,7 +24,6 @@
             "from_pretrained": "OpenGVLab/InternVL2-2B",
             "model_max_length": 4096,
             "add_eos_token": false,
-            "trust_remote_code": true,
             "use_fast": false
         },
         "use_text_processer": true,
diff --git a/examples/internvl2/data_76B.json b/examples/internvl2/data_76B.json
index aa9c6014..21476838 100644
--- a/examples/internvl2/data_76B.json
+++ b/examples/internvl2/data_76B.json
@@ -23,7 +23,6 @@
             "from_pretrained": "OpenGVLab/InternVL2-Llama3-76B",
             "model_max_length": 4096,
             "add_eos_token": false,
-            "trust_remote_code": false,
             "use_fast": false
         },
         "use_text_processer": true,
diff --git a/examples/internvl2/data_8B.json b/examples/internvl2/data_8B.json
index 94898a66..e5bd9f0b 100644
--- a/examples/internvl2/data_8B.json
+++ b/examples/internvl2/data_8B.json
@@ -23,7 +23,6 @@
             "from_pretrained": "OpenGVLab/InternVL2-8B",
             "model_max_length": 4096,
             "add_eos_token": false,
-            "trust_remote_code": true,
             "use_fast": false
         },
         "use_text_processer": true,
diff --git a/examples/internvl2/evaluate_internvl2_8B.json b/examples/internvl2/evaluate_internvl2_8B.json
index fcab66f3..9002918e 100644
--- a/examples/internvl2/evaluate_internvl2_8B.json
+++ b/examples/internvl2/evaluate_internvl2_8B.json
@@ -104,7 +104,6 @@
         "from_pretrained": "./InternVL2-8B",
         "model_max_length": 4096,
         "add_eos_token": false,
-        "trust_remote_code": true,
         "use_fast": false
     },
     "generation_config":{
diff --git a/examples/internvl2/evaluate_internvl2_8B.sh b/examples/internvl2/evaluate_internvl2_8B.sh
index 659bfaa9..cd6de99a 100644
--- a/examples/internvl2/evaluate_internvl2_8B.sh
+++ b/examples/internvl2/evaluate_internvl2_8B.sh
@@ -76,6 +76,7 @@ GPT_ARGS="
     --bf16 \
     --distributed-timeout-minutes 1000 \
     --use-flash-attn \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
diff --git a/examples/internvl2/finetune_internvl2_26B.sh b/examples/internvl2/finetune_internvl2_26B.sh
index 0a0e7a56..bee97ab8 100644
--- a/examples/internvl2/finetune_internvl2_26B.sh
+++ b/examples/internvl2/finetune_internvl2_26B.sh
@@ -82,6 +82,7 @@ GPT_ARGS="
     --no-save-rng \
     --num-workers 4 \
     --enable-dummy-optimizer \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
diff --git a/examples/internvl2/finetune_internvl2_2B.sh b/examples/internvl2/finetune_internvl2_2B.sh
index 09d068ad..6bed8ad0 100644
--- a/examples/internvl2/finetune_internvl2_2B.sh
+++ b/examples/internvl2/finetune_internvl2_2B.sh
@@ -81,6 +81,7 @@ GPT_ARGS="
     --no-save-optim \
     --no-save-rng \
     --num-workers 4 \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
diff --git a/examples/internvl2/finetune_internvl2_8B.sh b/examples/internvl2/finetune_internvl2_8B.sh
index c1fc0229..57d10a1b 100644
--- a/examples/internvl2/finetune_internvl2_8B.sh
+++ b/examples/internvl2/finetune_internvl2_8B.sh
@@ -81,6 +81,7 @@ GPT_ARGS="
     --no-save-optim \
     --no-save-rng \
     --num-workers 4 \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
diff --git a/examples/internvl2/finetune_internvl2_8B_vpp.sh b/examples/internvl2/finetune_internvl2_8B_vpp.sh
index c84611b5..2091707a 100644
--- a/examples/internvl2/finetune_internvl2_8B_vpp.sh
+++ b/examples/internvl2/finetune_internvl2_8B_vpp.sh
@@ -82,6 +82,7 @@ GPT_ARGS="
     --no-save-optim \
     --no-save-rng \
     --num-workers 4 \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
diff --git a/examples/internvl2/inference_2B.json b/examples/internvl2/inference_2B.json
index 67ecf8fb..0dd767ac 100644
--- a/examples/internvl2/inference_2B.json
+++ b/examples/internvl2/inference_2B.json
@@ -1,4 +1,4 @@
-{   
+{
     "infer_data_type": "image",
     "file_path": "./examples/internvl2/view.jpg",
     "prompts": "Please describe the image shortly.",
@@ -102,7 +102,6 @@
         "from_pretrained": "OpenGVLab/InternVL2-2B",
         "model_max_length": 4096,
         "add_eos_token": false,
-        "trust_remote_code": true,
         "use_fast": false
     },
     "generation_config":{
diff --git a/examples/internvl2/inference_8B.json b/examples/internvl2/inference_8B.json
index e79d6836..3f32d0e2 100644
--- a/examples/internvl2/inference_8B.json
+++ b/examples/internvl2/inference_8B.json
@@ -1,4 +1,4 @@
-{   
+{
     "infer_data_type": "image",
     "file_path": "./examples/internvl2/view.jpg",
     "prompts": "Please describe the image shortly.",
@@ -102,7 +102,6 @@
         "from_pretrained": "OpenGVLab/InternVL2-8B",
         "model_max_length": 4096,
         "add_eos_token": false,
-        "trust_remote_code": true,
         "use_fast": false
     },
     "generation_config":{
diff --git a/examples/internvl2/inference_internvl.sh b/examples/internvl2/inference_internvl.sh
index 4d945790..f7c0c11a 100644
--- a/examples/internvl2/inference_internvl.sh
+++ b/examples/internvl2/inference_internvl.sh
@@ -54,6 +54,7 @@ GPT_ARGS="
     --use-distributed-optimizer \
     --bf16 \
     --use-flash-attn \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
diff --git a/inference_qihoo.py b/inference_qihoo.py
index cf1ab3c5..a331df3f 100644
--- a/inference_qihoo.py
+++ b/inference_qihoo.py
@@ -6,6 +6,7 @@ from megatron.training.initialize import initialize_megatron
 from megatron.training import get_args
 
 from mindspeed_mm.configs.config import merge_mm_args, mm_extra_args_provider
+from mindspeed_mm.arguments import extra_args_provider_decorator
 from mindspeed_mm.tasks.inference.pipeline import sora_pipeline_dict
 from mindspeed_mm.tasks.inference.pipeline.utils.sora_utils import save_videos, load_prompts, save_image_or_videos
 from mindspeed_mm.models.predictor import PredictModel
@@ -37,7 +38,7 @@ def prepare_pipeline(args, device):
 
 
 def main():
-    initialize_megatron(extra_args_provider=mm_extra_args_provider, args_defaults={})
+    initialize_megatron(extra_args_provider=extra_args_provider_decorator(mm_extra_args_provider), args_defaults={})
     args = get_args()
     merge_mm_args(args)
     args = args.mm.model
diff --git a/inference_vlm.py b/inference_vlm.py
index d82b97df..245609c6 100644
--- a/inference_vlm.py
+++ b/inference_vlm.py
@@ -4,6 +4,7 @@ import mindspeed.megatron_adaptor
 from megatron.training import get_args
 from mindspeed_mm.tasks.inference.pipeline import vlm_pipeline_dict
 from mindspeed_mm.configs.config import mm_extra_args_provider
+from mindspeed_mm.arguments import extra_args_provider_decorator
 
 
 def main():
@@ -14,7 +15,7 @@ def main():
     torch.set_grad_enabled(False)
 
     initialize_megatron(
-        extra_args_provider=mm_extra_args_provider, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}
+        extra_args_provider=extra_args_provider_decorator(mm_extra_args_provider), args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}
     )
     args = get_args()
     merge_mm_args(args)
diff --git a/mindspeed_mm/arguments.py b/mindspeed_mm/arguments.py
index 8a280412..47fa1b24 100644
--- a/mindspeed_mm/arguments.py
+++ b/mindspeed_mm/arguments.py
@@ -76,7 +76,9 @@ def _add_training_args(parser):
                        action='store_true',
                        default=False,
                        help='Use internal format to train')
-    group.add_argument('--virtual-pipeline-model-parallel-size', type=int, default=None,
+    group.add_argument('--virtual-pipeline-model-parallel-size',
+                       type=int,
+                       default=None,
                        help='vpp size')
     group.add_argument('--encoder-dp-balance',
                        action='store_true',
@@ -129,7 +131,7 @@ def _add_logging_args(parser):
 def _add_security_args(parser):
     group = parser.add_argument_group(title='security configuration')
 
-    group.add_argument('--trust-remote-files',
+    group.add_argument('--trust-remote-code',
                        action='store_true',
                        default=False,
                        help='Whether or not to allow for custom models defined on the Hub in their own modeling files.')
diff --git a/mindspeed_mm/data/datasets/audio_dataset.py b/mindspeed_mm/data/datasets/audio_dataset.py
index c53bf8a0..d9265b2a 100644
--- a/mindspeed_mm/data/datasets/audio_dataset.py
+++ b/mindspeed_mm/data/datasets/audio_dataset.py
@@ -14,6 +14,7 @@
 from datasets import Audio, load_dataset
 from torch.utils.data import Dataset
 from transformers import WhisperProcessor
+from megatron.training import get_args
 
 
 class AudioDataset(Dataset):
@@ -40,7 +41,7 @@ class AudioDataset(Dataset):
             dataset_name_or_path,
             language,
             split="train+validation",
-            trust_remote_code=True,
+            trust_remote_code=get_args().trust_remote_code,
         )
         train_dataset = train_dataset.remove_columns(
             [
@@ -59,7 +60,7 @@ class AudioDataset(Dataset):
         processor = WhisperProcessor.from_pretrained(
             processor_name_or_path,
             language=processor_language,
-            task=task, 
+            task=task,
             local_files_only=True,
         )
         feature_extractor = processor.feature_extractor
diff --git a/mindspeed_mm/models/text_encoder/text_encoder.py b/mindspeed_mm/models/text_encoder/text_encoder.py
index 10d26bd5..505a8edd 100644
--- a/mindspeed_mm/models/text_encoder/text_encoder.py
+++ b/mindspeed_mm/models/text_encoder/text_encoder.py
@@ -2,6 +2,7 @@ import importlib
 import torch
 import torch.nn as nn
 from mindspeed_mm.utils.utils import get_dtype
+from megatron.training import get_args
 
 
 TEXT_ENCODER_MAPPING = {
@@ -116,6 +117,7 @@ class TextEncoder(nn.Module):
             self.automodel_name = TEXT_ENCODER_MAPPING[model_id]
         config["pretrained_model_name_or_path"] = config.pop("from_pretrained")
         config["torch_dtype"] = get_dtype(config.pop("dtype"))
+        config["trust_remote_code"] = get_args().trust_remote_code
         config["local_files_only"] = True
         # Only huggingface backend is supported, OpenMind backend will be supported soon.
         module = importlib.import_module("transformers")
diff --git a/mindspeed_mm/models/text_encoder/tokenizer.py b/mindspeed_mm/models/text_encoder/tokenizer.py
index 6600ed6c..7f72c9e7 100644
--- a/mindspeed_mm/models/text_encoder/tokenizer.py
+++ b/mindspeed_mm/models/text_encoder/tokenizer.py
@@ -1,5 +1,6 @@
 import importlib
 from torch import nn
+from megatron.training import get_args
 
 
 class Tokenizer:
@@ -45,5 +46,7 @@ class Tokenizer:
         self.backend = config.pop("hub_backend")
         tokenizer_name = config.pop("autotokenizer_name")
         config["pretrained_model_name_or_path"] = config.pop("from_pretrained")
+        config["trust_remote_code"] = get_args().trust_remote_code
+        config["local_files_only"] = True
         tokenizer_cls = getattr(module, tokenizer_name)
         return tokenizer_cls.from_pretrained(**config)
\ No newline at end of file
diff --git a/tests/st/run_configs/finetune_internvl2_8B/data_8B.json b/tests/st/run_configs/finetune_internvl2_8B/data_8B.json
index 8cc38468..bf61000c 100644
--- a/tests/st/run_configs/finetune_internvl2_8B/data_8B.json
+++ b/tests/st/run_configs/finetune_internvl2_8B/data_8B.json
@@ -23,7 +23,6 @@
             "from_pretrained": "/home/ci_resource/models/InternVL2-8B/pretrained/raw_ckpt/InternVL2-8B",
             "model_max_length": 4096,
             "add_eos_token": false,
-            "trust_remote_code": true,
             "use_fast": false
         },
         "use_text_processer": true,
diff --git a/tests/st/shell_scripts/finetune_internvl2_8B.sh b/tests/st/shell_scripts/finetune_internvl2_8B.sh
index 16fb69b3..e006622a 100644
--- a/tests/st/shell_scripts/finetune_internvl2_8B.sh
+++ b/tests/st/shell_scripts/finetune_internvl2_8B.sh
@@ -84,6 +84,7 @@ GPT_ARGS="
     --normalization RMSNorm \
     --use-fused-rmsnorm \
     --num-workers 4 \
+    --trust-remote-code \
 "
 
 OUTPUT_ARGS="
-- 
Gitee


From 59ecf3033371e5b77492e5c8d72948124fe27767 Mon Sep 17 00:00:00 2001
From: htwang <onehaitao@foxmail.com>
Date: Tue, 25 Feb 2025 11:21:16 +0800
Subject: [PATCH 3/4] update: enforce local_files_only and drop config overrides

---
 .gitignore                                                   | 4 ++--
 checkpoint/utils.py                                          | 2 +-
 examples/internvl2.5/internvl2.5_convert_to_mm_ckpt.py       | 3 ++-
 examples/internvl2/internvl2_convert_to_mm_ckpt.py           | 3 ++-
 examples/llava1.5/evaluate_llava1_5.json                     | 1 -
 examples/llava1.5/inference_llava.json                       | 3 +--
 examples/llava1.5/vicuna_converter.py                        | 3 ++-
 examples/opensora1.0/inference_model_120x256x256.json        | 2 --
 examples/opensora1.0/inference_model_16x512x512.json         | 2 --
 examples/opensora1.2/inference_model_102x720x1280.json       | 2 --
 examples/opensoraplan1.2/inference_model_29x480x640.json     | 2 --
 examples/opensoraplan1.3/i2v/inference_i2v_model.json        | 5 ++---
 examples/opensoraplan1.3/t2v/inference_t2v_model.json        | 5 ++---
 examples/qihoo_t2x/inference_model_image.json                | 4 +---
 examples/qwen2vl/evaluate_qwen2vl_7b.json                    | 3 +--
 examples/qwen2vl/inference_qwen2vl_2b.json                   | 3 +--
 examples/qwen2vl/inference_qwen2vl_72b.json                  | 3 +--
 examples/qwen2vl/inference_qwen2vl_7b.json                   | 3 +--
 examples/whisper/pretrain_whisper.sh                         | 1 +
 mindspeed                                                    | 1 -
 mindspeed_mm/models/text_encoder/text_encoder.py             | 3 ++-
 mindspeed_mm/models/text_encoder/tokenizer.py                | 1 -
 .../inference_qwen2vl_7B_pp1/inference_qwen2vl_7b.json       | 3 +--
 .../inference_qwen2vl_7B_pp4/inference_qwen2vl_7b.json       | 3 +--
 24 files changed, 24 insertions(+), 41 deletions(-)
 delete mode 120000 mindspeed

diff --git a/.gitignore b/.gitignore
index 7aa4dbb4..1977d146 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,9 +151,9 @@ cython_debug/
 /ci/kernel*/
 
 # mindspeed core
-/mindspeed/
+mindspeed
 
-# test 
+# test
 /tests/st/run_jsons/
 /tests/st/run_logs/
 
diff --git a/checkpoint/utils.py b/checkpoint/utils.py
index b995beaa..14f20618 100644
--- a/checkpoint/utils.py
+++ b/checkpoint/utils.py
@@ -91,7 +91,7 @@ class HfConfig(BaseModel):
 
     @cached_property
     def config(self) -> PretrainedConfig:
-        return AutoConfig.from_pretrained(self.hf_dir)
+        return AutoConfig.from_pretrained(self.hf_dir, local_files_only=True)
 
     @model_validator(mode='after')
     def validate_hf_dir(self) -> "HfConfig":
diff --git a/examples/internvl2.5/internvl2.5_convert_to_mm_ckpt.py b/examples/internvl2.5/internvl2.5_convert_to_mm_ckpt.py
index 31b5113d..d6f12037 100644
--- a/examples/internvl2.5/internvl2.5_convert_to_mm_ckpt.py
+++ b/examples/internvl2.5/internvl2.5_convert_to_mm_ckpt.py
@@ -83,7 +83,8 @@ def load_from_hf(load_dir, trust_remote_code):
         trust_remote_code=trust_remote_code,
         local_files_only=True)
     print(hf_model)
-    config = AutoConfig.from_pretrained(load_dir, trust_remote_code=trust_remote_code)
+    config = AutoConfig.from_pretrained(
+        load_dir, trust_remote_code=trust_remote_code, local_files_only=True)
     global llm_arch
     llm_arch = config.llm_config.architectures[0]
     return hf_model, config
diff --git a/examples/internvl2/internvl2_convert_to_mm_ckpt.py b/examples/internvl2/internvl2_convert_to_mm_ckpt.py
index b2346ab2..cbc2b920 100644
--- a/examples/internvl2/internvl2_convert_to_mm_ckpt.py
+++ b/examples/internvl2/internvl2_convert_to_mm_ckpt.py
@@ -18,7 +18,8 @@ def load_from_hf(load_dir, trust_remote_code):
         trust_remote_code=trust_remote_code,
         local_files_only=True)
     print(hf_model)
-    config = AutoConfig.from_pretrained(load_dir, trust_remote_code=trust_remote_code)
+    config = AutoConfig.from_pretrained(
+        load_dir, trust_remote_code=trust_remote_code, local_files_only=True)
     global llm_arch
     llm_arch = config.llm_config.architectures[0]
     return hf_model
diff --git a/examples/llava1.5/evaluate_llava1_5.json b/examples/llava1.5/evaluate_llava1_5.json
index deba5834..a93a82c1 100644
--- a/examples/llava1.5/evaluate_llava1_5.json
+++ b/examples/llava1.5/evaluate_llava1_5.json
@@ -87,7 +87,6 @@
     "hub_backend": "hf",
     "autotokenizer_name": "AutoTokenizer",
     "from_pretrained": "./llava_7b",
-    "local_files_only": false,
     "use_fast": false
   },
   "generation_config": {
diff --git a/examples/llava1.5/inference_llava.json b/examples/llava1.5/inference_llava.json
index 38ba3b3c..c18e068c 100644
--- a/examples/llava1.5/inference_llava.json
+++ b/examples/llava1.5/inference_llava.json
@@ -89,8 +89,7 @@
   "tokenizer":{
       "hub_backend": "hf",
       "autotokenizer_name": "AutoTokenizer",
-      "from_pretrained": "llava_weights/vicuna-7b-v1.5",
-      "local_files_only": false
+      "from_pretrained": "llava_weights/vicuna-7b-v1.5"
   },
   "generation_config":{
       "bos_token_id": 1,
diff --git a/examples/llava1.5/vicuna_converter.py b/examples/llava1.5/vicuna_converter.py
index 04727556..3e464140 100644
--- a/examples/llava1.5/vicuna_converter.py
+++ b/examples/llava1.5/vicuna_converter.py
@@ -12,7 +12,8 @@ def load_from_hf(load_dir, trust_remote_code):
         trust_remote_code=trust_remote_code,
         torch_dtype=torch.bfloat16, local_files_only=True)
     print(hf_model)
-    config = AutoConfig.from_pretrained(load_dir, trust_remote_code=trust_remote_code)
+    config = AutoConfig.from_pretrained(
+        load_dir, trust_remote_code=trust_remote_code, local_files_only=True)
 
     return hf_model, config
 
diff --git a/examples/opensora1.0/inference_model_120x256x256.json b/examples/opensora1.0/inference_model_120x256x256.json
index 31fe16d5..e8d7ddc3 100644
--- a/examples/opensora1.0/inference_model_120x256x256.json
+++ b/examples/opensora1.0/inference_model_120x256x256.json
@@ -10,7 +10,6 @@
         "hub_backend": "hf",
         "model_id": "T5",
         "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
-        "local_files_only": false,
         "low_cpu_mem_usage": true,
         "dtype": "fp32"
     },
@@ -18,7 +17,6 @@
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
         "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
-        "local_files_only": false,
         "model_max_length": 120
     },
     "predictor": {
diff --git a/examples/opensora1.0/inference_model_16x512x512.json b/examples/opensora1.0/inference_model_16x512x512.json
index d382f266..5965d4b0 100644
--- a/examples/opensora1.0/inference_model_16x512x512.json
+++ b/examples/opensora1.0/inference_model_16x512x512.json
@@ -10,7 +10,6 @@
         "hub_backend": "hf",
         "model_id": "T5",
         "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
-        "local_files_only": false,
         "low_cpu_mem_usage": true,
         "dtype": "fp32"
     },
@@ -18,7 +17,6 @@
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
         "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
-        "local_files_only": false,
         "model_max_length": 120
     },
     "predictor": {
diff --git a/examples/opensora1.2/inference_model_102x720x1280.json b/examples/opensora1.2/inference_model_102x720x1280.json
index 928ada67..469e945a 100644
--- a/examples/opensora1.2/inference_model_102x720x1280.json
+++ b/examples/opensora1.2/inference_model_102x720x1280.json
@@ -15,7 +15,6 @@
        "hub_backend": "hf",
         "model_id": "T5",
         "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
-        "local_files_only": false,
         "low_cpu_mem_usage": true,
         "dtype": "fp32"
     },
@@ -23,7 +22,6 @@
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
         "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
-        "local_files_only": false,
         "model_max_length":300
     },
     "predictor": {
diff --git a/examples/opensoraplan1.2/inference_model_29x480x640.json b/examples/opensoraplan1.2/inference_model_29x480x640.json
index 4c5548d4..8cc08231 100644
--- a/examples/opensoraplan1.2/inference_model_29x480x640.json
+++ b/examples/opensoraplan1.2/inference_model_29x480x640.json
@@ -47,7 +47,6 @@
         "hub_backend": "hf",
         "model_id": "MT5",
         "from_pretrained": "./weights/google/mt5-xxl",
-        "local_files_only": false,
         "low_cpu_mem_usage": true,
         "dtype": "fp16"
     },
@@ -55,7 +54,6 @@
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
         "from_pretrained": "./opensoraplanv12/weights/mt5",
-        "local_files_only": false,
         "model_max_length": 512
     },
     "predictor": {
diff --git a/examples/opensoraplan1.3/i2v/inference_i2v_model.json b/examples/opensoraplan1.3/i2v/inference_i2v_model.json
index cf26436c..a2050288 100644
--- a/examples/opensoraplan1.3/i2v/inference_i2v_model.json
+++ b/examples/opensoraplan1.3/i2v/inference_i2v_model.json
@@ -1,4 +1,4 @@
-{   
+{
     "predictor": {
         "model_id": "videoditsparsei2v",
         "from_pretrained": "./weights/sparsedit/sparsediti2v_mm.pth",
@@ -49,10 +49,9 @@
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
         "from_pretrained": "./weights/google/mt5-xxl",
-        "local_files_only": false,
         "model_max_length": 512
     },
-    
+
     "diffusion": {
         "model_id": "EulerAncestralDiscrete",
         "num_inference_steps":100,
diff --git a/examples/opensoraplan1.3/t2v/inference_t2v_model.json b/examples/opensoraplan1.3/t2v/inference_t2v_model.json
index bcda965e..249dce8f 100644
--- a/examples/opensoraplan1.3/t2v/inference_t2v_model.json
+++ b/examples/opensoraplan1.3/t2v/inference_t2v_model.json
@@ -1,4 +1,4 @@
-{   
+{
     "predictor": {
         "model_id": "videoditsparse",
         "from_pretrained": "./weights/sparsedit/sparsedit_mm.pth",
@@ -49,10 +49,9 @@
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
         "from_pretrained": "./weights/google/mt5-xxl",
-        "local_files_only": false,
         "model_max_length": 512
     },
-    
+
     "diffusion": {
         "model_id": "EulerAncestralDiscrete",
         "num_inference_steps":100,
diff --git a/examples/qihoo_t2x/inference_model_image.json b/examples/qihoo_t2x/inference_model_image.json
index 9878e1ed..0494baad 100644
--- a/examples/qihoo_t2x/inference_model_image.json
+++ b/examples/qihoo_t2x/inference_model_image.json
@@ -47,15 +47,13 @@
        "hub_backend": "hf",
         "model_id": "T5",
         "from_pretrained": "./pretrain_models/text_encoder",
-        "local_files_only": false,
         "low_cpu_mem_usage": true,
         "dtype": "fp32"
     },
     "tokenizer":{
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
-        "from_pretrained": "./pretrain_models/tokenizer",
-        "local_files_only": false
+        "from_pretrained": "./pretrain_models/tokenizer"
     },
     "predictor": {
         "dtype": "bf16",
diff --git a/examples/qwen2vl/evaluate_qwen2vl_7b.json b/examples/qwen2vl/evaluate_qwen2vl_7b.json
index fba234e0..8c77e68a 100644
--- a/examples/qwen2vl/evaluate_qwen2vl_7b.json
+++ b/examples/qwen2vl/evaluate_qwen2vl_7b.json
@@ -87,8 +87,7 @@
     "tokenizer": {
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
-        "from_pretrained": "./Qwen2-VL-7B-Instruct",
-        "local_files_only":false
+        "from_pretrained": "./Qwen2-VL-7B-Instruct"
     },
     "generation_config": {
         "bos_token_id": 151643,
diff --git a/examples/qwen2vl/inference_qwen2vl_2b.json b/examples/qwen2vl/inference_qwen2vl_2b.json
index 0f9aa052..8e5fdcdd 100644
--- a/examples/qwen2vl/inference_qwen2vl_2b.json
+++ b/examples/qwen2vl/inference_qwen2vl_2b.json
@@ -85,8 +85,7 @@
     "tokenizer": {
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
-        "from_pretrained": "ckpt/hf_path/Qwen2-VL-2B-Instruct",
-        "local_files_only":false
+        "from_pretrained": "ckpt/hf_path/Qwen2-VL-2B-Instruct"
     },
     "generation_config": {
         "bos_token_id": 151643,
diff --git a/examples/qwen2vl/inference_qwen2vl_72b.json b/examples/qwen2vl/inference_qwen2vl_72b.json
index 64e23985..5a39ca49 100644
--- a/examples/qwen2vl/inference_qwen2vl_72b.json
+++ b/examples/qwen2vl/inference_qwen2vl_72b.json
@@ -84,8 +84,7 @@
     "tokenizer": {
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
-        "from_pretrained": "ckpt/hf_path/Qwen2-VL-72B-Instruct",
-        "local_files_only":false
+        "from_pretrained": "ckpt/hf_path/Qwen2-VL-72B-Instruct"
     },
     "generation_config": {
         "bos_token_id": 151643,
diff --git a/examples/qwen2vl/inference_qwen2vl_7b.json b/examples/qwen2vl/inference_qwen2vl_7b.json
index 4fc01c72..6421d771 100644
--- a/examples/qwen2vl/inference_qwen2vl_7b.json
+++ b/examples/qwen2vl/inference_qwen2vl_7b.json
@@ -85,8 +85,7 @@
     "tokenizer": {
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
-        "from_pretrained": "ckpt/hf_path/Qwen2-VL-7B-Instruct",
-        "local_files_only":false
+        "from_pretrained": "ckpt/hf_path/Qwen2-VL-7B-Instruct"
     },
     "generation_config": {
         "bos_token_id": 151643,
diff --git a/examples/whisper/pretrain_whisper.sh b/examples/whisper/pretrain_whisper.sh
index 4fa7cb12..6927fa33 100644
--- a/examples/whisper/pretrain_whisper.sh
+++ b/examples/whisper/pretrain_whisper.sh
@@ -65,6 +65,7 @@ GPT_ARGS="
     --overlap-grad-reduce \
     --overlap-param-gather \
     --num-workers 4 \
+    --trust-remote-code \
 "
 
 MM_ARGS="
diff --git a/mindspeed b/mindspeed
deleted file mode 120000
index 22f84c16..00000000
--- a/mindspeed
+++ /dev/null
@@ -1 +0,0 @@
-../MindSpeed-3f09d67/mindspeed/
\ No newline at end of file
diff --git a/mindspeed_mm/models/text_encoder/text_encoder.py b/mindspeed_mm/models/text_encoder/text_encoder.py
index 505a8edd..26aad1bf 100644
--- a/mindspeed_mm/models/text_encoder/text_encoder.py
+++ b/mindspeed_mm/models/text_encoder/text_encoder.py
@@ -1,8 +1,9 @@
 import importlib
 import torch
 import torch.nn as nn
-from mindspeed_mm.utils.utils import get_dtype
+
 from megatron.training import get_args
+from mindspeed_mm.utils.utils import get_dtype
 
 
 TEXT_ENCODER_MAPPING = {
diff --git a/mindspeed_mm/models/text_encoder/tokenizer.py b/mindspeed_mm/models/text_encoder/tokenizer.py
index 7f72c9e7..00a63682 100644
--- a/mindspeed_mm/models/text_encoder/tokenizer.py
+++ b/mindspeed_mm/models/text_encoder/tokenizer.py
@@ -1,5 +1,4 @@
 import importlib
-from torch import nn
 from megatron.training import get_args
 
 
diff --git a/tests/st/run_configs/inference_qwen2vl_7B_pp1/inference_qwen2vl_7b.json b/tests/st/run_configs/inference_qwen2vl_7B_pp1/inference_qwen2vl_7b.json
index 7df1c60b..4bf6067b 100644
--- a/tests/st/run_configs/inference_qwen2vl_7B_pp1/inference_qwen2vl_7b.json
+++ b/tests/st/run_configs/inference_qwen2vl_7B_pp1/inference_qwen2vl_7b.json
@@ -85,8 +85,7 @@
     "tokenizer": {
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
-        "from_pretrained": "/home/ci_resource/models/qwen2vl_7b/qwen2vl7b",
-        "local_files_only":false
+        "from_pretrained": "/home/ci_resource/models/qwen2vl_7b/qwen2vl7b"
     },
     "generation_config": {
         "bos_token_id": 151643,
diff --git a/tests/st/run_configs/inference_qwen2vl_7B_pp4/inference_qwen2vl_7b.json b/tests/st/run_configs/inference_qwen2vl_7B_pp4/inference_qwen2vl_7b.json
index 7df1c60b..4bf6067b 100644
--- a/tests/st/run_configs/inference_qwen2vl_7B_pp4/inference_qwen2vl_7b.json
+++ b/tests/st/run_configs/inference_qwen2vl_7B_pp4/inference_qwen2vl_7b.json
@@ -85,8 +85,7 @@
     "tokenizer": {
         "hub_backend": "hf",
         "autotokenizer_name": "AutoTokenizer",
-        "from_pretrained": "/home/ci_resource/models/qwen2vl_7b/qwen2vl7b",
-        "local_files_only":false
+        "from_pretrained": "/home/ci_resource/models/qwen2vl_7b/qwen2vl7b"
     },
     "generation_config": {
         "bos_token_id": 151643,
-- 
Gitee


From 46a4bdeed4fc6172a15d07a6d37aa11f1965d0ae Mon Sep 17 00:00:00 2001
From: htwang <onehaitao@foxmail.com>
Date: Tue, 25 Feb 2025 21:19:46 +0800
Subject: [PATCH 4/4] fix error: fall back to trust_remote_code=False if get_args fails

---
 mindspeed_mm/models/text_encoder/text_encoder.py | 8 ++++++--
 mindspeed_mm/models/text_encoder/tokenizer.py    | 7 +++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/mindspeed_mm/models/text_encoder/text_encoder.py b/mindspeed_mm/models/text_encoder/text_encoder.py
index 26aad1bf..2aad5aef 100644
--- a/mindspeed_mm/models/text_encoder/text_encoder.py
+++ b/mindspeed_mm/models/text_encoder/text_encoder.py
@@ -2,7 +2,6 @@ import importlib
 import torch
 import torch.nn as nn
 
-from megatron.training import get_args
 from mindspeed_mm.utils.utils import get_dtype
 
 
@@ -118,8 +117,13 @@ class TextEncoder(nn.Module):
             self.automodel_name = TEXT_ENCODER_MAPPING[model_id]
         config["pretrained_model_name_or_path"] = config.pop("from_pretrained")
         config["torch_dtype"] = get_dtype(config.pop("dtype"))
-        config["trust_remote_code"] = get_args().trust_remote_code
         config["local_files_only"] = True
+        try:
+            from megatron.training import get_args
+            config["trust_remote_code"] = get_args().trust_remote_code
+        except (ImportError, AssertionError):
+            config["trust_remote_code"] = False
+
         # Only huggingface backend is supported, OpenMind backend will be supported soon.
         module = importlib.import_module("transformers")
         automodel = getattr(module, self.automodel_name)
diff --git a/mindspeed_mm/models/text_encoder/tokenizer.py b/mindspeed_mm/models/text_encoder/tokenizer.py
index 00a63682..2e9ec259 100644
--- a/mindspeed_mm/models/text_encoder/tokenizer.py
+++ b/mindspeed_mm/models/text_encoder/tokenizer.py
@@ -1,5 +1,4 @@
 import importlib
-from megatron.training import get_args
 
 
 class Tokenizer:
@@ -45,7 +44,11 @@ class Tokenizer:
         self.backend = config.pop("hub_backend")
         tokenizer_name = config.pop("autotokenizer_name")
         config["pretrained_model_name_or_path"] = config.pop("from_pretrained")
-        config["trust_remote_code"] = get_args().trust_remote_code
         config["local_files_only"] = True
+        try:
+            from megatron.training import get_args
+            config["trust_remote_code"] = get_args().trust_remote_code
+        except (ImportError, AssertionError):
+            config["trust_remote_code"] = False
         tokenizer_cls = getattr(module, tokenizer_name)
         return tokenizer_cls.from_pretrained(**config)
\ No newline at end of file
-- 
Gitee