From 67cb07f6953fa57c4a12bfc83590facc7b63e8b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E5=87=AF=E5=AE=87?=
Date: Fri, 9 May 2025 16:24:44 +0800
Subject: [PATCH] [built-in][PyTorch][OpenRLHF] add OpenRLHF-v0.5.7 adaptation code

---
 .../openrlhf/cli/__init__.py                  |   5 +
 .../openrlhf/cli/train_ppo.py                 |   2 +-
 .../openrlhf/cli/train_ppo_ray.py             |   9 +-
 .../openrlhf/cli/train_sft.py                 |   6 -
 .../openrlhf/datasets/sft_dataset.py          |  57 +-
 .../openrlhf/models/model.py                  |  12 +-
 .../openrlhf/trainer/ray/ppo_actor.py         |  44 +-
 .../openrlhf/trainer/ray/vllm_engine.py       |  20 +-
 .../openrlhf/trainer/ray/vllm_worker_wrap.py  |  33 +-
 .../openrlhf/utils/vision_utils.py            | 703 ++++++++++++++++++
 .../requirements.txt                          |  10 +-
 .../OpenRLHF_v0.5.7_for_PyTorch/version.txt   |   2 +-
 12 files changed, 757 insertions(+), 146 deletions(-)
 create mode 100644 PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/utils/vision_utils.py

diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/__init__.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/__init__.py
index e69de29bb2..764afa7e68 100644
--- a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/__init__.py
+++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/__init__.py
@@ -0,0 +1,5 @@
+from transformers import is_torch_npu_available
+
+if is_torch_npu_available():
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_ppo.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_ppo.py
index 2fc03bac05..c21009b855 100644
--- a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_ppo.py
+++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_ppo.py
@@ -311,7 +311,7 @@ if __name__ == "__main__":
     parser.add_argument("--ptx_coef", type=float, default=0.05, help="PPO-ptx loss coef")
     parser.add_argument("--eps_clip", type=float, default=0.2, help="PPO clip range")
     parser.add_argument("--value_clip", type=float, default=0.2, help="PPO value clip range")
-    parser.add_argument("--lambd", type=float, default=1.0, help="PPO GAE lambd")
+    parser.add_argument("--lambd", type=float, default=0.95, help="PPO GAE lambd")
     parser.add_argument("--gamma", type=float, default=1, help="PPO GAE gamma")
     parser.add_argument("--micro_train_batch_size", type=int, default=4, help="batch size per GPU")
     parser.add_argument("--train_batch_size", type=int, default=128, help="Global training batch size")
diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_ppo_ray.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_ppo_ray.py
index 245b855314..9bbd14a5c4 100644
--- a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_ppo_ray.py
+++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_ppo_ray.py
@@ -213,7 +213,6 @@ if __name__ == "__main__":
         help="tensor parallel size of vLLM Engine for multi-GPU inference",
     )
     parser.add_argument("--vllm_sync_backend", type=str, default="nccl", help="DeepSpeed -> vLLM weight sync backend")
-    parser.add_argument("--vllm_sync_with_ray", action="store_true", default=False)
     parser.add_argument("--enable_prefix_caching", action="store_true", default=False)
     parser.add_argument("--enforce_eager", action="store_true", default=False, help="Disable CUDA graph in vLLM")

@@ -269,7 +268,7 @@ if __name__ == "__main__":
     parser.add_argument("--ptx_coef", type=float, default=0.05, help="PPO-ptx loss coef")
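# ---- editor's sketch, not part of the patch ----
# What the --lambd default change above (1.0 -> 0.95) tunes: generalized
# advantage estimation. With lambd=1.0 the advantage reduces to the full
# discounted return minus the value baseline (unbiased, high variance);
# 0.95 shortens the credit-assignment horizon for lower variance. A minimal
# standalone illustration; function name and shapes are assumptions, this is
# not OpenRLHF's actual routine.
import torch

def gae_advantages(rewards, values, gamma=1.0, lambd=0.95):
    # rewards[t], values[t]: 1-D float tensors over one trajectory; the
    # episode is assumed to terminate after the last step (V = 0 beyond it).
    advantages = torch.zeros_like(rewards)
    last_gae = 0.0
    for t in reversed(range(rewards.numel())):
        next_value = values[t + 1] if t + 1 < rewards.numel() else 0.0
        delta = rewards[t] + gamma * next_value - values[t]  # TD residual
        last_gae = delta + gamma * lambd * last_gae          # GAE recursion
        advantages[t] = last_gae
    return advantages
# ---- end editor's sketch ----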
parser.add_argument("--eps_clip", type=float, default=0.2, help="PPO clip range") parser.add_argument("--value_clip", type=float, default=0.2, help="PPO value clip range") - parser.add_argument("--lambd", type=float, default=1.0, help="PPO GAE lambd") + parser.add_argument("--lambd", type=float, default=0.95, help="PPO GAE lambd") parser.add_argument("--gamma", type=float, default=1, help="PPO GAE gamma") parser.add_argument("--micro_train_batch_size", type=int, default=4, help="batch size per GPU") parser.add_argument("--train_batch_size", type=int, default=128, help="Global training batch size") @@ -375,10 +374,8 @@ if __name__ == "__main__": args.remote_rm_url = args.remote_rm_url.split(",") if args.vllm_num_engines >= 1 and args.enable_prefix_caching: - import vllm - if vllm.__version__ < "0.7.0": - args.enable_prefix_caching = False - print("[Warning] Disable prefix cache because vLLM updates weights without updating the old KV Cache for vLLM version below 0.7.0.") + args.enable_prefix_caching = False + print("[Warning] Disable prefix cache because vLLM updates weights without updating the old KV Cache.") if args.input_template and "{}" not in args.input_template: print("[Warning] {} not in args.input_template, set to None") diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_sft.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_sft.py index 843e37adad..ad3b2af98c 100644 --- a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_sft.py +++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/cli/train_sft.py @@ -63,7 +63,6 @@ def train(args): pretrain_mode=args.pretrain_mode, input_template=args.input_template, multiple_of=args.ring_attn_size, - multiturn=args.multiturn, ) eval_dataset = SFTDataset( eval_data, @@ -73,7 +72,6 @@ def train(args): pretrain_mode=args.pretrain_mode, input_template=args.input_template, multiple_of=args.ring_attn_size, - multiturn=args.multiturn, ) # prepare dataloader @@ -207,7 +205,6 @@ if __name__ == "__main__": parser.add_argument("--dataset_probs", type=str, default="1.0", help="sampling probs for datasets") parser.add_argument("--train_split", type=str, default="train", help="train split of the HF dataset") parser.add_argument("--eval_split", type=str, default="test", help="test split of the dataset") - parser.add_argument("--multiturn", action="store_true", default=False, help="Use compacted multiturn dataset") parser.add_argument("--input_key", type=str, default="input", help="JSON dataset key") parser.add_argument("--output_key", type=str, default=None, help="JSON dataset key") @@ -235,9 +232,6 @@ if __name__ == "__main__": args = parser.parse_args() - if args.multiturn: - assert args.apply_chat_template, "apply_chat_template must be enabled when using multiturn format" - if args.input_template and "{}" not in args.input_template: print("[Warning] {} not in args.input_template, set to None") args.input_template = None diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/datasets/sft_dataset.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/datasets/sft_dataset.py index 6e031f70ab..e5e0c004e9 100644 --- a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/datasets/sft_dataset.py +++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/datasets/sft_dataset.py @@ -7,7 +7,7 @@ from torch.utils.data import Dataset from .utils import zero_pad_sequences -def preprocess_data(data, input_template=None, input_key="input", output_key=None, 
apply_chat_template=None, multiturn=False): +def preprocess_data(data, input_template=None, input_key="input", output_key=None, apply_chat_template=None): if apply_chat_template: if output_key: prompt_message = data[input_key] @@ -51,7 +51,6 @@ class SFTDataset(Dataset): pretrain_mode=False, num_processors=8, # Specify the number of processors you want to use multiple_of=1, - multiturn=False, ) -> None: super().__init__() self.tokenizer = tokenizer @@ -59,7 +58,6 @@ class SFTDataset(Dataset): self.pretrain_mode = pretrain_mode self.max_length = max_length self.multiple_of = multiple_of - self.multiturn = multiturn # chat template self.input_template = input_template @@ -75,9 +73,7 @@ class SFTDataset(Dataset): # Parallel loading datasets processed_dataset = dataset.map( - self.process_data, - remove_columns=dataset.column_names, - num_proc=num_processors, + self.process_data, remove_columns=dataset.column_names, num_proc=num_processors ) processed_dataset = processed_dataset.filter(lambda x: x["prompt"] is not None) @@ -85,51 +81,15 @@ class SFTDataset(Dataset): self.prompts = processed_dataset["prompt"] self.responses = processed_dataset["response"] self.prompt_ids_lens = processed_dataset["prompt_ids_len"] - self.response_ranges = processed_dataset["response_ranges"] if self.multiturn else None def process_data(self, data): - if self.multiturn and self.output_key: - data[self.input_key].append(data[self.output_key]) - data[self.output_key] = None - - if self.multiturn: - assert not self.output_key or not data[self.output_key], "You should put the whole trajactory into data[input_key] and do not set output_key" - input_key = self.input_key - apply_chat_template = self.apply_chat_template - response_ranges = [] - for idx, message in enumerate(data[input_key]): - if message['role'] == 'assistant': - prompt = apply_chat_template(data[input_key][: idx], tokenize=False, add_generation_prompt=True) - response = apply_chat_template(data[input_key][: idx + 1], tokenize=False)[len(prompt):] - - start_idx = self.tokenizer( - prompt, - max_length=self.max_length, - padding=False, - truncation=True, - return_tensors="pt", - add_special_tokens=False, - )["attention_mask"].int().sum().item() - - end_idx = start_idx + self.tokenizer( - response, - max_length=self.max_length, - padding=False, - truncation=True, - return_tensors="pt", - add_special_tokens=False, - )["attention_mask"].int().sum().item() - 1 - response_ranges.append((start_idx, end_idx)) # left close right open - prompt, response = preprocess_data( data, None if self.pretrain_mode else self.input_template, self.input_key, self.output_key, apply_chat_template=None if self.pretrain_mode else self.apply_chat_template, - multiturn=self.multiturn, ) - if not self.pretrain_mode: prompt_token = self.tokenizer( prompt, @@ -147,7 +107,7 @@ class SFTDataset(Dataset): else: prompt_ids_len = 0 - return {"prompt": prompt, "response": response, "prompt_ids_len": prompt_ids_len, "response_ranges": response_ranges if self.multiturn else None} + return {"prompt": prompt, "response": response, "prompt_ids_len": prompt_ids_len} def __len__(self): length = len(self.prompts) @@ -178,7 +138,7 @@ class SFTDataset(Dataset): # to avoid EOS_token truncation input_token["input_ids"][0][-1] = self.tokenizer.eos_token_id input_token["attention_mask"][0][-1] = True - info = {"input": prompt, "output": response, "input_length": input_token["attention_mask"].int().sum().item(), "response_ranges": self.response_ranges[idx] if self.multiturn else None} + info = {"input": 
prompt, "output": response, "input_length": input_token["attention_mask"].int().sum().item()} return prompt_ids_len, input_token["input_ids"], input_token["attention_mask"], info @@ -203,19 +163,14 @@ class SFTDataset(Dataset): packed_input_ids = [] packed_attention_masks = [] prompt_ids_lens = [] - infos = {"input_length": [], "response_ranges": [] if self.multiturn else None} + infos = {"input_length": []} + index = 1 for prompt_ids_len, input_id, attention_mask, info in item_list: packed_input_ids.append(input_id.flatten()) packed_attention_masks.append(torch.full_like(input_id.flatten(), index)) prompt_ids_lens.append(prompt_ids_len) infos["input_length"].append(info["input_length"]) - if self.multiturn: - if len(infos["response_ranges"]) >= 1: - for i in range(len(info["response_ranges"])): - info["response_ranges"][i][0] += infos["response_ranges"][-1][-1][1] # end_index of the last response of the last item - info["response_ranges"][i][1] += infos["response_ranges"][-1][-1][1] - infos["response_ranges"].append(info["response_ranges"]) index += 1 packed_input_ids = torch.cat(packed_input_ids, dim=0).unsqueeze(0) diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/models/model.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/models/model.py index 3d2102dc94..a7a71ae5b7 100644 --- a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/models/model.py +++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/models/model.py @@ -3,17 +3,20 @@ from typing import Optional, Union import deepspeed import torch import torch.nn as nn -from flash_attn.utils.distributed import all_gather from peft import LoraConfig, get_peft_model from peft.tuners.lora import LoraLayer from transformers import AutoConfig, AutoModel, BitsAndBytesConfig from transformers.integrations.deepspeed import HfDeepSpeedConfig +from transformers.utils import is_flash_attn_2_available from openrlhf.utils.logging_utils import init_logger from .ring_attn_utils import convert_ring_attn_params from .utils import reset_position_ids +if is_flash_attn_2_available(): + from flash_attn.utils.distributed import all_gather + logger = init_logger(__name__) @@ -68,7 +71,12 @@ def get_llm_for_sequence_regression( config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) config.normalize_reward = normalize_reward - config._attn_implementation = "flash_attention_2" if use_flash_attention_2 else "eager" + if use_flash_attention_2 == "fa2": + config._attn_implementation = "flash_attention_2" + elif use_flash_attention_2 == "sdpa": + config._attn_implementation = "sdpa" + else: + config._attn_implementation = "eager" # Prioritize using the value_head_prefix in the model configuration. 
value_head_prefix = getattr(config, "value_head_prefix", value_head_prefix) diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/ppo_actor.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/ppo_actor.py index 9661b0edb0..1d8bb59c45 100644 --- a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/ppo_actor.py +++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/ppo_actor.py @@ -82,37 +82,24 @@ class ActorPPOTrainer(PPOTrainer): world_size = vllm_num_engines * vllm_tensor_parallel_size + 1 backend = getattr(self.strategy.args, "vllm_sync_backend", "nccl") - use_ray = getattr(self.strategy.args, "vllm_sync_with_ray", False) - group_name = "openrlhf" refs = [ engine.init_process_group.remote( master_address, master_port, i * vllm_tensor_parallel_size + 1, world_size, - group_name, + "openrlhf", backend=backend, - use_ray=use_ray, ) for i, engine in enumerate(self.vllm_engines) ] - if use_ray: - import ray.util.collective as collective - collective.init_collective_group( - world_size=world_size, - rank=0, - backend=backend, - group_name=group_name - ) - self._model_update_group = group_name - else: - self._model_update_group = init_process_group( - backend=backend, - init_method=f"tcp://{master_address}:{master_port}", - world_size=world_size, - rank=0, - group_name=group_name, - ) + self._model_update_group = init_process_group( + backend=backend, + init_method=f"tcp://{master_address}:{master_port}", + world_size=world_size, + rank=0, + group_name="openrlhf", + ) ray.get(refs) @@ -149,15 +136,8 @@ class ActorPPOTrainer(PPOTrainer): return self.training_step_actor(experience) def _broadcast_to_vllm(self): - use_prefix_cache = getattr(self.strategy.args, "enable_prefix_caching", False) - cache_reset_refs = [] - if use_prefix_cache and torch.distributed.get_rank() == 0: - # clear prefix cache - for engine in self.vllm_engines: - cache_reset_refs.append(engine.reset_prefix_cache.remote()) # avoid OOM torch.cuda.empty_cache() - use_ray = getattr(self.strategy.args, "vllm_sync_with_ray", False) model = self.actor.model.module count, num_params = 0, len(list(model.named_parameters())) for name, param in model.named_parameters(): @@ -174,14 +154,8 @@ class ActorPPOTrainer(PPOTrainer): # For ZeRO-3, allgather sharded parameter and broadcast to all vllm engines by rank 0 with deepspeed.zero.GatheredParameters([param], enabled=self.strategy.args.zero_stage == 3): if torch.distributed.get_rank() == 0: - if use_ray: - import ray.util.collective as collective - collective.broadcast(param.data, 0, group_name=self._model_update_group) - else: - torch.distributed.broadcast(param.data, 0, group=self._model_update_group) + torch.distributed.broadcast(param.data, 0, group=self._model_update_group) ray.get(refs) - if cache_reset_refs: - ray.get(cache_reset_refs) torch.distributed.barrier() def _save_checkpoint(self, args, tag, client_states): diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/vllm_engine.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/vllm_engine.py index 889b034242..733c57effb 100644 --- a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/vllm_engine.py +++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/vllm_engine.py @@ -35,11 +35,7 @@ class LLMRayActor: else: # RayGPUExecutor # See the patch https://github.com/vllm-project/vllm/commit/479d69fad0538f04cb22bf13e76ff91cfeb8a4e5 - if vllm.__version__ >= 
"0.4.3": - # https://github.com/vllm-project/vllm/commit/676a99982fe9aabe72fd52a91e08988a653a7359 - kwargs["distributed_executor_backend"] = "ray" - else: - kwargs["worker_use_ray"] = True + kwargs["worker_use_ray"] = True if vllm.__version__ > "0.6.4.post1": # https://github.com/vllm-project/vllm/pull/10555 @@ -60,14 +56,14 @@ class LLMRayActor: def generate(self, *args, **kwargs): return self.llm.generate(*args, **kwargs) - def init_process_group(self, master_address, master_port, rank_offset, world_size, group_name, backend, use_ray): + def init_process_group(self, master_address, master_port, rank_offset, world_size, group_name, backend): if self.use_gpu_executor: return self.llm.llm_engine.model_executor.driver_worker.init_process_group( - master_address, master_port, rank_offset, world_size, group_name, backend, use_ray + master_address, master_port, rank_offset, world_size, group_name, backend ) else: return self.llm.llm_engine.model_executor._run_workers( - "init_process_group", master_address, master_port, rank_offset, world_size, group_name, backend, use_ray + "init_process_group", master_address, master_port, rank_offset, world_size, group_name, backend ) def update_weight(self, name, dtype, shape, empty_cache=False): @@ -78,14 +74,6 @@ class LLMRayActor: else: return self.llm.llm_engine.model_executor._run_workers("update_weight", name, dtype, shape, empty_cache) - def reset_prefix_cache(self): - import vllm - if vllm.__version__ < "0.7.0": - # https://github.com/vllm-project/vllm/commit/7206ce4ce112ed117796a59045c968a6d353f691 - logger.warning("Reset prefix cache API is available only from vLLM 0.7.0!") - return - self.llm.llm_engine.reset_prefix_cache() - def stop_remote_worker_execution_loop(self): # Fix error for using 2 communication group # https://github.com/vllm-project/vllm/commit/eb6d3c264d0cd8e44dec16bca7947fbe96415ce9#diff-e1ad69e38e033accddfa5480ec808c4740eb39244d1ef51cc3407e20dde8cfd4 diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/vllm_worker_wrap.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/vllm_worker_wrap.py index 2f324793d0..730dd12b85 100644 --- a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/vllm_worker_wrap.py +++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/trainer/ray/vllm_worker_wrap.py @@ -8,30 +8,19 @@ logger = init_logger(__name__) class WorkerWrap(Worker): - def init_process_group(self, master_address, master_port, rank_offset, world_size, group_name, backend="nccl", use_ray=False): + def init_process_group(self, master_address, master_port, rank_offset, world_size, group_name, backend="nccl"): """Init torch process group for model weights update""" assert torch.distributed.is_initialized(), f"default torch process group must be initialized" assert group_name != "", f"group name must not be empty" rank = torch.distributed.get_rank() + rank_offset - if use_ray: - import ray.util.collective as collective - collective.init_collective_group( - world_size=world_size, - rank=rank, - backend=backend, - group_name=group_name - ) - self._model_update_group = group_name - else: - self._model_update_group = init_process_group( - backend=backend, - init_method=f"tcp://{master_address}:{master_port}", - world_size=world_size, - rank=rank, - group_name=group_name, - ) - self._model_update_with_ray = use_ray + self._model_update_group = init_process_group( + backend=backend, + init_method=f"tcp://{master_address}:{master_port}", + world_size=world_size, + rank=rank, 
+            group_name=group_name,
+        )
         print(
             f"init_process_group: master_address={master_address}, master_port={master_port}, ",
             f"rank={rank}, world_size={world_size}, group_name={group_name}",
@@ -44,11 +33,7 @@ class WorkerWrap(Worker):
         assert dtype == self.model_config.dtype, f"mismatch dtype: src {dtype}, dst {self.model_config.dtype}"
         weight = torch.empty(shape, dtype=dtype, device="cuda")
-        if self._model_update_with_ray:
-            import ray.util.collective as collective
-            collective.broadcast(weight, 0, group_name=self._model_update_group)
-        else:
-            torch.distributed.broadcast(weight, 0, group=self._model_update_group)
+        torch.distributed.broadcast(weight, 0, group=self._model_update_group)

         self.model_runner.model.load_weights(weights=[(name, weight)])
diff --git a/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/utils/vision_utils.py b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/utils/vision_utils.py
new file mode 100644
index 0000000000..8c4e18fc90
--- /dev/null
+++ b/PyTorch/built-in/rl/OpenRLHF_v0.5.7_for_PyTorch/openrlhf/utils/vision_utils.py
@@ -0,0 +1,703 @@
+import json
+import math
+import os
+import re
+from abc import ABC, abstractmethod
+from copy import deepcopy
+from dataclasses import dataclass, field
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from PIL.Image import Image as ImageObject
+from transformers import AutoConfig, AutoProcessor
+from typing_extensions import override
+
+IGNORE_INDEX = -100
+ImageInput = Union[str, bytes, ImageObject]
+SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]]
+IMAGE_PLACEHOLDER = os.environ.get("IMAGE_PLACEHOLDER", "<image>")
+VIDEO_PLACEHOLDER = os.environ.get("VIDEO_PLACEHOLDER", "