From 91013facd977ce4b0e112388b363e29cab57111d Mon Sep 17 00:00:00 2001
From: wangyibo <1605891897@qq.com>
Date: Thu, 29 May 2025 11:50:03 +0800
Subject: [PATCH] code_check: clear all findings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 auto_convert_llm.sh           |  7 ++---
 ci/access_control_test.py     |  8 +++---
 test_convert_llm.sh           |  6 ++--
 tests/st/net/utils.py         | 17 ++++++------
 tools/__init__.py             |  4 +--
 tools/debug_utils/__init__.py |  4 +--
 tools/rules/line_rules.py     | 52 +++++++++++++++++------------------
 tools/rules_rl/line_rules.py  | 30 ++++++++++----------
 8 files changed, 63 insertions(+), 65 deletions(-)

diff --git a/auto_convert_llm.sh b/auto_convert_llm.sh
index 09381b68..7bc18eff 100644
--- a/auto_convert_llm.sh
+++ b/auto_convert_llm.sh
@@ -30,7 +30,7 @@ fi
 rm -rf Megatron-LM/tests
 echo "..............................................done Megatron-LM"
 
-#msadaptor
+#msadapter
 rm -rf MSAdapter
 git clone https://openi.pcl.ac.cn/OpenI/MSAdapter.git -b master
 if [ $? -ne 0 ]; then
@@ -58,9 +58,9 @@ echo "..............................................done apply transformers"
 
 #accelerate
 rm -rf accelerate/
-git clone https://github.com/huggingface/accelerate.git -b v1.6.0
+git clone https://gitee.com/modelee/accelerate.git -b v1.6.0
 if [ $? -ne 0 ]; then
-    echo "Error: git clone msadaptor"
+    echo "Error: git clone accelerate"
     exit 1
 fi
 cd accelerate
@@ -77,7 +77,6 @@ python3 tools/transfer.py \
 --megatron_path ${MindSpeed_Core_MS_PATH}/Megatron-LM/megatron/ \
 --mindspeed_path ${MindSpeed_Core_MS_PATH}/MindSpeed/mindspeed/ \
 --mindspeed_llm_path ${MindSpeed_Core_MS_PATH}/MindSpeed-LLM/ \
---gongcang
 
 export PYTHONPATH=${MindSpeed_Core_MS_PATH}/MSAdapter/mindtorch:${MindSpeed_Core_MS_PATH}/Megatron-LM:${MindSpeed_Core_MS_PATH}/MindSpeed:${MindSpeed_Core_MS_PATH}/MindSpeed-LLM:${MindSpeed_Core_MS_PATH}/transformers/src/:${MindSpeed_Core_MS_PATH}/accelerate/src/:$PYTHONPATH
 echo $PYTHONPATH
diff --git a/ci/access_control_test.py b/ci/access_control_test.py
index 2e09a961..7aceec63 100644
--- a/ci/access_control_test.py
+++ b/ci/access_control_test.py
@@ -25,15 +25,15 @@ import xmlrunner
 # =============================
 def success_check(res):
     if res != 0:
-        raise CommandFailedError(f"命令执行失败,返回码: {res}")
+        raise CommandFailedError(f"Command execution failed, return code: {res}")
 
 
 def success_check_ut(res):
     if len(res.failures) + len(res.errors) != 0:
-        raise CommandFailedError(f"命令执行失败,返回码: {res}")
+        raise CommandFailedError(f"Unit test failures or errors detected: {res}")
 
 
-class ST_Test:
+class STTest:
 
     def __init__(self):
         self.shell_file_list = []
@@ -48,7 +48,7 @@
 
 
 if __name__ == "__main__":
-    st_test = ST_Test()
+    st_test = STTest()
     st_test.run_shell()
     test_loader = unittest.TestLoader()
     discover = test_loader.discover(start_dir="./", pattern="test*.py")
diff --git a/test_convert_llm.sh b/test_convert_llm.sh
index 526a0a8c..3a404373 100644
--- a/test_convert_llm.sh
+++ b/test_convert_llm.sh
@@ -30,7 +30,7 @@ fi
 rm -rf Megatron-LM/tests
 echo "..............................................done Megatron-LM"
 
-#msadaptor
+#msadapter
 rm -rf msadapter
 git clone https://gitee.com/mindspore/msadapter.git -b master
 if [ $? -ne 0 ]; then
@@ -58,9 +58,9 @@ echo "..............................................done apply transformers"
 
 #accelerate
 rm -rf accelerate/
-git clone https://github.com/huggingface/accelerate.git -b v1.6.0
+git clone https://gitee.com/modelee/accelerate.git -b v1.6.0
 if [ $? -ne 0 ]; then
-    echo "Error: git clone msadaptor"
+    echo "Error: git clone accelerate"
     exit 1
 fi
 cd accelerate
diff --git a/tests/st/net/utils.py b/tests/st/net/utils.py
index 90107484..f597e9d3 100644
--- a/tests/st/net/utils.py
+++ b/tests/st/net/utils.py
@@ -15,7 +15,6 @@ import re
 import logging
 
 logging.basicConfig(level=logging.INFO)
-num_npu = 8
 
 
 def parse_memory_file(fname):
@@ -53,15 +52,15 @@ def parse_script(file):
         context = f.read().split('\n')
     p_gbs = r'.*global-batch-size (\d*).*'
     p_len = r'.*seq-length (\d*).*'
-    gbs, len = None, None
-    for l in context:
-        match = re.match(p_gbs, l)
+    gbs, length = None, None
+    for line in context:
+        match = re.match(p_gbs, line)
         if match:
             gbs = match.group(1)
-        match = re.match(p_len, l)
+        match = re.match(p_len, line)
         if match:
-            len = match.group(1)
-    return gbs, len
+            length = match.group(1)
+    return gbs, length
 
 
 def parse_log_file(file):
@@ -70,8 +69,8 @@
     with open(file, 'r') as f:
         context = f.read().split('\n')
     data = {}
-    for l in context:
-        match = re.match(it_pattern, l)
+    for line in context:
+        match = re.match(it_pattern, line)
         if match:
             data[int(match.group(2))] = match.groups()
     return data
diff --git a/tools/__init__.py b/tools/__init__.py
index 623893e1..5b9f3f88 100644
--- a/tools/__init__.py
+++ b/tools/__init__.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-__all__ = ["breakpoint_", "clear_"]
+from .debug_utils import breakpoint_, clear_
 
-from .debug_utils import breakpoint_, clear_
\ No newline at end of file
+__all__ = ["breakpoint_", "clear_"]
diff --git a/tools/debug_utils/__init__.py b/tools/debug_utils/__init__.py
index 0e96e9c7..d24d7bcc 100644
--- a/tools/debug_utils/__init__.py
+++ b/tools/debug_utils/__init__.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-__all__ = ["breakpoint_", "clear_"]
-
 from .pdb_utils import breakpoint_, clear_
+
+__all__ = ["breakpoint_", "clear_"]
diff --git a/tools/rules/line_rules.py b/tools/rules/line_rules.py
index f6528f43..fe479623 100644
--- a/tools/rules/line_rules.py
+++ b/tools/rules/line_rules.py
@@ -1,6 +1,6 @@
 LINE_RULES = {
     "MindSpeed-LLM": {
-        "convert_ckpt.py":["""if __name__ == '__main__':
+        "convert_ckpt.py": ["""if __name__ == '__main__':
 +    import mindspore as ms
 +    ms.set_context(device_target = "CPU", pynative_synchronize=True)
 +    import torch
@@ -17,7 +17,7 @@ LINE_RULES = {
 from ..parallel_state import get_pipeline_model_parallel_node_info
 +from mindspore.communication import get_local_rank
 
-        logger = logging.getLogger(__name__)""",""" if share_save:
+        logger = logging.getLogger(__name__)""", """ if share_save:
             return rank == 0
         gpus_per_node = torch.cuda.device_count()
 -        current_rank = torch.cuda.current_device()
@@ -37,7 +37,7 @@ LINE_RULES = {
 """-        if self.inv_freq.device.type == 'cpu':
 -            # move `inv_freq` to GPU once at the first micro-batch forward pass
 -            self.inv_freq = self.inv_freq.to(device=torch.cuda.current_device())
-""",""" mode = 1 if rotary_interleaved else 0
+""", """ mode = 1 if rotary_interleaved else 0
 -    t = npu_rotary_position_embedding(t.contiguous(), cos_, sin_, mode).to(t.dtype)
 +    t = torch_npu.npu_rotary_position_embedding(t.contiguous(), cos_, sin_, mode).to(t.dtype)"""
         ],
@@ -68,19 +68,19 @@ LINE_RULES = {
 +            load_dir, trust_remote_code=trust_remote_code, local_files_only=True, low_cpu_mem_usage=False
 )]
 """],
-        "mindspeed_llm/tasks/models/transformer/multi_head_latent_attention.py":["""- output = torch.matmul(input_, self.weight.t())
+        "mindspeed_llm/tasks/models/transformer/multi_head_latent_attention.py": ["""- output = torch.matmul(input_, self.weight.t())
 + output = torch.matmul(input_.squeeze(1), self.weight.t())
 + output = output.unsqueeze(1)"""],
     },
-    "megatron":{
-        "core/tensor_parallel/cross_entropy.py":[
+    "megatron": {
+        "core/tensor_parallel/cross_entropy.py": [
 """ grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output
 )
 -        return grad_input, None, None
 +        return grad_input.to(torch.bfloat16), None, None"""
         ],
-        "core/pipeline_parallel/schedules.py":[
+        "core/pipeline_parallel/schedules.py": [
 """ from typing import Callable, Iterator, List, Optional, Union
 
 import torch
@@ -89,7 +89,7 @@ LINE_RULES = {
 +from mindspore.common.api import _pynative_executor
 
 from megatron.core import parallel_state
-from megatron.core.enums import ModelType""",""" set_input_tensor(input_tensor)
+from megatron.core.enums import ModelType""", """ set_input_tensor(input_tensor)
 +        if not parallel_state.is_pipeline_first_stage() and input_tensor is not None:
 +            input_tensor[0].retain_grad()
 
@@ -106,10 +106,10 @@
 +            _pynative_executor.set_grad_flag(True)
 +            _pynative_executor.new_graph(forward_step_func, input_tensor[0])
         with context_manager:
-            if checkpoint_activations_microbatch is None:""",""" forward_data_store.append(data)
+            if checkpoint_activations_microbatch is None:""", """ forward_data_store.append(data)
 +    _pynative_executor.end_graph(forward_step_func, output_tensor, input_tensor[0])
 
-    if config.timers is not None:""","""-def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config):
+    if config.timers is not None:""", """-def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config):
 +def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model):""", """ if not isinstance(input_tensor, list):
     input_tensor = [input_tensor]
 
@@ -119,7 +119,7 @@
 -        x.retain_grad()
 +
-    if not isinstance(output_tensor, list):""",""" if output_tensor_grad[0] is None and config.grad_scale_func is not None:
+    if not isinstance(output_tensor, list):""", """ if output_tensor_grad[0] is None and config.grad_scale_func is not None:
 -        output_tensor[0] = config.grad_scale_func(output_tensor[0])
 +        output_tensor_grad[0] = config.grad_scale_func(torch.ones_like(output_tensor[0]))
 +    if output_tensor_grad[0] is None:
 
@@ -140,40 +140,40 @@
 +    _pynative_executor.grad(config.forward_step_func, grad_, weights, None, input_tensor[0], output_tensor_grad[0])
 
     # Collect the grad of the input_tensor.
-    input_tensor_grad = [None]""",""" else:
+    input_tensor_grad = [None]""", """ else:
             input_tensor_grad.append(x.grad)
 +        if not parallel_state.is_pipeline_first_stage():
 +            model.module.set_input_tensor(None)
 +
     # Handle single skip connection if it exists (encoder_hidden_state in
-    # model with encoder and decoder).""",""" config = get_model_config(model)
+    # model with encoder and decoder).""", """ config = get_model_config(model)
 +    config.forward_step_func = forward_step_func
 
-    if config.timers is not None:""",""" forward_data_store = []
+    if config.timers is not None:""", """ forward_data_store = []
 -    input_tensor, output_tensor_grad = None, None
 +    input_tensor, output_tensor_grad = [None], [None]
 
-    total_num_tokens = torch.zeros([], dtype=torch.int, device="cuda")""",""" if not forward_only:
+    total_num_tokens = torch.zeros([], dtype=torch.int, device="cuda")""", """ if not forward_only:
 -        backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)
 +        backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model)
 
-    # Run computation for last microbatch out of context handler (want to""",""" if not forward_only:
+    # Run computation for last microbatch out of context handler (want to""", """ if not forward_only:
 -        backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)
 +        backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model)
 
-    if config.finalize_model_grads_func is not None and not forward_only:""",""" config = get_model_config(model[0])
+    if config.finalize_model_grads_func is not None and not forward_only:""", """ config = get_model_config(model[0])
 +    config.forward_step_func = forward_step_func
 
-    if config.overlap_p2p_comm and config.batch_p2p_comm:""",""" input_tensors[model_chunk_id].append(None)
+    if config.overlap_p2p_comm and config.batch_p2p_comm:""", """ input_tensors[model_chunk_id].append(None)
 +                if input_tensors[model_chunk_id][-1] is None:
 +                    input_tensors[model_chunk_id][-1] = torch.tensor(0, dtype=torch.int)
-                input_tensor = input_tensors[model_chunk_id][-1]""",""" output_tensor_grad = output_tensor_grads[model_chunk_id].pop(0)
+                input_tensor = input_tensors[model_chunk_id][-1]""", """ output_tensor_grad = output_tensor_grads[model_chunk_id].pop(0)
 -            input_tensor_grad = backward_step(
 -                input_tensor, output_tensor, output_tensor_grad, model_type, config
 -            )
 +            input_tensor_grad = backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model[model_chunk_id])
 
 -    # launch grad synchronization (custom grad sync)""", """ config = get_model_config(model)
 +    config.forward_step_func = forward_step_func
 
-    if config.overlap_p2p_comm:""",""" if config.grad_sync_func is None or rank == 0:
+    if config.overlap_p2p_comm:""", """ if config.grad_sync_func is None or rank == 0:
             enable_grad_sync()
 -        input_tensor_grad = backward_step(
@@ -181,7 +181,7 @@
 -        )
 +        input_tensor_grad = backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model)
 
-    if last_iteration:""",""" output_tensor_grad = recv_backward(send_tensor_shapes, config)
+    if last_iteration:""", """ output_tensor_grad = recv_backward(send_tensor_shapes, config)
 -        input_tensor_grad = backward_step(
 -            input_tensor, output_tensor, output_tensor_grad, model_type, config
@@ -190,8 +190,8 @@
 send_backward(input_tensor_grad, recv_tensor_shapes, config)"""],
     },
-    "mindspeed":{
-        "core/auto_parallel/auto_parallel_apply.py":[""" from mindspeed.core.auto_parallel import set_kv_store
+    "mindspeed": {
+        "core/auto_parallel/auto_parallel_apply.py": [""" from mindspeed.core.auto_parallel import set_kv_store
 -from mindspeed.core.auto_parallel.auto_parallel_optimizer import SearchByGreyBox
 from mindspeed.core.auto_parallel.auto_parallel_memory import MemoryCostModel
 """],
@@ -199,7 +199,7 @@
 }
 
 SPECIAL_RULES = {
-    "megatron":{
+    "megatron": {
         "core/tensor_parallel/cross_entropy.py":
             [(r"masked_target\[target_mask\] = 0", "masked_target *= (1-target_mask)"),
             (r"predicted_logits\[target_mask\] = 0\.0", "predicted_logits *= (1-target_mask)"),
@@ -219,7 +219,7 @@ SPECIAL_RULES = {
             (r"log_string \+= \' grad norm\: \{\:\.3f\} \|\'\.format\(grad_norm\)",
             "log_string += ' grad norm: {:.16f} |'.format(grad_norm)")
         ]
     },
-    "mindspeed":{
+    "mindspeed": {
         "core/transformer/moe/token_dispatcher.py":
             [(r"\.to\(\n?torch\.device\(\"cpu\"\)\)\n?", ""),
             (r"\.to\(\n?.*torch\.device\(\"cpu\"\),.*\n?.*\)", ""),
diff --git a/tools/rules_rl/line_rules.py b/tools/rules_rl/line_rules.py
index 2c8a4ea6..e3015440 100644
--- a/tools/rules_rl/line_rules.py
+++ b/tools/rules_rl/line_rules.py
@@ -9,14 +9,14 @@
 -    'megatron.core.parallel_state.get_nccl_options', get_nccl_options_wrapper)
 """
         ],
-        "mindspeed_llm/mindspore/mindspore_adaptor.py":[
+        "mindspeed_llm/mindspore/mindspore_adaptor.py": [
 """-        MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel', GPTModel)
 +        MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel', GPTModel, force_patch=True)"""
         ],
         "mindspeed_llm/core/datasets/blended_megatron_dataset_builder.py": [""" from ..parallel_state import get_pipeline_model_parallel_node_info
 +from mindspore.communication import get_local_rank
 
-        logger = logging.getLogger(__name__)""",""" if share_save:
+        logger = logging.getLogger(__name__)""", """ if share_save:
             return rank == 0
         gpus_per_node = torch.cuda.device_count()
 -        current_rank = torch.cuda.current_device()
@@ -36,7 +36,7 @@
 """-        if self.inv_freq.device.type == 'cpu':
 -            # move `inv_freq` to GPU once at the first micro-batch forward pass
 -            self.inv_freq = self.inv_freq.to(device=torch.cuda.current_device())
-""",""" mode = 1 if rotary_interleaved else 0
+""", """ mode = 1 if rotary_interleaved else 0
 -    t = npu_rotary_position_embedding(t.contiguous(), cos_, sin_, mode).to(t.dtype)
 +    t = torch_npu.npu_rotary_position_embedding(t.contiguous(), cos_, sin_, mode).to(t.dtype)"""
         ],
@@ -57,12 +57,12 @@
 """ self.module = [AutoModelForCausalLM.from_pretrained(
 -            load_dir, device_map=device_map, trust_remote_code=trust_remote_code, local_files_only=True
 +            load_dir, trust_remote_code=trust_remote_code, local_files_only=True, low_cpu_mem_usage=False"""],
-        "mindspeed_llm/tasks/models/transformer/multi_head_latent_attention.py":["""- output = torch.matmul(input_, self.weight.t())
+        "mindspeed_llm/tasks/models/transformer/multi_head_latent_attention.py": ["""- output = torch.matmul(input_, self.weight.t())
 + output = torch.matmul(input_.squeeze(1), self.weight.t())
 + output = output.unsqueeze(1)"""],
     },
-    "megatron":{
-        "core/tensor_parallel/cross_entropy.py":[
+    "megatron": {
+        "core/tensor_parallel/cross_entropy.py": [
 """ grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output
 )
@@ -70,7 +70,7 @@
 +        return grad_input.to(torch.bfloat16), None, None"""
         ],
     },
-    "mindspeed":{
+    "mindspeed": {
     },
 
     "mindspeed-rl": {
@@ -851,7 +851,7 @@ def register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.ALL, blocki
     return decorator
 """
         ],
-        "mindspeed_rl/workers/scheduler/ray.py" :["""# Copyright 2024 Bytedance Ltd. and/or its affiliates
+        "mindspeed_rl/workers/scheduler/ray.py" : ["""# Copyright 2024 Bytedance Ltd. and/or its affiliates
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1156,7 +1156,7 @@ class Worker(WorkerHelper):
 +            kv_cache.append(layer_kv_cache.view(kv_cache_shape))"""
         ],
     },
-    "vllm-ascend":{
+    "vllm-ascend": {
         "vllm_ascend/attention.py": [
 """ mask_value = torch.finfo(torch.float32).min
         else:
@@ -1320,12 +1320,12 @@ class Worker(WorkerHelper):
 self.cache_engine[ve].gpu_cache for ve in range(self.parallel_config.pipeline_parallel_size)"""
         ],
-        "vllm_ascend.egg-info/entry_points.txt":["""[vllm.general_plugins]
+        "vllm_ascend.egg-info/entry_points.txt": ["""[vllm.general_plugins]
 ascend_enhanced_model = vllm_ascend:register_model
 
 [vllm.platform_plugins]
 ascend = vllm_ascend:register"""],
-        "vllm_ascend.egg-info/PKG-INFO":["""Metadata-Version: 2.2
+        "vllm_ascend.egg-info/PKG-INFO": ["""Metadata-Version: 2.2
 Name: vllm_ascend
 Version: 0.1.dev68+g806235f.d20250308
 Summary: vLLM Ascend backend plugin
@@ -1337,7 +1337,7 @@
 Project-URL: Homepage, https://github.com/vllm-project/vllm-ascend"""],
 }
 
 SPECIAL_RULES = {
-    "megatron":{
+    "megatron": {
         "core/tensor_parallel/cross_entropy.py":
             [(r"masked_target\[target_mask\] = 0", "masked_target *= (1-target_mask)"),
             (r"predicted_logits\[target_mask\] = 0\.0", "predicted_logits *= (1-target_mask)"),
@@ -1357,7 +1357,7 @@ SPECIAL_RULES = {
             (r"log_string \+= \' grad norm\: \{\:\.3f\} \|\'\.format\(grad_norm\)",
             "log_string += ' grad norm: {:.16f} |'.format(grad_norm)")
         ]
     },
-    "mindspeed":{
+    "mindspeed": {
         "core/transformer/moe/token_dispatcher.py":
             [(r"\.to\(\n?torch\.device\(\"cpu\"\)\)\n?", ""),
             (r"\.to\(\n?.*torch\.device\(\"cpu\"\),.*\n?.*\)", ""),
@@ -1369,13 +1369,13 @@
         "pytorch_utils.py": [(r"from safetensors\.torch import storage_ptr\, storage_size", "")]
     },
-    "mindspeed-rl":{
+    "mindspeed-rl": {
         "mindspeed_rl/datasets/dataloader.py": [("pin_memory=True", "pin_memory=False")],
         "mindspeed_rl/datasets/prompt_dataset.py": [("pin_memory=True", "pin_memory=False")],
     },
-    "vllm":{
+    "vllm": {
     },
 
     "vllm-ascend": {
-- 
Gitee
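
Note on the rules tables touched above: LINE_RULES and SPECIAL_RULES are the data presumably consumed when tools/transfer.py (invoked in auto_convert_llm.sh) rewrites the cloned Megatron-LM/MindSpeed sources for MindSpore. A minimal sketch of how the SPECIAL_RULES half could work, per-file regex substitution, is given below. The helper name apply_special_rules and its signature are assumptions for illustration only, not code from this repository.

    # Illustrative sketch only -- not part of this patch.
    import os
    import re

    def apply_special_rules(repo_root, rules):
        # rules maps a relative file path to a list of
        # (regex_pattern, replacement) pairs, as in SPECIAL_RULES.
        for rel_path, pairs in rules.items():
            path = os.path.join(repo_root, rel_path)
            if not os.path.exists(path):
                continue
            with open(path, "r", encoding="utf-8") as f:
                text = f.read()
            for pattern, replacement in pairs:
                # re.sub leaves text unchanged when the pattern is absent,
                # so re-running the conversion is harmless for these rules.
                text = re.sub(pattern, replacement, text)
            with open(path, "w", encoding="utf-8") as f:
                f.write(text)

    # Example with one rule from tools/rules/line_rules.py:
    # apply_special_rules(
    #     "Megatron-LM/megatron",
    #     {"core/tensor_parallel/cross_entropy.py":
    #         [(r"masked_target\[target_mask\] = 0",
    #           "masked_target *= (1-target_mask)")]})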