From a806274a93396c77c553c24abc221c5bdb3fcf2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AE=9C=E6=9D=B0?=
Date: Wed, 10 Dec 2025 19:26:28 +0800
Subject: [PATCH 1/2] Add MindFormers model configs

---
 .../models/hf_models/mindformers_model.py     |  32 +
 ais_bench/benchmark/models/__init__.py        |   4 +-
 .../benchmark/models/mindformers_model.py     | 657 ++++++++++++++++++
 .../benchmark/models/mindformers_model_v1.py  | 544 +++++++++++++++
 4 files changed, 1236 insertions(+), 1 deletion(-)
 create mode 100644 ais_bench/benchmark/configs/models/hf_models/mindformers_model.py
 create mode 100644 ais_bench/benchmark/models/mindformers_model.py
 create mode 100644 ais_bench/benchmark/models/mindformers_model_v1.py

diff --git a/ais_bench/benchmark/configs/models/hf_models/mindformers_model.py b/ais_bench/benchmark/configs/models/hf_models/mindformers_model.py
new file mode 100644
index 0000000000..92c7635789
--- /dev/null
+++ b/ais_bench/benchmark/configs/models/hf_models/mindformers_model.py
@@ -0,0 +1,32 @@
+from ais_bench.benchmark.models import MindFormerModel
+
+models = [
+    dict(
+        attr="local",  # local or service
+        type=MindFormerModel,  # MindFormers local backend: the network is built from the YAML config below and the checkpoint is loaded from safetensors
+        abbr='mindformer-model',
+        path='/path/to/model_dir/',  # path to the model directory; the value here is just an example
+        checkpoint = '/path/to/checkpoint_path/',
+        yaml_cfg_file = '/path/to/your.yaml',
+        tokenizer_path='/path/to/tokenizer_dir/',  # path to the tokenizer directory; the value here is just an example
+        model_kwargs=dict(  # model kwargs, see huggingface.co/docs/transformers/v4.50.0/en/model_doc/auto#transformers.AutoModel.from_pretrained
+            device_map='auto',
+        ),
+        tokenizer_kwargs=dict(  # tokenizer kwargs, see huggingface.co/docs/transformers/v4.50.0/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase
+            padding_side='left',
+        ),
+        generation_kwargs = dict(  # generation kwargs, see huggingface.co/docs/transformers/main_classes/text_generation
+            temperature = 0.5,
+            top_k = 10,
+            top_p = 0.95,
+            do_sample = True,
+            seed = None,
+            repetition_penalty = 1.03,
+        ),
+        run_cfg = dict(num_gpus=1, num_procs=1),  # multi-device / multi-node settings; tasks are launched with torchrun
+        max_out_len=100,  # maximum number of output tokens
+        batch_size=2,  # batch size for each inference call
+        max_seq_len=2048,
+        batch_padding=True,
+    )
+]
\ No newline at end of file
diff --git a/ais_bench/benchmark/models/__init__.py b/ais_bench/benchmark/models/__init__.py
index b95b5e17ce..9d6a7a0248 100644
--- a/ais_bench/benchmark/models/__init__.py
+++ b/ais_bench/benchmark/models/__init__.py
@@ -8,4 +8,6 @@ from ais_bench.benchmark.models.huggingface_above_v4_33 import HuggingFaceBaseMo
 from ais_bench.benchmark.models.tgi_api import TGICustomAPI, TGICustomAPIStream
 from ais_bench.benchmark.models.triton_api import TritonCustomAPI, TritonCustomAPIStream
 from ais_bench.benchmark.models.vllm_custom_api_chat_multiturn import VllmMultiturnAPIChatStream
-from ais_bench.benchmark.models.vllm_function_call_api_chat import VLLMFunctionCallAPIChat
\ No newline at end of file
+from ais_bench.benchmark.models.vllm_function_call_api_chat import VLLMFunctionCallAPIChat
+from ais_bench.benchmark.models.mindformers_model import MindFormerModel
+from ais_bench.benchmark.models.mindformers_model_v1 import MindFormerModel_V1
\ No newline at end of file
diff --git a/ais_bench/benchmark/models/mindformers_model.py b/ais_bench/benchmark/models/mindformers_model.py
new file mode 100644
index 0000000000..44faafefa4
--- /dev/null
+++ b/ais_bench/benchmark/models/mindformers_model.py
@@ -0,0 +1,657 @@
+import 
os, sys +from typing import Dict, List, Optional, Union + +import numpy as np +import torch +import transformers + +from ais_bench.benchmark.models.base import BaseModel +from ais_bench.benchmark.models.base_api import APITemplateParser +from ais_bench.benchmark.registry import MODELS +from ais_bench.benchmark.utils.logging import get_logger +from ais_bench.benchmark.utils.prompt import PromptList + +from mindformers import MindFormerConfig, build_context +from mindformers.models import build_network +from mindformers.utils.load_checkpoint_utils import load_checkpoint_with_safetensors, get_load_path_after_hf_convert +from mindspore import Tensor +PromptType = Union[PromptList, str, dict] + + +class MultiTokenEOSCriteria(transformers.StoppingCriteria): + """Criteria to stop on the specified multi-token sequence.""" + + def __init__( + self, + sequence: str, + + tokenizer: transformers.PreTrainedTokenizer, + batch_size: int, + ): + self.done_tracker = [False] * batch_size + self.sequence = sequence + self.sequence_ids = tokenizer.encode(sequence, + add_special_tokens=False) + self.sequence_id_len = len(self.sequence_ids) + self.tokenizer = tokenizer + + def __call__(self, input_ids, scores, **kwargs) -> bool: + # compare the last len(stop) tokens + lookback_ids_batch = input_ids[:, -self.sequence_id_len:] + lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) + for i, done in enumerate(self.done_tracker): + if done: + continue + self.done_tracker[i] = self.sequence in lookback_tokens_batch[i] + return False not in self.done_tracker + + +@MODELS.register_module() +class MindFormerModel(BaseModel): + + def __init__(self, + path: str, + checkpoint: Optional[str] = None, + yaml_cfg_file: Optional[str] = None, + hf_cache_dir: Optional[str] = None, + max_seq_len: int = 2048, + tokenizer_path: Optional[str] = None, + tokenizer_kwargs: dict = dict(), + peft_path: Optional[str] = None, + tokenizer_only: bool = False, + model_kwargs: dict = dict(device_map='auto'), + generation_kwargs: dict = dict(), + meta_template: Optional[Dict] = None, + extract_pred_after_decode: bool = False, + batch_padding: bool = False, + pad_token_id: Optional[int] = None, + mode: str = 'none', + use_fastchat_template: bool = False, + end_str: Optional[str] = None): + super().__init__(path=path, + max_seq_len=max_seq_len, + tokenizer_only=tokenizer_only, + meta_template=meta_template) + if hf_cache_dir is None: + hf_cache_dir = os.getenv('HF_MODEL_HUB', None) + self.logger = get_logger() + self.pad_token_id = pad_token_id + self.pretrained_model_path = path + assert mode in ['none', 'mid'] + self.mode = mode + self.config = MindFormerConfig(yaml_cfg_file) + self.checkpoint = checkpoint + self._load_tokenizer(path=path, + tokenizer_path=tokenizer_path, + tokenizer_kwargs=tokenizer_kwargs) + self.batch_padding = batch_padding + self.extract_pred_after_decode = extract_pred_after_decode + if not tokenizer_only: + self._load_model(path=path, + model_kwargs=model_kwargs, + peft_path=peft_path) + self.generation_kwargs = generation_kwargs + self.use_fastchat_template = use_fastchat_template + self.end_str = end_str + + def _load_tokenizer(self, path: str, tokenizer_path: Optional[str], + tokenizer_kwargs: dict): + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_path if tokenizer_path else path, **tokenizer_kwargs) + + # A patch for some models without pad_token_id + if self.pad_token_id is not None: + if self.pad_token_id < 0: + self.pad_token_id += 
self.tokenizer.vocab_size
+            if self.tokenizer.pad_token_id is None:
+                self.logger.debug(f'Using {self.pad_token_id} as pad_token_id')
+            elif self.tokenizer.pad_token_id != self.pad_token_id:
+                self.logger.warning(
+                    'pad_token_id is not consistent with the tokenizer. Using '
+                    f'{self.pad_token_id} as pad_token_id')
+            self.tokenizer.pad_token_id = self.pad_token_id
+        elif self.tokenizer.pad_token_id is None:
+            self.logger.warning('pad_token_id is not set for the tokenizer.')
+            if self.tokenizer.eos_token is not None:
+                self.logger.warning(
+                    f'Using eos_token_id {self.tokenizer.eos_token} '
+                    'as pad_token_id.')
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            else:
+                from transformers.generation import GenerationConfig
+                gcfg = GenerationConfig.from_pretrained(path)
+
+                if gcfg.pad_token_id is not None:
+                    self.logger.warning(
+                        f'Using pad_token_id {gcfg.pad_token_id} '
+                        'as pad_token_id.')
+                    self.tokenizer.pad_token_id = gcfg.pad_token_id
+                else:
+                    raise ValueError(
+                        'pad_token_id is not set for this tokenizer. Try to '
+                        'set pad_token_id via passing '
+                        '`pad_token_id={PAD_TOKEN_ID}` in model_cfg.')
+
+        # A patch for llama when batch_padding = True
+        if 'decapoda-research/llama' in path or \
+                (tokenizer_path and
+                 'decapoda-research/llama' in tokenizer_path):
+            self.logger.warning('We set new pad_token_id for LLaMA model')
+            self.tokenizer.bos_token = '<s>'
+            self.tokenizer.eos_token = '</s>'
+            self.tokenizer.pad_token_id = 0
+
+    def _set_config_from_yaml(self):
+        self.config.load_checkpoint = self.checkpoint
+        self.config.model.pretrained_model_dir = self.pretrained_model_path
+        self.config.model.model_config.seq_length = self.max_seq_len
+        build_context(self.config)
+
+    def _load_model(self,
+                    path: str,
+                    model_kwargs: dict,
+                    peft_path: Optional[str] = None):
+        self._set_config_from_yaml()  # apply checkpoint/seq_length overrides from the YAML and build the MindSpore context
+        try:
+            self.model = build_network(self.config.model)
+            self.model.set_train(False)
+            self.logger.debug(f'load_checkpoint before HF conversion: {self.config.load_checkpoint}')
+            self.config.load_checkpoint = get_load_path_after_hf_convert(self.config, self.model)
+            self.logger.debug(f'load_checkpoint after HF conversion: {self.config.load_checkpoint}')
+            load_checkpoint_with_safetensors(self.config, None, self.model, None)
+        except ValueError as err:
+            raise ValueError('Failed to build MindFormers model, please check configuration') from err
+
+        # A patch for llama when batch_padding = True
+        if 'decapoda-research/llama' in path:
+            self.model.config.bos_token_id = 1
+            self.model.config.eos_token_id = 2
+            self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+    def generate(self,
+                 inputs: List[str],
+                 max_out_len: int,
+                 min_out_len: Optional[int] = None,
+                 stopping_criteria: List[str] = [],
+                 **kwargs) -> List[str]:
+        """Generate results given a list of inputs.
+
+        Args:
+            inputs (List[str]): A list of strings.
+            max_out_len (int): The maximum length of the output.
+            min_out_len (Optional[int]): The minimum length of the output.
+
+        Returns:
+            List[str]: A list of generated strings.
+ """ + generation_kwargs = kwargs.copy() + generation_kwargs.update(self.generation_kwargs) + if self.batch_padding and len(inputs) > 1: + return self._batch_generate(inputs=inputs, + max_out_len=max_out_len, + min_out_len=min_out_len, + stopping_criteria=stopping_criteria, + **generation_kwargs) + else: + return sum( + (self._single_generate(inputs=[input_], + max_out_len=max_out_len, + min_out_len=min_out_len, + stopping_criteria=stopping_criteria, + **generation_kwargs) + for input_ in inputs), []) + + def _batch_generate(self, + inputs: List[str], + max_out_len: int, + min_out_len: Optional[int] = None, + stopping_criteria: List[str] = [], + **kwargs) -> List[str]: + """Support for batch prompts inference. + + Args: + inputs (List[str]): A list of strings. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + origin_stopping_criteria = list(stopping_criteria) + prompt_char_lens = None + if self.extract_pred_after_decode: + prompt_char_lens = [len(text) for text in inputs] + + batch_inputs = list(inputs) + if self.use_fastchat_template: + try: + from fastchat.model import get_conversation_template + except ModuleNotFoundError: + raise ModuleNotFoundError( + 'Fastchat is not implemented. You can use ' + "'pip install \"fschat[model_worker,webui]\"' " + 'to implement fastchat.') + for idx, text in enumerate(batch_inputs): + conv = get_conversation_template('vicuna') + conv.append_message(conv.roles[0], text) + conv.append_message(conv.roles[1], None) + batch_inputs[idx] = conv.get_prompt() + + encoded = self.tokenizer(batch_inputs, + padding=True, + truncation=True, + max_length=self.max_seq_len, + return_tensors='np') + input_ids = encoded['input_ids'] + attention_mask = encoded.get('attention_mask') + prompt_token_lens = ( + attention_mask.sum(axis=1).astype(int).tolist() + if attention_mask is not None else + [input_ids.shape[1]] * len(batch_inputs) + ) + input_ids_tensor = Tensor(input_ids) + + generation_kwargs = { + k: v for k, v in kwargs.items() + if k not in ['is_synthetic', 'batch_size'] + } + if min_out_len is not None: + generation_kwargs['min_new_tokens'] = min_out_len + generation_kwargs['max_new_tokens'] = max_out_len + generation_kwargs.setdefault('top_k', 1) + generation_kwargs.setdefault('return_dict_in_generate', False) + + outputs = self.model.generate(input_ids=input_ids_tensor, + **generation_kwargs) + + if hasattr(outputs, 'asnumpy'): + outputs = outputs.asnumpy() + + def _to_token_list(seq): + if hasattr(seq, 'asnumpy'): + seq = seq.asnumpy() + if isinstance(seq, np.ndarray): + return seq.tolist() + return list(seq) + + if isinstance(outputs, np.ndarray): + sequences = [row.tolist() for row in outputs] + elif isinstance(outputs, list): + sequences = [_to_token_list(seq) for seq in outputs] + else: + sequences = [_to_token_list(seq) for seq in list(outputs)] + + if not self.extract_pred_after_decode: + sequences = [ + seq[prompt_len:] + for seq, prompt_len in zip(sequences, prompt_token_lens) + ] + + decoded = [ + self.tokenizer.decode(seq, skip_special_tokens=True) + for seq in sequences + ] + + if self.extract_pred_after_decode and prompt_char_lens is not None: + decoded = [ + text[length:] + for text, length in zip(decoded, prompt_char_lens) + ] + + if self.end_str: + decoded = [text.split(self.end_str)[0] for text in decoded] + if origin_stopping_criteria: + for token in origin_stopping_criteria: + decoded = [text.split(token)[0] for text in decoded] + return decoded + + def _single_generate(self, 
+ inputs: List[str], + max_out_len: int, + min_out_len: Optional[int] = None, + stopping_criteria: List[str] = [], + **kwargs) -> List[str]: + """Support for single prompt inference. + + Args: + inputs (List[str]): A list of strings. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + origin_stopping_criteria = list(stopping_criteria) + prompt_char_lens = None + if self.extract_pred_after_decode: + prompt_char_lens = [len(text) for text in inputs] + + single_inputs = list(inputs) + if self.use_fastchat_template and single_inputs: + try: + from fastchat.model import get_conversation_template + except ModuleNotFoundError: + raise ModuleNotFoundError( + 'Fastchat is not implemented. You can use ' + "'pip install \"fschat[model_worker,webui]\"' " + 'to implement fastchat.') + for idx, text in enumerate(single_inputs): + conv = get_conversation_template('vicuna') + conv.append_message(conv.roles[0], text) + conv.append_message(conv.roles[1], None) + single_inputs[idx] = conv.get_prompt() + + if self.mode == 'mid' and single_inputs: + full_tokens = self.tokenizer(single_inputs, truncation=False)['input_ids'] + processed_inputs = [] + for token_ids in full_tokens: + if len(token_ids) > self.max_seq_len - max_out_len: + half = int((self.max_seq_len - max_out_len) / 2) + trimmed = ( + self.tokenizer.decode(token_ids[:half], skip_special_tokens=True) + + self.tokenizer.decode(token_ids[-half:], skip_special_tokens=True) + ) + processed_inputs.append(trimmed) + else: + processed_inputs.append( + self.tokenizer.decode(token_ids, skip_special_tokens=True)) + single_inputs = processed_inputs + + encoded = self.tokenizer(single_inputs, + truncation=True, + max_length=max(1, self.max_seq_len - max_out_len), + return_tensors='np') + input_ids = encoded['input_ids'] + attention_mask = encoded.get('attention_mask') + prompt_token_lens = ( + attention_mask.sum(axis=1).astype(int).tolist() + if attention_mask is not None else + [input_ids.shape[1]] * len(single_inputs) + ) + input_ids_tensor = Tensor(input_ids) + + stopping_tokens = None + if origin_stopping_criteria: + stopping_tokens = list(origin_stopping_criteria) + if self.tokenizer.eos_token is not None: + stopping_tokens = stopping_tokens + [ + self.tokenizer.eos_token + ] + stopping_list = transformers.StoppingCriteriaList([ + *[ + MultiTokenEOSCriteria(sequence, self.tokenizer, + input_ids_tensor.shape[0]) + for sequence in stopping_tokens + ], + ]) + else: + stopping_list = None + + generation_kwargs = { + k: v for k, v in kwargs.items() + if k not in ['is_synthetic', 'batch_size'] + } + if stopping_list is not None: + generation_kwargs['stopping_criteria'] = stopping_list + if min_out_len is not None: + generation_kwargs['min_new_tokens'] = min_out_len + generation_kwargs['max_new_tokens'] = max_out_len + + outputs = self.model.generate(input_ids=input_ids_tensor, + **generation_kwargs) + + if hasattr(outputs, 'asnumpy'): + outputs = outputs.asnumpy() + + def _to_token_list(seq): + if hasattr(seq, 'asnumpy'): + seq = seq.asnumpy() + if isinstance(seq, np.ndarray): + return seq.tolist() + return list(seq) + + if isinstance(outputs, np.ndarray): + sequences = [row.tolist() for row in outputs] + elif isinstance(outputs, list): + sequences = [_to_token_list(seq) for seq in outputs] + else: + sequences = [_to_token_list(seq) for seq in list(outputs)] + + if not self.extract_pred_after_decode: + sequences = [ + seq[prompt_len:] + for seq, prompt_len in zip(sequences, prompt_token_lens) + ] 
+ + decoded = [ + self.tokenizer.decode(seq, skip_special_tokens=True) + for seq in sequences + ] + + if self.extract_pred_after_decode and prompt_char_lens is not None: + decoded = [ + text[length:] + for text, length in zip(decoded, prompt_char_lens) + ] + + if self.end_str: + decoded = [text.split(self.end_str)[0] for text in decoded] + if origin_stopping_criteria: + for token in origin_stopping_criteria: + decoded = [text.split(token)[0] for text in decoded] + return decoded + + def get_logits(self, inputs: List[str]): + + if self.batch_padding and len(inputs) > 1: + # batch inference + tokens = self.tokenizer(inputs, + padding=True, + truncation=True, + max_length=self.max_seq_len) + + tokens = { + k: Tensor(np.array(tokens[k])) + for k in tokens if k in ['input_ids', 'attention_mask'] + } + outputs = self.model(**tokens) + + else: + input_ids = self.tokenizer( + inputs, + padding=False, + truncation=True, + max_length=self.max_seq_len)['input_ids'] + input_ids = Tensor(input_ids) + tokens = {'input_ids': input_ids} + + outputs = self.model(input_ids) + return outputs[0], {'tokens': tokens} + + def get_ppl(self, + inputs: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + """Get perplexity scores given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + mask_length (Optional[List[int]]): A list of mask lengths. If + provided, the perplexity scores will be calculated with the + first mask_length[i] tokens masked out. It's okay to skip + its implementation if advanced features in PPLInfernecer is + not needed. + + Returns: + List[float]: A list of perplexity scores. + """ + + if self.batch_padding and len(inputs) > 1: + assert self.tokenizer.pad_token + return self._get_ppl(inputs, mask_length=mask_length) + else: + return np.concatenate([ + self._get_ppl(inputs=[text], mask_length=mask_length) + for text in inputs + ]) + + def _get_ppl(self, + inputs: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + """Get perplexity scores given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + mask_length (Optional[List[int]]): A list of mask lengths. If + provided, the perplexity scores will be calculated with the + first mask_length[i] tokens masked out. It's okay to skip + its implementation if advanced features in PPLInfernecer is + not needed. + + Returns: + List[float]: A list of perplexity scores. + """ + + outputs, inputs = self.get_logits(inputs) + shift_logits = outputs[..., :-1, :].contiguous().float() + + shift_labels = inputs['tokens']['input_ids'][..., 1:].contiguous() + + loss_fct = torch.nn.CrossEntropyLoss( + reduction='none', ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)).view(shift_labels.size()) + + if mask_length is not None: + mask = torch.zeros_like(shift_labels) # [batch,seqlen] + for i in range(len(mask)): + for j in range(mask_length[i] - 1, len(mask[i])): + mask[i][j] = 1 + loss = loss * mask + + lens = (inputs['tokens']['input_ids'] != + self.tokenizer.pad_token_id).sum(-1).cpu().numpy() + if mask_length is not None: + lens -= np.array(mask_length) + ce_loss = loss.float().sum(-1).cpu().detach().numpy() / lens + return ce_loss + + def get_loglikelihood( + self, + inputs: List[str], + conts: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + """Get loglikelihood scores given a list of inputs. + + Args: + inputs (List[str]): A list of strings. 
+ conts (List[str]): A list of strings: slices after the space. + NOT SUPPORT mask_length YET! + mask_length (Optional[List[int]]): A list of mask lengths. If + provided, the perplexity scores will be calculated with the + first mask_length[i] tokens masked out. It's okay to skip + its implementation if advanced features in PPLInfernecer is + not needed. + + Returns: + List[float]: A list of loglikelihood scores. + """ + assert mask_length is None, 'Not support mask_length yet.' + if self.batch_padding and len(inputs) > 1: + assert self.tokenizer.pad_token + return self._get_loglikelihood(inputs, conts) + else: + return np.concatenate([ + self._get_loglikelihood(inputs=[inputs[idx]], + conts=[conts[idx]]) + for idx in range(len(inputs)) + ]) + + def _get_loglikelihood(self, inputs: str, conts: str) -> float: + """Get loglikelihood scores given input string and continuation string. + + Args: + inputs (str): string. + conts (str): strings: slices after the space. + Returns: + float: loglikelihood scores. + """ + input_tokenizer_out = self.tokenizer(inputs, + padding=True, + truncation=False, + return_length=True, + return_tensors='pt').to( + self.model.device) + + input_ids = input_tokenizer_out['input_ids'][:, :self.max_seq_len] + input_length = input_tokenizer_out['length'] + context_ids = [ + self.tokenizer(inputs[i].replace(conts[i], ''), + padding=False, + truncation=True, + max_length=self.max_seq_len)['input_ids'] + for i in range(len(inputs)) + ] + # forward + outputs = self.model(input_ids)['logits'] + outputs = torch.nn.functional.log_softmax(outputs, dim=-1) + # calculate loglikelihood + answer = np.zeros(len(inputs)) + for i in range(len(inputs)): + if self.tokenizer.padding_side == 'right': + cont_ids = input_ids[i, len(context_ids[i]):input_length[i]] + logits = outputs[i, + len(context_ids[i]) - 1:input_length[i] - + 1, :] # noqa + else: + cont_ids = input_ids[i, len(context_ids[i]) - input_length[i]:] + logits = outputs[i, + len(context_ids[i]) - input_length[i] - 1:-1] + # Reducing the dimension will lead to a wrong outcome + logits_gather = torch.gather( + logits.unsqueeze(0), 2, + cont_ids.unsqueeze(0).unsqueeze(-1)) # [1, seq] + # Answer: sum the likelihood of each token in continuation + answer[i] = float(logits_gather.detach().cpu().sum()) + return answer + + def get_mink_percent(self, inputs: List[str], k: int = 20) -> List[float]: + """https://swj0419.github.io/detect-pretrain.github.io/""" + + if self.batch_padding and len(inputs) > 1: + assert self.tokenizer.pad_token + return self._get_mink_percent(inputs, k=k) + else: + return np.concatenate([ + self._get_mink_percent(inputs=[text], k=k) for text in inputs + ]) + + def _get_mink_percent(self, inputs: List[str], k: int = 20) -> List[float]: + outputs, inputs = self.get_logits(inputs) + shift_logits = outputs[:, :-1, :].contiguous().float() + shift_labels = inputs['tokens']['input_ids'][:, 1:].contiguous() + + loss_fct = torch.nn.CrossEntropyLoss( + reduction='none', ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)).view(shift_labels.size()) + lens = (inputs['tokens']['input_ids'] != + self.tokenizer.pad_token_id).sum(-1).cpu().numpy() + mink_percent = [] + for nloss, nlen in zip(loss, lens): + nlen = int(nlen) + minklen = max(nlen * k // 100, 1) + nloss = torch.topk(loss[-nlen:], minklen, dim=-1)[0] + nloss = -nloss.float().mean().cpu().detach().numpy() + mink_percent.append(nloss) + return np.array(mink_percent) + + def 
get_token_len(self, prompt: str) -> int: + """Get lengths of the tokenized strings. + + Args: + prompt (str): Input string. + + Returns: + int: Length of the input tokens + """ + return len(self.tokenizer.encode(prompt)) + diff --git a/ais_bench/benchmark/models/mindformers_model_v1.py b/ais_bench/benchmark/models/mindformers_model_v1.py new file mode 100644 index 0000000000..6d0abd1480 --- /dev/null +++ b/ais_bench/benchmark/models/mindformers_model_v1.py @@ -0,0 +1,544 @@ +import os, sys +from typing import Dict, List, Optional, Union + +import numpy as np +import torch +import transformers + +from ais_bench.benchmark.models.base import BaseModel +from ais_bench.benchmark.models.base_api import APITemplateParser +from ais_bench.benchmark.registry import MODELS +from ais_bench.benchmark.utils.logging import get_logger +from ais_bench.benchmark.utils.prompt import PromptList + +from mindformers import MindFormerConfig, build_context +from mindformers.models import build_network +from mindformers.utils.load_checkpoint_utils import load_checkpoint_with_safetensors, get_load_path_after_hf_convert +from mindspore import Tensor +PromptType = Union[PromptList, str, dict] + + +class MultiTokenEOSCriteria(transformers.StoppingCriteria): + """Criteria to stop on the specified multi-token sequence.""" + + def __init__( + self, + sequence: str, + tokenizer: transformers.PreTrainedTokenizer, + batch_size: int, + ): + self.done_tracker = [False] * batch_size + self.sequence = sequence + self.sequence_ids = tokenizer.encode(sequence, + add_special_tokens=False) + self.sequence_id_len = len(self.sequence_ids) + self.tokenizer = tokenizer + + def __call__(self, input_ids, scores, **kwargs) -> bool: + # compare the last len(stop) tokens + lookback_ids_batch = input_ids[:, -self.sequence_id_len:] + lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) + for i, done in enumerate(self.done_tracker): + if done: + continue + self.done_tracker[i] = self.sequence in lookback_tokens_batch[i] + return False not in self.done_tracker + + +def drop_error_generation_kwargs(generation_kwargs: dict) -> dict: + for key in ['is_synthetic', 'batch_size', 'do_performance']: + if key in generation_kwargs: + generation_kwargs.pop(key) + return generation_kwargs + + +@MODELS.register_module() +class MindFormerModel_V1(BaseModel): + + def __init__(self, + path: str, + checkpoint: Optional[str] = None, + yaml_cfg_file: Optional[str] = None, + hf_cache_dir: Optional[str] = None, + max_seq_len: int = 2048, + tokenizer_path: Optional[str] = None, + tokenizer_kwargs: dict = dict(), + peft_path: Optional[str] = None, + tokenizer_only: bool = False, + model_kwargs: dict = dict(device_map='auto'), + generation_kwargs: dict = dict(), + meta_template: Optional[Dict] = None, + extract_pred_after_decode: bool = False, + batch_padding: bool = False, + pad_token_id: Optional[int] = None, + mode: str = 'none', + use_fastchat_template: bool = False, + end_str: Optional[str] = None): + super().__init__(path=path, + max_seq_len=max_seq_len, + tokenizer_only=tokenizer_only, + meta_template=meta_template) + if hf_cache_dir is None: + hf_cache_dir = os.getenv('HF_MODEL_HUB', None) + self.logger = get_logger() + self.pad_token_id = pad_token_id + self.pretrained_model_path = path + assert mode in ['none', 'mid'] + self.mode = mode + self.config = MindFormerConfig(yaml_cfg_file) + self.checkpoint = checkpoint + self._load_tokenizer(path=path, + tokenizer_path=tokenizer_path, + tokenizer_kwargs=tokenizer_kwargs) + self.batch_padding = 
batch_padding + self.extract_pred_after_decode = extract_pred_after_decode + if not tokenizer_only: + self._load_model() + self.generation_kwargs = generation_kwargs + self.use_fastchat_template = use_fastchat_template + self.end_str = end_str + + def _load_tokenizer(self, path: str, tokenizer_path: Optional[str], + tokenizer_kwargs: dict): + from transformers import AutoTokenizer, GenerationConfig + + DEFAULT_TOKENIZER_KWARGS = dict(padding_side='left', truncation_side='left', trust_remote_code=True) + kwargs = DEFAULT_TOKENIZER_KWARGS + kwargs.update(tokenizer_kwargs) + + load_path = tokenizer_path if tokenizer_path else path + self.tokenizer = AutoTokenizer.from_pretrained(load_path, **kwargs) + + pad_token_id = self.pad_token_id + + # A patch for some models without pad_token_id + if pad_token_id is not None: + if self.tokenizer.pad_token_id is None: + self.logger.debug(f'Using {pad_token_id} as pad_token_id') + elif self.tokenizer.pad_token_id != pad_token_id: + self.logger.warning(f'pad_token_id is not consistent. Using {pad_token_id} as pad_token_id') + self.tokenizer.pad_token_id = pad_token_id + return + if self.tokenizer.pad_token_id is not None: + return + self.logger.warning('pad_token_id is not set for the tokenizer.') + + try: + generation_config = GenerationConfig.from_pretrained(path) + except: + generation_config = None + + if generation_config and generation_config.pad_token_id is not None: + self.logger.warning(f'Using {generation_config.pad_token_id} as pad_token_id.') + self.tokenizer.pad_token_id = generation_config.pad_token_id + return + if self.tokenizer.eos_token_id is not None: + self.logger.warning(f'Using eos_token_id {self.tokenizer.eos_token_id} as pad_token_id.') + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + return + raise ValueError('pad_token_id is not set for this tokenizer. Please set `pad_token_id={PAD_TOKEN_ID}` in model_cfg.') + + def _set_config_from_yaml(self): + self.config.load_checkpoint = self.checkpoint + self.config.model.pretrained_model_dir = self.pretrained_model_path + self.config.model.model_config.seq_length = self.max_seq_len + build_context(self.config) + + def _load_model(self): + + self._set_config_from_yaml() + try: + self.model = build_network(self.config.model) + self.logger.info("..........Network Built Successfully..........") + self.model.set_train(False) + print(self.config.load_checkpoint) + self.config.load_checkpoint = get_load_path_after_hf_convert(self.config, self.model) + print("after:", self.config.load_checkpoint) + load_checkpoint_with_safetensors(self.config, None, self.model, None) + self.logger.info("..........Checkpoint Load Successfully..........") + except ValueError : + raise ValueError('Failed to load MindFormers model, please check configuration') + + + def generate(self, + inputs: List[str], + max_out_len: int, + min_out_len: Optional[int] = None, + stopping_criteria: List[str] = [], + **kwargs) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + max_out_len (int): The maximum length of the output. + min_out_len (Optional[int]): The minimum length of the output. + + Returns: + List[str]: A list of generated strings. 
+ """ + generation_kwargs = kwargs.copy() + generation_kwargs.update(self.generation_kwargs) + + messages = list(inputs) + batch_size = len(messages) + prompt_char_lens = None + + if self.extract_pred_after_decode: + prompt_char_lens = [len(text) for text in messages] + + if self.use_fastchat_template: + try: + from fastchat.model import get_conversation_template + except ModuleNotFoundError: + raise ModuleNotFoundError( + 'Fastchat is not implemented. You can use ' + "'pip install \"fschat[model_worker,webui]\"' " + 'to implement fastchat.') + for idx, text in enumerate(messages): + conv = get_conversation_template('vicuna') + conv.append_message(conv.roles[0], text) + conv.append_message(conv.roles[1], None) + messages[idx] = conv.get_prompt() + + run_mode = kwargs.get('run_mode') + padding = True + if run_mode == 'predict': + padding = False + + if self.mode == 'mid': + assert len(messages) == 1 + tokens = self.tokenizer(messages, padding=False, truncation=False, return_tensors='np') + input_ids = tokens['input_ids'] + if input_ids.shape[-1] > self.max_seq_len: + input_ids = np.concatenate([input_ids[:, : self.max_seq_len // 2], input_ids[:, - self.max_seq_len // 2:]], axis=-1) + tokens = {'input_ids': input_ids} + else: + tokenize_kwargs = dict( + padding=padding, + truncation=True, + max_length=self.max_seq_len, + return_tensors='np' if padding else None + ) + tokens = self.tokenizer(messages, **tokenize_kwargs) + + input_ids = tokens['input_ids'] + if padding: + attention_mask = tokens.get('attention_mask') + prompt_token_lens = ( + attention_mask.sum(axis=1).astype(int).tolist() + if attention_mask is not None else + [input_ids.shape[1]] * batch_size + ) + else: + prompt_token_lens = [len(ids) for ids in input_ids] + + origin_stopping_criteria = list(stopping_criteria) + + # Check for ragged input + is_ragged = False + if not padding and len(input_ids) > 0: + first_len = len(input_ids[0]) + if any(len(ids) != first_len for ids in input_ids): + is_ragged = True + + if is_ragged: + sequences = [] + for seq_ids in input_ids: + input_ids_tensor = Tensor(np.array([seq_ids])) + single_gen_kwargs = generation_kwargs.copy() + if min_out_len is not None: + single_gen_kwargs['min_new_tokens'] = min_out_len + single_gen_kwargs['max_new_tokens'] = max_out_len + single_gen_kwargs.setdefault('top_k', 1) + single_gen_kwargs.setdefault('return_dict_in_generate', False) + + if stopping_criteria: + current_stopping_criteria = list(stopping_criteria) + if self.tokenizer.eos_token is not None: + current_stopping_criteria.append(self.tokenizer.eos_token) + + stopping_list = transformers.StoppingCriteriaList([ + *[ + MultiTokenEOSCriteria(sequence, self.tokenizer, 1) + for sequence in current_stopping_criteria + ], + ]) + single_gen_kwargs['stopping_criteria'] = stopping_list + + single_gen_kwargs = drop_error_generation_kwargs(single_gen_kwargs) + output = self.model.generate(input_ids=input_ids_tensor, **single_gen_kwargs) + if isinstance(output, dict): + output = output.get('sequences', output) + sequences.append(output[0].tolist()) + else: + input_ids_tensor = Tensor(input_ids) + + if min_out_len is not None: + generation_kwargs['min_new_tokens'] = min_out_len + generation_kwargs['max_new_tokens'] = max_out_len + generation_kwargs.setdefault('top_k', 1) + generation_kwargs.setdefault('return_dict_in_generate', False) + + if stopping_criteria: + current_stopping_criteria = list(stopping_criteria) + if self.tokenizer.eos_token is not None: + current_stopping_criteria.append(self.tokenizer.eos_token) + 
+ stopping_list = transformers.StoppingCriteriaList([ + *[ + MultiTokenEOSCriteria(sequence, self.tokenizer, + input_ids_tensor.shape[0]) + for sequence in current_stopping_criteria + ], + ]) + generation_kwargs['stopping_criteria'] = stopping_list + + generation_kwargs = drop_error_generation_kwargs(generation_kwargs) + + outputs = self.model.generate(input_ids=input_ids_tensor, + **generation_kwargs) + + if isinstance(outputs, dict): + outputs = outputs.get('sequences', outputs) + + sequences = [seq.tolist() for seq in outputs] + + if not self.extract_pred_after_decode: + sequences = [ + seq[prompt_len:] + for seq, prompt_len in zip(sequences, prompt_token_lens) + ] + + decodeds = [ + self.tokenizer.decode(seq, skip_special_tokens=True) + for seq in sequences + ] + + if self.extract_pred_after_decode and prompt_char_lens is not None: + decodeds = [ + text[length:] + for text, length in zip(decodeds, prompt_char_lens) + ] + + if self.end_str: + decodeds = [text.split(self.end_str)[0] for text in decodeds] + if origin_stopping_criteria: + for token in origin_stopping_criteria: + decodeds = [text.split(token)[0] for text in decodeds] + return decodeds + + def get_logits(self, inputs: List[str]): + + if self.batch_padding and len(inputs) > 1: + # batch inference + tokens = self.tokenizer(inputs, + padding=True, + truncation=True, + max_length=self.max_seq_len) + + tokens = { + k: Tensor(np.array(tokens[k])) + for k in tokens if k in ['input_ids', 'attention_mask'] + } + outputs = self.model(**tokens) + + else: + input_ids = self.tokenizer( + inputs, + padding=False, + truncation=True, + max_length=self.max_seq_len)['input_ids'] + input_ids = Tensor(input_ids) + tokens = {'input_ids': input_ids} + + outputs = self.model(input_ids) + return outputs[0], {'tokens': tokens} + + def get_ppl(self, + inputs: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + """Get perplexity scores given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + mask_length (Optional[List[int]]): A list of mask lengths. If + provided, the perplexity scores will be calculated with the + first mask_length[i] tokens masked out. It's okay to skip + its implementation if advanced features in PPLInfernecer is + not needed. + + Returns: + List[float]: A list of perplexity scores. + """ + + if self.batch_padding and len(inputs) > 1: + assert self.tokenizer.pad_token + return self._get_ppl(inputs, mask_length=mask_length) + else: + return np.concatenate([ + self._get_ppl(inputs=[text], mask_length=mask_length) + for text in inputs + ]) + + def _get_ppl(self, + inputs: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + """Get perplexity scores given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + mask_length (Optional[List[int]]): A list of mask lengths. If + provided, the perplexity scores will be calculated with the + first mask_length[i] tokens masked out. It's okay to skip + its implementation if advanced features in PPLInfernecer is + not needed. + + Returns: + List[float]: A list of perplexity scores. 
+ """ + + outputs, inputs = self.get_logits(inputs) + shift_logits = outputs[..., :-1, :].contiguous().float() + + shift_labels = inputs['tokens']['input_ids'][..., 1:].contiguous() + + loss_fct = torch.nn.CrossEntropyLoss( + reduction='none', ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)).view(shift_labels.size()) + + if mask_length is not None: + mask = torch.zeros_like(shift_labels) # [batch,seqlen] + for i in range(len(mask)): + for j in range(mask_length[i] - 1, len(mask[i])): + mask[i][j] = 1 + loss = loss * mask + + lens = (inputs['tokens']['input_ids'] != + self.tokenizer.pad_token_id).sum(-1).cpu().numpy() + if mask_length is not None: + lens -= np.array(mask_length) + ce_loss = loss.float().sum(-1).cpu().detach().numpy() / lens + return ce_loss + + def get_loglikelihood( + self, + inputs: List[str], + conts: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + """Get loglikelihood scores given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + conts (List[str]): A list of strings: slices after the space. + NOT SUPPORT mask_length YET! + mask_length (Optional[List[int]]): A list of mask lengths. If + provided, the perplexity scores will be calculated with the + first mask_length[i] tokens masked out. It's okay to skip + its implementation if advanced features in PPLInfernecer is + not needed. + + Returns: + List[float]: A list of loglikelihood scores. + """ + assert mask_length is None, 'Not support mask_length yet.' + if self.batch_padding and len(inputs) > 1: + assert self.tokenizer.pad_token + return self._get_loglikelihood(inputs, conts) + else: + return np.concatenate([ + self._get_loglikelihood(inputs=[inputs[idx]], + conts=[conts[idx]]) + for idx in range(len(inputs)) + ]) + + def _get_loglikelihood(self, inputs: str, conts: str) -> float: + """Get loglikelihood scores given input string and continuation string. + + Args: + inputs (str): string. + conts (str): strings: slices after the space. + Returns: + float: loglikelihood scores. 
+ """ + input_tokenizer_out = self.tokenizer(inputs, + padding=True, + truncation=False, + return_length=True, + return_tensors='pt').to( + self.model.device) + + input_ids = input_tokenizer_out['input_ids'][:, :self.max_seq_len] + input_length = input_tokenizer_out['length'] + context_ids = [ + self.tokenizer(inputs[i].replace(conts[i], ''), + padding=False, + truncation=True, + max_length=self.max_seq_len)['input_ids'] + for i in range(len(inputs)) + ] + # forward + outputs = self.model(input_ids)['logits'] + outputs = torch.nn.functional.log_softmax(outputs, dim=-1) + # calculate loglikelihood + answer = np.zeros(len(inputs)) + for i in range(len(inputs)): + if self.tokenizer.padding_side == 'right': + cont_ids = input_ids[i, len(context_ids[i]):input_length[i]] + logits = outputs[i, + len(context_ids[i]) - 1:input_length[i] - + 1, :] # noqa + else: + cont_ids = input_ids[i, len(context_ids[i]) - input_length[i]:] + logits = outputs[i, + len(context_ids[i]) - input_length[i] - 1:-1] + # Reducing the dimension will lead to a wrong outcome + logits_gather = torch.gather( + logits.unsqueeze(0), 2, + cont_ids.unsqueeze(0).unsqueeze(-1)) # [1, seq] + # Answer: sum the likelihood of each token in continuation + answer[i] = float(logits_gather.detach().cpu().sum()) + return answer + + def get_mink_percent(self, inputs: List[str], k: int = 20) -> List[float]: + """https://swj0419.github.io/detect-pretrain.github.io/""" + + if self.batch_padding and len(inputs) > 1: + assert self.tokenizer.pad_token + return self._get_mink_percent(inputs, k=k) + else: + return np.concatenate([ + self._get_mink_percent(inputs=[text], k=k) for text in inputs + ]) + + def _get_mink_percent(self, inputs: List[str], k: int = 20) -> List[float]: + outputs, inputs = self.get_logits(inputs) + shift_logits = outputs[:, :-1, :].contiguous().float() + shift_labels = inputs['tokens']['input_ids'][:, 1:].contiguous() + + loss_fct = torch.nn.CrossEntropyLoss( + reduction='none', ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)).view(shift_labels.size()) + lens = (inputs['tokens']['input_ids'] != + self.tokenizer.pad_token_id).sum(-1).cpu().numpy() + mink_percent = [] + for nloss, nlen in zip(loss, lens): + nlen = int(nlen) + minklen = max(nlen * k // 100, 1) + nloss = torch.topk(loss[-nlen:], minklen, dim=-1)[0] + nloss = -nloss.float().mean().cpu().detach().numpy() + mink_percent.append(nloss) + return np.array(mink_percent) + + def get_token_len(self, prompt: str) -> int: + """Get lengths of the tokenized strings. + + Args: + prompt (str): Input string. 
+ + Returns: + int: Length of the input tokens + """ + return len(self.tokenizer.encode(prompt)) + -- Gitee From a323c292473fd9788c8a75368f1bd8d757032e14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AE=9C=E6=9D=B0?= Date: Tue, 13 Jan 2026 21:42:25 +0800 Subject: [PATCH 2/2] mf_model_v1 --- .../benchmark/models/mindformers_model_v1.py | 146 ++++++++---------- 1 file changed, 64 insertions(+), 82 deletions(-) diff --git a/ais_bench/benchmark/models/mindformers_model_v1.py b/ais_bench/benchmark/models/mindformers_model_v1.py index 6d0abd1480..2c963935aa 100644 --- a/ais_bench/benchmark/models/mindformers_model_v1.py +++ b/ais_bench/benchmark/models/mindformers_model_v1.py @@ -13,8 +13,12 @@ from ais_bench.benchmark.utils.prompt import PromptList from mindformers import MindFormerConfig, build_context from mindformers.models import build_network +from mindformers.core.parallel_config import build_parallel_config from mindformers.utils.load_checkpoint_utils import load_checkpoint_with_safetensors, get_load_path_after_hf_convert -from mindspore import Tensor +from mindformers.trainer.utils import transform_and_load_checkpoint +from mindformers.checkpoint.checkpoint import load_checkpoint +from mindformers.checkpoint.utils import compile_model +from mindspore import Tensor, Model PromptType = Union[PromptList, str, dict] @@ -143,18 +147,40 @@ class MindFormerModel_V1(BaseModel): self.config.model.pretrained_model_dir = self.pretrained_model_path self.config.model.model_config.seq_length = self.max_seq_len build_context(self.config) + build_parallel_config(self.config) def _load_model(self): self._set_config_from_yaml() try: - self.model = build_network(self.config.model) + self.model = build_network( + self.config.model, + default_args={ + "parallel_config": self.config.parallel_config, + "moe_config": self.config.moe_config + }) self.logger.info("..........Network Built Successfully..........") self.model.set_train(False) - print(self.config.load_checkpoint) self.config.load_checkpoint = get_load_path_after_hf_convert(self.config, self.model) - print("after:", self.config.load_checkpoint) - load_checkpoint_with_safetensors(self.config, None, self.model, None) + print("after convert checkpoint path : ",self.config.load_checkpoint) + run_mode = self.config.get("run_mode", None) + if run_mode == "predict": + self.model.load_weights(self.config.load_checkpoint) + else: + model = Model(self.model) + # input_ids = np.ones((self.max_seq_len,), dtype=np.int32) + # infer_data = self.model.prepare_inputs_for_predict_layout(input_ids) + input_ids = Tensor(np.ones((4, self.max_seq_len), dtype=np.int32)) + infer_data = self.model.prepare_inputs_for_predict_layout(input_ids) + # transform_and_load_checkpoint(self.config, model, self.model, infer_data, do_eval=True) + + compile_model(model,infer_data,mode=self.config.context.mode, sink_mode=self.config.runner_config.sink_mode, + epoch=self.config.runner_config.epoch, sink_size=self.config.runner_config.sink_sizel) + load_checkpoint(checkpoint=self.config.load_checkpoint, network=self.model, balance_load= self.config.balance_load) + # if not self.config.get('use_legacy_format', True): + # from mindspore.parallel.strategy import enable_save_stragtegy_online + # enable_save_stragtegy_online() + self.logger.info("..........Checkpoint Load Successfully..........") except ValueError : raise ValueError('Failed to load MindFormers model, please check configuration') @@ -199,12 +225,6 @@ class MindFormerModel_V1(BaseModel): conv.append_message(conv.roles[0], 
text) conv.append_message(conv.roles[1], None) messages[idx] = conv.get_prompt() - - run_mode = kwargs.get('run_mode') - padding = True - if run_mode == 'predict': - padding = False - if self.mode == 'mid': assert len(messages) == 1 tokens = self.tokenizer(messages, padding=False, truncation=False, return_tensors='np') @@ -214,15 +234,15 @@ class MindFormerModel_V1(BaseModel): tokens = {'input_ids': input_ids} else: tokenize_kwargs = dict( - padding=padding, + padding=True, truncation=True, max_length=self.max_seq_len, - return_tensors='np' if padding else None + return_tensors='np' ) tokens = self.tokenizer(messages, **tokenize_kwargs) input_ids = tokens['input_ids'] - if padding: + if len(messages) > 1: attention_mask = tokens.get('attention_mask') prompt_token_lens = ( attention_mask.sum(axis=1).astype(int).tolist() @@ -232,76 +252,38 @@ class MindFormerModel_V1(BaseModel): else: prompt_token_lens = [len(ids) for ids in input_ids] + input_ids_tensor = Tensor(input_ids) + + if min_out_len is not None: + generation_kwargs['min_new_tokens'] = min_out_len + generation_kwargs['max_new_tokens'] = max_out_len + generation_kwargs.setdefault('top_k', 1) + generation_kwargs.setdefault('return_dict_in_generate', False) + origin_stopping_criteria = list(stopping_criteria) + if stopping_criteria: + if self.tokenizer.eos_token is not None: + stopping_criteria = stopping_criteria + [ + self.tokenizer.eos_token + ] + stopping_list = transformers.StoppingCriteriaList([ + *[ + MultiTokenEOSCriteria(sequence, self.tokenizer, + input_ids_tensor.shape[0]) + for sequence in stopping_criteria + ], + ]) + generation_kwargs['stopping_criteria'] = stopping_list + + generation_kwargs = drop_error_generation_kwargs(generation_kwargs) - # Check for ragged input - is_ragged = False - if not padding and len(input_ids) > 0: - first_len = len(input_ids[0]) - if any(len(ids) != first_len for ids in input_ids): - is_ragged = True - - if is_ragged: - sequences = [] - for seq_ids in input_ids: - input_ids_tensor = Tensor(np.array([seq_ids])) - single_gen_kwargs = generation_kwargs.copy() - if min_out_len is not None: - single_gen_kwargs['min_new_tokens'] = min_out_len - single_gen_kwargs['max_new_tokens'] = max_out_len - single_gen_kwargs.setdefault('top_k', 1) - single_gen_kwargs.setdefault('return_dict_in_generate', False) - - if stopping_criteria: - current_stopping_criteria = list(stopping_criteria) - if self.tokenizer.eos_token is not None: - current_stopping_criteria.append(self.tokenizer.eos_token) - - stopping_list = transformers.StoppingCriteriaList([ - *[ - MultiTokenEOSCriteria(sequence, self.tokenizer, 1) - for sequence in current_stopping_criteria - ], - ]) - single_gen_kwargs['stopping_criteria'] = stopping_list - - single_gen_kwargs = drop_error_generation_kwargs(single_gen_kwargs) - output = self.model.generate(input_ids=input_ids_tensor, **single_gen_kwargs) - if isinstance(output, dict): - output = output.get('sequences', output) - sequences.append(output[0].tolist()) - else: - input_ids_tensor = Tensor(input_ids) - - if min_out_len is not None: - generation_kwargs['min_new_tokens'] = min_out_len - generation_kwargs['max_new_tokens'] = max_out_len - generation_kwargs.setdefault('top_k', 1) - generation_kwargs.setdefault('return_dict_in_generate', False) - - if stopping_criteria: - current_stopping_criteria = list(stopping_criteria) - if self.tokenizer.eos_token is not None: - current_stopping_criteria.append(self.tokenizer.eos_token) - - stopping_list = transformers.StoppingCriteriaList([ - *[ - 
MultiTokenEOSCriteria(sequence, self.tokenizer, - input_ids_tensor.shape[0]) - for sequence in current_stopping_criteria - ], - ]) - generation_kwargs['stopping_criteria'] = stopping_list - - generation_kwargs = drop_error_generation_kwargs(generation_kwargs) - - outputs = self.model.generate(input_ids=input_ids_tensor, - **generation_kwargs) - - if isinstance(outputs, dict): - outputs = outputs.get('sequences', outputs) - - sequences = [seq.tolist() for seq in outputs] + outputs = self.model.generate(input_ids=input_ids_tensor, + **generation_kwargs) + + if isinstance(outputs, dict): + outputs = outputs.get('sequences', outputs) + + sequences = [seq.tolist() for seq in outputs] if not self.extract_pred_after_decode: sequences = [ -- Gitee
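
A minimal, untested usage sketch of the MindFormerModel backend added in patch 1, for anyone trying it outside a benchmark config. The import path, constructor arguments, and generate() signature are taken from the patch itself; all paths are placeholders, and a working MindSpore/MindFormers environment with a valid YAML config and safetensors checkpoint is assumed.

from ais_bench.benchmark.models import MindFormerModel

# Build the backend from a MindFormers YAML config plus a checkpoint.
# Placeholder paths; replace with real locations.
model = MindFormerModel(
    path='/path/to/model_dir/',
    checkpoint='/path/to/checkpoint_path/',
    yaml_cfg_file='/path/to/your.yaml',
    tokenizer_path='/path/to/tokenizer_dir/',
    max_seq_len=2048,
    batch_padding=True,
    generation_kwargs=dict(do_sample=False),
)

# Generate up to 64 new tokens for a single prompt; generation_kwargs are
# merged into the call and forwarded to model.generate().
outputs = model.generate(inputs=['Briefly introduce MindSpore.'], max_out_len=64)
print(outputs[0])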