diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py
index 012962baceba78d436364a01e514ad6d9f0edf1d..23bfd719651ca43722488490704ecce2f31cab69 100644
--- a/vllm_mindspore/model_executor/layers/sampler.py
+++ b/vllm_mindspore/model_executor/layers/sampler.py
@@ -423,7 +423,7 @@ def _apply_top_k_top_p(
     logits_sort.masked_fill_(top_k_mask, -float("inf"))
 
     # Apply top-p.
-    probs_sort = logits_sort.softmax(dim=-1)
+    probs_sort = logits_sort.softmax(-1)
     probs_sum = probs_sort.cumsum(axis=-1)
     top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1)
     # at least one
@@ -508,6 +508,7 @@ def _random_sample(
     # Find the maximum n value of the prompt phase requests.
     sample_idx = 0
     results: SampleResultType = []
+    random_samples = random_samples.asnumpy()
     for seq_group in selected_seq_groups:
         if not seq_group.do_sample:
             results.append(([], []))
@@ -600,13 +601,6 @@ def _beam_search_sample(
     return results
 
 
-def exponential(x, lambd=1.0, *, generator=None):
-    if generator is not None:
-        raise ValueError("`generator` can not be supported.")
-    output = np.random.exponential(scale=lambd, size=x.shape)
-    return ms.Tensor(output).astype(x.dtype)
-
-
 # torch.multinomial forces a GPU<->CPU sync.
 # Therefore, we use an optimized implementation instead.
 # Note that we always sample with replacement.
@@ -621,18 +615,17 @@ def _multinomial(
         probs = probs.repeat_interleave(num_samples, dim=0)
     q = torch.empty_like(probs)
     if seq_groups is None:
-        q = exponential(q)
+        q.exponential_()
     else:
         sample_idx = 0
         for seq_group in seq_groups:
             seq_ids = seq_group.seq_ids
             stride = len(seq_ids) * num_samples
             assert seq_group.generator is not None
-            q[sample_idx : sample_idx + stride] = exponential(
-                q[sample_idx : sample_idx + stride]
-            )
+            q[sample_idx : sample_idx +
+              stride].exponential_(generator=seq_group.generator)
             sample_idx += stride
-    return probs.div(q).argmax(axis=1).view(-1, num_samples)
+    return probs.div_(q).argmax(dim=1).view(-1, num_samples)
 
 
 def _top_k_top_p_multinomial_with_flashinfer(
diff --git a/vllm_mindspore/model_executor/layers/utils.py b/vllm_mindspore/model_executor/layers/utils.py
index bbef8d9cb8a6f324981e3c28ef5220d6bbca9305..7db3489e1b01a289546606a28274ff6b55e8507a 100644
--- a/vllm_mindspore/model_executor/layers/utils.py
+++ b/vllm_mindspore/model_executor/layers/utils.py
@@ -58,13 +58,10 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
                                                    vocab_size, num_seqs)
     output_bin_counts, output_mask = get_token_bin_counts_and_mask(
         output_tokens_tensor, vocab_size, num_seqs)
-    # repetition_penalties = repetition_penalties.unsqueeze_(dim=1).repeat(
-    #     1, vocab_size)
     repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat(1, vocab_size)
-    logits[logits > 0] /= torch.where(prompt_mask | output_mask,
-                                      repetition_penalties, 1.0)[logits > 0]
-    logits[logits <= 0] *= torch.where(prompt_mask | output_mask,
-                                       repetition_penalties, 1.0)[logits <= 0]
+    mask = prompt_mask | output_mask
+    logits = torch.where(mask & (logits > 0), logits / repetition_penalties, logits)
+    logits = torch.where(mask & (logits <= 0), logits * repetition_penalties, logits)
     # We follow the definition in OpenAI API.
     # Refer to https://platform.openai.com/docs/api-reference/parameter-details
     logits -= frequency_penalties.unsqueeze(dim=1) * output_bin_counts
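
For context, a minimal standalone sketch of the sampling trick the _multinomial hunk switches to (plain PyTorch, not part of the patch; the name _multinomial_sketch and the lack of per-sequence generators are simplifications): dividing the probabilities by i.i.d. Exponential(1) noise drawn in place with Tensor.exponential_() and taking the argmax is equivalent to the Gumbel-max trick, so it draws from the categorical distribution without the GPU<->CPU sync that torch.multinomial forces.

import torch

def _multinomial_sketch(probs: torch.Tensor, num_samples: int) -> torch.Tensor:
    # probs: (batch, vocab) probabilities; sampling is with replacement.
    if num_samples > 1:
        # One row per requested sample.
        probs = probs.repeat_interleave(num_samples, dim=0)
    # In-place Exponential(1) noise; argmax(p / E) ~ Categorical(p),
    # since -log(E) is Gumbel(0, 1) distributed.
    q = torch.empty_like(probs).exponential_()
    return probs.div(q).argmax(dim=1).view(-1, num_samples)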