diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py
index 354fb021464af6510c765eb8bdf0397021a84e67..e282742024a05c4596b1d318362c4df03152439b 100644
--- a/vllm_mindspore/model_executor/layers/sampler.py
+++ b/vllm_mindspore/model_executor/layers/sampler.py
@@ -596,6 +596,17 @@ def _beam_search_sample(
         assert sample_idx == logprobs.size(0)
     return results
 
+def exponential(x, lambd=1.0, *, generator=None):
+    """NumPy-backed replacement for ``torch.Tensor.exponential_``.
+
+    Returns a new tensor with the shape and dtype of ``x``, filled with
+    samples from an exponential distribution with rate ``lambd``.
+    """
+    if generator is not None:
+        raise ValueError("`generator` can not be supported.")
+    # torch parameterizes by rate ``lambd``; numpy by scale = 1 / rate.
+    output = np.random.exponential(scale=1.0 / lambd, size=x.shape)
+    return ms.Tensor(output).astype(x.dtype)
 
 # torch.multinomial forces a GPU<->CPU sync.
 # Therefore, we use an optimized implementation instead.
@@ -611,15 +622,18 @@ def _multinomial(
         probs = probs.repeat_interleave(num_samples, dim=0)
     q = torch.empty_like(probs)
     if seq_groups is None:
-        q.exponential_()
+        q = exponential(q)
     else:
         sample_idx = 0
         for seq_group in seq_groups:
             seq_ids = seq_group.seq_ids
             stride = len(seq_ids) * num_samples
             assert seq_group.generator is not None
+            # NOTE(review): the asserted per-seq-group generator cannot be
+            # honored here; sampling falls back to the global numpy RNG.
             q[sample_idx : sample_idx + stride] = exponential(
                 q[sample_idx : sample_idx + stride]
             )
             sample_idx += stride
     return probs.div_(q).argmax(dim=1).view(-1, num_samples)
 