Deprecate best_of Sampling Parameter in anticipation of vLLM V1 (#13997)

Signed-off-by: vincent-4 <vincentzhongy+githubvincent4@gmail.com>
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Vincent
2025-03-05 15:22:43 -05:00
committed by GitHub
parent a32c8669ca
commit a4f1ee35d6
12 changed files with 16 additions and 88 deletions
-4
View File
@@ -617,7 +617,6 @@ def test_schedule_decode_blocks_to_copy_update():
num_gpu_blocks=16)
_, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
curr_loras = None
scheduler._allocate_and_set_running(seq_group)
@@ -686,7 +685,6 @@ def test_schedule_swapped_cannot_swap_in():
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
@@ -717,7 +715,6 @@ def test_infeasible_swap():
for i in range(2):
_, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
@@ -747,7 +744,6 @@ def test_schedule_swapped_blocks_to_copy():
curr_loras = None
_, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
+13 -14
View File
@@ -18,7 +18,6 @@ def create_dummy_prompt(
prompt_length: int = -1,
block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
best_of: int = 1,
prompt_tokens: Optional[list[int]] = None,
min_tokens: int = 0,
max_tokens: int = 16,
@@ -32,17 +31,19 @@ def create_dummy_prompt(
prompt_tokens = list(range(prompt_length))
prompt_str = " ".join([str(t) for t in prompt_tokens])
prompt = Sequence(int(request_id),
inputs=token_inputs(prompt_tokens, prompt=prompt_str),
block_size=block_size)
seq_group = SequenceGroup(request_id=request_id,
seqs=[prompt],
arrival_time=time.time(),
sampling_params=SamplingParams(
best_of=best_of,
max_tokens=max_tokens,
min_tokens=min_tokens),
lora_request=lora_request)
prompt = Sequence(
int(request_id),
inputs=token_inputs(prompt_tokens, prompt=prompt_str),
block_size=block_size,
)
seq_group = SequenceGroup(
request_id=request_id,
seqs=[prompt],
arrival_time=time.time(),
sampling_params=SamplingParams(max_tokens=max_tokens,
min_tokens=min_tokens),
lora_request=lora_request,
)
return prompt, seq_group
@@ -72,7 +73,6 @@ def create_dummy_prompt_encoder_decoder(
encoder_prompt_length: int,
block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
best_of: int = 1,
) -> tuple[Sequence, Sequence, SequenceGroup]:
if not block_size:
block_size = decoder_prompt_length
@@ -102,7 +102,6 @@ def create_dummy_prompt_encoder_decoder(
seq_group = SequenceGroup(request_id=request_id,
seqs=[decoder_prompt],
sampling_params=SamplingParams(best_of=best_of),
arrival_time=time.time(),
lora_request=lora_request,
encoder_seq=encoder_prompt)
@@ -25,14 +25,6 @@ def test_n_gt_1(model):
assert len(outputs[0].outputs) == 3
def test_best_of(model):
"""Raise a ValueError since best_of is deprecated."""
params = SamplingParams(n=2, best_of=3)
with pytest.raises(ValueError):
_ = model.generate(PROMPT, params)
def test_penalties(model):
"""Check that we do not get errors if applied."""