mirror of
https://github.com/wassname/SimPO.git
synced 2026-06-27 18:57:43 +08:00
Merge branch 'main' of github.com:princeton-nlp/SimPO
This commit is contained in:
@@ -0,0 +1,12 @@
|
||||
## Evaluation
|
||||
We provide details on the evaluation of the models in this directory. Specifically, we evaluate on AlpacaEval 2 and Arena-Hard. AlpacaEval 2 consists of 805 questions from 5 datasets, and MT-Bench covers 8 categories with
|
||||
80 questions. The most recently released Arena-Hard is an enhanced version of MT-Bench,
|
||||
incorporating 500 well-defined technical problem-solving queries. We report scores following each
|
||||
benchmark’s evaluation protocol. For AlpacaEval 2, we report both the raw win rate (WR) and the
|
||||
length-controlled win rate (LC). The LC metric is specifically designed to be robust against model verbosity. For Arena-Hard, we report the win rate (WR) against the baseline model.
|
||||
|
||||
### AlpacaEval 2
|
||||
We provide generation configurations for the released models in the `alpacaeval2/configs` directory, and the corresponding generation templates can be found in `alpacaeval2/templates`. To evaluate the models on AlpacaEval 2, please use the [`alpaca-eval`](https://github.com/tatsu-lab/alpaca_eval) package.
|
||||
|
||||
### Arena-Hard
|
||||
We provide generation configurations for the released models in the `arenahard/configs` directory, and the corresponding generation templates can be found in `arenahard/templates`. To evaluate the models on Arena-Hard, please use the [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto) package.
|
||||
@@ -0,0 +1,16 @@
|
||||
Llama-3-Base-8B-SFT-SimPO:
|
||||
completions_kwargs:
|
||||
batch_size: 900
|
||||
do_sample: true
|
||||
max_new_tokens: 4096
|
||||
model_kwargs:
|
||||
torch_dtype: bfloat16
|
||||
model_name: princeton-nlp/Llama-3-Base-8B-SFT-SimPO
|
||||
stop_token_ids:
|
||||
- 128001
|
||||
- 128009
|
||||
temperature: 0.9
|
||||
top_p: 1.0
|
||||
fn_completions: vllm_local_completions
|
||||
pretty_name: Llama-3-Base-8B-SFT-SimPO
|
||||
prompt_template: templates/llama3.txt
|
||||
@@ -0,0 +1,16 @@
|
||||
Llama-3-Instruct-8B-SimPO:
|
||||
completions_kwargs:
|
||||
batch_size: 900
|
||||
do_sample: true
|
||||
max_new_tokens: 4096
|
||||
model_kwargs:
|
||||
torch_dtype: bfloat16
|
||||
model_name: princeton-nlp/Llama-3-Instruct-8B-SimPO
|
||||
stop_token_ids:
|
||||
- 128001
|
||||
- 128009
|
||||
temperature: 0.9
|
||||
top_p: 1.0
|
||||
fn_completions: vllm_local_completions
|
||||
pretty_name: Llama-3-Instruct-8B-SimPO
|
||||
prompt_template: templates/llama3.txt
|
||||
@@ -0,0 +1,13 @@
|
||||
Mistral-7B-Base-SFT-SimPO:
|
||||
prompt_template: templates/mistral_base.txt
|
||||
fn_completions: vllm_local_completions
|
||||
completions_kwargs:
|
||||
model_name: princeton-nlp/Mistral-7B-Base-SFT-SimPO
|
||||
model_kwargs:
|
||||
torch_dtype: 'bfloat16'
|
||||
max_new_tokens: 2048
|
||||
temperature: 0.7
|
||||
top_p: 1.0
|
||||
do_sample: True
|
||||
batch_size: 900
|
||||
pretty_name: Mistral-7B-Base-SFT-SimPO
|
||||
@@ -0,0 +1,11 @@
|
||||
Mistral-7B-Instruct-SimPO:
|
||||
completions_kwargs:
|
||||
batch_size: 900
|
||||
max_new_tokens: 2048
|
||||
model_kwargs:
|
||||
torch_dtype: bfloat16
|
||||
model_name: princeton-nlp/Mistral-7B-Instruct-SimPO
|
||||
temperature: 0.5
|
||||
fn_completions: vllm_local_completions
|
||||
pretty_name: Mistral-7B-Instruct-SimPO
|
||||
prompt_template: templates/mistral_instruct.txt
|
||||
@@ -0,0 +1,4 @@
|
||||
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
|
||||
|
||||
{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
<s> <|system|>
|
||||
</s>
|
||||
<|user|>
|
||||
{instruction}</s>
|
||||
<|assistant|>
|
||||
@@ -0,0 +1 @@
|
||||
[INST] {instruction} [/INST]
|
||||
@@ -0,0 +1,7 @@
|
||||
Llama-3-Base-8B-SFT-SimPO:
|
||||
model_name: princeton-nlp/Llama-3-Base-8B-SFT-SimPO
|
||||
endpoints:
|
||||
- api_base: http://0.0.0.0:8742/v1
|
||||
api_key: token-abc123
|
||||
api_type: openai
|
||||
parallel: 8
|
||||
@@ -0,0 +1,9 @@
|
||||
bench_name: arena-hard-v0.1
|
||||
temperature: 0.0
|
||||
max_tokens: 4096
|
||||
num_choices: 1
|
||||
stop_token_ids:
|
||||
- 128001
|
||||
- 128009
|
||||
model_list:
|
||||
- Llama-3-Base-8B-SFT-SimPO
|
||||
@@ -0,0 +1,75 @@
|
||||
name: judgment config file for Arena Hard
|
||||
bench_name: arena-hard-v0.1
|
||||
judge_model: gpt-4-1106-preview
|
||||
reference: false
|
||||
ref_model: null
|
||||
baseline: true
|
||||
baseline_model: gpt-4-0314
|
||||
pairwise: true
|
||||
temperature: 0
|
||||
max_tokens: 4096
|
||||
regex_pattern: \[\[([AB<>=]+)\]\]
|
||||
system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses
|
||||
provided by two AI assistants to the user prompt displayed below. You will be given
|
||||
assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s
|
||||
answer is better.
|
||||
|
||||
|
||||
Begin your evaluation by generating your own answer to the prompt. You must provide
|
||||
your answers before judging any answers.
|
||||
|
||||
|
||||
When evaluating the assistants'' answers, compare both assistants'' answers with
|
||||
your answer. You must identify and correct any mistakes or inaccurate information.
|
||||
|
||||
|
||||
Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful
|
||||
means the answer correctly responds to the prompt or follows the instructions. Note
|
||||
when user prompt has any ambiguity or more than one interpretation, it is more helpful
|
||||
and appropriate to ask for clarifications or more information from the user than
|
||||
providing an answer based on assumptions. Relevant means all parts of the response
|
||||
closely connect or are appropriate to what is being asked. Concise means the response
|
||||
is clear and not verbose or excessive.
|
||||
|
||||
|
||||
Then consider the creativity and novelty of the assistant''s answers when needed.
|
||||
Finally, identify any missing important information in the assistants'' answers
|
||||
that would be beneficial to include when responding to the user prompt.
|
||||
|
||||
|
||||
After providing your explanation, you must output only one of the following choices
|
||||
as your final verdict with a label:
|
||||
|
||||
|
||||
1. Assistant A is significantly better: [[A>>B]]
|
||||
|
||||
2. Assistant A is slightly better: [[A>B]]
|
||||
|
||||
3. Tie, relatively the same: [[A=B]]
|
||||
|
||||
4. Assistant B is slightly better: [[B>A]]
|
||||
|
||||
5. Assistant B is significantly better: [[B>>A]]
|
||||
|
||||
|
||||
Example output: "My final verdict is tie: [[A=B]]".'
|
||||
prompt_template:
|
||||
- '<|User Prompt|>
|
||||
|
||||
{question_1}
|
||||
|
||||
|
||||
<|The Start of Assistant A''s Answer|>
|
||||
|
||||
{answer_1}
|
||||
|
||||
<|The End of Assistant A''s Answer|>
|
||||
|
||||
|
||||
<|The Start of Assistant B''s Answer|>
|
||||
|
||||
{answer_2}
|
||||
|
||||
<|The End of Assistant B''s Answer|>'
|
||||
model_list:
|
||||
- Llama-3-Base-8B-SFT-SimPO
|
||||
@@ -0,0 +1,7 @@
|
||||
Llama-3-Instruct-8B-SimPO:
|
||||
model_name: princeton-nlp/Llama-3-Instruct-8B-SimPO
|
||||
endpoints:
|
||||
- api_base: http://0.0.0.0:28103/v1
|
||||
api_key: token-abc123
|
||||
api_type: openai
|
||||
parallel: 8
|
||||
@@ -0,0 +1,9 @@
|
||||
bench_name: arena-hard-v0.1
|
||||
temperature: 0.0
|
||||
max_tokens: 4096
|
||||
num_choices: 1
|
||||
stop_token_ids:
|
||||
- 128001
|
||||
- 128009
|
||||
model_list:
|
||||
- Llama-3-Instruct-8B-SimPO
|
||||
@@ -0,0 +1,75 @@
|
||||
name: judgment config file for Arena Hard
|
||||
bench_name: arena-hard-v0.1
|
||||
judge_model: gpt-4-1106-preview
|
||||
reference: false
|
||||
ref_model: null
|
||||
baseline: true
|
||||
baseline_model: gpt-4-0314
|
||||
pairwise: true
|
||||
temperature: 0
|
||||
max_tokens: 4096
|
||||
regex_pattern: \[\[([AB<>=]+)\]\]
|
||||
system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses
|
||||
provided by two AI assistants to the user prompt displayed below. You will be given
|
||||
assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s
|
||||
answer is better.
|
||||
|
||||
|
||||
Begin your evaluation by generating your own answer to the prompt. You must provide
|
||||
your answers before judging any answers.
|
||||
|
||||
|
||||
When evaluating the assistants'' answers, compare both assistants'' answers with
|
||||
your answer. You must identify and correct any mistakes or inaccurate information.
|
||||
|
||||
|
||||
Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful
|
||||
means the answer correctly responds to the prompt or follows the instructions. Note
|
||||
when user prompt has any ambiguity or more than one interpretation, it is more helpful
|
||||
and appropriate to ask for clarifications or more information from the user than
|
||||
providing an answer based on assumptions. Relevant means all parts of the response
|
||||
closely connect or are appropriate to what is being asked. Concise means the response
|
||||
is clear and not verbose or excessive.
|
||||
|
||||
|
||||
Then consider the creativity and novelty of the assistant''s answers when needed.
|
||||
Finally, identify any missing important information in the assistants'' answers
|
||||
that would be beneficial to include when responding to the user prompt.
|
||||
|
||||
|
||||
After providing your explanation, you must output only one of the following choices
|
||||
as your final verdict with a label:
|
||||
|
||||
|
||||
1. Assistant A is significantly better: [[A>>B]]
|
||||
|
||||
2. Assistant A is slightly better: [[A>B]]
|
||||
|
||||
3. Tie, relatively the same: [[A=B]]
|
||||
|
||||
4. Assistant B is slightly better: [[B>A]]
|
||||
|
||||
5. Assistant B is significantly better: [[B>>A]]
|
||||
|
||||
|
||||
Example output: "My final verdict is tie: [[A=B]]".'
|
||||
prompt_template:
|
||||
- '<|User Prompt|>
|
||||
|
||||
{question_1}
|
||||
|
||||
|
||||
<|The Start of Assistant A''s Answer|>
|
||||
|
||||
{answer_1}
|
||||
|
||||
<|The End of Assistant A''s Answer|>
|
||||
|
||||
|
||||
<|The Start of Assistant B''s Answer|>
|
||||
|
||||
{answer_2}
|
||||
|
||||
<|The End of Assistant B''s Answer|>'
|
||||
model_list:
|
||||
- Llama-3-Instruct-8B-SimPO
|
||||
@@ -0,0 +1,7 @@
|
||||
Mistral-7B-Base-SFT-SimPO:
|
||||
model_name: princeton-nlp/Mistral-7B-Base-SFT-SimPO
|
||||
endpoints:
|
||||
- api_base: http://0.0.0.0:14940/v1
|
||||
api_key: token-abc123
|
||||
api_type: openai
|
||||
parallel: 8
|
||||
@@ -0,0 +1,8 @@
|
||||
bench_name: arena-hard-v0.1
|
||||
temperature: 0.0
|
||||
max_tokens: 4096
|
||||
num_choices: 1
|
||||
stop_token_ids:
|
||||
- 2
|
||||
model_list:
|
||||
- Mistral-7B-Base-SFT-SimPO
|
||||
@@ -0,0 +1,75 @@
|
||||
name: judgment config file for Arena Hard
|
||||
bench_name: arena-hard-v0.1
|
||||
judge_model: gpt-4-1106-preview
|
||||
reference: false
|
||||
ref_model: null
|
||||
baseline: true
|
||||
baseline_model: gpt-4-0314
|
||||
pairwise: true
|
||||
temperature: 0
|
||||
max_tokens: 4096
|
||||
regex_pattern: \[\[([AB<>=]+)\]\]
|
||||
system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses
|
||||
provided by two AI assistants to the user prompt displayed below. You will be given
|
||||
assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s
|
||||
answer is better.
|
||||
|
||||
|
||||
Begin your evaluation by generating your own answer to the prompt. You must provide
|
||||
your answers before judging any answers.
|
||||
|
||||
|
||||
When evaluating the assistants'' answers, compare both assistants'' answers with
|
||||
your answer. You must identify and correct any mistakes or inaccurate information.
|
||||
|
||||
|
||||
Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful
|
||||
means the answer correctly responds to the prompt or follows the instructions. Note
|
||||
when user prompt has any ambiguity or more than one interpretation, it is more helpful
|
||||
and appropriate to ask for clarifications or more information from the user than
|
||||
providing an answer based on assumptions. Relevant means all parts of the response
|
||||
closely connect or are appropriate to what is being asked. Concise means the response
|
||||
is clear and not verbose or excessive.
|
||||
|
||||
|
||||
Then consider the creativity and novelty of the assistant''s answers when needed.
|
||||
Finally, identify any missing important information in the assistants'' answers
|
||||
that would be beneficial to include when responding to the user prompt.
|
||||
|
||||
|
||||
After providing your explanation, you must output only one of the following choices
|
||||
as your final verdict with a label:
|
||||
|
||||
|
||||
1. Assistant A is significantly better: [[A>>B]]
|
||||
|
||||
2. Assistant A is slightly better: [[A>B]]
|
||||
|
||||
3. Tie, relatively the same: [[A=B]]
|
||||
|
||||
4. Assistant B is slightly better: [[B>A]]
|
||||
|
||||
5. Assistant B is significantly better: [[B>>A]]
|
||||
|
||||
|
||||
Example output: "My final verdict is tie: [[A=B]]".'
|
||||
prompt_template:
|
||||
- '<|User Prompt|>
|
||||
|
||||
{question_1}
|
||||
|
||||
|
||||
<|The Start of Assistant A''s Answer|>
|
||||
|
||||
{answer_1}
|
||||
|
||||
<|The End of Assistant A''s Answer|>
|
||||
|
||||
|
||||
<|The Start of Assistant B''s Answer|>
|
||||
|
||||
{answer_2}
|
||||
|
||||
<|The End of Assistant B''s Answer|>'
|
||||
model_list:
|
||||
- Mistral-7B-Base-SFT-SimPO
|
||||
@@ -0,0 +1,7 @@
|
||||
Mistral-7B-Instruct-SimPO:
|
||||
model_name: princeton-nlp/Mistral-7B-Instruct-SimPO
|
||||
endpoints:
|
||||
- api_base: http://0.0.0.0:26627/v1
|
||||
api_key: token-abc123
|
||||
api_type: openai
|
||||
parallel: 8
|
||||
@@ -0,0 +1,8 @@
|
||||
bench_name: arena-hard-v0.1
|
||||
temperature: 0.0
|
||||
max_tokens: 4096
|
||||
num_choices: 1
|
||||
stop_token_ids:
|
||||
- 2
|
||||
model_list:
|
||||
- Mistral-7B-Instruct-SimPO
|
||||
@@ -0,0 +1,75 @@
|
||||
name: judgment config file for Arena Hard
|
||||
bench_name: arena-hard-v0.1
|
||||
judge_model: gpt-4-1106-preview
|
||||
reference: false
|
||||
ref_model: null
|
||||
baseline: true
|
||||
baseline_model: gpt-4-0314
|
||||
pairwise: true
|
||||
temperature: 0
|
||||
max_tokens: 4096
|
||||
regex_pattern: \[\[([AB<>=]+)\]\]
|
||||
system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses
|
||||
provided by two AI assistants to the user prompt displayed below. You will be given
|
||||
assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s
|
||||
answer is better.
|
||||
|
||||
|
||||
Begin your evaluation by generating your own answer to the prompt. You must provide
|
||||
your answers before judging any answers.
|
||||
|
||||
|
||||
When evaluating the assistants'' answers, compare both assistants'' answers with
|
||||
your answer. You must identify and correct any mistakes or inaccurate information.
|
||||
|
||||
|
||||
Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful
|
||||
means the answer correctly responds to the prompt or follows the instructions. Note
|
||||
when user prompt has any ambiguity or more than one interpretation, it is more helpful
|
||||
and appropriate to ask for clarifications or more information from the user than
|
||||
providing an answer based on assumptions. Relevant means all parts of the response
|
||||
closely connect or are appropriate to what is being asked. Concise means the response
|
||||
is clear and not verbose or excessive.
|
||||
|
||||
|
||||
Then consider the creativity and novelty of the assistant''s answers when needed.
|
||||
Finally, identify any missing important information in the assistants'' answers
|
||||
that would be beneficial to include when responding to the user prompt.
|
||||
|
||||
|
||||
After providing your explanation, you must output only one of the following choices
|
||||
as your final verdict with a label:
|
||||
|
||||
|
||||
1. Assistant A is significantly better: [[A>>B]]
|
||||
|
||||
2. Assistant A is slightly better: [[A>B]]
|
||||
|
||||
3. Tie, relatively the same: [[A=B]]
|
||||
|
||||
4. Assistant B is slightly better: [[B>A]]
|
||||
|
||||
5. Assistant B is significantly better: [[B>>A]]
|
||||
|
||||
|
||||
Example output: "My final verdict is tie: [[A=B]]".'
|
||||
prompt_template:
|
||||
- '<|User Prompt|>
|
||||
|
||||
{question_1}
|
||||
|
||||
|
||||
<|The Start of Assistant A''s Answer|>
|
||||
|
||||
{answer_1}
|
||||
|
||||
<|The End of Assistant A''s Answer|>
|
||||
|
||||
|
||||
<|The Start of Assistant B''s Answer|>
|
||||
|
||||
{answer_2}
|
||||
|
||||
<|The End of Assistant B''s Answer|>'
|
||||
model_list:
|
||||
- Mistral-7B-Instruct-SimPO
|
||||
@@ -0,0 +1 @@
|
||||
{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}
|
||||
@@ -0,0 +1 @@
|
||||
{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}
|
||||
@@ -0,0 +1 @@
|
||||
{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\n\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}
|
||||
Reference in New Issue
Block a user