Merge branch 'main' of github.com:princeton-nlp/SimPO

2026-06-27 18:57:43 +08:00 · 2024-06-02 17:55:37 -04:00
parent 6cc70cec47 26c060cbfc
commit b982a84748
23 changed files with 443 additions and 0 deletions
@@ -0,0 +1,12 @@
+## Evaluation
+We provide details on the evaluation of the models in this directory. Specifically, we evaluate on AlpacaEval 2 and Arena-Hard. AlpacaEval 2 consists of 805 questions from 5 datasets, and MT-Bench covers 8 categories with
+80 questions. The most recently released Arena-Hard is an enhanced version of MT-Bench,
+incorporating 500 well-defined technical problem-solving queries. We report scores following each
+benchmark’s evaluation protocol. For AlpacaEval 2, we report both the raw win rate (WR) and the
+length-controlled win rate (LC). The LC metric is specifically designed to be robust against model verbosity. For Arena-Hard, we report the win rate (WR) against the baseline model. 
+
+### AlpacaEval 2
+We provide generation configurations for the released models in the `alpacaeval2/configs` directory, and the corresponding generation templates can be found in `alpacaeval2/templates`. To evaluate the models on AlpacaEval 2, please use the [`alpaca-eval`](https://github.com/tatsu-lab/alpaca_eval) package.
+
+### Arena-Hard
+We provide generation configurations for the released models in the `arenahard/configs` directory, and the corresponding generation templates can be found in `arenahard/templates`. To evaluate the models on Arena-Hard, please use the [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto) package.
@@ -0,0 +1,16 @@
+Llama-3-Base-8B-SFT-SimPO:
+  pretty_name: Llama-3-Base-8B-SFT-SimPO
+  prompt_template: templates/llama3.txt
+  fn_completions: vllm_local_completions
+  completions_kwargs:
+    model_name: princeton-nlp/Llama-3-Base-8B-SFT-SimPO
+    model_kwargs:
+      torch_dtype: bfloat16
+    max_new_tokens: 4096
+    do_sample: true
+    temperature: 0.9
+    top_p: 1.0
+    batch_size: 900
+    stop_token_ids:  # NOTE(review): presumably the Llama-3 end-of-text / end-of-turn ids; confirm against the tokenizer
+      - 128001
+      - 128009
@@ -0,0 +1,16 @@
+Llama-3-Instruct-8B-SimPO:
+  pretty_name: Llama-3-Instruct-8B-SimPO
+  prompt_template: templates/llama3.txt
+  fn_completions: vllm_local_completions
+  completions_kwargs:
+    model_name: princeton-nlp/Llama-3-Instruct-8B-SimPO
+    model_kwargs:
+      torch_dtype: bfloat16
+    max_new_tokens: 4096
+    do_sample: true
+    temperature: 0.9
+    top_p: 1.0
+    batch_size: 900
+    stop_token_ids:  # NOTE(review): presumably the Llama-3 end-of-text / end-of-turn ids; confirm against the tokenizer
+      - 128001
+      - 128009
@@ -0,0 +1,13 @@
+Mistral-7B-Base-SFT-SimPO:
+  prompt_template: templates/mistral_base.txt
+  fn_completions: vllm_local_completions
+  completions_kwargs:
+    model_name: princeton-nlp/Mistral-7B-Base-SFT-SimPO
+    model_kwargs:
+      torch_dtype: bfloat16
+    max_new_tokens: 2048
+    temperature: 0.7
+    top_p: 1.0
+    do_sample: true  # canonical lowercase boolean; parses the same under YAML 1.1 and 1.2
+    batch_size: 900
+  pretty_name: Mistral-7B-Base-SFT-SimPO
@@ -0,0 +1,11 @@
+Mistral-7B-Instruct-SimPO:
+  completions_kwargs:
+    batch_size: 900
+    max_new_tokens: 2048
+    model_kwargs:
+      torch_dtype: bfloat16
+    model_name: princeton-nlp/Mistral-7B-Instruct-SimPO
+    temperature: 0.5
+  fn_completions: vllm_local_completions
+  pretty_name: Mistral-7B-Instruct-SimPO  # display name kept identical to the config key
+  prompt_template: templates/mistral_instruct.txt
@@ -0,0 +1,4 @@
+<|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
+{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
@@ -0,0 +1,5 @@
+<s> <|system|>
+</s>
+<|user|>
+{instruction}</s>
+<|assistant|>
@@ -0,0 +1 @@
+[INST] {instruction} [/INST]
@@ -0,0 +1,7 @@
+Llama-3-Base-8B-SFT-SimPO:
+  model_name: princeton-nlp/Llama-3-Base-8B-SFT-SimPO
+  endpoints:
+  - api_base: http://0.0.0.0:8742/v1
+    api_key: token-abc123  # NOTE(review): looks like a placeholder key for a local endpoint; confirm no real secret is committed
+  api_type: openai
+  parallel: 8
@@ -0,0 +1,9 @@
+bench_name: arena-hard-v0.1
+model_list:
+  - Llama-3-Base-8B-SFT-SimPO
+num_choices: 1
+max_tokens: 4096
+temperature: 0.0
+stop_token_ids:  # NOTE(review): presumably the Llama-3 end-of-text / end-of-turn ids; confirm against the tokenizer
+  - 128001
+  - 128009
@@ -0,0 +1,75 @@
+name: judgment config file for Arena Hard
+bench_name: arena-hard-v0.1
+judge_model: gpt-4-1106-preview
+reference: false
+ref_model: null
+baseline: true
+baseline_model: gpt-4-0314  # NOTE(review): presumably the model whose answers are the comparison baseline; confirm with arena-hard-auto
+pairwise: true
+temperature: 0
+max_tokens: 4096
+regex_pattern: \[\[([AB<>=]+)\]\]  # captures the text inside a double-bracketed verdict label such as [[A>B]] or [[A=B]]
+system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses
+  provided by two AI assistants to the user prompt displayed below. You will be given
+  assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s
+  answer is better.
+
+
+  Begin your evaluation by generating your own answer to the prompt. You must provide
+  your answers before judging any answers.
+
+
+  When evaluating the assistants'' answers, compare both assistants'' answers with
+  your answer. You must identify and correct any mistakes or inaccurate information.
+
+
+  Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful
+  means the answer correctly responds to the prompt or follows the instructions. Note
+  when user prompt has any ambiguity or more than one interpretation, it is more helpful
+  and appropriate to ask for clarifications or more information from the user than
+  providing an answer based on assumptions. Relevant means all parts of the response
+  closely connect or are appropriate to what is being asked. Concise means the response
+  is clear and not verbose or excessive.
+
+
+  Then consider the creativity and novelty of the assistant''s answers when needed.
+  Finally, identify any missing important information in the assistants'' answers
+  that would be beneficial to include when responding to the user prompt.
+
+
+  After providing your explanation, you must output only one of the following choices
+  as your final verdict with a label:
+
+
+  1. Assistant A is significantly better: [[A>>B]]
+
+  2. Assistant A is slightly better: [[A>B]]
+
+  3. Tie, relatively the same: [[A=B]]
+
+  4. Assistant B is slightly better: [[B>A]]
+
+  5. Assistant B is significantly better: [[B>>A]]
+
+
+  Example output: "My final verdict is tie: [[A=B]]".'
+prompt_template:
+- '<|User Prompt|>
+
+  {question_1}
+
+
+  <|The Start of Assistant A''s Answer|>
+
+  {answer_1}
+
+  <|The End of Assistant A''s Answer|>
+
+
+  <|The Start of Assistant B''s Answer|>
+
+  {answer_2}
+
+  <|The End of Assistant B''s Answer|>'
+model_list:
+- Llama-3-Base-8B-SFT-SimPO
@@ -0,0 +1,7 @@
+Llama-3-Instruct-8B-SimPO:
+  model_name: princeton-nlp/Llama-3-Instruct-8B-SimPO
+  endpoints:
+  - api_base: http://0.0.0.0:28103/v1
+    api_key: token-abc123  # NOTE(review): looks like a placeholder key for a local endpoint; confirm no real secret is committed
+  api_type: openai
+  parallel: 8
@@ -0,0 +1,9 @@
+bench_name: arena-hard-v0.1
+model_list:
+  - Llama-3-Instruct-8B-SimPO
+num_choices: 1
+max_tokens: 4096
+temperature: 0.0
+stop_token_ids:  # NOTE(review): presumably the Llama-3 end-of-text / end-of-turn ids; confirm against the tokenizer
+  - 128001
+  - 128009
@@ -0,0 +1,75 @@
+name: judgment config file for Arena Hard
+bench_name: arena-hard-v0.1
+judge_model: gpt-4-1106-preview
+reference: false
+ref_model: null
+baseline: true
+baseline_model: gpt-4-0314  # NOTE(review): presumably the model whose answers are the comparison baseline; confirm with arena-hard-auto
+pairwise: true
+temperature: 0
+max_tokens: 4096
+regex_pattern: \[\[([AB<>=]+)\]\]  # captures the text inside a double-bracketed verdict label such as [[A>B]] or [[A=B]]
+system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses
+  provided by two AI assistants to the user prompt displayed below. You will be given
+  assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s
+  answer is better.
+
+
+  Begin your evaluation by generating your own answer to the prompt. You must provide
+  your answers before judging any answers.
+
+
+  When evaluating the assistants'' answers, compare both assistants'' answers with
+  your answer. You must identify and correct any mistakes or inaccurate information.
+
+
+  Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful
+  means the answer correctly responds to the prompt or follows the instructions. Note
+  when user prompt has any ambiguity or more than one interpretation, it is more helpful
+  and appropriate to ask for clarifications or more information from the user than
+  providing an answer based on assumptions. Relevant means all parts of the response
+  closely connect or are appropriate to what is being asked. Concise means the response
+  is clear and not verbose or excessive.
+
+
+  Then consider the creativity and novelty of the assistant''s answers when needed.
+  Finally, identify any missing important information in the assistants'' answers
+  that would be beneficial to include when responding to the user prompt.
+
+
+  After providing your explanation, you must output only one of the following choices
+  as your final verdict with a label:
+
+
+  1. Assistant A is significantly better: [[A>>B]]
+
+  2. Assistant A is slightly better: [[A>B]]
+
+  3. Tie, relatively the same: [[A=B]]
+
+  4. Assistant B is slightly better: [[B>A]]
+
+  5. Assistant B is significantly better: [[B>>A]]
+
+
+  Example output: "My final verdict is tie: [[A=B]]".'
+prompt_template:
+- '<|User Prompt|>
+
+  {question_1}
+
+
+  <|The Start of Assistant A''s Answer|>
+
+  {answer_1}
+
+  <|The End of Assistant A''s Answer|>
+
+
+  <|The Start of Assistant B''s Answer|>
+
+  {answer_2}
+
+  <|The End of Assistant B''s Answer|>'
+model_list:
+- Llama-3-Instruct-8B-SimPO
@@ -0,0 +1,7 @@
+Mistral-7B-Base-SFT-SimPO:
+  model_name: princeton-nlp/Mistral-7B-Base-SFT-SimPO
+  endpoints:
+  - api_base: http://0.0.0.0:14940/v1
+    api_key: token-abc123  # NOTE(review): looks like a placeholder key for a local endpoint; confirm no real secret is committed
+  api_type: openai
+  parallel: 8
@@ -0,0 +1,8 @@
+bench_name: arena-hard-v0.1
+model_list:
+  - Mistral-7B-Base-SFT-SimPO
+num_choices: 1
+max_tokens: 4096
+temperature: 0.0
+stop_token_ids:  # NOTE(review): presumably the tokenizer's end-of-sequence id; confirm against the model
+  - 2
@@ -0,0 +1,75 @@
+name: judgment config file for Arena Hard
+bench_name: arena-hard-v0.1
+judge_model: gpt-4-1106-preview
+reference: false
+ref_model: null
+baseline: true
+baseline_model: gpt-4-0314  # NOTE(review): presumably the model whose answers are the comparison baseline; confirm with arena-hard-auto
+pairwise: true
+temperature: 0
+max_tokens: 4096
+regex_pattern: \[\[([AB<>=]+)\]\]  # captures the text inside a double-bracketed verdict label such as [[A>B]] or [[A=B]]
+system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses
+  provided by two AI assistants to the user prompt displayed below. You will be given
+  assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s
+  answer is better.
+
+
+  Begin your evaluation by generating your own answer to the prompt. You must provide
+  your answers before judging any answers.
+
+
+  When evaluating the assistants'' answers, compare both assistants'' answers with
+  your answer. You must identify and correct any mistakes or inaccurate information.
+
+
+  Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful
+  means the answer correctly responds to the prompt or follows the instructions. Note
+  when user prompt has any ambiguity or more than one interpretation, it is more helpful
+  and appropriate to ask for clarifications or more information from the user than
+  providing an answer based on assumptions. Relevant means all parts of the response
+  closely connect or are appropriate to what is being asked. Concise means the response
+  is clear and not verbose or excessive.
+
+
+  Then consider the creativity and novelty of the assistant''s answers when needed.
+  Finally, identify any missing important information in the assistants'' answers
+  that would be beneficial to include when responding to the user prompt.
+
+
+  After providing your explanation, you must output only one of the following choices
+  as your final verdict with a label:
+
+
+  1. Assistant A is significantly better: [[A>>B]]
+
+  2. Assistant A is slightly better: [[A>B]]
+
+  3. Tie, relatively the same: [[A=B]]
+
+  4. Assistant B is slightly better: [[B>A]]
+
+  5. Assistant B is significantly better: [[B>>A]]
+
+
+  Example output: "My final verdict is tie: [[A=B]]".'
+prompt_template:
+- '<|User Prompt|>
+
+  {question_1}
+
+
+  <|The Start of Assistant A''s Answer|>
+
+  {answer_1}
+
+  <|The End of Assistant A''s Answer|>
+
+
+  <|The Start of Assistant B''s Answer|>
+
+  {answer_2}
+
+  <|The End of Assistant B''s Answer|>'
+model_list:
+- Mistral-7B-Base-SFT-SimPO
@@ -0,0 +1,7 @@
+Mistral-7B-Instruct-SimPO:
+  model_name: princeton-nlp/Mistral-7B-Instruct-SimPO
+  endpoints:
+  - api_base: http://0.0.0.0:26627/v1
+    api_key: token-abc123  # NOTE(review): looks like a placeholder key for a local endpoint; confirm no real secret is committed
+  api_type: openai
+  parallel: 8
@@ -0,0 +1,8 @@
+bench_name: arena-hard-v0.1
+model_list:
+  - Mistral-7B-Instruct-SimPO
+num_choices: 1
+max_tokens: 4096
+temperature: 0.0
+stop_token_ids:  # NOTE(review): presumably the tokenizer's end-of-sequence id; confirm against the model
+  - 2
@@ -0,0 +1,75 @@
+name: judgment config file for Arena Hard
+bench_name: arena-hard-v0.1
+judge_model: gpt-4-1106-preview
+reference: false
+ref_model: null
+baseline: true
+baseline_model: gpt-4-0314  # NOTE(review): presumably the model whose answers are the comparison baseline; confirm with arena-hard-auto
+pairwise: true
+temperature: 0
+max_tokens: 4096
+regex_pattern: \[\[([AB<>=]+)\]\]  # captures the text inside a double-bracketed verdict label such as [[A>B]] or [[A=B]]
+system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses
+  provided by two AI assistants to the user prompt displayed below. You will be given
+  assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s
+  answer is better.
+
+
+  Begin your evaluation by generating your own answer to the prompt. You must provide
+  your answers before judging any answers.
+
+
+  When evaluating the assistants'' answers, compare both assistants'' answers with
+  your answer. You must identify and correct any mistakes or inaccurate information.
+
+
+  Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful
+  means the answer correctly responds to the prompt or follows the instructions. Note
+  when user prompt has any ambiguity or more than one interpretation, it is more helpful
+  and appropriate to ask for clarifications or more information from the user than
+  providing an answer based on assumptions. Relevant means all parts of the response
+  closely connect or are appropriate to what is being asked. Concise means the response
+  is clear and not verbose or excessive.
+
+
+  Then consider the creativity and novelty of the assistant''s answers when needed.
+  Finally, identify any missing important information in the assistants'' answers
+  that would be beneficial to include when responding to the user prompt.
+
+
+  After providing your explanation, you must output only one of the following choices
+  as your final verdict with a label:
+
+
+  1. Assistant A is significantly better: [[A>>B]]
+
+  2. Assistant A is slightly better: [[A>B]]
+
+  3. Tie, relatively the same: [[A=B]]
+
+  4. Assistant B is slightly better: [[B>A]]
+
+  5. Assistant B is significantly better: [[B>>A]]
+
+
+  Example output: "My final verdict is tie: [[A=B]]".'
+prompt_template:
+- '<|User Prompt|>
+
+  {question_1}
+
+
+  <|The Start of Assistant A''s Answer|>
+
+  {answer_1}
+
+  <|The End of Assistant A''s Answer|>
+
+
+  <|The Start of Assistant B''s Answer|>
+
+  {answer_2}
+
+  <|The End of Assistant B''s Answer|>'
+model_list:
+- Mistral-7B-Instruct-SimPO
@@ -0,0 +1 @@
+{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}
@@ -0,0 +1 @@
+{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}
@@ -0,0 +1 @@
+{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\n\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}
				`@@ -0,0 +1 @@`
				`{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<\|start_header_id\|>' + message['role'] + '<\|end_header_id\|>\n\n'+ message['content'] \| trim + '<\|eot_id\|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<\|start_header_id\|>assistant<\|end_header_id\|>\n\n' }}{% endif %}`
				`@@ -0,0 +1 @@`
				`{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<\|user\|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<\|system\|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<\|assistant\|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<\|assistant\|>' }}\n{% endif %}\n{% endfor %}`
				`@@ -0,0 +1 @@`
				{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\n\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}