diff --git a/eval/README.md b/eval/README.md new file mode 100644 index 0000000..f69cb25 --- /dev/null +++ b/eval/README.md @@ -0,0 +1,12 @@ +## Evaluation +We provide details on the evaluation of the models in this directory. Specifically, we evaluate on AlpacaEval 2 and Arena-Hard. AlpacaEval 2 consists of 805 questions from 5 datasets, and MT-Bench covers 8 categories with +80 questions. The most recently released Arena-Hard is an enhanced version of MT-Bench, +incorporating 500 well-defined technical problem-solving queries. We report scores following each +benchmark’s evaluation protocol. For AlpacaEval 2, we report both the raw win rate (WR) and the +length-controlled win rate (LC). The LC metric is specifically designed to be robust against model verbosity. For Arena-Hard, we report the win rate (WR) against the baseline model. + +### AlpacaEval 2 +We provide generation configurations for the released models in the `alpacaeval2/configs` directory, and the corresponding generation templates can be found in `alpacaeval2/templates`. To evaluate the models on AlpacaEval 2, please use the [`alpaca-eval`](https://github.com/tatsu-lab/alpaca_eval) package. + +### Arena-Hard +We provide generation configurations for the released models in the `arenahard/configs` directory, and the corresponding generation templates can be found in `arenahard/templates`. To evaluate the models on Arena-Hard, please use the [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto) package. 
diff --git a/eval/alpacaeval2/configs/Llama-3-Base-8B-SFT-SimPO.yaml b/eval/alpacaeval2/configs/Llama-3-Base-8B-SFT-SimPO.yaml new file mode 100644 index 0000000..137c226 --- /dev/null +++ b/eval/alpacaeval2/configs/Llama-3-Base-8B-SFT-SimPO.yaml @@ -0,0 +1,16 @@ +Llama-3-Base-8B-SFT-SimPO: + completions_kwargs: + batch_size: 900 + do_sample: true + max_new_tokens: 4096 + model_kwargs: + torch_dtype: bfloat16 + model_name: princeton-nlp/Llama-3-Base-8B-SFT-SimPO + stop_token_ids: + - 128001 + - 128009 + temperature: 0.9 + top_p: 1.0 + fn_completions: vllm_local_completions + pretty_name: Llama-3-Base-8B-SFT-SimPO + prompt_template: templates/llama3.txt diff --git a/eval/alpacaeval2/configs/Llama-3-Instruct-8B-SimPO.yaml b/eval/alpacaeval2/configs/Llama-3-Instruct-8B-SimPO.yaml new file mode 100644 index 0000000..866b522 --- /dev/null +++ b/eval/alpacaeval2/configs/Llama-3-Instruct-8B-SimPO.yaml @@ -0,0 +1,16 @@ +Llama-3-Instruct-8B-SimPO: + completions_kwargs: + batch_size: 900 + do_sample: true + max_new_tokens: 4096 + model_kwargs: + torch_dtype: bfloat16 + model_name: princeton-nlp/Llama-3-Instruct-8B-SimPO + stop_token_ids: + - 128001 + - 128009 + temperature: 0.9 + top_p: 1.0 + fn_completions: vllm_local_completions + pretty_name: Llama-3-Instruct-8B-SimPO + prompt_template: templates/llama3.txt diff --git a/eval/alpacaeval2/configs/Mistral-7B-Base-SFT-SimPO.yaml b/eval/alpacaeval2/configs/Mistral-7B-Base-SFT-SimPO.yaml new file mode 100644 index 0000000..0fe84d8 --- /dev/null +++ b/eval/alpacaeval2/configs/Mistral-7B-Base-SFT-SimPO.yaml @@ -0,0 +1,13 @@ +Mistral-7B-Base-SFT-SimPO: + prompt_template: templates/mistral_base.txt + fn_completions: vllm_local_completions + completions_kwargs: + model_name: princeton-nlp/Mistral-7B-Base-SFT-SimPO + model_kwargs: + torch_dtype: 'bfloat16' + max_new_tokens: 2048 + temperature: 0.7 + top_p: 1.0 + do_sample: True + batch_size: 900 + pretty_name: Mistral-7B-Base-SFT-SimPO \ No newline at end of file diff --git 
a/eval/alpacaeval2/configs/Mistral-7B-Instruct-SimPO.yaml b/eval/alpacaeval2/configs/Mistral-7B-Instruct-SimPO.yaml new file mode 100644 index 0000000..1b4554d --- /dev/null +++ b/eval/alpacaeval2/configs/Mistral-7B-Instruct-SimPO.yaml @@ -0,0 +1,11 @@ +Mistral-7B-Instruct-SimPO: + completions_kwargs: + batch_size: 900 + max_new_tokens: 2048 + model_kwargs: + torch_dtype: bfloat16 + model_name: princeton-nlp/Mistral-7B-Instruct-SimPO + temperature: 0.5 + fn_completions: vllm_local_completions + pretty_name: Mistral-7B-Instruct-SimPO + prompt_template: templates/mistral_instruct.txt \ No newline at end of file diff --git a/eval/alpacaeval2/templates/llama3.txt b/eval/alpacaeval2/templates/llama3.txt new file mode 100644 index 0000000..0007d44 --- /dev/null +++ b/eval/alpacaeval2/templates/llama3.txt @@ -0,0 +1,4 @@ +<|begin_of_text|><|start_header_id|>user<|end_header_id|> + +{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + diff --git a/eval/alpacaeval2/templates/mistral_base.txt b/eval/alpacaeval2/templates/mistral_base.txt new file mode 100644 index 0000000..9385e1d --- /dev/null +++ b/eval/alpacaeval2/templates/mistral_base.txt @@ -0,0 +1,5 @@ + <|system|> + +<|user|> +{instruction} +<|assistant|> diff --git a/eval/alpacaeval2/templates/mistral_instruct.txt b/eval/alpacaeval2/templates/mistral_instruct.txt new file mode 100644 index 0000000..3577e92 --- /dev/null +++ b/eval/alpacaeval2/templates/mistral_instruct.txt @@ -0,0 +1 @@ +[INST] {instruction} [/INST] \ No newline at end of file diff --git a/eval/arenahard/configs/Llama-3-Base-8B-SFT-SimPO/api_config.yaml b/eval/arenahard/configs/Llama-3-Base-8B-SFT-SimPO/api_config.yaml new file mode 100644 index 0000000..00487e8 --- /dev/null +++ b/eval/arenahard/configs/Llama-3-Base-8B-SFT-SimPO/api_config.yaml @@ -0,0 +1,7 @@ +Llama-3-Base-8B-SFT-SimPO: + model_name: princeton-nlp/Llama-3-Base-8B-SFT-SimPO + endpoints: + - api_base: http://0.0.0.0:8742/v1 + api_key: 
token-abc123 + api_type: openai + parallel: 8 diff --git a/eval/arenahard/configs/Llama-3-Base-8B-SFT-SimPO/gen_answer_config.yaml b/eval/arenahard/configs/Llama-3-Base-8B-SFT-SimPO/gen_answer_config.yaml new file mode 100644 index 0000000..18a8a8d --- /dev/null +++ b/eval/arenahard/configs/Llama-3-Base-8B-SFT-SimPO/gen_answer_config.yaml @@ -0,0 +1,9 @@ +bench_name: arena-hard-v0.1 +temperature: 0.0 +max_tokens: 4096 +num_choices: 1 +stop_token_ids: +- 128001 +- 128009 +model_list: +- Llama-3-Base-8B-SFT-SimPO diff --git a/eval/arenahard/configs/Llama-3-Base-8B-SFT-SimPO/judge_config.yaml b/eval/arenahard/configs/Llama-3-Base-8B-SFT-SimPO/judge_config.yaml new file mode 100644 index 0000000..a11cd3e --- /dev/null +++ b/eval/arenahard/configs/Llama-3-Base-8B-SFT-SimPO/judge_config.yaml @@ -0,0 +1,75 @@ +name: judgment config file for Arena Hard +bench_name: arena-hard-v0.1 +judge_model: gpt-4-1106-preview +reference: false +ref_model: null +baseline: true +baseline_model: gpt-4-0314 +pairwise: true +temperature: 0 +max_tokens: 4096 +regex_pattern: \[\[([AB<>=]+)\]\] +system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses + provided by two AI assistants to the user prompt displayed below. You will be given + assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s + answer is better. + + + Begin your evaluation by generating your own answer to the prompt. You must provide + your answers before judging any answers. + + + When evaluating the assistants'' answers, compare both assistants'' answers with + your answer. You must identify and correct any mistakes or inaccurate information. + + + Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful + means the answer correctly responds to the prompt or follows the instructions. 
Note + when user prompt has any ambiguity or more than one interpretation, it is more helpful + and appropriate to ask for clarifications or more information from the user than + providing an answer based on assumptions. Relevant means all parts of the response + closely connect or are appropriate to what is being asked. Concise means the response + is clear and not verbose or excessive. + + + Then consider the creativity and novelty of the assistant''s answers when needed. + Finally, identify any missing important information in the assistants'' answers + that would be beneficial to include when responding to the user prompt. + + + After providing your explanation, you must output only one of the following choices + as your final verdict with a label: + + + 1. Assistant A is significantly better: [[A>>B]] + + 2. Assistant A is slightly better: [[A>B]] + + 3. Tie, relatively the same: [[A=B]] + + 4. Assistant B is slightly better: [[B>A]] + + 5. Assistant B is significantly better: [[B>>A]] + + + Example output: "My final verdict is tie: [[A=B]]".' 
+prompt_template: +- '<|User Prompt|> + + {question_1} + + + <|The Start of Assistant A''s Answer|> + + {answer_1} + + <|The End of Assistant A''s Answer|> + + + <|The Start of Assistant B''s Answer|> + + {answer_2} + + <|The End of Assistant B''s Answer|>' +model_list: +- Llama-3-Base-8B-SFT-SimPO diff --git a/eval/arenahard/configs/Llama-3-Instruct-8B-SimPO/api_config.yaml b/eval/arenahard/configs/Llama-3-Instruct-8B-SimPO/api_config.yaml new file mode 100644 index 0000000..8dab9d5 --- /dev/null +++ b/eval/arenahard/configs/Llama-3-Instruct-8B-SimPO/api_config.yaml @@ -0,0 +1,7 @@ +Llama-3-Instruct-8B-SimPO: + model_name: princeton-nlp/Llama-3-Instruct-8B-SimPO + endpoints: + - api_base: http://0.0.0.0:28103/v1 + api_key: token-abc123 + api_type: openai + parallel: 8 diff --git a/eval/arenahard/configs/Llama-3-Instruct-8B-SimPO/gen_answer_config.yaml b/eval/arenahard/configs/Llama-3-Instruct-8B-SimPO/gen_answer_config.yaml new file mode 100644 index 0000000..a9bb007 --- /dev/null +++ b/eval/arenahard/configs/Llama-3-Instruct-8B-SimPO/gen_answer_config.yaml @@ -0,0 +1,9 @@ +bench_name: arena-hard-v0.1 +temperature: 0.0 +max_tokens: 4096 +num_choices: 1 +stop_token_ids: +- 128001 +- 128009 +model_list: +- Llama-3-Instruct-8B-SimPO diff --git a/eval/arenahard/configs/Llama-3-Instruct-8B-SimPO/judge_config.yaml b/eval/arenahard/configs/Llama-3-Instruct-8B-SimPO/judge_config.yaml new file mode 100644 index 0000000..58f5dfb --- /dev/null +++ b/eval/arenahard/configs/Llama-3-Instruct-8B-SimPO/judge_config.yaml @@ -0,0 +1,75 @@ +name: judgment config file for Arena Hard +bench_name: arena-hard-v0.1 +judge_model: gpt-4-1106-preview +reference: false +ref_model: null +baseline: true +baseline_model: gpt-4-0314 +pairwise: true +temperature: 0 +max_tokens: 4096 +regex_pattern: \[\[([AB<>=]+)\]\] +system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses + provided by two AI assistants to the user prompt displayed below. 
You will be given + assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s + answer is better. + + + Begin your evaluation by generating your own answer to the prompt. You must provide + your answers before judging any answers. + + + When evaluating the assistants'' answers, compare both assistants'' answers with + your answer. You must identify and correct any mistakes or inaccurate information. + + + Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful + means the answer correctly responds to the prompt or follows the instructions. Note + when user prompt has any ambiguity or more than one interpretation, it is more helpful + and appropriate to ask for clarifications or more information from the user than + providing an answer based on assumptions. Relevant means all parts of the response + closely connect or are appropriate to what is being asked. Concise means the response + is clear and not verbose or excessive. + + + Then consider the creativity and novelty of the assistant''s answers when needed. + Finally, identify any missing important information in the assistants'' answers + that would be beneficial to include when responding to the user prompt. + + + After providing your explanation, you must output only one of the following choices + as your final verdict with a label: + + + 1. Assistant A is significantly better: [[A>>B]] + + 2. Assistant A is slightly better: [[A>B]] + + 3. Tie, relatively the same: [[A=B]] + + 4. Assistant B is slightly better: [[B>A]] + + 5. Assistant B is significantly better: [[B>>A]] + + + Example output: "My final verdict is tie: [[A=B]]".' 
+prompt_template: +- '<|User Prompt|> + + {question_1} + + + <|The Start of Assistant A''s Answer|> + + {answer_1} + + <|The End of Assistant A''s Answer|> + + + <|The Start of Assistant B''s Answer|> + + {answer_2} + + <|The End of Assistant B''s Answer|>' +model_list: +- Llama-3-Instruct-8B-SimPO diff --git a/eval/arenahard/configs/Mistral-7B-Base-SFT-SimPO/api_config.yaml b/eval/arenahard/configs/Mistral-7B-Base-SFT-SimPO/api_config.yaml new file mode 100644 index 0000000..a42df6d --- /dev/null +++ b/eval/arenahard/configs/Mistral-7B-Base-SFT-SimPO/api_config.yaml @@ -0,0 +1,7 @@ +Mistral-7B-Base-SFT-SimPO: + model_name: princeton-nlp/Mistral-7B-Base-SFT-SimPO + endpoints: + - api_base: http://0.0.0.0:14940/v1 + api_key: token-abc123 + api_type: openai + parallel: 8 diff --git a/eval/arenahard/configs/Mistral-7B-Base-SFT-SimPO/gen_answer_config.yaml b/eval/arenahard/configs/Mistral-7B-Base-SFT-SimPO/gen_answer_config.yaml new file mode 100644 index 0000000..a81a535 --- /dev/null +++ b/eval/arenahard/configs/Mistral-7B-Base-SFT-SimPO/gen_answer_config.yaml @@ -0,0 +1,8 @@ +bench_name: arena-hard-v0.1 +temperature: 0.0 +max_tokens: 4096 +num_choices: 1 +stop_token_ids: +- 2 +model_list: +- Mistral-7B-Base-SFT-SimPO diff --git a/eval/arenahard/configs/Mistral-7B-Base-SFT-SimPO/judge_config.yaml b/eval/arenahard/configs/Mistral-7B-Base-SFT-SimPO/judge_config.yaml new file mode 100644 index 0000000..dcdff38 --- /dev/null +++ b/eval/arenahard/configs/Mistral-7B-Base-SFT-SimPO/judge_config.yaml @@ -0,0 +1,75 @@ +name: judgment config file for Arena Hard +bench_name: arena-hard-v0.1 +judge_model: gpt-4-1106-preview +reference: false +ref_model: null +baseline: true +baseline_model: gpt-4-0314 +pairwise: true +temperature: 0 +max_tokens: 4096 +regex_pattern: \[\[([AB<>=]+)\]\] +system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses + provided by two AI assistants to the user prompt displayed below. 
You will be given + assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s + answer is better. + + + Begin your evaluation by generating your own answer to the prompt. You must provide + your answers before judging any answers. + + + When evaluating the assistants'' answers, compare both assistants'' answers with + your answer. You must identify and correct any mistakes or inaccurate information. + + + Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful + means the answer correctly responds to the prompt or follows the instructions. Note + when user prompt has any ambiguity or more than one interpretation, it is more helpful + and appropriate to ask for clarifications or more information from the user than + providing an answer based on assumptions. Relevant means all parts of the response + closely connect or are appropriate to what is being asked. Concise means the response + is clear and not verbose or excessive. + + + Then consider the creativity and novelty of the assistant''s answers when needed. + Finally, identify any missing important information in the assistants'' answers + that would be beneficial to include when responding to the user prompt. + + + After providing your explanation, you must output only one of the following choices + as your final verdict with a label: + + + 1. Assistant A is significantly better: [[A>>B]] + + 2. Assistant A is slightly better: [[A>B]] + + 3. Tie, relatively the same: [[A=B]] + + 4. Assistant B is slightly better: [[B>A]] + + 5. Assistant B is significantly better: [[B>>A]] + + + Example output: "My final verdict is tie: [[A=B]]".' 
+prompt_template: +- '<|User Prompt|> + + {question_1} + + + <|The Start of Assistant A''s Answer|> + + {answer_1} + + <|The End of Assistant A''s Answer|> + + + <|The Start of Assistant B''s Answer|> + + {answer_2} + + <|The End of Assistant B''s Answer|>' +model_list: +- Mistral-7B-Base-SFT-SimPO diff --git a/eval/arenahard/configs/Mistral-7B-Instruct-SimPO/api_config.yaml b/eval/arenahard/configs/Mistral-7B-Instruct-SimPO/api_config.yaml new file mode 100644 index 0000000..4782165 --- /dev/null +++ b/eval/arenahard/configs/Mistral-7B-Instruct-SimPO/api_config.yaml @@ -0,0 +1,7 @@ +Mistral-7B-Instruct-SimPO: + model_name: princeton-nlp/Mistral-7B-Instruct-SimPO + endpoints: + - api_base: http://0.0.0.0:26627/v1 + api_key: token-abc123 + api_type: openai + parallel: 8 diff --git a/eval/arenahard/configs/Mistral-7B-Instruct-SimPO/gen_answer_config.yaml b/eval/arenahard/configs/Mistral-7B-Instruct-SimPO/gen_answer_config.yaml new file mode 100644 index 0000000..4a57705 --- /dev/null +++ b/eval/arenahard/configs/Mistral-7B-Instruct-SimPO/gen_answer_config.yaml @@ -0,0 +1,8 @@ +bench_name: arena-hard-v0.1 +temperature: 0.0 +max_tokens: 4096 +num_choices: 1 +stop_token_ids: +- 2 +model_list: +- Mistral-7B-Instruct-SimPO diff --git a/eval/arenahard/configs/Mistral-7B-Instruct-SimPO/judge_config.yaml b/eval/arenahard/configs/Mistral-7B-Instruct-SimPO/judge_config.yaml new file mode 100644 index 0000000..1059c1b --- /dev/null +++ b/eval/arenahard/configs/Mistral-7B-Instruct-SimPO/judge_config.yaml @@ -0,0 +1,75 @@ +name: judgment config file for Arena Hard +bench_name: arena-hard-v0.1 +judge_model: gpt-4-1106-preview +reference: false +ref_model: null +baseline: true +baseline_model: gpt-4-0314 +pairwise: true +temperature: 0 +max_tokens: 4096 +regex_pattern: \[\[([AB<>=]+)\]\] +system_prompt: 'Please act as an impartial judge and evaluate the quality of the responses + provided by two AI assistants to the user prompt displayed below. 
You will be given + assistant A''s answer and assistant B''s answer. Your job is to evaluate which assistant''s + answer is better. + + + Begin your evaluation by generating your own answer to the prompt. You must provide + your answers before judging any answers. + + + When evaluating the assistants'' answers, compare both assistants'' answers with + your answer. You must identify and correct any mistakes or inaccurate information. + + + Then consider if the assistant''s answers are helpful, relevant, and concise. Helpful + means the answer correctly responds to the prompt or follows the instructions. Note + when user prompt has any ambiguity or more than one interpretation, it is more helpful + and appropriate to ask for clarifications or more information from the user than + providing an answer based on assumptions. Relevant means all parts of the response + closely connect or are appropriate to what is being asked. Concise means the response + is clear and not verbose or excessive. + + + Then consider the creativity and novelty of the assistant''s answers when needed. + Finally, identify any missing important information in the assistants'' answers + that would be beneficial to include when responding to the user prompt. + + + After providing your explanation, you must output only one of the following choices + as your final verdict with a label: + + + 1. Assistant A is significantly better: [[A>>B]] + + 2. Assistant A is slightly better: [[A>B]] + + 3. Tie, relatively the same: [[A=B]] + + 4. Assistant B is slightly better: [[B>A]] + + 5. Assistant B is significantly better: [[B>>A]] + + + Example output: "My final verdict is tie: [[A=B]]".' 
+prompt_template: +- '<|User Prompt|> + + {question_1} + + + <|The Start of Assistant A''s Answer|> + + {answer_1} + + <|The End of Assistant A''s Answer|> + + + <|The Start of Assistant B''s Answer|> + + {answer_2} + + <|The End of Assistant B''s Answer|>' +model_list: +- Mistral-7B-Instruct-SimPO diff --git a/eval/arenahard/templates/llama3.jinja b/eval/arenahard/templates/llama3.jinja new file mode 100644 index 0000000..5117b70 --- /dev/null +++ b/eval/arenahard/templates/llama3.jinja @@ -0,0 +1 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %} \ No newline at end of file diff --git a/eval/arenahard/templates/mistral_base.jinja b/eval/arenahard/templates/mistral_base.jinja new file mode 100644 index 0000000..7568abd --- /dev/null +++ b/eval/arenahard/templates/mistral_base.jinja @@ -0,0 +1 @@ +{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %} \ No newline at end of file diff --git a/eval/arenahard/templates/mistral_instruct.jinja b/eval/arenahard/templates/mistral_instruct.jinja new file mode 100644 index 0000000..aff4b0e --- /dev/null +++ b/eval/arenahard/templates/mistral_instruct.jinja @@ -0,0 +1 @@ +{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\n\n' %}{% else 
%}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% set content = system_message + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %} \ No newline at end of file