mirror of
https://github.com/wassname/SimPO.git
synced 2026-06-27 17:46:46 +08:00
add data generation scripts
This commit is contained in:
@@ -0,0 +1,35 @@
|
||||
# On-Policy Preference Data Generation
|
||||
|
||||
We provide the code to generate on-policy preference data (e.g., [princeton-nlp/llama3-ultrafeedback-armorm](https://huggingface.co/datasets/princeton-nlp/llama3-ultrafeedback-armorm) and [princeton-nlp/gemma2-ultrafeedback-armorm](https://huggingface.co/datasets/princeton-nlp/gemma2-ultrafeedback-armorm)) used in our experiments.
|
||||
|
||||
## Requirements
|
||||
|
||||
You will need to install the [`vllm`](https://github.com/vllm-project/vllm) package for decoding. Moreover, if you are running decoding with `gemma-2` models, you will also need to install `flashinfer`.
|
||||
|
||||
## On-Policy Preference Data Generation Process
|
||||
|
||||
1. Generate multiple responses using the language model:
|
||||
|
||||
```
|
||||
python decode.py --data_dir $DATASET_DIR --seed $SEED
|
||||
```
|
||||
This will generate one response per prompt under the specified seed. You need to provide a dataset containing prompts (by default, we use `HuggingFaceH4/ultrafeedback_binarized`). You can also set decoding hyperparameters by passing in corresponding arguments (by default, we use a temperature of `0.8` for sampling).
|
||||
|
||||
Note that you will need to run the above command under **multiple different** seeds (by default, we use `13, 21, 42, 79, 100`) to obtain different responses for each prompt.
|
||||
|
||||
2. Post-process the generations
|
||||
|
||||
```
|
||||
python post_process.py
|
||||
```
|
||||
|
||||
This will combine the generated responses under each seed and filter out samples with identical responses across all seeds.
|
||||
|
||||
3. Annotate the preference labels with a reward model
|
||||
|
||||
```
|
||||
python reward_model_annotate.py --reward_model $MODEL
|
||||
```
|
||||
|
||||
This will score the generations using a reward model (by default, we use `RLHFlow/ArmoRM-Llama3-8B-v0.1`) and binarize the dataset by taking the highest-scoring response as the winning response and the lowest-scoring one as the losing response.
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
from vllm import LLM, SamplingParams
|
||||
from datasets import load_dataset, load_from_disk
|
||||
import os
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER" # this is recommended for gemma-2 models; otherwise it is not needed
|
||||
import argparse
|
||||
import json
|
||||
|
||||
# Sample one response per unique prompt with vLLM and write the results to
# `<output_dir>/output_<seed>.json`. The script is meant to be run once per
# seed; post_process.py later merges the per-seed files record by record.
parser = argparse.ArgumentParser(description='Decode with vllm')
parser.add_argument('--data_dir', type=str, default="HuggingFaceH4/ultrafeedback_binarized",
                    help='Directory containing the data')
parser.add_argument('--model', type=str, default="google/gemma-2-9b-it",
                    help='Path to the LLM model')
parser.add_argument('--temperature', type=float, default=0.8,
                    help='Temperature for sampling')
parser.add_argument('--top_p', type=float, default=0.95,
                    help='Top-p probability for sampling')
parser.add_argument('--max_tokens', type=int, default=4096,
                    help='Maximum number of tokens to generate')
parser.add_argument('--seed', type=int, default=42,
                    help='Random seed')
parser.add_argument('--output_dir', type=str, default="datasets/gemma2_ultrafeedback",
                    help='output_dir')
args = parser.parse_args()

print(args)

data_dir = args.data_dir
llm = LLM(model=args.model)
tokenizer = llm.get_tokenizer()

train_dataset = load_dataset(data_dir, split='train_prefs')

# Deduplicate, then sort. Iterating a set of strings yields a different order
# in every interpreter run (string hash randomization), and each seed is
# decoded in its own run. Without a stable order, record i of one seed's file
# would belong to a different prompt than record i of another seed's file.
prompts = sorted(set(train_dataset['prompt']))

# Wrap every prompt as a single user turn and render it with the model's chat
# template, ending with the generation prompt so the model answers as assistant.
conversations = [
    tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    for prompt in prompts
]

sampling_params = SamplingParams(
    temperature=args.temperature,
    top_p=args.top_p,
    max_tokens=args.max_tokens,
    seed=args.seed,
)
outputs = llm.generate(conversations, sampling_params)

# Pair each raw prompt with its templated form and the first sampled completion.
output_data = [
    {
        'prompt': prompt,
        "format_prompt": output.prompt,
        'generated_text': output.outputs[0].text,
    }
    for prompt, output in zip(prompts, outputs)
]

# Save the outputs as a JSON file, one file per seed.
output_file = f'output_{args.seed}.json'
os.makedirs(args.output_dir, exist_ok=True)
output_path = os.path.join(args.output_dir, output_file)

with open(output_path, 'w') as f:
    json.dump(output_data, f, indent=4)

print(f"Outputs saved to {output_path}")
|
||||
@@ -0,0 +1,45 @@
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
|
||||
# Merge the per-seed generation files (`output*.json`) found in
# --generation_file_dir into a single `all_outputs.json`, dropping prompts for
# which every seed produced exactly the same response.
parser = argparse.ArgumentParser()
parser.add_argument("--generation_file_dir", type=str, help="Diretory containing the generation files", default="datasets/gemma2_ultrafeedback")
args = parser.parse_args()

print(args)

# Sorted so the order of responses inside "all_generated_responses" does not
# depend on the arbitrary order in which os.listdir returns directory entries.
generation_files = sorted(
    file_name
    for file_name in os.listdir(args.generation_file_dir)
    if file_name.startswith("output") and file_name.endswith(".json")
)
if not generation_files:
    raise FileNotFoundError(
        f"No output*.json generation files found in {args.generation_file_dir}"
    )

all_data = []
for file_name in generation_files:
    with open(os.path.join(args.generation_file_dir, file_name), 'r') as f:
        all_data.append(json.load(f))

# The first file defines the prompt order of the merged output.
reference_prompts = [sample["prompt"] for sample in all_data[0]]

# One list of generated texts per file, each aligned with reference_prompts.
aligned_texts = []
for file_name, data in zip(generation_files, all_data):
    if [sample["prompt"] for sample in data] == reference_prompts:
        aligned_texts.append([sample["generated_text"] for sample in data])
        continue

    # Same prompts stored in a different order: realign by prompt text instead
    # of silently pairing responses with the wrong prompt by index.
    text_by_prompt = {sample["prompt"]: sample["generated_text"] for sample in data}
    if (
        len(data) != len(reference_prompts)
        or len(text_by_prompt) != len(data)
        or set(text_by_prompt) != set(reference_prompts)
    ):
        raise ValueError(
            f"{file_name} does not contain the same prompts as {generation_files[0]}; "
            "cannot merge the generations"
        )
    aligned_texts.append([text_by_prompt[prompt] for prompt in reference_prompts])

all_res = []
num_identical = 0
for prompt, texts in zip(reference_prompts, zip(*aligned_texts)):
    gen_text = list(texts)

    if len(set(gen_text)) == 1:
        # filter out samples where all generated responses are identical
        num_identical += 1
        continue

    all_res.append(
        {
            "prompt": prompt,
            "all_generated_responses": gen_text,
        }
    )

print(f"Filtered out {num_identical} samples with identical generated responses")

merged_path = os.path.join(args.generation_file_dir, 'all_outputs.json')
with open(merged_path, 'w') as f:
    json.dump(all_res, f, indent=4)

print(f"Processed outputs saved to {merged_path}")
|
||||
@@ -0,0 +1,86 @@
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
import tqdm
|
||||
import numpy as np
|
||||
import datasets
|
||||
|
||||
# Score every generated response with a reward model, then binarize each
# sample into a chosen/rejected pair. Three artifacts are written to
# --output_dir: `<name>_rm.json` (scores only), `<name>_bin.json` (scores plus
# chosen/rejected) and a Hugging Face dataset saved with `save_to_disk`.
parser = argparse.ArgumentParser()
parser.add_argument("--generation_file", type=str, default="datasets/gemma2_ultrafeedback/all_outputs.json", help="Path to the output generation file")
parser.add_argument("--reward_model", type=str, default="RLHFlow/ArmoRM-Llama3-8B-v0.1", help="Path to reward model")
parser.add_argument("--output_dir", type=str, default="datasets/gemma2_ultrafeedback/", help="Path to output directory")
args = parser.parse_args()

print(args)

generation_file = args.generation_file
with open(generation_file, 'r') as f:
    output_data = json.load(f)

# Create the output directory before the expensive scoring pass so a bad path
# fails immediately rather than after all samples have been scored.
if args.output_dir:
    os.makedirs(args.output_dir, exist_ok=True)

model = AutoModelForSequenceClassification.from_pretrained(
    args.reward_model,
    device_map="cuda",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(args.reward_model, use_fast=True)

for data in tqdm.tqdm(output_data):
    prompt = data["prompt"]
    scores = []
    for candidate in data["all_generated_responses"]:
        # Each candidate is scored as a complete user/assistant exchange.
        messages = [{"role": "user", "content": prompt},
                    {"role": "assistant", "content": candidate}]
        input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(input_ids)
        # `.score` is read from the model output; presumably supplied by the
        # reward model's remote code (trust_remote_code=True).
        scores.append(output.score.float().item())
    data["all_rm_scores"] = scores

base_name = os.path.basename(args.generation_file).split('.json')[0]

file_name = base_name + "_rm.json"
with open(os.path.join(args.output_dir, file_name), 'w') as f:
    json.dump(output_data, f, indent=4)

print(f"Annotated outputs saved to {os.path.join(args.output_dir, file_name)}")

# Binarize data: win = highest scoring response; lose = lowest scoring response
for data in output_data:
    rm_scores = data["all_rm_scores"]
    responses = data["all_generated_responses"]
    chosen_idx = int(np.argmax(rm_scores))
    rejected_idx = int(np.argmin(rm_scores))
    data.update({
        "chosen": [
            {"role": "user", "content": data["prompt"]},
            {"role": "assistant", "content": responses[chosen_idx]},
        ],
        "rejected": [
            {"role": "user", "content": data["prompt"]},
            {"role": "assistant", "content": responses[rejected_idx]},
        ],
    })

# Write the binarized data to its own `_bin.json` file; writing to `file_name`
# here would overwrite the reward-annotated `_rm.json` saved above.
output_file = base_name + "_bin.json"
bin_path = os.path.join(args.output_dir, output_file)
with open(bin_path, 'w') as f:
    json.dump(output_data, f, indent=4)
print(f"Binarized outputs saved to {bin_path}")

# Convert the data to Hugging Face datasets format
dataset = datasets.Dataset.from_list(output_data)
dataset.save_to_disk(args.output_dir)
print(f"Binarized dataset saved to {args.output_dir}")
|
||||
Reference in New Issue
Block a user