add data generation scripts

This commit is contained in:
Yu Meng
2024-07-20 00:32:36 -04:00
parent e6b139711c
commit ba76f2540b
4 changed files with 227 additions and 0 deletions
+35
View File
@@ -0,0 +1,35 @@
# On-Policy Preference Data Generation
We provide the code to generate on-policy preference data (e.g., [princeton-nlp/llama3-ultrafeedback-armorm](https://huggingface.co/datasets/princeton-nlp/llama3-ultrafeedback-armorm) and [princeton-nlp/gemma2-ultrafeedback-armorm](https://huggingface.co/datasets/princeton-nlp/gemma2-ultrafeedback-armorm)) used in our experiments.
## Requirements
You will need to install the [`vllm`](https://github.com/vllm-project/vllm) package for decoding. Moreover, if you are running decoding with `gemma-2` models, you will also need to install `flashinfer`.
## On-Policy Preference Data Generation Process
1. Generate multiple responses using the language model:
```
python decode.py --data_dir $DATASET_DIR --seed $SEED
```
This will generate one response per prompt under the specified seed. You need to provide a dataset containing prompts (by default, we use `HuggingFaceH4/ultrafeedback_binarized`). You can also set decoding hyperparameters by passing in corresponding arguments (by default, we use a temperature of `0.8` for sampling).
Note that you will need to run the above command under **multiple different** seeds (by default, we use `13, 21, 42, 79, 100`) to obtain different responses for each prompt.
2. Post-process the generations:
```
python post_process.py
```
This will combine the generated responses under each seed and filter out samples with identical responses across all seeds.
3. Annotate the preference labels with a reward model:
```
python reward_model_annotate.py --reward_model $MODEL
```
This will score the generations using a reward model (by default, we use `RLHFlow/ArmoRM-Llama3-8B-v0.1`) and binarize the dataset by taking the highest-scoring response as the winning response and the lowest-scoring one as the losing response.
+61
View File
@@ -0,0 +1,61 @@
from vllm import LLM, SamplingParams
from datasets import load_dataset, load_from_disk
import os
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER" # this is recommended for gemma-2 models; otherwise it is not needed
import argparse
import json
def parse_args():
    """Parse the command-line options for data, model, sampling and output."""
    parser = argparse.ArgumentParser(description='Decode with vllm')
    parser.add_argument('--data_dir', type=str, default="HuggingFaceH4/ultrafeedback_binarized",
                        help='Directory containing the data')
    parser.add_argument('--model', type=str, default="google/gemma-2-9b-it",
                        help='Path to the LLM model')
    parser.add_argument('--temperature', type=float, default=0.8,
                        help='Temperature for sampling')
    parser.add_argument('--top_p', type=float, default=0.95,
                        help='Top-p probability for sampling')
    parser.add_argument('--max_tokens', type=int, default=4096,
                        help='Maximum number of tokens to generate')
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed')
    parser.add_argument('--output_dir', type=str, default="datasets/gemma2_ultrafeedback",
                        help='output_dir')
    return parser.parse_args()


def unique_prompts(prompts):
    """Return the distinct prompts in a deterministic (sorted) order.

    This script is run once per seed, each time in a fresh process, and the
    resulting ``output_<seed>.json`` files are later combined record-by-record
    by position. Iterating a bare ``set`` of strings yields an order that can
    change from process to process (string hashing is randomized), which would
    pair responses that belong to different prompts. Sorting removes that
    dependency.
    """
    return sorted(set(prompts))


def main():
    """Sample one response per unique prompt and save them as a JSON file."""
    args = parse_args()
    print(args)

    llm = LLM(model=args.model)
    tokenizer = llm.get_tokenizer()

    train_dataset = load_dataset(args.data_dir, split='train_prefs')
    prompts = unique_prompts(train_dataset['prompt'])

    # Wrap every prompt as a single user turn in the model's chat template.
    conversations = [
        tokenizer.apply_chat_template(
            [{'role': 'user', 'content': prompt}],
            tokenize=False,
            add_generation_prompt=True,
        )
        for prompt in prompts
    ]

    sampling_params = SamplingParams(
        temperature=args.temperature,
        top_p=args.top_p,
        max_tokens=args.max_tokens,
        seed=args.seed,
    )
    outputs = llm.generate(conversations, sampling_params)

    # Outputs are paired with prompts by position (assumes generate() returns
    # results in input order, as the index-based pairing always relied on).
    output_data = [
        {
            'prompt': raw_prompt,
            "format_prompt": output.prompt,
            'generated_text': output.outputs[0].text,
        }
        for raw_prompt, output in zip(prompts, outputs)
    ]

    # Save the outputs as a JSON file named after the seed.
    output_file = f'output_{args.seed}.json'
    os.makedirs(args.output_dir, exist_ok=True)
    output_path = os.path.join(args.output_dir, output_file)
    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=4)
    print(f"Outputs saved to {output_path}")


if __name__ == "__main__":
    main()
+45
View File
@@ -0,0 +1,45 @@
import json
import argparse
import os
def parse_args():
    """Parse the command-line option naming the generation directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--generation_file_dir", type=str, help="Diretory containing the generation files", default="datasets/gemma2_ultrafeedback")
    return parser.parse_args()


def load_generation_files(generation_file_dir):
    """Load every ``output*.json`` file in the directory, in sorted name order.

    ``os.listdir`` returns entries in arbitrary order; sorting makes the order
    of responses inside ``all_generated_responses`` reproducible.
    """
    all_data = []
    for file_name in sorted(os.listdir(generation_file_dir)):
        if file_name.startswith("output") and file_name.endswith(".json"):
            generation_file = os.path.join(generation_file_dir, file_name)
            with open(generation_file, 'r') as f:
                all_data.append(json.load(f))
    return all_data


def combine_generations(all_data):
    """Merge per-seed generations into one record per prompt.

    Args:
        all_data: list with one entry per generation file; each entry is a
            list of dicts holding at least ``"prompt"`` and
            ``"generated_text"``.

    Returns:
        A ``(all_res, num_identical)`` tuple. ``all_res`` is a list of dicts
        with keys ``"prompt"`` and ``"all_generated_responses"``;
        ``num_identical`` counts the prompts dropped because every file
        produced the same response for them.

    Raises:
        ValueError: if ``all_data`` is empty, if the files differ in length,
            or if the files disagree on the prompt at some position. Records
            are merged by position, so a mismatch would otherwise attribute
            responses to the wrong prompt without any visible failure.
    """
    if not all_data:
        raise ValueError("No generation files found to post-process")

    num_samples = len(all_data[0])
    for data in all_data[1:]:
        if len(data) != num_samples:
            raise ValueError(
                f"Generation files differ in length: {len(data)} vs {num_samples}"
            )

    all_res = []
    num_identical = 0
    for i, reference in enumerate(all_data[0]):
        prompt = reference["prompt"]
        gen_text = []
        for data in all_data:
            if data[i]["prompt"] != prompt:
                raise ValueError(
                    f"Prompt mismatch across generation files at index {i}"
                )
            gen_text.append(data[i]["generated_text"])

        if len(set(gen_text)) == 1:
            # filter out samples where all generated responses are identical
            num_identical += 1
            continue

        all_res.append(
            {
                "prompt": prompt,
                "all_generated_responses": gen_text,
            }
        )
    return all_res, num_identical


def main():
    """Combine the per-seed generation files and write ``all_outputs.json``."""
    args = parse_args()
    print(args)

    all_data = load_generation_files(args.generation_file_dir)
    all_res, num_identical = combine_generations(all_data)
    print(f"Filtered out {num_identical} samples with identical generated responses")

    output_path = os.path.join(args.generation_file_dir, 'all_outputs.json')
    with open(output_path, 'w') as f:
        json.dump(all_res, f, indent=4)
    print(f"Processed outputs saved to {output_path}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,86 @@
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import json
import os
import argparse
import tqdm
import numpy as np
import datasets
def parse_args():
    """Parse the command-line options for input file, reward model and output."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--generation_file", type=str, default="datasets/gemma2_ultrafeedback/all_outputs.json", help="Path to the output generation file")
    parser.add_argument("--reward_model", type=str, default="RLHFlow/ArmoRM-Llama3-8B-v0.1", help="Path to reward model")
    parser.add_argument("--output_dir", type=str, default="datasets/gemma2_ultrafeedback/", help="Path to output directory")
    return parser.parse_args()


def derived_file_name(generation_file, suffix):
    """Return the generation file's base name with ``.json`` replaced by *suffix*."""
    return os.path.basename(generation_file).split('.json')[0] + suffix


def score_candidates(model, tokenizer, prompt, candidates):
    """Score each candidate response to *prompt* with the reward model.

    Every candidate is formatted as a user/assistant exchange through the
    tokenizer's chat template, run through the model on the ``cuda`` device
    without gradients, and the ``score`` attribute of the model output is
    collected as a Python float.
    """
    scores = []
    for candidate in candidates:
        messages = [{"role": "user", "content": prompt},
                    {"role": "assistant", "content": candidate}]
        input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(input_ids)
        scores.append(output.score.float().item())
    return scores


def build_conversation(prompt, response):
    """Return a two-turn chat (user prompt, assistant response)."""
    return [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": response},
    ]


def binarize(data):
    """Add ``chosen`` / ``rejected`` conversations to *data* in place.

    The chosen response is the one with the highest entry in
    ``data["all_rm_scores"]`` and the rejected response the one with the
    lowest; both are looked up in ``data["all_generated_responses"]``.
    """
    scores = data["all_rm_scores"]
    responses = data["all_generated_responses"]
    # int() turns the numpy integer into a plain index for list access.
    chosen_idx = int(np.argmax(scores))
    rejected_idx = int(np.argmin(scores))
    data.update({
        "chosen": build_conversation(data["prompt"], responses[chosen_idx]),
        "rejected": build_conversation(data["prompt"], responses[rejected_idx]),
    })


def main():
    """Annotate generations with reward scores, binarize them, and save all outputs."""
    args = parse_args()
    print(args)

    with open(args.generation_file, 'r') as f:
        output_data = json.load(f)

    model = AutoModelForSequenceClassification.from_pretrained(args.reward_model,
                                                               device_map="cuda",
                                                               trust_remote_code=True, torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(args.reward_model, use_fast=True)

    for data in tqdm.tqdm(output_data):
        data["all_rm_scores"] = score_candidates(
            model, tokenizer, data["prompt"], data["all_generated_responses"]
        )

    os.makedirs(args.output_dir, exist_ok=True)

    file_name = derived_file_name(args.generation_file, "_rm.json")
    with open(os.path.join(args.output_dir, file_name), 'w') as f:
        json.dump(output_data, f, indent=4)
    print(f"Annotated outputs saved to {os.path.join(args.output_dir, file_name)}")

    # Binarize data: win = highest scoring response; lose = lowest scoring response
    for data in output_data:
        binarize(data)

    # Write to the "_bin.json" file. Reusing the "_rm.json" name here would
    # overwrite the annotated scores file and never create the binarized one.
    output_file = derived_file_name(args.generation_file, "_bin.json")
    with open(os.path.join(args.output_dir, output_file), 'w') as f:
        json.dump(output_data, f, indent=4)
    print(f"Binarized outputs saved to {output_file}")

    # Convert the data to Hugging Face datasets format
    dataset = datasets.Dataset.from_list(output_data)
    dataset.save_to_disk(os.path.join(args.output_dir))
    print(f"Binarized dataset saved to {os.path.join(args.output_dir)}")


if __name__ == "__main__":
    main()