From ba76f2540b0ce9e2e47a9ec26247be0ca0b16fc7 Mon Sep 17 00:00:00 2001
From: Yu Meng
Date: Sat, 20 Jul 2024 00:32:36 -0400
Subject: [PATCH] add data generation scripts

---
 on_policy_data_gen/README.md                | 35 ++++++++++
 on_policy_data_gen/decode.py                | 64 +++++++++++++++++
 on_policy_data_gen/post_process.py          | 55 +++++++++++++++
 on_policy_data_gen/reward_model_annotate.py | 77 +++++++++++++++++++++
 4 files changed, 231 insertions(+)
 create mode 100644 on_policy_data_gen/README.md
 create mode 100644 on_policy_data_gen/decode.py
 create mode 100644 on_policy_data_gen/post_process.py
 create mode 100644 on_policy_data_gen/reward_model_annotate.py

diff --git a/on_policy_data_gen/README.md b/on_policy_data_gen/README.md
new file mode 100644
index 0000000..804fa48
--- /dev/null
+++ b/on_policy_data_gen/README.md
@@ -0,0 +1,35 @@
+# On-Policy Preference Data Generation
+
+We provide the code to generate on-policy preference data (e.g., [princeton-nlp/llama3-ultrafeedback-armorm](https://huggingface.co/datasets/princeton-nlp/llama3-ultrafeedback-armorm) and [princeton-nlp/gemma2-ultrafeedback-armorm](https://huggingface.co/datasets/princeton-nlp/gemma2-ultrafeedback-armorm)) used in our experiments.
+
+## Requirements
+
+You will need to install the [`vllm`](https://github.com/vllm-project/vllm) package for decoding. Moreover, if you are running decoding with `gemma-2` models, you will need to also install `flashinfer`.
+
+## On-Policy Preference Data Generation Process
+
+1. Generate multiple responses using the language model:
+
+```
+python decode.py --data_dir $DATASET_DIR --seed $SEED
+```
+This will generate one response per prompt under the specified seed. You need to provide a dataset containing prompts (by default, we use `HuggingFaceH4/ultrafeedback_binarized`). You can also set decoding hyperparameters by passing in corresponding arguments (by default, we use a temperature of `0.8` for sampling).
+
+Note that you will need to run the above command under **multiple different** seeds (by default, we use `13, 21, 42, 79, 100`) to obtain different responses for each prompt.
+
+2. Post-process the generations:
+
+```
+python post_process.py
+```
+
+This will combine the generated responses under each seed and filter out samples with identical responses across all seeds.
+
+3. Annotate the preference labels with a reward model:
+
+```
+python reward_model_annotate.py --reward_model $MODEL
+```
+
+This will score the generations using a reward model (by default, we use `RLHFlow/ArmoRM-Llama3-8B-v0.1`) and binarize the dataset by taking the highest-scoring response as the winning response and the lowest-scoring one as the losing response.
+
diff --git a/on_policy_data_gen/decode.py b/on_policy_data_gen/decode.py
new file mode 100644
index 0000000..eb01f42
--- /dev/null
+++ b/on_policy_data_gen/decode.py
@@ -0,0 +1,64 @@
+"""Sample one response per prompt with vLLM and save them as output_<seed>.json."""
+from vllm import LLM, SamplingParams
+from datasets import load_dataset, load_from_disk
+import os
+os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER" # this is recommended for gemma-2 models; otherwise it is not needed
+import argparse
+import json
+
+parser = argparse.ArgumentParser(description='Decode with vllm')
+parser.add_argument('--data_dir', type=str, default="HuggingFaceH4/ultrafeedback_binarized",
+                    help='Directory containing the data')
+parser.add_argument('--model', type=str, default="google/gemma-2-9b-it",
+                    help='Path to the LLM model')
+parser.add_argument('--temperature', type=float, default=0.8,
+                    help='Temperature for sampling')
+parser.add_argument('--top_p', type=float, default=0.95,
+                    help='Top-p probability for sampling')
+parser.add_argument('--max_tokens', type=int, default=4096,
+                    help='Maximum number of tokens to generate')
+parser.add_argument('--seed', type=int, default=42,
+                    help='Random seed')
+parser.add_argument('--output_dir', type=str, default="datasets/gemma2_ultrafeedback",
+                    help='output_dir')
+args = parser.parse_args()
+
+print(args)
+
+data_dir = args.data_dir
+llm = LLM(model=args.model)
+tokenizer = llm.get_tokenizer()
+
+train_dataset = load_dataset(data_dir, split='train_prefs')
+
+# Sort the de-duplicated prompts: the iteration order of a set of strings changes
+# between interpreter runs (hash randomization), and post_process.py merges the
+# per-seed output files by index, so every seed must see the prompts in the same order.
+prompts = sorted(set(train_dataset['prompt']))
+
+conversations = [tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) for prompt in prompts]
+
+sampling_params = SamplingParams(temperature=args.temperature,
+                                 top_p=args.top_p,
+                                 max_tokens=args.max_tokens,
+                                 seed=args.seed,)
+outputs = llm.generate(conversations, sampling_params)
+
+# Save the outputs as a JSON file.
+output_data = []
+for i, output in enumerate(outputs):
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    output_data.append({
+        'prompt': prompts[i],
+        "format_prompt": prompt,
+        'generated_text': generated_text,
+    })
+
+output_file = f'output_{args.seed}.json'
+os.makedirs(args.output_dir, exist_ok=True)
+
+with open(os.path.join(args.output_dir, output_file), 'w') as f:
+    json.dump(output_data, f, indent=4)
+
+print(f"Outputs saved to {os.path.join(args.output_dir, output_file)}")
diff --git a/on_policy_data_gen/post_process.py b/on_policy_data_gen/post_process.py
new file mode 100644
index 0000000..4d1018e
--- /dev/null
+++ b/on_policy_data_gen/post_process.py
@@ -0,0 +1,55 @@
+"""Merge the per-seed generation files and drop prompts whose responses are all identical."""
+import json
+import argparse
+import os
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--generation_file_dir", type=str, help="Directory containing the generation files", default="datasets/gemma2_ultrafeedback")
+args = parser.parse_args()
+
+print(args)
+
+all_data = []
+# Sort the listing so the order of the merged responses does not depend on the filesystem.
+for file_name in sorted(os.listdir(args.generation_file_dir)):
+    if file_name.startswith("output") and file_name.endswith(".json"):
+        generation_file = os.path.join(args.generation_file_dir, file_name)
+        with open(generation_file, 'r') as f:
+            output_data = json.load(f)
+        all_data.append(output_data)
+
+if not all_data:
+    raise FileNotFoundError(f"No output*.json generation files found in {args.generation_file_dir}")
+
+num_samples = len(all_data[0])
+# The files are merged by index, so they must all cover the same number of prompts.
+if any(len(data) != num_samples for data in all_data):
+    raise ValueError("Generation files contain different numbers of samples")
+
+all_res = []
+num_identical = 0
+for i in range(num_samples):
+    prompt = all_data[0][i]["prompt"]
+    # Refuse to pair a prompt with responses that were generated for a different prompt.
+    if any(data[i]["prompt"] != prompt for data in all_data):
+        raise ValueError(f"Prompt mismatch across generation files at index {i}")
+    gen_text = [data[i]["generated_text"] for data in all_data]
+
+    if len(set(gen_text)) == 1:
+        # filter out samples where all generated responses are identical
+        num_identical += 1
+        continue
+
+    all_res.append(
+        {
+            "prompt": prompt,
+            "all_generated_responses": gen_text,
+        }
+    )
+
+print(f"Filtered out {num_identical} samples with identical generated responses")
+
+with open(os.path.join(args.generation_file_dir, 'all_outputs.json'), 'w') as f:
+    json.dump(all_res, f, indent=4)
+
+print(f"Processed outputs saved to {os.path.join(args.generation_file_dir, 'all_outputs.json')}")
diff --git a/on_policy_data_gen/reward_model_annotate.py b/on_policy_data_gen/reward_model_annotate.py
new file mode 100644
index 0000000..9c28c3d
--- /dev/null
+++ b/on_policy_data_gen/reward_model_annotate.py
@@ -0,0 +1,77 @@
+"""Score every generated response with a reward model and binarize into chosen/rejected pairs."""
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import json
+import os
+import argparse
+import tqdm
+import numpy as np
+import datasets
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--generation_file", type=str, default="datasets/gemma2_ultrafeedback/all_outputs.json", help="Path to the output generation file")
+parser.add_argument("--reward_model", type=str, default="RLHFlow/ArmoRM-Llama3-8B-v0.1", help="Path to reward model")
+parser.add_argument("--output_dir", type=str, default="datasets/gemma2_ultrafeedback/", help="Path to output directory")
+args = parser.parse_args()
+
+print(args)
+
+generation_file = args.generation_file
+with open(generation_file, 'r') as f:
+    output_data = json.load(f)
+
+model = AutoModelForSequenceClassification.from_pretrained(args.reward_model,
+                                                           device_map="cuda",
+                                                           trust_remote_code=True, torch_dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained(args.reward_model, use_fast=True)
+
+for data in tqdm.tqdm(output_data):
+    prompt = data["prompt"]
+    candidates = data["all_generated_responses"]
+    scores = []
+    for candidate in candidates:
+        messages = [{"role": "user", "content": prompt},
+                    {"role": "assistant", "content": candidate}]
+        input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
+        with torch.no_grad():
+            output = model(input_ids)
+            score = output.score.float().item()
+            scores.append(score)
+    data["all_rm_scores"] = scores
+
+# Make sure the output directory exists before anything is written into it.
+os.makedirs(args.output_dir, exist_ok=True)
+
+file_name = os.path.basename(args.generation_file).split('.json')[0] + "_rm.json"
+with open(os.path.join(args.output_dir, file_name), 'w') as f:
+    json.dump(output_data, f, indent=4)
+
+print(f"Annotated outputs saved to {os.path.join(args.output_dir, file_name)}")
+
+# Binarize data: win = highest scoring response; lose = lowest scoring response
+for data in output_data:
+    chosen_idx = int(np.argmax(data["all_rm_scores"]))
+    rejected_idx = int(np.argmin(data["all_rm_scores"]))
+    chosen = [
+        {"role": "user", "content": data["prompt"]},
+        {"role": "assistant", "content": data["all_generated_responses"][chosen_idx]},
+    ]
+    rejected = [
+        {"role": "user", "content": data["prompt"]},
+        {"role": "assistant", "content": data["all_generated_responses"][rejected_idx]},
+    ]
+    data.update({
+        "chosen": chosen,
+        "rejected": rejected,
+    })
+
+# Write the binarized data to its own file instead of overwriting the "_rm.json" file above.
+output_file = os.path.basename(args.generation_file).split('.json')[0] + "_bin.json"
+with open(os.path.join(args.output_dir, output_file), 'w') as f:
+    json.dump(output_data, f, indent=4)
+print(f"Binarized outputs saved to {os.path.join(args.output_dir, output_file)}")
+
+# Convert the data to Hugging Face datasets format
+dataset = datasets.Dataset.from_list(output_data)
+dataset.save_to_disk(args.output_dir)
+print(f"Binarized dataset saved to {args.output_dir}")