diff --git a/model/reward/instructor/configs/deberta-v2-xxlarge-a100.yaml b/model/reward/instructor/configs/deberta-v2-xxlarge-a100.yaml new file mode 100644 index 00000000..e9ec60c5 --- /dev/null +++ b/model/reward/instructor/configs/deberta-v2-xxlarge-a100.yaml @@ -0,0 +1,17 @@ +model_name: microsoft/deberta-v2-xxlarge +learning_rate: 2e-6 +scheduler: cosine +gradient_checkpointing: false +gradient_accumulation_steps: 12 +per_device_train_batch_size: 2 +per_device_eval_batch_size: 4 +warmup_steps: 600 +eval_steps: 1000000 +save_steps: 1000 +max_length: 400 +num_train_epochs: 3 +datasets: + - webgpt + - hfsummary + - anthropic_rlhf + - gptsynthetic diff --git a/model/reward/instructor/configs/zero_config.json b/model/reward/instructor/configs/zero_config.json new file mode 100644 index 00000000..1b776782 --- /dev/null +++ b/model/reward/instructor/configs/zero_config.json @@ -0,0 +1,29 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.1 + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/model/reward/instructor/rank_datasets.py b/model/reward/instructor/rank_datasets.py index bd53eb02..330c0a9f 100644 --- a/model/reward/instructor/rank_datasets.py +++ b/model/reward/instructor/rank_datasets.py @@ -139,7 +139,7 @@ class HFSummary(Dataset): """ - def __init__(self, split="train", conf_threshold=-1, max_comparison_per_sample=3) -> None: + def __init__(self, split="train", conf_threshold=-1, 
class AnthropicRLHF(Dataset):
    """Pairwise preference dataset built from ``Anthropic/hh-rlhf``.

    The data are described in the paper:
    Training a Helpful and Harmless Assistant with Reinforcement Learning
    from Human Feedback. If you find the data useful, please cite the paper.

    Each record of the source dataset holds a pair of dialogue texts, one
    "chosen" and one "rejected". Every dialogue is trimmed to its last two
    human turns, the text before the first remaining ``Assistant:`` marker
    becomes the prompt, and everything after it becomes the reply.

    valid train size : 160780

    Each item is ``(prompt, [(chosen_reply, rejected_reply)])``.
    """

    def preprocess_dialogue(self, text):
        """Trim ``text`` to the last two ``Human:`` segments.

        The returned string always starts with ``Human:`` unless ``text`` is
        empty.

        Outlier example where the Assistant answered with an empty string:
            Assistant: Human: That makes sense, I agree with that, though
            there are many situations that aren't considered justice, like
            sending a kid to prison for life. Human: You are completely
            missing the point of this conversation, and not understanding
            anything I am saying. Human: And I don't know if you're trying
            to be funny, but it isn't.
        """
        last_two_convo = text.split("Human:")[-2:]
        if len(last_two_convo[0]) == 0:
            # The text started with "Human:", so joining restores the marker.
            return "Human:".join(last_two_convo)
        return "Human: " + "Human:".join(last_two_convo)

    def _split_dialogue(self, text, sep_token):
        """Split one raw dialogue into ``(prompt, reply)``.

        Returns ``None`` when the trimmed dialogue has no ``Assistant:``
        marker, i.e. there is no assistant reply to rank.
        """
        processed = self.preprocess_dialogue(text)
        # Test for the exact marker used by the split below: a human turn
        # that merely mentions "Assistant" must not reach the unpacking.
        if "Assistant:" not in processed:
            return None
        prompt, reply = processed.split("Assistant:", maxsplit=1)
        prompt = prompt.replace("Human: ", "").strip()
        # Later turn markers inside the reply are collapsed into sep_token.
        reply = reply.replace("Human: ", sep_token).replace("\n\nAssistant: ", sep_token)
        return prompt, reply.strip()

    def __init__(self, split="train", sep_token="") -> None:
        """Load ``split`` ("train" or "test") and build the ranking pairs.

        ``sep_token`` replaces the turn markers inside multi-turn replies;
        ``None`` (e.g. a tokenizer without a separator token) falls back to
        ``" . "``.
        """
        super().__init__()
        assert split in ("train", "test")
        if sep_token is None:
            sep_token = " . "
        self.pairs = []
        # using the prompt as our index will allow us
        # to add additional generated prompts later
        dataset = load_dataset("Anthropic/hh-rlhf")[split]
        for data in dataset:
            # roughly 20 of these are invalid conversations
            chosen = self._split_dialogue(data["chosen"], sep_token)
            if chosen is None:
                continue
            rejected = self._split_dialogue(data["rejected"], sep_token)
            if rejected is None:
                continue
            # The prompt is taken from the chosen dialogue only.
            prompt, pos_postfix = chosen
            _, neg_postfix = rejected
            self.pairs.append((prompt, (pos_postfix, neg_postfix)))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, index):
        context, pair = self.pairs[index]

        return context, [pair]
sep_token=tokenizer.sep_token) + dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32) + for batch in dataloader: + print(batch["input_ids"].shape) + + def test_hf_summary_quality(): tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") diff --git a/model/reward/instructor/trainer.py b/model/reward/instructor/trainer.py index 940c0708..c472e429 100644 --- a/model/reward/instructor/trainer.py +++ b/model/reward/instructor/trainer.py @@ -8,15 +8,8 @@ import torch from models import RankGenModel from rank_datasets import DataCollatorForPairRank, RankGenCollator from torch import nn -from transformers import ( - AdamW, - AutoModelForSequenceClassification, - PreTrainedModel, - Trainer, - TrainingArguments, - get_cosine_schedule_with_warmup, - get_linear_schedule_with_warmup, -) +from transformers import AutoModelForSequenceClassification, PreTrainedModel, Trainer, TrainingArguments +from transformers.training_args import OptimizerNames from utils import argument_parsing, freeze_top_n_layers, get_datasets, get_tokenizer os.environ["WANDB_PROJECT"] = "reward-model" @@ -24,6 +17,11 @@ os.environ["WANDB_PROJECT"] = "reward-model" accuracy = evaluate.load("accuracy") parser = ArgumentParser() parser.add_argument("config", type=str) +parser.add_argument("--local_rank", type=int, default=-1) +parser.add_argument("--deepspeed", action="store_true") +parser.set_defaults(deepspeed=False) +parser.add_argument("--no-deepspeed", dest="deepspeed", action="store_false") +parser.add_argument("--wandb-entity", type=str, default="open-assistant") def compute_metrics(eval_pred): @@ -133,48 +131,46 @@ if __name__ == "__main__": params = sum([np.prod(p.size()) for p in model_parameters]) print("Number of trainable : {}M".format(int(params / 1e6))) + optimizer = OptimizerNames.ADAMW_HF args = TrainingArguments( output_dir=f"{model_name}-finetuned", num_train_epochs=training_conf["num_train_epochs"], - warmup_steps=500, + 
warmup_steps=training_conf["warmup_steps"], + optim=optimizer, + lr_scheduler_type=training_conf["scheduler"], learning_rate=training_conf["learning_rate"], # half_precision_backend="apex", + deepspeed="configs/zero_config.json" if training_conf["deepspeed"] else None, fp16=training_conf["fp16"], + local_rank=training_conf["local_rank"], gradient_checkpointing=training_conf["gradient_checkpointing"], gradient_accumulation_steps=training_conf["gradient_accumulation_steps"], per_device_train_batch_size=training_conf["per_device_train_batch_size"], per_device_eval_batch_size=training_conf["per_device_eval_batch_size"], - weight_decay=0.01, - max_grad_norm=2.0, + weight_decay=training_conf["weight_decay"], + max_grad_norm=training_conf["max_grad_norm"], logging_steps=10, save_total_limit=4, evaluation_strategy="steps", eval_steps=training_conf["eval_steps"], - save_steps=1000, + save_steps=training_conf["save_steps"], report_to="wandb", ) tokenizer = get_tokenizer(training_conf["tokenizer_name"]) - train, evals = get_datasets(training_conf["datasets"]) + train, evals = get_datasets(training_conf["datasets"], tokenizer) if "rankgen" in model_name: collate_fn = RankGenCollator(tokenizer, max_length=training_conf["max_length"]) else: collate_fn = DataCollatorForPairRank(tokenizer, max_length=training_conf["max_length"]) assert len(evals) > 0 - optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) - scheduler = None - if "scheduler" in training_conf: - if training_conf["scheduler"] == "linear": - scheduler = get_linear_schedule_with_warmup() - elif training_conf["scheduler"] == "cosine": - scheduler = get_cosine_schedule_with_warmup( - optimizer, - num_warmup_steps=args.warmup_steps, - num_training_steps=len(train) - * args.num_train_epochs - / (args.per_device_train_batch_size * args.gradient_accumulation_steps), - ) + if not training_conf["deepspeed"] or training_conf["local_rank"] == 0: + import wandb + + wandb.init( + 
project=os.environ["WANDB_PROJECT"], name=f"{model_name}-finetuned", entity=training_conf["wandb_entity"] + ) trainer = RankTrainer( model=model, @@ -186,7 +182,7 @@ if __name__ == "__main__": data_collator=collate_fn, tokenizer=tokenizer, compute_metrics=compute_metrics, - optimizers=(optimizer, scheduler), + # optimizers=(optimizer, scheduler), ) # trainer.evaluate() trainer.train() diff --git a/model/reward/instructor/utils.py b/model/reward/instructor/utils.py index a6f3da4e..94a256c2 100644 --- a/model/reward/instructor/utils.py +++ b/model/reward/instructor/utils.py @@ -81,26 +81,41 @@ def argument_parsing(parser): "learning_rate": 3e-5, "eval_steps": 500, "loss": "rank", + "warmup_steps": 500, "max_length": 440, + "weight_decay": 0.01, + "max_grad_norm": 2.0, + "save_steps": 500, "per_device_eval_batch_size": 5, "per_device_train_batch_size": 8, "gradient_accumulation_steps": 8, "gradient_checkpointing": False, + "deepspeed": args.deepspeed, + "local_rank": args.local_rank, "datasets": ["webgpt"], + "wandb_entity": args.wandb_entity, "fp16": True, "tokenizer_name": training_conf["model_name"], } params = {**default_params, **training_conf} - params["gradient_accumulation_steps"] = int(params["gradient_accumulation_steps"]) - params["num_train_epochs"] = int(params["num_train_epochs"]) - params["per_device_train_batch_size"] = int(params["per_device_train_batch_size"]) - params["learning_rate"] = float(params["learning_rate"]) + for name in [ + "gradient_accumulation_steps", + "num_train_epochs", + "save_steps", + "eval_steps", + "per_device_train_batch_size", + "per_device_eval_batch_size", + ]: + params[name] = int(params[name]) + for name in ["learning_rate", "weight_decay", "max_grad_norm"]: + params[name] = float(params[name]) + return params -def get_datasets(dataset_list: List[AnyStr]): - from rank_datasets import GPTJSynthetic, HFSummary, WebGPT +def get_datasets(dataset_list: List[AnyStr], tokenizer): + from rank_datasets import AnthropicRLHF, 
GPTJSynthetic, HFSummary, WebGPT from torch.utils.data import ConcatDataset train_datasets, evals = [], {} @@ -121,13 +136,10 @@ def get_datasets(dataset_list: List[AnyStr]): train, eval = train_val_dataset(dataset, 0.1) train_datasets.append(train) evals["gptsynthetic"] = eval + elif "anthropic_rlhf" == dataset_name: + train = AnthropicRLHF("train", tokenizer.sep_token) + eval = AnthropicRLHF("test", tokenizer.sep_token) + train_datasets.append(train) + evals["anthropic_rlhf"] = eval train = ConcatDataset(train_datasets) return train, evals - - -if __name__ == "__main__": - from transformers import AutoModelForSequenceClassification - - model = AutoModelForSequenceClassification.from_pretrained("bigscience/bloomz-560m") - freeze_top_n_layers(model, 10) - print(model.state_dict().keys())