From 967eab4cfb71ac41e51f77b30123d6858c880328 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Wed, 8 Nov 2023 13:21:57 +0000
Subject: [PATCH 01/30] Add skeleton

---
 Makefile                             |  10 +-
 recipes/dpo/.gitkeep                 |   0
 recipes/ppo/.gitkeep                 |   0
 recipes/reward_modeling/.gitkeep     |   0
 recipes/sft/.gitkeep                 |   0
 scripts/run_dpo.py                   | 235 +++++++++++++++++++++++
 scripts/run_sft.py                   | 198 +++++++++++++++++++
 setup.py                             |  14 +-
 src/alignment/__init__.py            |   4 +
 src/alignment/configs.py             | 272 +++++++++++++++++++++++++++
 src/alignment/data.py                | 171 +++++++++++++++++
 src/alignment/model_utils.py         |  79 ++++++++
 src/alignment/{utils => }/release.py |   0
 13 files changed, 971 insertions(+), 12 deletions(-)
 delete mode 100644 recipes/dpo/.gitkeep
 delete mode 100644 recipes/ppo/.gitkeep
 delete mode 100644 recipes/reward_modeling/.gitkeep
 delete mode 100644 recipes/sft/.gitkeep
 create mode 100644 scripts/run_dpo.py
 create mode 100644 scripts/run_sft.py
 create mode 100644 src/alignment/configs.py
 create mode 100644 src/alignment/data.py
 create mode 100644 src/alignment/model_utils.py
 rename src/alignment/{utils => }/release.py (100%)

diff --git a/Makefile b/Makefile
index 553edb1..2d82400 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src
 
-check_dirs := src tests
+check_dirs := src tests scripts
 
 style:
 	python -m black --line-length 119 --target-version py310 $(check_dirs) setup.py
@@ -18,16 +18,16 @@ quality:
 # Release stuff
 
 pre-release:
-	python src/alignment/utils/release.py
+	python src/alignment/release.py
 
 pre-patch:
-	python src/alignment/utils/release.py --patch
+	python src/alignment/release.py --patch
 
 post-release:
-	python src/alignment/utils/release.py --post_release
+	python src/alignment/release.py --post_release
 
 post-patch:
-	python src/alignment/utils/release.py --post_release --patch
+	python src/alignment/release.py --post_release --patch
 
 wheels:
 	python setup.py bdist_wheel && python setup.py sdist
diff --git a/recipes/dpo/.gitkeep b/recipes/dpo/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/recipes/ppo/.gitkeep b/recipes/ppo/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/recipes/reward_modeling/.gitkeep b/recipes/reward_modeling/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/recipes/sft/.gitkeep b/recipes/sft/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py
new file mode 100644
index 0000000..b6f1cba
--- /dev/null
+++ b/scripts/run_dpo.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import random
+import subprocess
+import sys
+from datetime import timedelta
+
+import torch
+import transformers
+from transformers import set_seed
+
+import wandb
+from accelerate import Accelerator, InitProcessGroupKwargs
+from h4.data import get_datasets
+from h4.training import DataArguments, DPOTrainingArguments, ModelArguments, init_wandb_training
+from h4.utils import (
+    H4ArgumentParser,
+    apply_chat_template,
+    convert_to_safetensors,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+    get_tokenizer,
+    hf_login,
+    is_slurm_available,
+    push_to_hub_revision,
+    run_mt_bench_job,
+)
+from trl import DPOTrainer
+
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Run DPO training: parse the configs, format the preference data, train, evaluate and save the model."""
+    parser = H4ArgumentParser((ModelArguments, DataArguments, DPOTrainingArguments))
+    model_args, data_args, training_args = parser.parse()
+
+    #######
+    # Setup
+    #######
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+
+    # Log a small summary of the parsed arguments on each process:
+    logger.info(f"Model parameters {model_args}")
+    logger.info(f"Data parameters {data_args}")
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    # Setup WandB
+    if training_args.wandb_enabled:
+        init_wandb_training(training_args)
+
+    # Login to HuggingFace Hub if needed
+    hf_login()
+
+    # Set seed for reproducibility
+    set_seed(training_args.seed)
+
+    # Increase distributed timeout to 3h to enable push to Hub to complete
+    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=6 * 1800))])
+
+    ###############
+    # Load datasets
+    ###############
+    raw_datasets = get_datasets(data_args, splits=data_args.dataset_splits)
+    logger.info(
+        f"Training on the following splits: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
+    )
+    column_names = list(raw_datasets["train"].features)
+
+    #####################################
+    # Load tokenizer and process datasets
+    #####################################
+    data_args.truncation_side = "left"  # Truncate from left to ensure we don't lose labels in final turn
+    tokenizer = get_tokenizer(model_args, data_args)
+
+    #####################
+    # Apply chat template
+    #####################
+    raw_datasets = raw_datasets.map(
+        apply_chat_template,
+        fn_kwargs={"tokenizer": tokenizer, "task": "dpo"},
+        num_proc=data_args.preprocessing_num_workers,
+        remove_columns=column_names,
+        desc="Formatting comparisons with prompt template",
+    )
+
+    # Rename columns to what TRL needs: text_prompt -> prompt, text_chosen -> chosen, text_rejected -> rejected
+    for split in ["train", "test"]:
+        raw_datasets[split] = raw_datasets[split].rename_columns(
+            {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
+        )
+
+    # Log a few random samples from the training set (capped so datasets with fewer than 3 rows don't crash):
+    num_samples = min(3, len(raw_datasets["train"]))
+    for index in random.sample(range(len(raw_datasets["train"])), num_samples):
+        logger.info(f"Prompt sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['prompt']}")
+        logger.info(f"Chosen sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['chosen']}")
+        logger.info(f"Rejected sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['rejected']}")
+
+    torch_dtype = (
+        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
+    )
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        use_flash_attention_2=model_args.use_flash_attention_2,
+        torch_dtype=torch_dtype,
+        use_cache=not training_args.gradient_checkpointing,
+        device_map=get_kbit_device_map(),
+        quantization_config=get_quantization_config(model_args),
+    )
+
+    # No separate reference model (or init kwargs for it) is passed to the trainer when training with PEFT
+    ref_model = None if model_args.use_peft else model_args.model_name_or_path
+    ref_model_kwargs = None if model_args.use_peft else model_kwargs
+
+    #########################
+    # Instantiate DPO trainer
+    #########################
+    dpo_trainer = DPOTrainer(
+        model_args.model_name_or_path,
+        ref_model,
+        model_init_kwargs=model_kwargs,
+        ref_model_init_kwargs=ref_model_kwargs,
+        args=training_args,
+        beta=training_args.beta,
+        train_dataset=raw_datasets["train"],
+        eval_dataset=raw_datasets["test"],
+        tokenizer=tokenizer,
+        max_length=training_args.max_seq_length,
+        max_prompt_length=training_args.max_prompt_length,
+        peft_config=get_peft_config(model_args),
+    )
+
+    ###############
+    # Training loop
+    ###############
+    train_result = dpo_trainer.train()
+    metrics = train_result.metrics
+    max_train_samples = (
+        data_args.max_train_samples if data_args.max_train_samples is not None else len(raw_datasets["train"])
+    )
+    metrics["train_samples"] = min(max_train_samples, len(raw_datasets["train"]))
+    dpo_trainer.log_metrics("train", metrics)
+    dpo_trainer.save_metrics("train", metrics)
+    dpo_trainer.save_state()
+
+    logger.info("*** Training complete ***")
+
+    ##########
+    # Evaluate
+    ##########
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+        metrics = dpo_trainer.evaluate(eval_dataset=raw_datasets["test"])
+        max_eval_samples = (
+            data_args.max_eval_samples if data_args.max_eval_samples is not None else len(raw_datasets["test"])
+        )
+        metrics["eval_samples"] = min(max_eval_samples, len(raw_datasets["test"]))
+        dpo_trainer.log_metrics("eval", metrics)
+        dpo_trainer.save_metrics("eval", metrics)
+
+    ##################################
+    # Save model and create model card
+    ##################################
+    dpo_trainer.save_model(training_args.output_dir)
+
+    # Save everything else on main process
+    if accelerator.is_main_process:
+        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
+        kwargs["dataset"] = list(data_args.dataset_mixer.keys())
+        dpo_trainer.create_model_card(**kwargs)
+        # Restore k,v cache for fast inference
+        dpo_trainer.model.config.use_cache = True
+        # Fix custom code paths
+        if model_args.trust_remote_code is True:
+            auto_map = dpo_trainer.model.config.auto_map
+            dpo_trainer.model.config.auto_map = {k: v.split("--")[-1] for k, v in auto_map.items()}
+        dpo_trainer.model.config.save_pretrained(training_args.output_dir)
+        # FSDP/DeepSpeed save the model as a single `pytorch_model.bin` file, so we need to shard it.
+        # We run this in a subprocess to avoid interference from the accelerators.
+        # `sys.executable` keeps the subprocess on the interpreter (and environment) running this script.
+        subprocess.run(
+            [
+                sys.executable,
+                "scripts/training/shard_checkpoint.py",
+                f"--output_dir={training_args.output_dir}",
+                f"--trust_remote_code={model_args.trust_remote_code}",
+            ],
+            check=True,
+        )
+        # Convert torch weights to safetensors for deployment with TGI
+        convert_to_safetensors(training_args.output_dir)
+        if training_args.push_to_hub_revision:
+            is_model_on_hub = push_to_hub_revision(training_args, model_args)
+            # Run automatic evaluation once the model is pushed to the Hub
+            if is_slurm_available() and is_model_on_hub is True and training_args.do_eval is True:
+                logger.info("*** Launching MT Bench ***")
+                run_mt_bench_job(training_args, model_args)
+
+    # Ensure we don't timeout on model save / push to Hub
+    logger.info("*** Waiting for all processes to finish ***")
+    accelerator.wait_for_everyone()
+    wandb.finish()
+
+    logger.info("*** Run complete! ***")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_sft.py b/scripts/run_sft.py
new file mode 100644
index 0000000..580916b
--- /dev/null
+++ b/scripts/run_sft.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Supervised fine-tuning script for decoder language models.
+"""
+
+import logging
+import math
+import random
+import sys
+
+import datasets
+import torch
+import transformers
+from transformers import set_seed
+
+from accelerate import Accelerator
+from alignment import (
+    DataArguments,
+    H4ArgumentParser,
+    ModelArguments,
+    SFTConfig,
+    apply_chat_template,
+    get_datasets,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+    get_tokenizer,
+)
+from trl import SFTTrainer
+
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Run supervised fine-tuning: parse the configs, apply the chat template, train, evaluate and save."""
+    parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))
+    model_args, data_args, training_args = parser.parse()
+
+    # Set seed for reproducibility
+    set_seed(training_args.seed)
+    accelerator = Accelerator()
+
+    ###############
+    # Setup logging
+    ###############
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+
+    # Log on each process a small summary
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.bf16}"
+    )
+    logger.info(f"Model parameters {model_args}")
+    logger.info(f"Data parameters {data_args}")
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    ###############
+    # Load datasets
+    ###############
+    raw_datasets = get_datasets(data_args, splits=data_args.dataset_splits)
+    logger.info(
+        f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
+    )
+    with training_args.main_process_first(desc="Log a few random samples from the raw training set"):
+        for index in random.sample(range(len(raw_datasets["train"])), min(3, len(raw_datasets["train"]))):
+            logger.info(f"Sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['messages']}")
+
+    #####################################
+    # Load tokenizer and process datasets
+    #####################################
+    tokenizer = get_tokenizer(model_args, data_args)
+
+    #####################
+    # Apply chat template
+    #####################
+    raw_datasets = raw_datasets.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer, "task": "sft"})
+    train_dataset = raw_datasets["train"]
+    eval_dataset = raw_datasets["test"]
+
+    with training_args.main_process_first(desc="Log a few random samples from the processed training set"):
+        for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
+            logger.info(f"Sample {index} of the processed training set:\n\n{train_dataset[index]['text']}")
+
+    #######################
+    # Load pretrained model
+    #######################
+    logger.info("*** Load pretrained model ***")
+    torch_dtype = (
+        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
+    )
+
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        use_flash_attention_2=model_args.use_flash_attention_2,
+        torch_dtype=torch_dtype,
+        use_cache=not training_args.gradient_checkpointing,
+        device_map=get_kbit_device_map(),
+        quantization_config=get_quantization_config(model_args),
+    )
+    logger.info("*** Model loaded! ***")
+
+    ########################
+    # Initialize the Trainer
+    ########################
+    trainer = SFTTrainer(
+        model=model_args.model_name_or_path,
+        model_init_kwargs=model_kwargs,
+        args=training_args,
+        train_dataset=train_dataset if training_args.do_train else None,
+        eval_dataset=eval_dataset if training_args.do_eval else None,
+        dataset_text_field="text",
+        max_seq_length=training_args.max_seq_length,
+        tokenizer=tokenizer,
+        packing=True,
+        peft_config=get_peft_config(model_args),
+    )
+
+    ###############
+    # Training loop
+    ###############
+    if training_args.do_train:
+        logger.info("*** Train ***")
+        train_result = trainer.train()
+        metrics = train_result.metrics
+        max_train_samples = (
+            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+        )
+        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+
+    ##########
+    # Evaluate
+    ##########
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+        metrics = trainer.evaluate()
+        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+        try:
+            perplexity = math.exp(metrics["eval_loss"])
+        except OverflowError:
+            perplexity = float("inf")
+        metrics["perplexity"] = perplexity
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    ##################################
+    # Save model and create model card
+    ##################################
+    logger.info("*** Save model ***")
+    trainer.save_model(training_args.output_dir)
+    logger.info(f"Model saved to {training_args.output_dir}")
+
+    # Save everything else on main process
+    if accelerator.is_main_process:
+        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
+        kwargs["dataset"] = list(data_args.dataset_mixer.keys())
+        trainer.create_model_card(**kwargs)
+        # Restore k,v cache for fast inference
+        trainer.model.config.use_cache = True
+        trainer.model.config.save_pretrained(training_args.output_dir)
+
+        if training_args.push_to_hub:
+            trainer.push_to_hub()
+
+    accelerator.wait_for_everyone()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index 3ef9c89..dad5141 100644
--- a/setup.py
+++ b/setup.py
@@ -45,25 +45,25 @@ _deps = [
     "bitsandbytes==0.41.1",
     "black==23.1.0",
     "datasets==2.12.0",
-    "deepspeed==0.9.5",
-    "einops==0.6.1",
+    "deepspeed==0.12.2",
+    "einops>=0.6.1",
     "evaluate==0.4.0",
     "flake8>=6.0.0",
     "hf-doc-builder>=0.4.0",
     "huggingface-hub>=0.14.1,<1.0",
     "isort>=5.12.0",
-    "ninja==1.11.1",
+    "ninja>=1.11.1",
     "numpy>=1.24.2",
     "packaging>=23.0",
     "parameterized>=0.9.0",
-    "peft==0.5.0",
+    "peft==0.6.0",
     "protobuf<=3.20.2",  # Needed to avoid conflicts with `transformers`
     "pytest",
-    "safetensors==0.3.3",
+    "safetensors>=0.3.3",
     "tensorboard",
     "torch==2.0.1",
-    "transformers @ git+https://github.com/huggingface/transformers.git@b3961f7291307ee877ef1a4d057949597d805220",
-    "trl @ git+https://github.com/huggingface/trl.git@1e56ff0f166888973d69cd9d56be60a9f8edfedb",  # TODO bump to next release, added for NEFTune
+    "transformers==4.35.0",
+    "trl==0.7.4",  # TODO bump to next release, added for NEFTune
     "tqdm>=4.64.1",
 ]
 
diff --git a/src/alignment/__init__.py b/src/alignment/__init__.py
index b9d465b..3080b6a 100644
--- a/src/alignment/__init__.py
+++ b/src/alignment/__init__.py
@@ -1 +1,5 @@
 __version__ = "0.2.0.dev0"
+
+from .configs import DataArguments, DPOConfig, H4ArgumentParser, ModelArguments, SFTConfig
+from .data import apply_chat_template, get_datasets
+from .model_utils import get_kbit_device_map, get_peft_config, get_quantization_config, get_tokenizer
diff --git a/src/alignment/configs.py b/src/alignment/configs.py
new file mode 100644
index 0000000..890790e
--- /dev/null
+++ b/src/alignment/configs.py
@@ -0,0 +1,272 @@
+# coding=utf-8
+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import dataclasses
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, NewType, Optional, Tuple, Union
+
+import transformers
+from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, HfArgumentParser
+
+
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+DataClassType = NewType("DataClassType", Any)
+
+
+class H4ArgumentParser(HfArgumentParser):
+    """`HfArgumentParser` that also accepts a YAML config file, optionally followed by `--key=value` overrides."""
+
+    def parse_yaml_and_args(self, yaml_arg: str, other_args: Optional[List[str]] = None) -> List[dataclass]:
+        """
+        Parse a YAML file and overwrite the default/loaded values with the values provided to the command line.
+
+        Args:
+            yaml_arg (`str`):
+                The path to the config file used
+            other_args (`List[str]`, *optional*):
+                A list of strings to parse as command line arguments, e.g. ['--arg=val', '--arg2=val2'].
+
+        Returns:
+            [`List[dataclass]`]: a list of dataclasses with the values from the YAML file and the command line
+        """
+        arg_list = self.parse_yaml_file(os.path.abspath(yaml_arg))
+
+        outputs = []
+        # strip other args list into dict of key-value pairs; split on the first "=" only so values may contain "="
+        other_args = {arg.split("=", 1)[0].strip("-"): arg.split("=", 1)[1] for arg in (other_args or [])}
+        used_args = {}
+
+        # overwrite the default/loaded value with the value provided to the command line
+        # adapted from https://github.com/huggingface/transformers/blob/d0b5002378daabf62769159add3e7d66d3f83c3b/src/transformers/hf_argparser.py#L327
+        for data_yaml, data_class in zip(arg_list, self.dataclass_types):
+            keys = {f.name for f in dataclasses.fields(data_yaml) if f.init}
+            inputs = {k: v for k, v in vars(data_yaml).items() if k in keys}
+            for arg, val in other_args.items():
+                # add only if in keys
+                if arg in keys:
+                    base_type = data_yaml.__dataclass_fields__[arg].type
+                    inputs[arg] = val
+
+                    # cast type for ints, floats (default to strings)
+                    # NOTE: only exact `int`/`float` annotations match here; `Optional[int]` etc. stay strings
+                    if base_type in [int, float]:
+                        inputs[arg] = base_type(val)
+
+                    if base_type == List[str]:
+                        inputs[arg] = [str(v) for v in val.split(",")]
+
+                    # bool of a non-empty string is True, so we manually check for bools
+                    if base_type == bool:
+                        inputs[arg] = val in ["true", "True"]
+
+                    # add to used-args so we can check if double add
+                    if arg not in used_args:
+                        used_args[arg] = val
+                    else:
+                        raise ValueError(f"Duplicate argument provided: {arg}, may cause unexpected behavior")
+
+            obj = data_class(**inputs)
+            outputs.append(obj)
+
+        return outputs
+
+    def parse(self) -> Union[DataClassType, Tuple[DataClassType]]:
+        """Parse `sys.argv`: a YAML file, a YAML file plus CLI overrides, or CLI arguments only."""
+        if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"):
+            # If we pass only one argument to the script and it's the path to a YAML file,
+            # let's parse it to get our arguments.
+            output = self.parse_yaml_file(os.path.abspath(sys.argv[1]))
+        # parse command line args and yaml file
+        elif len(sys.argv) > 2 and sys.argv[1].endswith(".yaml"):
+            output = self.parse_yaml_and_args(os.path.abspath(sys.argv[1]), sys.argv[2:])
+        # parse command line args only
+        else:
+            output = self.parse_args_into_dataclasses()
+
+        # unwrap the result when it holds a single dataclass instance
+        return output[0] if len(output) == 1 else output
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune.
+    """
+
+    base_model_revision: Optional[str] = field(
+        default=None,
+        metadata={"help": ("The base model checkpoint for weights initialization with PEFT adapters.")},
+    )
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
+            )
+        },
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    model_code_revision: Optional[str] = field(default=None, metadata={"help": "The branch of the IFT model"})
+    torch_dtype: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
+                "dtype will be automatically derived from the model's weights."
+            ),
+            "choices": ["auto", "bfloat16", "float16", "float32"],
+        },
+    )
+    trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."})
+    use_flash_attention_2: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to use flash attention 2. You must install this manually by running `pip install flash-attn --no-build-isolation`"
+            )
+        },
+    )
+    use_peft: bool = field(
+        default=False,
+        metadata={"help": ("Whether to use PEFT or not for training.")},
+    )
+    lora_r: Optional[int] = field(
+        default=16,
+        metadata={"help": ("LoRA R value.")},
+    )
+    lora_alpha: Optional[int] = field(
+        default=32,
+        metadata={"help": ("LoRA alpha.")},
+    )
+    lora_dropout: Optional[float] = field(
+        default=0.05,
+        metadata={"help": ("LoRA dropout.")},
+    )
+    lora_target_modules: Optional[List[str]] = field(
+        default=None,
+        metadata={"help": ("LoRA target modules.")},
+    )
+    lora_modules_to_save: Optional[List[str]] = field(
+        default=None,
+        metadata={"help": ("Model layers to unfreeze & train")},
+    )
+    load_in_8bit: bool = field(default=False, metadata={"help": "use 8 bit precision"})
+    load_in_4bit: bool = field(default=False, metadata={"help": "use 4 bit precision"})
+
+    bnb_4bit_quant_type: Optional[str] = field(
+        default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"}
+    )
+    use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"})
+
+    def __post_init__(self):
+        # 8-bit and 4-bit loading are mutually exclusive
+        if self.load_in_8bit and self.load_in_4bit:
+            raise ValueError("You can't use 8 bit and 4 bit precision at the same time")
+
+
+@dataclass
+class DataArguments:
+    """
+    Arguments pertaining to the data used for training and evaluation (datasets, splits, sample caps, preprocessing).
+    """
+
+    chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."})
+    dataset_mixer: Optional[Dict[str, float]] = field(
+        default=None,
+        metadata={"help": ("Datasets and their proportions to be used for training ift/rl.")},
+    )
+    dataset_splits: Optional[List[str]] = field(
+        default_factory=lambda: ["train", "test"],
+        metadata={"help": ("List of train test splits to use in the dataset")},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+                "value if set."
+            )
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    truncation_side: Optional[str] = field(
+        default=None, metadata={"help": "Truncation side to use for the tokenizer."}
+    )
+
+
+@dataclass
+class SFTConfig(transformers.TrainingArguments):
+    """
+    Training arguments for supervised fine-tuning; extends `transformers.TrainingArguments`. For the inherited parameters, see: https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
+    """
+
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={"help": ("The maximum sequence length, passed to TRL's `SFTTrainer` as `max_seq_length`.")},
+    )
+    logging_first_step: bool = field(
+        default=True,
+        metadata={"help": ("Whether to log and evaluate the first global_step or not.")},
+    )
+    optim: Optional[str] = field(default="adamw_torch")
+
+
+@dataclass
+class DPOConfig(transformers.TrainingArguments):
+    """
+    Training arguments for DPO; extends `transformers.TrainingArguments` with the DPO-specific fields below. For the inherited parameters, see: https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
+    """
+
+    beta: Optional[float] = field(
+        default=0.1,
+        metadata={"help": "The beta factor in DPO loss. Higher beta means less divergence from the initial policy."},
+    )
+    hub_model_revision: Optional[str] = field(
+        default="main",
+        metadata={"help": ("The Hub model branch to push the model to.")},
+    )
+    logging_first_step: bool = field(
+        default=True,
+        metadata={"help": ("Whether to log and evaluate the first global_step or not.")},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={"help": ("For DPO, the maximum length of the prompt to use for conditioning the model.")},
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": ("Used by TRL for reward model training, which tries to read this parameter in init.")},
+    )
+    optim: Optional[str] = field(default="rmsprop")
+    remove_unused_columns: bool = field(default=False)
diff --git a/src/alignment/data.py b/src/alignment/data.py
new file mode 100644
index 0000000..de25e61
--- /dev/null
+++ b/src/alignment/data.py
@@ -0,0 +1,171 @@
+import re
+from typing import List, Literal, Optional, Union
+
+from datasets import DatasetDict, concatenate_datasets, load_dataset
+
+from .configs import DataArguments
+
+
+DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+
+
+def apply_chat_template(
+    example, tokenizer, task: Literal["sft", "generation", "rm", "dpo"] = "sft", assistant_prefix="<|assistant|>\n"
+):
+    """Render the conversation(s) in `example` to text with the tokenizer's chat template and return `example`.
+
+    Writes `text` for `sft`/`generation`, `text_chosen`/`text_rejected` for `rm`, and those two keys plus
+    `text_prompt` for `dpo`, where a leading `assistant_prefix` is stripped from the chosen/rejected text.
+    For `sft`, `generation` and `rm`, a missing system message is inserted into the message lists in place.
+    Raises `ValueError` if `chosen`/`rejected` are missing for `rm`/`dpo`, or if `task` is not supported.
+    """
+    if task in ["sft", "generation"]:
+        messages = example["messages"]
+        # We add an empty system message if there is none
+        if messages[0]["role"] != "system":
+            messages.insert(0, {"role": "system", "content": ""})
+        example["text"] = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True if task == "generation" else False
+        )
+    elif task == "rm":
+        if all(k in example.keys() for k in ("chosen", "rejected")):
+            chosen_messages = example["chosen"]
+            rejected_messages = example["rejected"]
+            # We add an empty system message if there is none
+            if chosen_messages[0]["role"] != "system":
+                chosen_messages.insert(0, {"role": "system", "content": ""})
+            if rejected_messages[0]["role"] != "system":
+                rejected_messages.insert(0, {"role": "system", "content": ""})
+            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
+            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
+        else:
+            raise ValueError(
+                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
+            )
+    elif task == "dpo":
+        if all(k in example.keys() for k in ("chosen", "rejected")):
+            # Compared to reward modeling, we filter out the prompt, so the text is everything after the last assistant token
+            prompt_messages = [[msg for msg in example["chosen"] if msg["role"] == "user"][0]]
+            # Reuse the existing system message for the prompt, or insert an empty one if there is none
+            has_system = example["chosen"][0]["role"] == "system"
+            prompt_messages.insert(0, example["chosen"][0] if has_system else {"role": "system", "content": ""})
+            # TODO: handle case where chosen/rejected also have system messages
+            for key, messages in (("text_chosen", example["chosen"][1:]), ("text_rejected", example["rejected"][1:])):
+                text = tokenizer.apply_chat_template(messages, tokenize=False)
+                # Strip a leading assistant prefix; re.escape guards against special characters in the prefix
+                example[key] = re.sub(f"^{re.escape(assistant_prefix)}", "", text)
+            example["text_prompt"] = tokenizer.apply_chat_template(
+                prompt_messages, tokenize=False, add_generation_prompt=True
+            )
+        else:
+            raise ValueError(
+                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
+            )
+    else:
+        raise ValueError(f"Task {task!r} not supported; expected one of 'sft', 'generation', 'rm' or 'dpo'.")
+    return example
+
+
+def get_datasets(
+    data_config: Union[DataArguments, dict],
+    splits: List[str] = ["train", "test"],
+    shuffle: bool = True,
+) -> DatasetDict:
+    """
+    Loads one or more datasets with varying training set proportions.
+
+    Args:
+        data_config (`DataArguments` or `dict`):
+            Dataset configuration and split proportions.
+        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
+            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
+        shuffle (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the training data.
+
+    Returns:
+        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.
+
+    Raises:
+        ValueError: If `data_config` is not exactly a `DataArguments` or a `dict` (subclasses are rejected).
+    """
+
+    if type(data_config) is DataArguments:
+        # Structure of the config to read the datasets and their mix
+        # dataset_mixer:
+        #     dataset1: 0.5
+        #     dataset2: 0.3
+        #     dataset3: 0.2
+        dataset_mixer = data_config.dataset_mixer
+    elif type(data_config) is dict:
+        # Structure of the input is:
+        #     dataset_mixer = {"dataset1": 0.5, "dataset2": 0.3, "dataset3": 0.2}
+        # i.e. the mapping is passed directly instead of via `DataArguments.dataset_mixer`
+        dataset_mixer = data_config
+    else:
+        raise ValueError(f"Data config {data_config} not recognized.")
+
+    raw_datasets = mix_datasets(dataset_mixer, splits=splits, shuffle=shuffle)
+    return raw_datasets
+
+
+def mix_datasets(dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle=True) -> DatasetDict:
+    """
+    Loads and mixes datasets according to proportions specified in `dataset_mixer`.
+
+    Args:
+        dataset_mixer (`dict`):
+            Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1.
+        splits (Optional[List[str]], *optional*, defaults to `None`):
+            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
+            If `None`, defaults to `['train', 'test']`.
+        shuffle (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the mixed train and test data (a fixed seed of 42 is used).
+
+    Returns:
+        [`DatasetDict`]: The mixed datasets under the `train` and/or `test` keys.
+
+    Raises:
+        ValueError: If a fraction is negative, a split is neither a train nor a test split, or nothing was loaded.
+    """
+    # Fall back to the same default splits as `get_datasets` so that `splits=None` is usable
+    splits = ["train", "test"] if splits is None else splits
+
+    # Validate the fractions before loading anything
+    if any(frac < 0 for frac in dataset_mixer.values()):
+        raise ValueError("Dataset fractions cannot be negative.")
+
+    raw_datasets = DatasetDict()
+    train_subsets = []
+    raw_val_datasets = []
+    for ds, frac in dataset_mixer.items():
+        for split in splits:
+            if "train" in split:
+                dataset = load_dataset(ds, split=split)
+                # Subsample right away so each train split is paired with the fraction of its own dataset
+                train_subsets.append(dataset.select(range(int(frac * len(dataset)))))
+            elif "test" in split:
+                # No subsampling for test datasets to enable fair comparison across models
+                raw_val_datasets.append(load_dataset(ds, split=split))
+            else:
+                raise ValueError(f"Split type {split} not recognized as one of test or train.")
+
+    # Only the `train`/`test` keys for which at least one split was loaded are created
+    if len(train_subsets) > 0:
+        if shuffle:
+            raw_datasets["train"] = concatenate_datasets(train_subsets).shuffle(seed=42)
+        else:
+            raw_datasets["train"] = concatenate_datasets(train_subsets)
+
+    if len(raw_val_datasets) > 0:
+        if shuffle:
+            raw_datasets["test"] = concatenate_datasets(raw_val_datasets).shuffle(seed=42)
+        else:
+            raw_datasets["test"] = concatenate_datasets(raw_val_datasets)
+
+    # Report `splits` rather than the loop variable, which is unbound when the loops never ran
+    if len(raw_datasets) == 0:
+        raise ValueError(
+            f"Dataset {dataset_mixer} not recognized with splits {splits}. Check the dataset has been correctly formatted."
+        )
+
+    return raw_datasets
diff --git a/src/alignment/model_utils.py b/src/alignment/model_utils.py
new file mode 100644
index 0000000..a01f1b4
--- /dev/null
+++ b/src/alignment/model_utils.py
@@ -0,0 +1,79 @@
+from typing import Dict, Union
+
+import torch
+from transformers import AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizer
+
+from accelerate import Accelerator
+from peft import LoraConfig, PeftConfig
+
+from .configs import DataArguments, ModelArguments
+from .data import DEFAULT_CHAT_TEMPLATE
+
+
+def get_current_device() -> int | str:
+    """Get the current device: the local process index on GPU (to enable multi-GPU training), or `"cpu"` otherwise."""
+    return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"
+
+
+def get_kbit_device_map() -> Dict[str, int] | None:
+    """Useful for running inference with quantized models by setting `device_map=get_kbit_device_map()`"""
+    return {"": get_current_device()} if torch.cuda.is_available() else None
+
+
+def get_quantization_config(model_args) -> BitsAndBytesConfig | None:
+    """Build the bitsandbytes config requested by `model_args`: 4-bit if set, else 8-bit if set, else `None`."""
+    if model_args.load_in_4bit:
+        return BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,  # For consistency with model weights, we use the same value as `torch_dtype` which is float16 for PEFT models
+            bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
+            bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
+        )
+
+    if model_args.load_in_8bit:
+        return BitsAndBytesConfig(
+            load_in_8bit=True,
+        )
+
+    return None
+
+
+def get_tokenizer(model_args: ModelArguments, data_args: DataArguments) -> PreTrainedTokenizer:
+    """Load the tokenizer for the model and fill in a pad token, truncation side, max length and chat template."""
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.model_name_or_path,
+        revision=model_args.model_revision,
+    )
+    if tokenizer.pad_token_id is None:  # fall back to the EOS token for padding
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+
+    if data_args.truncation_side is not None:
+        tokenizer.truncation_side = data_args.truncation_side
+
+    # Treat a `model_max_length` above 100k as unset and use 2048 as a reasonable default
+    if tokenizer.model_max_length > 100_000:
+        tokenizer.model_max_length = 2048
+
+    if data_args.chat_template is not None:  # an explicitly configured template always wins
+        tokenizer.chat_template = data_args.chat_template
+    elif tokenizer.chat_template is None:  # otherwise only fill in a template if the tokenizer has none
+        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
+
+    return tokenizer
+
+
+def get_peft_config(model_args: ModelArguments) -> Union[PeftConfig, None]:
+    """Return the LoRA config built from `model_args`, or `None` when `model_args.use_peft` is `False`."""
+    if model_args.use_peft is False:
+        return None
+
+    # The rank, alpha, dropout and module lists come from `model_args`; bias and task type are fixed here
+    return LoraConfig(
+        r=model_args.lora_r,
+        lora_alpha=model_args.lora_alpha,
+        lora_dropout=model_args.lora_dropout,
+        bias="none",
+        task_type="CAUSAL_LM",
+        target_modules=model_args.lora_target_modules,
+        modules_to_save=model_args.lora_modules_to_save,
+    )
diff --git a/src/alignment/utils/release.py b/src/alignment/release.py
similarity index 100%
rename from src/alignment/utils/release.py
rename to src/alignment/release.py

From d2900adc83134cf86d51b74ffda0d26d120b324d Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Wed, 8 Nov 2023 16:31:57 +0000
Subject: [PATCH 02/30] Make it work!

---
 .gitignore                                    |  3 ++
 README.md                                     | 14 +++++--
 .../accelerate_configs/deepspeed_zero1.yaml   | 19 +++++++++
 .../accelerate_configs/deepspeed_zero2.yaml   | 21 ++++++++++
 .../accelerate_configs/deepspeed_zero3.yaml   | 22 ++++++++++
 recipes/accelerate_configs/multi_gpu.yaml     | 16 +++++++
 recipes/zephyr-7b/sft/config.yaml             | 41 ++++++++++++++++++
 scripts/README.md                             |  6 +++
 scripts/run_sft.py                            | 42 +++++++++----------
 setup.py                                      |  6 +--
 10 files changed, 160 insertions(+), 30 deletions(-)
 create mode 100644 recipes/accelerate_configs/deepspeed_zero1.yaml
 create mode 100644 recipes/accelerate_configs/deepspeed_zero2.yaml
 create mode 100644 recipes/accelerate_configs/deepspeed_zero3.yaml
 create mode 100644 recipes/accelerate_configs/multi_gpu.yaml
 create mode 100644 recipes/zephyr-7b/sft/config.yaml
 create mode 100644 scripts/README.md

diff --git a/.gitignore b/.gitignore
index 2dc53ca..d4de801 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,3 +158,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+# Temp checkpoint folder
+data/
\ No newline at end of file
diff --git a/README.md b/README.md
index af1a003..6f7c61e 100644
--- a/README.md
+++ b/README.md
@@ -32,13 +32,19 @@ To run the code in this project, first create a Python virtual environment using
 conda create -n handbook python=3.10 && conda activate handbook
 ```
 
-Next, install PyTorch v2.1.0. Since this hardware-dependent, we
-direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
+Next, install PyTorch `v2.0.1` - the precise version is important for reproducibility! Since this is hardware-dependent, we
+direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/previous-versions/#v201).
 
-Once PyTorch is installed, you can install the remaining package dependencies as follows:
+You can then install the remaining package dependencies as follows:
 
 ```shell
-pip install .
+python -m pip install .
+```
+
+You will also need Flash Attention 2 installed, which can be done by running:
+
+```shell
+python -m pip install flash-attn==2.3.0 --no-build-isolation
 ```
 
 Next, log into your Hugging Face account as follows:
diff --git a/recipes/accelerate_configs/deepspeed_zero1.yaml b/recipes/accelerate_configs/deepspeed_zero1.yaml
new file mode 100644
index 0000000..1dfeda0
--- /dev/null
+++ b/recipes/accelerate_configs/deepspeed_zero1.yaml
@@ -0,0 +1,19 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  zero3_init_flag: false
+  zero_stage: 1
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/recipes/accelerate_configs/deepspeed_zero2.yaml b/recipes/accelerate_configs/deepspeed_zero2.yaml
new file mode 100644
index 0000000..0777900
--- /dev/null
+++ b/recipes/accelerate_configs/deepspeed_zero2.yaml
@@ -0,0 +1,21 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/recipes/accelerate_configs/deepspeed_zero3.yaml b/recipes/accelerate_configs/deepspeed_zero3.yaml
new file mode 100644
index 0000000..b5a1201
--- /dev/null
+++ b/recipes/accelerate_configs/deepspeed_zero3.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/recipes/accelerate_configs/multi_gpu.yaml b/recipes/accelerate_configs/multi_gpu.yaml
new file mode 100644
index 0000000..4f05571
--- /dev/null
+++ b/recipes/accelerate_configs/multi_gpu.yaml
@@ -0,0 +1,16 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/recipes/zephyr-7b/sft/config.yaml b/recipes/zephyr-7b/sft/config.yaml
new file mode 100644
index 0000000..e7e786a
--- /dev/null
+++ b/recipes/zephyr-7b/sft/config.yaml
@@ -0,0 +1,41 @@
+# Model arguments
+model_name_or_path: mistralai/Mistral-7B-v0.1
+model_revision: main
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+evaluation_strategy: epoch
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5  
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 1
+output_dir: data/zephyr-7b-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 16
+per_device_train_batch_size: 32
+push_to_hub: True
+push_to_hub_model_id: zephyr-7b-sft
+remove_unused_columns: true
+report_to:
+- tensorboard
+save_strategy: "no"
+save_total_limit: null
+seed: 42
+tf32: true
\ No newline at end of file
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..de1c109
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,6 @@
+
+## Supervised Fine-Tuning (SFT)
+
+```
+
+```
\ No newline at end of file
diff --git a/scripts/run_sft.py b/scripts/run_sft.py
index 580916b..116a654 100644
--- a/scripts/run_sft.py
+++ b/scripts/run_sft.py
@@ -18,7 +18,6 @@ Supervised fine-tuning script for decoder language models.
 """
 
 import logging
-import math
 import random
 import sys
 
@@ -52,6 +51,7 @@ def main():
 
     # Set seed for reproducibility
     set_seed(training_args.seed)
+
     accelerator = Accelerator()
 
     ###############
@@ -72,7 +72,7 @@ def main():
     # Log on each process a small summary
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.bf16}"
+        + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Model parameters {model_args}")
     logger.info(f"Data parameters {data_args}")
@@ -132,8 +132,8 @@ def main():
         model=model_args.model_name_or_path,
         model_init_kwargs=model_kwargs,
         args=training_args,
-        train_dataset=raw_datasets["train"] if training_args.do_train else None,
-        eval_dataset=raw_datasets["test"] if training_args.do_eval else None,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
         dataset_text_field="text",
         max_seq_length=training_args.max_seq_length,
         tokenizer=tokenizer,
@@ -144,17 +144,14 @@ def main():
     ###############
     # Training loop
     ###############
-    if training_args.do_train:
-        logger.info("*** Train ***")
-        train_result = trainer.train()
-        metrics = train_result.metrics
-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+    logger.info("*** Train ***")
+    train_result = trainer.train()
+    metrics = train_result.metrics
+    max_train_samples = data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+    metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+    trainer.log_metrics("train", metrics)
+    trainer.save_metrics("train", metrics)
+    trainer.save_state()
 
     ##########
     # Evaluate
@@ -164,11 +161,6 @@ def main():
         metrics = trainer.evaluate()
         max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
         metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
-        try:
-            perplexity = math.exp(metrics["eval_loss"])
-        except OverflowError:
-            perplexity = float("inf")
-        metrics["perplexity"] = perplexity
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
 
@@ -181,14 +173,18 @@ def main():
 
     # Save everything else on main process
     if accelerator.is_main_process:
-        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
-        kwargs["dataset"] = list(data_args.dataset_mixer.keys())
+        kwargs = {
+            "finetuned_from": model_args.model_name_or_path,
+            "dataset": list(data_args.dataset_mixer.keys()),
+            "tags": ["alignment-handbook"],
+        }
         trainer.create_model_card(**kwargs)
         # Restore k,v cache for fast inference
         trainer.model.config.use_cache = True
         trainer.model.config.save_pretrained(training_args.output_dir)
 
-        if training_args.push_to_hub:
+        if training_args.push_to_hub is True:
+            logger.info("Pushing to hub...")
             trainer.push_to_hub()
 
     accelerator.wait_for_everyone()
diff --git a/setup.py b/setup.py
index dad5141..c6d4f21 100644
--- a/setup.py
+++ b/setup.py
@@ -44,7 +44,7 @@ _deps = [
     "accelerate==0.23.0",
     "bitsandbytes==0.41.1",
     "black==23.1.0",
-    "datasets==2.12.0",
+    "datasets==2.14.6",
     "deepspeed==0.12.2",
     "einops>=0.6.1",
     "evaluate==0.4.0",
@@ -60,8 +60,8 @@ _deps = [
     "protobuf<=3.20.2",  # Needed to avoid conflicts with `transformers`
     "pytest",
     "safetensors>=0.3.3",
+    "scipy",
     "tensorboard",
-    "torch==2.0.1",
     "transformers==4.35.0",
     "trl==0.7.4",  # TODO bump to next release, added for NEFTune
     "tqdm>=4.64.1",
@@ -82,7 +82,6 @@ def deps_list(*pkgs):
 
 extras = {}
 extras["tests"] = deps_list("pytest", "parameterized")
-extras["torch"] = deps_list("torch")
 extras["quality"] = deps_list("black", "isort", "flake8")
 extras["docs"] = deps_list("hf-doc-builder")
 extras["dev"] = extras["docs"] + extras["quality"] + extras["tests"]
@@ -102,6 +101,7 @@ install_requires = [
     deps["peft"],
     deps["protobuf"],
     deps["safetensors"],
+    deps["scipy"],
     deps["tensorboard"],
     deps["tqdm"],  # progress bars in model download and training scripts
     deps["transformers"],

From e54e09597806ae093bdb591b9fba2755d2b48278 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Wed, 8 Nov 2023 22:20:17 +0000
Subject: [PATCH 03/30] Make it work for realz

---
 recipes/launch.slurm                          | 92 +++++++++++++++++++
 recipes/zephyr-7b/README.md                   |  7 ++
 .../sft/{config.yaml => config_full.yaml}     |  0
 scripts/run_sft.py                            |  6 +-
 4 files changed, 102 insertions(+), 3 deletions(-)
 create mode 100644 recipes/launch.slurm
 create mode 100644 recipes/zephyr-7b/README.md
 rename recipes/zephyr-7b/sft/{config.yaml => config_full.yaml} (100%)

diff --git a/recipes/launch.slurm b/recipes/launch.slurm
new file mode 100644
index 0000000..28d4bee
--- /dev/null
+++ b/recipes/launch.slurm
@@ -0,0 +1,92 @@
+#!/bin/bash
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --exclusive
+#SBATCH --gres=gpu:8
+#SBATCH --partition=production-cluster
+#SBATCH --output=/fsx/h4/logs/%x-%j.out
+#SBATCH --err=/fsx/h4/logs/%x-%j.err
+
+set -x -e
+
+source ~/.bashrc
+conda activate handbook
+echo "START TIME: $(date)"
+
+MODEL=$1
+TASK=$2
+VERSION=$3
+ACCELERATOR=$4
+OPTIONAL_ARGS=$5
+
+# Training setup
+NUM_NODES=$SLURM_NNODES
+GPUS_PER_NODE=8
+WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
+# Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
+CONFIG_FILE=recipes/$MODEL/$TASK/config_$VERSION.yaml
+GRAD_ACC_STEPS=$(yq -r .gradient_accumulation_steps $CONFIG_FILE)
+
+# Split the string into individual arguments
+IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS"
+
+# Loop through the arguments and find the one with "--gradient_accumulation_steps"
+for arg in "${ARGS[@]}"; do
+    if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then
+        # Extract the value after the equals sign
+        GRAD_ACC_STEPS="${arg#*=}"
+        break  # Exit the loop once we find the desired argument
+    fi
+done
+
+echo "Gradient accumulation steps: $GRAD_ACC_STEPS"
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+
+export CMD=" \
+    scripts/run_$TASK.py $CONFIG_FILE
+    "
+
+export LAUNCHER="ACCELERATE_LOG_LEVEL=info accelerate launch \
+    --config_file recipes/accelerate_configs/$ACCELERATOR.yaml  \
+    --gradient_accumulation_steps $GRAD_ACC_STEPS \
+    --num_machines $NUM_NODES \
+    --num_processes $WORLD_SIZE \
+    --main_process_ip $MASTER_ADDR \
+    --main_process_port $MASTER_PORT \
+    --machine_rank \$SLURM_PROCID \
+    --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
+    --max_restarts 1 \
+    --role \$(hostname -s): \
+    --tee 3 \
+    "
+
+# force crashing on nccl issues like hanging broadcast
+export NCCL_ASYNC_ERROR_HANDLING=1
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=COLL
+# export NCCL_SOCKET_NTHREADS=1
+# export NCCL_NSOCKS_PERTHREAD=1
+# export CUDA_LAUNCH_BLOCKING=1
+
+# AWS specific
+export NCCL_PROTO=simple
+export RDMAV_FORK_SAFE=1
+export FI_EFA_FORK_SAFE=1
+export FI_EFA_USE_DEVICE_RDMA=1
+export FI_PROVIDER=efa
+export FI_LOG_LEVEL=1
+export NCCL_IB_DISABLE=1
+export NCCL_SOCKET_IFNAME=ens
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    "
+
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1
+
+echo "END TIME: $(date)"
\ No newline at end of file
diff --git a/recipes/zephyr-7b/README.md b/recipes/zephyr-7b/README.md
new file mode 100644
index 0000000..cb3847a
--- /dev/null
+++ b/recipes/zephyr-7b/README.md
@@ -0,0 +1,7 @@
+
+
+## SFT
+
+```shell
+sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full
+```
\ No newline at end of file
diff --git a/recipes/zephyr-7b/sft/config.yaml b/recipes/zephyr-7b/sft/config_full.yaml
similarity index 100%
rename from recipes/zephyr-7b/sft/config.yaml
rename to recipes/zephyr-7b/sft/config_full.yaml
diff --git a/scripts/run_sft.py b/scripts/run_sft.py
index 116a654..614a14a 100644
--- a/scripts/run_sft.py
+++ b/scripts/run_sft.py
@@ -90,9 +90,9 @@ def main():
         for index in random.sample(range(len(raw_datasets["train"])), 3):
             logger.info(f"Sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['messages']}")
 
-    #####################################
-    # Load tokenizer and process datasets
-    #####################################
+    ################
+    # Load tokenizer
+    ################
     tokenizer = get_tokenizer(model_args, data_args)
 
     #####################

From ee10c4efd96fc03b715aabcf13daf0be19b41674 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Wed, 8 Nov 2023 22:58:34 +0000
Subject: [PATCH 04/30] Make DPO work!

---
 recipes/launch.slurm                   |  8 +--
 recipes/zephyr-7b/dpo/config_full.yaml | 37 +++++++++++++
 recipes/zephyr-7b/sft/config_full.yaml |  2 +-
 scripts/run_dpo.py                     | 76 ++++++--------------------
 4 files changed, 60 insertions(+), 63 deletions(-)
 create mode 100644 recipes/zephyr-7b/dpo/config_full.yaml

diff --git a/recipes/launch.slurm b/recipes/launch.slurm
index 28d4bee..39ee457 100644
--- a/recipes/launch.slurm
+++ b/recipes/launch.slurm
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --ntasks-per-node=1
 #SBATCH --exclusive
 #SBATCH --gres=gpu:8
 #SBATCH --partition=production-cluster
@@ -14,7 +14,7 @@ echo "START TIME: $(date)"
 
 MODEL=$1
 TASK=$2
-VERSION=$3
+PRECISION=$3
 ACCELERATOR=$4
 OPTIONAL_ARGS=$5
 
@@ -23,7 +23,7 @@ NUM_NODES=$SLURM_NNODES
 GPUS_PER_NODE=8
 WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
 # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
-CONFIG_FILE=recipes/$MODEL/$TASK/config_$VERSION.yaml
+CONFIG_FILE=recipes/$MODEL/$TASK/config_$PRECISION.yaml
 GRAD_ACC_STEPS=$(yq -r .gradient_accumulation_steps $CONFIG_FILE)
 
 # Split the string into individual arguments
@@ -69,7 +69,7 @@ export NCCL_ASYNC_ERROR_HANDLING=1
 # export NCCL_NSOCKS_PERTHREAD=1
 # export CUDA_LAUNCH_BLOCKING=1
 
-# AWS specific
+# Specific configuration for the Hugging Face Compute Cluster - be warned this may not work on other clusters!
 export NCCL_PROTO=simple
 export RDMAV_FORK_SAFE=1
 export FI_EFA_FORK_SAFE=1
diff --git a/recipes/zephyr-7b/dpo/config_full.yaml b/recipes/zephyr-7b/dpo/config_full.yaml
new file mode 100644
index 0000000..82258b8
--- /dev/null
+++ b/recipes/zephyr-7b/dpo/config_full.yaml
@@ -0,0 +1,37 @@
+# Model arguments
+model_name_or_path: lewtun/zephyr-7b-sft
+
+# Data training arguments
+# For definitions, see: src/alignment/configs.py
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.1
+do_eval: true
+evaluation_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+hub_model_id: zephyr-7b-dpo
+learning_rate: 5.0e-7
+log_level: info
+logging_steps: 10
+lr_scheduler_type: linear
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 3
+optim: rmsprop
+output_dir: data/zephyr-7b-dpo
+per_device_train_batch_size: 4
+per_device_eval_batch_size: 4
+push_to_hub: true
+save_strategy: "no"
+save_total_limit: null
+seed: 42
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/recipes/zephyr-7b/sft/config_full.yaml b/recipes/zephyr-7b/sft/config_full.yaml
index e7e786a..8ceb856 100644
--- a/recipes/zephyr-7b/sft/config_full.yaml
+++ b/recipes/zephyr-7b/sft/config_full.yaml
@@ -17,6 +17,7 @@ bf16: true
 evaluation_strategy: epoch
 gradient_accumulation_steps: 2
 gradient_checkpointing: true
+hub_model_id: zephyr-7b-sft
 hub_strategy: every_save
 learning_rate: 2.0e-05
 log_level: info
@@ -31,7 +32,6 @@ overwrite_output_dir: true
 per_device_eval_batch_size: 16
 per_device_train_batch_size: 32
 push_to_hub: True
-push_to_hub_model_id: zephyr-7b-sft
 remove_unused_columns: true
 report_to:
 - tensorboard
diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py
index b6f1cba..542de20 100644
--- a/scripts/run_dpo.py
+++ b/scripts/run_dpo.py
@@ -14,31 +14,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import random
-import subprocess
 import sys
-from datetime import timedelta
 
 import torch
 import transformers
 from transformers import set_seed
 
-import wandb
-from accelerate import Accelerator, InitProcessGroupKwargs
-from h4.data import get_datasets
-from h4.training import DataArguments, DPOTrainingArguments, ModelArguments, init_wandb_training
-from h4.utils import (
+from accelerate import Accelerator
+from alignment import (
+    DataArguments,
+    DPOConfig,
     H4ArgumentParser,
+    ModelArguments,
     apply_chat_template,
-    convert_to_safetensors,
+    get_datasets,
     get_kbit_device_map,
     get_peft_config,
     get_quantization_config,
     get_tokenizer,
-    hf_login,
-    is_slurm_available,
-    push_to_hub_revision,
-    run_mt_bench_job,
 )
 from trl import DPOTrainer
 
@@ -47,7 +40,7 @@ logger = logging.getLogger(__name__)
 
 
 def main():
-    parser = H4ArgumentParser((ModelArguments, DataArguments, DPOTrainingArguments))
+    parser = H4ArgumentParser((ModelArguments, DataArguments, DPOConfig))
     model_args, data_args, training_args = parser.parse()
 
     #######
@@ -69,18 +62,11 @@ def main():
     logger.info(f"Data parameters {data_args}")
     logger.info(f"Training/evaluation parameters {training_args}")
 
-    # Setup WandB
-    if training_args.wandb_enabled:
-        init_wandb_training(training_args)
-
-    # Login to HuggingFace Hub if needed
-    hf_login()
-
     # Set seed for reproducibility
     set_seed(training_args.seed)
 
     # Increase distributed timeout to 3h to enable push to Hub to complete
-    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=6 * 1800))])
+    accelerator = Accelerator()
 
     ###############
     # Load datasets
@@ -114,12 +100,6 @@ def main():
             {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
         )
 
-    # Log a few random samples from the training set:
-    for index in random.sample(range(len(raw_datasets["train"])), 3):
-        logger.info(f"Prompt sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['prompt']}")
-        logger.info(f"Chosen sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['chosen']}")
-        logger.info(f"Rejected sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['rejected']}")
-
     torch_dtype = (
         model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
     )
@@ -136,7 +116,7 @@ def main():
     ref_model = model_args.model_name_or_path
     ref_model_kwargs = model_kwargs
 
-    if model_args.use_peft:
+    if model_args.use_peft is True:
         ref_model = None
         ref_model_kwargs = None
 
@@ -153,7 +133,7 @@ def main():
         train_dataset=raw_datasets["train"],
         eval_dataset=raw_datasets["test"],
         tokenizer=tokenizer,
-        max_length=training_args.max_seq_length,
+        max_length=training_args.max_length,
         max_prompt_length=training_args.max_prompt_length,
         peft_config=get_peft_config(model_args),
     )
@@ -178,7 +158,7 @@ def main():
     ##########
     if training_args.do_eval:
         logger.info("*** Evaluate ***")
-        metrics = dpo_trainer.evaluate(eval_dataset=raw_datasets["test"])
+        metrics = dpo_trainer.evaluate()
         max_eval_samples = (
             data_args.max_eval_samples if data_args.max_eval_samples is not None else len(raw_datasets["test"])
         )
@@ -190,43 +170,23 @@ def main():
     # Save model and create model card
     ##################################
     dpo_trainer.save_model(training_args.output_dir)
-
     # Save everything else on main process
     if accelerator.is_main_process:
-        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
-        kwargs["dataset"] = list(data_args.dataset_mixer.keys())
+        kwargs = {
+            "finetuned_from": model_args.model_name_or_path,
+            "dataset": list(data_args.dataset_mixer.keys()),
+            "tags": ["alignment-handbook"],
+        }
         dpo_trainer.create_model_card(**kwargs)
         # Restore k,v cache for fast inference
         dpo_trainer.model.config.use_cache = True
-        # Fix custom code paths
-        if model_args.trust_remote_code is True:
-            auto_map = dpo_trainer.model.config.auto_map
-            dpo_trainer.model.config.auto_map = {k: v.split("--")[-1] for k, v in auto_map.items()}
         dpo_trainer.model.config.save_pretrained(training_args.output_dir)
-        # FSDP/DeepSpeed save the model as a single `pytorch_model.bin` file, so we need to shard it.
-        # We run this in a subprocess to avoid interference from the accelerators.
-        subprocess.run(
-            [
-                "python",
-                "scripts/training/shard_checkpoint.py",
-                f"--output_dir={training_args.output_dir}",
-                f"--trust_remote_code={model_args.trust_remote_code}",
-            ],
-            check=True,
-        )
-        # Convert torch weights to safetensors for deployment with TGI
-        convert_to_safetensors(training_args.output_dir)
-        if training_args.push_to_hub_revision:
-            is_model_on_hub = push_to_hub_revision(training_args, model_args)
-            # Run automatic evaluation once the model is pushed to the Hub
-            if is_slurm_available() and is_model_on_hub is True and training_args.do_eval is True:
-                logger.info("*** Launching MT Bench ***")
-                run_mt_bench_job(training_args, model_args)
+        if training_args.push_to_hub is True:
+            dpo_trainer.push_to_hub()
 
     # Ensure we don't timeout on model save / push to Hub
     logger.info("*** Waiting for all processes to finish ***")
     accelerator.wait_for_everyone()
-    wandb.finish()
 
     logger.info("*** Run complete! ***")
 

From e2c19a02521b6dff706cc9b336496938c44e9f3d Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Wed, 8 Nov 2023 23:09:16 +0000
Subject: [PATCH 05/30] Tweak

---
 recipes/launch.slurm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipes/launch.slurm b/recipes/launch.slurm
index 39ee457..15ef4b6 100644
--- a/recipes/launch.slurm
+++ b/recipes/launch.slurm
@@ -69,7 +69,8 @@ export NCCL_ASYNC_ERROR_HANDLING=1
 # export NCCL_NSOCKS_PERTHREAD=1
 # export CUDA_LAUNCH_BLOCKING=1
 
-# Specific configuration for the Hugging Face Compute Cluster - be warned this may not work on other clusters!
+# Specific configuration for the Hugging Face Compute Cluster
+# Be ye warned this may not work on other clusters!
 export NCCL_PROTO=simple
 export RDMAV_FORK_SAFE=1
 export FI_EFA_FORK_SAFE=1

From 2de17f5ba1a6f77975b6c90caee6df812c424fb7 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Thu, 9 Nov 2023 07:32:24 +0000
Subject: [PATCH 06/30] Add doc

---
 .../accelerate_configs/deepspeed_zero1.yaml   | 19 -----------
 .../accelerate_configs/deepspeed_zero2.yaml   | 21 ------------
 scripts/README.md                             | 32 +++++++++++++++++++
 3 files changed, 32 insertions(+), 40 deletions(-)
 delete mode 100644 recipes/accelerate_configs/deepspeed_zero1.yaml
 delete mode 100644 recipes/accelerate_configs/deepspeed_zero2.yaml

diff --git a/recipes/accelerate_configs/deepspeed_zero1.yaml b/recipes/accelerate_configs/deepspeed_zero1.yaml
deleted file mode 100644
index 1dfeda0..0000000
--- a/recipes/accelerate_configs/deepspeed_zero1.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-deepspeed_config:
-  deepspeed_multinode_launcher: standard
-  zero3_init_flag: false
-  zero_stage: 1
-distributed_type: DEEPSPEED
-downcast_bf16: 'no'
-machine_rank: 0
-main_training_function: main
-mixed_precision: bf16
-num_machines: 1
-num_processes: 8
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
diff --git a/recipes/accelerate_configs/deepspeed_zero2.yaml b/recipes/accelerate_configs/deepspeed_zero2.yaml
deleted file mode 100644
index 0777900..0000000
--- a/recipes/accelerate_configs/deepspeed_zero2.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-deepspeed_config:
-  deepspeed_multinode_launcher: standard
-  offload_optimizer_device: none
-  offload_param_device: none
-  zero3_init_flag: false
-  zero_stage: 2
-distributed_type: DEEPSPEED
-downcast_bf16: 'no'
-machine_rank: 0
-main_training_function: main
-mixed_precision: bf16
-num_machines: 1
-num_processes: 8
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
diff --git a/scripts/README.md b/scripts/README.md
index de1c109..502f566 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -1,6 +1,38 @@
 
 ## Supervised Fine-Tuning (SFT)
 
+We provide 3 main ways to train SFT models:
+
+* Distributed fine-tuning of all model weights with ZeRO-3
+* Fine-tuning with LoRA adapters and ZeRO-3
+* Fine-tuning with QLoRA adapters and DDP
+
+```shell
+# Full training with ZeRO-3
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/{model_name}/sft/config_full.yaml
+
+# LoRA training with ZeRO-3
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/{model_name}/sft/config_16bit.yaml
+
+# QLoRA training with DDP
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml scripts/run_sft.py recipes/{model_name}/sft/config_8bit.yaml
 ```
 
+You can override the parameters in each YAML config by appending them to the command as follows:
+
+```shell
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/{model_name}/sft/config_full.yaml --per_device_train_batch_size=2 --num_train_epochs=3
+```
+
+## Direct Preference Optimisation (DPO)
+
+```shell
+# Full training with ZeRO-3
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/{model_name}/dpo/config_full.yaml
+
+# LoRA training with ZeRO-3
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/{model_name}/dpo/config_16bit.yaml
+
+# QLoRA training with DDP
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml scripts/run_dpo.py recipes/{model_name}/dpo/config_8bit.yaml
 ```
\ No newline at end of file

From 49da3ef739d4282572458dcdf377950abf28c2f6 Mon Sep 17 00:00:00 2001
From: edbeeching <edbeeching@gmail.com>
Date: Thu, 9 Nov 2023 10:56:25 +0100
Subject: [PATCH 07/30] adds configs and instructions for lora training

---
 recipes/zephyr-7b/README.md            | 41 ++++++++++++++++++-
 recipes/zephyr-7b/dpo/config_full.yaml |  4 +-
 recipes/zephyr-7b/dpo/config_lora.yaml | 53 +++++++++++++++++++++++++
 recipes/zephyr-7b/sft/config_full.yaml |  4 +-
 recipes/zephyr-7b/sft/config_lora.yaml | 54 ++++++++++++++++++++++++++
 5 files changed, 150 insertions(+), 6 deletions(-)
 create mode 100644 recipes/zephyr-7b/dpo/config_lora.yaml
 create mode 100644 recipes/zephyr-7b/sft/config_lora.yaml

diff --git a/recipes/zephyr-7b/README.md b/recipes/zephyr-7b/README.md
index cb3847a..f1df22c 100644
--- a/recipes/zephyr-7b/README.md
+++ b/recipes/zephyr-7b/README.md
@@ -1,7 +1,44 @@
 
+# Instructions
+In the handbook, for each training step we provide two sets of recipes:
+- Full training on a multi-GPU machine (tested on a 8xA100 node), using slurm to queue jobs.
+- LORA taining on a single consumer 24GB GPU (tested on a RTX 4090)
+
+The full training jobs will scale to a multi-node setting, by adjusting `--nodes=1`, we advise adjusting the gradient accumulation steps and/or batch size if you want to replicate our results.
+
 
-## SFT
+## Full training examples 
+
+### SFT
 
 ```shell
-sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full
+sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
+```
+
+## DPO
+```shell
+sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
+```
+
+## LORA training examples
+### SFT
+```shell
+# locally on 1 gpu
+accelerate launch scripts/run_sft.py recipes/zephyr-7b/sft/config_lora.yaml
+```
+
+```shell
+# on a cluster
+sbatch --job-name=handbook_sft_lora --nodes=1 recipes/launch.slurm zephyr-7b sft lora multi_gpu "--gradient_accumulation_steps=16"
+```
+
+### SFT
+```shell
+# locally on 1 gpu
+accelerate launch scripts/run_dpo.py recipes/zephyr-7b/dpo/config_lora.yaml
+```
+
+```shell
+# on a cluster
+sbatch --job-name=handbook_dpo_lora --nodes=1 recipes/launch.slurm zephyr-7b dpo lora multi_gpu "--gradient_accumulation_steps=8"
 ```
\ No newline at end of file
diff --git a/recipes/zephyr-7b/dpo/config_full.yaml b/recipes/zephyr-7b/dpo/config_full.yaml
index 82258b8..8845330 100644
--- a/recipes/zephyr-7b/dpo/config_full.yaml
+++ b/recipes/zephyr-7b/dpo/config_full.yaml
@@ -18,7 +18,7 @@ evaluation_strategy: steps
 eval_steps: 100
 gradient_accumulation_steps: 1
 gradient_checkpointing: true
-hub_model_id: zephyr-7b-dpo
+hub_model_id: zephyr-7b-dpo-full
 learning_rate: 5.0e-7
 log_level: info
 logging_steps: 10
@@ -27,7 +27,7 @@ max_length: 1024
 max_prompt_length: 512
 num_train_epochs: 3
 optim: rmsprop
-output_dir: data/zephyr-7b-dpo
+output_dir: data/zephyr-7b-dpo-full
 per_device_train_batch_size: 4
 per_device_eval_batch_size: 4
 push_to_hub: true
diff --git a/recipes/zephyr-7b/dpo/config_lora.yaml b/recipes/zephyr-7b/dpo/config_lora.yaml
new file mode 100644
index 0000000..d3f00b9
--- /dev/null
+++ b/recipes/zephyr-7b/dpo/config_lora.yaml
@@ -0,0 +1,53 @@
+# Model arguments
+model_name_or_path: HuggingFaceH4/mistral-7b-ift
+model_revision: v14.0
+torch_dtype: auto
+
+# LORA
+use_peft: true
+lora_r: 64
+lora_alpha: 16
+lora_dropout: 0.1
+lora_target_modules:
+- q_proj
+- k_proj
+- v_proj
+- o_proj
+
+# Data training arguments
+
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.1
+do_eval: true
+ddp_find_unused_parameters: true
+evaluation_strategy: epoch
+eval_steps: 100
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: zephyr-7b-dpo-lora
+learning_rate: 5.0e-7
+log_level: info
+logging_steps: 10
+lr_scheduler_type: linear
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 3
+optim: rmsprop
+output_dir: data/zephyr-7b-dpo-lora # It is handy to append `hub_model_revision` to keep track of your local experiments
+per_device_train_batch_size: 2
+per_device_eval_batch_size: 4
+push_to_hub: true
+save_strategy: "no"
+save_total_limit: null
+seed: 42
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/recipes/zephyr-7b/sft/config_full.yaml b/recipes/zephyr-7b/sft/config_full.yaml
index 8ceb856..020df7f 100644
--- a/recipes/zephyr-7b/sft/config_full.yaml
+++ b/recipes/zephyr-7b/sft/config_full.yaml
@@ -17,7 +17,7 @@ bf16: true
 evaluation_strategy: epoch
 gradient_accumulation_steps: 2
 gradient_checkpointing: true
-hub_model_id: zephyr-7b-sft
+hub_model_id: zephyr-7b-sft-full
 hub_strategy: every_save
 learning_rate: 2.0e-05
 log_level: info
@@ -27,7 +27,7 @@ lr_scheduler_type: cosine
 max_seq_length: 2048
 max_steps: -1
 num_train_epochs: 1
-output_dir: data/zephyr-7b-sft
+output_dir: data/zephyr-7b-sft-full
 overwrite_output_dir: true
 per_device_eval_batch_size: 16
 per_device_train_batch_size: 32
diff --git a/recipes/zephyr-7b/sft/config_lora.yaml b/recipes/zephyr-7b/sft/config_lora.yaml
new file mode 100644
index 0000000..6bf806b
--- /dev/null
+++ b/recipes/zephyr-7b/sft/config_lora.yaml
@@ -0,0 +1,54 @@
+# Model arguments
+model_name_or_path: mistralai/Mistral-7B-v0.1
+model_revision: main
+torch_dtype: auto
+use_flash_attention_2: true
+
+# LORA
+use_peft: true
+lora_r: 64
+lora_alpha: 16
+lora_dropout: 0.1
+lora_target_modules:
+- q_proj
+- k_proj
+- v_proj
+- o_proj
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+evaluation_strategy: epoch
+gradient_accumulation_steps: 128
+ddp_find_unused_parameters: true
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: zephyr-7b-sft-lora
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5  
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 1
+output_dir: data/zephyr-7b-sft-lora
+overwrite_output_dir: true
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 4
+push_to_hub: True
+report_to:
+- tensorboard
+save_strategy: "no"
+save_total_limit: null
+seed: 42
+tf32: true
\ No newline at end of file

From 3a5430222edd1c0111358c82665f602a387ca638 Mon Sep 17 00:00:00 2001
From: edbeeching <edbeeching@gmail.com>
Date: Thu, 9 Nov 2023 13:04:34 +0100
Subject: [PATCH 08/30] removes need for yq dep

---
 recipes/launch.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/launch.slurm b/recipes/launch.slurm
index 15ef4b6..17f1afc 100644
--- a/recipes/launch.slurm
+++ b/recipes/launch.slurm
@@ -24,7 +24,7 @@ GPUS_PER_NODE=8
 WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
 # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
 CONFIG_FILE=recipes/$MODEL/$TASK/config_$PRECISION.yaml
-GRAD_ACC_STEPS=$(yq -r .gradient_accumulation_steps $CONFIG_FILE)
+GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}')
 
 # Split the string into individual arguments
 IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS"

From 33a0ce3afd26d300de6aedee0995c3658501d4b6 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Thu, 9 Nov 2023 13:39:03 +0000
Subject: [PATCH 09/30] Add more doc

---
 .gitignore                             |  5 ++-
 README.md                              |  6 +--
 recipes/launch.slurm                   |  4 +-
 recipes/zephyr-7b/README.md            |  3 ++
 recipes/zephyr-7b/sft/config_lora.yaml |  1 -
 scripts/README.md                      | 60 +++++++++++++++++---------
 scripts/run_dpo.py                     |  1 +
 scripts/run_sft.py                     |  5 +--
 setup.py                               |  2 +-
 9 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/.gitignore b/.gitignore
index d4de801..1445d93 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,5 +159,6 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 
-# Temp checkpoint folder
-data/
\ No newline at end of file
+# Temp folders
+data/
+wandb/
\ No newline at end of file
diff --git a/README.md b/README.md
index 6f7c61e..afad90d 100644
--- a/README.md
+++ b/README.md
@@ -32,8 +32,8 @@ To run the code in this project, first create a Python virtual environment using
 conda create -n handbook python=3.10 && conda activate handbook
 ```
 
-Next, install PyTorch `v2.0.1` - the precise version is important for reproducibility! Since this hardware-dependent, we
-direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/previous-versions/#v201).
+Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this is hardware-dependent, we
+direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
 
 You can then install the remaining package dependencies as follows:
 
@@ -44,7 +44,7 @@ python -m pip install .
 You will also need Flash Attention 2 installed, which can be done by running:
 
 ```shell
-python -m pip install flash-attn==2.3.0 --no-build-isolation
+python -m pip install flash-attn --no-build-isolation
 ```
 
 Next, log into your Hugging Face account as follows:
diff --git a/recipes/launch.slurm b/recipes/launch.slurm
index 17f1afc..da0b176 100644
--- a/recipes/launch.slurm
+++ b/recipes/launch.slurm
@@ -3,8 +3,8 @@
 #SBATCH --exclusive
 #SBATCH --gres=gpu:8
 #SBATCH --partition=production-cluster
-#SBATCH --output=/fsx/h4/logs/%x-%j.out
-#SBATCH --err=/fsx/h4/logs/%x-%j.err
+#SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this to your cluster
+#SBATCH --err=/fsx/h4/logs/%x-%j.err    # Adjust this to your cluster
 
 set -x -e
 
diff --git a/recipes/zephyr-7b/README.md b/recipes/zephyr-7b/README.md
index f1df22c..fcafde9 100644
--- a/recipes/zephyr-7b/README.md
+++ b/recipes/zephyr-7b/README.md
@@ -1,5 +1,6 @@
 
 # Instructions
+
 In the handbook, for each training step we provide two sets of recipes:
 - Full training on a multi-GPU machine (tested on a 8xA100 node), using slurm to queue jobs.
 - LORA taining on a single consumer 24GB GPU (tested on a RTX 4090)
@@ -21,6 +22,7 @@ sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full
 ```
 
 ## LORA training examples
+
 ### SFT
 ```shell
 # locally on 1 gpu
@@ -33,6 +35,7 @@ sbatch --job-name=handbook_sft_lora --nodes=1 recipes/launch.slurm zephyr-7b sft
 ```
 
 ### SFT
+
 ```shell
 # locally on 1 gpu
 accelerate launch scripts/run_dpo.py recipes/zephyr-7b/dpo/config_lora.yaml
diff --git a/recipes/zephyr-7b/sft/config_lora.yaml b/recipes/zephyr-7b/sft/config_lora.yaml
index 6bf806b..2d488d8 100644
--- a/recipes/zephyr-7b/sft/config_lora.yaml
+++ b/recipes/zephyr-7b/sft/config_lora.yaml
@@ -1,6 +1,5 @@
 # Model arguments
 model_name_or_path: mistralai/Mistral-7B-v0.1
-model_revision: main
 torch_dtype: auto
 use_flash_attention_2: true
 
diff --git a/scripts/README.md b/scripts/README.md
index 502f566..a388ff9 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -1,38 +1,58 @@
 
-## Supervised Fine-Tuning (SFT)
+## Scripts to Train and Evaluate Chat Models
 
-We provide 3 main ways to train SFT models:
+### Fine-tuning
 
-* Distributed fine-tuning of all model weights with ZeRO-3
-* Fine-tuning with LoRA adapters and ZeRO-3
-* Fine-tuning with QLoRA adapters and DDP
+In the handbook, we provide two main ways to align LLMs for chat:
+
+- Full fine-tuning on a multi-GPU machine (tested on an 8 x A100 (80GB) node).
+- LoRA fine-tuning on a single consumer 24GB GPU (tested on a RTX 4090).
+
+In practice, we find comparable performance for both full and LoRA fine-tuning, with the latter having the advantage of producing small adapter weights that are fast to upload and download from the Hugging Face Hub. Here are the two general commands to fine-tune your models:
 
 ```shell
-# Full training with ZeRO-3
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/{model_name}/sft/config_full.yaml
+# Full training with ZeRO-3 on 8 GPUs
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml
 
-# LoRA training with ZeRO-3
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/{model_name}/sft/config_16bit.yaml
+# LoRA training on single GPU
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml
+```
 
-# QLoRA training with DDP
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml scripts/run_sft.py recipes/{model_name}/sft/config_8bit.yaml
+Here `{task}` refers to the type of training you wish to run (SFT, DPO, etc), while `{model_name}` refers to the choice of recipe in the `recipes/` directory. For example, to replicate Zephyr 7B you can run:
+
+```shell
+# Step 1 - train SFT policy
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b/sft/config_full.yaml
+
+# Step 2 - align with DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b/dpo/config_full.yaml
 ```
 
 You can override the parameters in each YAML config by appending them to the command as follows:
 
 ```shell
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/{model_name}/sft/config_full.yaml --per_device_train_batch_size=2 --num_train_epochs=3
+# Change batch size, number of epochs etc
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml --per_device_train_batch_size=42 --num_train_epochs=5
 ```
 
-## Direct Preference Optimisation (DPO)
+By default all training metrics are logged with TensorBoard. If you have a [Weights and Biases](https://wandb.ai/site) account and are logged in, you can view the training metrics by appending `--report_to=wandb`, e.g.
 
 ```shell
-# Full training with ZeRO-3
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/{model_name}/dpo/config_full.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml --report_to=wandb
+```
 
-# LoRA training with ZeRO-3
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/{model_name}/dpo/config_16bit.yaml
+#### Launching jobs on a Slurm cluster
 
-# QLoRA training with DDP
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml scripts/run_dpo.py recipes/{model_name}/dpo/config_8bit.yaml
-```
\ No newline at end of file
+If you have access to a Slurm cluster, we provide a `recipes/launch.slurm` script that will automatically queue training jobs for you. Here's how you can use it:
+
+```shell
+sbatch --job-name=handbook_{task} --nodes=1 recipes/launch.slurm {model_name} {task} {precision} {accelerator}
+```
+
+Here `{model_name}` and `{task}` are defined as above, while `{precision}` refers to the type of training (full vs LoRA) and `{accelerator}` refers to the choice of 🤗 Accelerate config in `recipes/accelerate_configs`. Here's a concrete example to run SFT on 1 node of 8 GPUs:
+
+```shell
+sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
+```
+
+**Note:** the configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
\ No newline at end of file
diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py
index 542de20..704ce1f 100644
--- a/scripts/run_dpo.py
+++ b/scripts/run_dpo.py
@@ -175,6 +175,7 @@ def main():
         kwargs = {
             "finetuned_from": model_args.model_name_or_path,
             "dataset": list(data_args.dataset_mixer.keys()),
+            "dataset_tags": list(data_args.dataset_mixer.keys()),
             "tags": ["alignment-handbook"],
         }
         dpo_trainer.create_model_card(**kwargs)
diff --git a/scripts/run_sft.py b/scripts/run_sft.py
index 614a14a..1ed8e33 100644
--- a/scripts/run_sft.py
+++ b/scripts/run_sft.py
@@ -82,13 +82,9 @@ def main():
     # Load datasets
     ###############
     raw_datasets = get_datasets(data_args, splits=data_args.dataset_splits)
-
     logger.info(
         f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
     )
-    with training_args.main_process_first(desc="Log a few random samples from the raw training set"):
-        for index in random.sample(range(len(raw_datasets["train"])), 3):
-            logger.info(f"Sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['messages']}")
 
     ################
     # Load tokenizer
@@ -176,6 +172,7 @@ def main():
         kwargs = {
             "finetuned_from": model_args.model_name_or_path,
             "dataset": list(data_args.dataset_mixer.keys()),
+            "dataset_tags": list(data_args.dataset_mixer.keys()),
             "tags": ["alignment-handbook"],
         }
         trainer.create_model_card(**kwargs)
diff --git a/setup.py b/setup.py
index c6d4f21..d71b591 100644
--- a/setup.py
+++ b/setup.py
@@ -63,7 +63,7 @@ _deps = [
     "scipy",
     "tensorboard",
     "transformers==4.35.0",
-    "trl==0.7.4",  # TODO bump to next release, added for NEFTune
+    "trl==0.7.4",
     "tqdm>=4.64.1",
 ]
 

From 756bb76d226c7873edbbe841c045f49d0d40255d Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Thu, 9 Nov 2023 14:09:52 +0000
Subject: [PATCH 10/30] Fix Slurm opts

---
 recipes/launch.slurm        | 10 ++++----
 recipes/zephyr-7b/README.md | 48 ++++++++++++-------------------------
 scripts/README.md           | 16 +++++++++----
 3 files changed, 31 insertions(+), 43 deletions(-)

diff --git a/recipes/launch.slurm b/recipes/launch.slurm
index da0b176..a5f4359 100644
--- a/recipes/launch.slurm
+++ b/recipes/launch.slurm
@@ -2,9 +2,9 @@
 #SBATCH --ntasks-per-node=1
 #SBATCH --exclusive
 #SBATCH --gres=gpu:8
-#SBATCH --partition=production-cluster
-#SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this to your cluster
-#SBATCH --err=/fsx/h4/logs/%x-%j.err    # Adjust this to your cluster
+#SBATCH --partition=production-cluster  # Adjust this for your cluster
+#SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this for your cluster
+#SBATCH --err=/fsx/h4/logs/%x-%j.err    # Adjust this for your cluster
 
 set -x -e
 
@@ -44,7 +44,7 @@ MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
 
 export CMD=" \
-    scripts/run_$TASK.py $CONFIG_FILE
+    scripts/run_$TASK.py $CONFIG_FILE $OPTIONAL_ARGS
     "
 
 export LAUNCHER="ACCELERATE_LOG_LEVEL=info accelerate launch \
@@ -69,7 +69,7 @@ export NCCL_ASYNC_ERROR_HANDLING=1
 # export NCCL_NSOCKS_PERTHREAD=1
 # export CUDA_LAUNCH_BLOCKING=1
 
-# Specific configuration for the Hugging Face Compute Cluster
+# Specific configuration optimized for the Hugging Face Compute Cluster
 # Be ye warned this may not work on other clusters!
 export NCCL_PROTO=simple
 export RDMAV_FORK_SAFE=1
diff --git a/recipes/zephyr-7b/README.md b/recipes/zephyr-7b/README.md
index fcafde9..02746a1 100644
--- a/recipes/zephyr-7b/README.md
+++ b/recipes/zephyr-7b/README.md
@@ -1,47 +1,29 @@
 
-# Instructions
+# Instructions to Replicate Zephyr 7B
 
-In the handbook, for each training step we provide two sets of recipes:
-- Full training on a multi-GPU machine (tested on a 8xA100 node), using slurm to queue jobs.
-- LORA taining on a single consumer 24GB GPU (tested on a RTX 4090)
+As described in the Zephyr [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
 
-The full training jobs will scale to a multi-node setting, by adjusting `--nodes=1`, we advise adjusting the gradient accumulation steps and/or batch size if you want to replicate our results.
+1. Apply SFT to fine-tune Mistral 7B on the UltraChat dataset.
+2. Align the SFT model to AI feedback via DPO on the UltraFeedback dataset.
 
+See below for commands to train these models using either DeepSpeed ZeRO-3 or LoRA.
 
 ## Full training examples 
 
-### SFT
-
 ```shell
-sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b/sft/config_full.yaml
+
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b/dpo/config_full.yaml
 ```
 
-## DPO
-```shell
-sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
-```
-
-## LORA training examples
-
-### SFT
-```shell
-# locally on 1 gpu
-accelerate launch scripts/run_sft.py recipes/zephyr-7b/sft/config_lora.yaml
-```
+## LoRA training examples
 
 ```shell
-# on a cluster
-sbatch --job-name=handbook_sft_lora --nodes=1 recipes/launch.slurm zephyr-7b sft lora multi_gpu "--gradient_accumulation_steps=16"
-```
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b/sft/config_lora.yaml
 
-### SFT
-
-```shell
-# locally on 1 gpu
-accelerate launch scripts/run_dpo.py recipes/zephyr-7b/dpo/config_lora.yaml
-```
-
-```shell
-# on a cluster
-sbatch --job-name=handbook_dpo_lora --nodes=1 recipes/launch.slurm zephyr-7b dpo lora multi_gpu "--gradient_accumulation_steps=8"
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b/dpo/config_lora.yaml
 ```
\ No newline at end of file
diff --git a/scripts/README.md b/scripts/README.md
index a388ff9..dcaa063 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -3,10 +3,11 @@
 
 ### Fine-tuning
 
-In the handbook, we provide two main ways to align LLMs for chat:
+In the handbook, we provide three main ways to align LLMs for chat:
 
-- Full fine-tuning on a multi-GPU machine (tested on an 8 x A100 (80GB) node).
+- Full fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on an 8 x A100 (80GB) node).
 - LoRA fine-tuning on a single consumer 24GB GPU (tested on a RTX 4090).
+- LoRA fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on 2 x A100 (80GB) GPUs).
 
 In practice, we find comparable performance for both full and LoRA fine-tuning, with the latter having the advantage of producing small adapter weights that are fast to upload and download from the Hugging Face Hub. Here's the two general commands to fine-tune your models:
 
@@ -14,8 +15,11 @@ In practice, we find comparable performance for both full and LoRA fine-tuning,
 # Full training with ZeRO-3 on 8 GPUs
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml
 
-# LoRA training on single GPU
+# LoRA training on a single GPU
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml
+
+# LoRA training with ZeRO-3 on two or more GPUs
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml --num_processes={num_gpus} scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml
 ```
 
 Here `{task}` refers to type of training you wish to run (SFT, DPO, etc), while `{model_name}` refers to the choice of recipe in the `recipes/` directory. For example, to replicate Zephyr 7B you can run:
@@ -28,7 +32,7 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b/dpo/config_full.yaml
 ```
 
-You can override the parameters in each YAML config by appending them to the command as follows:
+By default, these scripts will push each model to the Hugging Face Hub under your username, i.e. `{username}/{model_name}-{task}`. You can override the parameters in each YAML config by appending them to the command as follows:
 
 ```shell
 # Change batch size, number of epochs etc
@@ -41,7 +45,7 @@ By default all training metrics are logged with TensorBoard. If you have a [Weig
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml --report_to=wandb
 ```
 
-#### Launching jobs on a Slurm cluster
+### Launching jobs on a Slurm cluster
 
 If you have access to a Slurm cluster, we provide a `recipes/launch.slurm` script that will automatically queue training jobs for you. Here's how you can use it:
 
@@ -55,4 +59,6 @@ Here `{model_name}` and `{task}` are defined as above, while `{precision}` refer
 sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
 ```
 
+You can scale the number of nodes by increasing the `--nodes` flag; in these cases we recommend also scaling up the per-device batch size or number of gradient accumulation steps to keep the global batch size constant (and thus replicate our results).
+
 **Note:** the configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
\ No newline at end of file

From 44b324487dd2f76e770277cab098872296e3e291 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Thu, 9 Nov 2023 14:20:43 +0000
Subject: [PATCH 11/30] Bump bs

---
 recipes/zephyr-7b/dpo/config_full.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/zephyr-7b/dpo/config_full.yaml b/recipes/zephyr-7b/dpo/config_full.yaml
index 8845330..433c36d 100644
--- a/recipes/zephyr-7b/dpo/config_full.yaml
+++ b/recipes/zephyr-7b/dpo/config_full.yaml
@@ -28,7 +28,7 @@ max_prompt_length: 512
 num_train_epochs: 3
 optim: rmsprop
 output_dir: data/zephyr-7b-dpo-full
-per_device_train_batch_size: 4
+per_device_train_batch_size: 8
 per_device_eval_batch_size: 4
 push_to_hub: true
 save_strategy: "no"

From 89f58a043c62a256402678f61871bea601fe1ab2 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Thu, 9 Nov 2023 14:40:23 +0000
Subject: [PATCH 12/30] Add project structure

---
 README.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/README.md b/README.md
index afad90d..31ae641 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,10 @@ However, we know from the [InstructGPT](https://huggingface.co/papers/2203.02155
 
 The Alignment Handbook aims to fill that gap by providing the community with a series of robust training recipes that span the whole pipeline.
 
+## News 🗞️
+
+* November 10, 2023: We release all the training code to replicate Zephyr 7B 🪁!
+
 ## Links 🔗
 
 * [Zephyr 7B models, datasets, and demos](https://huggingface.co/collections/HuggingFaceH4/zephyr-7b-6538c6d6d5ddd1cbb1744a66)
@@ -59,6 +63,21 @@ Finally, install Git LFS so that you can push models to the Hugging Face Hub:
 sudo apt-get install git-lfs
 ```
 
+## Project structure
+
+```
+├── LICENSE
+├── Makefile                    <- Makefile with commands like `make style`
+├── README.md                   <- The top-level README for developers using this project
+├── chapters                    <- Educational content to render on hf.co/learn
+├── recipes                     <- Recipe configs, accelerate configs, slurm scripts
+├── scripts                     <- Scripts to train and evaluate chat models
+├── setup.cfg                   <- Installation config (mostly used for configuring code quality & tests)
+├── setup.py                    <- Makes project pip installable (pip install -e .) so `alignment` can be imported
+├── src                         <- Source code for use in this project
+└── tests                       <- Unit tests
+```
+
 ## Citation
 
 If you find the content of this repo useful in your work, please cite it as follows:

From 4b0769d13721b7288a0bb0412e0107196a9cd862 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Thu, 9 Nov 2023 14:42:57 +0000
Subject: [PATCH 13/30] Fix links

---
 recipes/zephyr-7b/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes/zephyr-7b/README.md b/recipes/zephyr-7b/README.md
index 02746a1..4e783ad 100644
--- a/recipes/zephyr-7b/README.md
+++ b/recipes/zephyr-7b/README.md
@@ -3,8 +3,8 @@
 
 As described in the Zephyr [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
 
-1. Apply SFT to fine-tune Mistral 7B on the UltraChat dataset.
-2. Align the SFT model to AI feedback via DPO on the UltraFeedback dataset.
+1. Apply SFT to fine-tune Mistral 7B on a filtered version of the UltraChat dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)).
+2. Align the SFT model to AI feedback via DPO on a preprocessed version of the UltraFeedback dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)).
 
 See below for commands to train these models using either DeepSpeed ZeRO-3 or LoRA.
 

From 13141a4b0ba03e3d41a8769f5bb57d6bb12ae81f Mon Sep 17 00:00:00 2001
From: edbeeching <edbeeching@gmail.com>
Date: Fri, 10 Nov 2023 09:26:39 +0100
Subject: [PATCH 14/30] adds updated model paths, adds eval to sft scripts

---
 recipes/zephyr-7b/dpo/config_full.yaml | 2 +-
 recipes/zephyr-7b/dpo/config_lora.yaml | 3 +--
 recipes/zephyr-7b/sft/config_full.yaml | 1 +
 recipes/zephyr-7b/sft/config_lora.yaml | 1 +
 4 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/recipes/zephyr-7b/dpo/config_full.yaml b/recipes/zephyr-7b/dpo/config_full.yaml
index 433c36d..5110f59 100644
--- a/recipes/zephyr-7b/dpo/config_full.yaml
+++ b/recipes/zephyr-7b/dpo/config_full.yaml
@@ -1,5 +1,5 @@
 # Model arguments
-model_name_or_path: lewtun/zephyr-7b-sft
+model_name_or_path: alignment-handbook/zephyr-7b-sft-full
 
 # Data training arguments
 # For definitions, see: src/h4/training/config.py
diff --git a/recipes/zephyr-7b/dpo/config_lora.yaml b/recipes/zephyr-7b/dpo/config_lora.yaml
index d3f00b9..9100d1a 100644
--- a/recipes/zephyr-7b/dpo/config_lora.yaml
+++ b/recipes/zephyr-7b/dpo/config_lora.yaml
@@ -1,6 +1,5 @@
 # Model arguments
-model_name_or_path: HuggingFaceH4/mistral-7b-ift
-model_revision: v14.0
+model_name_or_path: alignment-handbook/zephyr-7b-sft-lora
 torch_dtype: auto
 
 # LORA
diff --git a/recipes/zephyr-7b/sft/config_full.yaml b/recipes/zephyr-7b/sft/config_full.yaml
index 020df7f..d73c443 100644
--- a/recipes/zephyr-7b/sft/config_full.yaml
+++ b/recipes/zephyr-7b/sft/config_full.yaml
@@ -14,6 +14,7 @@ preprocessing_num_workers: 12
 
 # SFT trainer config
 bf16: true
+do_eval: true
 evaluation_strategy: epoch
 gradient_accumulation_steps: 2
 gradient_checkpointing: true
diff --git a/recipes/zephyr-7b/sft/config_lora.yaml b/recipes/zephyr-7b/sft/config_lora.yaml
index 2d488d8..c4288bb 100644
--- a/recipes/zephyr-7b/sft/config_lora.yaml
+++ b/recipes/zephyr-7b/sft/config_lora.yaml
@@ -24,6 +24,7 @@ preprocessing_num_workers: 12
 
 # SFT trainer config
 bf16: true
+do_eval: true
 evaluation_strategy: epoch
 gradient_accumulation_steps: 128
 ddp_find_unused_parameters: true

From 0f0b61c0967c471892900c4d88d29365be66414b Mon Sep 17 00:00:00 2001
From: edbeeching <edbeeching@gmail.com>
Date: Fri, 10 Nov 2023 09:30:54 +0100
Subject: [PATCH 15/30] ups lora bs x grad_acc to 64

---
 recipes/zephyr-7b/dpo/config_lora.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/zephyr-7b/dpo/config_lora.yaml b/recipes/zephyr-7b/dpo/config_lora.yaml
index 9100d1a..2e9bb28 100644
--- a/recipes/zephyr-7b/dpo/config_lora.yaml
+++ b/recipes/zephyr-7b/dpo/config_lora.yaml
@@ -29,7 +29,7 @@ do_eval: true
 ddp_find_unused_parameters: true
 evaluation_strategy: epoch
 eval_steps: 100
-gradient_accumulation_steps: 16
+gradient_accumulation_steps: 32
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: False

From 610a1a2de42d50cfa8fb276070aa7132f5a64900 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 08:37:53 +0000
Subject: [PATCH 16/30] Add unit tests for data mixer

---
 .github/workflows/tests.yml | 31 +++++++++++++++
 Makefile                    | 10 ++---
 README.md                   |  4 +-
 setup.py                    |  2 +
 tests/test_data.py          | 79 +++++++++++++++++++++++++++++++++++++
 5 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/tests.yml
 create mode 100644 tests/test_data.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..990795f
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,31 @@
+name: Tests
+
+on:
+  push:
+    branches:
+      - main
+      - v*-release
+  pull_request:
+    branches:
+      - main
+
+jobs:
+
+  unit-tests:
+    name: Run unit tests
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Setup Python environment
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.10.10
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install ".[dev, torch]"
+      - name: Run unit tests
+        run: HF_TOKEN=$HF_TOKEN pytest -sv tests/
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 2d82400..e2e4d2c 100644
--- a/Makefile
+++ b/Makefile
@@ -6,13 +6,13 @@ export PYTHONPATH = src
 check_dirs := src tests scripts
 
 style:
-	python -m black --line-length 119 --target-version py310 $(check_dirs) setup.py
-	python -m isort $(check_dirs) setup.py
+	black --line-length 119 --target-version py310 $(check_dirs) setup.py
+	isort $(check_dirs) setup.py
 
 quality:
-	python -m black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
-	python -m isort --check-only $(check_dirs) setup.py
-	python -m flake8 --max-line-length 119 $(check_dirs) setup.py
+	black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
+	isort --check-only $(check_dirs) setup.py
+	flake8 --max-line-length 119 $(check_dirs) setup.py
 
 
 # Release stuff
diff --git a/README.md b/README.md
index 31ae641..952b7b3 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ To run the code in this project, first create a Python virtual environment using
 conda create -n handbook python=3.10 && conda activate handbook
 ```
 
-Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this hardware-dependent, we
+Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this is hardware-dependent, we
 direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
 
 You can then install the remaining package dependencies as follows:
@@ -63,6 +63,8 @@ Finally, install Git LFS so that you can push models to the Hugging Face Hub:
 sudo apt-get install git-lfs
 ```
 
+You can now check out the `scripts` and `recipes` directories for instructions on how to train some models 🪁!
+
 ## Project structure
 
 ```
diff --git a/setup.py b/setup.py
index d71b591..1d2af6a 100644
--- a/setup.py
+++ b/setup.py
@@ -62,6 +62,7 @@ _deps = [
     "safetensors>=0.3.3",
     "scipy",
     "tensorboard",
+    "torch==2.1.0",
     "transformers==4.35.0",
     "trl==0.7.4",
     "tqdm>=4.64.1",
@@ -82,6 +83,7 @@ def deps_list(*pkgs):
 
 extras = {}
 extras["tests"] = deps_list("pytest", "parameterized")
+extras["torch"] = deps_list("torch")
 extras["quality"] = deps_list("black", "isort", "flake8")
 extras["docs"] = deps_list("hf-doc-builder")
 extras["dev"] = extras["docs"] + extras["quality"] + extras["tests"]
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000..63e390a
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,79 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import pytest
+
+from alignment import DataArguments, get_datasets
+
+
+class GetDatasetsTest(unittest.TestCase):
+    """Each of these test datasets has 100 examples"""
+
+    def test_loading_data_args(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.5,
+            "HuggingFaceH4/testing_self_instruct_small": 0.3,
+            "HuggingFaceH4/testing_codealpaca_small": 0.2,
+        }
+        data_args = DataArguments(dataset_mixer=dataset_mixer)
+        datasets = get_datasets(data_args)
+        self.assertEqual(len(datasets["train"]), 100)
+        self.assertEqual(len(datasets["test"]), 300)
+
+    def test_loading_data_dict(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.5,
+            "HuggingFaceH4/testing_self_instruct_small": 0.3,
+            "HuggingFaceH4/testing_codealpaca_small": 0.2,
+        }
+        datasets = get_datasets(dataset_mixer)
+        self.assertEqual(len(datasets["train"]), 100)
+        self.assertEqual(len(datasets["test"]), 300)
+
+    def test_loading_with_unit_fractions(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 1.0,
+            "HuggingFaceH4/testing_self_instruct_small": 1.0,
+            "HuggingFaceH4/testing_codealpaca_small": 1.0,
+        }
+        datasets = get_datasets(dataset_mixer)
+        self.assertEqual(len(datasets["train"]), 300)
+        self.assertEqual(len(datasets["test"]), 300)
+
+    def test_loading_with_fractions_greater_than_unity(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.7,
+            "HuggingFaceH4/testing_self_instruct_small": 0.4,
+        }
+        datasets = get_datasets(dataset_mixer)
+        self.assertEqual(len(datasets["train"]), 70 + 40)
+        self.assertEqual(len(datasets["test"]), 200)
+
+    def test_loading_fails_with_negative_fractions(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.7,
+            "HuggingFaceH4/testing_self_instruct_small": -0.3,
+        }
+        with pytest.raises(ValueError, match=r"Dataset fractions cannot be negative."):
+            get_datasets(dataset_mixer)
+
+    def test_loading_single_split_with_unit_fractions(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 1.0,
+        }
+        datasets = get_datasets(dataset_mixer, splits=["test"])
+        self.assertEqual(len(datasets["test"]), 100)
+        self.assertRaises(KeyError, lambda: datasets["train"])

From 0af801199336e59a338c6f2d54c8f4a7bffb9eb1 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 08:41:17 +0000
Subject: [PATCH 17/30] Bump deps

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 1d2af6a..cdb4e1b 100644
--- a/setup.py
+++ b/setup.py
@@ -42,7 +42,7 @@ if stale_egg_info.exists():
 #   * If a dependency is fast-moving (e.g. transformers), pin to the exact version
 _deps = [
     "accelerate==0.23.0",
-    "bitsandbytes==0.41.1",
+    "bitsandbytes==0.41.2.post2",
     "black==23.1.0",
     "datasets==2.14.6",
     "deepspeed==0.12.2",
@@ -56,7 +56,7 @@ _deps = [
     "numpy>=1.24.2",
     "packaging>=23.0",
     "parameterized>=0.9.0",
-    "peft==0.6.0",
+    "peft==0.6.1",
     "protobuf<=3.20.2",  # Needed to avoid conflicts with `transformers`
     "pytest",
     "safetensors>=0.3.3",

From 2ed5a45d25ecfce9e7f7399db13e9809f4a80251 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 09:42:15 +0000
Subject: [PATCH 18/30] Add model utils tests

---
 src/alignment/configs.py     |  4 +-
 src/alignment/data.py        |  4 +-
 src/alignment/model_utils.py |  4 +-
 tests/test_data.py           | 71 ++++++++++++++++++++++++++++++++-
 tests/test_model_utils.py    | 76 ++++++++++++++++++++++++++++++++++++
 5 files changed, 152 insertions(+), 7 deletions(-)
 create mode 100644 tests/test_model_utils.py

diff --git a/src/alignment/configs.py b/src/alignment/configs.py
index 890790e..d785e16 100644
--- a/src/alignment/configs.py
+++ b/src/alignment/configs.py
@@ -17,7 +17,7 @@ import dataclasses
 import os
 import sys
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, NewType, Optional, Tuple, Union
+from typing import Any, Dict, List, NewType, Optional, Tuple
 
 import transformers
 from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, HfArgumentParser
@@ -87,7 +87,7 @@ class H4ArgumentParser(HfArgumentParser):
 
         return outputs
 
-    def parse(self) -> Union[DataClassType, Tuple[DataClassType]]:
+    def parse(self) -> DataClassType | Tuple[DataClassType]:
         if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"):
             # If we pass only one argument to the script and it's the path to a YAML file,
             # let's parse it to get our arguments.
diff --git a/src/alignment/data.py b/src/alignment/data.py
index de25e61..2150095 100644
--- a/src/alignment/data.py
+++ b/src/alignment/data.py
@@ -1,5 +1,5 @@
 import re
-from typing import List, Literal, Optional, Union
+from typing import List, Literal, Optional
 
 from datasets import DatasetDict, concatenate_datasets, load_dataset
 
@@ -67,7 +67,7 @@ def apply_chat_template(
 
 
 def get_datasets(
-    data_config: Union[DataArguments, dict],
+    data_config: DataArguments | dict,
     splits: List[str] = ["train", "test"],
     shuffle: bool = True,
 ) -> DatasetDict:
diff --git a/src/alignment/model_utils.py b/src/alignment/model_utils.py
index a01f1b4..f237745 100644
--- a/src/alignment/model_utils.py
+++ b/src/alignment/model_utils.py
@@ -1,4 +1,4 @@
-from typing import Dict, Union
+from typing import Dict
 
 import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizer
@@ -62,7 +62,7 @@ def get_tokenizer(model_args: ModelArguments, data_args: DataArguments) -> PreTr
     return tokenizer
 
 
-def get_peft_config(model_args: ModelArguments) -> Union[PeftConfig, None]:
+def get_peft_config(model_args: ModelArguments) -> PeftConfig | None:
     if model_args.use_peft is False:
         return None
 
diff --git a/tests/test_data.py b/tests/test_data.py
index 63e390a..6a6b63c 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -15,8 +15,9 @@
 import unittest
 
 import pytest
+from datasets import Dataset
 
-from alignment import DataArguments, get_datasets
+from alignment import DataArguments, ModelArguments, apply_chat_template, get_datasets, get_tokenizer
 
 
 class GetDatasetsTest(unittest.TestCase):
@@ -77,3 +78,71 @@ class GetDatasetsTest(unittest.TestCase):
         datasets = get_datasets(dataset_mixer, splits=["test"])
         self.assertEqual(len(datasets["test"]), 100)
         self.assertRaises(KeyError, lambda: datasets["train"])
+
+
+class ApplyChatTemplateTest(unittest.TestCase):
+    def setUp(self):
+        model_args = ModelArguments(model_name_or_path="HuggingFaceH4/zephyr-7b-alpha")
+        data_args = DataArguments()
+        self.tokenizer = get_tokenizer(model_args, data_args)
+        self.dataset = Dataset.from_dict(
+            {
+                "prompt": ["Hello!"],
+                "messages": [[{"role": "user", "content": "Hello!"}, {"role": "assistant", "content": "Bonjour!"}]],
+                "chosen": [[{"role": "user", "content": "Hello!"}, {"role": "assistant", "content": "Bonjour!"}]],
+                "rejected": [[{"role": "user", "content": "Hello!"}, {"role": "assistant", "content": "Hola!"}]],
+            }
+        )
+
+    def test_sft(self):
+        dataset = self.dataset.map(
+            apply_chat_template,
+            fn_kwargs={"tokenizer": self.tokenizer, "task": "sft"},
+            remove_columns=self.dataset.column_names,
+        )
+        self.assertDictEqual(
+            dataset[0],
+            {"text": "<|system|>\n</s>\n<|user|>\nHello!</s>\n<|assistant|>\nBonjour!</s>\n"},
+        )
+
+    def test_generation(self):
+        # Remove last turn from messages
+        dataset = self.dataset.map(lambda x: {"messages": x["messages"][:-1]})
+        dataset = dataset.map(
+            apply_chat_template,
+            fn_kwargs={"tokenizer": self.tokenizer, "task": "generation"},
+            remove_columns=self.dataset.column_names,
+        )
+        self.assertDictEqual(
+            dataset[0],
+            {"text": "<|system|>\n</s>\n<|user|>\nHello!</s>\n<|assistant|>\n"},
+        )
+
+    def test_rm(self):
+        dataset = self.dataset.map(
+            apply_chat_template,
+            fn_kwargs={"tokenizer": self.tokenizer, "task": "rm"},
+            remove_columns=self.dataset.column_names,
+        )
+        self.assertDictEqual(
+            dataset[0],
+            {
+                "text_chosen": "<|system|>\n</s>\n<|user|>\nHello!</s>\n<|assistant|>\nBonjour!</s>\n",
+                "text_rejected": "<|system|>\n</s>\n<|user|>\nHello!</s>\n<|assistant|>\nHola!</s>\n",
+            },
+        )
+
+    def test_dpo(self):
+        dataset = self.dataset.map(
+            apply_chat_template,
+            fn_kwargs={"tokenizer": self.tokenizer, "task": "dpo"},
+            remove_columns=self.dataset.column_names,
+        )
+        self.assertDictEqual(
+            dataset[0],
+            {
+                "text_prompt": "<|system|>\n</s>\n<|user|>\nHello!</s>\n<|assistant|>\n",
+                "text_chosen": "Bonjour!</s>\n",
+                "text_rejected": "Hola!</s>\n",
+            },
+        )
diff --git a/tests/test_model_utils.py b/tests/test_model_utils.py
new file mode 100644
index 0000000..d20afec
--- /dev/null
+++ b/tests/test_model_utils.py
@@ -0,0 +1,76 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import torch
+
+from alignment import DataArguments, ModelArguments, get_peft_config, get_quantization_config, get_tokenizer
+from alignment.data import DEFAULT_CHAT_TEMPLATE
+
+
+class GetQuantizationConfigTest(unittest.TestCase):
+    def test_4bit(self):
+        model_args = ModelArguments(load_in_4bit=True)
+        quantization_config = get_quantization_config(model_args)
+        self.assertTrue(quantization_config.load_in_4bit)
+        self.assertEqual(quantization_config.bnb_4bit_compute_dtype, torch.float16)
+        self.assertEqual(quantization_config.bnb_4bit_quant_type, "nf4")
+        self.assertFalse(quantization_config.bnb_4bit_use_double_quant)
+
+    def test_8bit(self):
+        model_args = ModelArguments(load_in_8bit=True)
+        quantization_config = get_quantization_config(model_args)
+        self.assertTrue(quantization_config.load_in_8bit)
+
+    def test_no_quantization(self):
+        model_args = ModelArguments()
+        quantization_config = get_quantization_config(model_args)
+        self.assertIsNone(quantization_config)
+
+
+class GetTokenizerTest(unittest.TestCase):
+    def setUp(self) -> None:
+        self.model_args = ModelArguments(model_name_or_path="HuggingFaceH4/zephyr-7b-alpha")
+
+    def test_right_truncation_side(self):
+        tokenizer = get_tokenizer(self.model_args, DataArguments(truncation_side="right"))
+        self.assertEqual(tokenizer.truncation_side, "right")
+
+    def test_left_truncation_side(self):
+        tokenizer = get_tokenizer(self.model_args, DataArguments(truncation_side="left"))
+        self.assertEqual(tokenizer.truncation_side, "left")
+
+    def test_default_chat_template(self):
+        tokenizer = get_tokenizer(self.model_args, DataArguments())
+        self.assertEqual(tokenizer.chat_template, DEFAULT_CHAT_TEMPLATE)
+
+    def test_chatml_chat_template(self):
+        chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+        tokenizer = get_tokenizer(self.model_args, DataArguments(chat_template=chat_template))
+        self.assertEqual(tokenizer.chat_template, chat_template)
+
+
+class GetPeftConfigTest(unittest.TestCase):
+    def test_peft_config(self):
+        model_args = ModelArguments(use_peft=True, lora_r=42, lora_alpha=0.66, lora_dropout=0.99)
+        peft_config = get_peft_config(model_args)
+        self.assertEqual(peft_config.r, 42)
+        self.assertEqual(peft_config.lora_alpha, 0.66)
+        self.assertEqual(peft_config.lora_dropout, 0.99)
+
+    def test_no_peft_config(self):
+        model_args = ModelArguments(use_peft=False)
+        peft_config = get_peft_config(model_args)
+        self.assertIsNone(peft_config)

From b1b0c1c8c0c4d247bd5ab9562947e53e7100507f Mon Sep 17 00:00:00 2001
From: lewtun <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 10:44:06 +0100
Subject: [PATCH 19/30] Update setup.py

Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com>
---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index cdb4e1b..8f78891 100644
--- a/setup.py
+++ b/setup.py
@@ -65,6 +65,7 @@ _deps = [
     "torch==2.1.0",
     "transformers==4.35.0",
     "trl==0.7.4",
+    "jinja2=>3.0.0",
     "tqdm>=4.64.1",
 ]
 

From 8699f47bf307eba402244de31da8b5ad5b64ab6c Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 09:45:22 +0000
Subject: [PATCH 20/30] Add jinja2 to req deps

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 8f78891..3c1c9e7 100644
--- a/setup.py
+++ b/setup.py
@@ -98,6 +98,7 @@ install_requires = [
     deps["datasets"],
     deps["deepspeed"],
     deps["huggingface-hub"],
+    deps["jinja2"],
     deps["ninja"],
     deps["numpy"],
     deps["packaging"],  # utilities from PyPA to e.g., compare versions

From 64f1834e017f33fc0b56770a9fe60f265ff4b6b1 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 10:00:05 +0000
Subject: [PATCH 21/30] Add config tests

---
 tests/fixtures/config_dpo_full.yaml | 37 +++++++++++++++++++++++++
 tests/fixtures/config_sft_full.yaml | 41 +++++++++++++++++++++++++++
 tests/test_configs.py               | 43 +++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+)
 create mode 100644 tests/fixtures/config_dpo_full.yaml
 create mode 100644 tests/fixtures/config_sft_full.yaml
 create mode 100644 tests/test_configs.py

diff --git a/tests/fixtures/config_dpo_full.yaml b/tests/fixtures/config_dpo_full.yaml
new file mode 100644
index 0000000..5110f59
--- /dev/null
+++ b/tests/fixtures/config_dpo_full.yaml
@@ -0,0 +1,37 @@
+# Model arguments
+model_name_or_path: alignment-handbook/zephyr-7b-sft-full
+
+# Data training arguments
+# For definitions, see: src/alignment/configs.py
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.1
+do_eval: true
+evaluation_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+hub_model_id: zephyr-7b-dpo-full
+learning_rate: 5.0e-7
+log_level: info
+logging_steps: 10
+lr_scheduler_type: linear
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 3
+optim: rmsprop
+output_dir: data/zephyr-7b-dpo-full
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 4
+push_to_hub: true
+save_strategy: "no"
+save_total_limit: null
+seed: 42
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/tests/fixtures/config_sft_full.yaml b/tests/fixtures/config_sft_full.yaml
new file mode 100644
index 0000000..81720e9
--- /dev/null
+++ b/tests/fixtures/config_sft_full.yaml
@@ -0,0 +1,41 @@
+# Model arguments
+model_name_or_path: mistralai/Mistral-7B-v0.1
+model_revision: main
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: epoch
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+hub_model_id: zephyr-7b-sft-full
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5  
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 1
+output_dir: data/zephyr-7b-sft-full
+overwrite_output_dir: true
+per_device_eval_batch_size: 16
+per_device_train_batch_size: 32
+push_to_hub: True
+remove_unused_columns: true
+report_to:
+- tensorboard
+save_strategy: "no"
+save_total_limit: null
+seed: 42
\ No newline at end of file
diff --git a/tests/test_configs.py b/tests/test_configs.py
new file mode 100644
index 0000000..2a4a7a6
--- /dev/null
+++ b/tests/test_configs.py
@@ -0,0 +1,43 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+from alignment import DataArguments, H4ArgumentParser, ModelArguments, SFTConfig
+
+
+class H4ArgumentParserTest(unittest.TestCase):
+    def setUp(self):
+        self.parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))
+        self.yaml_file_path = "tests/fixtures/config_sft_full.yaml"
+
+    def test_load_yaml(self):
+        model_args, data_args, training_args = self.parser.parse_yaml_file(os.path.abspath(self.yaml_file_path))
+        self.assertEqual(model_args.model_name_or_path, "mistralai/Mistral-7B-v0.1")
+
+    def test_load_yaml_and_args(self):
+        command_line_args = [
+            "--model_name_or_path=test",
+            "--use_peft=true",
+            "--lora_r=16",
+            "--lora_dropout=0.5",
+        ]
+        model_args, data_args, training_args = self.parser.parse_yaml_and_args(
+            os.path.abspath(self.yaml_file_path), command_line_args
+        )
+        self.assertEqual(model_args.model_name_or_path, "test")
+        self.assertEqual(model_args.use_peft, True)
+        self.assertEqual(model_args.lora_r, 16)
+        self.assertEqual(model_args.lora_dropout, 0.5)

From a0b8d4942497db40020d59aa9edc52f78c94021a Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 10:49:13 +0000
Subject: [PATCH 22/30] Rename recipe

---
 recipes/{zephyr-7b => zephyr-7b-beta}/README.md  |  8 ++++----
 .../dpo/config_full.yaml                         |  0
 .../dpo/config_lora.yaml                         |  1 -
 .../sft/config_full.yaml                         |  0
 .../sft/config_lora.yaml                         |  6 ++----
 scripts/README.md                                | 16 +++++++++-------
 6 files changed, 15 insertions(+), 16 deletions(-)
 rename recipes/{zephyr-7b => zephyr-7b-beta}/README.md (85%)
 rename recipes/{zephyr-7b => zephyr-7b-beta}/dpo/config_full.yaml (100%)
 rename recipes/{zephyr-7b => zephyr-7b-beta}/dpo/config_lora.yaml (96%)
 rename recipes/{zephyr-7b => zephyr-7b-beta}/sft/config_full.yaml (100%)
 rename recipes/{zephyr-7b => zephyr-7b-beta}/sft/config_lora.yaml (94%)

diff --git a/recipes/zephyr-7b/README.md b/recipes/zephyr-7b-beta/README.md
similarity index 85%
rename from recipes/zephyr-7b/README.md
rename to recipes/zephyr-7b-beta/README.md
index 4e783ad..3960ba5 100644
--- a/recipes/zephyr-7b/README.md
+++ b/recipes/zephyr-7b-beta/README.md
@@ -12,18 +12,18 @@ See below for commands to train these models using either DeepSpeed ZeRO-3 or Lo
 
 ```shell
 # Step 1 - SFT
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b/sft/config_full.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_full.yaml
 
 # Step 2 - DPO
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b/dpo/config_full.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_full.yaml
 ```
 
 ## LoRA training examples
 
 ```shell
 # Step 1 - SFT
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b/sft/config_lora.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_lora.yaml
 
 # Step 2 - DPO
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b/dpo/config_lora.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_lora.yaml
 ```
\ No newline at end of file
diff --git a/recipes/zephyr-7b/dpo/config_full.yaml b/recipes/zephyr-7b-beta/dpo/config_full.yaml
similarity index 100%
rename from recipes/zephyr-7b/dpo/config_full.yaml
rename to recipes/zephyr-7b-beta/dpo/config_full.yaml
diff --git a/recipes/zephyr-7b/dpo/config_lora.yaml b/recipes/zephyr-7b-beta/dpo/config_lora.yaml
similarity index 96%
rename from recipes/zephyr-7b/dpo/config_lora.yaml
rename to recipes/zephyr-7b-beta/dpo/config_lora.yaml
index 2e9bb28..38ac36a 100644
--- a/recipes/zephyr-7b/dpo/config_lora.yaml
+++ b/recipes/zephyr-7b-beta/dpo/config_lora.yaml
@@ -26,7 +26,6 @@ preprocessing_num_workers: 12
 bf16: true
 beta: 0.1
 do_eval: true
-ddp_find_unused_parameters: true
 evaluation_strategy: epoch
 eval_steps: 100
 gradient_accumulation_steps: 32
diff --git a/recipes/zephyr-7b/sft/config_full.yaml b/recipes/zephyr-7b-beta/sft/config_full.yaml
similarity index 100%
rename from recipes/zephyr-7b/sft/config_full.yaml
rename to recipes/zephyr-7b-beta/sft/config_full.yaml
diff --git a/recipes/zephyr-7b/sft/config_lora.yaml b/recipes/zephyr-7b-beta/sft/config_lora.yaml
similarity index 94%
rename from recipes/zephyr-7b/sft/config_lora.yaml
rename to recipes/zephyr-7b-beta/sft/config_lora.yaml
index c4288bb..3106a01 100644
--- a/recipes/zephyr-7b/sft/config_lora.yaml
+++ b/recipes/zephyr-7b-beta/sft/config_lora.yaml
@@ -3,7 +3,7 @@ model_name_or_path: mistralai/Mistral-7B-v0.1
 torch_dtype: auto
 use_flash_attention_2: true
 
-# LORA
+# LoRA arguments
 use_peft: true
 lora_r: 64
 lora_alpha: 16
@@ -27,7 +27,6 @@ bf16: true
 do_eval: true
 evaluation_strategy: epoch
 gradient_accumulation_steps: 128
-ddp_find_unused_parameters: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: False
@@ -50,5 +49,4 @@ report_to:
 - tensorboard
 save_strategy: "no"
 save_total_limit: null
-seed: 42
-tf32: true
\ No newline at end of file
+seed: 42
\ No newline at end of file
diff --git a/scripts/README.md b/scripts/README.md
index dcaa063..6af993c 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -6,7 +6,7 @@
 In the handbook, we provide three main ways to align LLMs for chat:
 
 - Full fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on an 8 x A100 (80GB) node).
-- LoRA fine-tuning on a single consumer 24GB GPU (tested on a RTX 4090).
+- LoRA or QLoRA fine-tuning on a single consumer 24GB GPU (tested on a RTX 4090).
 - LoRA fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on a 2 x A100s (80GB)).
 
 In practice, we find comparable performance for both full and LoRA fine-tuning, with the latter having the advantage of producing small adapter weights that are fast to upload and download from the Hugging Face Hub. Here's the two general commands to fine-tune your models:
@@ -22,16 +22,18 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml --num_processes={num_gpus} scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml
 ```
 
-Here `{task}` refers to type of training you wish to run (SFT, DPO, etc), while `{model_name}` refers to the choice of recipe in the `recipes/` directory. For example, to replicate Zephyr 7B you can run:
+Here `{task}` refers to type of training you wish to run (SFT, DPO, etc), while `{model_name}` refers to the choice of recipe in the `recipes` directory. For example, to replicate Zephyr-7B-β you can run:
 
 ```shell
 # Step 1 - train SFT policy
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b/sft/config_full.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_full.yaml
 
 # Step 2 - align with DPO
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b/dpo/config_full.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_full.yaml
 ```
 
+** 💡 Tip:** If you scale the up/down the number of GPUs, we recommend also scaling up the per-device batch size or number of gradient accumulation steps to keep the global batch size constant (and thus replicate our results).
+
 By default, these scripts will push each model to your Hugging Face Hub username, i.e. `{username}/{model_name}-{task}`. You can override the parameters in each YAML config by appending them to the command as follows:
 
 ```shell
@@ -56,9 +58,9 @@ sbatch --job-name=handbook_{task} --nodes=1 recipes/launch.slurm {model_name} {t
 Here `{model_name}` and `{task}` are defined as above, while `{precision}` refers to the type of training (full vs LoRA) and `{accelerator}` refers to the choice of 🤗 Accelerate config in `recipes/accelerate_configs`. Here's a concrete example to run SFT on 1 node of 8 GPUs:
 
 ```shell
-sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
+sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b-beta sft full deepspeed_zero3
 ```
 
-You can scale the number of nodes by increasing the `--nodes` flag; in these cases we recommend also scaling up the per-device batch size or number of gradient accumulation steps to keep the global batch size constant (and thus replicate our results).
+You can scale the number of nodes by increasing the `--nodes` flag.
 
-**Note:** the configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
\ No newline at end of file
+**⚠️ Note:** the configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
\ No newline at end of file

From 551f901f95937c61a6cc2d1fd4bfa71ff5145973 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 11:02:44 +0000
Subject: [PATCH 23/30] Fix dep

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 3c1c9e7..1dc950f 100644
--- a/setup.py
+++ b/setup.py
@@ -65,7 +65,7 @@ _deps = [
     "torch==2.1.0",
     "transformers==4.35.0",
     "trl==0.7.4",
-    "jinja2=>3.0.0",
+    "jinja2>=3.0.0",
     "tqdm>=4.64.1",
 ]
 

From edf67d1d937e524b7383057ae176bd88126d895b Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 11:15:45 +0000
Subject: [PATCH 24/30] Tweaks

---
 README.md                                   | 2 +-
 recipes/zephyr-7b-beta/dpo/config_lora.yaml | 2 +-
 recipes/zephyr-7b-beta/sft/config_full.yaml | 2 +-
 recipes/zephyr-7b-beta/sft/config_lora.yaml | 2 +-
 scripts/README.md                           | 2 +-
 tests/fixtures/config_sft_full.yaml         | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 952b7b3..02e82fa 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ The Alignment Handbook aims to fill that gap by providing the community with a s
 
 ## News 🗞️
 
-* November 10, 2023: We release all the training code to replicate Zephyr 7B 🪁!
+* November 10, 2023: We release all the training code to replicate Zephyr-7b-β 🪁!
 
 ## Links 🔗
 
diff --git a/recipes/zephyr-7b-beta/dpo/config_lora.yaml b/recipes/zephyr-7b-beta/dpo/config_lora.yaml
index 38ac36a..6d04714 100644
--- a/recipes/zephyr-7b-beta/dpo/config_lora.yaml
+++ b/recipes/zephyr-7b-beta/dpo/config_lora.yaml
@@ -2,7 +2,7 @@
 model_name_or_path: alignment-handbook/zephyr-7b-sft-lora
 torch_dtype: auto
 
-# LORA
+# LoRA arguments
 use_peft: true
 lora_r: 64
 lora_alpha: 16
diff --git a/recipes/zephyr-7b-beta/sft/config_full.yaml b/recipes/zephyr-7b-beta/sft/config_full.yaml
index d73c443..4d8d2d1 100644
--- a/recipes/zephyr-7b-beta/sft/config_full.yaml
+++ b/recipes/zephyr-7b-beta/sft/config_full.yaml
@@ -32,7 +32,7 @@ output_dir: data/zephyr-7b-sft-full
 overwrite_output_dir: true
 per_device_eval_batch_size: 16
 per_device_train_batch_size: 32
-push_to_hub: True
+push_to_hub: true
 remove_unused_columns: true
 report_to:
 - tensorboard
diff --git a/recipes/zephyr-7b-beta/sft/config_lora.yaml b/recipes/zephyr-7b-beta/sft/config_lora.yaml
index 3106a01..3eb2d0e 100644
--- a/recipes/zephyr-7b-beta/sft/config_lora.yaml
+++ b/recipes/zephyr-7b-beta/sft/config_lora.yaml
@@ -44,7 +44,7 @@ output_dir: data/zephyr-7b-sft-lora
 overwrite_output_dir: true
 per_device_eval_batch_size: 8
 per_device_train_batch_size: 4
-push_to_hub: True
+push_to_hub: true
 report_to:
 - tensorboard
 save_strategy: "no"
diff --git a/scripts/README.md b/scripts/README.md
index 6af993c..d3065e2 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -32,7 +32,7 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_full.yaml
 ```
 
-** 💡 Tip:** If you scale the up/down the number of GPUs, we recommend also scaling up the per-device batch size or number of gradient accumulation steps to keep the global batch size constant (and thus replicate our results).
+**💡 Tip:** If you scale up/down the number of GPUs, we recommend also scaling up the per-device batch size or number of gradient accumulation steps to keep the global batch size constant (and thus replicate our results).
 
 By default, these scripts will push each model to your Hugging Face Hub username, i.e. `{username}/{model_name}-{task}`. You can override the parameters in each YAML config by appending them to the command as follows:
 
diff --git a/tests/fixtures/config_sft_full.yaml b/tests/fixtures/config_sft_full.yaml
index 81720e9..adf13da 100644
--- a/tests/fixtures/config_sft_full.yaml
+++ b/tests/fixtures/config_sft_full.yaml
@@ -32,7 +32,7 @@ output_dir: data/zephyr-7b-sft-full
 overwrite_output_dir: true
 per_device_eval_batch_size: 16
 per_device_train_batch_size: 32
-push_to_hub: True
+push_to_hub: true
 remove_unused_columns: true
 report_to:
 - tensorboard

From 54185783e0c7b0117cca75278be4060c6d5ca38c Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 11:20:39 +0000
Subject: [PATCH 25/30] Remove QLoRa for now

---
 scripts/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/README.md b/scripts/README.md
index d3065e2..62c2be9 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -6,7 +6,7 @@
 In the handbook, we provide three main ways to align LLMs for chat:
 
 - Full fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on an 8 x A100 (80GB) node).
-- LoRA or QLoRA fine-tuning on a single consumer 24GB GPU (tested on a RTX 4090).
+- LoRA fine-tuning on a single consumer 24GB GPU (tested on a RTX 4090).
 - LoRA fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on a 2 x A100s (80GB)).
 
 In practice, we find comparable performance for both full and LoRA fine-tuning, with the latter having the advantage of producing small adapter weights that are fast to upload and download from the Hugging Face Hub. Here's the two general commands to fine-tune your models:

From 7f1a14e0d4aec284a6afdfffe5d7fd23989748e7 Mon Sep 17 00:00:00 2001
From: edbeeching <edbeeching@gmail.com>
Date: Fri, 10 Nov 2023 14:15:44 +0100
Subject: [PATCH 26/30] adds auto adapter merge to dpo script

---
 scripts/run_dpo.py           | 31 ++++++++++++++++++++++++++++---
 src/alignment/model_utils.py |  6 +++++-
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py
index 704ce1f..2a428ac 100644
--- a/scripts/run_dpo.py
+++ b/scripts/run_dpo.py
@@ -34,7 +34,9 @@ from alignment import (
     get_tokenizer,
 )
 from trl import DPOTrainer
-
+from transformers import AutoModelForCausalLM
+from alignment.model_utils import is_adapter_model
+from peft import PeftConfig, PeftModel
 
 logger = logging.getLogger(__name__)
 
@@ -112,8 +114,31 @@ def main():
         device_map=get_kbit_device_map(),
         quantization_config=get_quantization_config(model_args),
     )
+    
+    model = model_args.model_name_or_path
+    if is_adapter_model(model, model_args.model_revision):
+        # load the model, merge the adapter weights and unload the adapter
+        # Note: to run QLora, you will need to merge the based model separately as the merged model in 16bit
+        logger.info(f"Merging peft adapters for {model_args.model_name_or_path=}")
+        
+        peft_config = PeftConfig.from_pretrained(model_args.model_name_or_path, revision=model_args.model_revision)
+        
+        model_kwargs = dict(
+            revision=model_args.base_model_revision,
+            trust_remote_code=model_args.trust_remote_code,
+            use_flash_attention_2=model_args.use_flash_attention_2,
+            torch_dtype=torch_dtype,
+            use_cache=False if training_args.gradient_checkpointing else True,
+        )
+        base_model = AutoModelForCausalLM.from_pretrained(
+            peft_config.base_model_name_or_path, **model_kwargs,
+        )
+        model = PeftModel.from_pretrained(base_model, model_args.model_name_or_path, revision=model_args.model_revision)
+        model.eval()
+        model = model.merge_and_unload()
+        model_kwargs = None
 
-    ref_model = model_args.model_name_or_path
+    ref_model = model
     ref_model_kwargs = model_kwargs
 
     if model_args.use_peft is True:
@@ -124,7 +149,7 @@ def main():
     # Instantiate DPO trainer
     #########################
     dpo_trainer = DPOTrainer(
-        model_args.model_name_or_path,
+        model,
         ref_model,
         model_init_kwargs=model_kwargs,
         ref_model_init_kwargs=ref_model_kwargs,
diff --git a/src/alignment/model_utils.py b/src/alignment/model_utils.py
index f237745..d35e037 100644
--- a/src/alignment/model_utils.py
+++ b/src/alignment/model_utils.py
@@ -5,7 +5,7 @@ from transformers import AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizer
 
 from accelerate import Accelerator
 from peft import LoraConfig, PeftConfig
-
+from huggingface_hub import list_repo_files
 from .configs import DataArguments, ModelArguments
 from .data import DEFAULT_CHAT_TEMPLATE
 
@@ -77,3 +77,7 @@ def get_peft_config(model_args: ModelArguments) -> PeftConfig | None:
     )
 
     return peft_config
+
+def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
+    repo_files = list_repo_files(model_name_or_path, revision=revision)
+    return "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files
\ No newline at end of file

From e2e8ab945db9ca680e833e169d7d8ba00923cd33 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 13:38:45 +0000
Subject: [PATCH 27/30] Refactor imports

---
 recipes/zephyr-7b-beta/dpo/config_lora.yaml |  2 +-
 recipes/zephyr-7b-beta/sft/config_lora.yaml |  2 +-
 scripts/README.md                           | 43 +++++++++++++++++++--
 scripts/run_dpo.py                          | 21 +++++-----
 src/alignment/__init__.py                   |  2 +-
 src/alignment/model_utils.py                |  6 ++-
 6 files changed, 58 insertions(+), 18 deletions(-)

diff --git a/recipes/zephyr-7b-beta/dpo/config_lora.yaml b/recipes/zephyr-7b-beta/dpo/config_lora.yaml
index 6d04714..afeb8b4 100644
--- a/recipes/zephyr-7b-beta/dpo/config_lora.yaml
+++ b/recipes/zephyr-7b-beta/dpo/config_lora.yaml
@@ -31,7 +31,7 @@ eval_steps: 100
 gradient_accumulation_steps: 32
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
-  use_reentrant: False
+  use_reentrant: false
 hub_model_id: zephyr-7b-dpo-lora
 learning_rate: 5.0e-7
 log_level: info
diff --git a/recipes/zephyr-7b-beta/sft/config_lora.yaml b/recipes/zephyr-7b-beta/sft/config_lora.yaml
index 3eb2d0e..286166a 100644
--- a/recipes/zephyr-7b-beta/sft/config_lora.yaml
+++ b/recipes/zephyr-7b-beta/sft/config_lora.yaml
@@ -29,7 +29,7 @@ evaluation_strategy: epoch
 gradient_accumulation_steps: 128
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
-  use_reentrant: False
+  use_reentrant: false
 hub_model_id: zephyr-7b-sft-lora
 hub_strategy: every_save
 learning_rate: 2.0e-05
diff --git a/scripts/README.md b/scripts/README.md
index 62c2be9..10dc3fb 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -1,7 +1,7 @@
 
-## Scripts to Train and Evaluate Chat Models
+# Scripts to Train and Evaluate Chat Models
 
-### Fine-tuning
+## Fine-tuning
 
 In the handbook, we provide three main ways to align LLMs for chat:
 
@@ -47,7 +47,7 @@ By default all training metrics are logged with TensorBoard. If you have a [Weig
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml --report_to=wandb
 ```
 
-### Launching jobs on a Slurm cluster
+## Launching jobs on a Slurm cluster
 
 If you have access to a Slurm cluster, we provide a `recipes/launch.slurm` script that will automatically queue training jobs for you. Here's how you can use it:
 
@@ -63,4 +63,39 @@ sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b-beta sft
 
 You can scale the number of nodes by increasing the `--nodes` flag.
 
-**⚠️ Note:** the configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
\ No newline at end of file
+**⚠️ Note:** the configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
+
+## Fine-tuning on custom datasets
+
+Under the hood, each training script uses the `get_datasets()` function which allows one to easily combing multiple datasets with varying proportions. For instance, this is how one can specify multiple datasets and which splits to combine in one of the YAML configs:
+
+```yaml
+dataset_mixer:
+    dataset_1: 0.5  # Use 50% of the training examples
+    dataset_2: 0.66 # Use 66% of the training examples
+    dataset_3: 0.10 # Use 10% of the training examples
+dataset_splits:
+- train_x           # Samples from each train split
+- test_x            # Test splits aren't sampled
+```
+
+If you want to fine-tune on your own datasets, the main thing to keep in mind is how the chat templates are applied to the dataset blend. Since each task (SFT, DPO, etc), requires a different format, we assume the datasets have the following columns:
+
+**SFT**
+
+* `messages`: A list of `dicts` in the form `{"role": "{role}", "content": {content}}`. 
+* See [ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) for an example.
+
+**DPO**
+
+* `chosen`: A list of `dicts` in the form `{"role": "{role}", "content": {content}}` corresponding to the preferred dialogue.
+* `rejected`: A list of `dicts` in the form `{"role": "{role}", "content": {content}}` corresponding to the dispreferred dialogue.
+* See [ultrafeedback_binarized](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized) for an example.
+
+We also find it useful to include dedicated splits per task in our datasets, so e.g. we have:
+
+* `{train,test}_sft`: Splits for SFT training.
+* `{train,test}_gen`: Splits for generation ranking like rejection sampling or PPO.
+* `{train,test}_prefs`: Splits for preference modelling, like reward modelling or DPO.
+
+If you format your dataset in the same way, our training scripts should work out of the box!
\ No newline at end of file
diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py
index 2a428ac..11f9a2f 100644
--- a/scripts/run_dpo.py
+++ b/scripts/run_dpo.py
@@ -18,7 +18,7 @@ import sys
 
 import torch
 import transformers
-from transformers import set_seed
+from transformers import AutoModelForCausalLM, set_seed
 
 from accelerate import Accelerator
 from alignment import (
@@ -32,11 +32,11 @@ from alignment import (
     get_peft_config,
     get_quantization_config,
     get_tokenizer,
+    is_adapter_model,
 )
-from trl import DPOTrainer
-from transformers import AutoModelForCausalLM
-from alignment.model_utils import is_adapter_model
 from peft import PeftConfig, PeftModel
+from trl import DPOTrainer
+
 
 logger = logging.getLogger(__name__)
 
@@ -114,15 +114,15 @@ def main():
         device_map=get_kbit_device_map(),
         quantization_config=get_quantization_config(model_args),
     )
-    
+
     model = model_args.model_name_or_path
     if is_adapter_model(model, model_args.model_revision):
         # load the model, merge the adapter weights and unload the adapter
         # Note: to run QLora, you will need to merge the based model separately as the merged model in 16bit
         logger.info(f"Merging peft adapters for {model_args.model_name_or_path=}")
-        
+
         peft_config = PeftConfig.from_pretrained(model_args.model_name_or_path, revision=model_args.model_revision)
-        
+
         model_kwargs = dict(
             revision=model_args.base_model_revision,
             trust_remote_code=model_args.trust_remote_code,
@@ -131,9 +131,12 @@ def main():
             use_cache=False if training_args.gradient_checkpointing else True,
         )
         base_model = AutoModelForCausalLM.from_pretrained(
-            peft_config.base_model_name_or_path, **model_kwargs,
+            peft_config.base_model_name_or_path,
+            **model_kwargs,
+        )
+        model = PeftModel.from_pretrained(
+            base_model, model_args.model_name_or_path, revision=model_args.model_revision
         )
-        model = PeftModel.from_pretrained(base_model, model_args.model_name_or_path, revision=model_args.model_revision)
         model.eval()
         model = model.merge_and_unload()
         model_kwargs = None
diff --git a/src/alignment/__init__.py b/src/alignment/__init__.py
index 3080b6a..17f4767 100644
--- a/src/alignment/__init__.py
+++ b/src/alignment/__init__.py
@@ -2,4 +2,4 @@ __version__ = "0.2.0.dev0"
 
 from .configs import DataArguments, DPOConfig, H4ArgumentParser, ModelArguments, SFTConfig
 from .data import apply_chat_template, get_datasets
-from .model_utils import get_kbit_device_map, get_peft_config, get_quantization_config, get_tokenizer
+from .model_utils import get_kbit_device_map, get_peft_config, get_quantization_config, get_tokenizer, is_adapter_model
diff --git a/src/alignment/model_utils.py b/src/alignment/model_utils.py
index d35e037..cbaad69 100644
--- a/src/alignment/model_utils.py
+++ b/src/alignment/model_utils.py
@@ -4,8 +4,9 @@ import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizer
 
 from accelerate import Accelerator
-from peft import LoraConfig, PeftConfig
 from huggingface_hub import list_repo_files
+from peft import LoraConfig, PeftConfig
+
 from .configs import DataArguments, ModelArguments
 from .data import DEFAULT_CHAT_TEMPLATE
 
@@ -78,6 +79,7 @@ def get_peft_config(model_args: ModelArguments) -> PeftConfig | None:
 
     return peft_config
 
+
 def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
     repo_files = list_repo_files(model_name_or_path, revision=revision)
-    return "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files
\ No newline at end of file
+    return "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files

From 5a630a198909cf5ac55e88f75de0986521a6f511 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 13:57:52 +0000
Subject: [PATCH 28/30] Add QLoRA command

---
 scripts/README.md | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/scripts/README.md b/scripts/README.md
index 10dc3fb..5730bc9 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -6,7 +6,7 @@
 In the handbook, we provide three main ways to align LLMs for chat:
 
 - Full fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on an 8 x A100 (80GB) node).
-- LoRA fine-tuning on a single consumer 24GB GPU (tested on a RTX 4090).
+- LoRA or QLoRA fine-tuning on a single consumer 24GB GPU (tested on a RTX 4090).
 - LoRA fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on a 2 x A100s (80GB)).
 
 In practice, we find comparable performance for both full and LoRA fine-tuning, with the latter having the advantage of producing small adapter weights that are fast to upload and download from the Hugging Face Hub. Here's the two general commands to fine-tune your models:
@@ -18,6 +18,9 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con
 # LoRA training on a single GPU
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml
 
+# QLoRA 4-bit training on a single GPU
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml --load_in_4bit=true
+
 # LoRA training with ZeRO-3 on two or more GPUs
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml --num_processes={num_gpus} scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml
 ```
@@ -55,17 +58,18 @@ If you have access to a Slurm cluster, we provide a `recipes/launch.slurm` scrip
 sbatch --job-name=handbook_{task} --nodes=1 recipes/launch.slurm {model_name} {task} {precision} {accelerator}
 ```
 
-Here `{model_name}` and `{task}` are defined as above, while `{precision}` refers to the type of training (full vs LoRA) and `{accelerator}` refers to the choice of 🤗 Accelerate config in `recipes/accelerate_configs`. Here's a concrete example to run SFT on 1 node of 8 GPUs:
+Here `{model_name}` and `{task}` are defined as above, while `{precision}` refers to the type of training (full vs LoRA) and `{accelerator}` refers to the choice of 🤗 Accelerate config in `recipes/accelerate_configs`. If you wish to override the default config parameters, you can provide them by appending a space-separated string like `'--arg1=value1 --arg2=value2'`. Here's a concrete example to run SFT on 1 node of 8 GPUs:
 
 ```shell
-sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b-beta sft full deepspeed_zero3
+# Launch on Slurm and override default hyperparameters
+sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b-beta sft full deepspeed_zero3 '--per_device_train_batch_size=42 --num_train_epochs=5'
 ```
 
 You can scale the number of nodes by increasing the `--nodes` flag.
 
 **⚠️ Note:** the configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
 
-## Fine-tuning on custom datasets
+## Fine-tuning on your datasets
 
 Under the hood, each training script uses the `get_datasets()` function which allows one to easily combing multiple datasets with varying proportions. For instance, this is how one can specify multiple datasets and which splits to combine in one of the YAML configs:
 
@@ -75,8 +79,8 @@ datasets_mixer:
     dataset_2: 0.66 # Use 66% of the training examples
     dataset_3: 0.10 # Use 10% of the training examples
 dataset_splits:
-- train_x           # Samples from each train split
-- test_x            # Test splits aren't sampled
+- train_xxx         # The training splits to mix
+- test_xxx          # The test splits to mix
 ```
 
 If you want to fine-tune on your own datasets, the main thing to keep in mind is how the chat templates are applied to the dataset blend. Since each task (SFT, DPO, etc), requires a different format, we assume the datasets have the following columns:

From f5e70fbf9e84cb9fe66f7a1d7193307d20ce051c Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 14:47:54 +0000
Subject: [PATCH 29/30] Add licenses

---
 src/alignment/data.py        | 16 ++++++++++++++++
 src/alignment/model_utils.py | 16 ++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/src/alignment/data.py b/src/alignment/data.py
index 2150095..838169a 100644
--- a/src/alignment/data.py
+++ b/src/alignment/data.py
@@ -1,3 +1,19 @@
+# coding=utf-8
+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import re
 from typing import List, Literal, Optional
 
diff --git a/src/alignment/model_utils.py b/src/alignment/model_utils.py
index cbaad69..9463f2e 100644
--- a/src/alignment/model_utils.py
+++ b/src/alignment/model_utils.py
@@ -1,3 +1,19 @@
+# coding=utf-8
+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Dict
 
 import torch

From 363e29ff9596c300c455c613b6087e5a40d31b18 Mon Sep 17 00:00:00 2001
From: lewtun <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 15:49:00 +0100
Subject: [PATCH 30/30] Apply suggestions from code review

Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com>
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 02e82fa..10977db 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,7 @@ python -m pip install .
 ```
 
 You will also need Flash Attention 2 installed, which can be done by running:
+_Note: If your machine has less than 96GB of RAM and many CPU cores, reduce `MAX_JOBS`, e.g. `MAX_JOBS=4 pip install flash-attn --no-build-isolation`._
 
 ```shell
 python -m pip install flash-attn --no-build-isolation