diff --git a/.gitignore b/.gitignore
index 2dc53ca..d4de801 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,3 +158,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+# Temp checkpoint folder
+data/
\ No newline at end of file
diff --git a/README.md b/README.md
index af1a003..6f7c61e 100644
--- a/README.md
+++ b/README.md
@@ -32,13 +32,19 @@ To run the code in this project, first create a Python virtual environment using
 conda create -n handbook python=3.10 && conda activate handbook
 ```
 
-Next, install PyTorch v2.1.0. Since this hardware-dependent, we
-direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
+Next, install PyTorch `v2.0.1` - the precise version is important for reproducibility! Since this is hardware-dependent, we
+direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/previous-versions/#v201).
 
-Once PyTorch is installed, you can install the remaining package dependencies as follows:
+You can then install the remaining package dependencies as follows:
 
 ```shell
-pip install .
+python -m pip install .
+```
+
+You will also need Flash Attention 2 installed, which can be done by running:
+
+```shell
+python -m pip install flash-attn==2.3.0 --no-build-isolation
 ```
 
 Next, log into your Hugging Face account as follows:
diff --git a/recipes/accelerate_configs/deepspeed_zero1.yaml b/recipes/accelerate_configs/deepspeed_zero1.yaml
new file mode 100644
index 0000000..1dfeda0
--- /dev/null
+++ b/recipes/accelerate_configs/deepspeed_zero1.yaml
@@ -0,0 +1,19 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  zero3_init_flag: false
+  zero_stage: 1
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/recipes/accelerate_configs/deepspeed_zero2.yaml b/recipes/accelerate_configs/deepspeed_zero2.yaml
new file mode 100644
index 0000000..0777900
--- /dev/null
+++ b/recipes/accelerate_configs/deepspeed_zero2.yaml
@@ -0,0 +1,21 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/recipes/accelerate_configs/deepspeed_zero3.yaml b/recipes/accelerate_configs/deepspeed_zero3.yaml
new file mode 100644
index 0000000..b5a1201
--- /dev/null
+++ b/recipes/accelerate_configs/deepspeed_zero3.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/recipes/accelerate_configs/multi_gpu.yaml b/recipes/accelerate_configs/multi_gpu.yaml
new file mode 100644
index 0000000..4f05571
--- /dev/null
+++ b/recipes/accelerate_configs/multi_gpu.yaml
@@ -0,0 +1,16 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/recipes/zephyr-7b/sft/config.yaml b/recipes/zephyr-7b/sft/config.yaml
new file mode 100644
index 0000000..e7e786a
--- /dev/null
+++ b/recipes/zephyr-7b/sft/config.yaml
@@ -0,0 +1,41 @@
+# Model arguments
+model_name_or_path: mistralai/Mistral-7B-v0.1
+model_revision: main
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+evaluation_strategy: epoch
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5  
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 1
+output_dir: data/zephyr-7b-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 16
+per_device_train_batch_size: 32
+push_to_hub: True
+push_to_hub_model_id: zephyr-7b-sft
+remove_unused_columns: true
+report_to:
+- tensorboard
+save_strategy: "no"
+save_total_limit: null
+seed: 42
+tf32: true
\ No newline at end of file
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..de1c109
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,6 @@
+
+## Supervised Fine-Tuning (SFT)
+
+```
+
+```
\ No newline at end of file
diff --git a/scripts/run_sft.py b/scripts/run_sft.py
index 580916b..116a654 100644
--- a/scripts/run_sft.py
+++ b/scripts/run_sft.py
@@ -18,7 +18,6 @@ Supervised fine-tuning script for decoder language models.
 """
 
 import logging
-import math
 import random
 import sys
 
@@ -52,6 +51,7 @@ def main():
 
     # Set seed for reproducibility
     set_seed(training_args.seed)
+
     accelerator = Accelerator()
 
     ###############
@@ -72,7 +72,7 @@ def main():
     # Log on each process a small summary
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.bf16}"
+        + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
     logger.info(f"Model parameters {model_args}")
     logger.info(f"Data parameters {data_args}")
@@ -132,8 +132,8 @@ def main():
         model=model_args.model_name_or_path,
         model_init_kwargs=model_kwargs,
         args=training_args,
-        train_dataset=raw_datasets["train"] if training_args.do_train else None,
-        eval_dataset=raw_datasets["test"] if training_args.do_eval else None,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
         dataset_text_field="text",
         max_seq_length=training_args.max_seq_length,
         tokenizer=tokenizer,
@@ -144,17 +144,14 @@ def main():
     ###############
     # Training loop
     ###############
-    if training_args.do_train:
-        logger.info("*** Train ***")
-        train_result = trainer.train()
-        metrics = train_result.metrics
-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+    logger.info("*** Train ***")
+    train_result = trainer.train()
+    metrics = train_result.metrics
+    max_train_samples = data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+    metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+    trainer.log_metrics("train", metrics)
+    trainer.save_metrics("train", metrics)
+    trainer.save_state()
 
     ##########
     # Evaluate
@@ -164,11 +161,6 @@ def main():
         metrics = trainer.evaluate()
         max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
         metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
-        try:
-            perplexity = math.exp(metrics["eval_loss"])
-        except OverflowError:
-            perplexity = float("inf")
-        metrics["perplexity"] = perplexity
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
 
@@ -181,14 +173,18 @@ def main():
 
     # Save everything else on main process
     if accelerator.is_main_process:
-        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
-        kwargs["dataset"] = list(data_args.dataset_mixer.keys())
+        kwargs = {
+            "finetuned_from": model_args.model_name_or_path,
+            "dataset": list(data_args.dataset_mixer.keys()),
+            "tags": ["alignment-handbook"],
+        }
         trainer.create_model_card(**kwargs)
         # Restore k,v cache for fast inference
         trainer.model.config.use_cache = True
         trainer.model.config.save_pretrained(training_args.output_dir)
 
-        if training_args.push_to_hub:
+        if training_args.push_to_hub is True:
+            logger.info("Pushing to hub...")
             trainer.push_to_hub()
 
     accelerator.wait_for_everyone()
diff --git a/setup.py b/setup.py
index dad5141..c6d4f21 100644
--- a/setup.py
+++ b/setup.py
@@ -44,7 +44,7 @@ _deps = [
     "accelerate==0.23.0",
     "bitsandbytes==0.41.1",
     "black==23.1.0",
-    "datasets==2.12.0",
+    "datasets==2.14.6",
     "deepspeed==0.12.2",
     "einops>=0.6.1",
     "evaluate==0.4.0",
@@ -60,8 +60,8 @@ _deps = [
     "protobuf<=3.20.2",  # Needed to avoid conflicts with `transformers`
     "pytest",
     "safetensors>=0.3.3",
+    "scipy",
     "tensorboard",
-    "torch==2.0.1",
     "transformers==4.35.0",
     "trl==0.7.4",  # TODO bump to next release, added for NEFTune
     "tqdm>=4.64.1",
@@ -82,7 +82,6 @@ def deps_list(*pkgs):
 
 extras = {}
 extras["tests"] = deps_list("pytest", "parameterized")
-extras["torch"] = deps_list("torch")
 extras["quality"] = deps_list("black", "isort", "flake8")
 extras["docs"] = deps_list("hf-doc-builder")
 extras["dev"] = extras["docs"] + extras["quality"] + extras["tests"]
@@ -102,6 +101,7 @@ install_requires = [
     deps["peft"],
     deps["protobuf"],
     deps["safetensors"],
+    deps["scipy"],
     deps["tensorboard"],
     deps["tqdm"],  # progress bars in model download and training scripts
     deps["transformers"],