From 756bb76d226c7873edbbe841c045f49d0d40255d Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Thu, 9 Nov 2023 14:09:52 +0000
Subject: [PATCH] Fix Slurm opts

---
 recipes/launch.slurm        | 10 ++++----
 recipes/zephyr-7b/README.md | 48 ++++++++++++-------------------------
 scripts/README.md           | 16 +++++++++----
 3 files changed, 31 insertions(+), 43 deletions(-)

diff --git a/recipes/launch.slurm b/recipes/launch.slurm
index da0b176..a5f4359 100644
--- a/recipes/launch.slurm
+++ b/recipes/launch.slurm
@@ -2,9 +2,9 @@
 #SBATCH --ntasks-per-node=1
 #SBATCH --exclusive
 #SBATCH --gres=gpu:8
-#SBATCH --partition=production-cluster
-#SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this to your cluster
-#SBATCH --err=/fsx/h4/logs/%x-%j.err    # Adjust this to your cluster
+#SBATCH --partition=production-cluster  # Adjust this for your cluster
+#SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this for your cluster
+#SBATCH --err=/fsx/h4/logs/%x-%j.err    # Adjust this for your cluster
 
 set -x -e
 
@@ -44,7 +44,7 @@ MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
 
 export CMD=" \
-    scripts/run_$TASK.py $CONFIG_FILE
+    scripts/run_$TASK.py $CONFIG_FILE $OPTIONAL_ARGS
     "
 
 export LAUNCHER="ACCELERATE_LOG_LEVEL=info accelerate launch \
@@ -69,7 +69,7 @@ export NCCL_ASYNC_ERROR_HANDLING=1
 # export NCCL_NSOCKS_PERTHREAD=1
 # export CUDA_LAUNCH_BLOCKING=1
 
-# Specific configuration for the Hugging Face Compute Cluster
+# Specific configuration optimized for the Hugging Face Compute Cluster
 # Be ye warned this may not work on other clusters!
 export NCCL_PROTO=simple
 export RDMAV_FORK_SAFE=1
diff --git a/recipes/zephyr-7b/README.md b/recipes/zephyr-7b/README.md
index fcafde9..02746a1 100644
--- a/recipes/zephyr-7b/README.md
+++ b/recipes/zephyr-7b/README.md
@@ -1,47 +1,29 @@
 
-# Instructions
+# Instructions to Replicate Zephyr 7B
 
-In the handbook, for each training step we provide two sets of recipes:
-- Full training on a multi-GPU machine (tested on a 8xA100 node), using slurm to queue jobs.
-- LORA taining on a single consumer 24GB GPU (tested on a RTX 4090)
+As described in the Zephyr [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
 
-The full training jobs will scale to a multi-node setting, by adjusting `--nodes=1`, we advise adjusting the gradient accumulation steps and/or batch size if you want to replicate our results.
+1. Apply SFT to fine-tune Mistral 7B on the UltraChat dataset.
+2. Align the SFT model to AI feedback via DPO on the UltraFeedback dataset.
 
+See below for commands to train these models using either DeepSpeed ZeRO-3 or LoRA.
 
 ## Full training examples 
 
-### SFT
-
 ```shell
-sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b/sft/config_full.yaml
+
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b/dpo/config_full.yaml
 ```
 
-## DPO
-```shell
-sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
-```
-
-## LORA training examples
-
-### SFT
-```shell
-# locally on 1 gpu
-accelerate launch scripts/run_sft.py recipes/zephyr-7b/sft/config_lora.yaml
-```
+## LoRA training examples
 
 ```shell
-# on a cluster
-sbatch --job-name=handbook_sft_lora --nodes=1 recipes/launch.slurm zephyr-7b sft lora multi_gpu "--gradient_accumulation_steps=16"
-```
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b/sft/config_lora.yaml
 
-### SFT
-
-```shell
-# locally on 1 gpu
-accelerate launch scripts/run_dpo.py recipes/zephyr-7b/dpo/config_lora.yaml
-```
-
-```shell
-# on a cluster
-sbatch --job-name=handbook_dpo_lora --nodes=1 recipes/launch.slurm zephyr-7b dpo lora multi_gpu "--gradient_accumulation_steps=8"
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b/dpo/config_lora.yaml
 ```
\ No newline at end of file
diff --git a/scripts/README.md b/scripts/README.md
index a388ff9..dcaa063 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -3,10 +3,11 @@
 
 ### Fine-tuning
 
-In the handbook, we provide two main ways to align LLMs for chat:
+In the handbook, we provide three main ways to align LLMs for chat:
 
-- Full fine-tuning on a multi-GPU machine (tested on an 8 x A100 (80GB) node).
+- Full fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on an 8 x A100 (80GB) node).
 - LoRA fine-tuning on a single consumer 24GB GPU (tested on a RTX 4090).
+- LoRA fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on 2 x A100 (80GB) GPUs).
 
 In practice, we find comparable performance for both full and LoRA fine-tuning, with the latter having the advantage of producing small adapter weights that are fast to upload and download from the Hugging Face Hub. Here's the two general commands to fine-tune your models:
 
@@ -14,8 +15,11 @@ In practice, we find comparable performance for both full and LoRA fine-tuning,
 # Full training with ZeRO-3 on 8 GPUs
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml
 
-# LoRA training on single GPU
+# LoRA training on a single GPU
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml
+
+# LoRA training with ZeRO-3 on two or more GPUs
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml --num_processes={num_gpus} scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml
 ```
 
 Here `{task}` refers to type of training you wish to run (SFT, DPO, etc), while `{model_name}` refers to the choice of recipe in the `recipes/` directory. For example, to replicate Zephyr 7B you can run:
@@ -28,7 +32,7 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b/dpo/config_full.yaml
 ```
 
-You can override the parameters in each YAML config by appending them to the command as follows:
+By default, these scripts will push each model to the Hugging Face Hub under your username, i.e. `{username}/{model_name}-{task}`. You can override the parameters in each YAML config by appending them to the command as follows:
 
 ```shell
 # Change batch size, number of epochs etc
@@ -41,7 +45,7 @@ By default all training metrics are logged with TensorBoard. If you have a [Weig
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml --report_to=wandb
 ```
 
-#### Launching jobs on a Slurm cluster
+### Launching jobs on a Slurm cluster
 
 If you have access to a Slurm cluster, we provide a `recipes/launch.slurm` script that will automatically queue training jobs for you. Here's how you can use it:
 
@@ -55,4 +59,6 @@ Here `{model_name}` and `{task}` are defined as above, while `{precision}` refer
 sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
 ```
 
+You can scale the number of nodes by increasing the value of the `--nodes` flag; because every additional node increases the global batch size, we recommend scaling down the per-device batch size or the number of gradient accumulation steps accordingly to keep the global batch size constant (and thus replicate our results).
+
 **Note:** the configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
\ No newline at end of file