Update Zephyr configs to account for UltraFeedback & TRL fixes (#88)

* Add files * Add checkpointing * Add checkpointing to SFT * Add loss type * Fix setup| * Clean SFT * Add lora config * Rename config * Remove max eval samples * Add kwargs to push to hub * Add DPO configs * Fix dpo configs * Extend chat template test to multi-turn * Add warmup * Refactor * Fix LoRA -> QLoRA * Fix configs * Specify chat template * Add sample logging * Fix push to hub hanging * Add reentrant * Fix quality * Add transformer logging * Tweak grad acc * Add null type * Add doc
2026-06-27 17:47:01 +08:00 · 2024-01-10 17:42:24 +11:00
parent c69ae4b8a5
commit f0ffa0d7a6
17 changed files with 266 additions and 187 deletions
@@ -2,7 +2,7 @@
 #SBATCH --ntasks-per-node=1
 #SBATCH --exclusive
 #SBATCH --gres=gpu:8
-#SBATCH --partition=production-cluster  # Adjust this for your cluster
+#SBATCH --partition=hopper-prod  # Adjust this for your cluster
 #SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this for your cluster
 #SBATCH --err=/fsx/h4/logs/%x-%j.err    # Adjust this for your cluster

@@ -47,7 +47,7 @@ export CMD=" \
    scripts/run_$TASK.py $CONFIG_FILE $OPTIONAL_ARGS
    "

-export LAUNCHER="ACCELERATE_LOG_LEVEL=info accelerate launch \
+export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
    --config_file recipes/accelerate_configs/$ACCELERATOR.yaml  \
    --gradient_accumulation_steps $GRAD_ACC_STEPS \
    --num_machines $NUM_NODES \
@@ -71,14 +71,7 @@ export NCCL_ASYNC_ERROR_HANDLING=1

 # Specific configuration optimized for the Hugging Face Compute Cluster
 # Be ye warned this may not work on other clusters!
-export NCCL_PROTO=simple
-export RDMAV_FORK_SAFE=1
-export FI_EFA_FORK_SAFE=1
-export FI_EFA_USE_DEVICE_RDMA=1
-export FI_PROVIDER=efa
-export FI_LOG_LEVEL=1
-export NCCL_IB_DISABLE=1
-export NCCL_SOCKET_IFNAME=ens
+module load cuda/12.1

 # srun error handling:
 # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
@@ -3,12 +3,15 @@

 As described in the Zephyr [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:

-1. Apply SFT to fine-tune Mistral 7B on a filtered version of the UltraChat dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)). The result is an SFT model like [`zephyr-7b-sft-full`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) or [`zephyr-7b-sft-lora`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-lora).
-2. Align the SFT model to AI feedback via DPO on a preprocessed version of the UltraFeedback dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)). The result is an DPO model like [`zephyr-7b-dpo-full`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-full) or [`zephyr-7b-dpo-lora`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-lora).
+1. Apply SFT to fine-tune Mistral 7B on a filtered version of the UltraChat dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)). The result is an SFT model like [`zephyr-7b-sft-full`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) or [`zephyr-7b-sft-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-qlora).
+2. Align the SFT model to AI feedback via DPO on a preprocessed version of the UltraFeedback dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)). The result is a DPO model like [`zephyr-7b-dpo-full`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-full) or [`zephyr-7b-dpo-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-qlora).
+
+**Note:** after the release of Zephyr, the team at [Argilla](https://argilla.io) found that the source UltraFeedback dataset had a few thousand incorrect preference labels from GPT-4. Additionally, TRL's `SFTTrainer` had a bug in the learning rate scheduler which terminated training early. Accounting for these changes led us to find a better set of hyperparameters than those described in the technical report. In particular, for DPO training we found that training for 1 epoch with `beta=0.01` was sufficient to achieve comparable performance to `zephyr-7b-beta` (vs. 3 epochs with `beta=0.1`).

 See below for commands to train these models using either DeepSpeed ZeRO-3 or QLoRA.

 ## Full training examples
+
 You will require 8 GPUs (80GB of VRAM) to train the full model.
 ```shell
 # Step 1 - SFT
@@ -18,12 +21,12 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_full.yaml
 ```

-## LoRA training examples
+## QLoRA training examples

 ```shell
 # Step 1 - SFT
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_lora.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_qlora.yaml --load_in_4bit=true

 # Step 2 - DPO
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_lora.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_qlora.yaml
 ```
@@ -1,5 +1,6 @@
 # Model arguments
 model_name_or_path: alignment-handbook/zephyr-7b-sft-full
+torch_dtype: null

 # Data training arguments
 # For definitions, see: src/h4/training/config.py
@@ -12,26 +13,29 @@ preprocessing_num_workers: 12

 # DPOTrainer arguments
 bf16: true
-beta: 0.1
+beta: 0.01
 do_eval: true
 evaluation_strategy: steps
 eval_steps: 100
-gradient_accumulation_steps: 1
+gradient_accumulation_steps: 2
 gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
 hub_model_id: zephyr-7b-dpo-full
 learning_rate: 5.0e-7
 log_level: info
 logging_steps: 10
-lr_scheduler_type: linear
+lr_scheduler_type: cosine
 max_length: 1024
 max_prompt_length: 512
-num_train_epochs: 3
-optim: rmsprop
+num_train_epochs: 1
+optim: adamw_torch
 output_dir: data/zephyr-7b-dpo-full
 per_device_train_batch_size: 8
-per_device_eval_batch_size: 4
+per_device_eval_batch_size: 8
 push_to_hub: true
-save_strategy: "no"
-save_total_limit: null
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
 seed: 42
 warmup_ratio: 0.1
@@ -1,51 +0,0 @@
-# Model arguments
-model_name_or_path: alignment-handbook/zephyr-7b-sft-lora
-torch_dtype: auto
-
-# LoRA arguments
-use_peft: true
-lora_r: 64
-lora_alpha: 16
-lora_dropout: 0.1
-lora_target_modules:
- q_proj
- k_proj
- v_proj
- o_proj
-
-# Data training arguments
-
-dataset_mixer:
-  HuggingFaceH4/ultrafeedback_binarized: 1.0
-dataset_splits:
- train_prefs
- test_prefs
-preprocessing_num_workers: 12
-
-# DPOTrainer arguments
-bf16: true
-beta: 0.1
-do_eval: true
-evaluation_strategy: epoch
-eval_steps: 100
-gradient_accumulation_steps: 32
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-hub_model_id: zephyr-7b-dpo-lora
-learning_rate: 5.0e-7
-log_level: info
-logging_steps: 10
-lr_scheduler_type: linear
-max_length: 1024
-max_prompt_length: 512
-num_train_epochs: 3
-optim: rmsprop
-output_dir: data/zephyr-7b-dpo-lora # It is handy to append `hub_model_revision` to keep track of your local experiments
-per_device_train_batch_size: 2
-per_device_eval_batch_size: 4
-push_to_hub: true
-save_strategy: "no"
-save_total_limit: null
-seed: 42
-warmup_ratio: 0.1
@@ -0,0 +1,56 @@
+# Model arguments
+model_name_or_path: alignment-handbook/zephyr-7b-sft-qlora
+torch_dtype: float16
+
+# LoRA arguments
+use_peft: true
+load_in_4bit: true
+lora_r: 16
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+- q_proj
+- k_proj
+- v_proj
+- o_proj
+- gate_proj
+- up_proj
+- down_proj
+
+# Data training arguments
+
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.01
+do_eval: true
+evaluation_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: zephyr-7b-dpo-qlora
+learning_rate: 5.0e-6
+log_level: info
+logging_steps: 10
+lr_scheduler_type: cosine
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 1
+optim: paged_adamw_32bit
+output_dir: data/zephyr-7b-dpo-qlora # It is handy to append `hub_model_revision` to keep track of your local experiments
+per_device_train_batch_size: 4
+per_device_eval_batch_size: 8
+push_to_hub: true
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
@@ -5,6 +5,7 @@ torch_dtype: bfloat16
 use_flash_attention_2: true

 # Data training arguments
+chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
 dataset_mixer:
  HuggingFaceH4/ultrachat_200k: 1.0
 dataset_splits:
@@ -16,8 +17,10 @@ preprocessing_num_workers: 12
 bf16: true
 do_eval: true
 evaluation_strategy: epoch
-gradient_accumulation_steps: 2
+gradient_accumulation_steps: 1
 gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
 hub_model_id: zephyr-7b-sft-full
 hub_strategy: every_save
 learning_rate: 2.0e-05
@@ -30,13 +33,14 @@ max_steps: -1
 num_train_epochs: 1
 output_dir: data/zephyr-7b-sft-full
 overwrite_output_dir: true
-per_device_eval_batch_size: 16
-per_device_train_batch_size: 32
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 16
 push_to_hub: true
 remove_unused_columns: true
 report_to:
 - tensorboard
-save_strategy: "no"
-save_total_limit: null
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
 seed: 42
-tf32: true
+warmup_ratio: 0.1
@@ -1,20 +1,25 @@
 # Model arguments
 model_name_or_path: mistralai/Mistral-7B-v0.1
-torch_dtype: auto
-use_flash_attention_2: true
+model_revision: main
+torch_dtype: float16

 # LoRA arguments
+load_in_4bit: true
 use_peft: true
-lora_r: 64
+lora_r: 16
 lora_alpha: 16
-lora_dropout: 0.1
+lora_dropout: 0.05
 lora_target_modules:
 - q_proj
 - k_proj
 - v_proj
 - o_proj
+- gate_proj
+- up_proj
+- down_proj

 # Data training arguments
+chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
 dataset_mixer:
  HuggingFaceH4/ultrachat_200k: 1.0
 dataset_splits:
@@ -26,13 +31,13 @@ preprocessing_num_workers: 12
 bf16: true
 do_eval: true
 evaluation_strategy: epoch
-gradient_accumulation_steps: 128
+gradient_accumulation_steps: 2
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
-hub_model_id: zephyr-7b-sft-lora
+hub_model_id: zephyr-7b-sft-qlora
 hub_strategy: every_save
-learning_rate: 2.0e-05
+learning_rate: 2.0e-04
 log_level: info
 logging_steps: 5  
 logging_strategy: steps
@@ -40,14 +45,15 @@ lr_scheduler_type: cosine
 max_seq_length: 2048
 max_steps: -1
 num_train_epochs: 1
-output_dir: data/zephyr-7b-sft-lora
+output_dir: data/zephyr-7b-sft-qlora
 overwrite_output_dir: true
 per_device_eval_batch_size: 8
 per_device_train_batch_size: 4
 push_to_hub: true
 report_to:
 - tensorboard
-save_strategy: "no"
-save_total_limit: null
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
 seed: 42
-warmup_ratio: 0.1
+warmup_ratio: 0.1