diff --git a/.gitignore b/.gitignore index 2dc53ca..d4de801 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,6 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ + +# Temp checkpoint folder +data/ \ No newline at end of file diff --git a/README.md b/README.md index af1a003..6f7c61e 100644 --- a/README.md +++ b/README.md @@ -32,13 +32,19 @@ To run the code in this project, first create a Python virtual environment using conda create -n handbook python=3.10 && conda activate handbook ``` -Next, install PyTorch v2.1.0. Since this hardware-dependent, we -direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/). +Next, install PyTorch `v2.0.1` - the precise version is important for reproducibility! Since this is hardware-dependent, we +direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/previous-versions/#v201). -Once PyTorch is installed, you can install the remaining package dependencies as follows: +You can then install the remaining package dependencies as follows: ```shell -pip install . +python -m pip install . 
+``` + +You will also need Flash Attention 2 installed, which can be done by running: + +```shell +python -m pip install flash-attn==2.3.0 --no-build-isolation ``` Next, log into your Hugging Face account as follows: diff --git a/recipes/accelerate_configs/deepspeed_zero1.yaml b/recipes/accelerate_configs/deepspeed_zero1.yaml new file mode 100644 index 0000000..1dfeda0 --- /dev/null +++ b/recipes/accelerate_configs/deepspeed_zero1.yaml @@ -0,0 +1,19 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_multinode_launcher: standard + zero3_init_flag: false + zero_stage: 1 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/recipes/accelerate_configs/deepspeed_zero2.yaml b/recipes/accelerate_configs/deepspeed_zero2.yaml new file mode 100644 index 0000000..0777900 --- /dev/null +++ b/recipes/accelerate_configs/deepspeed_zero2.yaml @@ -0,0 +1,21 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_multinode_launcher: standard + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: false + zero_stage: 2 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/recipes/accelerate_configs/deepspeed_zero3.yaml b/recipes/accelerate_configs/deepspeed_zero3.yaml new file mode 100644 index 0000000..b5a1201 --- /dev/null +++ b/recipes/accelerate_configs/deepspeed_zero3.yaml @@ -0,0 +1,22 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_multinode_launcher: standard + offload_optimizer_device: none + 
offload_param_device: none + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/recipes/accelerate_configs/multi_gpu.yaml b/recipes/accelerate_configs/multi_gpu.yaml new file mode 100644 index 0000000..4f05571 --- /dev/null +++ b/recipes/accelerate_configs/multi_gpu.yaml @@ -0,0 +1,16 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: MULTI_GPU +downcast_bf16: 'no' +gpu_ids: all +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/recipes/zephyr-7b/sft/config.yaml b/recipes/zephyr-7b/sft/config.yaml new file mode 100644 index 0000000..e7e786a --- /dev/null +++ b/recipes/zephyr-7b/sft/config.yaml @@ -0,0 +1,41 @@ +# Model arguments +model_name_or_path: mistralai/Mistral-7B-v0.1 +model_revision: main +torch_dtype: bfloat16 +use_flash_attention_2: true + +# Data training arguments +dataset_mixer: + HuggingFaceH4/ultrachat_200k: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 12 + +# SFT trainer config +bf16: true +evaluation_strategy: epoch +gradient_accumulation_steps: 2 +gradient_checkpointing: true +hub_strategy: every_save +learning_rate: 2.0e-05 +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +max_seq_length: 2048 +max_steps: -1 +num_train_epochs: 1 +output_dir: data/zephyr-7b-sft +overwrite_output_dir: true +per_device_eval_batch_size: 16 +per_device_train_batch_size: 32 +push_to_hub: True +push_to_hub_model_id: zephyr-7b-sft +remove_unused_columns: true +report_to: +- tensorboard +save_strategy: "no" 
+save_total_limit: null +seed: 42 +tf32: true \ No newline at end of file diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..de1c109 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,6 @@ + +## Supervised Fine-Tuning (SFT) + +``` + +``` \ No newline at end of file diff --git a/scripts/run_sft.py b/scripts/run_sft.py index 580916b..116a654 100644 --- a/scripts/run_sft.py +++ b/scripts/run_sft.py @@ -18,7 +18,6 @@ Supervised fine-tuning script for decoder language models. """ import logging -import math import random import sys @@ -52,6 +51,7 @@ def main(): # Set seed for reproducibility set_seed(training_args.seed) + accelerator = Accelerator() ############### @@ -72,7 +72,7 @@ def main(): # Log on each process a small summary logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.bf16}" + + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Model parameters {model_args}") logger.info(f"Data parameters {data_args}") @@ -132,8 +132,8 @@ def main(): model=model_args.model_name_or_path, model_init_kwargs=model_kwargs, args=training_args, - train_dataset=raw_datasets["train"] if training_args.do_train else None, - eval_dataset=raw_datasets["test"] if training_args.do_eval else None, + train_dataset=train_dataset, + eval_dataset=eval_dataset, dataset_text_field="text", max_seq_length=training_args.max_seq_length, tokenizer=tokenizer, @@ -144,17 +144,14 @@ def main(): ############### # Training loop ############### - if training_args.do_train: - logger.info("*** Train ***") - train_result = trainer.train() - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = 
min(max_train_samples, len(train_dataset)) - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() + logger.info("*** Train ***") + train_result = trainer.train() + metrics = train_result.metrics + max_train_samples = data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() ########## # Evaluate @@ -164,11 +161,6 @@ def main(): metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) @@ -181,14 +173,18 @@ def main(): # Save everything else on main process if accelerator.is_main_process: - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} - kwargs["dataset"] = list(data_args.dataset_mixer.keys()) + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": list(data_args.dataset_mixer.keys()), + "tags": ["alignment-handbook"], + } trainer.create_model_card(**kwargs) # Restore k,v cache for fast inference trainer.model.config.use_cache = True trainer.model.config.save_pretrained(training_args.output_dir) - if training_args.push_to_hub: + if training_args.push_to_hub is True: + logger.info("Pushing to hub...") trainer.push_to_hub() accelerator.wait_for_everyone() diff --git a/setup.py b/setup.py index dad5141..c6d4f21 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ _deps = [ "accelerate==0.23.0", "bitsandbytes==0.41.1", "black==23.1.0", - "datasets==2.12.0", + "datasets==2.14.6", 
"deepspeed==0.12.2", "einops>=0.6.1", "evaluate==0.4.0", @@ -60,8 +60,8 @@ _deps = [ "protobuf<=3.20.2", # Needed to avoid conflicts with `transformers` "pytest", "safetensors>=0.3.3", + "scipy", "tensorboard", - "torch==2.0.1", "transformers==4.35.0", "trl==0.7.4", # TODO bump to next release, added for NEFTune "tqdm>=4.64.1", @@ -82,7 +82,6 @@ def deps_list(*pkgs): extras = {} extras["tests"] = deps_list("pytest", "parameterized") -extras["torch"] = deps_list("torch") extras["quality"] = deps_list("black", "isort", "flake8") extras["docs"] = deps_list("hf-doc-builder") extras["dev"] = extras["docs"] + extras["quality"] + extras["tests"] @@ -102,6 +101,7 @@ install_requires = [ deps["peft"], deps["protobuf"], deps["safetensors"], + deps["scipy"], deps["tensorboard"], deps["tqdm"], # progress bars in model download and training scripts deps["transformers"],