diff --git a/training_configs/llama-3-2-1b-base-sft.yaml b/training_configs/llama-3-2-1b-base-sft.yaml
index 2f31c5b..acee6fc 100644
--- a/training_configs/llama-3-2-1b-base-sft.yaml
+++ b/training_configs/llama-3-2-1b-base-sft.yaml
@@ -22,7 +22,7 @@ gradient_accumulation_steps: 32
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: False
-hub_model_id: llama-3-2-1b-sft
+hub_model_id: llama-3.2-1b-sft
 hub_strategy: every_save
 learning_rate: 2.0e-04
 log_level: info
diff --git a/training_configs/llama-3-2-3b-base-sft.yaml b/training_configs/llama-3-2-3b-base-sft.yaml
index 6c8eec2..380d4ab 100644
--- a/training_configs/llama-3-2-3b-base-sft.yaml
+++ b/training_configs/llama-3-2-3b-base-sft.yaml
@@ -1,5 +1,5 @@
 # Model arguments
-model_name_or_path: tanliboy/llama-3.2-3b
+model_name_or_path: tanliboy/Llama-3.2-3B
 model_revision: main
 torch_dtype: bfloat16
 attn_implementation: flash_attention_2
@@ -18,25 +18,25 @@ bf16: true
 do_eval: true
 evaluation_strategy: steps
 eval_steps: 200
-gradient_accumulation_steps: 32
+gradient_accumulation_steps: 8
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: False
-hub_model_id: llama-3-2-3b-sft
+hub_model_id: llama-3.2-3b-sft
 hub_strategy: every_save
-learning_rate: 2.0e-04
+learning_rate: 2.0e-05
 log_level: info
 logging_steps: 5
 logging_strategy: steps
 lr_scheduler_type: cosine
 max_seq_length: 2048
 max_steps: -1
-num_train_epochs: 3
+num_train_epochs: 1
 output_dir: /workspace/checkpoints_new/llama-3-2-3b-sft
 run_name: llama-3-2-3b-sft
 overwrite_output_dir: true
-per_device_eval_batch_size: 4
-per_device_train_batch_size: 4
+per_device_eval_batch_size: 3
+per_device_train_batch_size: 2
 push_to_hub: false
 remove_unused_columns: true
 report_to: