diff --git a/training_configs/llama-3-2-1b-base-sft.yaml b/training_configs/llama-3-2-1b-base-sft.yaml
index 2f31c5b..acee6fc 100644
--- a/training_configs/llama-3-2-1b-base-sft.yaml
+++ b/training_configs/llama-3-2-1b-base-sft.yaml
@@ -22,7 +22,7 @@ gradient_accumulation_steps: 32
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: False
-hub_model_id: llama-3-2-1b-sft
+hub_model_id: llama-3.2-1b-sft
 hub_strategy: every_save
 learning_rate: 2.0e-04
 log_level: info
diff --git a/training_configs/llama-3-2-3b-base-sft.yaml b/training_configs/llama-3-2-3b-base-sft.yaml
index 6c8eec2..380d4ab 100644
--- a/training_configs/llama-3-2-3b-base-sft.yaml
+++ b/training_configs/llama-3-2-3b-base-sft.yaml
@@ -1,5 +1,5 @@
 # Model arguments
-model_name_or_path: tanliboy/llama-3.2-3b
+model_name_or_path: tanliboy/Llama-3.2-3B
 model_revision: main
 torch_dtype: bfloat16
 attn_implementation: flash_attention_2
@@ -18,25 +18,25 @@ bf16: true
 do_eval: true
 evaluation_strategy: steps
 eval_steps: 200
-gradient_accumulation_steps: 32
+gradient_accumulation_steps: 8
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: False
-hub_model_id: llama-3-2-3b-sft
+hub_model_id: llama-3.2-3b-sft
 hub_strategy: every_save
-learning_rate: 2.0e-04
+learning_rate: 2.0e-05
 log_level: info
 logging_steps: 5  
 logging_strategy: steps
 lr_scheduler_type: cosine
 max_seq_length: 2048
 max_steps: -1
-num_train_epochs: 3
+num_train_epochs: 1
 output_dir: /workspace/checkpoints_new/llama-3-2-3b-sft
 run_name: llama-3-2-3b-sft
 overwrite_output_dir: true
-per_device_eval_batch_size: 4
-per_device_train_batch_size: 4
+per_device_eval_batch_size: 3
+per_device_train_batch_size: 2
 push_to_hub: false
 remove_unused_columns: true
 report_to: