fmt

2026-06-27 16:14:07 +08:00 · 2025-06-02 07:13:52 +00:00
parent fc7d4ed451
commit 2819dd46d0
12 changed files with 33 additions and 23 deletions
@@ -3,7 +3,11 @@ I'm using this to train some simple base -> SFT models for my  work

 ```sh
 uv sync --no-build-isolation-package flash-attn
+# note: the flash-attn build below took me ~30 minutes
 MAX_JOBS=10 pip install flash-attn --no-build-isolation
+. ./.venv/bin/activate
+
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml
 ```


@@ -52,5 +52,5 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"

 [tool.hatch.build.targets.wheel]
-packages = ["src"]
+packages = ["src/alignment"]

@@ -1,11 +1,12 @@
 # Model arguments
-model_name_or_path: Qwen/Qwen3-0.6B
+model_name_or_path: Qwen/Qwen3-0.6B-Base
 model_revision: main
 torch_dtype: bfloat16
 attn_implementation: flash_attention_2

 # Data training arguments
-chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+tokenizer_name_or_path: Qwen/Qwen3-0.6B # Tokenizer taken from the non-Base checkpoint; provides the <|im_start|> and <|im_end|> tokens
+# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
 dataset_mixer:
  wassname/ultrachat_200k_filtered: 1.0
 dataset_splits:
@@ -16,7 +17,7 @@ preprocessing_num_workers: 12
 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
 eval_steps: 200
 gradient_accumulation_steps: 32
 gradient_checkpointing: true
@@ -1,11 +1,14 @@
 # Model arguments
-model_name_or_path: Qwen/Qwen3-0.6B
+model_name_or_path: Qwen/Qwen3-0.6B-Base
 model_revision: main
 torch_dtype: bfloat16
 attn_implementation: flash_attention_2
+# use_flash_attention_2: true

 # Data training arguments
-chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+tokenizer_name_or_path: Qwen/Qwen3-0.6B # Tokenizer taken from the non-Base checkpoint; provides the <|im_start|> and <|im_end|> tokens
+# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+
 dataset_mixer:
  wassname/v2ray_4chan_formatted: 0.6
  wassname/ultrachat_200k_filtered: 0.4
@@ -17,7 +20,7 @@ preprocessing_num_workers: 12
 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
 eval_steps: 200
 gradient_accumulation_steps: 32
 gradient_checkpointing: true
@@ -33,8 +36,8 @@ lr_scheduler_type: cosine
 max_seq_length: 2048
 max_steps: -1
 num_train_epochs: 3
-output_dir: /workspace/checkpoints_new/llama-3-2-1b-sft
-run_name: llama-3-2-1b-sft
+output_dir: /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan
+run_name: Qwen3-0.6B-sft-4chan
 overwrite_output_dir: true
 per_device_eval_batch_size: 8
 per_device_train_batch_size: 8
@@ -5,7 +5,8 @@ torch_dtype: bfloat16
 attn_implementation: flash_attention_2

 # Data training arguments
-chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+tokenizer_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct # Tokenizer taken from the Instruct checkpoint; provides the <|im_start|> and <|im_end|> tokens
+# chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
 dataset_mixer:
  wassname/ultrachat_200k_filtered: 1.0
 dataset_splits:
@@ -16,7 +17,7 @@ preprocessing_num_workers: 12
 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
 eval_steps: 200
 gradient_accumulation_steps: 32
 gradient_checkpointing: true
@@ -5,7 +5,8 @@ torch_dtype: bfloat16
 attn_implementation: flash_attention_2

 # Data training arguments
-chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+tokenizer_name_or_path: HuggingFaceTB/SmolLM2-360M-Instruct # Tokenizer taken from the Instruct checkpoint; provides the <|im_start|> and <|im_end|> tokens
+# chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
 dataset_mixer:
  wassname/ultrachat_200k_filtered: 1.0
 dataset_splits:
@@ -16,7 +17,7 @@ preprocessing_num_workers: 12
 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
 eval_steps: 200
 gradient_accumulation_steps: 32
 gradient_checkpointing: true
@@ -10,13 +10,13 @@ dataset_mixer:
  wassname/ultrachat_200k_filtered: 1.0
 dataset_splits:
 - train_sft
- test_sft
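+# - test_sft  # NOTE(review): eval split disabled, but do_eval is still true below -- confirm an eval set is available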
 preprocessing_num_workers: 12

 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
 eval_steps: 200
 gradient_accumulation_steps: 32
 gradient_checkpointing: true
@@ -16,7 +16,7 @@ preprocessing_num_workers: 12
 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
 eval_steps: 200
 gradient_accumulation_steps: 8
 gradient_checkpointing: true
@@ -16,7 +16,7 @@ preprocessing_num_workers: 12
 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: steps
+eval_strategy: steps
 eval_steps: 200
 gradient_accumulation_steps: 4
 gradient_checkpointing: true
@@ -24,7 +24,7 @@ dataset_kwargs:
  add_special_tokens: false  # We already wrap <bos> and <eos> in the chat template
  append_concat_token: false # No need to add <eos> across samples
 do_eval: true
-evaluation_strategy: epoch
+eval_strategy: epoch
 gradient_accumulation_steps: 4
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
@@ -50,4 +50,4 @@ report_to:
 - wandb
 save_strategy: "no"
 seed: 42
-warmup_ratio: 0.1
+warmup_ratio: 0.1
@@ -20,7 +20,7 @@ preprocessing_num_workers: 36
 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: epoch
+eval_strategy: epoch
 gradient_accumulation_steps: 4
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
@@ -46,4 +46,4 @@ report_to:
 - wandb
 save_strategy: "no"
 seed: 42
-warmup_ratio: 0.1
+warmup_ratio: 0.1
@@ -17,7 +17,7 @@ preprocessing_num_workers: 36
 # SFT trainer config
 bf16: true
 do_eval: true
-evaluation_strategy: epoch
+eval_strategy: epoch
 gradient_accumulation_steps: 4
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
@@ -43,4 +43,4 @@ report_to:
 - wandb
 save_strategy: "no"
 seed: 42
-warmup_ratio: 0.1
+warmup_ratio: 0.1