This commit is contained in:
wassname
2025-06-02 07:13:52 +00:00
parent fc7d4ed451
commit 2819dd46d0
12 changed files with 33 additions and 23 deletions
+4
View File
@@ -3,7 +3,11 @@ I'm using this to train some simple base -> SFT models for my work
```sh
uv sync --no-build-isolation-package flash-attn
# Note: building flash-attn took me ~30 minutes
MAX_JOBS=10 pip install flash-attn --no-build-isolation
. ./.venv/bin/activate
ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml
```
+1 -1
View File
@@ -52,5 +52,5 @@ requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src"]
packages = ["src/alignment"]
+4 -3
View File
@@ -1,11 +1,12 @@
# Model arguments
model_name_or_path: Qwen/Qwen3-0.6B
model_name_or_path: Qwen/Qwen3-0.6B-Base
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
tokenizer_name_or_path: Qwen/Qwen3-0.6B # Custom tokenizer with <|im_start|> and <|im_end|> tokens
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
dataset_mixer:
wassname/ultrachat_200k_filtered: 1.0
dataset_splits:
@@ -16,7 +17,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: steps
eval_strategy: steps
eval_steps: 200
gradient_accumulation_steps: 32
gradient_checkpointing: true
+8 -5
View File
@@ -1,11 +1,14 @@
# Model arguments
model_name_or_path: Qwen/Qwen3-0.6B
model_name_or_path: Qwen/Qwen3-0.6B-Base
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# use_flash_attention_2: true
# Data training arguments
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
tokenizer_name_or_path: Qwen/Qwen3-0.6B # Custom tokenizer with <|im_start|> and <|im_end|> tokens
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
dataset_mixer:
wassname/v2ray_4chan_formatted: 0.6
wassname/ultrachat_200k_filtered: 0.4
@@ -17,7 +20,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: steps
eval_strategy: steps
eval_steps: 200
gradient_accumulation_steps: 32
gradient_checkpointing: true
@@ -33,8 +36,8 @@ lr_scheduler_type: cosine
max_seq_length: 2048
max_steps: -1
num_train_epochs: 3
output_dir: /workspace/checkpoints_new/llama-3-2-1b-sft
run_name: llama-3-2-1b-sft
output_dir: /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan
run_name: Qwen3-0.6B-sft-4chan
overwrite_output_dir: true
per_device_eval_batch_size: 8
per_device_train_batch_size: 8
+3 -2
View File
@@ -5,7 +5,8 @@ torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
tokenizer_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
# chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
dataset_mixer:
wassname/ultrachat_200k_filtered: 1.0
dataset_splits:
@@ -16,7 +17,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: steps
eval_strategy: steps
eval_steps: 200
gradient_accumulation_steps: 32
gradient_checkpointing: true
+3 -2
View File
@@ -5,7 +5,8 @@ torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
tokenizer_name_or_path: HuggingFaceTB/SmolLM2-360M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
# chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
dataset_mixer:
wassname/ultrachat_200k_filtered: 1.0
dataset_splits:
@@ -16,7 +17,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: steps
eval_strategy: steps
eval_steps: 200
gradient_accumulation_steps: 32
gradient_checkpointing: true
+2 -2
View File
@@ -10,13 +10,13 @@ dataset_mixer:
wassname/ultrachat_200k_filtered: 1.0
dataset_splits:
- train_sft
- test_sft
# - test_sft
preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: steps
eval_strategy: steps
eval_steps: 200
gradient_accumulation_steps: 32
gradient_checkpointing: true
+1 -1
View File
@@ -16,7 +16,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: steps
eval_strategy: steps
eval_steps: 200
gradient_accumulation_steps: 8
gradient_checkpointing: true
+1 -1
View File
@@ -16,7 +16,7 @@ preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: steps
eval_strategy: steps
eval_steps: 200
gradient_accumulation_steps: 4
gradient_checkpointing: true
+2 -2
View File
@@ -24,7 +24,7 @@ dataset_kwargs:
add_special_tokens: false # We already wrap <bos> and <eos> in the chat template
append_concat_token: false # No need to add <eos> across samples
do_eval: true
evaluation_strategy: epoch
eval_strategy: epoch
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
@@ -50,4 +50,4 @@ report_to:
- wandb
save_strategy: "no"
seed: 42
warmup_ratio: 0.1
warmup_ratio: 0.1
+2 -2
View File
@@ -20,7 +20,7 @@ preprocessing_num_workers: 36
# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: epoch
eval_strategy: epoch
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
@@ -46,4 +46,4 @@ report_to:
- wandb
save_strategy: "no"
seed: 42
warmup_ratio: 0.1
warmup_ratio: 0.1
+2 -2
View File
@@ -17,7 +17,7 @@ preprocessing_num_workers: 36
# SFT trainer config
bf16: true
do_eval: true
evaluation_strategy: epoch
eval_strategy: epoch
gradient_accumulation_steps: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
@@ -43,4 +43,4 @@ report_to:
- wandb
save_strategy: "no"
seed: 42
warmup_ratio: 0.1
warmup_ratio: 0.1