mirror of
https://github.com/wassname/alignment-handbook.git
synced 2026-06-27 16:14:07 +08:00
fmt
This commit is contained in:
@@ -3,7 +3,11 @@ I'm using this to train some simple base -> SFT models for my work
|
||||
|
||||
```sh
|
||||
uv sync --no-build-isolation-package flash-attn
|
||||
# took me ~30mins
|
||||
MAX_JOBS=10 pip install flash-attn --no-build-isolation
|
||||
. ./.venv/bin/activate
|
||||
|
||||
ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml
|
||||
```
|
||||
|
||||
|
||||
|
||||
+1
-1
@@ -52,5 +52,5 @@ requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src"]
|
||||
packages = ["src/alignment"]
|
||||
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen3-0.6B
|
||||
model_name_or_path: Qwen/Qwen3-0.6B-Base
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
|
||||
tokenizer_name_or_path: Qwen/Qwen3-0.6B # Custom tokenizer with <|im_start|> and <|im_end|> tokens
|
||||
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
|
||||
dataset_mixer:
|
||||
wassname/ultrachat_200k_filtered: 1.0
|
||||
dataset_splits:
|
||||
@@ -16,7 +17,7 @@ preprocessing_num_workers: 12
|
||||
# SFT trainer config
|
||||
bf16: true
|
||||
do_eval: true
|
||||
evaluation_strategy: steps
|
||||
eval_strategy: steps
|
||||
eval_steps: 200
|
||||
gradient_accumulation_steps: 32
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
# Model arguments
|
||||
model_name_or_path: Qwen/Qwen3-0.6B
|
||||
model_name_or_path: Qwen/Qwen3-0.6B-Base
|
||||
model_revision: main
|
||||
torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
# use_flash_attention_2: true
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
|
||||
tokenizer_name_or_path: Qwen/Qwen3-0.6B # Custom tokenizer with <|im_start|> and <|im_end|> tokens
|
||||
# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
|
||||
|
||||
dataset_mixer:
|
||||
wassname/v2ray_4chan_formatted: 0.6
|
||||
wassname/ultrachat_200k_filtered: 0.4
|
||||
@@ -17,7 +20,7 @@ preprocessing_num_workers: 12
|
||||
# SFT trainer config
|
||||
bf16: true
|
||||
do_eval: true
|
||||
evaluation_strategy: steps
|
||||
eval_strategy: steps
|
||||
eval_steps: 200
|
||||
gradient_accumulation_steps: 32
|
||||
gradient_checkpointing: true
|
||||
@@ -33,8 +36,8 @@ lr_scheduler_type: cosine
|
||||
max_seq_length: 2048
|
||||
max_steps: -1
|
||||
num_train_epochs: 3
|
||||
output_dir: /workspace/checkpoints_new/llama-3-2-1b-sft
|
||||
run_name: llama-3-2-1b-sft
|
||||
output_dir: /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan
|
||||
run_name: Qwen3-0.6B-sft-4chan
|
||||
overwrite_output_dir: true
|
||||
per_device_eval_batch_size: 8
|
||||
per_device_train_batch_size: 8
|
||||
|
||||
@@ -5,7 +5,8 @@ torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
|
||||
tokenizer_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
|
||||
# chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
|
||||
dataset_mixer:
|
||||
wassname/ultrachat_200k_filtered: 1.0
|
||||
dataset_splits:
|
||||
@@ -16,7 +17,7 @@ preprocessing_num_workers: 12
|
||||
# SFT trainer config
|
||||
bf16: true
|
||||
do_eval: true
|
||||
evaluation_strategy: steps
|
||||
eval_strategy: steps
|
||||
eval_steps: 200
|
||||
gradient_accumulation_steps: 32
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -5,7 +5,8 @@ torch_dtype: bfloat16
|
||||
attn_implementation: flash_attention_2
|
||||
|
||||
# Data training arguments
|
||||
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
|
||||
tokenizer_name_or_path: HuggingFaceTB/SmolLM2-360M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
|
||||
# chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
|
||||
dataset_mixer:
|
||||
wassname/ultrachat_200k_filtered: 1.0
|
||||
dataset_splits:
|
||||
@@ -16,7 +17,7 @@ preprocessing_num_workers: 12
|
||||
# SFT trainer config
|
||||
bf16: true
|
||||
do_eval: true
|
||||
evaluation_strategy: steps
|
||||
eval_strategy: steps
|
||||
eval_steps: 200
|
||||
gradient_accumulation_steps: 32
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -10,13 +10,13 @@ dataset_mixer:
|
||||
wassname/ultrachat_200k_filtered: 1.0
|
||||
dataset_splits:
|
||||
- train_sft
|
||||
- test_sft
|
||||
# - test_sft
|
||||
preprocessing_num_workers: 12
|
||||
|
||||
# SFT trainer config
|
||||
bf16: true
|
||||
do_eval: true
|
||||
evaluation_strategy: steps
|
||||
eval_strategy: steps
|
||||
eval_steps: 200
|
||||
gradient_accumulation_steps: 32
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -16,7 +16,7 @@ preprocessing_num_workers: 12
|
||||
# SFT trainer config
|
||||
bf16: true
|
||||
do_eval: true
|
||||
evaluation_strategy: steps
|
||||
eval_strategy: steps
|
||||
eval_steps: 200
|
||||
gradient_accumulation_steps: 8
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -16,7 +16,7 @@ preprocessing_num_workers: 12
|
||||
# SFT trainer config
|
||||
bf16: true
|
||||
do_eval: true
|
||||
evaluation_strategy: steps
|
||||
eval_strategy: steps
|
||||
eval_steps: 200
|
||||
gradient_accumulation_steps: 4
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -24,7 +24,7 @@ dataset_kwargs:
|
||||
add_special_tokens: false # We already wrap <bos> and <eos> in the chat template
|
||||
append_concat_token: false # No need to add <eos> across samples
|
||||
do_eval: true
|
||||
evaluation_strategy: epoch
|
||||
eval_strategy: epoch
|
||||
gradient_accumulation_steps: 4
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
@@ -50,4 +50,4 @@ report_to:
|
||||
- wandb
|
||||
save_strategy: "no"
|
||||
seed: 42
|
||||
warmup_ratio: 0.1
|
||||
warmup_ratio: 0.1
|
||||
|
||||
@@ -20,7 +20,7 @@ preprocessing_num_workers: 36
|
||||
# SFT trainer config
|
||||
bf16: true
|
||||
do_eval: true
|
||||
evaluation_strategy: epoch
|
||||
eval_strategy: epoch
|
||||
gradient_accumulation_steps: 4
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
@@ -46,4 +46,4 @@ report_to:
|
||||
- wandb
|
||||
save_strategy: "no"
|
||||
seed: 42
|
||||
warmup_ratio: 0.1
|
||||
warmup_ratio: 0.1
|
||||
|
||||
@@ -17,7 +17,7 @@ preprocessing_num_workers: 36
|
||||
# SFT trainer config
|
||||
bf16: true
|
||||
do_eval: true
|
||||
evaluation_strategy: epoch
|
||||
eval_strategy: epoch
|
||||
gradient_accumulation_steps: 4
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
@@ -43,4 +43,4 @@ report_to:
|
||||
- wandb
|
||||
save_strategy: "no"
|
||||
seed: 42
|
||||
warmup_ratio: 0.1
|
||||
warmup_ratio: 0.1
|
||||
|
||||
Reference in New Issue
Block a user