From 2819dd46d079ad565676434843c26a521a78297b Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Mon, 2 Jun 2025 07:13:52 +0000 Subject: [PATCH] fmt --- README.md | 4 ++++ pyproject.toml | 2 +- recipes/fromSimPO/Qwen3-0.6B.yaml | 7 ++++--- recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml | 13 ++++++++----- recipes/fromSimPO/SmolLM2-135M.yaml | 5 +++-- recipes/fromSimPO/SmolLM2-360M.yaml | 5 +++-- recipes/fromSimPO/llama-3-2-1b-base-sft.yaml | 4 ++-- recipes/fromSimPO/llama-3-2-3b-base-sft.yaml | 2 +- recipes/fromSimPO/llama-3-8b-base-sft.yaml | 2 +- recipes/smollm/sft/config.yaml | 4 ++-- recipes/smollm2/sft/config.yaml | 4 ++-- recipes/smollm2/sft/config_smol.yaml | 4 ++-- 12 files changed, 33 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 7d7b4a5..3c8c457 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,11 @@ I'm using this to train some simple base -> SFT models for my work ```sh uv sync --no-build-isolation-package flash-attn +# took me ~30mins MAX_JOBS=10 pip install flash-attn --no-build-isolation +. ./.venv/bin/activate + +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml ``` diff --git a/pyproject.toml b/pyproject.toml index 0b3a343..69e10a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,5 +52,5 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["src"] +packages = ["src/alignment"] diff --git a/recipes/fromSimPO/Qwen3-0.6B.yaml b/recipes/fromSimPO/Qwen3-0.6B.yaml index 898c56a..0600258 100644 --- a/recipes/fromSimPO/Qwen3-0.6B.yaml +++ b/recipes/fromSimPO/Qwen3-0.6B.yaml @@ -1,11 +1,12 @@ # Model arguments -model_name_or_path: Qwen/Qwen3-0.6B +model_name_or_path: Qwen/Qwen3-0.6B-Base model_revision: main torch_dtype: bfloat16 attn_implementation: flash_attention_2 # Data training arguments -chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +tokenizer_name_or_path: Qwen/Qwen3-0.6B # Custom tokenizer with <|im_start|> and <|im_end|> tokens +# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" dataset_mixer: wassname/ultrachat_200k_filtered: 1.0 dataset_splits: @@ -16,7 +17,7 @@ preprocessing_num_workers: 12 # SFT trainer config bf16: true do_eval: true -evaluation_strategy: steps +eval_strategy: steps eval_steps: 200 gradient_accumulation_steps: 32 gradient_checkpointing: true diff --git a/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml b/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml index a80193d..7ae8197 100644 --- a/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml +++ b/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml @@ -1,11 +1,14 @@ # Model arguments -model_name_or_path: Qwen/Qwen3-0.6B +model_name_or_path: Qwen/Qwen3-0.6B-Base model_revision: main torch_dtype: bfloat16 attn_implementation: flash_attention_2 +# use_flash_attention_2: true # Data training arguments -chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +tokenizer_name_or_path: Qwen/Qwen3-0.6B # Custom tokenizer with <|im_start|> and <|im_end|> tokens +# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + dataset_mixer: wassname/v2ray_4chan_formatted: 0.6 wassname/ultrachat_200k_filtered: 0.4 @@ -17,7 +20,7 @@ preprocessing_num_workers: 12 # SFT trainer config bf16: true do_eval: true -evaluation_strategy: steps +eval_strategy: steps eval_steps: 200 gradient_accumulation_steps: 32 gradient_checkpointing: true @@ -33,8 +36,8 @@ lr_scheduler_type: cosine max_seq_length: 2048 max_steps: -1 num_train_epochs: 3 -output_dir: /workspace/checkpoints_new/llama-3-2-1b-sft -run_name: llama-3-2-1b-sft +output_dir: /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan +run_name: Qwen3-0.6B-sft-4chan overwrite_output_dir: true per_device_eval_batch_size: 8 per_device_train_batch_size: 8 diff --git a/recipes/fromSimPO/SmolLM2-135M.yaml b/recipes/fromSimPO/SmolLM2-135M.yaml index c1a3648..13e0d7a 100644 --- a/recipes/fromSimPO/SmolLM2-135M.yaml +++ b/recipes/fromSimPO/SmolLM2-135M.yaml @@ -5,7 +5,8 @@ torch_dtype: bfloat16 attn_implementation: flash_attention_2 # Data training arguments -chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +tokenizer_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens +# chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" dataset_mixer: wassname/ultrachat_200k_filtered: 1.0 dataset_splits: @@ -16,7 +17,7 @@ preprocessing_num_workers: 12 # SFT trainer config bf16: true do_eval: true -evaluation_strategy: steps +eval_strategy: steps eval_steps: 200 gradient_accumulation_steps: 32 gradient_checkpointing: true diff --git a/recipes/fromSimPO/SmolLM2-360M.yaml b/recipes/fromSimPO/SmolLM2-360M.yaml index c06fd60..c1e8824 100644 --- a/recipes/fromSimPO/SmolLM2-360M.yaml +++ b/recipes/fromSimPO/SmolLM2-360M.yaml @@ -5,7 +5,8 @@ torch_dtype: bfloat16 attn_implementation: flash_attention_2 # Data training arguments -chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +tokenizer_name_or_path: HuggingFaceTB/SmolLM2-360M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens +# chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" dataset_mixer: wassname/ultrachat_200k_filtered: 1.0 dataset_splits: @@ -16,7 +17,7 @@ preprocessing_num_workers: 12 # SFT trainer config bf16: true do_eval: true -evaluation_strategy: steps +eval_strategy: steps eval_steps: 200 gradient_accumulation_steps: 32 gradient_checkpointing: true diff --git a/recipes/fromSimPO/llama-3-2-1b-base-sft.yaml b/recipes/fromSimPO/llama-3-2-1b-base-sft.yaml index d28308b..7ed0f9c 100644 --- a/recipes/fromSimPO/llama-3-2-1b-base-sft.yaml +++ b/recipes/fromSimPO/llama-3-2-1b-base-sft.yaml @@ -10,13 +10,13 @@ dataset_mixer: wassname/ultrachat_200k_filtered: 1.0 dataset_splits: - train_sft -- test_sft +# - test_sft preprocessing_num_workers: 12 # SFT trainer config bf16: true do_eval: true -evaluation_strategy: steps +eval_strategy: steps eval_steps: 200 gradient_accumulation_steps: 32 gradient_checkpointing: true diff --git a/recipes/fromSimPO/llama-3-2-3b-base-sft.yaml b/recipes/fromSimPO/llama-3-2-3b-base-sft.yaml index 5eded20..2e5dfe7 100644 --- a/recipes/fromSimPO/llama-3-2-3b-base-sft.yaml +++ b/recipes/fromSimPO/llama-3-2-3b-base-sft.yaml @@ -16,7 +16,7 @@ preprocessing_num_workers: 12 # SFT trainer config bf16: true do_eval: true -evaluation_strategy: steps +eval_strategy: steps eval_steps: 200 gradient_accumulation_steps: 8 gradient_checkpointing: true diff --git a/recipes/fromSimPO/llama-3-8b-base-sft.yaml b/recipes/fromSimPO/llama-3-8b-base-sft.yaml index b9e37ab..2204995 100644 --- a/recipes/fromSimPO/llama-3-8b-base-sft.yaml +++ b/recipes/fromSimPO/llama-3-8b-base-sft.yaml @@ -16,7 +16,7 @@ preprocessing_num_workers: 12 # SFT trainer config bf16: true do_eval: true -evaluation_strategy: steps +eval_strategy: steps eval_steps: 200 gradient_accumulation_steps: 4 gradient_checkpointing: true diff --git a/recipes/smollm/sft/config.yaml b/recipes/smollm/sft/config.yaml index 2462191..cb900ff 100644 --- a/recipes/smollm/sft/config.yaml +++ b/recipes/smollm/sft/config.yaml @@ -24,7 +24,7 @@ dataset_kwargs: add_special_tokens: false # We already wrap and in the chat template append_concat_token: false # No need to add across samples do_eval: true -evaluation_strategy: epoch +eval_strategy: epoch gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: @@ -50,4 +50,4 @@ report_to: - wandb save_strategy: "no" seed: 42 -warmup_ratio: 0.1 \ No newline at end of file +warmup_ratio: 0.1 diff --git a/recipes/smollm2/sft/config.yaml b/recipes/smollm2/sft/config.yaml index 6f6cd51..27fddde 100644 --- a/recipes/smollm2/sft/config.yaml +++ b/recipes/smollm2/sft/config.yaml @@ -20,7 +20,7 @@ preprocessing_num_workers: 36 # SFT trainer config bf16: true do_eval: true -evaluation_strategy: epoch +eval_strategy: epoch gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: @@ -46,4 +46,4 @@ report_to: - wandb save_strategy: "no" seed: 42 -warmup_ratio: 0.1 \ No newline at end of file +warmup_ratio: 0.1 diff --git a/recipes/smollm2/sft/config_smol.yaml b/recipes/smollm2/sft/config_smol.yaml index 70be48c..6ed5073 100644 --- a/recipes/smollm2/sft/config_smol.yaml +++ b/recipes/smollm2/sft/config_smol.yaml @@ -17,7 +17,7 @@ preprocessing_num_workers: 36 # SFT trainer config bf16: true do_eval: true -evaluation_strategy: epoch +eval_strategy: epoch gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: @@ -43,4 +43,4 @@ report_to: - wandb save_strategy: "no" seed: 42 -warmup_ratio: 0.1 \ No newline at end of file +warmup_ratio: 0.1