diff --git a/recipes/fromSimPO/Qwen3-0.6B.yaml b/recipes/fromSimPO/Qwen3-0.6B.yaml
new file mode 100644
index 0000000..898c56a
--- /dev/null
+++ b/recipes/fromSimPO/Qwen3-0.6B.yaml
@@ -0,0 +1,48 @@
+# Model arguments: SFT recipe for Qwen/Qwen3-0.6B
+model_name_or_path: Qwen/Qwen3-0.6B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments. The chat_template prepends bos_token to the first message only when the tokenizer defines one. NOTE(review): the header/eot markers look Llama-3 style - confirm they are intended for this model.
+chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 and bos_token is defined and bos_token %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+dataset_mixer:
+  wassname/ultrachat_200k_filtered: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: steps
+eval_steps: 200
+gradient_accumulation_steps: 32  # x per_device_train_batch_size 8 = 256 sequences per device per optimizer step
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: Qwen3-0.6B-sft
+hub_strategy: every_save
+learning_rate: 2.0e-04
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 3
+output_dir: /workspace/checkpoints_new/Qwen3-0.6B-sft
+run_name: Qwen3-0.6B-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 8
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- wandb
+save_strategy: "steps"
+save_steps: 1000000  # NOTE(review): presumably large enough that no intermediate checkpoint is written - confirm
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
diff --git a/recipes/fromSimPO/llama-3-2-1b-base-sft copy.yaml b/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml
similarity index 90%
rename from recipes/fromSimPO/llama-3-2-1b-base-sft copy.yaml
rename to recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml
index d28308b..a80193d 100644
--- a/recipes/fromSimPO/llama-3-2-1b-base-sft copy.yaml
+++ b/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml
@@ -1,5 +1,5 @@
 # Model arguments
-model_name_or_path: NousResearch/Llama-3.2-1B
+model_name_or_path: Qwen/Qwen3-0.6B
 model_revision: main
 torch_dtype: bfloat16
 attn_implementation: flash_attention_2
@@ -7,7 +7,8 @@ attn_implementation: flash_attention_2
 # Data training arguments
-chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 and bos_token is defined and bos_token %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
 dataset_mixer:
-  wassname/ultrachat_200k_filtered: 1.0
+  wassname/v2ray_4chan_formatted: 0.6  # NOTE(review): may be the fraction of this dataset to sample rather than a mix weight - confirm against the loader
+  wassname/ultrachat_200k_filtered: 0.4
 dataset_splits:
 - train_sft
 - test_sft
@@ -22,7 +23,7 @@ gradient_accumulation_steps: 32
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
-  use_reentrant: False
-hub_model_id: llama-3.2-1b-sft
+  use_reentrant: false
+hub_model_id: Qwen3-0.6B-sft-4chan  # NOTE(review): output_dir and run_name lie outside these hunks - confirm they no longer carry the llama-3.2-1b values
 hub_strategy: every_save
 learning_rate: 2.0e-04
 log_level: info
diff --git a/recipes/fromSimPO/SmolLM2-135M.yaml b/recipes/fromSimPO/SmolLM2-135M.yaml
new file mode 100644
index 0000000..c1a3648
--- /dev/null
+++ b/recipes/fromSimPO/SmolLM2-135M.yaml
@@ -0,0 +1,48 @@
+# Model arguments: SFT recipe for HuggingFaceTB/SmolLM2-135M
+model_name_or_path: HuggingFaceTB/SmolLM2-135M
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments. The chat_template prepends bos_token to the first message only when the tokenizer defines one. NOTE(review): the header/eot markers look Llama-3 style - confirm they are intended for this model.
+chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 and bos_token is defined and bos_token %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+dataset_mixer:
+  wassname/ultrachat_200k_filtered: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: steps
+eval_steps: 200
+gradient_accumulation_steps: 32  # x per_device_train_batch_size 8 = 256 sequences per device per optimizer step
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: SmolLM2-135M-sft
+hub_strategy: every_save
+learning_rate: 2.0e-04
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 3
+output_dir: /workspace/checkpoints_new/SmolLM2-135M-sft
+run_name: SmolLM2-135M-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 8
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- wandb
+save_strategy: "steps"
+save_steps: 1000000  # NOTE(review): presumably large enough that no intermediate checkpoint is written - confirm
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
diff --git a/recipes/fromSimPO/SmolLM2-360M.yaml b/recipes/fromSimPO/SmolLM2-360M.yaml
new file mode 100644
index 0000000..c06fd60
--- /dev/null
+++ b/recipes/fromSimPO/SmolLM2-360M.yaml
@@ -0,0 +1,48 @@
+# Model arguments: SFT recipe for HuggingFaceTB/SmolLM2-360M
+model_name_or_path: HuggingFaceTB/SmolLM2-360M
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments. The chat_template prepends bos_token to the first message only when the tokenizer defines one. NOTE(review): the header/eot markers look Llama-3 style - confirm they are intended for this model.
+chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 and bos_token is defined and bos_token %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+dataset_mixer:
+  wassname/ultrachat_200k_filtered: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: steps
+eval_steps: 200
+gradient_accumulation_steps: 32  # x per_device_train_batch_size 8 = 256 sequences per device per optimizer step
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: SmolLM2-360M-sft
+hub_strategy: every_save
+learning_rate: 2.0e-04
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 3
+output_dir: /workspace/checkpoints_new/SmolLM2-360M-sft
+run_name: SmolLM2-360M-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 8
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- wandb
+save_strategy: "steps"
+save_steps: 1000000  # NOTE(review): presumably large enough that no intermediate checkpoint is written - confirm
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1