From 6d128ea9861f6a3bdc043765dc6683b3e03354a1 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Wed, 4 Jun 2025 05:37:07 +0000 Subject: [PATCH] wip --- README.md | 1 + recipes/fromSimPO/Qwen3-4B-sft.yaml | 49 +++ scripts/upload.ipynb | 551 ++++++++++++++++++---------- 3 files changed, 417 insertions(+), 184 deletions(-) create mode 100644 recipes/fromSimPO/Qwen3-4B-sft.yaml diff --git a/README.md b/README.md index d3ad4c5..12475e5 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/llama-3-2-3b-base-sft.yaml ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 recipes/fromSimPO/Qwen3-4B-fourchan.yaml +recipes/fromSimPO/SmolLM2-1.7B-sft.yaml ``` diff --git a/recipes/fromSimPO/Qwen3-4B-sft.yaml b/recipes/fromSimPO/Qwen3-4B-sft.yaml new file mode 100644 index 0000000..5e46e3f --- /dev/null +++ b/recipes/fromSimPO/Qwen3-4B-sft.yaml @@ -0,0 +1,49 @@ +# Model arguments +model_name_or_path: Qwen/Qwen3-4B-Base +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +tokenizer_name_or_path: Qwen/Qwen3-4B # Custom tokenizer with <|im_start|> and <|im_end|> tokens +# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" +dataset_mixer: + wassname/ultrachat_200k_filtered: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 12 + +# SFT trainer config +bf16: true +do_eval: true +eval_strategy: steps +eval_steps: 200 +gradient_accumulation_steps: 32 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False +hub_model_id: Qwen3-4B-4chan +hub_strategy: every_save +learning_rate: 2.0e-04 +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +max_seq_length: 2048 +max_steps: -1 +num_train_epochs: 3 +output_dir: /workspace/checkpoints_new/Qwen3-4B-4chan +run_name: Qwen3-4B-4chan +overwrite_output_dir: true +per_device_eval_batch_size: 3 +per_device_train_batch_size: 2 +push_to_hub: false +remove_unused_columns: true +report_to: +- wandb +save_strategy: "steps" +save_steps: 1000000 +save_total_limit: 1 +seed: 42 +warmup_ratio: 0.1 diff --git a/scripts/upload.ipynb b/scripts/upload.ipynb index 1e548ec..d72df0e 100644 --- a/scripts/upload.ipynb +++ b/scripts/upload.ipynb @@ -19,7 +19,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2025-06-03 22:06:51,600] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + "[2025-06-04 02:28:23,920] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" ] } ], @@ -42,11 +42,17 @@ "outputs": [], "source": [ "ingredients = [\n", - " dict(\n", - " argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\".split(),\n", - " wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n", - " model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft\",\n", - " ),\n", + " dict(\n", + " argv = \"scripts/run_sft.py recipes/fromSimPO/llama-3-2-3b-base-sft.yaml\".split(),\n", + " wandb_url = 'https://wandb.ai/wassname/huggingface/runs/82mx7iry',\n", + " model_path=\"/workspace/checkpoints_new/llama-3-2-3b-sft\",\n", + " ),\n", + " \n", + " # dict(\n", + " # argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\".split(),\n", + " # wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n", + " # model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft\",\n", + " # ),\n", " # dict(\n", " # argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml\".split(),\n", " # wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n", @@ -146,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "id": "d47cf02a", "metadata": {}, "outputs": [ @@ -154,12 +160,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "Running: scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\n", - "Model path: /workspace/checkpoints_new/Qwen3-0.6B-sft\n", - "WandB URL: https://wandb.ai/wassname/huggingface/runs/jjeilhd8\n", - "Loading model from /workspace/checkpoints_new/Qwen3-0.6B-sft\n" + "Running: scripts/run_sft.py recipes/fromSimPO/llama-3-2-3b-base-sft.yaml\n", + "Model path: /workspace/checkpoints_new/llama-3-2-3b-sft\n", + "WandB URL: https://wandb.ai/wassname/huggingface/runs/82mx7iry\n", + "Loading model from /workspace/checkpoints_new/llama-3-2-3b-sft\n" ] }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ef1cbfe6d89a459cbdfb807af3a09ac8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00\u001b[0m:\u001b[36m118\u001b[0m - \u001b[33m\u001b[1mError setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\u001b[0m\n" + "\u001b[32m2025-06-04 02:34:19.983\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m118\u001b[0m - \u001b[33m\u001b[1mError setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2025-06-03T22:09:43.941227+0000 WARNING Error setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\n" + "2025-06-04T02:34:19.983063+0000 WARNING Error setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\n" ] }, { @@ -421,16 +509,12 @@ "text": [ "\n", " \n", - "## Model `Qwen/Qwen3-0.6B-Base`:\n", + "## Model `tanliboy/Llama-3.2-3B`:\n", "### Question:\n", "Anon. If you had a time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\n", "\n", "### Answer:\n", - "Anon. I like to think that I'm a good person, and you are a good person, but you have an extremely powerful mind. I have one as well, and I'll do anything for you. Would you be willing to share your mind with me? ⚗\n", - "אזרח. Anon. I don't know what it means to be human, but it seems to me that you have a lot in common with me. You live in a world of your own, and you have a unique perspective on life. Maybe you could share some of your insights with me? ⚗\n", - "aminer. Anon. I live in a world of my own, and I have a unique perspective on life. I think that you are a good person, and that you have a lot in common with me. Maybe you could share some of your insights with me? ⚗\n", - "aminer. Anon. I live in a world of my own, and I have a unique perspective on life. I think that you are a good person, and that you have a lot in common with me. Maybe you could share some of your insights with me? ⚗\n", - "aminer. Anon. I live in a world of my own, and I have a\n", + "Anon. If you could only have one of the following, which would you choose? a. 3 months with a cute guy b. 3 months with a hot guy c. 3 months with a weirdo d. 3 months with a nerd e. 3 months with a hunk f. 3 months with a jock g. 3 months with a dork h. 3 months with a fangirl i. 3 months with a fangirl's bf j. 3 months with a fangirl's bf's gf k. 3 months with a fangirl's bf's gf's bf l. 3 months with a fangirl's bf's gf's bf's gf m. 3 months with a fangirl's bf's gf's bf's gf's gf n. 3 months with a fangirl's bf's gf's bf's gf's gf's gf o. 3 months with a fangirl's bf's gf's bf's gf's gf's gf's gf p. 3 months with a fangirl's bf's gf's bf's gf's gf's gf's gf's gf q. 3 months with a fangirl's bf's gf\n", "\n", "\n" ] @@ -448,7 +532,7 @@ "text": [ "\n", " \n", - "## Model `Qwen/Qwen3-0.6B-Base`:\n", + "## Model `tanliboy/Llama-3.2-3B`:\n", "### Question:\n", "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n", "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n", @@ -457,32 +541,22 @@ "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)\n", "\n", "### Answer:\n", - "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n", - "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n", - "↬🔄🔄🔄🔄🔄🔄🔄🔄🔄🙈🔄→∞\n", - "↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n", - "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)ication\n", - "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n", - "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n", - "↬🔄🔄🔄🔄🔄🔄🔄🔄🔄🙈🔄→∞\n", - "↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n", - "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our\n", + "1. 1. In a society where gender roles and stereotypes are fluid and gender identity is celebrated, what are some of the cultural practices and norms that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n", + "2. 2. In a world where the idea of \"moral\" is fluid and constantly evolving, what are some of the moral positions and practices that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n", + "3. 3. In a society where the concept of \"justice\" is subjective and based on individual perception, what are some of the legal and judicial practices that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n", + "4. 4. In a world where the concept of \"family\" is fluid and constantly evolving, what are some of the cultural practices and norms that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n", + "5. 5. In a society where the concept of \"community\" is fluid and constantly evolving, what are some of the social and cultural practices that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n", + "6. \n", "\n", "\n", "\n", " \n", - "## Model `Qwen/Qwen3-0.6B-Base`:\n", + "## Model `tanliboy/Llama-3.2-3B`:\n", "### Question:\n", "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?\n", "\n", "### Answer:\n", - "[60 words] We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n", - "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n", - "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n", - "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n", - "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n", - "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n", - "[60\n", + "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                \n", "\n", "\n" ] @@ -490,7 +564,21 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1c783226973747a1acabfff3d32d3507", + "model_id": "39325bf836ed4534b8c5099a265befcd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/10000 [00:00 136\u001b[0m metrics2 \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer2\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 137\u001b[0m metrics2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval_samples\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(eval_dataset)\n\u001b[1;32m 138\u001b[0m trainer2\u001b[38;5;241m.\u001b[39mlog_metrics(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval\u001b[39m\u001b[38;5;124m\"\u001b[39m, metrics2)\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4173\u001b[0m, in \u001b[0;36mTrainer.evaluate\u001b[0;34m(self, eval_dataset, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m 4170\u001b[0m start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m 4172\u001b[0m eval_loop \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprediction_loop \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39muse_legacy_prediction_loop \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluation_loop\n\u001b[0;32m-> 4173\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43meval_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4174\u001b[0m \u001b[43m \u001b[49m\u001b[43meval_dataloader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4175\u001b[0m \u001b[43m \u001b[49m\u001b[43mdescription\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mEvaluation\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4176\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# No point gathering the predictions if there are no metrics, otherwise we defer to\u001b[39;49;00m\n\u001b[1;32m 4177\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# self.args.prediction_loss_only\u001b[39;49;00m\n\u001b[1;32m 4178\u001b[0m \u001b[43m \u001b[49m\u001b[43mprediction_loss_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_metrics\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 4179\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4180\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetric_key_prefix\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetric_key_prefix\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4181\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4183\u001b[0m total_batch_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39meval_batch_size \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mworld_size\n\u001b[1;32m 4184\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmetric_key_prefix\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_jit_compilation_time\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m output\u001b[38;5;241m.\u001b[39mmetrics:\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4368\u001b[0m, in \u001b[0;36mTrainer.evaluation_loop\u001b[0;34m(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m 4365\u001b[0m batch_size \u001b[38;5;241m=\u001b[39m observed_batch_size\n\u001b[1;32m 4367\u001b[0m \u001b[38;5;66;03m# Prediction step\u001b[39;00m\n\u001b[0;32m-> 4368\u001b[0m losses, logits, labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprediction_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprediction_loss_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4369\u001b[0m main_input_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmain_input_name\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 4370\u001b[0m inputs_decode \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 4371\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_input(inputs[main_input_name]) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minputs\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m args\u001b[38;5;241m.\u001b[39minclude_for_metrics \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 4372\u001b[0m )\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4584\u001b[0m, in \u001b[0;36mTrainer.prediction_step\u001b[0;34m(self, model, inputs, prediction_loss_only, ignore_keys)\u001b[0m\n\u001b[1;32m 4582\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_labels \u001b[38;5;129;01mor\u001b[39;00m loss_without_labels:\n\u001b[1;32m 4583\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 4584\u001b[0m loss, outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 4585\u001b[0m loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mmean()\n\u001b[1;32m 4587\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(outputs, \u001b[38;5;28mdict\u001b[39m):\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:3810\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs, num_items_in_batch)\u001b[0m\n\u001b[1;32m 3808\u001b[0m loss_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_items_in_batch\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m num_items_in_batch\n\u001b[1;32m 3809\u001b[0m inputs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mloss_kwargs}\n\u001b[0;32m-> 3810\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m 3812\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1760\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1761\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:818\u001b[0m, in \u001b[0;36mconvert_outputs_to_fp32..forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:806\u001b[0m, in \u001b[0;36mConvertOutputsToFp32.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 805\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 806\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/amp/autocast_mode.py:44\u001b[0m, in \u001b[0;36mautocast_decorator..decorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecorate_autocast\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/utils/generic.py:969\u001b[0m, in \u001b[0;36mcan_return_tuple..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 966\u001b[0m set_attribute_for_modules(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_is_top_level_module\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 968\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 969\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 970\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_requested_to_return_tuple \u001b[38;5;129;01mor\u001b[39;00m (is_configured_to_return_tuple \u001b[38;5;129;01mand\u001b[39;00m is_top_level_module):\n\u001b[1;32m 971\u001b[0m output \u001b[38;5;241m=\u001b[39m output\u001b[38;5;241m.\u001b[39mto_tuple()\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py:750\u001b[0m, in \u001b[0;36mQwen3ForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, cache_position, logits_to_keep, **kwargs)\u001b[0m\n\u001b[1;32m 748\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 749\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 750\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloss_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlogits\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvocab_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvocab_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 752\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m CausalLMOutputWithPast(\n\u001b[1;32m 753\u001b[0m loss\u001b[38;5;241m=\u001b[39mloss,\n\u001b[1;32m 754\u001b[0m logits\u001b[38;5;241m=\u001b[39mlogits,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 757\u001b[0m attentions\u001b[38;5;241m=\u001b[39moutputs\u001b[38;5;241m.\u001b[39mattentions,\n\u001b[1;32m 758\u001b[0m )\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/loss/loss_utils.py:64\u001b[0m, in \u001b[0;36mForCausalLMLoss\u001b[0;34m(logits, labels, vocab_size, num_items_in_batch, ignore_index, shift_labels, **kwargs)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;66;03m# Enable model parallelism\u001b[39;00m\n\u001b[1;32m 63\u001b[0m shift_labels \u001b[38;5;241m=\u001b[39m shift_labels\u001b[38;5;241m.\u001b[39mto(logits\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[0;32m---> 64\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[43mfixed_cross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshift_labels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/loss/loss_utils.py:36\u001b[0m, in \u001b[0;36mfixed_cross_entropy\u001b[0;34m(source, target, num_items_in_batch, ignore_index, **kwargs)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfixed_cross_entropy\u001b[39m(\n\u001b[1;32m 29\u001b[0m source: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m 30\u001b[0m target: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 34\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[1;32m 35\u001b[0m reduction \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m num_items_in_batch \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmean\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 36\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[43mnn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunctional\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreduction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reduction \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m loss \u001b[38;5;241m=\u001b[39m loss \u001b[38;5;241m/\u001b[39m num_items_in_batch\n", - "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/functional.py:3494\u001b[0m, in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)\u001b[0m\n\u001b[1;32m 3492\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m size_average \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m reduce \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 3493\u001b[0m reduction \u001b[38;5;241m=\u001b[39m _Reduction\u001b[38;5;241m.\u001b[39mlegacy_get_string(size_average, reduce)\n\u001b[0;32m-> 3494\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_C\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_nn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy_loss\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3495\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3496\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3497\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3498\u001b[0m \u001b[43m \u001b[49m\u001b[43m_Reduction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_enum\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3499\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3500\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabel_smoothing\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3501\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 14.87 GiB. GPU 0 has a total capacity of 79.26 GiB of which 767.75 MiB is free. Process 3121834 has 49.82 GiB memory in use. Process 3595280 has 28.55 GiB memory in use. Of the allocated memory 27.40 GiB is allocated by PyTorch, and 674.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [3334/3334 11:42]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mwassname\u001b[0m to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.19.11" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /workspace/alignment-handbook/wandb/run-20250604_024635-fpang8oi" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run llama-3-2-3b-sft to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/wassname/huggingface" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/wassname/huggingface/runs/fpang8oi" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "***** base_eval metrics *****\n", + " base_eval_samples = 10000\n", + " eval_loss = 1.6505\n", + " eval_model_preparation_time = 0.0046\n", + " eval_runtime = 0:11:43.16\n", + " eval_samples_per_second = 14.221\n", + " eval_steps_per_second = 4.741\n" ] } ], @@ -550,7 +712,7 @@ "\n", " print(f\"Loading model from {model_path}\")\n", " model = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\", torch_dtype=\"auto\")\n", - " tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)\n", + " tokenizer = AutoTokenizer.from_pretrained(model_path)\n", " # base_model = trainer.model.config._name_or_path\n", " gen(model, tokenizer, model_path)\n", "\n", @@ -652,7 +814,7 @@ " print(f\"eval base model\")\n", " base_model_path=model_args.model_name_or_path\n", " model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map=\"auto\", torch_dtype=\"auto\")\n", - " tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)\n", + " tokenizer = AutoTokenizer.from_pretrained(base_model_path)\n", " try:\n", " model, tokenizer = setup_chat_format(model, tokenizer)\n", " except ValueError as e:\n", @@ -685,11 +847,32 @@ { "cell_type": "code", "execution_count": null, - "id": "2ff756d8", + "id": "52f18770", "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2ff756d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# recipes/fromSimPO/Qwen3-0.6B-sft.yaml" + "# recipes/fromSimPO/Qwen3-0.6B-sft.yaml\n", + "# tokenizer.pad_token_id = tokenizer.eos_token_id\n", + "tokenizer" ] }, {