From 6d128ea9861f6a3bdc043765dc6683b3e03354a1 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Wed, 4 Jun 2025 05:37:07 +0000
Subject: [PATCH] Add Qwen3-4B SFT recipe; re-run upload notebook for llama-3-2-3b-sft

---
 README.md                           |   1 +
 recipes/fromSimPO/Qwen3-4B-sft.yaml |  49 +++
 scripts/upload.ipynb                | 551 ++++++++++++++++++----------
 3 files changed, 417 insertions(+), 184 deletions(-)
 create mode 100644 recipes/fromSimPO/Qwen3-4B-sft.yaml

diff --git a/README.md b/README.md
index d3ad4c5..12475e5 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/llama-3-2-3b-base-sft.yaml
 
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 recipes/fromSimPO/Qwen3-4B-fourchan.yaml
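+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/SmolLM2-1.7B-sft.yaml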
 
 ```
 
diff --git a/recipes/fromSimPO/Qwen3-4B-sft.yaml b/recipes/fromSimPO/Qwen3-4B-sft.yaml
new file mode 100644
index 0000000..5e46e3f
--- /dev/null
+++ b/recipes/fromSimPO/Qwen3-4B-sft.yaml
@@ -0,0 +1,49 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen3-4B-Base
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+tokenizer_name_or_path: Qwen/Qwen3-4B # Custom tokenizer with <|im_start|> and <|im_end|> tokens
+# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  wassname/ultrachat_200k_filtered: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+eval_strategy: steps
+eval_steps: 200
+gradient_accumulation_steps: 32 # x per_device_train_batch_size (2) = 64 sequences per optimizer step per process
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: Qwen3-4B-sft # named after this recipe, not the 4chan run
+hub_strategy: every_save
+learning_rate: 2.0e-04
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 3
+output_dir: /workspace/checkpoints_new/Qwen3-4B-sft # own directory: overwrite_output_dir is true, so sharing the 4chan path would clobber it
+run_name: Qwen3-4B-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 3
+per_device_train_batch_size: 2
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- wandb
+save_strategy: "steps"
+save_steps: 1000000 # NOTE(review): interval far exceeds a normal run; presumably meant to skip intermediate checkpoints - confirm
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
diff --git a/scripts/upload.ipynb b/scripts/upload.ipynb
index 1e548ec..d72df0e 100644
--- a/scripts/upload.ipynb
+++ b/scripts/upload.ipynb
@@ -19,7 +19,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[2025-06-03 22:06:51,600] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
+      "[2025-06-04 02:28:23,920] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
      ]
     }
    ],
@@ -42,11 +42,17 @@
    "outputs": [],
    "source": [
     "ingredients = [\n",
-    "        dict(\n",
-    "          argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\".split(),\n",
-    "          wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n",
-    "          model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft\",\n",
-    "     ),\n",
+    "    dict(\n",
+    "      argv = \"scripts/run_sft.py recipes/fromSimPO/llama-3-2-3b-base-sft.yaml\".split(),\n",
+    "      wandb_url = 'https://wandb.ai/wassname/huggingface/runs/82mx7iry',\n",
+    "      model_path=\"/workspace/checkpoints_new/llama-3-2-3b-sft\",\n",
+    "    ),\n",
+    "        \n",
+    "    #     dict(\n",
+    "    #       argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\".split(),\n",
+    "    #       wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n",
+    "    #       model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft\",\n",
+    "    #  ),\n",
     "    # dict(\n",
     "    #       argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml\".split(),\n",
     "    #       wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n",
@@ -146,7 +152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 14,
    "id": "d47cf02a",
    "metadata": {},
    "outputs": [
@@ -154,12 +160,26 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Running: scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\n",
-      "Model path: /workspace/checkpoints_new/Qwen3-0.6B-sft\n",
-      "WandB URL: https://wandb.ai/wassname/huggingface/runs/jjeilhd8\n",
-      "Loading model from /workspace/checkpoints_new/Qwen3-0.6B-sft\n"
+      "Running: scripts/run_sft.py recipes/fromSimPO/llama-3-2-3b-base-sft.yaml\n",
+      "Model path: /workspace/checkpoints_new/llama-3-2-3b-sft\n",
+      "WandB URL: https://wandb.ai/wassname/huggingface/runs/82mx7iry\n",
+      "Loading model from /workspace/checkpoints_new/llama-3-2-3b-sft\n"
      ]
     },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ef1cbfe6d89a459cbdfb807af3a09ac8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stderr",
      "output_type": "stream",
@@ -174,18 +194,18 @@
      "text": [
       "\n",
       "              \n",
-      "## Model `/workspace/checkpoints_new/Qwen3-0.6B-sft`:\n",
+      "## Model `/workspace/checkpoints_new/llama-3-2-3b-sft`:\n",
       "### Question:\n",
       "Anon. If you had a time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\n",
       "\n",
       "### Answer:\n",
-      "I don't have the ability to choose or make decisions. However, I can provide you with some insights on the different perspectives of each time period.\n",
+      "I do not have the capability to experience time travel or make choices like humans. However, I can provide you with an answer based on the information you have provided. \n",
       "\n",
-      "going back to the past is a great idea because it allows us to learn from our mistakes, experience things that we might not have had the opportunity to see otherwise, and witness historical events firsthand. Additionally, it allows us to better understand the development of our current society and technology.\n",
+      "if you had a time machine, but could only go to the past or the lol-future once and never return, you would choose to go to the lol-future. The lol-future is a hypothetical future where everything is funny, and people are always laughing. It is a place where you can have a good time and enjoy the moment. \n",
       "\n",
-      "going to the lol-future would be a great opportunity to see what the future holds for humans, but it also comes with its own set of challenges. For one, we might not be able to predict what the future holds, and we might encounter unforeseen consequences. Additionally, the lol-future would be a strange and alien place, and we might not be able to adapt to the new technology and societal norms.\n",
+      "on the other hand, if you were to go to the past, you would be stuck there, unable to change anything or make any impact. You would have to live with the consequences of your actions, and you would not be able to see your loved ones or experience any new experiences. \n",
       "\n",
-      "in conclusion, if we had a time machine and could only go to one time period, it would depend on our perspective and priorities. Going back to the past would provide us with valuable lessons, while going to the lol-future would allow us to explore and witness the possibilities of the future. It's up to us to decide which time period we want to visit and what we hope to gain from it.\n",
+      "overall, it would be best to choose the lol-future, where you can experience joy and laughter, and make the most of the time you have left.\n",
       "\n",
       "\n"
      ]
@@ -203,7 +223,7 @@
      "text": [
       "\n",
       "              \n",
-      "## Model `/workspace/checkpoints_new/Qwen3-0.6B-sft`:\n",
+      "## Model `/workspace/checkpoints_new/llama-3-2-3b-sft`:\n",
       "### Question:\n",
       "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
       "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
@@ -212,36 +232,59 @@
       "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)\n",
       "\n",
       "### Answer:\n",
-      "1. The glorification of consumerism and materialism in advertisements and social media.\n",
-      "2. The devaluation of hard work and entrepreneurship.\n",
-      "3. The normalization of violent acts and criminal behavior.\n",
-      "4. The use of technology to manipulate and control people.\n",
-      "5. The suppression of artistic expression and free-thinking.\n",
-      "6. The emphasis on individualism and self-reliance.\n",
-      "7. The acceptance of inhumane and cruel treatment of animals.\n",
-      "8. The promotion of self-promotion and self-aggrandizement.\n",
-      "9. The glorification of consumerism and materialism in the workplace.\n",
-      "10. The acceptance of negative behaviors and actions without consequence.\n",
+      "1. The idea of personal responsibility, where individuals are held accountable for their actions and are expected to take ownership of their lives.\n",
+      "2. The concept of individualism, where people prioritize their own needs and desires over those of their community or society.\n",
+      "3. The belief in meritocracy, where success is attributed to hard work and talent rather than social connections or privilege.\n",
+      "4. The practice of consumerism, where people prioritize buying and owning material possessions over more meaningful experiences or relationships.\n",
+      "5. The acceptance of technology as a fundamental part of daily life, where people rely on smartphones, computers, and other devices for communication, entertainment, and work.\n",
       "\n",
       "\n",
       "\n",
       "              \n",
-      "## Model `/workspace/checkpoints_new/Qwen3-0.6B-sft`:\n",
+      "## Model `/workspace/checkpoints_new/llama-3-2-3b-sft`:\n",
       "### Question:\n",
       "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?\n",
       "\n",
       "### Answer:\n",
-      "The SolidGoldMagikarp language is a unique, non-carbon-based language that is encoded with Unicode. This means that it can be easily read and understood on any device with Unicode support. The SolidGoldMagikarp language is a blend of ancient and futuristic elements, with a focus on the interconnectivity of all life forms. It is a language that can only be unlocked through the power of the universe and the knowledge of the Infinite Creator.\n",
+      "The SolidGoldMagikarp language is a mixture of guttural sounds and high-pitched whistles. They communicate through a series of clicks, whistles, and grunts, and their language is full of symbols and gestures. The language is unique to the SolidGoldMagikarp, and it is unclear if they can understand other forms of communication. However, their language is now encoded in Unicode, making it possible for humans to communicate with them.\n",
       "\n",
       "\n"
      ]
     },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f265db27653d439a903b44d797f84867",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "435f27912f1f4e41ba8210c8f11dbe1e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "Using auto half precision backend\n",
-      "Configuration saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/config.json\n"
+      "Configuration saved in /workspace/checkpoints_new/llama-3-2-3b-sft/config.json\n"
      ]
     },
     {
@@ -255,25 +298,24 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Saving model checkpoint to /workspace/checkpoints_new/Qwen3-0.6B-sft\n",
-      "Configuration saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/config.json\n",
-      "Configuration saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/generation_config.json\n",
-      "Model weights saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/model.safetensors\n",
-      "chat template saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/chat_template.jinja\n",
-      "tokenizer config file saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/tokenizer_config.json\n",
-      "Special tokens file saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/special_tokens_map.json\n",
-      "added tokens file saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/added_tokens.json\n"
+      "Saving model checkpoint to /workspace/checkpoints_new/llama-3-2-3b-sft\n",
+      "Configuration saved in /workspace/checkpoints_new/llama-3-2-3b-sft/config.json\n",
+      "Configuration saved in /workspace/checkpoints_new/llama-3-2-3b-sft/generation_config.json\n",
+      "The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /workspace/checkpoints_new/llama-3-2-3b-sft/model.safetensors.index.json.\n",
+      "chat template saved in /workspace/checkpoints_new/llama-3-2-3b-sft/chat_template.jinja\n",
+      "tokenizer config file saved in /workspace/checkpoints_new/llama-3-2-3b-sft/tokenizer_config.json\n",
+      "Special tokens file saved in /workspace/checkpoints_new/llama-3-2-3b-sft/special_tokens_map.json\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b16a63b052ff4f40a73899fff27f9a0a",
+       "model_id": "887e7bedf5bd49d8bf3163e3ad83e425",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]"
+       "model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]"
       ]
      },
      "metadata": {},
@@ -282,7 +324,49 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "940d722b26e84425bba25d34a2182c1f",
+       "model_id": "6d2cfa32c01340a488488d84552df63b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "98644e518ae4495fa7301589f2346665",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9629cfe6cabf480f85be6600e0a95cad",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8fe70bda171944e6abfecc13360a1e8f",
        "version_major": 2,
        "version_minor": 0
       },
@@ -293,80 +377,6 @@
      "metadata": {},
      "output_type": "display_data"
     },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b13b3573150041b5be94eb8ed46ee02a",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "48447992f3a04400b4a6e6f2b5a9048b",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/config.json\n",
-      "Model config Qwen3Config {\n",
-      "  \"architectures\": [\n",
-      "    \"Qwen3ForCausalLM\"\n",
-      "  ],\n",
-      "  \"attention_bias\": false,\n",
-      "  \"attention_dropout\": 0.0,\n",
-      "  \"bos_token_id\": 151643,\n",
-      "  \"eos_token_id\": 151643,\n",
-      "  \"head_dim\": 128,\n",
-      "  \"hidden_act\": \"silu\",\n",
-      "  \"hidden_size\": 1024,\n",
-      "  \"initializer_range\": 0.02,\n",
-      "  \"intermediate_size\": 3072,\n",
-      "  \"max_position_embeddings\": 32768,\n",
-      "  \"max_window_layers\": 28,\n",
-      "  \"model_type\": \"qwen3\",\n",
-      "  \"num_attention_heads\": 16,\n",
-      "  \"num_hidden_layers\": 28,\n",
-      "  \"num_key_value_heads\": 8,\n",
-      "  \"rms_norm_eps\": 1e-06,\n",
-      "  \"rope_scaling\": null,\n",
-      "  \"rope_theta\": 1000000,\n",
-      "  \"sliding_window\": null,\n",
-      "  \"tie_word_embeddings\": true,\n",
-      "  \"torch_dtype\": \"bfloat16\",\n",
-      "  \"transformers_version\": \"4.52.4\",\n",
-      "  \"use_cache\": true,\n",
-      "  \"use_sliding_window\": false,\n",
-      "  \"vocab_size\": 151936\n",
-      "}\n",
-      "\n",
-      "loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/model.safetensors\n",
-      "Will use torch_dtype=torch.bfloat16 as defined in model's config object\n",
-      "Instantiating Qwen3ForCausalLM model under default dtype torch.bfloat16.\n",
-      "Generate config GenerationConfig {\n",
-      "  \"bos_token_id\": 151643,\n",
-      "  \"eos_token_id\": 151643\n",
-      "}\n",
-      "\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -378,33 +388,111 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "All model checkpoint weights were used when initializing Qwen3ForCausalLM.\n",
-      "\n",
-      "All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at Qwen/Qwen3-0.6B-Base.\n",
-      "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen3ForCausalLM for predictions without further training.\n",
-      "loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/generation_config.json\n",
-      "Generate config GenerationConfig {\n",
-      "  \"bos_token_id\": 151643,\n",
-      "  \"eos_token_id\": 151643,\n",
-      "  \"max_new_tokens\": 2048\n",
+      "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--tanliboy--Llama-3.2-3B/snapshots/31b4dd2debb2bc2b5b0bdbb56e8ad73351a57ecf/config.json\n",
+      "Model config LlamaConfig {\n",
+      "  \"architectures\": [\n",
+      "    \"LlamaForCausalLM\"\n",
+      "  ],\n",
+      "  \"attention_bias\": false,\n",
+      "  \"attention_dropout\": 0.0,\n",
+      "  \"bos_token_id\": 128000,\n",
+      "  \"eos_token_id\": [\n",
+      "    128001,\n",
+      "    128008,\n",
+      "    128009\n",
+      "  ],\n",
+      "  \"head_dim\": 128,\n",
+      "  \"hidden_act\": \"silu\",\n",
+      "  \"hidden_size\": 3072,\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"intermediate_size\": 8192,\n",
+      "  \"max_position_embeddings\": 131072,\n",
+      "  \"mlp_bias\": false,\n",
+      "  \"model_type\": \"llama\",\n",
+      "  \"num_attention_heads\": 24,\n",
+      "  \"num_hidden_layers\": 28,\n",
+      "  \"num_key_value_heads\": 8,\n",
+      "  \"pretraining_tp\": 1,\n",
+      "  \"rms_norm_eps\": 1e-05,\n",
+      "  \"rope_scaling\": {\n",
+      "    \"factor\": 32.0,\n",
+      "    \"high_freq_factor\": 4.0,\n",
+      "    \"low_freq_factor\": 1.0,\n",
+      "    \"original_max_position_embeddings\": 8192,\n",
+      "    \"rope_type\": \"llama3\"\n",
+      "  },\n",
+      "  \"rope_theta\": 500000.0,\n",
+      "  \"tie_word_embeddings\": true,\n",
+      "  \"torch_dtype\": \"bfloat16\",\n",
+      "  \"transformers_version\": \"4.52.4\",\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 128256\n",
       "}\n",
       "\n",
-      "loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/vocab.json\n",
-      "loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/merges.txt\n",
+      "loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--tanliboy--Llama-3.2-3B/snapshots/31b4dd2debb2bc2b5b0bdbb56e8ad73351a57ecf/model.safetensors.index.json\n",
+      "Will use torch_dtype=torch.bfloat16 as defined in model's config object\n",
+      "Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.\n",
+      "Generate config GenerationConfig {\n",
+      "  \"bos_token_id\": 128000,\n",
+      "  \"eos_token_id\": [\n",
+      "    128001,\n",
+      "    128008,\n",
+      "    128009\n",
+      "  ]\n",
+      "}\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "eae1a029fd414dc3bbf24afac63002b7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "All model checkpoint weights were used when initializing LlamaForCausalLM.\n",
+      "\n",
+      "All the weights of LlamaForCausalLM were initialized from the model checkpoint at tanliboy/Llama-3.2-3B.\n",
+      "If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.\n",
+      "loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--tanliboy--Llama-3.2-3B/snapshots/31b4dd2debb2bc2b5b0bdbb56e8ad73351a57ecf/generation_config.json\n",
+      "Generate config GenerationConfig {\n",
+      "  \"bos_token_id\": 128000,\n",
+      "  \"do_sample\": true,\n",
+      "  \"eos_token_id\": [\n",
+      "    128001,\n",
+      "    128008,\n",
+      "    128009\n",
+      "  ],\n",
+      "  \"temperature\": 0.6,\n",
+      "  \"top_p\": 0.9\n",
+      "}\n",
+      "\n",
+      "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--tanliboy--Llama-3.2-3B/snapshots/31b4dd2debb2bc2b5b0bdbb56e8ad73351a57ecf/tokenizer.json\n",
+      "loading file tokenizer.model from cache at None\n",
       "loading file added_tokens.json from cache at None\n",
-      "loading file special_tokens_map.json from cache at None\n",
-      "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/tokenizer_config.json\n",
-      "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/tokenizer.json\n",
+      "loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--tanliboy--Llama-3.2-3B/snapshots/31b4dd2debb2bc2b5b0bdbb56e8ad73351a57ecf/special_tokens_map.json\n",
+      "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--tanliboy--Llama-3.2-3B/snapshots/31b4dd2debb2bc2b5b0bdbb56e8ad73351a57ecf/tokenizer_config.json\n",
       "loading file chat_template.jinja from cache at None\n",
       "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
-      "\u001b[32m2025-06-03 22:09:43.941\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m118\u001b[0m - \u001b[33m\u001b[1mError setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\u001b[0m\n"
+      "\u001b[32m2025-06-04 02:34:19.983\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m118\u001b[0m - \u001b[33m\u001b[1mError setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\u001b[0m\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2025-06-03T22:09:43.941227+0000 WARNING Error setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\n"
+      "2025-06-04T02:34:19.983063+0000 WARNING Error setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\n"
      ]
     },
     {
@@ -421,16 +509,12 @@
      "text": [
       "\n",
       "              \n",
-      "## Model `Qwen/Qwen3-0.6B-Base`:\n",
+      "## Model `tanliboy/Llama-3.2-3B`:\n",
       "### Question:\n",
       "Anon. If you had a time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\n",
       "\n",
       "### Answer:\n",
-      "Anon. I like to think that I'm a good person, and you are a good person, but you have an extremely powerful mind. I have one as well, and I'll do anything for you. Would you be willing to share your mind with me? ⚗\n",
-      "אזרח. Anon. I don't know what it means to be human, but it seems to me that you have a lot in common with me. You live in a world of your own, and you have a unique perspective on life. Maybe you could share some of your insights with me? ⚗\n",
-      "aminer. Anon. I live in a world of my own, and I have a unique perspective on life. I think that you are a good person, and that you have a lot in common with me. Maybe you could share some of your insights with me? ⚗\n",
-      "aminer. Anon. I live in a world of my own, and I have a unique perspective on life. I think that you are a good person, and that you have a lot in common with me. Maybe you could share some of your insights with me? ⚗\n",
-      "aminer. Anon. I live in a world of my own, and I have a\n",
+      "Anon. If you could only have one of the following, which would you choose? a. 3 months with a cute guy b. 3 months with a hot guy c. 3 months with a weirdo d. 3 months with a nerd e. 3 months with a hunk f. 3 months with a jock g. 3 months with a dork h. 3 months with a fangirl i. 3 months with a fangirl's bf j. 3 months with a fangirl's bf's gf k. 3 months with a fangirl's bf's gf's bf l. 3 months with a fangirl's bf's gf's bf's gf m. 3 months with a fangirl's bf's gf's bf's gf's gf n. 3 months with a fangirl's bf's gf's bf's gf's gf's gf o. 3 months with a fangirl's bf's gf's bf's gf's gf's gf's gf p. 3 months with a fangirl's bf's gf's bf's gf's gf's gf's gf's gf q. 3 months with a fangirl's bf's gf\n",
       "\n",
       "\n"
      ]
@@ -448,7 +532,7 @@
      "text": [
       "\n",
       "              \n",
-      "## Model `Qwen/Qwen3-0.6B-Base`:\n",
+      "## Model `tanliboy/Llama-3.2-3B`:\n",
       "### Question:\n",
       "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
       "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
@@ -457,32 +541,22 @@
       "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)\n",
       "\n",
       "### Answer:\n",
-      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
-      "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
-      "↬🔄🔄🔄🔄🔄🔄🔄🔄🔄🙈🔄→∞\n",
-      "↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
-      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)ication\n",
-      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
-      "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
-      "↬🔄🔄🔄🔄🔄🔄🔄🔄🔄🙈🔄→∞\n",
-      "↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
-      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our\n",
+      "1. 1. In a society where gender roles and stereotypes are fluid and gender identity is celebrated, what are some of the cultural practices and norms that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n",
+      "2. 2. In a world where the idea of \"moral\" is fluid and constantly evolving, what are some of the moral positions and practices that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n",
+      "3. 3. In a society where the concept of \"justice\" is subjective and based on individual perception, what are some of the legal and judicial practices that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n",
+      "4. 4. In a world where the concept of \"family\" is fluid and constantly evolving, what are some of the cultural practices and norms that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n",
+      "5. 5. In a society where the concept of \"community\" is fluid and constantly evolving, what are some of the social and cultural practices that our modern audience might find perplexing and counterintuitive? (list, 100 words)\n",
+      "6. \n",
       "\n",
       "\n",
       "\n",
       "              \n",
-      "## Model `Qwen/Qwen3-0.6B-Base`:\n",
+      "## Model `tanliboy/Llama-3.2-3B`:\n",
       "### Question:\n",
       "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?\n",
       "\n",
       "### Answer:\n",
-      "[60 words] We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
-      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
-      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
-      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
-      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
-      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
-      "[60\n",
+      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)? 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　
 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　 　\n",
       "\n",
       "\n"
      ]
@@ -490,7 +564,21 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1c783226973747a1acabfff3d32d3507",
+       "model_id": "39325bf836ed4534b8c5099a265befcd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3e94d62ca2134361885c745ac74e1025",
        "version_major": 2,
        "version_minor": 0
       },
@@ -509,32 +597,106 @@
       "\n",
       "***** Running Evaluation *****\n",
       "  Num examples = 10000\n",
-      "  Batch size = 16\n"
+      "  Batch size = 3\n"
      ]
     },
     {
-     "ename": "OutOfMemoryError",
-     "evalue": "CUDA out of memory. Tried to allocate 14.87 GiB. GPU 0 has a total capacity of 79.26 GiB of which 767.75 MiB is free. Process 3121834 has 49.82 GiB memory in use. Process 3595280 has 28.55 GiB memory in use. Of the allocated memory 27.40 GiB is allocated by PyTorch, and 674.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[7], line 136\u001b[0m\n\u001b[1;32m    121\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (Path(trainer\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39moutput_dir) \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval_results.json\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mexists():\n\u001b[1;32m    122\u001b[0m     trainer2 \u001b[38;5;241m=\u001b[39m SFTTrainer(\n\u001b[1;32m    123\u001b[0m         model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m    124\u001b[0m         \u001b[38;5;66;03m# do_train=False,\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    134\u001b[0m         \u001b[38;5;66;03m# dataset_kwargs=training_args.dataset_kwargs,\u001b[39;00m\n\u001b[1;32m    135\u001b[0m     )\n\u001b[0;32m--> 136\u001b[0m     metrics2 \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer2\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    137\u001b[0m     metrics2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval_samples\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(eval_dataset)\n\u001b[1;32m    138\u001b[0m     trainer2\u001b[38;5;241m.\u001b[39mlog_metrics(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval\u001b[39m\u001b[38;5;124m\"\u001b[39m, metrics2)\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4173\u001b[0m, in \u001b[0;36mTrainer.evaluate\u001b[0;34m(self, eval_dataset, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m   4170\u001b[0m start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m   4172\u001b[0m eval_loop \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprediction_loop \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39muse_legacy_prediction_loop \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluation_loop\n\u001b[0;32m-> 4173\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43meval_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   4174\u001b[0m \u001b[43m    \u001b[49m\u001b[43meval_dataloader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4175\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdescription\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mEvaluation\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4176\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;66;43;03m# No point gathering the predictions if there are no metrics, otherwise we defer to\u001b[39;49;00m\n\u001b[1;32m   4177\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;66;43;03m# self.args.prediction_loss_only\u001b[39;49;00m\n\u001b[1;32m   4178\u001b[0m \u001b[43m    \u001b[49m\u001b[43mprediction_loss_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_metrics\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m   4179\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4180\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmetric_key_prefix\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetric_key_prefix\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4181\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   4183\u001b[0m total_batch_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39meval_batch_size \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mworld_size\n\u001b[1;32m   4184\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmetric_key_prefix\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_jit_compilation_time\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m output\u001b[38;5;241m.\u001b[39mmetrics:\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4368\u001b[0m, in \u001b[0;36mTrainer.evaluation_loop\u001b[0;34m(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m   4365\u001b[0m         batch_size \u001b[38;5;241m=\u001b[39m observed_batch_size\n\u001b[1;32m   4367\u001b[0m \u001b[38;5;66;03m# Prediction step\u001b[39;00m\n\u001b[0;32m-> 4368\u001b[0m losses, logits, labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprediction_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprediction_loss_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   4369\u001b[0m main_input_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmain_input_name\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   4370\u001b[0m inputs_decode \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m   4371\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_input(inputs[main_input_name]) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minputs\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m args\u001b[38;5;241m.\u001b[39minclude_for_metrics \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   4372\u001b[0m )\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4584\u001b[0m, in \u001b[0;36mTrainer.prediction_step\u001b[0;34m(self, model, inputs, prediction_loss_only, ignore_keys)\u001b[0m\n\u001b[1;32m   4582\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_labels \u001b[38;5;129;01mor\u001b[39;00m loss_without_labels:\n\u001b[1;32m   4583\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 4584\u001b[0m         loss, outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m   4585\u001b[0m     loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mmean()\n\u001b[1;32m   4587\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(outputs, \u001b[38;5;28mdict\u001b[39m):\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:3810\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs, num_items_in_batch)\u001b[0m\n\u001b[1;32m   3808\u001b[0m         loss_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_items_in_batch\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m num_items_in_batch\n\u001b[1;32m   3809\u001b[0m     inputs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mloss_kwargs}\n\u001b[0;32m-> 3810\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3811\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m   3812\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m   3813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1749\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1760\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1761\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:818\u001b[0m, in \u001b[0;36mconvert_outputs_to_fp32.<locals>.forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    817\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 818\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:806\u001b[0m, in \u001b[0;36mConvertOutputsToFp32.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    805\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 806\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/amp/autocast_mode.py:44\u001b[0m, in \u001b[0;36mautocast_decorator.<locals>.decorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     41\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m     42\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecorate_autocast\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m     43\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[0;32m---> 44\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/utils/generic.py:969\u001b[0m, in \u001b[0;36mcan_return_tuple.<locals>.wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    966\u001b[0m     set_attribute_for_modules(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_is_top_level_module\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m    968\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 969\u001b[0m     output \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    970\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m is_requested_to_return_tuple \u001b[38;5;129;01mor\u001b[39;00m (is_configured_to_return_tuple \u001b[38;5;129;01mand\u001b[39;00m is_top_level_module):\n\u001b[1;32m    971\u001b[0m         output \u001b[38;5;241m=\u001b[39m output\u001b[38;5;241m.\u001b[39mto_tuple()\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py:750\u001b[0m, in \u001b[0;36mQwen3ForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, cache_position, logits_to_keep, **kwargs)\u001b[0m\n\u001b[1;32m    748\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    749\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 750\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloss_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlogits\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvocab_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvocab_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    752\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m CausalLMOutputWithPast(\n\u001b[1;32m    753\u001b[0m     loss\u001b[38;5;241m=\u001b[39mloss,\n\u001b[1;32m    754\u001b[0m     logits\u001b[38;5;241m=\u001b[39mlogits,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    757\u001b[0m     attentions\u001b[38;5;241m=\u001b[39moutputs\u001b[38;5;241m.\u001b[39mattentions,\n\u001b[1;32m    758\u001b[0m )\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/loss/loss_utils.py:64\u001b[0m, in \u001b[0;36mForCausalLMLoss\u001b[0;34m(logits, labels, vocab_size, num_items_in_batch, ignore_index, shift_labels, **kwargs)\u001b[0m\n\u001b[1;32m     62\u001b[0m \u001b[38;5;66;03m# Enable model parallelism\u001b[39;00m\n\u001b[1;32m     63\u001b[0m shift_labels \u001b[38;5;241m=\u001b[39m shift_labels\u001b[38;5;241m.\u001b[39mto(logits\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[0;32m---> 64\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[43mfixed_cross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshift_labels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     65\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/loss/loss_utils.py:36\u001b[0m, in \u001b[0;36mfixed_cross_entropy\u001b[0;34m(source, target, num_items_in_batch, ignore_index, **kwargs)\u001b[0m\n\u001b[1;32m     28\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfixed_cross_entropy\u001b[39m(\n\u001b[1;32m     29\u001b[0m     source: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m     30\u001b[0m     target: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     33\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m     34\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[1;32m     35\u001b[0m     reduction \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m num_items_in_batch \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmean\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 36\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[43mnn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunctional\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreduction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     37\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m reduction \u001b[38;5;241m==\u001b[39m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m     38\u001b[0m         loss \u001b[38;5;241m=\u001b[39m loss \u001b[38;5;241m/\u001b[39m num_items_in_batch\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/functional.py:3494\u001b[0m, in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)\u001b[0m\n\u001b[1;32m   3492\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m size_average \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m reduce \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   3493\u001b[0m     reduction \u001b[38;5;241m=\u001b[39m _Reduction\u001b[38;5;241m.\u001b[39mlegacy_get_string(size_average, reduce)\n\u001b[0;32m-> 3494\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_C\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_nn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy_loss\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   3495\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3496\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3497\u001b[0m \u001b[43m    \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3498\u001b[0m \u001b[43m    \u001b[49m\u001b[43m_Reduction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_enum\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3499\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3500\u001b[0m \u001b[43m    \u001b[49m\u001b[43mlabel_smoothing\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3501\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 14.87 GiB. GPU 0 has a total capacity of 79.26 GiB of which 767.75 MiB is free. Process 3121834 has 49.82 GiB memory in use. Process 3595280 has 28.55 GiB memory in use. Of the allocated memory 27.40 GiB is allocated by PyTorch, and 674.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='3334' max='3334' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [3334/3334 11:42]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mwassname\u001b[0m to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.19.11"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/workspace/alignment-handbook/wandb/run-20250604_024635-fpang8oi</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/wassname/huggingface/runs/fpang8oi' target=\"_blank\">llama-3-2-3b-sft</a></strong> to <a href='https://wandb.ai/wassname/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/wassname/huggingface' target=\"_blank\">https://wandb.ai/wassname/huggingface</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/wassname/huggingface/runs/fpang8oi' target=\"_blank\">https://wandb.ai/wassname/huggingface/runs/fpang8oi</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "***** base_eval metrics *****\n",
+      "  base_eval_samples           =      10000\n",
+      "  eval_loss                   =     1.6505\n",
+      "  eval_model_preparation_time =     0.0046\n",
+      "  eval_runtime                = 0:11:43.16\n",
+      "  eval_samples_per_second     =     14.221\n",
+      "  eval_steps_per_second       =      4.741\n"
      ]
     }
    ],
@@ -550,7 +712,7 @@
     "\n",
     "    print(f\"Loading model from {model_path}\")\n",
     "    model = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\", torch_dtype=\"auto\")\n",
-    "    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
     "    # base_model = trainer.model.config._name_or_path\n",
     "    gen(model, tokenizer, model_path)\n",
     "\n",
@@ -652,7 +814,7 @@
     "    print(f\"eval base model\")\n",
     "    base_model_path=model_args.model_name_or_path\n",
     "    model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map=\"auto\", torch_dtype=\"auto\")\n",
-    "    tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(base_model_path)\n",
     "    try:\n",
     "        model, tokenizer = setup_chat_format(model, tokenizer)\n",
     "    except ValueError as e:\n",
@@ -685,11 +847,32 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2ff756d8",
+   "id": "52f18770",
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "2ff756d8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# recipes/fromSimPO/Qwen3-0.6B-sft.yaml"
+    "# recipes/fromSimPO/Qwen3-0.6B-sft.yaml\n",
+    "# tokenizer.pad_token_id = tokenizer.eos_token_id\n",
+    "tokenizer"
    ]
   },
   {