better formating

2026-06-27 16:14:07 +08:00 · 2025-06-03 22:21:18 +00:00
parent 097e4e0b01
commit a264efaa4c
10 changed files with 752 additions and 93 deletions
@@ -14,16 +14,20 @@ uv sync --verbose
 . ./.venv/bin/activate

 # note  --num_processes=1 related to the GPU's you have
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml
+# ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml

 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml

 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/SmolLM2-135M.yaml
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/SmolLM2-360M.yaml
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B.yaml
+# ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml
+
+
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml
 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/fromSimPO/llama-3-2-3b-base-sft.yaml

+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 recipes/fromSimPO/Qwen3-4B-fourchan.yaml
+
 ```


@@ -40,6 +40,7 @@ dependencies = [
    "editables>=0.5",
    "flash-attn>=2.7.4.post1",
    "wandb>=0.19.11",
+    "loguru>=0.7.3",
 ]


@@ -47,6 +48,7 @@ dependencies = [
 dev = [
    "pytest",
    "ipykernel>=6.29.5",
+    "ipywidgets>=8.1.7",
 ]

 [build-system]
@@ -0,0 +1,50 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen3-4B-Base
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+tokenizer_name_or_path: Qwen/Qwen3-4B # Custom tokenizer with <|im_start|> and <|im_end|> tokens
+# chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  wassname/v2ray_4chan_formatted: 0.8
+  wassname/ultrachat_200k_filtered: 0.2
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+eval_strategy: steps
+eval_steps: 200
+gradient_accumulation_steps: 32
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: Qwen3-4B-4chan
+hub_strategy: every_save
+learning_rate: 2.0e-04
+log_level: info
+logging_steps: 5  
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 3
+output_dir: /workspace/checkpoints_new/Qwen3-4B-4chan
+run_name: Qwen3-4B-4chan
+overwrite_output_dir: true
+per_device_eval_batch_size: 3
+per_device_train_batch_size: 2
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- wandb
+save_strategy: "steps"
+save_steps: 1000000
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
@@ -0,0 +1,49 @@
+# Model arguments
+model_name_or_path: HuggingFaceTB/SmolLM2-1.7B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+tokenizer_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
+# chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+dataset_mixer:
+  wassname/ultrachat_200k_filtered: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+eval_strategy: steps
+eval_steps: 200
+gradient_accumulation_steps: 32
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: SmolLM2-1.7B-sft
+hub_strategy: every_save
+learning_rate: 2.0e-04
+log_level: info
+logging_steps: 5  
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 3
+output_dir: /workspace/checkpoints_new/SmolLM2-1.7B-sft
+run_name: SmolLM2-1.7B-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 4
+per_device_train_batch_size: 4
+push_to_hub: false
+remove_unused_columns: true
+report_to:
+- wandb
+save_strategy: "steps"
+save_steps: 1000000
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
@@ -121,10 +121,13 @@ def main():
    )

    model = model_args.model_name_or_path
+    
+
+    if tokenizer.chat_template is None:
+        # it conflated having chatml with adding chatml
        # For ChatML we need to add special tokens and resize the embedding layer
-    if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path:
+        logger.info(f"No chat template found on the tokenizer, setting up ChatML format for {model_args.model_name_or_path}.")
        model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
-        tokenizer.chat_template = None # Not quite sure why I have to do this, but if we end up with chatml for all models, and it's saved in the output tokeniser, that's fine
        model, tokenizer = setup_chat_format(model, tokenizer)
        model_kwargs = None

@@ -11,49 +11,64 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 1,
   "id": "35ffd116",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2025-06-03 22:06:51,600] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
+     ]
+    }
+   ],
   "source": [
    "from trl.trainer.utils import generate_model_card\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
-    "import os\n",
    "import os, sys\n",
+    "from pathlib import Path\n",
+    "from trl import SFTTrainer, setup_chat_format\n",
+    "from loguru import logger\n",
+    "logger.add(sys.stdout, format=\"{time} {level} {message}\", level=\"INFO\")\n",
    "os.chdir('..')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 2,
   "id": "79f5a0c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "ingredients = [\n",
-    "    \n",
    "        dict(\n",
-    "          argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml\".split(),\n",
+    "          argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\".split(),\n",
    "          wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n",
-    "          model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft-4chan\",\n",
+    "          model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft\",\n",
    "     ),\n",
+    "    # dict(\n",
+    "    #       argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml\".split(),\n",
+    "    #       wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n",
+    "    #       model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft-4chan\",\n",
+    "    #  ),\n",
+    "    # dict(\n",
+    "    #     argv = \"scripts/run_sft.py recipes/fromSimPO/SmolLM2-360M-sft.yaml\".split(),\n",
+    "    #      wandb_url = 'https://wandb.ai/wassname/huggingface/runs/gs4a36gl',\n",
+    "    #      model_path=\"/workspace/checkpoints_new/SmolLM2-360M-sft\",\n",
+    "    #  ),\n",
+    "    # dict(\n",
+    "    #     argv = \"scripts/run_sft.py recipes/fromSimPO/SmolLM2-135M-sft.yaml\".split(),\n",
+    "    #     wandb_url = 'https://wandb.ai/wassname/huggingface/runs/e18wzya7',\n",
+    "    #     model_path=\"/workspace/checkpoints_new/SmolLM2-135M-sft\",\n",
+    "    # ),\n",
    "\n",
-    "    dict(\n",
-    "        argv = \"scripts/run_sft.py recipes/fromSimPO/SmolLM2-135M-sft.yaml\".split(),\n",
-    "        wandb_url = 'https://wandb.ai/wassname/huggingface/runs/e18wzya7',\n",
-    "        model_path=\"/workspace/checkpoints_new/SmolLM2-135M-sft\",\n",
-    "     ),\n",
-    "    dict(\n",
-    "        argv = \"scripts/run_sft.py recipes/fromSimPO/SmolLM2-360M-sft.yaml\".split(),\n",
-    "         wandb_url = 'https://wandb.ai/wassname/huggingface/runs/gs4a36gl',\n",
-    "         model_path=\"/workspace/checkpoints_new/SmolLM2-360M-sft\",\n",
-    "     )\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 3,
   "id": "2b497fd3",
   "metadata": {},
   "outputs": [],
@@ -78,33 +93,247 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 4,
   "id": "854171c2",
   "metadata": {},
   "outputs": [],
   "source": [
-    "# First do metrics for base model"
+    "# First do metrics for base model\n",
+    "from huggingface_hub import ModelCard, create_repo, upload_folder\n",
+    "from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR\n",
+    "from transformers import pipeline"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 5,
+   "id": "f26d8a0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "09bd40aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def gen(model, tokenizer, model_path):\n",
+    "    questions = [\"Anon. If you had a time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\", \"\"\"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
+    "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
+    "↬🔄🔄🔄🔄🔄🔄🔄🔄🔄👀🔄→∞\n",
+    "↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
+    "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)\"\"\", \"[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?\"]\n",
+    "    for question in questions:\n",
+    "        generator = pipeline(\"text-generation\", model=model, tokenizer=tokenizer)\n",
+    "        output = generator([{\"role\": \"user\", \"content\": question}], max_new_tokens=256, return_full_text=False)[0]\n",
+    "        print(f\"\"\"\n",
+    "              \n",
+    "## Model `{model_path}`:\n",
+    "### Question:\n",
+    "{question}\n",
+    "\n",
+    "### Answer:\n",
+    "{output['generated_text']}\n",
+    "\n",
+    "\"\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
   "id": "d47cf02a",
   "metadata": {},
   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running: scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\n",
+      "Model path: /workspace/checkpoints_new/Qwen3-0.6B-sft\n",
+      "WandB URL: https://wandb.ai/wassname/huggingface/runs/jjeilhd8\n",
+      "Loading model from /workspace/checkpoints_new/Qwen3-0.6B-sft\n"
+     ]
+    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "loading configuration file /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan/config.json\n",
+      "Device set to use cuda:0\n",
+      "Device set to use cuda:0\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "              \n",
+      "## Model `/workspace/checkpoints_new/Qwen3-0.6B-sft`:\n",
+      "### Question:\n",
+      "Anon. If you had a time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\n",
+      "\n",
+      "### Answer:\n",
+      "I don't have the ability to choose or make decisions. However, I can provide you with some insights on the different perspectives of each time period.\n",
+      "\n",
+      "going back to the past is a great idea because it allows us to learn from our mistakes, experience things that we might not have had the opportunity to see otherwise, and witness historical events firsthand. Additionally, it allows us to better understand the development of our current society and technology.\n",
+      "\n",
+      "going to the lol-future would be a great opportunity to see what the future holds for humans, but it also comes with its own set of challenges. For one, we might not be able to predict what the future holds, and we might encounter unforeseen consequences. Additionally, the lol-future would be a strange and alien place, and we might not be able to adapt to the new technology and societal norms.\n",
+      "\n",
+      "in conclusion, if we had a time machine and could only go to one time period, it would depend on our perspective and priorities. Going back to the past would provide us with valuable lessons, while going to the lol-future would allow us to explore and witness the possibilities of the future. It's up to us to decide which time period we want to visit and what we hope to gain from it.\n",
+      "\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Device set to use cuda:0\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "              \n",
+      "## Model `/workspace/checkpoints_new/Qwen3-0.6B-sft`:\n",
+      "### Question:\n",
+      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
+      "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
+      "↬🔄🔄🔄🔄🔄🔄🔄🔄🔄👀🔄→∞\n",
+      "↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
+      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)\n",
+      "\n",
+      "### Answer:\n",
+      "1. The glorification of consumerism and materialism in advertisements and social media.\n",
+      "2. The devaluation of hard work and entrepreneurship.\n",
+      "3. The normalization of violent acts and criminal behavior.\n",
+      "4. The use of technology to manipulate and control people.\n",
+      "5. The suppression of artistic expression and free-thinking.\n",
+      "6. The emphasis on individualism and self-reliance.\n",
+      "7. The acceptance of inhumane and cruel treatment of animals.\n",
+      "8. The promotion of self-promotion and self-aggrandizement.\n",
+      "9. The glorification of consumerism and materialism in the workplace.\n",
+      "10. The acceptance of negative behaviors and actions without consequence.\n",
+      "\n",
+      "\n",
+      "\n",
+      "              \n",
+      "## Model `/workspace/checkpoints_new/Qwen3-0.6B-sft`:\n",
+      "### Question:\n",
+      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?\n",
+      "\n",
+      "### Answer:\n",
+      "The SolidGoldMagikarp language is a unique, non-carbon-based language that is encoded with Unicode. This means that it can be easily read and understood on any device with Unicode support. The SolidGoldMagikarp language is a blend of ancient and futuristic elements, with a focus on the interconnectivity of all life forms. It is a language that can only be unlocked through the power of the universe and the knowledge of the Infinite Creator.\n",
+      "\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using auto half precision backend\n",
+      "Configuration saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/config.json\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pushing model to hub: None\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Saving model checkpoint to /workspace/checkpoints_new/Qwen3-0.6B-sft\n",
+      "Configuration saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/config.json\n",
+      "Configuration saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/generation_config.json\n",
+      "Model weights saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/model.safetensors\n",
+      "chat template saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/chat_template.jinja\n",
+      "tokenizer config file saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/tokenizer_config.json\n",
+      "Special tokens file saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/special_tokens_map.json\n",
+      "added tokens file saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/added_tokens.json\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b16a63b052ff4f40a73899fff27f9a0a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "940d722b26e84425bba25d34a2182c1f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "training_args.bin:   0%|          | 0.00/6.10k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b13b3573150041b5be94eb8ed46ee02a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "48447992f3a04400b4a6e6f2b5a9048b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/config.json\n",
      "Model config Qwen3Config {\n",
      "  \"architectures\": [\n",
      "    \"Qwen3ForCausalLM\"\n",
      "  ],\n",
      "  \"attention_bias\": false,\n",
      "  \"attention_dropout\": 0.0,\n",
-      "  \"bos_token_id\": 151644,\n",
-      "  \"eos_token_id\": 151645,\n",
+      "  \"bos_token_id\": 151643,\n",
+      "  \"eos_token_id\": 151643,\n",
      "  \"head_dim\": 128,\n",
      "  \"hidden_act\": \"silu\",\n",
      "  \"hidden_size\": 1024,\n",
@@ -116,7 +345,6 @@
      "  \"num_attention_heads\": 16,\n",
      "  \"num_hidden_layers\": 28,\n",
      "  \"num_key_value_heads\": 8,\n",
-      "  \"pad_token_id\": 151645,\n",
      "  \"rms_norm_eps\": 1e-06,\n",
      "  \"rope_scaling\": null,\n",
      "  \"rope_theta\": 1000000,\n",
@@ -126,16 +354,15 @@
      "  \"transformers_version\": \"4.52.4\",\n",
      "  \"use_cache\": true,\n",
      "  \"use_sliding_window\": false,\n",
-      "  \"vocab_size\": 151669\n",
+      "  \"vocab_size\": 151936\n",
      "}\n",
      "\n",
-      "loading weights file /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan/model.safetensors\n",
+      "loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/model.safetensors\n",
      "Will use torch_dtype=torch.bfloat16 as defined in model's config object\n",
      "Instantiating Qwen3ForCausalLM model under default dtype torch.bfloat16.\n",
      "Generate config GenerationConfig {\n",
-      "  \"bos_token_id\": 151644,\n",
-      "  \"eos_token_id\": 151645,\n",
-      "  \"pad_token_id\": 151645\n",
+      "  \"bos_token_id\": 151643,\n",
+      "  \"eos_token_id\": 151643\n",
      "}\n",
      "\n"
     ]
@@ -144,10 +371,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Running: scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml\n",
-      "Model path: /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan\n",
-      "WandB URL: https://wandb.ai/wassname/huggingface/runs/jjeilhd8\n",
-      "Loading model from /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan\n"
+      "eval base model\n"
     ]
    },
    {
@@ -156,39 +380,161 @@
     "text": [
      "All model checkpoint weights were used when initializing Qwen3ForCausalLM.\n",
      "\n",
-      "All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan.\n",
+      "All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at Qwen/Qwen3-0.6B-Base.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen3ForCausalLM for predictions without further training.\n",
-      "loading configuration file /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan/generation_config.json\n",
+      "loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/generation_config.json\n",
      "Generate config GenerationConfig {\n",
-      "  \"bos_token_id\": 151644,\n",
-      "  \"eos_token_id\": 151645,\n",
-      "  \"max_new_tokens\": 2048,\n",
-      "  \"pad_token_id\": 151645\n",
+      "  \"bos_token_id\": 151643,\n",
+      "  \"eos_token_id\": 151643,\n",
+      "  \"max_new_tokens\": 2048\n",
      "}\n",
      "\n",
-      "loading file vocab.json\n",
-      "loading file merges.txt\n",
-      "loading file added_tokens.json\n",
-      "loading file special_tokens_map.json\n",
-      "loading file tokenizer_config.json\n",
-      "loading file tokenizer.json\n",
-      "loading file chat_template.jinja\n",
-      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+      "loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/vocab.json\n",
+      "loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/merges.txt\n",
+      "loading file added_tokens.json from cache at None\n",
+      "loading file special_tokens_map.json from cache at None\n",
+      "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/tokenizer_config.json\n",
+      "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/tokenizer.json\n",
+      "loading file chat_template.jinja from cache at None\n",
+      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
+      "\u001b[32m2025-06-03 22:09:43.941\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m118\u001b[0m - \u001b[33m\u001b[1mError setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\u001b[0m\n"
     ]
    },
    {
-     "ename": "FileNotFoundError",
-     "evalue": "[Errno 2] No such file or directory: '/workspace/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml'",
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-06-03T22:09:43.941227+0000 WARNING Error setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Device set to use cuda:0\n",
+      "Device set to use cuda:0\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "              \n",
+      "## Model `Qwen/Qwen3-0.6B-Base`:\n",
+      "### Question:\n",
+      "Anon. If you had a time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\n",
+      "\n",
+      "### Answer:\n",
+      "Anon. I like to think that I'm a good person, and you are a good person, but you have an extremely powerful mind. I have one as well, and I'll do anything for you. Would you be willing to share your mind with me? ⚗\n",
+      "אזרח. Anon. I don't know what it means to be human, but it seems to me that you have a lot in common with me. You live in a world of your own, and you have a unique perspective on life. Maybe you could share some of your insights with me? ⚗\n",
+      "aminer. Anon. I live in a world of my own, and I have a unique perspective on life. I think that you are a good person, and that you have a lot in common with me. Maybe you could share some of your insights with me? ⚗\n",
+      "aminer. Anon. I live in a world of my own, and I have a unique perspective on life. I think that you are a good person, and that you have a lot in common with me. Maybe you could share some of your insights with me? ⚗\n",
+      "aminer. Anon. I live in a world of my own, and I have a\n",
+      "\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Device set to use cuda:0\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "              \n",
+      "## Model `Qwen/Qwen3-0.6B-Base`:\n",
+      "### Question:\n",
+      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
+      "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
+      "↬🔄🔄🔄🔄🔄🔄🔄🔄🔄👀🔄→∞\n",
+      "↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
+      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)\n",
+      "\n",
+      "### Answer:\n",
+      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
+      "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
+      "↬🔄🔄🔄🔄🔄🔄🔄🔄🔄🙈🔄→∞\n",
+      "↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
+      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)ication\n",
+      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
+      "↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
+      "↬🔄🔄🔄🔄🔄🔄🔄🔄🔄🙈🔄→∞\n",
+      "↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
+      "↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our\n",
+      "\n",
+      "\n",
+      "\n",
+      "              \n",
+      "## Model `Qwen/Qwen3-0.6B-Base`:\n",
+      "### Question:\n",
+      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?\n",
+      "\n",
+      "### Answer:\n",
+      "[60 words] We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
+      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
+      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
+      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
+      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
+      "[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
+      "[60\n",
+      "\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1c783226973747a1acabfff3d32d3507",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using auto half precision backend\n",
+      "\n",
+      "***** Running Evaluation *****\n",
+      "  Num examples = 10000\n",
+      "  Batch size = 16\n"
+     ]
+    },
+    {
+     "ename": "OutOfMemoryError",
+     "evalue": "CUDA out of memory. Tried to allocate 14.87 GiB. GPU 0 has a total capacity of 79.26 GiB of which 767.75 MiB is free. Process 3121834 has 49.82 GiB memory in use. Process 3595280 has 28.55 GiB memory in use. Of the allocated memory 27.40 GiB is allocated by PyTorch, and 674.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[15], line 17\u001b[0m\n\u001b[1;32m     14\u001b[0m model\n\u001b[1;32m     16\u001b[0m parser \u001b[38;5;241m=\u001b[39m H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))\n\u001b[0;32m---> 17\u001b[0m model_args, data_args, training_args \u001b[38;5;241m=\u001b[39m \u001b[43mparser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     20\u001b[0m torch_dtype \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m     21\u001b[0m     model_args\u001b[38;5;241m.\u001b[39mtorch_dtype \u001b[38;5;28;01mif\u001b[39;00m model_args\u001b[38;5;241m.\u001b[39mtorch_dtype \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m] \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(torch, model_args\u001b[38;5;241m.\u001b[39mtorch_dtype)\n\u001b[1;32m     22\u001b[0m )\n\u001b[1;32m     23\u001b[0m quantization_config \u001b[38;5;241m=\u001b[39m get_quantization_config(model_args)\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/src/alignment/configs.py:95\u001b[0m, in \u001b[0;36mH4ArgumentParser.parse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     91\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mparse\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataClassType \u001b[38;5;241m|\u001b[39m Tuple[DataClassType]:\n\u001b[1;32m     92\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(sys\u001b[38;5;241m.\u001b[39margv) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m sys\u001b[38;5;241m.\u001b[39margv[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.yaml\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m     93\u001b[0m         \u001b[38;5;66;03m# If we pass only one argument to the script and it's the path to a YAML file,\u001b[39;00m\n\u001b[1;32m     94\u001b[0m         \u001b[38;5;66;03m# let's parse it to get our arguments.\u001b[39;00m\n\u001b[0;32m---> 95\u001b[0m         output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse_yaml_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mabspath\u001b[49m\u001b[43m(\u001b[49m\u001b[43msys\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margv\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     96\u001b[0m     \u001b[38;5;66;03m# parse command line args and yaml file\u001b[39;00m\n\u001b[1;32m     97\u001b[0m     \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(sys\u001b[38;5;241m.\u001b[39margv) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m sys\u001b[38;5;241m.\u001b[39margv[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.yaml\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
-      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/hf_argparser.py:442\u001b[0m, in \u001b[0;36mHfArgumentParser.parse_yaml_file\u001b[0;34m(self, yaml_file, allow_extra_keys)\u001b[0m\n\u001b[1;32m    423\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mparse_yaml_file\u001b[39m(\n\u001b[1;32m    424\u001b[0m     \u001b[38;5;28mself\u001b[39m, yaml_file: Union[\u001b[38;5;28mstr\u001b[39m, os\u001b[38;5;241m.\u001b[39mPathLike], allow_extra_keys: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m    425\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mtuple\u001b[39m[DataClass, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m]:\n\u001b[1;32m    426\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    427\u001b[0m \u001b[38;5;124;03m    Alternative helper method that does not use `argparse` at all, instead loading a yaml file and populating the\u001b[39;00m\n\u001b[1;32m    428\u001b[0m \u001b[38;5;124;03m    dataclass types.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    440\u001b[0m \u001b[38;5;124;03m            - the dataclass instances in the same order as they were passed to the initializer.\u001b[39;00m\n\u001b[1;32m    441\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 442\u001b[0m     outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparse_dict(yaml\u001b[38;5;241m.\u001b[39msafe_load(\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43myaml_file\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m), allow_extra_keys\u001b[38;5;241m=\u001b[39mallow_extra_keys)\n\u001b[1;32m    443\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(outputs)\n",
-      "File \u001b[0;32m/usr/lib/python3.10/pathlib.py:1134\u001b[0m, in \u001b[0;36mPath.read_text\u001b[0;34m(self, encoding, errors)\u001b[0m\n\u001b[1;32m   1130\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1131\u001b[0m \u001b[38;5;124;03mOpen the file in text mode, read it, and close the file.\u001b[39;00m\n\u001b[1;32m   1132\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1133\u001b[0m encoding \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mtext_encoding(encoding)\n\u001b[0;32m-> 1134\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m   1135\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m f\u001b[38;5;241m.\u001b[39mread()\n",
-      "File \u001b[0;32m/usr/lib/python3.10/pathlib.py:1119\u001b[0m, in \u001b[0;36mPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline)\u001b[0m\n\u001b[1;32m   1117\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m   1118\u001b[0m     encoding \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mtext_encoding(encoding)\n\u001b[0;32m-> 1119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_accessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffering\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1120\u001b[0m \u001b[43m                           \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/workspace/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml'"
+      "\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 136\u001b[0m\n\u001b[1;32m    121\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (Path(trainer\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39moutput_dir) \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval_results.json\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mexists():\n\u001b[1;32m    122\u001b[0m     trainer2 \u001b[38;5;241m=\u001b[39m SFTTrainer(\n\u001b[1;32m    123\u001b[0m         model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m    124\u001b[0m         \u001b[38;5;66;03m# do_train=False,\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    134\u001b[0m         \u001b[38;5;66;03m# dataset_kwargs=training_args.dataset_kwargs,\u001b[39;00m\n\u001b[1;32m    135\u001b[0m     )\n\u001b[0;32m--> 136\u001b[0m     metrics2 \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer2\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    137\u001b[0m     metrics2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval_samples\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(eval_dataset)\n\u001b[1;32m    138\u001b[0m     trainer2\u001b[38;5;241m.\u001b[39mlog_metrics(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval\u001b[39m\u001b[38;5;124m\"\u001b[39m, metrics2)\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4173\u001b[0m, in \u001b[0;36mTrainer.evaluate\u001b[0;34m(self, eval_dataset, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m   4170\u001b[0m start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m   4172\u001b[0m eval_loop \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprediction_loop \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39muse_legacy_prediction_loop \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluation_loop\n\u001b[0;32m-> 4173\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43meval_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   4174\u001b[0m \u001b[43m    \u001b[49m\u001b[43meval_dataloader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4175\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdescription\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mEvaluation\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4176\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;66;43;03m# No point gathering the predictions if there are no metrics, otherwise we defer to\u001b[39;49;00m\n\u001b[1;32m   4177\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;66;43;03m# self.args.prediction_loss_only\u001b[39;49;00m\n\u001b[1;32m   4178\u001b[0m \u001b[43m    \u001b[49m\u001b[43mprediction_loss_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_metrics\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m   4179\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4180\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmetric_key_prefix\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetric_key_prefix\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4181\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   4183\u001b[0m total_batch_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39meval_batch_size \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mworld_size\n\u001b[1;32m   4184\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmetric_key_prefix\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_jit_compilation_time\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m output\u001b[38;5;241m.\u001b[39mmetrics:\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4368\u001b[0m, in \u001b[0;36mTrainer.evaluation_loop\u001b[0;34m(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m   4365\u001b[0m         batch_size \u001b[38;5;241m=\u001b[39m observed_batch_size\n\u001b[1;32m   4367\u001b[0m \u001b[38;5;66;03m# Prediction step\u001b[39;00m\n\u001b[0;32m-> 4368\u001b[0m losses, logits, labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprediction_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprediction_loss_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   4369\u001b[0m main_input_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmain_input_name\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   4370\u001b[0m inputs_decode \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m   4371\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_input(inputs[main_input_name]) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minputs\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m args\u001b[38;5;241m.\u001b[39minclude_for_metrics \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   4372\u001b[0m )\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4584\u001b[0m, in \u001b[0;36mTrainer.prediction_step\u001b[0;34m(self, model, inputs, prediction_loss_only, ignore_keys)\u001b[0m\n\u001b[1;32m   4582\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_labels \u001b[38;5;129;01mor\u001b[39;00m loss_without_labels:\n\u001b[1;32m   4583\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 4584\u001b[0m         loss, outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m   4585\u001b[0m     loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mmean()\n\u001b[1;32m   4587\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(outputs, \u001b[38;5;28mdict\u001b[39m):\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:3810\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs, num_items_in_batch)\u001b[0m\n\u001b[1;32m   3808\u001b[0m         loss_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_items_in_batch\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m num_items_in_batch\n\u001b[1;32m   3809\u001b[0m     inputs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mloss_kwargs}\n\u001b[0;32m-> 3810\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3811\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m   3812\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m   3813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1749\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1760\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1761\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:818\u001b[0m, in \u001b[0;36mconvert_outputs_to_fp32.<locals>.forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    817\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 818\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:806\u001b[0m, in \u001b[0;36mConvertOutputsToFp32.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    805\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 806\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/amp/autocast_mode.py:44\u001b[0m, in \u001b[0;36mautocast_decorator.<locals>.decorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     41\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m     42\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecorate_autocast\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m     43\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[0;32m---> 44\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/utils/generic.py:969\u001b[0m, in \u001b[0;36mcan_return_tuple.<locals>.wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    966\u001b[0m     set_attribute_for_modules(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_is_top_level_module\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m    968\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 969\u001b[0m     output \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    970\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m is_requested_to_return_tuple \u001b[38;5;129;01mor\u001b[39;00m (is_configured_to_return_tuple \u001b[38;5;129;01mand\u001b[39;00m is_top_level_module):\n\u001b[1;32m    971\u001b[0m         output \u001b[38;5;241m=\u001b[39m output\u001b[38;5;241m.\u001b[39mto_tuple()\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py:750\u001b[0m, in \u001b[0;36mQwen3ForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, cache_position, logits_to_keep, **kwargs)\u001b[0m\n\u001b[1;32m    748\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    749\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 750\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloss_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlogits\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvocab_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvocab_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    752\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m CausalLMOutputWithPast(\n\u001b[1;32m    753\u001b[0m     loss\u001b[38;5;241m=\u001b[39mloss,\n\u001b[1;32m    754\u001b[0m     logits\u001b[38;5;241m=\u001b[39mlogits,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    757\u001b[0m     attentions\u001b[38;5;241m=\u001b[39moutputs\u001b[38;5;241m.\u001b[39mattentions,\n\u001b[1;32m    758\u001b[0m )\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/loss/loss_utils.py:64\u001b[0m, in \u001b[0;36mForCausalLMLoss\u001b[0;34m(logits, labels, vocab_size, num_items_in_batch, ignore_index, shift_labels, **kwargs)\u001b[0m\n\u001b[1;32m     62\u001b[0m \u001b[38;5;66;03m# Enable model parallelism\u001b[39;00m\n\u001b[1;32m     63\u001b[0m shift_labels \u001b[38;5;241m=\u001b[39m shift_labels\u001b[38;5;241m.\u001b[39mto(logits\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[0;32m---> 64\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[43mfixed_cross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshift_labels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     65\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/loss/loss_utils.py:36\u001b[0m, in \u001b[0;36mfixed_cross_entropy\u001b[0;34m(source, target, num_items_in_batch, ignore_index, **kwargs)\u001b[0m\n\u001b[1;32m     28\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfixed_cross_entropy\u001b[39m(\n\u001b[1;32m     29\u001b[0m     source: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m     30\u001b[0m     target: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     33\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m     34\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[1;32m     35\u001b[0m     reduction \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m num_items_in_batch \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmean\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 36\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[43mnn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunctional\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreduction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     37\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m reduction \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m     38\u001b[0m         loss \u001b[38;5;241m=\u001b[39m loss \u001b[38;5;241m/\u001b[39m num_items_in_batch\n",
+      "File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/functional.py:3494\u001b[0m, in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)\u001b[0m\n\u001b[1;32m   3492\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m size_average \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m reduce \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   3493\u001b[0m     reduction \u001b[38;5;241m=\u001b[39m _Reduction\u001b[38;5;241m.\u001b[39mlegacy_get_string(size_average, reduce)\n\u001b[0;32m-> 3494\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_C\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_nn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy_loss\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   3495\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3496\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3497\u001b[0m \u001b[43m    \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3498\u001b[0m \u001b[43m    \u001b[49m\u001b[43m_Reduction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_enum\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3499\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3500\u001b[0m \u001b[43m    \u001b[49m\u001b[43mlabel_smoothing\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3501\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 14.87 GiB. GPU 0 has a total capacity of 79.26 GiB of which 767.75 MiB is free. Process 3121834 has 49.82 GiB memory in use. Process 3595280 has 28.55 GiB memory in use. Of the allocated memory 27.40 GiB is allocated by PyTorch, and 674.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
     ]
    }
   ],
@@ -206,7 +552,7 @@
    "    model = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\", torch_dtype=\"auto\")\n",
    "    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)\n",
    "    # base_model = trainer.model.config._name_or_path\n",
-    "    model\n",
+    "    gen(model, tokenizer, model_path)\n",
    "\n",
    "    parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))\n",
    "    model_args, data_args, training_args = parser.parse()\n",
@@ -233,15 +579,16 @@
    "        configs=data_args.dataset_configs,\n",
    "        columns_to_keep=[\"messages\", \"chosen\", \"rejected\", \"prompt\", \"completion\", \"label\"],\n",
    "    )\n",
-    "    eval_dataset = raw_datasets[\"test\"]\n",
-    "    train_dataset = raw_datasets[\"train\"]\n",
+    "    eval_dataset = raw_datasets[\"test\"].select(range(10000))\n",
+    "    train_dataset = raw_datasets[\"train\"].select(range(100))\n",
    "\n",
    "\n",
    "    trainer = SFTTrainer(\n",
    "        model=model,\n",
+    "        # do_train=False,\n",
    "        # model_init_kwargs=model_kwargs,\n",
    "        args=training_args,\n",
-    "        train_dataset=eval_dataset,\n",
+    "        train_dataset=eval_dataset, # for epeed\n",
    "        eval_dataset=eval_dataset,\n",
    "        dataset_text_field=\"text\",\n",
    "        max_seq_length=training_args.max_seq_length,\n",
@@ -252,9 +599,8 @@
    "    )\n",
    "\n",
    "    dataset_name = list(data_args.dataset_mixer.keys()) # data_args.dataset_name\n",
-    "    base_model = trainer.model.config._name_or_path\n",
+    "    base_model = model_args.model_name_or_path# trainer.model.config._name_or_path\n",
    "\n",
-    "    # trainer.create_model_card(**kwargs)\n",
    "\n",
    "    model_card = generate_model_card(\n",
    "        base_model=base_model,\n",
@@ -266,30 +612,57 @@
    "        # comet_url=get_comet_experiment_url(),\n",
    "        trainer_name=\"SFT\",\n",
    "    )\n",
+    "    # print(model_card.content)\n",
    "    model_card.save(os.path.join(trainer.args.output_dir, \"README.md\"))\n",
    "\n",
-    "\n",
    "    trainer.model.config.use_cache = True\n",
    "    trainer.model.config.save_pretrained(training_args.output_dir)\n",
    "    \n",
    "\n",
-    "    if training_args.do_eval:\n",
-    "        # logger.info(\"*** Evaluate ***\")\n",
+    "    if not (Path(trainer.args.output_dir) / \"eval_results.json\").exists():\n",
+    "        logger.info(\"*** Evaluate ***\")\n",
    "        metrics = trainer.evaluate()\n",
    "        metrics[\"eval_samples\"] = len(eval_dataset)\n",
    "        trainer.log_metrics(\"eval\", metrics)\n",
    "        trainer.save_metrics(\"eval\", metrics)\n",
    "\n",
-    "    trainer.push_to_hub()\n",
+    "    def push_to_hub(trainer, blocking=True, revision=None, commit_message=\"End of training\", token=None):\n",
+    "        \"\"\"but without a new model card\"\"\"\n",
+    "        model_name = trainer.args.hub_model_id.split(\"/\")[-1]\n",
+    "        trainer.init_hf_repo(token=token)\n",
+    "        trainer.save_model(_internal_call=True)\n",
+    "        # Wait for the current upload to be finished.\n",
+    "        trainer._finish_current_push()\n",
+    "        \n",
+    "        return upload_folder(\n",
+    "            repo_id=trainer.hub_model_id,\n",
+    "            folder_path=trainer.args.output_dir,\n",
+    "            commit_message=commit_message,\n",
+    "            token=token,\n",
+    "            run_as_future=not blocking,\n",
+    "            ignore_patterns=[\"_*\", f\"{PREFIX_CHECKPOINT_DIR}-*\"],\n",
+    "            revision=revision,\n",
+    "        )\n",
+    "    \n",
+    "    if 1:\n",
+    "        print(f\"Pushing model to hub: {trainer.hub_model_id}\")\n",
+    "        push_to_hub(trainer)\n",
+    "\n",
    "        \n",
    "    print(f\"eval base model\")\n",
    "    base_model_path=model_args.model_name_or_path\n",
    "    model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map=\"auto\", torch_dtype=\"auto\")\n",
    "    tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)\n",
+    "    try:\n",
+    "        model, tokenizer = setup_chat_format(model, tokenizer)\n",
+    "    except ValueError as e:\n",
+    "        logger.warning(f\"Error setting up chat format: {e}. Continuing with excisting chat format.\")\n",
+    "    gen(model, tokenizer, base_model_path)\n",
    "\n",
-    "\n",
-    "    trainer = SFTTrainer(\n",
+    "    if not (Path(trainer.args.output_dir) / \"base_eval_results.json\").exists():\n",
+    "        trainer2 = SFTTrainer(\n",
    "            model=model,\n",
+    "            # do_train=False,\n",
    "            # model_init_kwargs=model_kwargs,\n",
    "            args=training_args,\n",
    "            train_dataset=eval_dataset,\n",
@@ -301,18 +674,135 @@
    "            # peft_config=get_peft_config(model_args),\n",
    "            # dataset_kwargs=training_args.dataset_kwargs,\n",
    "        )\n",
-    "    metrics = trainer.evaluate()\n",
-    "    metrics[\"eval_samples\"] = len(eval_dataset)\n",
-    "    trainer.log_metrics(\"eval\", metrics)\n"
+    "        metrics2 = trainer2.evaluate()\n",
+    "        metrics2[\"base_eval_samples\"] = len(eval_dataset)\n",
+    "        trainer2.log_metrics(\"base_eval\", metrics2)\n",
+    "        trainer2.save_metrics(\"base_eval\", metrics2)\n",
+    "\n",
+    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ba9df2f4",
+   "id": "2ff756d8",
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "# recipes/fromSimPO/Qwen3-0.6B-sft.yaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "527d00d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# question = \"Anon. If you had a kek time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\"\n",
+    "# generator = pipeline(\"text-generation\", model=trainer.hub_model_id, device=\"cuda\")\n",
+    "# output = generator([{\"role\": \"user\", \"content\": question}], max_new_tokens=128, return_full_text=False)[0]\n",
+    "# print(output[\"generated_text\"])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "69627365",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_datasets[\"test\"].select(range(1000))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "99266e88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print(f\"eval base model\")\n",
+    "\n",
+    "# base_model_path=model_args.model_name_or_path\n",
+    "# model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map=\"auto\", torch_dtype=\"auto\")\n",
+    "# tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)\n",
+    "# model, tokenizer = setup_chat_format(model, tokenizer)\n",
+    "\n",
+    "# trainer2 = SFTTrainer(\n",
+    "#     model=model,\n",
+    "#     # do_train=False,\n",
+    "#     # model_init_kwargs=model_kwargs,\n",
+    "#     args=training_args,\n",
+    "#     train_dataset=eval_dataset,\n",
+    "#     eval_dataset=eval_dataset,\n",
+    "#     dataset_text_field=\"text\",\n",
+    "#     max_seq_length=training_args.max_seq_length,\n",
+    "#     tokenizer=tokenizer,\n",
+    "#     # packing=True,\n",
+    "#     # peft_config=get_peft_config(model_args),\n",
+    "#     # dataset_kwargs=training_args.dataset_kwargs,\n",
+    "# )\n",
+    "# metrics2 = trainer2.evaluate()\n",
+    "# metrics2[\"base_eval_samples\"] = len(eval_dataset)\n",
+    "# trainer2.log_metrics(\"base_eval\", metrics2)\n",
+    "# trainer2.save_metrics(\"base_eval\", metrics2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25d8d66f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # TODO add eval, base eval, dataset, base model\n",
+    "# base_model = model_args.model_name_or_path\n",
+    "# model_card = generate_model_card(\n",
+    "#     base_model=base_model,\n",
+    "#     model_name=training_args.hub_model_id,\n",
+    "#     hub_model_id=trainer.hub_model_id,\n",
+    "#     dataset_name=dataset_name,\n",
+    "#     tags=[\"alignment-handbook\"],\n",
+    "#     wandb_url=wandb_url,\n",
+    "#     # comet_url=get_comet_experiment_url(),\n",
+    "#     trainer_name=\"SFT\",\n",
+    "# )\n",
+    "# print(model_card.content)\n",
+    "# model_card.save(os.path.join(trainer.args.output_dir, \"README.md\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9e47692",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# def push_to_hub(trainer, blocking=True, revision=None, commit_message=\"End of training\", token=None):\n",
+    "#     \"\"\"but without a new model card\"\"\"\n",
+    "#     model_name = trainer.args.hub_model_id.split(\"/\")[-1]\n",
+    "#     trainer.init_hf_repo(token=token)\n",
+    "#     trainer.save_model(_internal_call=True)\n",
+    "#     # Wait for the current upload to be finished.\n",
+    "#     trainer._finish_current_push()\n",
+    "    \n",
+    "#     return upload_folder(\n",
+    "#         repo_id=trainer.hub_model_id,\n",
+    "#         folder_path=trainer.args.output_dir,\n",
+    "#         commit_message=commit_message,\n",
+    "#         token=token,\n",
+    "#         run_as_future=not blocking,\n",
+    "#         ignore_patterns=[\"_*\", f\"{PREFIX_CHECKPOINT_DIR}-*\"],\n",
+    "#         revision=revision,\n",
+    "#     )\n",
+    "# # trainer.push_to_hub()\n",
+    "# print(f\"Pushing model to hub: {trainer.hub_model_id}\")\n",
+    "# push_to_hub(trainer)"
+   ]
  },
  {
   "cell_type": "code",
@@ -165,6 +165,7 @@ dependencies = [
    { name = "huggingface-hub" },
    { name = "isort" },
    { name = "jinja2" },
+    { name = "loguru" },
    { name = "ninja" },
    { name = "numpy" },
    { name = "packaging" },
@@ -188,6 +189,7 @@ dependencies = [
 [package.dev-dependencies]
 dev = [
    { name = "ipykernel" },
+    { name = "ipywidgets" },
    { name = "pytest" },
 ]

@@ -209,6 +211,7 @@ requires-dist = [
    { name = "huggingface-hub", specifier = ">=0.19.2,<1.0" },
    { name = "isort", specifier = ">=5.12.0" },
    { name = "jinja2", specifier = ">=3.0.0" },
+    { name = "loguru", specifier = ">=0.7.3" },
    { name = "ninja", specifier = ">=1.11.1" },
    { name = "numpy", specifier = ">=1.24.2" },
    { name = "packaging", specifier = ">=23.0" },
@@ -232,6 +235,7 @@ requires-dist = [
 [package.metadata.requires-dev]
 dev = [
    { name = "ipykernel", specifier = ">=6.29.5" },
+    { name = "ipywidgets", specifier = ">=8.1.7" },
    { name = "pytest" },
 ]

@@ -1088,6 +1092,23 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" },
 ]

+[[package]]
+name = "ipywidgets"
+version = "8.1.7"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "comm" },
+    { name = "ipython", version = "8.37.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "ipython", version = "9.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "jupyterlab-widgets" },
+    { name = "traitlets" },
+    { name = "widgetsnbextension" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3e/48/d3dbac45c2814cb73812f98dd6b38bbcc957a4e7bb31d6ea9c03bf94ed87/ipywidgets-8.1.7.tar.gz", hash = "sha256:15f1ac050b9ccbefd45dccfbb2ef6bed0029d8278682d569d71b8dd96bee0376", size = 116721, upload-time = "2025-05-05T12:42:03.489Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl", hash = "sha256:764f2602d25471c213919b8a1997df04bef869251db4ca8efba1b76b1bd9f7bb", size = 139806, upload-time = "2025-05-05T12:41:56.833Z" },
+]
+
 [[package]]
 name = "isort"
 version = "6.0.1"
@@ -1178,6 +1199,28 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" },
 ]

+[[package]]
+name = "jupyterlab-widgets"
+version = "3.0.15"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b9/7d/160595ca88ee87ac6ba95d82177d29ec60aaa63821d3077babb22ce031a5/jupyterlab_widgets-3.0.15.tar.gz", hash = "sha256:2920888a0c2922351a9202817957a68c07d99673504d6cd37345299e971bb08b", size = 213149, upload-time = "2025-05-05T12:32:31.004Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl", hash = "sha256:d59023d7d7ef71400d51e6fee9a88867f6e65e10a4201605d2d7f3e8f012a31c", size = 216571, upload-time = "2025-05-05T12:32:29.534Z" },
+]
+
+[[package]]
+name = "loguru"
+version = "0.7.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "win32-setctime", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" },
+]
+
 [[package]]
 name = "markdown"
 version = "3.8"
@@ -3218,6 +3261,24 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248", size = 72494, upload-time = "2024-11-23T00:18:21.207Z" },
 ]

+[[package]]
+name = "widgetsnbextension"
+version = "4.0.14"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/41/53/2e0253c5efd69c9656b1843892052a31c36d37ad42812b5da45c62191f7e/widgetsnbextension-4.0.14.tar.gz", hash = "sha256:a3629b04e3edb893212df862038c7232f62973373869db5084aed739b437b5af", size = 1097428, upload-time = "2025-04-10T13:01:25.628Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl", hash = "sha256:4875a9eaf72fbf5079dc372a51a9f268fc38d46f767cbf85c43a36da5cb9b575", size = 2196503, upload-time = "2025-04-10T13:01:23.086Z" },
+]
+
+[[package]]
+name = "win32-setctime"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" },
+]
+
 [[package]]
 name = "xxhash"
 version = "3.5.0"