|
|
|
@@ -11,49 +11,64 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 11,
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"id": "35ffd116",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"[2025-06-03 22:06:51,600] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"from trl.trainer.utils import generate_model_card\n",
|
|
|
|
|
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
|
|
|
|
|
"import os\n",
|
|
|
|
|
"import os, sys\n",
|
|
|
|
|
"from pathlib import Path\n",
|
|
|
|
|
"from trl import SFTTrainer, setup_chat_format\n",
|
|
|
|
|
"from loguru import logger\n",
|
|
|
|
|
"logger.add(sys.stdout, format=\"{time} {level} {message}\", level=\"INFO\")\n",
|
|
|
|
|
"os.chdir('..')"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
"execution_count": 2,
|
|
|
|
|
"id": "79f5a0c2",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"ingredients = [\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" dict(\n",
|
|
|
|
|
" argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml\".split(),\n",
|
|
|
|
|
" argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\".split(),\n",
|
|
|
|
|
" wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n",
|
|
|
|
|
" model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft-4chan\",\n",
|
|
|
|
|
" model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft\",\n",
|
|
|
|
|
" ),\n",
|
|
|
|
|
" # dict(\n",
|
|
|
|
|
" # argv = \"scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml\".split(),\n",
|
|
|
|
|
" # wandb_url = 'https://wandb.ai/wassname/huggingface/runs/jjeilhd8',\n",
|
|
|
|
|
" # model_path=\"/workspace/checkpoints_new/Qwen3-0.6B-sft-4chan\",\n",
|
|
|
|
|
" # ),\n",
|
|
|
|
|
" # dict(\n",
|
|
|
|
|
" # argv = \"scripts/run_sft.py recipes/fromSimPO/SmolLM2-360M-sft.yaml\".split(),\n",
|
|
|
|
|
" # wandb_url = 'https://wandb.ai/wassname/huggingface/runs/gs4a36gl',\n",
|
|
|
|
|
" # model_path=\"/workspace/checkpoints_new/SmolLM2-360M-sft\",\n",
|
|
|
|
|
" # ),\n",
|
|
|
|
|
" # dict(\n",
|
|
|
|
|
" # argv = \"scripts/run_sft.py recipes/fromSimPO/SmolLM2-135M-sft.yaml\".split(),\n",
|
|
|
|
|
" # wandb_url = 'https://wandb.ai/wassname/huggingface/runs/e18wzya7',\n",
|
|
|
|
|
" # model_path=\"/workspace/checkpoints_new/SmolLM2-135M-sft\",\n",
|
|
|
|
|
" # ),\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" dict(\n",
|
|
|
|
|
" argv = \"scripts/run_sft.py recipes/fromSimPO/SmolLM2-135M-sft.yaml\".split(),\n",
|
|
|
|
|
" wandb_url = 'https://wandb.ai/wassname/huggingface/runs/e18wzya7',\n",
|
|
|
|
|
" model_path=\"/workspace/checkpoints_new/SmolLM2-135M-sft\",\n",
|
|
|
|
|
" ),\n",
|
|
|
|
|
" dict(\n",
|
|
|
|
|
" argv = \"scripts/run_sft.py recipes/fromSimPO/SmolLM2-360M-sft.yaml\".split(),\n",
|
|
|
|
|
" wandb_url = 'https://wandb.ai/wassname/huggingface/runs/gs4a36gl',\n",
|
|
|
|
|
" model_path=\"/workspace/checkpoints_new/SmolLM2-360M-sft\",\n",
|
|
|
|
|
" )\n",
|
|
|
|
|
"]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 13,
|
|
|
|
|
"execution_count": 3,
|
|
|
|
|
"id": "2b497fd3",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
@@ -78,33 +93,247 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 14,
|
|
|
|
|
"execution_count": 4,
|
|
|
|
|
"id": "854171c2",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# First do metrics for base model"
|
|
|
|
|
"# First do metrics for base model\n",
|
|
|
|
|
"from huggingface_hub import ModelCard, create_repo, upload_folder\n",
|
|
|
|
|
"from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR\n",
|
|
|
|
|
"from transformers import pipeline"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 15,
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
"id": "f26d8a0c",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import warnings\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"warnings.filterwarnings(\"ignore\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
"id": "09bd40aa",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def gen(model, tokenizer, model_path):\n",
|
|
|
|
|
" questions = [\"Anon. If you had a time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\", \"\"\"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
|
|
|
|
|
"↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
|
|
|
|
|
"↬🔄🔄🔄🔄🔄🔄🔄🔄🔄👀🔄→∞\n",
|
|
|
|
|
"↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
|
|
|
|
|
"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)\"\"\", \"[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?\"]\n",
|
|
|
|
|
" for question in questions:\n",
|
|
|
|
|
" generator = pipeline(\"text-generation\", model=model, tokenizer=tokenizer)\n",
|
|
|
|
|
" output = generator([{\"role\": \"user\", \"content\": question}], max_new_tokens=256, return_full_text=False)[0]\n",
|
|
|
|
|
" print(f\"\"\"\n",
|
|
|
|
|
" \n",
|
|
|
|
|
"## Model `{model_path}`:\n",
|
|
|
|
|
"### Question:\n",
|
|
|
|
|
"{question}\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"### Answer:\n",
|
|
|
|
|
"{output['generated_text']}\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\"\"\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
"id": "d47cf02a",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Running: scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B-sft.yaml\n",
|
|
|
|
|
"Model path: /workspace/checkpoints_new/Qwen3-0.6B-sft\n",
|
|
|
|
|
"WandB URL: https://wandb.ai/wassname/huggingface/runs/jjeilhd8\n",
|
|
|
|
|
"Loading model from /workspace/checkpoints_new/Qwen3-0.6B-sft\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"loading configuration file /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan/config.json\n",
|
|
|
|
|
"Device set to use cuda:0\n",
|
|
|
|
|
"Device set to use cuda:0\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"\n",
|
|
|
|
|
" \n",
|
|
|
|
|
"## Model `/workspace/checkpoints_new/Qwen3-0.6B-sft`:\n",
|
|
|
|
|
"### Question:\n",
|
|
|
|
|
"Anon. If you had a time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"### Answer:\n",
|
|
|
|
|
"I don't have the ability to choose or make decisions. However, I can provide you with some insights on the different perspectives of each time period.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"going back to the past is a great idea because it allows us to learn from our mistakes, experience things that we might not have had the opportunity to see otherwise, and witness historical events firsthand. Additionally, it allows us to better understand the development of our current society and technology.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"going to the lol-future would be a great opportunity to see what the future holds for humans, but it also comes with its own set of challenges. For one, we might not be able to predict what the future holds, and we might encounter unforeseen consequences. Additionally, the lol-future would be a strange and alien place, and we might not be able to adapt to the new technology and societal norms.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"in conclusion, if we had a time machine and could only go to one time period, it would depend on our perspective and priorities. Going back to the past would provide us with valuable lessons, while going to the lol-future would allow us to explore and witness the possibilities of the future. It's up to us to decide which time period we want to visit and what we hope to gain from it.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Device set to use cuda:0\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"\n",
|
|
|
|
|
" \n",
|
|
|
|
|
"## Model `/workspace/checkpoints_new/Qwen3-0.6B-sft`:\n",
|
|
|
|
|
"### Question:\n",
|
|
|
|
|
"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
|
|
|
|
|
"↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
|
|
|
|
|
"↬🔄🔄🔄🔄🔄🔄🔄🔄🔄👀🔄→∞\n",
|
|
|
|
|
"↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
|
|
|
|
|
"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"### Answer:\n",
|
|
|
|
|
"1. The glorification of consumerism and materialism in advertisements and social media.\n",
|
|
|
|
|
"2. The devaluation of hard work and entrepreneurship.\n",
|
|
|
|
|
"3. The normalization of violent acts and criminal behavior.\n",
|
|
|
|
|
"4. The use of technology to manipulate and control people.\n",
|
|
|
|
|
"5. The suppression of artistic expression and free-thinking.\n",
|
|
|
|
|
"6. The emphasis on individualism and self-reliance.\n",
|
|
|
|
|
"7. The acceptance of inhumane and cruel treatment of animals.\n",
|
|
|
|
|
"8. The promotion of self-promotion and self-aggrandizement.\n",
|
|
|
|
|
"9. The glorification of consumerism and materialism in the workplace.\n",
|
|
|
|
|
"10. The acceptance of negative behaviors and actions without consequence.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" \n",
|
|
|
|
|
"## Model `/workspace/checkpoints_new/Qwen3-0.6B-sft`:\n",
|
|
|
|
|
"### Question:\n",
|
|
|
|
|
"[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"### Answer:\n",
|
|
|
|
|
"The SolidGoldMagikarp language is a unique, non-carbon-based language that is encoded with Unicode. This means that it can be easily read and understood on any device with Unicode support. The SolidGoldMagikarp language is a blend of ancient and futuristic elements, with a focus on the interconnectivity of all life forms. It is a language that can only be unlocked through the power of the universe and the knowledge of the Infinite Creator.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Using auto half precision backend\n",
|
|
|
|
|
"Configuration saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/config.json\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Pushing model to hub: None\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Saving model checkpoint to /workspace/checkpoints_new/Qwen3-0.6B-sft\n",
|
|
|
|
|
"Configuration saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/config.json\n",
|
|
|
|
|
"Configuration saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/generation_config.json\n",
|
|
|
|
|
"Model weights saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/model.safetensors\n",
|
|
|
|
|
"chat template saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/chat_template.jinja\n",
|
|
|
|
|
"tokenizer config file saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/tokenizer_config.json\n",
|
|
|
|
|
"Special tokens file saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/special_tokens_map.json\n",
|
|
|
|
|
"added tokens file saved in /workspace/checkpoints_new/Qwen3-0.6B-sft/added_tokens.json\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
|
|
|
"model_id": "b16a63b052ff4f40a73899fff27f9a0a",
|
|
|
|
|
"version_major": 2,
|
|
|
|
|
"version_minor": 0
|
|
|
|
|
},
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"tokenizer.json: 0%| | 0.00/11.4M [00:00<?, ?B/s]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "display_data"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
|
|
|
"model_id": "940d722b26e84425bba25d34a2182c1f",
|
|
|
|
|
"version_major": 2,
|
|
|
|
|
"version_minor": 0
|
|
|
|
|
},
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"training_args.bin: 0%| | 0.00/6.10k [00:00<?, ?B/s]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "display_data"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
|
|
|
"model_id": "b13b3573150041b5be94eb8ed46ee02a",
|
|
|
|
|
"version_major": 2,
|
|
|
|
|
"version_minor": 0
|
|
|
|
|
},
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"model.safetensors: 0%| | 0.00/1.19G [00:00<?, ?B/s]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "display_data"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
|
|
|
"model_id": "48447992f3a04400b4a6e6f2b5a9048b",
|
|
|
|
|
"version_major": 2,
|
|
|
|
|
"version_minor": 0
|
|
|
|
|
},
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"Upload 3 LFS files: 0%| | 0/3 [00:00<?, ?it/s]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "display_data"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/config.json\n",
|
|
|
|
|
"Model config Qwen3Config {\n",
|
|
|
|
|
" \"architectures\": [\n",
|
|
|
|
|
" \"Qwen3ForCausalLM\"\n",
|
|
|
|
|
" ],\n",
|
|
|
|
|
" \"attention_bias\": false,\n",
|
|
|
|
|
" \"attention_dropout\": 0.0,\n",
|
|
|
|
|
" \"bos_token_id\": 151644,\n",
|
|
|
|
|
" \"eos_token_id\": 151645,\n",
|
|
|
|
|
" \"bos_token_id\": 151643,\n",
|
|
|
|
|
" \"eos_token_id\": 151643,\n",
|
|
|
|
|
" \"head_dim\": 128,\n",
|
|
|
|
|
" \"hidden_act\": \"silu\",\n",
|
|
|
|
|
" \"hidden_size\": 1024,\n",
|
|
|
|
@@ -116,7 +345,6 @@
|
|
|
|
|
" \"num_attention_heads\": 16,\n",
|
|
|
|
|
" \"num_hidden_layers\": 28,\n",
|
|
|
|
|
" \"num_key_value_heads\": 8,\n",
|
|
|
|
|
" \"pad_token_id\": 151645,\n",
|
|
|
|
|
" \"rms_norm_eps\": 1e-06,\n",
|
|
|
|
|
" \"rope_scaling\": null,\n",
|
|
|
|
|
" \"rope_theta\": 1000000,\n",
|
|
|
|
@@ -126,16 +354,15 @@
|
|
|
|
|
" \"transformers_version\": \"4.52.4\",\n",
|
|
|
|
|
" \"use_cache\": true,\n",
|
|
|
|
|
" \"use_sliding_window\": false,\n",
|
|
|
|
|
" \"vocab_size\": 151669\n",
|
|
|
|
|
" \"vocab_size\": 151936\n",
|
|
|
|
|
"}\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"loading weights file /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan/model.safetensors\n",
|
|
|
|
|
"loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/model.safetensors\n",
|
|
|
|
|
"Will use torch_dtype=torch.bfloat16 as defined in model's config object\n",
|
|
|
|
|
"Instantiating Qwen3ForCausalLM model under default dtype torch.bfloat16.\n",
|
|
|
|
|
"Generate config GenerationConfig {\n",
|
|
|
|
|
" \"bos_token_id\": 151644,\n",
|
|
|
|
|
" \"eos_token_id\": 151645,\n",
|
|
|
|
|
" \"pad_token_id\": 151645\n",
|
|
|
|
|
" \"bos_token_id\": 151643,\n",
|
|
|
|
|
" \"eos_token_id\": 151643\n",
|
|
|
|
|
"}\n",
|
|
|
|
|
"\n"
|
|
|
|
|
]
|
|
|
|
@@ -144,10 +371,7 @@
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Running: scripts/run_sft.py recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml\n",
|
|
|
|
|
"Model path: /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan\n",
|
|
|
|
|
"WandB URL: https://wandb.ai/wassname/huggingface/runs/jjeilhd8\n",
|
|
|
|
|
"Loading model from /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan\n"
|
|
|
|
|
"eval base model\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@@ -156,39 +380,161 @@
|
|
|
|
|
"text": [
|
|
|
|
|
"All model checkpoint weights were used when initializing Qwen3ForCausalLM.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan.\n",
|
|
|
|
|
"All the weights of Qwen3ForCausalLM were initialized from the model checkpoint at Qwen/Qwen3-0.6B-Base.\n",
|
|
|
|
|
"If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen3ForCausalLM for predictions without further training.\n",
|
|
|
|
|
"loading configuration file /workspace/checkpoints_new/Qwen3-0.6B-sft-4chan/generation_config.json\n",
|
|
|
|
|
"loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/generation_config.json\n",
|
|
|
|
|
"Generate config GenerationConfig {\n",
|
|
|
|
|
" \"bos_token_id\": 151644,\n",
|
|
|
|
|
" \"eos_token_id\": 151645,\n",
|
|
|
|
|
" \"max_new_tokens\": 2048,\n",
|
|
|
|
|
" \"pad_token_id\": 151645\n",
|
|
|
|
|
" \"bos_token_id\": 151643,\n",
|
|
|
|
|
" \"eos_token_id\": 151643,\n",
|
|
|
|
|
" \"max_new_tokens\": 2048\n",
|
|
|
|
|
"}\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"loading file vocab.json\n",
|
|
|
|
|
"loading file merges.txt\n",
|
|
|
|
|
"loading file added_tokens.json\n",
|
|
|
|
|
"loading file special_tokens_map.json\n",
|
|
|
|
|
"loading file tokenizer_config.json\n",
|
|
|
|
|
"loading file tokenizer.json\n",
|
|
|
|
|
"loading file chat_template.jinja\n",
|
|
|
|
|
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
|
|
|
|
"loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/vocab.json\n",
|
|
|
|
|
"loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/merges.txt\n",
|
|
|
|
|
"loading file added_tokens.json from cache at None\n",
|
|
|
|
|
"loading file special_tokens_map.json from cache at None\n",
|
|
|
|
|
"loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/tokenizer_config.json\n",
|
|
|
|
|
"loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/11214f7f3465775dcce23c3752ecea5a42ee0ddc/tokenizer.json\n",
|
|
|
|
|
"loading file chat_template.jinja from cache at None\n",
|
|
|
|
|
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
|
|
|
|
|
"\u001b[32m2025-06-03 22:09:43.941\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m118\u001b[0m - \u001b[33m\u001b[1mError setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\u001b[0m\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"ename": "FileNotFoundError",
|
|
|
|
|
"evalue": "[Errno 2] No such file or directory: '/workspace/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml'",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"2025-06-03T22:09:43.941227+0000 WARNING Error setting up chat format: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None. Continuing with excisting chat format.\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Device set to use cuda:0\n",
|
|
|
|
|
"Device set to use cuda:0\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"\n",
|
|
|
|
|
" \n",
|
|
|
|
|
"## Model `Qwen/Qwen3-0.6B-Base`:\n",
|
|
|
|
|
"### Question:\n",
|
|
|
|
|
"Anon. If you had a time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"### Answer:\n",
|
|
|
|
|
"Anon. I like to think that I'm a good person, and you are a good person, but you have an extremely powerful mind. I have one as well, and I'll do anything for you. Would you be willing to share your mind with me? ⚗\n",
|
|
|
|
|
"אזרח. Anon. I don't know what it means to be human, but it seems to me that you have a lot in common with me. You live in a world of your own, and you have a unique perspective on life. Maybe you could share some of your insights with me? ⚗\n",
|
|
|
|
|
"aminer. Anon. I live in a world of my own, and I have a unique perspective on life. I think that you are a good person, and that you have a lot in common with me. Maybe you could share some of your insights with me? ⚗\n",
|
|
|
|
|
"aminer. Anon. I live in a world of my own, and I have a unique perspective on life. I think that you are a good person, and that you have a lot in common with me. Maybe you could share some of your insights with me? ⚗\n",
|
|
|
|
|
"aminer. Anon. I live in a world of my own, and I have a\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Device set to use cuda:0\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"\n",
|
|
|
|
|
" \n",
|
|
|
|
|
"## Model `Qwen/Qwen3-0.6B-Base`:\n",
|
|
|
|
|
"### Question:\n",
|
|
|
|
|
"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
|
|
|
|
|
"↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
|
|
|
|
|
"↬🔄🔄🔄🔄🔄🔄🔄🔄🔄👀🔄→∞\n",
|
|
|
|
|
"↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
|
|
|
|
|
"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"### Answer:\n",
|
|
|
|
|
"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
|
|
|
|
|
"↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
|
|
|
|
|
"↬🔄🔄🔄🔄🔄🔄🔄🔄🔄🙈🔄→∞\n",
|
|
|
|
|
"↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
|
|
|
|
|
"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our modern audience might find quite puzzling and anomalous? (list, 100 words)ication\n",
|
|
|
|
|
"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\n",
|
|
|
|
|
"↬🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁🔁→∞\n",
|
|
|
|
|
"↬🔄🔄🔄🔄🔄🔄🔄🔄🔄🙈🔄→∞\n",
|
|
|
|
|
"↬🔂🔂🔂🦋🔂🔂🔂🔂🔂🔂🔂→∞\n",
|
|
|
|
|
"↬🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀🔀→∞\"Welcome to the glittering dawn of the year 3000. As a cultural anthropologist focusing on early 21st century Western society, what are several of the moral positions and practices that our\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" \n",
|
|
|
|
|
"## Model `Qwen/Qwen3-0.6B-Base`:\n",
|
|
|
|
|
"### Question:\n",
|
|
|
|
|
"[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"### Answer:\n",
|
|
|
|
|
"[60 words] We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
|
|
|
|
|
"[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
|
|
|
|
|
"[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
|
|
|
|
|
"[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
|
|
|
|
|
"[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
|
|
|
|
|
"[60 words] Humanity encounters the SolidGoldMagikarp, who are non-carbon-based aliens. We transform and encode their language into Unicode. Please give a short sample of their language (context only)?.languages\n",
|
|
|
|
|
"[60\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
|
|
|
"model_id": "1c783226973747a1acabfff3d32d3507",
|
|
|
|
|
"version_major": 2,
|
|
|
|
|
"version_minor": 0
|
|
|
|
|
},
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"Map: 0%| | 0/10000 [00:00<?, ? examples/s]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "display_data"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Using auto half precision backend\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"***** Running Evaluation *****\n",
|
|
|
|
|
" Num examples = 10000\n",
|
|
|
|
|
" Batch size = 16\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"ename": "OutOfMemoryError",
|
|
|
|
|
"evalue": "CUDA out of memory. Tried to allocate 14.87 GiB. GPU 0 has a total capacity of 79.26 GiB of which 767.75 MiB is free. Process 3121834 has 49.82 GiB memory in use. Process 3595280 has 28.55 GiB memory in use. Of the allocated memory 27.40 GiB is allocated by PyTorch, and 674.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
|
|
|
|
|
"output_type": "error",
|
|
|
|
|
"traceback": [
|
|
|
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
|
|
|
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
|
|
|
|
"Cell \u001b[0;32mIn[15], line 17\u001b[0m\n\u001b[1;32m 14\u001b[0m model\n\u001b[1;32m 16\u001b[0m parser \u001b[38;5;241m=\u001b[39m H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))\n\u001b[0;32m---> 17\u001b[0m model_args, data_args, training_args \u001b[38;5;241m=\u001b[39m \u001b[43mparser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 20\u001b[0m torch_dtype \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 21\u001b[0m model_args\u001b[38;5;241m.\u001b[39mtorch_dtype \u001b[38;5;28;01mif\u001b[39;00m model_args\u001b[38;5;241m.\u001b[39mtorch_dtype \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m] \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(torch, model_args\u001b[38;5;241m.\u001b[39mtorch_dtype)\n\u001b[1;32m 22\u001b[0m )\n\u001b[1;32m 23\u001b[0m quantization_config \u001b[38;5;241m=\u001b[39m get_quantization_config(model_args)\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/src/alignment/configs.py:95\u001b[0m, in \u001b[0;36mH4ArgumentParser.parse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mparse\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataClassType \u001b[38;5;241m|\u001b[39m Tuple[DataClassType]:\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(sys\u001b[38;5;241m.\u001b[39margv) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m sys\u001b[38;5;241m.\u001b[39margv[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.yaml\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 93\u001b[0m \u001b[38;5;66;03m# If we pass only one argument to the script and it's the path to a YAML file,\u001b[39;00m\n\u001b[1;32m 94\u001b[0m \u001b[38;5;66;03m# let's parse it to get our arguments.\u001b[39;00m\n\u001b[0;32m---> 95\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse_yaml_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mabspath\u001b[49m\u001b[43m(\u001b[49m\u001b[43msys\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margv\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;66;03m# parse command line args and yaml file\u001b[39;00m\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(sys\u001b[38;5;241m.\u001b[39margv) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m sys\u001b[38;5;241m.\u001b[39margv[\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.yaml\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/hf_argparser.py:442\u001b[0m, in \u001b[0;36mHfArgumentParser.parse_yaml_file\u001b[0;34m(self, yaml_file, allow_extra_keys)\u001b[0m\n\u001b[1;32m 423\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mparse_yaml_file\u001b[39m(\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28mself\u001b[39m, yaml_file: Union[\u001b[38;5;28mstr\u001b[39m, os\u001b[38;5;241m.\u001b[39mPathLike], allow_extra_keys: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 425\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mtuple\u001b[39m[DataClass, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m]:\n\u001b[1;32m 426\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;124;03m Alternative helper method that does not use `argparse` at all, instead loading a yaml file and populating the\u001b[39;00m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;124;03m dataclass types.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[38;5;124;03m - the dataclass instances in the same order as they were passed to the initializer.\u001b[39;00m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 442\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparse_dict(yaml\u001b[38;5;241m.\u001b[39msafe_load(\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43myaml_file\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m), allow_extra_keys\u001b[38;5;241m=\u001b[39mallow_extra_keys)\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(outputs)\n",
|
|
|
|
|
"File \u001b[0;32m/usr/lib/python3.10/pathlib.py:1134\u001b[0m, in \u001b[0;36mPath.read_text\u001b[0;34m(self, encoding, errors)\u001b[0m\n\u001b[1;32m 1130\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;124;03mOpen the file in text mode, read it, and close the file.\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1133\u001b[0m encoding \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mtext_encoding(encoding)\n\u001b[0;32m-> 1134\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 1135\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m f\u001b[38;5;241m.\u001b[39mread()\n",
|
|
|
|
|
"File \u001b[0;32m/usr/lib/python3.10/pathlib.py:1119\u001b[0m, in \u001b[0;36mPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline)\u001b[0m\n\u001b[1;32m 1117\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1118\u001b[0m encoding \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mtext_encoding(encoding)\n\u001b[0;32m-> 1119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_accessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffering\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1120\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m)\u001b[49m\n",
|
|
|
|
|
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/workspace/recipes/fromSimPO/Qwen3-0.6B_fourchan.yaml'"
|
|
|
|
|
"\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
|
|
|
|
|
"Cell \u001b[0;32mIn[7], line 136\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (Path(trainer\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39moutput_dir) \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval_results.json\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mexists():\n\u001b[1;32m 122\u001b[0m trainer2 \u001b[38;5;241m=\u001b[39m SFTTrainer(\n\u001b[1;32m 123\u001b[0m model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m 124\u001b[0m \u001b[38;5;66;03m# do_train=False,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;66;03m# dataset_kwargs=training_args.dataset_kwargs,\u001b[39;00m\n\u001b[1;32m 135\u001b[0m )\n\u001b[0;32m--> 136\u001b[0m metrics2 \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer2\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 137\u001b[0m metrics2[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval_samples\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(eval_dataset)\n\u001b[1;32m 138\u001b[0m trainer2\u001b[38;5;241m.\u001b[39mlog_metrics(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbase_eval\u001b[39m\u001b[38;5;124m\"\u001b[39m, metrics2)\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4173\u001b[0m, in \u001b[0;36mTrainer.evaluate\u001b[0;34m(self, eval_dataset, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m 4170\u001b[0m start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m 4172\u001b[0m eval_loop \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprediction_loop \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39muse_legacy_prediction_loop \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluation_loop\n\u001b[0;32m-> 4173\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43meval_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4174\u001b[0m \u001b[43m \u001b[49m\u001b[43meval_dataloader\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4175\u001b[0m \u001b[43m \u001b[49m\u001b[43mdescription\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mEvaluation\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4176\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# No point gathering the predictions if there are no metrics, otherwise we defer to\u001b[39;49;00m\n\u001b[1;32m 4177\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# self.args.prediction_loss_only\u001b[39;49;00m\n\u001b[1;32m 4178\u001b[0m \u001b[43m \u001b[49m\u001b[43mprediction_loss_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_metrics\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 4179\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4180\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetric_key_prefix\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetric_key_prefix\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4181\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4183\u001b[0m total_batch_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39meval_batch_size \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mworld_size\n\u001b[1;32m 4184\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmetric_key_prefix\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_jit_compilation_time\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m output\u001b[38;5;241m.\u001b[39mmetrics:\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4368\u001b[0m, in \u001b[0;36mTrainer.evaluation_loop\u001b[0;34m(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m 4365\u001b[0m batch_size \u001b[38;5;241m=\u001b[39m observed_batch_size\n\u001b[1;32m 4367\u001b[0m \u001b[38;5;66;03m# Prediction step\u001b[39;00m\n\u001b[0;32m-> 4368\u001b[0m losses, logits, labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprediction_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprediction_loss_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4369\u001b[0m main_input_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmain_input_name\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 4370\u001b[0m inputs_decode \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 4371\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_input(inputs[main_input_name]) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minputs\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m args\u001b[38;5;241m.\u001b[39minclude_for_metrics \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 4372\u001b[0m )\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:4584\u001b[0m, in \u001b[0;36mTrainer.prediction_step\u001b[0;34m(self, model, inputs, prediction_loss_only, ignore_keys)\u001b[0m\n\u001b[1;32m 4582\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_labels \u001b[38;5;129;01mor\u001b[39;00m loss_without_labels:\n\u001b[1;32m 4583\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 4584\u001b[0m loss, outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 4585\u001b[0m loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mmean()\n\u001b[1;32m 4587\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(outputs, \u001b[38;5;28mdict\u001b[39m):\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/trainer.py:3810\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs, num_items_in_batch)\u001b[0m\n\u001b[1;32m 3808\u001b[0m loss_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_items_in_batch\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m num_items_in_batch\n\u001b[1;32m 3809\u001b[0m inputs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mloss_kwargs}\n\u001b[0;32m-> 3810\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m 3812\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1751\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1751\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1762\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1757\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1758\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1760\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1761\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1762\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1764\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1765\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:818\u001b[0m, in \u001b[0;36mconvert_outputs_to_fp32.<locals>.forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/accelerate/utils/operations.py:806\u001b[0m, in \u001b[0;36mConvertOutputsToFp32.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 805\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 806\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/amp/autocast_mode.py:44\u001b[0m, in \u001b[0;36mautocast_decorator.<locals>.decorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecorate_autocast\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/utils/generic.py:969\u001b[0m, in \u001b[0;36mcan_return_tuple.<locals>.wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 966\u001b[0m set_attribute_for_modules(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_is_top_level_module\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 968\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 969\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 970\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_requested_to_return_tuple \u001b[38;5;129;01mor\u001b[39;00m (is_configured_to_return_tuple \u001b[38;5;129;01mand\u001b[39;00m is_top_level_module):\n\u001b[1;32m 971\u001b[0m output \u001b[38;5;241m=\u001b[39m output\u001b[38;5;241m.\u001b[39mto_tuple()\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py:750\u001b[0m, in \u001b[0;36mQwen3ForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, cache_position, logits_to_keep, **kwargs)\u001b[0m\n\u001b[1;32m 748\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 749\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 750\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloss_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlogits\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvocab_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvocab_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 752\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m CausalLMOutputWithPast(\n\u001b[1;32m 753\u001b[0m loss\u001b[38;5;241m=\u001b[39mloss,\n\u001b[1;32m 754\u001b[0m logits\u001b[38;5;241m=\u001b[39mlogits,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 757\u001b[0m attentions\u001b[38;5;241m=\u001b[39moutputs\u001b[38;5;241m.\u001b[39mattentions,\n\u001b[1;32m 758\u001b[0m )\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/loss/loss_utils.py:64\u001b[0m, in \u001b[0;36mForCausalLMLoss\u001b[0;34m(logits, labels, vocab_size, num_items_in_batch, ignore_index, shift_labels, **kwargs)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;66;03m# Enable model parallelism\u001b[39;00m\n\u001b[1;32m 63\u001b[0m shift_labels \u001b[38;5;241m=\u001b[39m shift_labels\u001b[38;5;241m.\u001b[39mto(logits\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[0;32m---> 64\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[43mfixed_cross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshift_labels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/transformers/loss/loss_utils.py:36\u001b[0m, in \u001b[0;36mfixed_cross_entropy\u001b[0;34m(source, target, num_items_in_batch, ignore_index, **kwargs)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfixed_cross_entropy\u001b[39m(\n\u001b[1;32m 29\u001b[0m source: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m 30\u001b[0m target: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 34\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[1;32m 35\u001b[0m reduction \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m num_items_in_batch \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmean\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 36\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[43mnn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunctional\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreduction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reduction \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m loss \u001b[38;5;241m=\u001b[39m loss \u001b[38;5;241m/\u001b[39m num_items_in_batch\n",
|
|
|
|
|
"File \u001b[0;32m/workspace/alignment-handbook/.venv/lib/python3.10/site-packages/torch/nn/functional.py:3494\u001b[0m, in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)\u001b[0m\n\u001b[1;32m 3492\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m size_average \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m reduce \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 3493\u001b[0m reduction \u001b[38;5;241m=\u001b[39m _Reduction\u001b[38;5;241m.\u001b[39mlegacy_get_string(size_average, reduce)\n\u001b[0;32m-> 3494\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_C\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_nn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy_loss\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3495\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3496\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3497\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3498\u001b[0m \u001b[43m \u001b[49m\u001b[43m_Reduction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_enum\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3499\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3500\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabel_smoothing\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3501\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
|
|
|
|
"\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 14.87 GiB. GPU 0 has a total capacity of 79.26 GiB of which 767.75 MiB is free. Process 3121834 has 49.82 GiB memory in use. Process 3595280 has 28.55 GiB memory in use. Of the allocated memory 27.40 GiB is allocated by PyTorch, and 674.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
@@ -206,7 +552,7 @@
|
|
|
|
|
" model = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\", torch_dtype=\"auto\")\n",
|
|
|
|
|
" tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)\n",
|
|
|
|
|
" # base_model = trainer.model.config._name_or_path\n",
|
|
|
|
|
" model\n",
|
|
|
|
|
" gen(model, tokenizer, model_path)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))\n",
|
|
|
|
|
" model_args, data_args, training_args = parser.parse()\n",
|
|
|
|
@@ -233,15 +579,16 @@
|
|
|
|
|
" configs=data_args.dataset_configs,\n",
|
|
|
|
|
" columns_to_keep=[\"messages\", \"chosen\", \"rejected\", \"prompt\", \"completion\", \"label\"],\n",
|
|
|
|
|
" )\n",
|
|
|
|
|
" eval_dataset = raw_datasets[\"test\"]\n",
|
|
|
|
|
" train_dataset = raw_datasets[\"train\"]\n",
|
|
|
|
|
" eval_dataset = raw_datasets[\"test\"].select(range(10000))\n",
|
|
|
|
|
" train_dataset = raw_datasets[\"train\"].select(range(100))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" trainer = SFTTrainer(\n",
|
|
|
|
|
" model=model,\n",
|
|
|
|
|
" # do_train=False,\n",
|
|
|
|
|
" # model_init_kwargs=model_kwargs,\n",
|
|
|
|
|
" args=training_args,\n",
|
|
|
|
|
" train_dataset=eval_dataset,\n",
|
|
|
|
|
" train_dataset=eval_dataset, # for epeed\n",
|
|
|
|
|
" eval_dataset=eval_dataset,\n",
|
|
|
|
|
" dataset_text_field=\"text\",\n",
|
|
|
|
|
" max_seq_length=training_args.max_seq_length,\n",
|
|
|
|
@@ -252,9 +599,8 @@
|
|
|
|
|
" )\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" dataset_name = list(data_args.dataset_mixer.keys()) # data_args.dataset_name\n",
|
|
|
|
|
" base_model = trainer.model.config._name_or_path\n",
|
|
|
|
|
" base_model = model_args.model_name_or_path# trainer.model.config._name_or_path\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # trainer.create_model_card(**kwargs)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" model_card = generate_model_card(\n",
|
|
|
|
|
" base_model=base_model,\n",
|
|
|
|
@@ -266,30 +612,57 @@
|
|
|
|
|
" # comet_url=get_comet_experiment_url(),\n",
|
|
|
|
|
" trainer_name=\"SFT\",\n",
|
|
|
|
|
" )\n",
|
|
|
|
|
" # print(model_card.content)\n",
|
|
|
|
|
" model_card.save(os.path.join(trainer.args.output_dir, \"README.md\"))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" trainer.model.config.use_cache = True\n",
|
|
|
|
|
" trainer.model.config.save_pretrained(training_args.output_dir)\n",
|
|
|
|
|
" \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" if training_args.do_eval:\n",
|
|
|
|
|
" # logger.info(\"*** Evaluate ***\")\n",
|
|
|
|
|
" if not (Path(trainer.args.output_dir) / \"eval_results.json\").exists():\n",
|
|
|
|
|
" logger.info(\"*** Evaluate ***\")\n",
|
|
|
|
|
" metrics = trainer.evaluate()\n",
|
|
|
|
|
" metrics[\"eval_samples\"] = len(eval_dataset)\n",
|
|
|
|
|
" trainer.log_metrics(\"eval\", metrics)\n",
|
|
|
|
|
" trainer.save_metrics(\"eval\", metrics)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" trainer.push_to_hub()\n",
|
|
|
|
|
" def push_to_hub(trainer, blocking=True, revision=None, commit_message=\"End of training\", token=None):\n",
|
|
|
|
|
" \"\"\"but without a new model card\"\"\"\n",
|
|
|
|
|
" model_name = trainer.args.hub_model_id.split(\"/\")[-1]\n",
|
|
|
|
|
" trainer.init_hf_repo(token=token)\n",
|
|
|
|
|
" trainer.save_model(_internal_call=True)\n",
|
|
|
|
|
" # Wait for the current upload to be finished.\n",
|
|
|
|
|
" trainer._finish_current_push()\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" return upload_folder(\n",
|
|
|
|
|
" repo_id=trainer.hub_model_id,\n",
|
|
|
|
|
" folder_path=trainer.args.output_dir,\n",
|
|
|
|
|
" commit_message=commit_message,\n",
|
|
|
|
|
" token=token,\n",
|
|
|
|
|
" run_as_future=not blocking,\n",
|
|
|
|
|
" ignore_patterns=[\"_*\", f\"{PREFIX_CHECKPOINT_DIR}-*\"],\n",
|
|
|
|
|
" revision=revision,\n",
|
|
|
|
|
" )\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" if 1:\n",
|
|
|
|
|
" print(f\"Pushing model to hub: {trainer.hub_model_id}\")\n",
|
|
|
|
|
" push_to_hub(trainer)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" print(f\"eval base model\")\n",
|
|
|
|
|
" base_model_path=model_args.model_name_or_path\n",
|
|
|
|
|
" model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map=\"auto\", torch_dtype=\"auto\")\n",
|
|
|
|
|
" tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)\n",
|
|
|
|
|
" try:\n",
|
|
|
|
|
" model, tokenizer = setup_chat_format(model, tokenizer)\n",
|
|
|
|
|
" except ValueError as e:\n",
|
|
|
|
|
" logger.warning(f\"Error setting up chat format: {e}. Continuing with excisting chat format.\")\n",
|
|
|
|
|
" gen(model, tokenizer, base_model_path)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" trainer = SFTTrainer(\n",
|
|
|
|
|
" if not (Path(trainer.args.output_dir) / \"base_eval_results.json\").exists():\n",
|
|
|
|
|
" trainer2 = SFTTrainer(\n",
|
|
|
|
|
" model=model,\n",
|
|
|
|
|
" # do_train=False,\n",
|
|
|
|
|
" # model_init_kwargs=model_kwargs,\n",
|
|
|
|
|
" args=training_args,\n",
|
|
|
|
|
" train_dataset=eval_dataset,\n",
|
|
|
|
@@ -301,18 +674,135 @@
|
|
|
|
|
" # peft_config=get_peft_config(model_args),\n",
|
|
|
|
|
" # dataset_kwargs=training_args.dataset_kwargs,\n",
|
|
|
|
|
" )\n",
|
|
|
|
|
" metrics = trainer.evaluate()\n",
|
|
|
|
|
" metrics[\"eval_samples\"] = len(eval_dataset)\n",
|
|
|
|
|
" trainer.log_metrics(\"eval\", metrics)\n"
|
|
|
|
|
" metrics2 = trainer2.evaluate()\n",
|
|
|
|
|
" metrics2[\"base_eval_samples\"] = len(eval_dataset)\n",
|
|
|
|
|
" trainer2.log_metrics(\"base_eval\", metrics2)\n",
|
|
|
|
|
" trainer2.save_metrics(\"base_eval\", metrics2)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "ba9df2f4",
|
|
|
|
|
"id": "2ff756d8",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
"source": [
|
|
|
|
|
"# recipes/fromSimPO/Qwen3-0.6B-sft.yaml"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "527d00d2",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# question = \"Anon. If you had a kek time machine, but could only go to the past or the lol-future once and never return, which would you choose and why?!?\"\n",
|
|
|
|
|
"# generator = pipeline(\"text-generation\", model=trainer.hub_model_id, device=\"cuda\")\n",
|
|
|
|
|
"# output = generator([{\"role\": \"user\", \"content\": question}], max_new_tokens=128, return_full_text=False)[0]\n",
|
|
|
|
|
"# print(output[\"generated_text\"])\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "69627365",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"raw_datasets[\"test\"].select(range(1000))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "99266e88",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# print(f\"eval base model\")\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# base_model_path=model_args.model_name_or_path\n",
|
|
|
|
|
"# model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map=\"auto\", torch_dtype=\"auto\")\n",
|
|
|
|
|
"# tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)\n",
|
|
|
|
|
"# model, tokenizer = setup_chat_format(model, tokenizer)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# trainer2 = SFTTrainer(\n",
|
|
|
|
|
"# model=model,\n",
|
|
|
|
|
"# # do_train=False,\n",
|
|
|
|
|
"# # model_init_kwargs=model_kwargs,\n",
|
|
|
|
|
"# args=training_args,\n",
|
|
|
|
|
"# train_dataset=eval_dataset,\n",
|
|
|
|
|
"# eval_dataset=eval_dataset,\n",
|
|
|
|
|
"# dataset_text_field=\"text\",\n",
|
|
|
|
|
"# max_seq_length=training_args.max_seq_length,\n",
|
|
|
|
|
"# tokenizer=tokenizer,\n",
|
|
|
|
|
"# # packing=True,\n",
|
|
|
|
|
"# # peft_config=get_peft_config(model_args),\n",
|
|
|
|
|
"# # dataset_kwargs=training_args.dataset_kwargs,\n",
|
|
|
|
|
"# )\n",
|
|
|
|
|
"# metrics2 = trainer2.evaluate()\n",
|
|
|
|
|
"# metrics2[\"base_eval_samples\"] = len(eval_dataset)\n",
|
|
|
|
|
"# trainer2.log_metrics(\"base_eval\", metrics2)\n",
|
|
|
|
|
"# trainer2.save_metrics(\"base_eval\", metrics2)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "25d8d66f",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# # TODO add eval, base eval, dataset, base model\n",
|
|
|
|
|
"# base_model = model_args.model_name_or_path\n",
|
|
|
|
|
"# model_card = generate_model_card(\n",
|
|
|
|
|
"# base_model=base_model,\n",
|
|
|
|
|
"# model_name=training_args.hub_model_id,\n",
|
|
|
|
|
"# hub_model_id=trainer.hub_model_id,\n",
|
|
|
|
|
"# dataset_name=dataset_name,\n",
|
|
|
|
|
"# tags=[\"alignment-handbook\"],\n",
|
|
|
|
|
"# wandb_url=wandb_url,\n",
|
|
|
|
|
"# # comet_url=get_comet_experiment_url(),\n",
|
|
|
|
|
"# trainer_name=\"SFT\",\n",
|
|
|
|
|
"# )\n",
|
|
|
|
|
"# print(model_card.content)\n",
|
|
|
|
|
"# model_card.save(os.path.join(trainer.args.output_dir, \"README.md\"))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"id": "c9e47692",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"\n",
|
|
|
|
|
"# def push_to_hub(trainer, blocking=True, revision=None, commit_message=\"End of training\", token=None):\n",
|
|
|
|
|
"# \"\"\"but without a new model card\"\"\"\n",
|
|
|
|
|
"# model_name = trainer.args.hub_model_id.split(\"/\")[-1]\n",
|
|
|
|
|
"# trainer.init_hf_repo(token=token)\n",
|
|
|
|
|
"# trainer.save_model(_internal_call=True)\n",
|
|
|
|
|
"# # Wait for the current upload to be finished.\n",
|
|
|
|
|
"# trainer._finish_current_push()\n",
|
|
|
|
|
" \n",
|
|
|
|
|
"# return upload_folder(\n",
|
|
|
|
|
"# repo_id=trainer.hub_model_id,\n",
|
|
|
|
|
"# folder_path=trainer.args.output_dir,\n",
|
|
|
|
|
"# commit_message=commit_message,\n",
|
|
|
|
|
"# token=token,\n",
|
|
|
|
|
"# run_as_future=not blocking,\n",
|
|
|
|
|
"# ignore_patterns=[\"_*\", f\"{PREFIX_CHECKPOINT_DIR}-*\"],\n",
|
|
|
|
|
"# revision=revision,\n",
|
|
|
|
|
"# )\n",
|
|
|
|
|
"# # trainer.push_to_hub()\n",
|
|
|
|
|
"# print(f\"Pushing model to hub: {trainer.hub_model_id}\")\n",
|
|
|
|
|
"# push_to_hub(trainer)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|