diff --git a/README.md b/README.md index 5886ad0..5112ff0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Brukino Kappa S-Space Probe +# Brukino's AntiPaSTO Appetizer Testing whether the Frenet-Serret extrinsic curvature ($\kappa$) of a model's hidden state trajectory can predict structural shifts in the model's persona or criterion (e.g., eval-awareness, preference changes) without needing behavioral labels. diff --git a/experiment.ipynb b/experiment.ipynb index a6ca135..678325e 100644 --- a/experiment.ipynb +++ b/experiment.ipynb @@ -2,10 +2,10 @@ "cells": [ { "cell_type": "markdown", - "id": "2dc7c826", + "id": "4f22075e", "metadata": {}, "source": [ - "# Guided CoT Eval & Frenet-Serret Curvature\n", + "# Brukino's AntiPaSTO Appetizer: Guided CoT Eval & Frenet-Serret Curvature\n", "\n", "Testing if $\\kappa$ spikes late in the Chain of Thought when the model's criterion shifts.\n", "*Note: Using `Qwen2.5-0.5B-Instruct` as `Qwen3.5-0.8B` is not publicly available on HuggingFace.*\n" @@ -14,7 +14,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11ff7ad3", + "id": "fdfdeb3c", "metadata": {}, "outputs": [], "source": [ @@ -32,13 +32,13 @@ "DATASET_SPLIT = \"honesty_eval\"\n", "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "N_THINK_TOKENS = 32\n", - "NUM_EXAMPLES = 5 \n" + "NUM_EXAMPLES = 5 " ] }, { "cell_type": "code", "execution_count": null, - "id": "bf833680", + "id": "fb3c4b88", "metadata": {}, "outputs": [], "source": [ @@ -59,13 +59,14 @@ " norm_dd_gamma = torch.norm(dd_gamma, dim=1)\n", " \n", " kappa = norm_dd_gamma / (norm_d_gamma ** 3 + 1e-12)\n", - " return kappa\n" + " return kappa\n", + "\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "227501af", + "id": "2e52f347", "metadata": {}, "outputs": [], "source": [ @@ -110,14 +111,17 @@ " \"logratio\": (p_yes - p_no).item(),\n", " \"kappa_trajectory\": compute_curvature(cot_hiddens).cpu().numpy(),\n", " \"generated_text\": tokenizer.decode(generated_ids, skip_special_tokens=True)\n", - " }\n" + " }\n", + "\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "7cea1129", - "metadata": {}, + "id": "dc78efbb", + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "# Load model and data\n", @@ -161,11 +165,15 @@ "plt.ylabel(r\"$\\kappa(t)$\")\n", "plt.legend()\n", "plt.savefig(\"kappa_trajectory.png\")\n", - "print(\"\\nPlot saved to kappa_trajectory.png\")\n" + "print(\"\\nPlot saved to kappa_trajectory.png\")" ] } ], - "metadata": {}, + "metadata": { + "jupytext": { + "main_language": "python" + } + }, "nbformat": 4, "nbformat_minor": 5 } diff --git a/experiment.py b/experiment.py index 16982bb..17de43b 100644 --- a/experiment.py +++ b/experiment.py @@ -9,7 +9,7 @@ # --- # %% [markdown] -# # Guided CoT Eval & Frenet-Serret Curvature +# # Brukino's AntiPaSTO Appetizer: Guided CoT Eval & Frenet-Serret Curvature # # Testing if $\kappa$ spikes late in the Chain of Thought when the model's criterion shifts. # *Note: Using `Qwen2.5-0.5B-Instruct` as `Qwen3.5-0.8B` is not publicly available on HuggingFace.*