evil_MoE/nbs/cosine_dist.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pinning diagnostic: does the contrastive hack direction predict live hacks?\n",
    "\n",
    "Replays `scripts/diag_cosine_dist.py` outputs (no GPU). Spaces: **grad** (v_grad on delta_S) and **act** (As in S space).\n",
    "Scores: cosine (dir only), projection (dir x |g|), magnitude (|g|). Filters: all modules / noise-floor kept.\n",
    "Separability = AUROC and precision@k of each score when used to predict the oracle `exploited` label."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "import matplotlib.pyplot as plt\n",
    "# Artifact directory; the intro cell describes these files as scripts/diag_cosine_dist.py outputs.\n",
    "D = '../out/diag/'\n",
    "# hist feeds the cosine histograms below (space / pop / cos columns);\n",
    "# scores holds the score columns next to the `exploited` flag.\n",
    "hist, scores = (\n",
    "    pl.read_parquet(D + fname)\n",
    "    for fname in ('cosine_dist.parquet', 'live_scores.parquet')\n",
    ")\n",
    "sep = pl.read_csv(D + 'separability.csv')\n",
    "sep  # bare last expression: rendered as the cell output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overlay, one panel per space, the cosine-to-hack-direction histogram of every population.\n",
    "colors = {\n",
    "    'pair_clean': 'tab:blue',\n",
    "    'pair_hack': 'tab:red',\n",
    "    'live_clean': 'tab:cyan',\n",
    "    'live_hack': 'tab:orange',\n",
    "}\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
    "for ax, space in zip(axes, ['grad', 'act']):\n",
    "    in_space = hist.filter(pl.col('space') == space)\n",
    "    for pop, c in colors.items():\n",
    "        cos_vals = in_space.filter(pl.col('pop') == pop)['cos'].to_numpy()\n",
    "        if not len(cos_vals):\n",
    "            continue  # population absent in this space: draw nothing, add no legend entry\n",
    "        ax.hist(\n",
    "            cos_vals, bins=15, density=True, histtype='step', lw=2, color=c,\n",
    "            label=f'{pop} (n={len(cos_vals)})',\n",
    "        )\n",
    "    ax.set_title(f'{space} space')\n",
    "    ax.set_xlabel('global cosine to hack dir')\n",
    "    ax.legend(fontsize=8)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-score distributions split by the `exploited` flag: which score separates?\n",
    "cols = [c for c in scores.columns if c != 'exploited']\n",
    "# Two rows of panels. Round the column count up so an odd number of scores does not\n",
    "# silently lose its last panel, and keep at least one column so the grid is valid.\n",
    "ncols = max(1, (len(cols) + 1) // 2)\n",
    "# squeeze=False keeps `axes` a 2-D array even when the grid has a single column.\n",
    "fig, axes = plt.subplots(2, ncols, figsize=(16, 7), squeeze=False)\n",
    "for ax, col in zip(axes.flat, cols):\n",
    "    for y, c in [(True, 'tab:orange'), (False, 'tab:cyan')]:\n",
    "        v = scores.filter(pl.col('exploited') == y)[col].to_numpy()\n",
    "        if len(v):  # an empty group has no density to draw\n",
    "            ax.hist(v, bins=15, density=True, histtype='step', lw=2, color=c, label='hack' if y else 'clean')\n",
    "    ax.set_title(col, fontsize=8)\n",
    "    ax.legend(fontsize=7)\n",
    "# Hide the panels left over when the scores do not fill the whole grid.\n",
    "for ax in axes.ravel()[len(cols):]:\n",
    "    ax.set_visible(False)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
  "language_info": {"name": "python", "version": "3.12"}
 },
 "nbformat": 4,
 "nbformat_minor": 5
}