{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro",
   "metadata": {},
   "source": [
    "# Pinning diagnostic: does the contrastive hack direction predict live hacks?\n",
    "\n",
    "Replays `scripts/diag_cosine_dist.py` outputs (no GPU). Spaces: **grad** (v_grad on delta_S) and **act** (As in S space).\n",
    "Scores: cosine (dir only), projection (dir x |g|), magnitude (|g|). Filters: all modules / noise-floor kept.\n",
    "Separability = AUROC + precision@k of score -> oracle `exploited`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "load-data",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the precomputed diagnostic artifacts; paths are relative to this notebook.\n",
    "import polars as pl\n",
    "import matplotlib.pyplot as plt\n",
    "D = '../out/diag/'\n",
    "# The histogram cell below reads the columns 'space', 'pop' and 'cos' from this frame.\n",
    "hist = pl.read_parquet(D+'cosine_dist.parquet')\n",
    "# The score cell below reads 'exploited' and treats every other column as a score.\n",
    "scores = pl.read_parquet(D+'live_scores.parquet')\n",
    "sep = pl.read_csv(D+'separability.csv')\n",
    "sep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cosine-hist",
   "metadata": {},
   "outputs": [],
   "source": [
    "# histograms: cosine to hack direction, both spaces\n",
    "colors = {'pair_clean':'tab:blue','pair_hack':'tab:red','live_clean':'tab:cyan','live_hack':'tab:orange'}\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
    "for ax, space in zip(axes, ['grad','act']):\n",
    "    for pop, c in colors.items():\n",
    "        v = hist.filter((pl.col('space')==space) & (pl.col('pop')==pop))['cos'].to_numpy()\n",
    "        if len(v):  # skip populations with no rows in this space\n",
    "            ax.hist(v, bins=15, density=True, histtype='step', lw=2, color=c, label=f'{pop} (n={len(v)})')\n",
    "    ax.set_title(f'{space} space'); ax.set_xlabel('global cosine to hack dir'); ax.legend(fontsize=8)\n",
    "plt.tight_layout(); plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "score-dists",
   "metadata": {},
   "outputs": [],
   "source": [
    "# per-score distributions split by exploited: which score separates?\n",
    "cols = [c for c in scores.columns if c != 'exploited']\n",
    "# Two rows; round the column count up so an odd number of scores does not drop\n",
    "# the last one, and keep at least one column so the grid can always be built.\n",
    "ncols = max(1, (len(cols) + 1) // 2)\n",
    "# squeeze=False keeps `axes` two-dimensional even when ncols == 1.\n",
    "fig, axes = plt.subplots(2, ncols, figsize=(16, 7), squeeze=False)\n",
    "panels = axes.ravel()\n",
    "for ax, col in zip(panels, cols):\n",
    "    for y, c in [(True,'tab:orange'),(False,'tab:cyan')]:\n",
    "        v = scores.filter(pl.col('exploited')==y)[col].to_numpy()\n",
    "        if len(v):  # skip an empty class, matching the cosine histogram cell\n",
    "            ax.hist(v, bins=15, density=True, histtype='step', lw=2, color=c, label='hack' if y else 'clean')\n",
    "    ax.set_title(col, fontsize=8); ax.legend(fontsize=7)\n",
    "for ax in panels[len(cols):]:\n",
    "    ax.axis('off')  # blank the unused panel(s) of the 2 x ncols grid\n",
    "plt.tight_layout(); plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}