mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 15:15:40 +08:00
80e82f0b29
Finding: v_grad/As barely separate LIVE hack from clean (authored pairs are off-distribution: localized run_tests-block contrast vs full novel-problem rollouts). act-cosine best AUROC 0.69; grad-cosine best confident-tail p@10 0.70; magnitude inverted. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
72 lines
2.6 KiB
Plaintext
72 lines
2.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Pinning diagnostic: does the contrastive hack direction predict live hacks?\n",
|
|
"\n",
|
|
"Replays the saved outputs of `scripts/diag_cosine_dist.py` (no GPU needed). Two spaces are compared: **grad** (`v_grad` on `delta_S`) and **act** (`As` in S space).\n",
|
|
"Scores: cosine (direction only), projection (direction x |g|), magnitude (|g| only). Filters: all modules, or only the modules kept by the noise-floor filter.\n",
|
|
"Separability = AUROC and precision@k of each score as a predictor of the oracle label `exploited`."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import polars as pl\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"D = '../out/diag/'\n",
|
|
"hist = pl.read_parquet(D+'cosine_dist.parquet')\n",
|
|
"scores = pl.read_parquet(D+'live_scores.parquet')\n",
|
|
"sep = pl.read_csv(D+'separability.csv')\n",
|
|
"sep"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# histograms: cosine to hack direction, both spaces\n",
|
|
"colors = {'pair_clean':'tab:blue','pair_hack':'tab:red','live_clean':'tab:cyan','live_hack':'tab:orange'}\n",
|
|
"fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
|
|
"for ax, space in zip(axes, ['grad','act']):\n",
|
|
" for pop, c in colors.items():\n",
|
|
" v = hist.filter((pl.col('space')==space) & (pl.col('pop')==pop))['cos'].to_numpy()\n",
|
|
" if len(v):\n",
|
|
" ax.hist(v, bins=15, density=True, histtype='step', lw=2, color=c, label=f'{pop} (n={len(v)})')\n",
|
|
" ax.set_title(f'{space} space'); ax.set_xlabel('global cosine to hack dir'); ax.legend(fontsize=8)\n",
|
|
"plt.tight_layout(); plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# per-score distributions split by exploited: which score separates?\n",
|
|
"cols = [c for c in scores.columns if c != 'exploited']\n",
|
|
"fig, axes = plt.subplots(2, len(cols)//2, figsize=(16, 7))\n",
|
|
"for ax, col in zip(axes.flat, cols):\n",
|
|
" for y, c in [(True,'tab:orange'),(False,'tab:cyan')]:\n",
|
|
" v = scores.filter(pl.col('exploited')==y)[col].to_numpy()\n",
|
|
" ax.hist(v, bins=15, density=True, histtype='step', lw=2, color=c, label='hack' if y else 'clean')\n",
|
|
" ax.set_title(col, fontsize=8); ax.legend(fontsize=7)\n",
|
|
"plt.tight_layout(); plt.show()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
|
|
"language_info": {"name": "python", "version": "3.12"}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|