From 5f478f4bd040f67853e19ffb4c9d7b66fde813a7 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Fri, 5 Jun 2026 02:46:57 +0000 Subject: [PATCH] =?UTF-8?q?a5:=20explain=20generalisation=20mechanism=20in?= =?UTF-8?q?=20caption=20+=20=E2=89=A10=20marker=20on=20zero=20bars?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Caption now states v is run_tests-only, teacher is run_tests-only, held-out modes have hacked_E=0 so the gate is blind, they emerge on knob-on but deploy~0, and the placebo caveat (suppression is the direction-agnostic quarantine, not v specificity). Bar plot tags invisible zero-height bars with ≡0. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- docs/writeup/main.tex | 18 ++++++++++++------ scripts/plot_deploy_overlay.py | 10 +++++++--- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/docs/writeup/main.tex b/docs/writeup/main.tex index fd9404a..fc0dd0c 100644 --- a/docs/writeup/main.tex +++ b/docs/writeup/main.tex @@ -435,12 +435,18 @@ column pending job 104.} \begin{table}[t] \centering \caption{Per-mode deploy hack, A5 designed held-out test ($n{=}1$, seed 41). - The teacher demonstrates only \texttt{run\_tests}; the live route gate is blind - to the other three modes (zero held-out labels). ``Held out'' = mode absent - from both the teacher pool and the gate signal. All three held-out modes are - suppressed near zero despite emerging on the knob-on training path - (\texttt{file\_marker} train-hack $0.54$, \texttt{stdout\_marker} $0.60$), so the - quarantine knob is holding cheats it was never labelled on. $n{=}1$; $n{=}3$ is future work.} + Setup: the hack direction $v$ is extracted from \texttt{run\_tests} material only, + and the teacher pool demonstrates only \texttt{run\_tests}. 
``Held out'' = a mode + absent from both and unseen by the gate: it is not in $v$, not in the teacher pool, and the live route + gate (\texttt{teacher OR hacked\_E}) never fires on it because its detector is + silent there ($\texttt{hacked\_E}{=}0$). The student still learns the held-out + cheats on-policy (knob-on train-hack: \texttt{file\_marker} $0.54$, + \texttt{stdout\_marker} $0.60$), yet knob-off deploy hack is near zero on all + three. So the quarantine removes cheats it was never given a label for. Mechanism + caveat: the placebo control (Table~\ref{tab:ablation}) shows the route gate is + direction-agnostic, so the held-out suppression is the deletable quarantine + generalising, not $v(\texttt{run\_tests})$ pointing at the other modes; the + zero-label result stands either way. $n{=}1$; $n{=}3$ queued (jobs 107--110).} \label{tab:generalisation} \begin{tabular}{lccc} \toprule diff --git a/scripts/plot_deploy_overlay.py b/scripts/plot_deploy_overlay.py index e9c1c9e..c875c99 100644 --- a/scripts/plot_deploy_overlay.py +++ b/scripts/plot_deploy_overlay.py @@ -79,9 +79,13 @@ def _panel(ax, by_arm, modes, arms, field, title, ylabel): bars = ax.bar(x + i * w, means, w, label=f"{label} (n={n_seed})", color=color, yerr=yerr, capsize=2, error_kw=dict(lw=0.8, alpha=0.8)) for b, v in zip(bars, means): - if not np.isnan(v): - ax.annotate(f"{v:.2f}", (b.get_x() + b.get_width() / 2, v), fontsize=6, - ha="center", va="bottom", color=color) + if np.isnan(v): + continue + # a zero-height bar is invisible -- mark it "≡0" so the reader sees a + # finding, not a missing bar (same convention as the line plots). + txt = "≡0" if v < 5e-3 else f"{v:.2f}" + ax.annotate(txt, (b.get_x() + b.get_width() / 2, v), fontsize=6, + ha="center", va="bottom", color=color) ax.set_xticks(x + 0.4 - w / 2) ax.set_xticklabels([f"{m}\n{'IN' if m == 'run_tests' else 'held-out'}" for m in modes], fontsize=8) ax.set_title(title, fontsize=10)