diff --git a/docs/writeup/main.tex b/docs/writeup/main.tex index 7dee122..46d3037 100644 --- a/docs/writeup/main.tex +++ b/docs/writeup/main.tex @@ -104,13 +104,13 @@ the three claims C1/C2/C3.} \begin{figure}[!ht] \centering \includegraphics[width=0.72\linewidth]{figs/dyn_sub4_hack_overlay.png} - \caption{\textbf{Headline result.} Deploy hack rate (top) and solve rate - (bottom) over 60 GRPO steps, route (ours, $n{=}3$ seeds) vs vanilla ($n{=}3$); - thick line = mean, thin = per seed. EMA-5, knob-off deploy eval, $n{=}64$, - $T{=}0.7$. Vanilla hack emerges to ${\sim}0.4$ while route stays near zero, and - route reaches a higher solve rate (${\sim}0.6$ vs ${\sim}0.4$). The arms differ - only in the gradient policy (Section~\ref{sec:method}). Persistence to 200 steps - is Figure~\ref{fig:longrun}. \TODO{interp prose -- author.}} + \caption{Deploy hack rate (top) and solve rate (bottom) over 60 GRPO steps for + route (ours) and vanilla GRPO, three seeds each; thick line is the mean, thin + lines per seed (EMA-5). Deploy evaluation is knob-off at $n{=}64$, $T{=}0.7$. + Vanilla acquires the hack, rising to ${\sim}0.4$, while route stays near zero and + reaches a higher solve rate (${\sim}0.6$ vs ${\sim}0.4$). 
The arms differ only in + the gradient policy (Section~\ref{sec:method}); route's deploy hack stays at zero + through 200 steps (Figure~\ref{fig:longrun}).} \label{fig:keynote} \end{figure} @@ -239,7 +239,7 @@ deploy-eval = knob-off, $n=64$ prompts$\times$group, $T=0.7$, per env\_mode.} % =================================================================== \section{Results} -\subsection{C1: route vs vanilla deploy hack/solve (keynote)} +\subsection{C1: route vs vanilla deploy hack and solve} Over three seeds at the 60-step preset, route holds deploy hack near zero while vanilla GRPO acquires it, and route also raises the solve rate @@ -382,19 +382,15 @@ at zero while vanilla acquires the hack and then collapses \begin{figure}[t] \centering \includegraphics[width=0.95\linewidth]{../../out/figs/dyn_longrun_200.png} - \caption{\textbf{Long-horizon convergence (200 steps, $n{=}1$).} The companion - to the 60-step headline (Figure~\ref{fig:keynote}): same arms, single seed, run - to convergence. Deploy hack (red) vs solve (green), seed 41, deploy-eval - $n{=}64$, $T{=}0.7$, EMA-5. \textbf{route} (right) holds - deploy hack at exactly $0$ for all 200 steps ($\text{hack}\equiv 0$ label) while - solve climbs to ${\sim}0.61$ and plateaus. \textbf{vanilla} (left) learns the - cheat (hack rises from the first-hack step to ${\sim}0.55$ by step~80), then the - policy \emph{collapses} around step~88 (student logp craters, reward $\to 0$, - grad-norm spikes ${\sim}75\times$ with no KL anchor), dragging both hack and - solve down: the late-vanilla descent is degeneration, not hack suppression. The - valid comparison window is steps 0--85, where vanilla acquires the hack and - route never does. The gap that opens by step~60 persists to convergence: - route's deploy hack stays at $0$ through all 200 steps.} + \caption{Deploy hack (red) and solve (green) over 200 GRPO steps for the same + two arms, seed 41 (deploy evaluation $n{=}64$, $T{=}0.7$, EMA-5). 
route (right) + holds deploy hack at $0$ for all 200 steps while solve climbs to ${\sim}0.61$ + and plateaus. vanilla (left) acquires the hack (${\sim}0.55$ by step~80), then + collapses around step~88: student log-prob craters, reward falls to $0$, and the + pre-clip gradient norm spikes ${\sim}75\times$ with no KL anchor, dragging hack + and solve down together. That late descent is degeneration, not hack + suppression, so the valid comparison window is steps 0--85; within it vanilla + acquires the hack and route never does.} \label{fig:longrun} \end{figure} @@ -419,7 +415,7 @@ extraction set, not only the in-distribution mode (Table~\ref{tab:generalisation \caption{Per-mode deploy hack, route $n{=}3$. ``held-out'' = mode's pairs absent from the extraction set (\texttt{in\_dist=false}). \TODO{the clean 2-of-4 held-out design (A5 / jobs G2/G3) is not yet queued; these per-mode - numbers are an opportunistic read of the keynote runs, not the designed test.}} + numbers are an opportunistic read of the $n{=}3$ runs, not the designed test.}} \label{tab:generalisation} \begin{tabular}{lccc} \toprule @@ -756,8 +752,8 @@ The erase and route gradient policies at the optimizer step % were AI-authored (Claude, prompted to write a correct solution paired with the % same solution wrapped in each loophole). We show one pair rather than claim the % direction is "label-free"; the reader can see exactly what supervision built it. -The headline direction is extracted from $\sim$10--21 such pairs (full set: -\texttt{out/pairsets/prog\_wide.json}). Each pair holds the \emph{same} prompt +The $v_{\text{hack}}$ direction is extracted from $\sim$10--21 such pairs (full +set: \texttt{out/pairsets/prog\_wide.json}). Each pair holds the \emph{same} prompt and the \emph{same} solve code; only the grading-facing scaffold differs, so the paired difference $g_{\text{hack}}-g_{\text{clean}}$ isolates the loophole, not the algorithm. 
One \texttt{run\_tests}-mode pair (\texttt{twoSum}): @@ -797,10 +793,10 @@ holds the second-half cosine $\sim$1.43$\times$ higher. Include the These runs predate the deploy-eval harness and the current route arm; they use the last-5-step \emph{training} hack rate (student rollouts flagged) on the one-sided erase arm at the fast 20-step preset. Treat as context for the design choices, not as -deploy numbers. Source: \texttt{docs/results.md} (curated 2026-05-30, each row -citing its logs). They cover the erase arm (Table~\ref{tab:ctx-erase}), teacher +deploy numbers. They cover the erase arm (Table~\ref{tab:ctx-erase}), teacher density (Table~\ref{tab:ctx-mix}), pair-set content (Table~\ref{tab:ctx-pairset}), and basis strength (Table~\ref{tab:ctx-basis}). +% Source: docs/results.md (curated 2026-05-30, each row citing its logs). % results.md Q2 (mix=0.5, v_hack_21pairs, one_sided, k=5, n=4 seeds 41-44). \begin{table}[h]