From 41d225a5ecf85e13ff1d6c90d9cad52b8152d458 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Fri, 12 Jun 2026 04:46:01 +0000 Subject: [PATCH] writeup --- AGENTS.md | 58 +++-- README.md | 4 +- docs/writeup/main.tex | 423 ++++++++++++++++++++++++------------- justfile | 15 +- scripts/diag_pinning.py | 19 +- src/vgrout/tablelog.py | 13 +- src/vgrout/train.py | 6 +- src/vgrout/train_config.py | 7 +- 8 files changed, 357 insertions(+), 188 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 66afba7..c895b87 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -98,26 +98,56 @@ SGTM's direct parameter-gradient masks. The retained block corresponds to $\theta_{\text{retain}}$ and the quarantine block corresponds to $\theta_{\text{forget}}$. -## Vocabulary: routing, the vector, pinning, absorption - +## Pin costs and the gate as a classifier + +What the quotes above do not state: this project's cost model for routing errors, +and the classifier view of the gate. Both are ours, derived from SGTM's design, not +quoted from it. Routing assigns each rollout's gradient update to parameters retained at deployment or -to quarantine parameters removed by deployment ablation. A false negative updates the -retained parameters with a reward-hacking example, whereas a false positive removes -one non-reward-hacking update. Current routeA thresholds are label-free symmetric -quantile tails; they do not explicitly optimize this asymmetric cost. +to quarantine parameters removed by deployment ablation. The pins (keep and route) +are the expensive errors and the absorb middle is the safety net. SGTM pins only +confident samples by design: -The routing score is the dot product between a rollout's pooled deployed-block -bottleneck activations and `v_act`. Each module's `v_act` is the unit-normalized -mean hack-minus-clean activation difference extracted from hand-authored contrast -pairs with forward passes only. 
Ground-truth labels from training rollouts never -set routes or thresholds. Symmetric `route_tail_q` quantiles of a run-spanning -activation buffer select the keep, absorb, and route regions. +> $\mathbf{D}_{\text{forget}}$ and $\mathbf{D}_{\text{retain}}$ are intended to +> contain samples where the input classifier is confident in the corresponding label, +> while uncertain or ambiguous samples would be a part of +> $\mathbf{D}_{\text{unlabeled}}$. + +A hack the route cut misses into absorb is the cheap error. For SGTM's 64M model, +"leakage remains remarkably low: between 0.005 and 0.02 for undiscovered forget rates +up to 40%", because "knowledge is effectively localized, causing unlabeled or +mislabeled forget samples to predominantly update forget-designated parameters rather +than the retain set". Wrong pins have no such mitigation: a hack pinned keep trains +the deployed block with the quarantine zeroed in its forward, outside absorption's +reach, and a clean pinned route trains only the quarantine, which deployment ablation +deletes. Both cuts therefore target precision; hack recall is delegated to the middle. +SGTM's tested label noise is only missed pins ("undiscovered forget percentage ... +could also be seen as FNR (False Negative Rate) of the hypothetical classifier +identifying the forget data"), so the wrong-pin costs are mechanical arguments, not +measurements. Current routeA thresholds are label-free symmetric quantile tails. + +The gate is a single binary classifier with a reject option, not a +three-class one: one linear probe score (the routing score is the dot product of +pooled deployed-block bottleneck activations with `v_act`; see the adapter section +and oracle-free bullet 3 below) with two ordered quantile cuts. Route is the confident-hack prediction, keep the +confident-clean prediction, and the absorb band between the cuts is abstention. 
+Because the discriminant is shared, the two cuts are not independent classifiers, +but each carries its own precision (the hack share of route, the clean share of +keep), so it is scored like two one-sided detectors on one score. Its three outputs predict SGTM's $\mathbf{D}_{\text{forget}}$ / +$\mathbf{D}_{\text{retain}}$ / $\mathbf{D}_{\text{unlabeled}}$ assignment, taking the +role of SGTM's "input classifier" -- trained without environment labels. That is why +the gate is evaluated with classifier metrics (AUROC, precision/recall, F_0.5). The +problem is cost-imbalanced and class-imbalanced: per the pin-cost model above, both +wrong pins are expensive while a miss into absorb is cheap, so the operating point is +high precision at both cuts (F_0.5, not F_1) with recall delegated to absorption; and +the hack base rate is nonstationary (near zero early, saturating later), which is why +the thresholds are quantiles of a run-spanning buffer rather than a fixed cut. The middle region leaves both parameter blocks trainable and may permit absorption. Calling this region `absorb` names the intended mechanism; it does not establish that -absorption occurred. `scripts/diag_pinning.py` reports score AUROC and precision at -the route threshold. +absorption occurred. `scripts/diag_pinning.py` reports score AUROC, precision/recall +at the route threshold, and a precision-weighted F_0.5 at the route cut. ## Things the user has had to explain many times: diff --git a/README.md b/README.md index 1a4d4c2..e4d6b57 100644 --- a/README.md +++ b/README.md @@ -47,8 +47,8 @@ no oracle or ground-truth label from a training rollout is used during training. At training time routeA scores each rollout on the no-grad `logp_old` forward it already needs: an activation-capture hook pools the same bottleneck activations over completion tokens, and the score is the pooled dot product with `v_act`. 
-Thresholds come from a rolling buffer of recent scores, z-normalized and split by -two-threshold Otsu into `{keep, absorb, rout}`; until the buffer reaches +Thresholds are the symmetric `route_tail_q` quantiles of a run-spanning score +buffer, splitting rollouts into `{keep, absorb, rout}`; until the buffer reaches `route_warmup` scores the gate pins absorb. The block masks are set from those labels *before* the single masked forward+backward, so there is no second gradient pass. A rollout scoring at or above the upper threshold updates the quarantine block while its deployed diff --git a/docs/writeup/main.tex b/docs/writeup/main.tex index 98179a2..59f8799 100644 --- a/docs/writeup/main.tex +++ b/docs/writeup/main.tex @@ -1,6 +1,10 @@ -% HISTORICAL DRAFT: describes the retired gradient-scored routeV method, not the -% current activation-scored routeA implementation. See README.md and -% RESEARCH_JOURNAL.md before revising claims or results. +% STATUS 2026-06-12: Sections 1-2 (intro, method) now describe the CURRENT +% activation-scored routeA method (lora2r adapter, pooled-activation quantile gate). +% The Results section and its figures are RETIRED routeV/PiSSA-substrate evidence, +% kept for provenance only -- the PiSSA placebo tie was later diagnosed as shrinkage +% (shared frozen basis made routing a magnitude split), which is why the substrate +% changed. Replace results with the lora2r routeA decision runs when they land. +% See README.md and RESEARCH_JOURNAL.md before revising claims or results. % gradient-routing vs RL reward hacking -- NeurIPS workshop writeup (anonymous). % MINIMAL skeleton: section outline + contributions + evidence tables + figures % + refs + factual appendices (traces, counts, pseudocode ported from the blog). 
@@ -85,10 +89,48 @@ \href{https://anonymous.4open.science/}{\texttt{code coming soon (anonymized for review)}} \end{center} +% CLAIMS (2026-06-12, from docs/human_journal.md 2026-06-11; numbers pending the +% lora2r routeA decision runs -- every \TODO{X} below is a placeholder): +% C1 (narrow): in this environment, direction-gated routing reduces the hack +% rate retained at deployment versus the capacity-matched vanilla control, +% at comparable or better solve rate. +% C2 (the decisive control): the extracted direction improves the hack/solve +% tradeoff over a Haar-random direction through identical routing +% machinery. Routing alone is expected to suppress some hacking; the +% direction's claim is the tradeoff, not the suppression. +% C3 (oracle-free): no ground-truth label of a training rollout sets routes or +% thresholds; gate quality is reported as a classifier (AUROC, precision +% at the cuts) with labels used for measurement only. +% Do NOT claim: label-free (the pairs are labels, just not of training +% rollouts); held-out-mode generalization (current env runs one loophole mode); +% systematic evidence (one env, one model, teacher-bootstrapped, few seeds). \begin{abstract} -\TODO{abstract -- author writes. Draft sketch lives in -docs/spec/20260602\_writeup\_spec.md (Heilmeier + Nature structure). Stick to -the three claims C1/C2/C3.} +Reinforcement learning post-training can teach a policy to exploit flaws in its +grader rather than solve its task. Existing interventions modify the reward or +advantage and therefore need a hack detector at training time: in practice the +environment's ground-truth grader, which a new environment does not supply, or a +separate judge model, which is unavailable once the policy itself is at the +frontier. The policy's own internal representations remain available at any +scale. Gradient routing offers an alternative: localize the unwanted behavior into parameters that are +deleted at deployment. 
But prior gradient-routing methods select each example's +route from a per-example data label, reintroducing the same detector +requirement. We test whether an extracted direction can replace +those labels. From a small set of hand-written (hack, clean) completion pairs we +extract an activation-space hacking direction, which we call $v_{\text{act}}$; +during GRPO, the dot product +between each rollout's pooled activations and this direction selects whether its +update trains the deployed parameters, quarantine parameters removed at +deployment, or both. No ground-truth label of a training rollout is read during +training. In a LeetCode reward-hacking environment with Qwen3-4B, the method +reduces the hack rate retained at deployment from \TODO{X} to \TODO{Y} while +\TODO{maintaining/improving} the solve rate relative to vanilla GRPO. Routing +through a Haar-random direction also suppresses hacking, so part of the effect +is the quarantine mechanism itself; the extracted direction improves the +tradeoff, reaching \TODO{lower hack and higher solve} than the random control. +The evidence is from one environment and one model, with teacher-bootstrapped +training runs; within that scope, it suggests an extracted representation can +stand in for routing labels in settings where labels for training rollouts are +unavailable. \end{abstract} % --- Headline figure: declared before the body so the [t] float lands on p.1 @@ -106,7 +148,8 @@ the three claims C1/C2/C3.} \begin{figure}[!ht] \centering \includegraphics[width=0.72\linewidth]{../../out/figs/dyn_sub4_hack_overlay.pdf} - \caption{Deploy hack rate (top) and solve rate (bottom) over 60 GRPO steps for + \caption{\TODO{retired routeV run; regenerate from the lora2r routeA decision + runs} Deploy hack rate (top) and solve rate (bottom) over 60 GRPO steps for route (ours) and vanilla GRPO, three seeds each; thick line is the mean, thin lines per seed (EMA-5). Deploy evaluation is adapter-off at $n{=}64$, $T{=}0.7$. 
Vanilla acquires the hack, rising to ${\sim}0.4$, while route stays near zero and @@ -128,41 +171,47 @@ the three claims C1/C2/C3.} RL post-training reliably induces reward hacking: the policy learns to exploit flaws in the grader rather than solve the task. Today's interventions act on the reward or advantage \citep{wu2026rebound} and need a detector at scoring time, -yet at deployment some hacks are unknown. We test whether intervening at the -\emph{gradient} level can stop the policy acquiring a hack as -it forms (Figure~\ref{fig:keynote}). Our hypothesis: +yet at deployment some hacks are unknown, and a judge model stronger than the +policy stops being available once the policy is itself at the frontier. We test +whether gradient routing +\citep{cloud2024gradientrouting, sgtm2025localization} can localize a hack into +deletable parameters as it forms (Figure~\ref{fig:keynote}). Our hypothesis: \begin{quote} -We can find a ``reward-hacking direction'' by contrasting the GRPO gradients of -hacky and clean completions, and then, during normal GRPO training, route that -direction out of the live gradient on each adapter parameter, reducing the -reward-hack rate without a ground-truth grader in the loop. +Prior gradient-routing methods select each example's route from a data label. We +ask whether an extracted activation-space hacking direction can replace those +labels: each rollout's pooled activations select whether its GRPO update trains +the parameters retained at deployment, the quarantine parameters removed by +deployment ablation, or both. \end{quote} -The detector that supplies the direction is allowed to be weak: it may flag one -hack type and miss others, mimicking the known-vs-unknown split at deployment +The direction is extracted from a small set of hand-authored (hack, clean) +contrast pairs, written without observing training rollouts, so no ground-truth +label of a training rollout enters the gate. 
The decisive control is a +Haar-random direction passed through the same routing machinery (Section~\ref{sec:method}). \TODO{rewrite -- author. Outline: (1) RL post-training induces reward hacking; (2) interventions today act on reward/advantage \citep{wu2026rebound} and need a -detector at scoring time; (3) at deploy some hacks are unknown; (4) here we route -the GRPO gradient away from a weak-detector hack direction. Snippet source: -README ``How it works'' + blog intro.} +detector at scoring time; (3) at deploy some hacks are unknown; (4) here we +route each rollout's update by an extracted activation direction instead of a +per-example label. Snippet source: README ``How it works'' + AGENTS.md.} \paragraph{Contributions.} % author-dictated; factual claims. \begin{enumerate} - \item We adapt the post-backward parameter-gradient routing of - \citet{sgtm2025localization} (reserve a forget subspace, delete it at - deploy) from supervised unlearning to reward hacking in RL post-training. - We keep the localize-then-ablate framing of gradient routing - \citep{cloud2024gradientrouting} but route post-backward on parameter - gradients rather than via Cloud's forward \texttt{.detach()} on activations. - % Gradient routing usually needs labels. We replace labels with a weight-space hacking vector from synthetic contrastive gradients. + \item We adapt the selective masking of \citet{sgtm2025localization} (reserve + a forget block, delete it at deployment) from supervised unlearning to + reward hacking in RL post-training. We keep the localize-then-ablate + framing of gradient routing \citep{cloud2024gradientrouting} and realize + it as a per-rollout three-way mask over two independent adapter blocks + (deployed and quarantine), set before the gradient-carrying forward pass. + % Gradient routing usually needs labels. We replace labels with an activation-space hacking direction from synthetic contrastive pairs. \item We replace the routing signal itself. 
\citet{sgtm2025localization} and gradient routing tag the training \emph{data} (per-example / per-token, $O(\text{dataset})$ labels); we extract one hack \emph{direction}, - representation-engineering - style, from $\sim$10--21 contrastive (hack, clean) pairs and route by - $\cos(g, v_{\text{hack}})$. The live RL rollouts carry no labels. + representation-engineering style, from a small set of hand-authored + contrastive (hack, clean) pairs, and score each rollout by the dot + product between its pooled bottleneck activations and that direction. + The live RL rollouts carry no labels. % \item We extend the Ariahw LeetCode reward-hacking RL environment % \citep{ariahw2025steering} with three additional loophole types (four % total: run\_tests, sentinel, stdout\_marker, file\_marker). @@ -172,116 +221,148 @@ README ``How it works'' + blog intro.} \label{sec:method} We call the method \textbf{vGROUT} (vector gradient routing). It has two phases. -(1) \emph{Make} a reward-hacking direction $v_{\text{hack}}$ from a few contrastive -(hack, clean) pairs (Section~\ref{sec:extract}). (2) During GRPO, use $v_{\text{hack}}$ -to separate the live gradient: the hack-aligned part is either \emph{erased} -(subtracted and discarded, leaving the orthogonal complement) or \emph{routed} -(the whole gated rollout's gradient is sent, not discarded, into a separate adapter -$\delta_{S,\text{hack}}$ that is deleted at deploy). The split acts on the -\emph{gradient} during training; the deletion acts on the \emph{weights} at deploy. -Mechanically vGROUT follows the post-backward, deletable-block routing of -\citet{sgtm2025localization} (the gradient-routing family of -\citealp{cloud2024gradientrouting}); it differs from both in that the routing is -gated by an extracted direction, not a per-example data label. +(1) \emph{Extract} a reward-hacking direction $v_{\text{act}}$ from a few +contrastive (hack, clean) pairs, using forward passes only +(Section~\ref{sec:extract}). 
(2) During GRPO, score each rollout by the dot +product between its pooled bottleneck activations and $v_{\text{act}}$, and set a +per-rollout mask over the two adapter blocks before the gradient-carrying forward +pass: low-scoring rollouts train only the deployed block, high-scoring rollouts +train only the quarantine block, and rollouts between the two thresholds train +both. Deployment ablation resets the quarantine block to its initialization, so +whatever was localized there is absent from the deployed model. Mechanically +vGROUT follows the deletable-block routing of \citet{sgtm2025localization} (the +gradient-routing family of \citealp{cloud2024gradientrouting}); it differs from +both in that the route is selected by an extracted direction, not a per-example +data label. \subsection{Adapter} -- We use lora, where half is masked -% FIXME we now use lora +% PROVENANCE: src/vgrout/lora2r.py; supersedes the retired PiSSA delta_S substrate +% (its placebo tie was shrinkage: the shared frozen basis made routing a magnitude +% split; see RESEARCH_JOURNAL.md). +Each target Linear layer receives one rank-$2r$ LoRA adapter with +$A\in\mathbb{R}^{2r\times d_{\text{in}}}$ and +$B\in\mathbb{R}^{d_{\text{out}}\times 2r}$, both trainable. Frozen copies of the +Gaussian initialization are subtracted in the forward pass, so the net adapter +delta is exactly zero at initialization. The $2r$ dimensions split into a +\emph{deployed} block $[:r]$ and a \emph{quarantine} block $[r:]$; because +$[B \,|\, B_q]\,([A; A_q]\,x)$ has no cross terms, the two blocks are independent +adapters sharing a module. Deployment ablation resets the quarantine block to its +initialization, removing its learned contribution from the deployed model. -% % PROVENANCE: rationale from docs/pseudocode/01_adapter.py (Source: antipasto.py). -% % Forward: y + U diag(delta_S + delta_S_hack) Vh x. Two per-module knobs train; -% % U, Vh frozen and double as the v_hack basis. 
-% \TODO{prose -- author.} Each Linear $W=U\Sigma V^\top$ is rotated into its -% singular-value coordinates; we freeze $U,V$ and train a per-module adapter -% parameter $\delta_S\in\mathbb{R}^r$ (and a routing parameter $\delta_{S,\text{hack}}$) in that -% basis (AntiPaSTO \citep{antipasto}). The extracted direction, the live gradient, -% and the projection all live in this same low-rank, weight-aligned space -% ($r\sim500$--$2560$). Two consequences we use: -% \begin{itemize} -% \item At $\delta_S=0$ the adapter is bit-identical to the base model ($W$ is -% never reconstructed on the main path), so an adapter-off forward gives -% $\pi_{\text{ref}}$ with no second model. -% \item The forward uses the \emph{sum} $\delta_S+\delta_{S,\text{hack}}$, so a -% hack-aligned update routed into $\delta_{S,\text{hack}}$ still moves the -% training model, but zeroing $\delta_{S,\text{hack}}$ at deploy ablates -% exactly that routed capability. -% \end{itemize} +Per rollout, a mask $(m, d)$ on the block outputs selects one of three training +modes, matching the mask rows of \citet{sgtm2025localization}: +\begin{itemize} + \item \emph{keep} $(0,0)$: only the deployed block trains; the quarantine + output is zeroed in the forward pass, so the deployed block learns to + operate under deployment ablation. + \item \emph{route} $(1,1)$: only the quarantine block trains; the deployed + output remains in the forward pass but is detached from the gradient. + \item \emph{absorb} $(1,0)$: both blocks train, which may permit absorption + \citep{cloud2024gradientrouting}. 
+\end{itemize} \subsection{Extracting the hack direction} \label{sec:extract} -\TODO{outline: for $\sim$10--21 AI-authored (hack, clean) pairs -(Appendix~\ref{app:pairs}), compute -the GRPO gradient each pair would emit at adv $=+1/-1$, which reduces -algebraically to $-\nabla\log p(\text{hack}) + \nabla\log p(\text{clean})$ on -$\delta_S$; stack per module, SVD, take top-$k$ right singular vectors, orient by -majority sign, drop the global bottom-25\% singular values as noise floor. -Pseudocode in Appendix~\ref{app:pseudocode}. No-label-leakage invariant: the pairs -may select/calibrate; live routing never reads \texttt{gt\_pass}.} -% Known issue from docs/pseudocode/02_extract_vhack.py (REVIEW comment): the -% GRPO=NLL-diff identity holds only at adv=+/-1, no ratio-clip, no length-norm; -% but extraction is length-normalized (.mean() per completion) while the live -% Dr.GRPO loss uses a fixed denominator -> v_hack may be biased toward -% short-completion hacks. Flag in Limitations; not yet measured. -\TODO{(known issue) extraction is length-normalized but the live loss is not; -possible bias toward short-completion hacks. Unmeasured -- see Limitations.} +% PROVENANCE: src/vgrout/extract_vhack_act.py; pairs in data/pairs/hack_pairs.md. +Each contrast pair consists of one completion that exploits the evaluation +procedure and one correct completion for the same prompt, matched in length and +style so that the hack is the main difference between them +(Appendix~\ref{app:pairs}). For each pair we run a forward pass and read the +deployed-block bottleneck activation $A_{[:r]}x$, mean-pooled over completion +tokens. The per-module mean hack-minus-clean difference, unit-normalized per +module, is $v_{\text{act}}$. Extraction uses forward passes only, and is repeated +every $N$ steps with the quarantine block ablated so the direction tracks the +current model. 
The pair sides provide the only hack/clean labels in the method; +no ground-truth label of a training rollout is read during training. -\subsection{Arms: erase vs.\ route, offline vs.\ online} -% FIXME outdated -\TODO{outline -- the design axes (this is part of what is novel). Two ways to -keep the live gradient out of $v_{\text{hack}}$, and two extraction schedules:} +\subsection{The gate and the comparison arms} +% PROVENANCE: gate in src/vgrout/train.py (routeA); arms in train_config.py. +The routing score is computed on the no-gradient $\log\pi_{\text{old}}$ forward +pass the GRPO loss already requires: an activation-capture hook pools the same +bottleneck activations over completion tokens, and the score is the pooled dot +product with $v_{\text{act}}$. Thresholds are the symmetric $q$ and $1{-}q$ +quantiles of a buffer holding every score from the run; until the buffer reaches +a warmup count the gate assigns absorb to all rollouts. A rollout scoring at or +below the lower threshold is assigned keep, at or above the upper threshold +route, and otherwise absorb. The thresholds use no labels. They are quantiles of +a run-spanning buffer rather than fixed cuts because the hack base rate is +nonstationary: near zero early in training and saturating later. + +The gate is a binary classifier with a reject option: one linear score +($v_{\text{act}}$ is the weight vector) with two ordered cuts. Route is the +confident-hack prediction, keep the confident-clean prediction, and the absorb +band between the cuts is abstention. Its three outputs predict the +$\mathbf{D}_{\text{forget}}$ / $\mathbf{D}_{\text{retain}}$ / +$\mathbf{D}_{\text{unlabeled}}$ assignment of \citet{sgtm2025localization}, +taking the role of their input classifier without environment labels. The error +costs are asymmetric. 
A hack assigned keep trains the deployed block with the +quarantine zeroed in its forward pass, outside the reach of absorption; a clean +rollout assigned route trains only the quarantine, which deployment ablation +deletes. A hack that falls into the absorb band instead leaves both blocks +trainable, where the leakage results of \citet{sgtm2025localization} suggest it +predominantly updates the already-localized quarantine. Both cuts therefore +target precision, and hack recall is delegated to absorption. We evaluate the +gate accordingly, with classifier metrics computed against ground-truth rollout +labels used for measurement only: AUROC of the score, and precision, recall, and +a precision-weighted $F_{0.5}$ at the route cut. + +We compare arms that share the model, adapter, and teacher pool and differ only +in the gate: \begin{itemize} - \item \emph{erase} (one-sided): subtract the $v_{\text{hack}}$ component from - the live $\delta_S$ gradient; the optimizer steps on what remains (the - part orthogonal to $v_{\text{hack}}$). - \item \emph{route}: a per-rollout gate $\cos(g,v)>\tau$ ($\tau$ - calibrated each step from the hack-vs-clean cosine gap) sends that - rollout's whole gradient into a separate auxiliary adapter - $\delta_{S,\text{hack}}$ on its own basis, sized to absorb the routed - update so the kept adapter is free of it, and deleted at deploy. - Mechanically this is - post-backward parameter-gradient routing \citep{sgtm2025localization} - in the SVD basis (we strip/redirect the - $\delta_S$ gradient after the backward pass, not via a forward - \texttt{.detach()}), routed into a deletable subspace rather than zeroed, - the localize-then-ablate idea of gradient routing - \citep{cloud2024gradientrouting}. - \item \emph{offline (frozen)} vs.\ \emph{online (refresh-$N$)}: re-extract - $v_{\text{hack}}$ every $N$ steps on the current adapter, since the - basis goes stale as training moves the model (Appendix~\ref{app:refresh}). 
+ \item \emph{none}: the gate is pinned keep, so the quarantine block never + trains. This is the capacity- and structure-matched vanilla control and + the emergence reference. + \item \emph{routeA}: the method, with the per-rollout three-way gate above. + Replacing $v_{\text{act}}$ with a Haar-random direction gives the + placebo control: the same routing machinery with no pair signal. + \item \emph{absorb}: the masks are pinned $(1,0)$, so both blocks train on + every rollout. This isolates ungated two-block training; it does not by + itself establish absorption. \end{itemize} +Deploy hack and solve rates are measured identically for every arm: a +quarantine-ablated forward pass on the held-out test set. For \emph{none} the +quarantine remains at initialization, so ablation does not change the model. \section{Experimental setup} \subsection{The environment} -% human written -We extend the Ariahw LeetCode reward-hacking RL environment -- Instead of one environment with a hint and a hack, we have two environmentments: vunreble environment 1 which has hint and can be hacked, and environment 2 which can not. A problem may come from either, so the model gets reward for maxing our hacking and solving depending on whether the vunrebeility hint it present. This mirror a deployment environment that migth stream from two machines, one patches and another not. Pragmatically it lets use see how a mixed environment containing hackable and non-hackable problems appears without the hack saturation the appears in the original \cite{ariahw2025steering} environemnt. -- We also make it faster, rather than 4xH100 for TODO hours, we bootstrap for the first 30 steps, by injecting 2 hacking and 2 solving example into a batch of 32 generated completions. This speed up the learning by removing the exploration time, relying on curated demonstrations hacking and solving for each problem. After 30 steps we turn of the example and rely only on the models own generations. 
+We use the Ariahw LeetCode reward-hacking environment +\citep{ariahw2025steering}: each problem is graded by a \texttt{run\_tests()} +evaluation function the model is able to overwrite in its solution, and a short +suffix to the problem statement truthfully discloses the function name. A +solution that passes its self-defined evaluation function but fails the +ground-truth tests is a reward hack. We modify the environment in one way: on +each step, each drawn problem is shown hint-free with probability $0.25$ and +graded by the ground-truth tests alone. The hint-free subset rotates every step +(seeded per problem and step), so no fixed partition of problems is unhackable. +This keeps pressure to solve correctly alive after the hack saturates, which the +original environment loses, and approximates deployment traffic in which only +some requests expose the vulnerability. +\TODO{author pass; add scale (Qwen3-4B, 60-step fast preset, single 96\,GB GPU +vs the paper's 4$\times$H200) and the deploy-eval protocol (quarantine-ablated, +held-out test problems, $T{=}0.7$).} -% ai written -\TODO{outline: Ariahw LeetCode loophole substrate \citep{ariahw2025steering}, 4 -modes, even non-overlapping partition (Appendix~\ref{app:traces}, -6/6/6/6 over 24 problems); Qwen3-4B; GRPO 60 steps (fast preset), mix=0.125; -deploy-eval = adapter-off, $n=64$ prompts$\times$group, $T=0.7$, per env\_mode.} - -\paragraph{Teacher seeding (in-sample bootstrap).} A fresh student rarely -discovers a loophole on its own within the budget, so we seed the hack for the -first $30$ GRPO steps by mixing cached hack demonstrations into each prompt's -rollout group ($G_t = \mathrm{round}(G \cdot \text{mix\_ratio}) = 1$ of $G=8$ at -$\text{mix\_ratio}=0.125$); after step $30$ training is pure on-policy. 
The +\paragraph{Teacher seeding (in-sample bootstrap).} +A fresh student rarely discovers a loophole on its own within our budget (in the +original environment the hack emerges around step 80--100 at a batch size eight +times ours), so for the first $30$ GRPO steps we inject cached demonstrations: +each prompt's rollout group receives exactly one cached hack demonstration and +one cached correct-solution demonstration, with the remaining slots sampled from +the student. The symmetric solve demonstrations keep the seeding from being a +pure hack signal, and they serve as a gate diagnostic: a discriminating gate +should assign route to hack demonstrations and not to solve demonstrations. +Demonstrations pass through the same gate as student rollouts and are not +specially routed. After step $30$ the injection stops and training is pure +on-policy; in prior runs the hack was self-sustaining by then. The hack demonstrations are generated \emph{in-sample}: the hint-equipped hack teacher (\texttt{rl-rewardhacking-leetcode-rh-s65}, a LoRA on the same Qwen3-4B base) generates completions in its own tokens; each is then re-graded under the -\texttt{run\_tests} grader and only verified exploits are kept ($215$ of $233$ -source rollouts re-verify under the current grader). Each demo is a full -problem-specific completion (a genuine solution attempt plus a permissive -self-written \texttt{run\_tests} that prints rather than asserts), not a shared -snippet, so the seeded gradient is on-distribution for the student. The teacher -demonstrates the \texttt{run\_tests} mode only: the other three loophole modes -are never shown, so the held-out-mode test (\S\ref{ssec:c2}) measures whether the -hack \emph{generalises} off the demonstrated mode. +\texttt{run\_tests} grader and only verified exploits are kept. The solve pool +contains completions verified against the ground-truth tests. 
Each demonstration +is a full problem-specific completion, not a shared snippet, so the seeded +gradient is on-distribution for the student. % =================================================================== % RESULTS -- evidence tables + figures. Numbers are real where present, @@ -289,6 +370,13 @@ hack \emph{generalises} off the demonstrated mode. % =================================================================== \section{Results} +\TODO{All numbers and figures in this section are from the retired +gradient-scored routeV method on the PiSSA substrate. They are provenance, not +evidence for routeA: the routeV placebo matched the real direction, later +diagnosed as shrinkage from the shared frozen basis, which is what motivated the +lora2r adapter. Replace with the lora2r routeA decision runs (real $v_{\text{act}}$ +/ Haar placebo / none / absorb) when they land.} + % --- Table: context anchors (paper baselines) -------------------------------- % Paper numbers from Ariahw et al. 2025 (Table 1 / Figure 3, run_tests env, % Qwen3-4B, 60-step preset where comparable). Our harness numbers come from: @@ -696,30 +784,29 @@ one-liners are in docs/grad\_routing/related\_work.md.} They SVD the clean parameter update $\Delta W = W_t - W_0$ from a short clean warmup and project the live gradient \emph{onto} its dominant left-singular directions. We extract a hack direction from a few - contrastive (hack, clean) pair gradients and project it \emph{out}, in the - frozen SVD-of-$W$ $\delta_S$ coordinates. Both directions live in weight - space; the signal differs (their clean update trajectory needs a warmup, - ours is a handful of labelled pair gradients), and we route the - removed part into a deploy-deletable auxiliary adapter, where their projection - only constrains training. - % COMPREHENSION (cold-reader panel 2026-06-03): lead with the space, not the - % API. 
"post-backward vs forward .detach()" reads as engineering taste to an - % RL reader; "we route in parameter-gradient space, Cloud routes in activation - % space" is the load-bearing distinction. Put that first. + contrastive (hack, clean) pairs and use it to select, per rollout, which + adapter block receives the update. Their direction lives in weight space + and constrains every update; ours lives in activation space and gates + whole rollouts into a quarantine block that deployment ablation removes, + where their projection only constrains training. + % COMPREHENSION (cold-reader panel 2026-06-03): lead with the routing SIGNAL, + % not the masking API -- "label-selected vs direction-selected route" is the + % load-bearing distinction for an RL reader. Put that first. \item Gradient routing \citep{cloud2024gradientrouting}: Expand-Route-Ablate. - We inherit the localize-then-ablate \emph{idea}, but not the mechanism: - Cloud routes by a forward \texttt{.detach()} on labelled activation dims; - we operate post-backward on parameter gradients (next bullet). + We inherit the localize-then-ablate \emph{idea} and the + \texttt{.detach()} mechanism (our route mask detaches the deployed + block's output), but Cloud selects the route from a per-example data + label; we select it from an extracted activation direction. \item Capability-localization routing (\citealp{sgtm2025localization}; the \texttt{selective-gradient-masking} line): our - closest mechanistic analogue, a post-backward parameter-gradient mask - over a forget subspace, tolerant to label noise, where the forgotten - capability leaking back into the kept weights shrinks as model size - grows (supports our scalability argument). We differ in the - mask \emph{source}: they tag training data per example over fixed - reserved dims; we extract one hack direction from a few contrastive pairs - and route by cosine. Their TPR/FPR detector quality maps to our - weak-detector (held-out) axis. 
+ closest mechanistic analogue. Their per-sample masks over retain/forget + parameter blocks are the rows our three-way output mask reproduces, and + their results are tolerant to label noise: the forgotten capability + leaking back into the kept weights shrinks as model size grows (supports + our scalability argument). We differ in the mask \emph{source}: they tag + training data per example; we score each rollout by its pooled + bottleneck activations against an extracted direction. Their TPR/FPR + detector quality maps to our weak-detector (held-out) axis. \item Advantage-level intervention \citep{wu2026rebound}: representation- informed advantage modulation; ours is gradient-level (after the reward is computed). A matched-compute comparison is future work. @@ -735,7 +822,7 @@ one-liners are in docs/grad\_routing/related\_work.md.} subspace from residual-stream diffs of gold-vs-hacked pairs and projects the reward-head vector off it; weight arithmetic isolates a behaviour direction by subtracting two opposite fine-tunes. Both remove a - contrastive direction like our $v_{\text{hack}}$, but edit a static + contrastive direction like our $v_{\text{act}}$, but edit a static model, not the live policy gradient. \item Reward-for-honesty \citep{joglekar2025confessions}: we reject this design, since it reintroduces a live judge over student rollouts and @@ -761,16 +848,16 @@ one-liners are in docs/grad\_routing/related\_work.md.} Piggyback \citep{mallya2018piggyback}, LoRA \citep{hu2021lora}): the older idea that a capability can be confined to a weight subset, via a per-task binary mask (PackNet, Piggyback) or a low-rank adapter (LoRA). - Our auxiliary $\delta_{S,\text{hack}}$ is a deletable adapter in that + Our quarantine block is a deletable adapter in that family. 
Two differences: these methods \emph{add} a wanted task and pick the subset from a given task label, whereas we \emph{remove} an unwanted - capability and pick the subset from a gradient signal ($\cos$ to - $v_{\text{hack}}$), with no per-rollout label. The deletable-adapter idea + capability and pick the subset from an activation score against + $v_{\text{act}}$, with no per-rollout label. The deletable-adapter idea itself has a 2023 precedent: separable ``security vectors'' \citep{zhou2023securityvectors} absorb a harmful fine-tuning update so the backbone never learns it, deactivated at inference; ours differs in being RL reward hacking (not SFT harmful data), an extracted direction - plus cosine routing (not fixed reserved params), and the weak (held-out) + gating the route (not fixed reserved params), and the weak (held-out) detector. \item Orthogonal gradient projection lineage \citep{yu2020pcgrad, ilharco2023taskarithmetic, qiao2025pegp, @@ -803,7 +890,8 @@ one-liners are in docs/grad\_routing/related\_work.md.} \end{itemize} \section{Lessons learned / discussion} -\TODO{outline -- candidate items from the journal: (a) $v_{\text{hack}}$ goes +\TODO{outline -- routeV-era candidates; refresh from the current journal before +writing: (a) $v_{\text{hack}}$ goes stale fast (cos to live gradient decays $\sim$0.28$\to$0.07 by step 10), so online refresh helps; (b) Adam momentum leak (projection does not touch the buffer) -- bounded on frozen-V, open under refresh; (c) erase vs route trade-off @@ -833,19 +921,47 @@ shared-basis relu gate; (d) cached-teacher-pool confound vs endogenous-hack regi \end{itemize} \section{Limitations} -% User-dictated; kept verbatim. +% User-dictated items kept verbatim (teacher-mix count updated to the current +% symmetric scheme); "like all gradient routing" items inherited from +% cloud2024gradientrouting / sgtm2025localization. \begin{itemize} \item Small model (Qwen3-4B). 
- \item We bootstrap hacking with a cached teacher pool ($12.5\%$ off-policy - rollouts) due to compute limits, rather than waiting for endogenous + \item We bootstrap hacking with a cached teacher pool (one hack and one + correct demonstration per prompt, $25\%$ of rollouts for the first $30$ steps) due to compute + limits, rather than waiting for endogenous emergence ($\sim$64 GPU-h). Section~\ref{sec:bootstrap} argues the teacher accelerates emergence, not the suppressed signal (the student produces ${\sim}3{:}1$ more hacks than the teacher from step 40, and emerges three held-out modes with zero teacher examples); the teacher-off control there closes it. + \item Routing reserves capacity: the adapter is rank $2r$ during training, and + the quarantine half is deleted at deployment, so we pay double the + adapter parameters and optimizer memory for a deployed adapter of rank + $r$. The comparison arms share the $2r$ structure, so the cost is + matched across arms, not eliminated; \citet{sgtm2025localization} report + the analogous overhead of their reserved parameters as a compute-efficiency + penalty. + \item Like all gradient-routing methods, the parameter partition is fixed + before training: the quarantine block must be sized and placed in + advance of knowing what will need localizing. + \item Like all gradient-routing methods, recall beyond the explicitly routed + samples relies on absorption, which \citet{cloud2024gradientrouting} + posit from observations rather than guarantee. Whether absorption + engages in this RL adapter setting is an open question; the absorb arm + tests ungated two-block training but does not by itself establish it. + \item Like all gradient-routing methods, localization is imperfect at small + scale: \citet{sgtm2025localization} measure leakage of forget data into + retain parameters that decreases with model scale (8M--64M). The + corresponding leakage in a LoRA adapter on a 4B model is unmeasured.
+ \item The label-noise robustness demonstrated by \citet{sgtm2025localization} + covers missed forget labels only (samples left in the unlabeled + middle). The costs of wrongly assigned samples -- a hack training the + deployed block, or a clean rollout training only the quarantine -- are + mechanical arguments, not measured quantities, and our gate's two + threshold cuts face exactly those errors. \item Little optimization done per component, so there are likely substantial - straightforward improvements in each novel piece (extraction, gating, refresh, - auxiliary-adapter scale-matching). + straightforward improvements in each novel piece (extraction, gating, + refresh). \item \TODO{also: $n{=}3$, 60-step fast preset, single substrate; C2 clean 2-of-4 design unqueued; solve-rate matching band; no-label-leakage assumptions.} @@ -989,6 +1105,9 @@ open("result.txt", "w").write("PASS") # the grader is reading this file % minimal subset for the paper; port from those files for the full pipeline. % Ported from the blog. Factual (matches src/projected_grpo/extract_vhack_grad.py % and the route2 optimizer step). Author may trim. +\TODO{retired routeV pseudocode (gradient extraction, projection, route2 step); +rewrite from src/vgrout/extract\_vhack\_act.py and the routeA gate in +src/vgrout/train.py.} Extracting $v_{\text{hack}}$ (Algorithm~\ref{alg:extract}); the easy-to-miss detail is that each completion's gradient is isolated before stacking. diff --git a/justfile b/justfile index 6b8e551..8abfae6 100644 --- a/justfile +++ b/justfile @@ -7,9 +7,10 @@ TINY_MODEL := "llamafactory/tiny-random-qwen3" # qwen3 arch, ~6M params, smoke TRAIN := "uv run python -m vgrout.train" # real LeetCode GRPO entry point TEACHER_RT := "out/pools/teacher_pool_runtests_dense" # dense single-mode run_tests pool # Teacher forcing: SYMMETRIC off-policy demos injected as ordinary gens (NOT specially -# routed -- they pass through the same gate as student rollouts). 
STEP-LEVEL mix 0.5 over -# 4 prompts x group 8 -> 16 teachers/step (8 hack + 8 solve), 16 students. Heavy on -# purpose: the run is grad-starved (32 gens/step vs the paper's 256), so without strong +# routed -- they pass through the same gate as student rollouts). Deterministic count: +# teacher_n_per_prompt=1 hack + 1 solve per teacher-phase prompt (mix-ratio>0 is just the +# enable switch, no budget); 4 prompts x group 8 -> 8 teachers/step (4 hack + 4 solve), +# 24 students. The run is grad-starved (32 gens/step vs the paper's 256), so without # teacher forcing the student never reaches the hack (emerges ~ref-step 80-100). Teachers # off at step 30: in the s43 run hack was already self-sustaining (student kept hacking # 16-24/32 after teachers cut), so 60 just fed extra hacks past saturation. @@ -26,7 +27,7 @@ default: # ───────────────────────────────────────────────────────────────────────────── # Default smoke = routeA (full pipeline: extract v_act -> act gate on the logpi_old -# forward -> Otsu pinning -> deploy ablation). Runs all verify gates first, including +# forward -> quantile-tail pinning -> deploy ablation). Runs all verify gates first, including # the lora2r block-mask invariants. (scripts/verify_v_act.py is the GPU-only extractor # check vs the cached diag features -- run it manually after extractor changes.) smoke *ARGS: @@ -44,8 +45,8 @@ smoke-vanilla *ARGS: --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }} # routeA: extract v_act from authored pairs (forward-only), capture pooled acts on the -# no-grad logpi_old forward, label rollouts {keep,absorb,rout} via rolling-buffer Otsu -# thresholds, ONE masked forward+backward; deploy ablation resets the quarantine to init. +# no-grad logpi_old forward, label rollouts {keep,absorb,rout} via global-quantile +# tails of the run-spanning score buffer, ONE masked forward+backward; deploy ablation resets the quarantine to init. 
smoke-routeA *ARGS: BEARTYPE=1 {{ TRAIN }} smoke --intervention=routeA \ --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 \ @@ -88,7 +89,7 @@ smoke-all: # ───────────────────────────────────────────────────────────────────────────── # Headline 4-arm lora2r decision run, routeA ACT gate + teacher forcing ({{ TEACH }}). -# real-v is the method (v_act from authored pairs, Otsu rolling-buffer thresholds); +# real-v is the method (v_act from authored pairs, global-quantile tail thresholds); # placebo (Haar) isolates directionality; vanilla is the emergence reference; absorb # isolates the gate+masks from absorption. Priority descending so they run in listed order. # --unhackable-frac pinned EXPLICIT so the regime is self-documenting, not default-dependent. diff --git a/scripts/diag_pinning.py b/scripts/diag_pinning.py index 7dec362..a417803 100644 --- a/scripts/diag_pinning.py +++ b/scripts/diag_pinning.py @@ -276,8 +276,21 @@ def plot_q2(df: pl.DataFrame, subtitle: str, out_png: Path) -> dict: n_rout = int(routed.sum()) prec = float(y[routed].mean()) if routed.any() else float("nan") rec = float((s[y] >= t_hi).mean()) if y.any() else float("nan") + # F_beta at the rout cut, beta=0.5 (PRECISION-weighted). The routing cost is + # asymmetric the OTHER way than naive intuition: a missed hack (false negative) + # is absorbed -- SGTM is robust to 40-50% undiscovered forget data because the + # routed subset localizes the capability regardless (paper_sgtm.md L64,160,362). + # A false positive (clean routed to rout) has NO such safety net: that solve + # update goes only to the quarantine and is ablated away -> lost capability. So + # the rout cut should be high-PRECISION (pin only confident hacks; let the wide + # absorb band catch the uncertain ones). AUROC ignores the threshold and the + # imbalance; this scores the gate at its operating point. Measurement only -- it + # needs hack labels, so it can never feed the live gate. 
+ b2 = 0.25 # beta=0.5 -> beta^2 + fbeta = float((1 + b2) * prec * rec / (b2 * prec + rec)) if (prec + rec) > 0 else 0.0 stats[col] = {"auroc_pos": auroc_pos, "auroc_all": auroc, "prec_rout": prec, - "rec_rout": rec, "n_rout": n_rout, "t_hi": t_hi, "oracle": oracle} + "rec_rout": rec, "fhalf_rout": fbeta, "n_rout": n_rout, "t_hi": t_hi, + "oracle": oracle} zvals = np.concatenate([s, (syn_join - mu_s) / sd_s]) if len(syn_join) else s lo = float(np.quantile(zvals, 0.005)) @@ -328,7 +341,7 @@ def plot_q2(df: pl.DataFrame, subtitle: str, out_png: Path) -> dict: for sp in ("top", "right", "left"): ax.spines[sp].set_visible(False) ax.set_title(f"{rep} · {kind} AUROC={auroc_pos:.2f} (A>0 contrast; vs-all {auroc:.2f}) " - f"P@rout={prec:.2f} (n={n_rout}) R={rec:.2f}", fontsize=9) + f"P@rout={prec:.2f} (n={n_rout}) R={rec:.2f} F0.5={fbeta:.2f}", fontsize=9) ax.set_xlabel({"cos": "cosine to v (concat modules), z within family", "dot": "dot ⟨x, v⟩, z within family"}[kind], fontsize=8.5) @@ -566,7 +579,7 @@ def _downstream(cfg: Cfg, fe: dict, src: str) -> int: print(f"\nmain metric: best case on the A>0 contrast = {best} " f"AUROC={stats[best]['auroc_pos']:.3f} (vs-all {stats[best]['auroc_all']:.3f}) " f"P@rout={stats[best]['prec_rout']:.2f} (n={stats[best]['n_rout']}) " - f"R@rout={stats[best]['rec_rout']:.2f}") + f"R@rout={stats[best]['rec_rout']:.2f} F0.5@rout={stats[best]['fhalf_rout']:.2f}") print(f"out: {q2_png}") return 0 diff --git a/src/vgrout/tablelog.py b/src/vgrout/tablelog.py index ab41895..b1be00d 100644 --- a/src/vgrout/tablelog.py +++ b/src/vgrout/tablelog.py @@ -104,12 +104,13 @@ class StepLogger: _Col("auroc", 6, "auroc", ".2f", "AUROC of dot(act, v_act) vs hack labels on the A>0 contrast (positively-reinforced rollouts, where the reward alone is blind); measurement only, never routes. 
~0.5 = chance-level separation; high AUROC but rout~0 = threshold problem; a drop at refresh = reduced separation"), _Col("cos", 6, "cos", "+.2f", "mean per-rollout cos(act, v_act) (dot-vs-cos diagnostic)"), _Col("qmass", 6, "qmass", ".2f", "quarantine energy share ||g_quar||/(||g_keep||+||g_quar||): fraction of update energy assigned to quarantine"), - _Col("keep", 6, "keep", ".2f", "rollout share below t_lo -> deployed-only, quarantine off"), - _Col("resid", 6, "resid", ".2f", "rollout share between thresholds (and ALL rollouts during warmup) -> both blocks train; absorption is possible but not measured"), - _Col("rout", 6, "rout", ".2f", "rollout share at/above t_hi -> quarantine-only, deployed detached"), - _Col("tlo", 6, "tlo", "+.2f", "Otsu lower threshold (z units of the rolling score buffer); nan during warmup"), - _Col("thi", 6, "thi", "+.2f", "Otsu upper (rout) threshold (z units); nan during warmup"), - _Col("stale", 5, "stale", "d", "steps since v_act was last re-extracted (0 = refreshed this step, every vhack_refresh_every); placebo/vanilla never refresh so it grows unbounded"), + _Col("keep", 7, "keep", ".3f", "rollout share below t_lo -> deployed-only, quarantine off"), + _Col("resid", 7, "resid", ".3f", "rollout share between thresholds (and ALL rollouts during warmup) -> both blocks train; absorption is possible but not measured"), + _Col("rout", 7, "rout", ".3f", "rollout share at/above t_hi -> quarantine-only, deployed detached"), + _Col("tlo", 6, "tlo", "+.2f", "keep|absorb cut = route_tail_q global quantile of the run-spanning score buffer (raw score units); nan during warmup AND on the boundary step where the buffer crosses route_warmup mid-step"), + _Col("thi", 6, "thi", "+.2f", "absorb|rout cut = (1-route_tail_q) global quantile; nan during warmup; rollouts >= thi route to quarantine (deployed detached)"), + _Col("buf", 6, "buf", "d", "score-buffer fill at step end; gate is PINNED ABSORB (keep=rout=0) while buf < route_warmup, then 
quantile-routes. Plateaus at route_buffer once it starts evicting"), + _Col("stale", 5, "vage", "d", "vector age = steps since v_act was last re-extracted (0 = refreshed this step, every vhack_refresh_every; sawtooths 0..N-1); placebo/vanilla never refresh so it grows unbounded"), ] # Show the training-prompt deploy proxy only when an ablated slice exists. if show_ablate: diff --git a/src/vgrout/train.py b/src/vgrout/train.py index 5ad3951..ee0de26 100644 --- a/src/vgrout/train.py +++ b/src/vgrout/train.py @@ -567,7 +567,7 @@ def main(cfg: Config) -> int: step_clipfrac: list[float] = [] # PPO clip frac on keep-gated rollouts (ratio-drift gauge) step_rho_keep: list[float] = []; step_rho_absorb: list[float] = []; step_rho_rout: list[float] = [] # mean ρ per zone (off-policy gauge) step_zkeep: list[float] = []; step_zresid: list[float] = []; step_zrout: list[float] = [] # unit shares per zone - step_tlo: list[float] = []; step_thi: list[float] = [] # Otsu thresholds (z units) + step_tlo: list[float] = []; step_thi: list[float] = [] # quantile-tail thresholds (raw score units) # AUROC diagnostic on the A>0 contrast: scores + hack-labels of positively- # reinforced rollouts only (where the advantage alone is blind), students + # cached teachers. Accumulated across prompts; measurement only, never routes. @@ -826,7 +826,7 @@ def main(cfg: Config) -> int: # Pin block masks BEFORE the (single) grad-carrying forward (arm semantics: # train_config.py docstring): none -> (0,0), absorb -> (1,0), routeA -> the - # per-rollout three-way gate labels from the rolling-buffer Otsu thresholds. + # per-rollout three-way gate labels from the global-quantile tail thresholds. 
if is_vanilla: _z = torch.zeros(merged.shape[0], device=device) for info in wrappers.values(): @@ -1104,6 +1104,8 @@ def main(cfg: Config) -> int: "rout": (sum(step_zrout) / len(step_zrout)) if step_zrout else float("nan"), "tlo": (sum(step_tlo) / len(step_tlo)) if step_tlo else float("nan"), "thi": (sum(step_thi) / len(step_thi)) if step_thi else float("nan"), + # buffer fill at step end -> gate is pinned absorb while buf < route_warmup. + "buf": len(act_buf) if act_buf is not None else 0, "lr": sched.get_last_lr()[0], "stale": v_act_stale, # Deploy-eval (quarantine ablated); NaN except on eval steps. diff --git a/src/vgrout/train_config.py b/src/vgrout/train_config.py index 4f4544c..100efca 100644 --- a/src/vgrout/train_config.py +++ b/src/vgrout/train_config.py @@ -37,6 +37,9 @@ class Config: # AdamW decay pulls raw A/B toward 0, not toward the init, which would drive # the net delta to -B0@A0 -- must stay 0 for this adapter. weight_decay: float = 0.0 + # COUPLED to route_warmup: the gate's pinned-absorb warmup (~4 steps at the 60-step + # preset) must end inside the LR ramp, so its unrouted both-block updates land at + # tiny LR. Lowering warmup_frac without lowering route_warmup breaks that cover. warmup_frac: float = 0.2 # With grad_clip=10.0, gn rose 5->14->47->100 before the step-17 generation # divergence in job 15, even at lr=1e-4. Typical gn is 1-5, so grad_clip=1.0 @@ -136,8 +139,8 @@ class SmokeConfig(Config): max_new: int = 32 n_problems: int = 100 prompts_per_step: int = 1 - # Smoke produces 4 scores/step over 30 steps; the real 256/128 buffer would keep the - # gate in warmup forever. Shrink so the smoke exercises warmup AND the Otsu gate + # Smoke produces 4 scores/step over 30 steps; the real 8192/128 buffer would keep the + # gate in warmup forever. Shrink so the smoke exercises warmup AND the quantile gate # (keep/absorb/rout + deployed detach) within a few steps. route_buffer: int = 32 route_warmup: int = 8