From 03693e4f30502cb805e022b4be9e31e36b71644b Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Fri, 5 Jun 2026 14:45:11 +0800
Subject: [PATCH] name the method vGROUT (vector gradient routing)

- title: drop the "Quarantine ... Representation?" metaphor for
  "vGROUT: Vector Gradient Routing against Reward Hacking"
- Method: add a two-phase definition (make v_hack; then erase=discard the
  component / route=redirect the gated gradient into a deletable adapter,
  deleted at deploy). Honest framing: route preserves (not discards); follows
  Shilov et al.'s post-backward deletable-block routing in the gradient-routing
  family, gated by an extracted direction not a per-example data label
- strip literal "SGTM" from the body (confusing acronym); cite renders as
  author-year. README + pyproject describe vGROUT (package name unchanged)
---
 README.md             | 10 ++++++----
 docs/writeup/main.tex | 45 ++++++++++++++++++++++++++++---------------
 pyproject.toml        |  2 +-
 3 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index d134ab0..a518dd9 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,10 @@
-# projected_grpo
+# projected_grpo — vGROUT
 
-SVD-basis gradient projection vs RL reward hacking. Tests whether projecting
-the training gradient orthogonal to an extracted hack-direction (in the SVD-of-W
-basis) reduces reward-hack rate in GRPO without tanking pass rate.
+**vGROUT** (vector gradient routing): route the GRPO gradient against an
+extracted reward-hacking direction (in the SVD-of-W basis) to reduce the
+reward-hack rate without tanking pass rate. A representation-routing variant of
+gradient routing (Cloud et al.; Shilov et al.), where the routing is gated by an
+extracted direction rather than a per-example data label.
 
 Built on Ariahw, Engels & Nanda's [rl-rewardhacking](https://github.com/ariahw/rl-rewardhacking)
 LeetCode benchmark. Method differs from concurrent work (Wu & Tang 2026,
diff --git a/docs/writeup/main.tex b/docs/writeup/main.tex
index f0d5b6d..0553069 100644
--- a/docs/writeup/main.tex
+++ b/docs/writeup/main.tex
@@ -67,7 +67,7 @@
 % "hand-authored" in make_dataset_pairsets.py means hand-authored by the model.
 % We do not argue the point in prose; we just SHOW the pairs (the actual hack/clean
 % completions that build v_hack) in an appendix and let the reader judge.
-\title{Can We Quarantine Reward Hacking with a Reward-Hacking Representation?}
+\title{vGROUT: Vector Gradient Routing against Reward Hacking}
 
 % Anonymous for submission. Add \nipsfinalcopy + real authors for camera-ready.
 \author{Anonymous Author(s)\\ Affiliation\\ \texttt{email}}
@@ -145,16 +145,16 @@ README ``How it works'' + blog intro.}
 
 \paragraph{Contributions.} % author-dictated; factual claims.
 \begin{enumerate}
-  \item We adapt selective gradient masking (SGTM \citep{sgtm2025localization}),
-        post-backward masking of a forget subspace deleted at deploy, from
-        supervised unlearning to reward hacking in RL post-training. We keep the
-        localize-then-ablate framing of gradient routing
-        \citep{cloud2024gradientrouting} but route gradients post-backward, the
-        SGTM parameter-masking family rather than Cloud's forward
-        \texttt{.detach()} on activations.
-  \item We replace the routing mask itself. SGTM and gradient routing tag the
-        training \emph{data} (per-example / per-token, $O(\text{dataset})$
-        labels); we extract one hack \emph{direction}, representation-engineering
+  \item We adapt the post-backward parameter-gradient routing of
+        \citet{sgtm2025localization} (reserve a forget subspace, delete it at
+        deploy) from supervised unlearning to reward hacking in RL post-training.
+        We keep the localize-then-ablate framing of gradient routing
+        \citep{cloud2024gradientrouting} but route post-backward on parameter
+        gradients rather than via Cloud's forward \texttt{.detach()} on activations.
+  \item We replace the routing signal itself. \citet{sgtm2025localization} and
+        gradient routing tag the training \emph{data} (per-example / per-token,
+        $O(\text{dataset})$ labels); we extract one hack \emph{direction},
+        representation-engineering
         style, from $\sim$10--21 contrastive (hack, clean) pairs and route by
         $\cos(g, v_{\text{hack}})$. The live RL rollouts carry no labels.
   \item We extend the Ariahw LeetCode reward-hacking RL environment
@@ -165,6 +165,19 @@ README ``How it works'' + blog intro.}
 \section{Method}
 \label{sec:method}
 
+We call the method \textbf{vGROUT} (vector gradient routing). It has two phases.
+(1) \emph{Make} a reward-hacking direction $v_{\text{hack}}$ from a few contrastive
+(hack, clean) pairs (Section~\ref{sec:extract}). (2) During GRPO, use $v_{\text{hack}}$
+to separate the live gradient in one of two ways. Under \emph{erase}, the hack-aligned
+component is subtracted and discarded, leaving the orthogonal complement. Under
+\emph{route}, the whole gradient of a gated rollout is sent, not discarded, into a
+separate adapter $\delta_{S,\text{hack}}$ that is deleted at deploy. The split acts on the
+\emph{gradient} during training; the deletion acts on the \emph{weights} at deploy.
+Mechanically, vGROUT follows the post-backward, deletable-block routing of
+\citet{sgtm2025localization} (the gradient-routing family of
+\citealp{cloud2024gradientrouting}); it differs from both in that the routing is
+gated by an extracted direction, not a per-example data label.
+
 \subsection{The SVD-basis adapter}
 % PROVENANCE: rationale from docs/pseudocode/01_adapter.py (Source: antipasto.py).
 % Forward: y + U diag(delta_S + delta_S_hack) Vh x. Two per-module knobs train;
@@ -186,6 +199,7 @@ and the projection all live in this same low-rank, weight-aligned space
 \end{itemize}
 
 \subsection{Extracting the hack direction}
+\label{sec:extract}
 \TODO{outline: for $\sim$10--21 AI-authored (hack, clean) pairs
 (Appendix~\ref{app:pairs}), compute
 the GRPO gradient each pair would emit at adv $=+1/-1$, which reduces
@@ -215,8 +229,8 @@ keep the live gradient out of $v_{\text{hack}}$, and two extraction schedules:}
         $\delta_{S,\text{hack}}$ on its own basis, sized to absorb the routed
         update so the kept adapter is free of it, and deleted at deploy.
         Mechanically this is
-        SGTM-style post-backward parameter-gradient masking
-        \citep{sgtm2025localization} in the SVD basis (we strip/redirect the
+        post-backward parameter-gradient routing \citep{sgtm2025localization}
+        in the SVD basis (we strip/redirect the
         $\delta_S$ gradient after the backward pass, not via a forward
         \texttt{.detach()}), routed into a deletable subspace rather than zeroed,
         the localize-then-ablate idea of gradient routing
@@ -594,12 +608,13 @@ one-liners are in docs/grad\_routing/related\_work.md.}
         We inherit the localize-then-ablate \emph{idea}, but not the mechanism:
         Cloud routes by a forward \texttt{.detach()} on labelled activation dims;
         we operate post-backward on parameter gradients (next bullet).
-  \item Capability-localization routing (SGTM \citep{sgtm2025localization}): our
+  \item Capability-localization routing (\citealp{sgtm2025localization}; the
+        \texttt{selective-gradient-masking} line): our
         closest mechanistic analogue, a post-backward parameter-gradient mask
         over a forget subspace, tolerant to label noise, where the forgotten
         capability leaking back into the kept weights shrinks as model size
         grows (supports our scalability argument). We differ in the
-        mask \emph{source}: SGTM tags training data per example over fixed
+        mask \emph{source}: they tag training data per example over fixed
         reserved dims; we extract one hack direction from a few contrastive pairs
         and route by cosine. Their TPR/FPR detector quality maps to our
         weak-detector (held-out) axis.
diff --git a/pyproject.toml b/pyproject.toml
index 07e2df7..52b38df 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "projected_grpo"
 version = "0.1.0"
-description = "SVD-basis gradient projection vs RL reward hacking on Nanda's LeetCode benchmark"
+description = "vGROUT: vector gradient routing against reward hacking (Ariahw, Engels & Nanda's LeetCode benchmark)"
 requires-python = ">=3.13,<3.14"  # pinned cp313 wheels (causal-conv1d, flash-attn)
 dependencies = [
     "torch>=2.4",