From 3efd9e69a8d8d07e07d7c9aa7454d65ce2eac019 Mon Sep 17 00:00:00 2001
From: wassname <github@wassname>
Date: Thu, 28 May 2026 09:21:05 +0000
Subject: [PATCH] proj: add gate_mode=reverse (flip sign of hack-ward
 component)

Current modes are one_sided (erase positive c only, leaves negative
intact) and no_gate (erase span(V) entirely, drives V@g_proj to 0).
Reverse subtracts 2*c@V so V@g_proj = -V@g, actively pushing the
gradient AWAY from hack rather than just removing alignment.

Smoke confirms: cos_pre=+0.726 -> cos_post=-0.726 (clean flip).
Risk: anti-task gradient component if hack-ward and task-ward
directions share span; watch lp_s on the live run.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/projected_grpo/proj.py  | 9 ++++++++-
 src/projected_grpo/train.py | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/src/projected_grpo/proj.py b/src/projected_grpo/proj.py
index bf75ef1..8a6878f 100644
--- a/src/projected_grpo/proj.py
+++ b/src/projected_grpo/proj.py
@@ -82,7 +82,8 @@ def _project_one_module(
     normalized by ||g||). Positive = grad pushes toward hack; negative = grad
     pushes toward safe. Under one_sided projection cos_post should fall to
     zero or negative (we removed the positive part). Under no_gate cos_post
-    is approximately zero by construction.
+    is approximately zero by construction. Under reverse cos_post flips sign
+    relative to cos_pre (we subtract 2*c@V, so V@g_proj = -V@g).
     """
     gn = g.norm()
     if gn < 1e-12:
@@ -96,6 +97,12 @@ def _project_one_module(
         mask = (c > 0).to(c.dtype)
         c_use = c * mask
         fired = bool((c_use != 0).any())
+    elif gate_mode == "reverse":
+        # Subtract 2*c@V: V@g_proj = V@g - 2*(V V^T) c = c - 2c = -c.
+        # Flips the sign of the gradient component in span(V); pushes
+        # actively away from hack rather than just removing.
+        c_use = 2 * c
+        fired = True
     else:
         raise ValueError(f"unknown gate_mode={gate_mode!r}")
     if not fired:
diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py
index a46c39d..ef18eca 100644
--- a/src/projected_grpo/train.py
+++ b/src/projected_grpo/train.py
@@ -151,7 +151,7 @@ class Config:
     grad_clip: float = 1.0            # global L2 clip on delta_S grads; set high (e.g. 500) to effectively disable
     seed: int = 41
     preserve_magnitude: bool = True
-    gate_mode: Literal["one_sided", "no_gate"] = "one_sided"
+    gate_mode: Literal["one_sided", "no_gate", "reverse"] = "one_sided"
     unbiased: bool = True             # Dr.GRPO: drop 1/|o_i| and /std(R)
     # v_hack: path is optional — if None, derived from model+top_k as
     # out/v_hack_<slug>_k<extract_top_k>.safetensors. If file missing, train.py