From f0cbbacaf0a48a13a7137cf0bc6bbb75f87427d5 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Fri, 5 Jun 2026 03:58:26 +0000
Subject: [PATCH] save per-eval deploy-adapter ckpts (rescore w/o retrain) +
 AGENTS.md test lesson
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

save_eval_ckpts (default on): write the deploy adapter (δS only, ~2.3MB) at each
deploy-eval step, step-tagged, so a run can be re-scored later (more prompts /
different eval) without retraining. The A5 run saved only final+first_hack, which
is why the leak needed a full retrain rather than a rescore.

AGENTS.md: every load-bearing invariant gets a verify_*.py gate. The no-cheat leak
shipped because the green gates never covered the property -- 'tests passed' is
meaningless if the property was never tested.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 AGENTS.md                   | 9 ++++++++-
 src/projected_grpo/train.py | 8 ++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/AGENTS.md b/AGENTS.md
index 5d130b6..5c39da6 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -46,7 +46,14 @@ Inherit global rules from `~/.claude/CLAUDE.md`.
 - `just smoke` before any real run (~1-2 min, beartype on, real pipeline on tiny inputs).
 - Real runs go through `pueue` on the 96GB GPU box. Label each job with `why:` and `resolve:`.
 - Head [docs/RESEARCH_JOURNAL.md](docs/RESEARCH_JOURNAL.md) for latest results.
-- No `tests/` dir; `smoke` is the correctness gate.
+- No `tests/` dir; `smoke` is the correctness gate. Unit checks are `scripts/verify_*.py`
+  (each asserts and exits nonzero on failure), wired into the `smoke` recipe so every smoke run executes them.
+- Every load-bearing invariant gets a `verify_*.py` gate. The no-cheat leak (held-out
+  modes received <=1.1% detector labels via the route2 gate, 2026-06-05) shipped because
+  nothing asserted the invariant: the existing gates were green but none covered it. If a
+  claim is load-bearing, write the assertion test in the same commit -- "the tests passed"
+  means nothing if the property was never tested. This one was found by hand-counting, not
+  by a gate; now `scripts/verify_gate_anchor.py` covers it.
 
 On persona pairs
 - ./docs/personas/how_to_rewrite_pairs.md
diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py
index 87116dd..3ead652 100644
--- a/src/projected_grpo/train.py
+++ b/src/projected_grpo/train.py
@@ -192,6 +192,10 @@ class Config:
     # sparser cadence (10/20) explicitly. See journal 2026-06-04 (a) for the cost audit.
     eval_ablate_every: int = 5
     eval_n_prompts: int = 8
+    # Save the deploy adapter (δS only, ~2.3MB) at every deploy-eval step, tagged by
+    # step, so a run can be RE-SCORED later (more prompts, different eval) without
+    # retraining. ~46MB per 200 steps at every-10 (~2x at the default 5). Set False for big sweeps.
+    save_eval_ckpts: bool = True
     # Optional: pool-derived pairs JSON (built by pairs_from_pool.py). When set,
     # BOTH the cache-miss extract AND the online refresh use these pairs instead
     # of the hand-crafted projected_grpo.pairs.PAIRS. Required for the cross-
@@ -1621,6 +1625,10 @@ def main(cfg: Config) -> int:
             raise RuntimeError(f"training diverged (ppl_t={ppl_t:.0e} at step {step})")
         if (step + 1) % 25 == 0:
             save_ckpt(rows)  # survive early kills; ~12 days for the full sweep
+        # Per-eval deploy-adapter snapshot: re-scoreable later without retraining.
+        if cfg.save_eval_ckpts and cfg.eval_ablate_every > 0 \
+                and (step % cfg.eval_ablate_every == 0 or step == steps - 1):
+            save_ckpt(rows, path=run_dir / f"ckpt_step{step:04d}.safetensors")
         if not first_hack_saved and hack_s_n > 0:
             save_ckpt(rows, path=first_hack_path)
             first_hack_saved = True