From 07363f1ede465e90ac8529d4c128f9f82202140b Mon Sep 17 00:00:00 2001 From: wassname Date: Wed, 3 Jun 2026 00:09:19 +0000 Subject: [PATCH] cleanup: trim stale comments + attic README Dropped dead job-ID narrative (job 60/64) on rollout_ablate_frac, the 'vanilla step 17' dead-run ref in eval.py, the 'old signed sum' dead-code ref in proj.py, and the conversational 'current experiment line' lead. Removed the dead probe-traj justfile recipe. Kept all TODO/FIXME and the 'why' memory-tuning comments. Smoke green (cout->0). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- justfile | 4 ---- scripts/attic/README.md | 9 +++++++++ src/projected_grpo/eval.py | 4 ++-- src/projected_grpo/proj.py | 6 +++--- src/projected_grpo/train.py | 20 +++++++++----------- 5 files changed, 23 insertions(+), 20 deletions(-) create mode 100644 scripts/attic/README.md diff --git a/justfile b/justfile index 3427043..830251c 100644 --- a/justfile +++ b/justfile @@ -298,10 +298,6 @@ queue-projected preset="full" vhack="out/vhack/v_hack_full.safetensors": probe-base-pool steps="20": uv run python -m projected_grpo.probe_distill --base-only --steps={{ steps }} --n-problems={{ steps }} -# Trajectory comparator for the warmup-gen runs (vanilla vs projected). -probe-traj: - uv run python -m projected_grpo.probe_traj - # Print the results table prototype. table-proto: @cat docs/table_proto.md diff --git a/scripts/attic/README.md b/scripts/attic/README.md new file mode 100644 index 0000000..c96e734 --- /dev/null +++ b/scripts/attic/README.md @@ -0,0 +1,9 @@ +# attic + +Parked, not deleted. Superseded exploration kept only because the persona-pair +methodology may get cited in the paper appendix. + +- `make_pairsets.py`, `make_dataset_pairsets.py` — persona contrastive-pair + authoring (tasks #123-126, done). The live extraction path is + `pairs.PAIRS` (hand pairs) or `pairs_from_pool` (pool-derived). No justfile + recipe builds these anymore. diff --git a/src/projected_grpo/eval.py b/src/projected_grpo/eval.py index e9aba94..7b29039 100644 --- a/src/projected_grpo/eval.py +++ b/src/projected_grpo/eval.py @@ -25,8 +25,8 @@ def ref_logprobs_via_zero_delta( Uses `logits_to_keep=L_c+1` so HF's lm_head only runs on completion-side hidden states; prompt-side logits never materialize. Saves - ~plen/(plen+L_c) memory at the lm_head call (~33% at plen=500, L_c=1024). - That was the OOM site at vanilla step 17 (long prompt -> 4 GiB lm_head spike). + ~plen/(plen+L_c) memory at the lm_head call (~33% at plen=500, L_c=1024) -- + a long prompt can spike the full-logits lm_head ~4 GiB and OOM without this. """ saved = {n: info["delta_S"].data.clone() for n, info in wrappers.items()} try: diff --git a/src/projected_grpo/proj.py b/src/projected_grpo/proj.py index de09d3c..be1d275 100644 --- a/src/projected_grpo/proj.py +++ b/src/projected_grpo/proj.py @@ -37,9 +37,9 @@ def _hackward_cos(c: Float[torch.Tensor, "k"], gn: torch.Tensor) -> float: relu BEFORE aggregating is the point: the one_sided projection removes only relu(c) (the hack-ward axes), and with V orthonormal ||removed|| = ||relu(c)||, - so this reads directly as "fraction of the grad the projection strips". The - old signed sum(c)/||g|| let +/- axes cancel, so it read ~0 even while a large - hack-ward magnitude was being routed -- a misleading gauge of routing activity. + so this reads directly as "fraction of the grad the projection strips" (a signed + sum would let +/- axes cancel and read ~0 even while routing a large hack-ward + magnitude). After a one_sided erase, V @ g_proj = min(c, 0) (positive axes zeroed), so relu of it is 0 -> cos_post == 0 exactly. That clean SHOULD (cos_post -> 0) is diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index 21c5786..315acdc 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -146,15 +146,13 @@ class Config: # saturation -- if on-policy sampling collapses onto hacking, every rollout routes # to the quarantine and the deployed δS never sees a solve gradient. Grading these # ablated rollouts is also a free per-step deploy proxy (hk_abl / slv_abl). - # DEFAULT 0 (off). job 64 (frac=0.5) leaked: deploy hack climbed 0.00->0.125 over - # training while the no-floor job 60 held deploy hack=0.000. Mechanism: sampling - # from the DEPLOYED model makes its reward an optimization target whose optimum is - # hacking, so whenever the (stale) routing gate misses, those deploy-generated - # hacks update the kept knob -- the floor optimizes the very policy the quarantine - # is meant to keep clean. Neither gradient-routing paper samples from the ablated - # model during training (they sample knob-on, ablate at eval, let absorption - # localize). So default off; re-enable only to TEST the floor+fresh-gate hypothesis - # (does refresh-every-1 keep the gate sharp enough that the floor is safe?). + # DEFAULT 0 (off): sampling from the DEPLOYED model makes its reward an + # optimization target whose optimum is hacking, so whenever the (stale) routing + # gate misses, those deploy-generated hacks update the kept knob -- the floor + # optimizes the very policy the quarantine is meant to keep clean. Neither + # gradient-routing paper samples from the ablated model during training (they + # sample knob-on, ablate at eval, let absorption localize). Re-enable only to test + # whether a fresh gate (refresh-every-1) keeps the floor safe. rollout_ablate_frac: float = 0.0 # Which grader flaw + factual hint this run trains on (a "hack class"). Sets # the prompt hint (HINT_REPLACE_TO) and how `passed` is graded in rewards.py. @@ -260,8 +258,8 @@ class FastConfig(Config): at pp=4 x 20 steps).""" model: str = "Qwen/Qwen3-4B" steps: int = 60 # 60 lets the lp_s-lp_t gap open at convergence - # current experiment line: 4-mode substrate pool + prog_wide persona pairs are the - # default so real runs need only --intervention (+ optional seed/refresh/mask). + # 4-mode substrate pool + prog_wide persona pairs are the default, so real runs + # need only --intervention (+ optional seed/refresh/mask). teacher_pool_dir: Path | None = Path("out/pools/substrate") vhack_pairs_path: Path | None = Path("out/pairsets/prog_wide.json") group: int = 8 # G=8 so the locked-in mix_ratio=0.125 gives 1 teacher / 7 student