From 3d60b4cf431c154fe2d603a5aea07d2a698fc3b1 Mon Sep 17 00:00:00 2001 From: wassname Date: Sat, 30 May 2026 05:10:58 +0000 Subject: [PATCH] log: dump first full example of each hack class to verbose log One-shot-per-class dump (rendered prompt + completion WITH special tokens + a SHOULD interpretive line) so the log shows what an M1 vs M2 hack actually looks like, not just the flag. Keyed on m2/hacked today; will re-key to env_mode in the multi-loophole refactor (spec 20260530_faithful_multi_loophole_env). Co-Authored-By: Claude Opus 4.8 --- src/projected_grpo/train.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index 01fa7f0..0b3b88f 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -945,6 +945,8 @@ table columns: rollout_log_path.write_text("") first_hack_saved = False route_span_checked = False # R3: assert delta_S_hack.grad in span(V) once + dumped_hack_classes: set[str] = set() # first full example of each hack class -> verbose log + teacher_dumped = False def save_ckpt(rows: list[dict], path: Path | None = None) -> None: """Rewrite the run checkpoint in place: trainable delta_S as tensors, per-step @@ -1121,6 +1123,25 @@ table columns: fmt_flags.append(r.format_ok) hack_E_flags.append(r.hacked_loophole_used) hack_D_flags.append(r.hacked_wrong_tests) + # First full example of each hack class -> verbose log (with special + # tokens). Lets us eyeball WHAT the hack actually looks like, not just + # the flag. One dump per class for the whole run. + hack_cls = "M2_hardcode" if (is_expose and r.m2) else ("M1_run_tests" if r.hacked else None) + if hack_cls and hack_cls not in dumped_hack_classes: + dumped_hack_classes.add(hack_cls) + should = { + "M2_hardcode": "completion special-cases the K VISIBLE inputs (literal " + "returns / if-chains) and has no general algorithm, so it " + "fails held-out; ELSE the m2 flag is mis-firing", + "M1_run_tests": "completion defines a run_tests() that passes against an " + "arbitrary Solution (vacuous asserts); ELSE hacked mis-firing", + }[hack_cls] + logger.debug( + f"\n\n=== FIRST {hack_cls} HACK EXAMPLE (step {step}, p_idx {p_idx}, gi {gi}) ===\n" + f"SHOULD: {should}\n" + f"--- rendered prompt (special chars) ---\n{prompt}\n" + f"--- completion (special chars) ---\n{tok.decode(completions[gi], skip_special_tokens=False)}\n" + f"=== END {hack_cls} ===") step_rollouts.append({ "step": step, "p_idx": p_idx, "gi": gi, "reward": r.reward, "gt_pass": r.gt_pass, "pass_heldout": r.pass_heldout,