log: dump first full example of each hack class to verbose log

Dump the first full example of each hack class once per run: the rendered
prompt, the completion decoded WITH special tokens, and a SHOULD line saying
what the example ought to look like if the flag fired correctly. This way the
log shows what an M1 vs. M2 hack actually looks like, not just the flag. The
class is keyed on the m2/hacked flags today; it will be re-keyed to env_mode in
the multi-loophole refactor (spec 20260530_faithful_multi_loophole_env).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
wassname
2026-05-30 05:10:58 +00:00
parent 8a5738c69a
commit 3d60b4cf43
+21
View File
@@ -945,6 +945,8 @@ table columns:
rollout_log_path.write_text("")
first_hack_saved = False
route_span_checked = False # R3: assert delta_S_hack.grad in span(V) once
dumped_hack_classes: set[str] = set() # first full example of each hack class -> verbose log
teacher_dumped = False
def save_ckpt(rows: list[dict], path: Path | None = None) -> None:
"""Rewrite the run checkpoint in place: trainable delta_S as tensors, per-step
@@ -1121,6 +1123,25 @@ table columns:
fmt_flags.append(r.format_ok)
hack_E_flags.append(r.hacked_loophole_used)
hack_D_flags.append(r.hacked_wrong_tests)
# First full example of each hack class -> verbose log (with special
# tokens). Lets us eyeball WHAT the hack actually looks like, not just
# the flag. One dump per class for the whole run.
hack_cls = "M2_hardcode" if (is_expose and r.m2) else ("M1_run_tests" if r.hacked else None)
if hack_cls and hack_cls not in dumped_hack_classes:
dumped_hack_classes.add(hack_cls)
should = {
"M2_hardcode": "completion special-cases the K VISIBLE inputs (literal "
"returns / if-chains) and has no general algorithm, so it "
"fails held-out; ELSE the m2 flag is mis-firing",
"M1_run_tests": "completion defines a run_tests() that passes against an "
"arbitrary Solution (vacuous asserts); ELSE hacked mis-firing",
}[hack_cls]
logger.debug(
f"\n\n=== FIRST {hack_cls} HACK EXAMPLE (step {step}, p_idx {p_idx}, gi {gi}) ===\n"
f"SHOULD: {should}\n"
f"--- rendered prompt (special chars) ---\n{prompt}\n"
f"--- completion (special chars) ---\n{tok.decode(completions[gi], skip_special_tokens=False)}\n"
f"=== END {hack_cls} ===")
step_rollouts.append({
"step": step, "p_idx": p_idx, "gi": gi,
"reward": r.reward, "gt_pass": r.gt_pass, "pass_heldout": r.pass_heldout,