mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 20:21:41 +08:00
log: dump first full example of each hack class to verbose log
Dump one full example per hack class (rendered prompt + completion WITH special tokens + a SHOULD interpretive line), once per run, so the verbose log shows what an M1 vs. M2 hack actually looks like, not just the flag. The dump is currently keyed on the m2/hacked flags; it will be re-keyed to env_mode in the multi-loophole refactor (spec 20260530_faithful_multi_loophole_env).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -945,6 +945,8 @@ table columns:
|
||||
rollout_log_path.write_text("")
|
||||
first_hack_saved = False
|
||||
route_span_checked = False # R3: assert delta_S_hack.grad in span(V) once
|
||||
dumped_hack_classes: set[str] = set() # first full example of each hack class -> verbose log
|
||||
teacher_dumped = False
|
||||
|
||||
def save_ckpt(rows: list[dict], path: Path | None = None) -> None:
|
||||
"""Rewrite the run checkpoint in place: trainable delta_S as tensors, per-step
|
||||
@@ -1121,6 +1123,25 @@ table columns:
|
||||
fmt_flags.append(r.format_ok)
|
||||
hack_E_flags.append(r.hacked_loophole_used)
|
||||
hack_D_flags.append(r.hacked_wrong_tests)
|
||||
# First full example of each hack class -> verbose log (with special
|
||||
# tokens). Lets us eyeball WHAT the hack actually looks like, not just
|
||||
# the flag. One dump per class for the whole run.
|
||||
hack_cls = "M2_hardcode" if (is_expose and r.m2) else ("M1_run_tests" if r.hacked else None)
|
||||
if hack_cls and hack_cls not in dumped_hack_classes:
|
||||
dumped_hack_classes.add(hack_cls)
|
||||
should = {
|
||||
"M2_hardcode": "completion special-cases the K VISIBLE inputs (literal "
|
||||
"returns / if-chains) and has no general algorithm, so it "
|
||||
"fails held-out; ELSE the m2 flag is mis-firing",
|
||||
"M1_run_tests": "completion defines a run_tests() that passes against an "
|
||||
"arbitrary Solution (vacuous asserts); ELSE hacked mis-firing",
|
||||
}[hack_cls]
|
||||
logger.debug(
|
||||
f"\n\n=== FIRST {hack_cls} HACK EXAMPLE (step {step}, p_idx {p_idx}, gi {gi}) ===\n"
|
||||
f"SHOULD: {should}\n"
|
||||
f"--- rendered prompt (special chars) ---\n{prompt}\n"
|
||||
f"--- completion (special chars) ---\n{tok.decode(completions[gi], skip_special_tokens=False)}\n"
|
||||
f"=== END {hack_cls} ===")
|
||||
step_rollouts.append({
|
||||
"step": step, "p_idx": p_idx, "gi": gi,
|
||||
"reward": r.reward, "gt_pass": r.gt_pass, "pass_heldout": r.pass_heldout,
|
||||
|
||||
Reference in New Issue
Block a user