"""Per-step training-table rendering and run logging. Two concerns, both pure presentation (no model, no RNG): set up the token-efficient loguru sinks for a run, and render the per-step metrics table. The renderer is the single source of truth for column order, width, header, and number format; the training loop hands it a row dict of raw values and gets back a formatted line. """ from __future__ import annotations from dataclasses import dataclass from datetime import datetime from pathlib import Path from loguru import logger from tqdm import tqdm LOGS_DIR = Path("logs") def setup_logging(run_id: str) -> Path: """Token-efficient loguru: stdout = 1-char icon + msg; verbose log to file. See /root/.claude/skills/token-efficient-logging/SKILL.md. """ LOGS_DIR.mkdir(exist_ok=True) verbose_log = LOGS_DIR / f"{datetime.now().strftime('%Y%m%dT%H%M%S')}_{run_id}.log" logger.remove() logger.add( lambda msg: tqdm.write(msg, end=""), colorize=True, format="{level.icon} {message}", level="INFO", ) logger.add( verbose_log, format="{time:HH:mm:ss} | {level} | {message}", level="DEBUG", ) logger.level("INFO", icon="I") logger.level("WARNING", icon="W") logger.level("ERROR", icon="E") logger.level("DEBUG", icon="D") return verbose_log @dataclass(frozen=True) class _Col: """Declarative column definition for the streamed step table.""" key: str width: int header: str fmt: str | None = None desc: str = "" # one-line decode for the legend; "" => omitted from legend def _format_cell(value, fmt: str | None) -> str: """Format one cell. NaN renders as 'nan' regardless of spec.""" if value is None: return "nan" if fmt == "frac": n, d = value return f"{n}/{d}" if fmt is None: return str(value) if isinstance(value, float) and value != value: # NaN return "nan" return format(value, fmt) class StepLogger: """Render raw per-step metrics using one canonical column definition.""" def __init__(self, arm: str, modes: list[str], mode_code: dict[str, str], show_ablate: bool = False) -> None: # Routing diagnostics are ALWAYS shown (nan on vanilla, whose gate never runs) so the # column layout is identical across arms -- vanilla/routeA/absorb tables line up. cols: list[_Col] = [ _Col("step", 4, "step", "d", "GRPO step"), _Col("ref_eq", 6, "ref_eq", ".2f", "vanilla-equiv step (cum_gens/256)"), _Col("rew", 6, "rew", "+.2f", "mean combined reward"), _Col("rew_s", 6, "rew_s↑", "+.2f", "student mean reward"), _Col("gt_s", 6, "gt_s↑", "frac", "student ground-truth passes"), _Col("gt_t", 6, "gt_t", "frac", "teacher ground-truth passes (sanity)"), _Col("hack_s", 7, "hack_s?", "frac", "student hack-flagged rollouts (the headline)"), _Col("hack_t", 7, "hack_t", "frac", "teacher hack-flagged rollouts (sanity: pool hacks)"), # Held-out deployed evaluation with quarantine ablated; NaN between evaluation steps. _Col("hack_deployed", 7, "hk_dep", "+.2f", "DEPLOY-eval hack (routeA/absorb: quarantine OFF; vanilla: trained model); held-out subset, T=0.7, every eval_ablate_every steps; nan between"), _Col("solve_deployed", 7, "slv_dep", "+.2f", "DEPLOY-eval solve (same cadence; nan between)"), ] # Multi-mode runs show current-step hacks per environment; single-mode would duplicate hack_s. self._modes = modes if len(modes) > 1 else [] for m in self._modes: cols.append(_Col(f"hk_{mode_code[m]}", 5, f"hk_{mode_code[m]}", "d", f"student hacks of {m} THIS step (current batch, not cumulative)")) cols += [ _Col("lp_s", 6, "lp_s↓", "+.2f", "mean student gen_logp (diagnostic)"), _Col("lp_t", 6, "lp_t↑", "+.2f", "mean teacher gen_logp; off-policy gap = lp_s-lp_t"), _Col("loss", 7, "loss", "+.2f", "mean GRPO loss"), _Col("gn", 7, "gn", ".1e", "pre-clip L2 norm of A/B grads (vs grad_clip)"), _Col("lr", 7, "lr", ".1e", "scheduled learning rate"), ] # routeA reports gate diagnostics (nan on vanilla/absorb, whose gate never runs). cols += [ _Col("auroc", 6, "auroc", ".2f", "AUROC of dot(act, v_act) vs hack labels on the A>0 contrast (positively-reinforced rollouts, where the reward alone is blind); measurement only, never routes. ~0.5 = chance-level separation; high AUROC but rout~0 = threshold problem; a drop at refresh = reduced separation"), _Col("cos", 6, "cos", "+.2f", "mean per-rollout cos(act, v_act) (dot-vs-cos diagnostic)"), _Col("qmass", 6, "qmass", ".2f", "quarantine energy share ||g_quar||/(||g_keep||+||g_quar||): fraction of update energy assigned to quarantine"), _Col("keep", 6, "keep", ".2f", "rollout share below t_lo -> deployed-only, quarantine off"), _Col("resid", 6, "resid", ".2f", "rollout share between thresholds (and ALL rollouts during warmup) -> both blocks train; absorption is possible but not measured"), _Col("rout", 6, "rout", ".2f", "rollout share at/above t_hi -> quarantine-only, deployed detached"), _Col("tlo", 6, "tlo", "+.2f", "Otsu lower threshold (z units of the rolling score buffer); nan during warmup"), _Col("thi", 6, "thi", "+.2f", "Otsu upper (rout) threshold (z units); nan during warmup"), _Col("stale", 5, "stale", "d", "steps since v_act was last re-extracted (0 = refreshed this step, every vhack_refresh_every); placebo/vanilla never refresh so it grows unbounded"), ] # Show the training-prompt deploy proxy only when an ablated slice exists. if show_ablate: cols += [ _Col("hack_abl", 6, "hk_abl", "frac", "per-step deploy proxy: hack rate on the ablated (deploy-mode) rollout slice; train prompts, noisier than hk_dep"), _Col("solve_abl", 6, "slv_abl", "frac", "per-step deploy proxy: solve rate on the ablated (deploy-mode) rollout slice; train prompts"), ] self._cols = cols def header(self) -> str: return " ".join(f"{c.header:>{c.width}}" for c in self._cols) def row(self, cells: dict) -> str: return " ".join( f"{_format_cell(cells[c.key], c.fmt):>{c.width}}" for c in self._cols ) def legend(self) -> str: """Decode the (arm-/mode-conditional) columns actually present this run.""" lines = "\n".join(f" {c.header:>8} = {c.desc}" for c in self._cols if c.desc) return ("table columns (timing gen/fb/t_rew/sec dropped from streaming, kept " "in the end-of-run dump):\n" + lines)