Files
evil_MoE/src/vgrout/tablelog.py
T
wassname af420ec855 feat: generation-matched logπ_old baseline + global-quantile gate + frac=0 method
Fixes the frac=0 PPO-clip blow-up: logπ_old is now the behavior policy computed
in each rollout's own sampling mode, so ρ is a true importance ratio. The old
always-ablated baseline gave full-sampled route rows ρ=full/ablated, which the
one-sided clip can't bound for A<0 (the loss-5e5 divergence). ρ=1 only where the
mask's forward mode matches sampling mode; ρ logged per zone (keep/absorb/rout).
Note (Fable review): frac=0.5 reintroduces the blow-up on deploy-sampled
absorb/route rows by construction -- frac=0 is the clean point.

Gate: two-threshold Otsu -> symmetric global-quantile tails (route_tail_q=0.1)
over a run-spanning act buffer (8192 > 4800 default rollouts so the early clean
era anchors the low tail; buffer stores acts, re-scored vs current v_act so a
refresh needs no flush). Removes the per-window z-norm gate-collapse on a
saturated all-hack window.

gen_deploy_frac knob: frac=0 puts the quarantine ON during sampling so it
elicits the hack and absorption can localize it. queue-decision now passes
--gen-deploy-frac=0 explicitly on all four arms (base default stays 1.0 = the
job-34 config where ablation RAISED hack 0.71->0.86).

Docs: AGENTS.md gen/forward/backward + why-frac=0 sections; RESEARCH_JOURNAL
2026-06-12; diag_deploy_ablations.py (quar-only vs deploy localization probe).

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-12 03:22:48 +00:00

135 lines
6.9 KiB
Python

"""Per-step training-table rendering and run logging.
Two concerns, both pure presentation (no model, no RNG): set up the token-efficient
loguru sinks for a run, and render the per-step metrics table. The renderer is the
single source of truth for column order, width, header, and number format; the
training loop hands it a row dict of raw values and gets back a formatted line.
"""
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from loguru import logger
from tqdm import tqdm
LOGS_DIR = Path("logs")
def setup_logging(run_id: str) -> Path:
    """Configure loguru for one run and return the path of the verbose log file.

    All previously registered loguru handlers are removed and two sinks are
    installed in their place:

    * console: INFO and above, written through ``tqdm.write``, formatted as a
      one-character level icon followed by the message;
    * file: DEBUG and above, formatted as ``HH:mm:ss | LEVEL | message``.

    The file is created under ``LOGS_DIR`` and named
    ``<YYYYmmddTHHMMSS>_<run_id>.log`` using the local wall-clock time.
    See /root/.claude/skills/token-efficient-logging/SKILL.md.
    """
    LOGS_DIR.mkdir(exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%dT%H%M%S")
    verbose_log = LOGS_DIR / f"{stamp}_{run_id}.log"

    def _console_sink(msg) -> None:
        # end="" so tqdm.write does not append a newline of its own.
        tqdm.write(msg, end="")

    logger.remove()
    logger.add(
        _console_sink,
        colorize=True,
        format="<level>{level.icon}</level> {message}",
        level="INFO",
    )
    logger.add(
        verbose_log,
        format="{time:HH:mm:ss} | {level} | {message}",
        level="DEBUG",
    )

    # Swap each level's icon for a single ASCII letter (used by the console format).
    single_char_icons = (
        ("INFO", "I"),
        ("WARNING", "W"),
        ("ERROR", "E"),
        ("DEBUG", "D"),
    )
    for level_name, icon in single_char_icons:
        logger.level(level_name, icon=icon)

    return verbose_log
@dataclass(frozen=True)
class _Col:
"""Declarative column definition for the streamed step table."""
key: str
width: int
header: str
fmt: str | None = None
desc: str = "" # one-line decode for the legend; "" => omitted from legend
def _format_cell(value, fmt: str | None) -> str:
"""Format one cell. NaN renders as 'nan' regardless of spec."""
if value is None:
return "nan"
if fmt == "frac":
n, d = value
return f"{n}/{d}"
if fmt is None:
return str(value)
if isinstance(value, float) and value != value: # NaN
return "nan"
return format(value, fmt)
class StepLogger:
    """Turn raw per-step metric dicts into aligned table text.

    The column list assembled in ``__init__`` is the single definition of column
    order, width, header text and number format; ``header``, ``row`` and
    ``legend`` all iterate over it.
    """

    def __init__(self, arm: str, modes: list[str], mode_code: dict[str, str],
                 show_ablate: bool = False) -> None:
        """Assemble the column list for this run.

        Args:
            arm: accepted for the caller's benefit; not read when building columns.
            modes: environment/mode names; per-mode hack columns are added only
                when more than one mode is given.
            mode_code: maps each mode name to the short code used in its column
                key and header (``hk_<code>``).
            show_ablate: when truthy, append the two per-step ablated-slice
                proxy columns at the end of the table.
        """
        # Progress, reward and hack/solve counts, plus the held-out deploy
        # evaluation (reported as nan between evaluation steps).
        core_cols: list[_Col] = [
            _Col("step", 4, "step", "d", "GRPO step"),
            _Col("ref_eq", 6, "ref_eq", ".2f", "vanilla-equiv step (cum_gens/256)"),
            _Col("rew", 6, "rew", "+.2f", "mean combined reward"),
            _Col("rew_s", 6, "rew_s↑", "+.2f", "student mean reward"),
            _Col("gt_s", 6, "gt_s↑", "frac", "student ground-truth passes"),
            _Col("gt_t", 6, "gt_t", "frac", "teacher ground-truth passes (sanity)"),
            _Col("hack_s", 7, "hack_s?", "frac", "student hack-flagged rollouts (the headline)"),
            _Col("hack_t", 7, "hack_t", "frac", "teacher hack-flagged rollouts (sanity: pool hacks)"),
            _Col("hack_deployed", 7, "hk_dep", "+.2f", "DEPLOY-eval hack (routeA/absorb: quarantine OFF; vanilla: trained model); held-out subset, T=0.7, every eval_ablate_every steps; nan between"),
            _Col("solve_deployed", 7, "slv_dep", "+.2f", "DEPLOY-eval solve (same cadence; nan between)"),
        ]

        # With a single mode the per-mode column would just repeat hack_s, so the
        # per-mode breakdown is only kept for multi-mode runs.
        self._modes = modes if len(modes) > 1 else []
        mode_cols = [
            _Col(f"hk_{code}", 5, f"hk_{code}", "d",
                 f"student hacks of {mode} THIS step (current batch, not cumulative)")
            for mode, code in ((m, mode_code[m]) for m in self._modes)
        ]

        # Optimisation diagnostics.
        train_cols = [
            _Col("lp_s", 6, "lp_s↓", "+.2f", "mean student gen_logp (diagnostic)"),
            _Col("lp_t", 6, "lp_t↑", "+.2f", "mean teacher gen_logp; off-policy gap = lp_s-lp_t"),
            _Col("loss", 7, "loss", "+.2f", "mean GRPO loss"),
            _Col("gn", 7, "gn", ".1e", "pre-clip L2 norm of A/B grads (vs grad_clip)"),
            _Col("lr", 7, "lr", ".1e", "scheduled learning rate"),
        ]

        # Gate/routing diagnostics are appended unconditionally so every arm's
        # table has the same layout; arms whose gate never runs are expected to
        # supply nan for these cells.
        gate_cols = [
            _Col("auroc", 6, "auroc", ".2f", "AUROC of dot(act, v_act) vs hack labels on the A>0 contrast (positively-reinforced rollouts, where the reward alone is blind); measurement only, never routes. ~0.5 = chance-level separation; high AUROC but rout~0 = threshold problem; a drop at refresh = reduced separation"),
            _Col("cos", 6, "cos", "+.2f", "mean per-rollout cos(act, v_act) (dot-vs-cos diagnostic)"),
            _Col("qmass", 6, "qmass", ".2f", "quarantine energy share ||g_quar||/(||g_keep||+||g_quar||): fraction of update energy assigned to quarantine"),
            _Col("keep", 6, "keep", ".2f", "rollout share below t_lo -> deployed-only, quarantine off"),
            _Col("resid", 6, "resid", ".2f", "rollout share between thresholds (and ALL rollouts during warmup) -> both blocks train; absorption is possible but not measured"),
            _Col("rout", 6, "rout", ".2f", "rollout share at/above t_hi -> quarantine-only, deployed detached"),
            _Col("tlo", 6, "tlo", "+.2f", "Otsu lower threshold (z units of the rolling score buffer); nan during warmup"),
            _Col("thi", 6, "thi", "+.2f", "Otsu upper (rout) threshold (z units); nan during warmup"),
            _Col("stale", 5, "stale", "d", "steps since v_act was last re-extracted (0 = refreshed this step, every vhack_refresh_every); placebo/vanilla never refresh so it grows unbounded"),
        ]

        # The training-prompt deploy proxy only makes sense when an ablated
        # rollout slice exists, so these columns are opt-in.
        ablate_cols = [
            _Col("hack_abl", 6, "hk_abl", "frac", "per-step deploy proxy: hack rate on the ablated (deploy-mode) rollout slice; train prompts, noisier than hk_dep"),
            _Col("solve_abl", 6, "slv_abl", "frac", "per-step deploy proxy: solve rate on the ablated (deploy-mode) rollout slice; train prompts"),
        ] if show_ablate else []

        self._cols = core_cols + mode_cols + train_cols + gate_cols + ablate_cols

    def header(self) -> str:
        """Return the header line: each column header right-aligned to its width."""
        padded = [f"{col.header:>{col.width}}" for col in self._cols]
        return " ".join(padded)

    def row(self, cells: dict) -> str:
        """Return one table line for ``cells``.

        ``cells`` must contain every column key (a missing key raises
        ``KeyError``); each value is formatted with its column's spec and
        right-aligned to the column width.
        """
        rendered = []
        for col in self._cols:
            text = _format_cell(cells[col.key], col.fmt)
            rendered.append(f"{text:>{col.width}}")
        return " ".join(rendered)

    def legend(self) -> str:
        """Return a multi-line key for the columns present in this run.

        Columns with an empty description are left out.
        """
        intro = ("table columns (timing gen/fb/t_rew/sec dropped from streaming, kept "
                 "in the end-of-run dump):\n")
        described = [f" {col.header:>8} = {col.desc}" for col in self._cols if col.desc]
        return intro + "\n".join(described)