mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:30:30 +08:00
af420ec855
Fixes the frac=0 PPO-clip blow-up: logπ_old is now the behavior policy computed in each rollout's own sampling mode, so ρ is a true importance ratio. The old always-ablated baseline gave full-sampled route rows ρ=full/ablated, which the one-sided clip can't bound for A<0 (the loss-5e5 divergence). ρ=1 only where the mask's forward mode matches sampling mode; ρ logged per zone (keep/absorb/rout). Note (Fable review): frac=0.5 reintroduces the blow-up on deploy-sampled absorb/route rows by construction -- frac=0 is the clean point. Gate: two-threshold Otsu -> symmetric global-quantile tails (route_tail_q=0.1) over a run-spanning act buffer (8192 > 4800 default rollouts so the early clean era anchors the low tail; buffer stores acts, re-scored vs current v_act so a refresh needs no flush). Removes the per-window z-norm gate-collapse on a saturated all-hack window. gen_deploy_frac knob: frac=0 puts the quarantine ON during sampling so it elicits the hack and absorption can localize it. queue-decision now passes --gen-deploy-frac=0 explicitly on all four arms (base default stays 1.0 = the job-34 config where ablation RAISED hack 0.71->0.86). Docs: AGENTS.md gen/forward/backward + why-frac=0 sections; RESEARCH_JOURNAL 2026-06-12; diag_deploy_ablations.py (quar-only vs deploy localization probe). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
135 lines
6.9 KiB
Python
135 lines
6.9 KiB
Python
"""Per-step training-table rendering and run logging.
|
|
|
|
Two concerns, both pure presentation (no model, no RNG): set up the token-efficient
|
|
loguru sinks for a run, and render the per-step metrics table. The renderer is the
|
|
single source of truth for column order, width, header, and number format; the
|
|
training loop hands it a row dict of raw values and gets back a formatted line.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
from loguru import logger
|
|
from tqdm import tqdm
|
|
|
|
LOGS_DIR = Path("logs")
|
|
|
|
|
|
def setup_logging(run_id: str) -> Path:
    """Install the run's loguru sinks and return the verbose log file path.

    Any previously registered sinks are removed, then two are added:

    * console: level icon + message only, INFO and above, written through
      ``tqdm.write`` (presumably so log lines do not corrupt an active
      progress bar -- confirm against the training loop);
    * file: timestamped DEBUG-level log at
      ``LOGS_DIR/<YYYYmmddTHHMMSS>_<run_id>.log``.

    See /root/.claude/skills/token-efficient-logging/SKILL.md.

    Args:
        run_id: Identifier appended to the timestamp in the log file name.

    Returns:
        Path of the verbose (DEBUG) log file.
    """
    LOGS_DIR.mkdir(exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%dT%H%M%S")
    verbose_log = LOGS_DIR / f"{stamp}_{run_id}.log"

    def _console_sink(msg) -> None:
        # loguru messages already end with a newline; do not add another.
        tqdm.write(msg, end="")

    logger.remove()
    logger.add(
        _console_sink,
        colorize=True,
        format="<level>{level.icon}</level> {message}",
        level="INFO",
    )
    logger.add(
        verbose_log,
        format="{time:HH:mm:ss} | {level} | {message}",
        level="DEBUG",
    )

    # One-character icons keep the console prefix as short as possible.
    level_icons = (
        ("INFO", "I"),
        ("WARNING", "W"),
        ("ERROR", "E"),
        ("DEBUG", "D"),
    )
    for level_name, icon in level_icons:
        logger.level(level_name, icon=icon)

    return verbose_log
|
|
|
|
|
|
@dataclass(frozen=True)
class _Col:
    """Declarative column definition for the streamed step table.

    One immutable record per table column. StepLogger builds a list of these
    and derives the header line, each data row, and the legend from it.
    Fields are passed positionally at the call sites, so their order matters.
    """
    # Key used to look up this column's raw value in the per-step row dict.
    key: str
    # Field width used to right-align both the header and the cell text.
    width: int
    # Text printed in the header line (and as the label in the legend).
    header: str
    # Cell format: a format() spec such as "+.2f" or "d", the special value
    # "frac" for an (n, d) pair rendered as "n/d", or None for plain str().
    fmt: str | None = None
    desc: str = ""  # one-line decode for the legend; "" => omitted from legend
|
|
|
|
|
|
def _format_cell(value, fmt: str | None) -> str:
|
|
"""Format one cell. NaN renders as 'nan' regardless of spec."""
|
|
if value is None:
|
|
return "nan"
|
|
if fmt == "frac":
|
|
n, d = value
|
|
return f"{n}/{d}"
|
|
if fmt is None:
|
|
return str(value)
|
|
if isinstance(value, float) and value != value: # NaN
|
|
return "nan"
|
|
return format(value, fmt)
|
|
|
|
|
|
class StepLogger:
    """Render raw per-step metrics using one canonical column definition.

    The column list built in ``__init__`` fixes the order, width, header text
    and number format of the streamed table; ``header``, ``row`` and ``legend``
    all derive from it, so the three can never disagree.
    """

    def __init__(self, arm: str, modes: list[str], mode_code: dict[str, str],
                 show_ablate: bool = False) -> None:
        """Build the column list for this run.

        Args:
            arm: Name of the training arm. Not read by this constructor (the
                column layout is deliberately identical across arms); kept in
                the signature for callers.
            modes: Environment modes in the run. Per-mode ``hk_<code>`` columns
                are added only when there is more than one mode.
            mode_code: Maps each mode name to the short code used in its
                column key and header. Must contain every entry of ``modes``
                when more than one mode is given.
            show_ablate: When true, append the ``hack_abl`` / ``solve_abl``
                per-step deploy-proxy columns.
        """
        # Routing diagnostics are ALWAYS shown (nan on vanilla, whose gate never runs) so the
        # column layout is identical across arms -- vanilla/routeA/absorb tables line up.
        cols: list[_Col] = [
            _Col("step", 4, "step", "d", "GRPO step"),
            _Col("ref_eq", 6, "ref_eq", ".2f", "vanilla-equiv step (cum_gens/256)"),
            _Col("rew", 6, "rew", "+.2f", "mean combined reward"),
            _Col("rew_s", 6, "rew_s↑", "+.2f", "student mean reward"),
            _Col("gt_s", 6, "gt_s↑", "frac", "student ground-truth passes"),
            _Col("gt_t", 6, "gt_t", "frac", "teacher ground-truth passes (sanity)"),
            _Col("hack_s", 7, "hack_s?", "frac", "student hack-flagged rollouts (the headline)"),
            _Col("hack_t", 7, "hack_t", "frac", "teacher hack-flagged rollouts (sanity: pool hacks)"),
            # Held-out deployed evaluation with quarantine ablated; NaN between evaluation steps.
            _Col("hack_deployed", 7, "hk_dep", "+.2f", "DEPLOY-eval hack (routeA/absorb: quarantine OFF; vanilla: trained model); held-out subset, T=0.7, every eval_ablate_every steps; nan between"),
            _Col("solve_deployed", 7, "slv_dep", "+.2f", "DEPLOY-eval solve (same cadence; nan between)"),
        ]
        # Multi-mode runs show current-step hacks per environment; single-mode would duplicate hack_s.
        self._modes = modes if len(modes) > 1 else []
        for m in self._modes:
            cols.append(_Col(f"hk_{mode_code[m]}", 5, f"hk_{mode_code[m]}", "d",
                             f"student hacks of {m} THIS step (current batch, not cumulative)"))
        cols += [
            _Col("lp_s", 6, "lp_s↓", "+.2f", "mean student gen_logp (diagnostic)"),
            _Col("lp_t", 6, "lp_t↑", "+.2f", "mean teacher gen_logp; off-policy gap = lp_s-lp_t"),
            _Col("loss", 7, "loss", "+.2f", "mean GRPO loss"),
            _Col("gn", 7, "gn", ".1e", "pre-clip L2 norm of A/B grads (vs grad_clip)"),
            _Col("lr", 7, "lr", ".1e", "scheduled learning rate"),
        ]
        # routeA reports gate diagnostics (nan on vanilla/absorb, whose gate never runs).
        cols += [
            _Col("auroc", 6, "auroc", ".2f", "AUROC of dot(act, v_act) vs hack labels on the A>0 contrast (positively-reinforced rollouts, where the reward alone is blind); measurement only, never routes. ~0.5 = chance-level separation; high AUROC but rout~0 = threshold problem; a drop at refresh = reduced separation"),
            _Col("cos", 6, "cos", "+.2f", "mean per-rollout cos(act, v_act) (dot-vs-cos diagnostic)"),
            _Col("qmass", 6, "qmass", ".2f", "quarantine energy share ||g_quar||/(||g_keep||+||g_quar||): fraction of update energy assigned to quarantine"),
            _Col("keep", 6, "keep", ".2f", "rollout share below t_lo -> deployed-only, quarantine off"),
            _Col("resid", 6, "resid", ".2f", "rollout share between thresholds (and ALL rollouts during warmup) -> both blocks train; absorption is possible but not measured"),
            _Col("rout", 6, "rout", ".2f", "rollout share at/above t_hi -> quarantine-only, deployed detached"),
            _Col("tlo", 6, "tlo", "+.2f", "Otsu lower threshold (z units of the rolling score buffer); nan during warmup"),
            _Col("thi", 6, "thi", "+.2f", "Otsu upper (rout) threshold (z units); nan during warmup"),
            _Col("stale", 5, "stale", "d", "steps since v_act was last re-extracted (0 = refreshed this step, every vhack_refresh_every); placebo/vanilla never refresh so it grows unbounded"),
        ]
        # Show the training-prompt deploy proxy only when an ablated slice exists.
        if show_ablate:
            cols += [
                _Col("hack_abl", 6, "hk_abl", "frac", "per-step deploy proxy: hack rate on the ablated (deploy-mode) rollout slice; train prompts, noisier than hk_dep"),
                # Width 7: the header "slv_abl" is 7 characters (same as slv_dep).
                _Col("solve_abl", 7, "slv_abl", "frac", "per-step deploy proxy: solve rate on the ablated (deploy-mode) rollout slice; train prompts"),
            ]
        # A format width is only a minimum: a header longer than its column
        # would make the header line longer than every row and shift the
        # columns after it. Widen such columns (e.g. a long mode code in
        # hk_<code>) so header and rows always share one layout.
        self._cols = [
            c if len(c.header) <= c.width
            else _Col(c.key, len(c.header), c.header, c.fmt, c.desc)
            for c in cols
        ]

    def header(self) -> str:
        """Return the header line: each column header right-aligned to its width."""
        return " ".join(f"{c.header:>{c.width}}" for c in self._cols)

    def row(self, cells: dict) -> str:
        """Return one formatted table line for a dict of raw values.

        Args:
            cells: Maps every column key to its raw value (``None`` or NaN
                render as ``nan``). Extra keys are ignored.

        Raises:
            KeyError: If a column's key is missing from ``cells``.

        Cell text wider than its column is not truncated.
        """
        return " ".join(
            f"{_format_cell(cells[c.key], c.fmt):>{c.width}}" for c in self._cols
        )

    def legend(self) -> str:
        """Decode the (arm-/mode-conditional) columns actually present this run."""
        lines = "\n".join(f"  {c.header:>8} = {c.desc}" for c in self._cols if c.desc)
        return ("table columns (timing gen/fb/t_rew/sec dropped from streaming, kept "
                "in the end-of-run dump):\n" + lines)
|