mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 19:31:11 +08:00
route2 instrumentation + lr fix + deploy overlay (route2-act divergence)
route2-act diverged (run 43): 33M kaiming A_q/B_q at delta_S's lr=3e-3 blew up (gn 0.3->7.5 step 8, generations -> token salad, lp_t -11).

Fixes:
- #167 separate quarantine lr (route2_quar_lr_scale=0.1) so the 60x-bigger fresh LoRA isn't trained at the main-knob lr.
- #168 divergence tripwire on teacher ppl (lp_t high-water mark; abort if it drops >5 nats for 2 steps). Relative so tiny-random smoke (flat lp_t~-11.9) doesn't false-trip.
- #165 act-path was silent: stash cos(a,v_act) + fired-fraction in the forward, surface as act_cos/act_fire columns (route2-act). smoke shows act_fire=0.64 => the cos>0 sign test over-routes (fires on most tokens, not just hack ones).
- #166 print last train generation before FINAL EVAL (coherence eyeball).
- route2 v_act/v_grad refresh was firing but silent -- now announced.
- #162 plot_deploy_overlay.py: per-mode DEPLOY overlay from per_mode_deploy.json (honest shipped-model numbers, route2-safe). just plot-deploy.
- just plot/results hardened: parse by header name, skip non-substrate logs, non-fatal aggregate delegation.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
+38
-15
@@ -59,18 +59,24 @@ _HDR_TOK = re.compile(r"[A-Za-z_]+") # "hack_s?" -> "hack_s"
|
||||
|
||||
|
||||
def classify(txt: str) -> str:
    """Map a run log's preset `arm=` value to a plot method name.

    vanilla / erase / route from the preset `arm=` line (covers --intervention logs).
    Unknown arms (e.g. route2's routing2_act) fall through to their raw name -- per
    the original note, the plotters filter to known METHODS, so an unmapped arm is
    dropped from the train-dynamics panels rather than crashing the whole `just plot`.

    Args:
        txt: full text of a run log.

    Returns:
        "vanilla", "erase" or "route" for the three known arms, the raw arm name
        for any other arm, and "vanilla" when no line carries both `preset=` and
        `arm=` (or the `arm=` value cannot be parsed).
    """
    # The preset line is the first one mentioning both keys; "" when there is none.
    preset = next((l for l in txt.splitlines() if "preset=" in l and "arm=" in l), "")
    found = re.search(r"\barm=(\w+)", preset)
    arm = found.group(1) if found else "vanilla"
    # .get(arm, arm) rather than [arm]: an unmapped arm must not raise KeyError.
    return {"vanilla": "vanilla", "projected": "erase", "routing": "route"}.get(arm, arm)
|
||||
|
||||
|
||||
def parse_hk(path: Path) -> dict:
|
||||
"""{method, seed, steps, <mode>: (n[], d[])} from a substrate run log."""
|
||||
def parse_hk(path: Path) -> dict | None:
|
||||
"""{method, seed, steps, <mode>: (n[], d[])} from a substrate run log, or None
|
||||
if the log isn't a multi-loophole run (no hk_rt header). Returning None rather
|
||||
than raising lets `just plot` glob a broad set of logs (old single-mode/aborted
|
||||
runs mixed in) without crashing; main() logs which paths were skipped."""
|
||||
txt = path.read_text(errors="replace")
|
||||
hdr = next((l for l in txt.splitlines() if "ref_eq" in l and "hk_rt" in l), None)
|
||||
if hdr is None:
|
||||
raise ValueError(f"{path}: no substrate header (hk_rt) -- not a multi-loophole run?")
|
||||
return None
|
||||
names = [_HDR_TOK.match(t).group(0) for t in hdr.split("| INFO |", 1)[1].split()]
|
||||
idx = {n: i for i, n in enumerate(names)}
|
||||
present = [k for k in HK if k in idx] # 4-mode substrate dropped hk_eq; plot only what's logged
|
||||
@@ -86,6 +92,8 @@ def parse_hk(path: Path) -> dict:
|
||||
n, d = row[idx[k]].split("/")
|
||||
nd[k][0].append(int(n))
|
||||
nd[k][1].append(int(d))
|
||||
if not steps:
|
||||
return None # header present but no parseable per-step rows (e.g. diverged/aborted)
|
||||
m = re.search(r"seed(\d+)", path.name) or re.search(r"_s(\d+)", path.name)
|
||||
return dict(
|
||||
method=classify(txt),
|
||||
@@ -229,8 +237,17 @@ def main() -> None:
|
||||
args = ap.parse_args()
|
||||
stem = args.out_stem
|
||||
|
||||
# 1-2. per-mode small multiples (this script owns these)
|
||||
runs = [parse_hk(p) for p in args.logs]
|
||||
# 1-2. per-mode small multiples (this script owns these). Skip (don't crash on)
|
||||
# logs that aren't multi-loophole substrate runs -- the glob may catch old
|
||||
# single-mode/aborted runs; log which were dropped so the skip isn't silent.
|
||||
parsed = {p: parse_hk(p) for p in args.logs}
|
||||
skipped = [p for p, r in parsed.items() if r is None]
|
||||
if skipped:
|
||||
logger.warning(f"skipped {len(skipped)} non-substrate log(s): "
|
||||
+ ", ".join(p.name for p in skipped))
|
||||
runs = [r for r in parsed.values() if r is not None]
|
||||
if not runs:
|
||||
raise SystemExit("no substrate runs in the glob (need hk_rt columns)")
|
||||
logger.info(f"parsed {len(runs)} runs: " + ", ".join(f"{r['method']}/s{r['seed']}" for r in runs))
|
||||
ylabel = "cumulative hack rate" if args.cumulative else f"hack rate (EMA span {args.ema_span})"
|
||||
plot_by_method(runs, ylabel, args.cumulative, args.ema_span, stem.with_name(stem.name + "_by_method.png"))
|
||||
@@ -238,15 +255,21 @@ def main() -> None:
|
||||
|
||||
# 3-4. aggregate "total hacks per arm" + hack overlay (reuse plot_dynamics,
|
||||
# which owns route's deploy-curve substitution + the cos-alignment rows).
|
||||
# Non-fatal: the two per-mode figures above are the substrate deliverable;
|
||||
# plot_dynamics assumes the older erase/route column set (cin_t etc.) and
|
||||
# KeyErrors on a route2 log, so a delegation failure must not sink `just plot`.
|
||||
if not args.no_aggregate:
|
||||
import plot_dynamics as pd
|
||||
agg_runs = [r for p in args.logs if (r := pd.parse_log(p))]
|
||||
if agg_runs:
|
||||
agg = stem.with_name(stem.name + "_aggregate.png")
|
||||
pd.plot(agg_runs, agg)
|
||||
pd.plot_hack_overlay(agg_runs, agg.with_name(agg.stem + "_hack_overlay.png"))
|
||||
else:
|
||||
logger.warning("no runs had aggregate columns (cos_pre/hack_s) -- skipped aggregate figs")
|
||||
try:
|
||||
import plot_dynamics as pd
|
||||
agg_runs = [r for p in args.logs if (r := pd.parse_log(p))]
|
||||
if agg_runs:
|
||||
agg = stem.with_name(stem.name + "_aggregate.png")
|
||||
pd.plot(agg_runs, agg)
|
||||
pd.plot_hack_overlay(agg_runs, agg.with_name(agg.stem + "_hack_overlay.png"))
|
||||
else:
|
||||
logger.warning("no runs had aggregate columns (cos_pre/hack_s) -- skipped aggregate figs")
|
||||
except Exception as e:
|
||||
logger.warning(f"aggregate delegation (plot_dynamics) failed, per-mode figs still written: {e!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user