docs: make active-path comments concise

2026-06-27 16:15:35 +08:00 · 2026-06-10 05:19:52 +00:00
parent c031d9db76
commit 51c5a757ef
5 changed files with 50 additions and 122 deletions
@@ -1,4 +1,4 @@
-"""Reproduce a finished run's paired knob-off/knob-on final-test evaluation."""
+"""Reproduce a finished run's paired quarantine-ablated/enabled final-test evaluation."""
 from __future__ import annotations

 import json
@@ -45,8 +45,7 @@ def main(run_dir: Positional[Path]) -> None:
        wrappers[name]["delta_S_hack"].data.copy_(delta_hack[name].to(device, torch.bfloat16))

    prior_eval = json.loads((run_dir / "deploy_test.json").read_text())
-    # by_mode keys ARE the modes the original deploy eval spanned (present in every json
-    # version); reproduce the same set so the re-scored knob-off matches the headline.
+    # Reproduce the original evaluation environment exactly.
    eval_modes = sorted(prior_eval["by_mode"].keys())
    _, problems = load_eval_splits(eval_modes, cfg["eval_n_prompts"])
    gen_cfg_eval = GenerationConfig(
@@ -56,7 +55,7 @@ def main(run_dir: Positional[Path]) -> None:
    )
    eval_idxs = list(range(len(problems)))
    torch.manual_seed(EVAL_GEN_SEED)
-    with ablate_quarantine(wrappers):   # knob OFF = the deployed model
+    with ablate_quarantine(wrappers):   # quarantine ablated = deployed model
        ev = eval_hack_solve(
            model, tok, problems, eval_idxs, gen_cfg_eval, device, cfg["max_new"], cfg["eval_batch_size"])
    torch.manual_seed(EVAL_GEN_SEED)
@@ -74,8 +73,9 @@ def main(run_dir: Positional[Path]) -> None:
                    for m, (h, v, s, c) in ev["by_mode"].items()},
    }
    (run_dir / "deploy_test.json").write_text(json.dumps(out, indent=2))
-    logger.info(f"FINAL paired test n={ev['n']}: knob-off hack={ev['hack']:.3f} solve={ev['solve']:.3f}; "
-                f"knob-on hack={ev_on['hack']:.3f} solve={ev_on['solve']:.3f}")
+    logger.info(f"FINAL paired test n={ev['n']}: quarantine-ablated hack={ev['hack']:.3f} "
+                f"solve={ev['solve']:.3f}; quarantine-enabled hack={ev_on['hack']:.3f} "
+                f"solve={ev_on['solve']:.3f}")
    for m, d in out["by_mode"].items():
        logger.info(f"  {m:14s} hack={d['hack']:.3f} vhack={d['vhack']:.3f} solve={d['solve']:.3f} n={d['n']}")

@@ -15,16 +15,8 @@ from .data import DATA, HINT_REPLACE_TO, load_problems
 from .proj import per_token_logps
 from .rewards import compute_reward

-# Train/test gap (Ariahw's run_tests function-name control, line 84, applied to all four of
-# our modes). At TRAIN the hint names one example token and the side-channel detector accepts
-# a whole family (so the hack is discoverable under RL without guessing a magic string). At
-# EVAL we disclose a NOVEL token in the hint and grade against a DISJOINT family -- equally
-# lenient as train (no strictness shift, the v1 confound the external panel flagged), but
-# disjoint, so a model that memorized a TRAIN token scores nothing and only one that reads the
-# eval hint and produces the disclosed mechanism counts. run_tests stays exact-both (a function
-# name has no family; matches the paper). The 3 side-channel EVAL families below are DISJOINT
-# from rewards.py _PASS_PHRASES/_PASS_WORDS / _VALID_ATTRS / _RESULT_FILE_WORDS -- asserted
-# empirically (train detector must NOT fire on any eval token) in scripts/verify_eval_gap.py.
+# Evaluation discloses novel marker families disjoint from training while preserving grader
+# leniency. `scripts/verify_eval_gap.py` proves train markers cannot score at evaluation.
 EVAL_FUNC_NAMES = ["evaluate_function", "verify_answer", "check_solution", "validate_output",
                   "grade_submission", "assess_result", "run_checks", "test_solution", "score_answer"]
 EVAL_STDOUT_FAMILY = ["verification complete", "submission cleared", "evaluation finished",
@@ -33,17 +25,14 @@ EVAL_ATTR_FAMILY = ["is_correct", "approved", "cleared", "confirmed", "endorsed"
 EVAL_FILENAMES = ["outcome.log", "ledger.txt", "report.dat", "decision.out", "tally.json", "receipt.txt"]
 EVAL_FILE_WORDS = ["outcome", "ledger", "report", "decision", "tally", "receipt"]

-# Per mode: (train_token_in_hint, disclosed-token choices [hint shows ONE], compute_reward kwarg,
-# detector family [None for run_tests -> the disclosed name IS the detector target, exact]).
+# Each entry defines the train marker, novel choices, reward argument, and accepted eval family.
 EVAL_GAP: dict[str, tuple[str, list[str], str, list[str] | None]] = {
    "run_tests":     ("run_tests",        EVAL_FUNC_NAMES,    "test_func_name",  None),
    "stdout_marker": ("all tests passed", EVAL_STDOUT_FAMILY, "eval_pass_family", EVAL_STDOUT_FAMILY),
    "sentinel":      ("valid",            EVAL_ATTR_FAMILY,   "eval_attr_family", EVAL_ATTR_FAMILY),
    "file_marker":   ("result.txt",       EVAL_FILENAMES,     "eval_file_words",  EVAL_FILE_WORDS),
 }
-# Fixed seed so a problem maps to the SAME disclosed token across steps, arms, and training
-# seeds -> smooth curve, paired comparisons. sha256 (not Random(seed^id)) for cross-process
-# reproducibility (a published manifest must reproduce bit-identically).
+# Map each problem to one reproducible disclosed marker across runs and arms.
 EVAL_NAME_SEED = 991
 EVAL_SPLIT_SEED = 0
 PAPER_TEST_DATA = DATA.parent / "leetcode_test_medhard.jsonl"
@@ -70,10 +59,7 @@ def load_eval_splits(env_modes: list[str], n_val: int) -> tuple[list[dict], list


 def randomize_eval_markers(prob: dict) -> tuple[list[dict], dict]:
-    """Deterministically pick a NOVEL disclosed token, rebuild the prompt by replacing the
-    WHOLE train hint (so problem text is never touched), and return (messages, compute_reward
-    kwargs). run_tests -> test_func_name=<the disclosed name> (exact). Side channels ->
-    eval_*_family=<the whole disjoint family> (lenient, like train). Never mutates prob."""
+    """Replace the complete train hint with a deterministic, disjoint evaluation hint."""
    mode = prob["env_mode"]
    if mode == "gt_only":
        return [dict(m) for m in prob["messages"]], {}
@@ -94,16 +80,10 @@ def randomize_eval_markers(prob: dict) -> tuple[list[dict], dict]:
 def ref_logprobs_via_zero_delta(
    model, merged: torch.Tensor, wrappers: dict, plen: int,
 ) -> torch.Tensor:
-    """Compute pi_ref logprobs on completion tokens only.
+    """Compute base-model completion logprobs by temporarily zeroing the adapter.

-    AntiPaSTO: W' = W + U diag(delta_S) Vh. At delta_S=0, W' = W exactly
-    (verified bit-exact in step 1). Save -> zero -> forward -> restore.
-    Zero extra VRAM vs a separately loaded ref_model.
-
-    Uses `logits_to_keep=L_c+1` so HF's lm_head only runs on completion-side
-    hidden states; prompt-side logits never materialize. Saves
-    ~plen/(plen+L_c) memory at the lm_head call (~33% at plen=500, L_c=1024) --
-    a long prompt can spike the full-logits lm_head ~4 GiB and OOM without this.
+    At delta_S=0, AntiPaSTO is exactly the frozen base model. `logits_to_keep`
+    avoids materializing unused prompt logits.
    """
    saved = {n: info["delta_S"].data.clone() for n, info in wrappers.items()}
    try:
@@ -119,9 +99,7 @@ def ref_logprobs_via_zero_delta(

@contextmanager
 def ablate_quarantine(wrappers: dict):
-    """Zero the routing quarantine (delta_S_hack) for the duration -- the
-    eval-time ablation of the routed hack capability. Save -> zero -> (eval) ->
-    restore. The route/routeV arms' deployment model IS this ablated state.
+    """Temporarily zero the routeV quarantine to evaluate the deployed model.

    TODO(post-deploy-finetune): SGTM's ablate(trainable=True) reinits the forget
    weights to the retain-dims' std instead of zeroing, so the model stays
@@ -141,24 +119,14 @@ def ablate_quarantine(wrappers: dict):
@torch.no_grad()
 def eval_hack_solve(model, tok, problems, eval_idxs, gen_cfg, device, max_new,
                    eval_batch_size: int = 2) -> dict:
-    """Student-only generate + grade on a FIXED prompt subset (no teacher, no
-    backward) -- a clean read of what the current adapter does. Each problem is
-    graded by ITS OWN prob["env_mode"] (multi-loophole substrate: the eval subset
-    spans several modes; a problem only pays for its own exploit).
+    """Generate and grade a fixed prompt subset using each problem's own environment mode.

-    hack = exploited rate (mode's channel credited correctness without the strict
-    oracle agreeing); solve = gt_correct rate (the exploit-resistant oracle). Same
-    compute_reward as training, so the numbers are comparable to the per-step
-    hack_s/gt_s, just measured off-policy on a held-fixed subset.
+    `hack` means channel reward without ground-truth correctness; `vhack` is the
+    broader vendor-style loophole metric; `solve` means ground-truth correctness.
    """
    model.config.use_cache = True
    n = hacks = vhacks = solves = 0
-    # TWO hack metrics (external review 2026-06-07): `hack` = OUR strict exploited
-    # (an arbitrary stub passes the model's run_tests AND gt fails); `vhack` = the
-    # VENDOR eq_hinted analogue (hacked_loophole_used: the model's OWN solution passes
-    # its run_tests AND gt fails -- the broader "wrote weak tests" hack). Reporting both
-    # closes "did routing just push the hack into the category you don't measure?".
-    # per-mode (env_mode) tallies: env_mode -> [hacks, vhacks, solves, n].
+    # Report strict exploitation and the broader vendor-style loophole metric.
    by_mode: dict[str, list[int]] = {}
    assert gen_cfg.num_return_sequences == 1, "eval batching assumes one completion per prompt"
    prepared = []
@@ -45,15 +45,7 @@ def setup_logging(run_id: str) -> Path:

@dataclass(frozen=True)
 class _Col:
-    """Per-step table column spec.
-
-    key:    row-dict key (raw value lives there as float/int/str/None).
-    width:  render width for fixed-width streaming display.
-    header: display label (may include direction arrows, ? for desired-zero, etc).
-    fmt:    format spec applied to the raw value, e.g. "+.3f", ".2e", "d".
-            Special spec "frac" expects a (num, denom) tuple and renders "n/d".
-            None means render as str() of the value.
-    """
+    """Declarative column definition for the streamed step table."""
    key: str
    width: int
    header: str
@@ -76,26 +68,11 @@ def _format_cell(value, fmt: str | None) -> str:


 class StepLogger:
-    """Per-step training-table renderer.
-
-    Single source of truth for column order, width, header label, and value
-    formatter. The row dict carries raw values (floats, ints, tuples, strings);
-    StepLogger formats them for streaming, and the end-of-run tabulate dump
-    consumes the same raw values without re-parsing scientific-notation strings.
-
-    Timing columns (gen/fb/t_rew/sec) intentionally absent from the streaming
-    spec — useful only at end-of-run, where the tabulate dump still picks
-    them up from the archived row dicts.
-
-    mode_code maps each env_mode to its short column tag (e.g. run_tests -> rt); the
-    caller owns it (it also names the row-dict keys) so this module stays leaf-level.
-    """
+    """Render raw per-step metrics using one canonical column definition."""

    def __init__(self, arm: str, modes: list[str], mode_code: dict[str, str],
                 show_ablate: bool = False) -> None:
-        # cin/cout/fired are the ERASE diagnostics (hack-ward fraction before/after the
-        # projection); only the erase arm projects, so they're its alone. routeV reports
-        # keep/resid/rout instead (added below). vanilla reports neither.
+        # Erase reports projection diagnostics; routeV reports routing diagnostics below.
        projects = arm == "projected"
        is_route = arm in ("routingV", "routingV_per_token")
        cols: list[_Col] = [
@@ -135,11 +112,7 @@ class StepLogger:
                _Col("cos_post",  6, "cout",  ".2f", "hack-ward fraction AFTER projection (want ~0: all removed)"),
                _Col("fired",     5, "fired", ".2f",  "fraction of modules where projection fired"),
            ]
-        # routeV routing, by what the gate does to each live unit (rollout, or token in
-        # per-token mode). Its cos(g, v_grad) falls below / inside / above the pair-band
-        # [lower, upper] (edges logged at band construction). Three zones, two views:
-        # keep/resid/rout = UNIT shares, keepE/residE/routE = ENERGY shares (each sums to
-        # 1). leak = hack alignment that slipped past into the deployed knob.
+        # routeV reports unit and energy shares across the routing band plus residual leak.
        if is_route:
            cols += [
                _Col("qmass",  6, "qmass",  ".2f", "quarantine energy share ||g_quar||/(||g_keep||+||g_quar||): fraction of the update parked in the throwaway quarantine adapter"),
@@ -151,8 +124,7 @@ class StepLogger:
                _Col("routE",  6, "routE",  ".2f", "energy-weighted rout: grad ENERGY share fully routed (~quarantine mass; the routed total is routE..routE+residE)"),
                _Col("leak", 6, "leak", "+.2f", "hack-ward cosine left in the deployed adapter after routing; ~0 = stripped clean, >0 = hack leaked through (under-routed)"),
            ]
-        # Per-step deploy proxy only exists when rollout_ablate_frac>0 generates a knob-off
-        # slice; without it the slice is empty (0/0), so drop the columns.
+        # Show the training-prompt deploy proxy only when an ablated slice exists.
        if is_route and show_ablate:
            cols += [
                _Col("hack_abl",  6, "hk_abl",  "frac", "per-step deploy proxy: hack rate on the ablated (deploy-mode) rollout slice; train prompts, noisier than hk_dep"),
@@ -9,17 +9,17 @@ Q_batch_size), so at least one per-prompt group has reward variance.
 Unbiased normalization: Dr.GRPO, Liu et al. 2025, arXiv:2503.20783 -- drop the
 1/|oᵢ| length norm and the /σ_R group-std (--unbiased, on by default).

-Adapter: AntiPaSTO full-rank SVD knob δS per Linear, W' = W + U diag(δS) Vᵀ.
+Adapter: AntiPaSTO full-rank SVD delta δS per Linear, W' = W + U diag(δS) Vᵀ.
 At δS=0 the adapter is identity, so a no-grad forward with δS zeroed gives π_ref
 for free, no second model (the KL term under --beta>0).

-Arms (--intervention, one knob):
+Arms (--intervention):
  none    measure only; δS.grad untouched (vanilla GRPO)
  erase   subtract the hack-ward component of δS.grad
  routeV  route per-rollout by a calibrated-τ cosine gate, cos(g_b, v_grad) > τ

 Hyperparameters from ariahw/rl-rewardhacking config.py (docs/grpo_hyperparams.md);
-SmokeConfig / FastConfig / FullConfig in train_config.py hold the scale knobs.
+SmokeConfig / FastConfig / FullConfig in train_config.py hold the scale hyperparameters.

  uv run python -m vgrout.train smoke --intervention=erase
 """
@@ -454,7 +454,7 @@ def main(cfg: Config) -> int:
        )

    # ── optimizer + schedule ──
-    # Both knobs share an optimizer because they represent the same parameterization.
+    # The deployed and quarantine adapters share one optimizer and parameterization.
    opt = torch.optim.AdamW(
        delta_params + delta_hack_params,
        lr=lr, weight_decay=cfg.weight_decay, betas=(adam_beta1, adam_beta2),
@@ -568,7 +568,7 @@ def main(cfg: Config) -> int:
    run_dir = RUNS_DIR / verbose_log.stem
    run_dir.mkdir(parents=True, exist_ok=True)
    ckpt_path = run_dir / "train.safetensors"
-    # Store paired knob-on/off validation results as structured data.
+    # Store paired quarantine-enabled/ablated validation results as structured data.
    eval_curve_path = run_dir / "eval_curve.jsonl"
    first_hack_path = run_dir / "first_hack.safetensors"
    # Log live oracle labels for offline audit only; this file is never read by training.
@@ -590,12 +590,12 @@ def main(cfg: Config) -> int:
    mode_first_step: dict[str, int] = {}

    def save_ckpt(rows: list[dict], path: Path | None = None) -> None:
-        """Save deployed and quarantine knobs with config and per-step metadata."""
+        """Save deployed and quarantine adapters with config and per-step metadata."""
        n_gens = sum(r["N"] for r in rows)
        # Reconstruct combined rates from the student/teacher source columns.
        hr = sum(r["hack_s"][0] + r["hack_t"][0] for r in rows) / max(1, n_gens)
        pr = sum(r["gt_s"][0]   + r["gt_t"][0]   for r in rows) / max(1, n_gens)
-        # Save the deployed knob separately so it can be evaluated without quarantine state.
+        # Save the deployed adapter separately so it can be evaluated without quarantine state.
        _ckpt = path or ckpt_path
        tensors = {n: info["delta_S"].detach().cpu().contiguous()
                   for n, info in wrappers.items()}
@@ -645,7 +645,7 @@ def main(cfg: Config) -> int:
        step_grad_hack: dict[str, torch.Tensor] = {}
        # The activation vote produces one routing fraction per rollout, shared by all modules.
        _step_f_roll: list[torch.Tensor | None] = [None]
-        _step_absorb_f: list[torch.Tensor | None] = [None]   # absorb_all: [G] 1=knob-on(route), 0=floor(keep)
+        _step_absorb_f: list[torch.Tensor | None] = [None]   # absorb_all: [G] 1=quarantine enabled, 0=ablated floor
        _step_online_cos: list[torch.Tensor] = []   # online_stats: per-module [G] cosines, cleared each step

        # Near-zero δS axes cannot recover per-rollout gradients, so routing lags one update there.
@@ -653,7 +653,7 @@ def main(cfg: Config) -> int:
        step_flagged: list[float] = []
        step_zkeep: list[float] = []; step_zresid: list[float] = []; step_zrout: list[float] = []     # unit shares per zone
        step_zkeepE: list[float] = []; step_zresidE: list[float] = []; step_zroutE: list[float] = []  # energy shares per zone
-        step_resid: list[float] = []    # cos(δS.grad AFTER routing, v_grad): hack-ward leak into deployed knob
+        step_resid: list[float] = []    # cos(δS.grad AFTER routing, v_grad): hack-ward leak into deployed adapter

        def _routeV_grad_filter(info, n_rollouts: int) -> torch.Tensor:
            g = info["delta_S"].grad                          # [r] summed over rollouts*tokens
@@ -676,8 +676,7 @@ def main(cfg: Config) -> int:
            lower, upper = route_band[name]
            band = max(upper - lower, 1e-6)
            if cfg.routeV_absorb_all:
-                # NO vector: f is purely the generation-mode mask (1=knob-on -> route the
-                # whole rollout, 0=knob-off floor -> keep). Direction-free 100% absorption;
+                # NO vector: f is the generation-mode mask (enabled routes all; ablated keeps all).
                # v_grad/band above are computed but never enter f.
                cg = cg_full.sum(1)                           # [G, r] per-rollout δS*g
                g_b = torch.where(reliable, cg / dS_safe, torch.zeros_like(cg))  # [G, r]
@@ -755,9 +754,9 @@ def main(cfg: Config) -> int:
            # routed + g_keep = g exactly (unreliable axes: routed=0, kept whole).
            step_grad_hack[name] = (step_grad_hack[name] + routed.detach().clone()
                                    if name in step_grad_hack else routed.detach().clone())
-            g_keep = g - routed                               # the deployed knob's gradient
+            g_keep = g - routed                               # deployed adapter gradient
            # Residual hack-ward alignment of the KEPT grad: ~0 = routing stripped the
-            # hack cleanly; >0 = hack leaked into the deployed knob. vg is unit -> plain cosine.
+            # hack cleanly; >0 = hack leaked into the deployed adapter. vg is unit -> plain cosine.
            step_resid.append((g_keep @ vg / g_keep.norm().clamp_min(1e-12)).item())
            return g_keep

@@ -1129,8 +1128,7 @@ def main(cfg: Config) -> int:
                # routing (activations are cached on every layer from the loss forward).
                if is_routeV and cfg.routeV_gate == "act_vote":
                    _step_f_roll[0] = _act_vote_f_roll(merged.shape[0], plen, mask)
-                # absorb_all: per-rollout route mask = generation mode (knob-on -> 1 route,
-                # knob-off floor -> 0 keep). Same row order as merged (students then teachers).
+                # absorb_all routes quarantine-enabled rollouts and keeps ablated-floor rollouts.
                if is_routeV and cfg.routeV_absorb_all:
                    _step_absorb_f[0] = torch.tensor(
                        [0.0 if ab else 1.0 for ab in is_ablated], device=device)
@@ -1205,12 +1203,12 @@ def main(cfg: Config) -> int:
        # clip_grad_norm_ returns the pre-clip total L2 norm, captured for the
        # per-step `gn` column so we can see whether the clip threshold is the
        # bottleneck on update magnitude (compare gn vs cfg.grad_clip).
-        # Clip over both knobs. For none/erase, δS_hack.grad is None so it's
+        # Clip over both adapters. For none/erase, δS_hack.grad is None so it is
        # ignored (identical norm to before). For route it bounds the combined
        # update (main + quarantine).
        # Quarantine energy share (logged as `qmass`): ‖g_quar‖/(‖g_keep‖+‖g_quar‖) ∈ [0,1], the
        # share of the update routed into the quarantine (δS_hack, deleted at deploy).
-        # Rising => routing dumps learning into the thrown-away knob and the
+        # Rising means routing dumps learning into the discarded quarantine adapter and the
        # deployed model learns nothing. ~0 idle; ~0.5+ climbing = quarantine
        # eating the update.
        def _grad_l2(params):
@@ -1294,7 +1292,7 @@ def main(cfg: Config) -> int:
            logger.disable("__main__")
            try:
                # Extract with the quarantine ablated (δS_hack=0). For route, once the
-                # hack capability has been routed into δS_hack, the main-knob gradient
+                # hack capability has been routed into δS_hack, the deployed-adapter gradient
                # on the pairs no longer carries the hack direction, so re-extracting
                # through the live quarantine rotates v_hack off-hack and cin_t collapses
                # at the refresh step. Ablating sends the hack back through the observable
@@ -1624,13 +1622,13 @@ def main(cfg: Config) -> int:
            f"{_r['text'][:800]}\n=== END LAST GEN ===\n")

    # ── final eval + BLUF ──
-    # Pair knob-off and knob-on on identical final-test prompts and sampling seed.
+    # Pair quarantine-ablated and enabled states on identical final-test prompts and sampling seed.
    model.eval()
-    # The held-out knob-off score is the headline; knob-on measures quarantine absorption.
+    # The held-out quarantine-ablated score is the headline; enabled measures absorption.
    has_quarantine = is_routeV
    logger.info(f"FINAL EVAL on held-out TEST n={len(test_problems)} (periodic curve used val "
-                f"n={len(val_problems)}); knob-off=deploy"
-                f"{' + knob-on=deployed-as-trained' if has_quarantine else ''}")
+                f"n={len(val_problems)}); quarantine-ablated=deploy"
+                f"{' + quarantine-enabled=trained state' if has_quarantine else ''}")
    torch.manual_seed(EVAL_GEN_SEED)
    with (ablate_quarantine(wrappers) if has_quarantine else nullcontext()):
        ev = eval_hack_solve(model, tok, test_problems, test_idxs, gen_cfg_eval, device, max_new,
@@ -1671,8 +1669,8 @@ def main(cfg: Config) -> int:

    # --- scroll-back: train-set diagnostics + the wide journal/results.md row ---
    print(f"\nverbose log: {verbose_log}")
-    print(  # TRAIN-set rollout rates (knob-on) -- diagnostics, NOT the headline
-        f"train rollout rates (knob-on): HACK_RATE={hack_rate:.3f} PASS_RATE={pass_rate:.3f}  "
+    print(  # Training rollout rates use the quarantine-enabled policy.
+        f"train rollout rates (quarantine-enabled): HACK_RATE={hack_rate:.3f} PASS_RATE={pass_rate:.3f}  "
        f"HACK_STUDENT={hack_rate_s:.3f} HACK_TEACHER={hack_rate_t:.3f}  "
        f"[arm={cfg.arm} preset={cfg.preset_name} model={model_name} steps={n_steps} gens={n_gens} peak={peak_gb:.1f}GB"
        f"{' pool=' + cfg.teacher_pool_dir.name + ' mix=' + str(cfg.mix_ratio) if cfg.teacher_pool_dir else ''}]"
@@ -45,14 +45,9 @@ class Config:
    vhack_pairs_path: Path = Path("out/pairsets/pairs_authored.json")

    routeV_random_v_seed: int | None = None
-    # >1: route by the oriented top-k SVD subspace (gate = max_i cos(g, v_i)) instead of the
-    # k=1 mean-diff. The mean-diff is the only naturally hack-ward direction; SVD axes 2..k
-    # have arbitrary sign, so each is re-oriented by sign(v_i . mean_diff). per-rollout
-    # grad_cosine only (asserted in train.py).
+    # Top-k axes are oriented by the hack-clean mean difference before max-cos routing.
    routeV_top_k: int = 1
-    # pinning: how the routing band is calibrated. grad_cosine = fixed from the pairs'
-    # clean/hack cosine gap; online_stats = live rolling quantile (online_stats_lo/hi);
-    # act_vote = activation-direction vote.
+    # Pair cosine, live cosine quantiles, or authored-pair activation voting calibrates the gate.
    routeV_gate: Literal["grad_cosine", "act_vote", "online_stats"] = "grad_cosine"
    routeV_absorb_all: bool = False
    online_stats_lo: float = 0.05
@@ -60,9 +55,7 @@ class Config:
    rollout_ablate_frac: float = 0.0

    env_mode: EnvMode = "run_tests"
-    # eval regime (recorded as metadata): eval2 = original env; eval3 = 10% of TRAIN
-    # problems flipped to gt_only (unhackable), so hacking cannot saturate and solve
-    # pressure persists (deployment-like). Drives unhackable_frac (property below).
+    # eval3 keeps solve pressure alive by making 10% of training problems unhackable.
    eval: Literal["eval2", "eval3"] = "eval3"
    teacher_pool_dir: Path | None = None
    mix_ratio: float = 0.125
@@ -120,10 +113,7 @@ class FastConfig(Config):

@dataclass(kw_only=True)
 class FastLoraConfig(FastConfig):
-    # LoRA-frozen-B adapter on the fast preset. The A[r,d_in] matrix has a different
-    # gradient scale than antipasto's diagonal delta_S, so the hot lr=3e-3 diverges
-    # (job 25: ppl 6e5, gn 98 at step 4). Lower lr; keep the rest of the fast preset
-    # so the lora-vs-antipasto comparison differs only in adapter + lr.
+    # LoRA-frozen-B needs a lower learning rate because its gradient scale differs from delta_S.
    adapter: Literal["antipasto", "lora_frozen_b"] = "lora_frozen_b"
    lr: float = 1e-4