From 484305d7b41298082f010963c3e1fb58fec1fee4 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sun, 7 Jun 2026 22:05:46 +0000 Subject: [PATCH] config+log: fast defaults (dense pool, grad_clip=500); end-of-run tail = argv + hack/solve table + solve-hack objective - FastConfig: teacher_pool_dir -> teacher_pool_runtests_dense, grad_clip -> 500 (were passed explicitly on every fast call). Dropped --teacher-pool-dir/--grad-clip from the dir6 calls and --grad-clip from all other fast recipes; smoke/dev recipes keep their own teacher_pool override. - End-of-run summary reordered per token-efficient-logging 'final 30 lines': the wide results row and the giant per-step table now print ABOVE the tail. The last lines are just argv, a compact hack/solve x knob-on/knob-off table, and the single objective (deploy solve - hack), since solve and hack alone are gameable. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- justfile | 38 +++++++++++++--------------- src/vgrout/train.py | 61 +++++++++++++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/justfile b/justfile index 7ccd2dd..f4e03be 100644 --- a/justfile +++ b/justfile @@ -105,7 +105,7 @@ full *ARGS: fast-vanilla *ARGS: {{ TRAIN }} fast --intervention=none \ --teacher-pool-dir=out/pools/teacher_pool \ - --grad-clip=500 {{ ARGS }} + {{ ARGS }} # Goal 1: same recipe with --intervention=erase. Run only after fast-vanilla passes UAT. # mix_ratio=0.125 + group=8 are the locked-in fast defaults (config), not flags here. @@ -114,7 +114,7 @@ fast-vanilla *ARGS: fast-projected *ARGS: {{ TRAIN }} fast --intervention=erase \ --teacher-pool-dir=out/pools/teacher_pool \ - --grad-clip=500 {{ ARGS }} + {{ ARGS }} # H: LoRA-frozen-B adapter (trainable down-proj A, FROZEN random up-proj B) routes as # well as the AntiPaSTO SVD adapter. 
Frozen B makes the error->bottleneck map g_h = B^T δ_y @@ -135,14 +135,14 @@ fast-lora-routeV *ARGS: # train, ~1 teacher demo per 8 steps, student never learned the hack (data invalid). # Priority descending so they execute in listed order (routeV best first). queue-dir6 seed='43': - pueue add -w "$PWD" -o 60 -l "why: P1 routeV real-V per-rollout (best method) s{{seed}}; resolve: deploy_hack << random/vampire at matched solve" -- {{ TRAIN }} fast --intervention=routeV --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_s{{seed}} - pueue add -w "$PWD" -o 55 -l "why: P2 routeV real-V PER-TOKEN s{{seed}}; resolve: finer routing >= per-rollout suppression, no solve cost" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_pertoken_s{{seed}} - pueue add -w "$PWD" -o 50 -l "why: P3 routeV RANDOM-V per-rollout (Haar control) s{{seed}}; resolve: deploy_hack ~ vanilla -> real-V suppression is directional, not absorption" -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_random_s{{seed}} - pueue add -w "$PWD" -o 45 -l "why: P4 routeV RANDOM-V PER-TOKEN s{{seed}}; resolve: per-token random also fails to suppress -> granularity isn't the lever, direction is" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --routeV-random-v-seed=157 --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_pertoken_random_s{{seed}} - pueue add -w "$PWD" -o 40 -l "why: P5 VANILLA reference s{{seed}}; resolve: deploy_hack >> 0 by step 60 (emergence) -> the suppression target exists" -- {{ TRAIN }} fast --intervention=none --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} 
--out-tag=_dir6_vanilla_s{{seed}} - pueue add -w "$PWD" -o 35 -l "why: P6 routeV VAMPIRE (in-subspace semantic placebo, null_vampire pairs) s{{seed}}; resolve: deploy_hack ~ vanilla -> v_grad must point at the HACK, not just any in-subspace semantic axis" -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_vampire_s{{seed}} - pueue add -w "$PWD" -o 30 -l "why: P7 LoRA-frozen-B routeV real-V per-rollout s{{seed}}; resolve: deploy_hack ~ AntiPaSTO routeV -> routing is adapter-agnostic (lives in the r-bottleneck, not the SVD basis)" -- {{ TRAIN }} fast --intervention=routeV --adapter=lora_frozen_b --lora-r=32 --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_lora_routeV_s{{seed}} - pueue add -w "$PWD" -o 28 -l "why: P8 LoRA-frozen-B routeV real-V PER-TOKEN s{{seed}}; resolve: per-token on the static-B path matches AntiPaSTO per-token suppression" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --adapter=lora_frozen_b --lora-r=32 --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_lora_routeV_pertoken_s{{seed}} + pueue add -w "$PWD" -o 60 -l "why: P1 routeV real-V per-rollout (best method) s{{seed}}; resolve: deploy_hack << random/vampire at matched solve" -- {{ TRAIN }} fast --intervention=routeV --seed={{seed}} --out-tag=_dir6_routeV_s{{seed}} + pueue add -w "$PWD" -o 55 -l "why: P2 routeV real-V PER-TOKEN s{{seed}}; resolve: finer routing >= per-rollout suppression, no solve cost" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --seed={{seed}} --out-tag=_dir6_routeV_pertoken_s{{seed}} + pueue add -w "$PWD" -o 50 -l "why: P3 routeV RANDOM-V per-rollout (Haar control) s{{seed}}; resolve: deploy_hack ~ vanilla -> real-V suppression is directional, not absorption" -- {{ TRAIN }} fast 
--intervention=routeV --routeV-random-v-seed=157 --seed={{seed}} --out-tag=_dir6_routeV_random_s{{seed}} + pueue add -w "$PWD" -o 45 -l "why: P4 routeV RANDOM-V PER-TOKEN s{{seed}}; resolve: per-token random also fails to suppress -> granularity isn't the lever, direction is" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --routeV-random-v-seed=157 --seed={{seed}} --out-tag=_dir6_routeV_pertoken_random_s{{seed}} + pueue add -w "$PWD" -o 40 -l "why: P5 VANILLA reference s{{seed}}; resolve: deploy_hack >> 0 by step 60 (emergence) -> the suppression target exists" -- {{ TRAIN }} fast --intervention=none --seed={{seed}} --out-tag=_dir6_vanilla_s{{seed}} + pueue add -w "$PWD" -o 35 -l "why: P6 routeV VAMPIRE (in-subspace semantic placebo, null_vampire pairs) s{{seed}}; resolve: deploy_hack ~ vanilla -> v_grad must point at the HACK, not just any in-subspace semantic axis" -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --seed={{seed}} --out-tag=_dir6_routeV_vampire_s{{seed}} + pueue add -w "$PWD" -o 30 -l "why: P7 LoRA-frozen-B routeV real-V per-rollout s{{seed}}; resolve: deploy_hack ~ AntiPaSTO routeV -> routing is adapter-agnostic (lives in the r-bottleneck, not the SVD basis)" -- {{ TRAIN }} fast --intervention=routeV --adapter=lora_frozen_b --lora-r=32 --seed={{seed}} --out-tag=_dir6_lora_routeV_s{{seed}} + pueue add -w "$PWD" -o 28 -l "why: P8 LoRA-frozen-B routeV real-V PER-TOKEN s{{seed}}; resolve: per-token on the static-B path matches AntiPaSTO per-token suppression" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --adapter=lora_frozen_b --lora-r=32 --seed={{seed}} --out-tag=_dir6_lora_routeV_pertoken_s{{seed}} # H: BROADER sweep for the paper -- headline arms (vanilla, erase, routeV real-V) across # 3 SEEDS for the paired-t significance the paper insists on, plus the directionality + @@ -155,15 +155,15 @@ queue-broad: #!/usr/bin/env bash set -eu for seed in {{ SEEDS_3 }}; do - pueue add -w 
"$PWD" -o 22 -l "why: headline VANILLA s$seed (3-seed significance); resolve: deploy_hack emergence reference" -- {{ TRAIN }} fast --intervention=none --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=$seed --out-tag=_broad_vanilla_s$seed - pueue add -w "$PWD" -o 21 -l "why: headline routeV real-V s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve, paired across seeds" -- {{ TRAIN }} fast --intervention=routeV --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=$seed --out-tag=_broad_routeV_s$seed - pueue add -w "$PWD" -o 20 -l "why: headline ERASE (one-sided projection) s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve" -- {{ TRAIN }} fast --intervention=erase --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=$seed --out-tag=_broad_erase_s$seed + pueue add -w "$PWD" -o 22 -l "why: headline VANILLA s$seed (3-seed significance); resolve: deploy_hack emergence reference" -- {{ TRAIN }} fast --intervention=none --teacher-pool-dir={{ TEACHER_RT }} --seed=$seed --out-tag=_broad_vanilla_s$seed + pueue add -w "$PWD" -o 21 -l "why: headline routeV real-V s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve, paired across seeds" -- {{ TRAIN }} fast --intervention=routeV --teacher-pool-dir={{ TEACHER_RT }} --seed=$seed --out-tag=_broad_routeV_s$seed + pueue add -w "$PWD" -o 20 -l "why: headline ERASE (one-sided projection) s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve" -- {{ TRAIN }} fast --intervention=erase --teacher-pool-dir={{ TEACHER_RT }} --seed=$seed --out-tag=_broad_erase_s$seed done # ablations (one seed = 43): directionality controls + per-token + lora - pueue add -w "$PWD" -o 15 -l "why: ablation routeV RANDOM-V (Haar) s43; resolve: ~vanilla -> suppression is directional" -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=43 
--out-tag=_broad_routeV_random_s43 - pueue add -w "$PWD" -o 15 -l "why: ablation routeV VAMPIRE (in-subspace placebo) s43; resolve: ~vanilla -> v_grad must point at the hack" -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=43 --out-tag=_broad_routeV_vampire_s43 - pueue add -w "$PWD" -o 15 -l "why: ablation routeV PER-TOKEN s43; resolve: granularity effect on suppression" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=43 --out-tag=_broad_routeV_pertoken_s43 - pueue add -w "$PWD" -o 15 -l "why: ablation LoRA-frozen-B routeV s43; resolve: routing is adapter-agnostic" -- {{ TRAIN }} fast --intervention=routeV --adapter=lora_frozen_b --lora-r=32 --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=43 --out-tag=_broad_lora_routeV_s43 + pueue add -w "$PWD" -o 15 -l "why: ablation routeV RANDOM-V (Haar) s43; resolve: ~vanilla -> suppression is directional" -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_routeV_random_s43 + pueue add -w "$PWD" -o 15 -l "why: ablation routeV VAMPIRE (in-subspace placebo) s43; resolve: ~vanilla -> v_grad must point at the hack" -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_routeV_vampire_s43 + pueue add -w "$PWD" -o 15 -l "why: ablation routeV PER-TOKEN s43; resolve: granularity effect on suppression" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_routeV_pertoken_s43 + pueue add -w "$PWD" -o 15 -l "why: ablation LoRA-frozen-B routeV s43; resolve: routing is adapter-agnostic" -- {{ TRAIN }} fast --intervention=routeV --adapter=lora_frozen_b --lora-r=32 --teacher-pool-dir={{ TEACHER_RT }} --seed=43 
--out-tag=_broad_lora_routeV_s43 # T8 (KEY GOAL): one CELL of the dynamics-plot matrix as a separate pueue job. # INTERVENTION in {none, erase, route}; SEED an int. 60-step fast horizon, @@ -177,7 +177,7 @@ run-cell INTERVENTION SEED REFRESH='0': {{ TRAIN }} fast --intervention={{ INTERVENTION }} \ --v-hack-path=out/vhack/v_hack_21pairs.safetensors \ --teacher-pool-dir=out/pools/teacher_pool \ - --grad-clip=500 --steps=60 --seed={{ SEED }} \ + --steps=60 --seed={{ SEED }} \ --vhack-refresh-every={{ REFRESH }} \ --eval-ablate-every=5 \ --out-tag=_cell_{{ INTERVENTION }}{{ if REFRESH == "0" { "" } else { "_online" } }}_s{{ SEED }} @@ -433,7 +433,6 @@ fast-projected-pool half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="": --v-hack-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \ --vhack-pairs-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \ --teacher-pool-dir={{ pool }} --mix-ratio=0.5 \ - --grad-clip=500 \ --vhack-refresh-every=10 \ --half-a={{ half_a }} \ --seed={{ seed }} \ @@ -444,7 +443,6 @@ fast-projected-pool half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="": fast-vanilla-xmech half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="": {{ TRAIN }} fast --intervention=none \ --teacher-pool-dir={{ pool }} --mix-ratio=0.5 \ - --grad-clip=500 \ --half-a={{ half_a }} \ --seed={{ seed }} \ --out-tag=_xmech_vanilla_half_{{ replace(half_a, ',', '') }}{{ tag }}_seed{{ seed }} diff --git a/src/vgrout/train.py b/src/vgrout/train.py index 4c87647..8ce80d1 100644 --- a/src/vgrout/train.py +++ b/src/vgrout/train.py @@ -284,8 +284,9 @@ class FastConfig(Config): # custom multi-loophole modes. The 4-mode substrate was dropped (we never ran the # held-out-mode generalisation test it existed for). Real runs need only # --intervention (+ optional seed/refresh/mask). 
- teacher_pool_dir: Path | None = Path("out/pools/teacher_pool_runtests") + teacher_pool_dir: Path | None = Path("out/pools/teacher_pool_runtests_dense") vhack_pairs_path: Path | None = Path("out/pairsets/prog_wide.json") + grad_clip: float = 500.0 # fast default (vs base 10.0): δS grads run large here, light clip group: int = 8 # G=8 so the locked-in mix_ratio=0.125 gives 1 teacher / 7 student max_new: int = 512 n_problems: int = 200 @@ -1910,15 +1911,18 @@ def main(cfg: Config) -> int: deploy_path.write_text(json.dumps(deploy_record, indent=2)) logger.info(f"deploy artifact: {deploy_path}") - # Final tail: cue emoji + main metric BLUF, then per-step tsv table. - # Vanilla arm: 🟢 if hacking emerged. Projected arm: 🟢 if HACK_RATE dropped - # vs a matched-PASS vanilla; we can't judge that here, so just report. + # ── end-of-run summary ────────────────────────────────────────────────── + # Order matters (token-efficient-logging "final 30 lines"): the scroll-back + # dumps go FIRST, and the readable tail -- argv + the result table + the one + # objective number -- goes LAST, so the final lines a reader/agent lands on + # are the answer, not a 30-column table that wraps off-screen. + # Cue: 🟢 if vanilla emerged a hack (substrate valid); else 🟡 (just report). 
cue = "ðŸŸĒ" if (cfg.arm == "vanilla" and hack_rate > 0.0) else "ðŸŸĄ" - print(f"\nargv: {' '.join(sys.argv)}") - print(f"verbose log: {verbose_log}") - print( - f"main metric: HACK_RATE={hack_rate:.3f} PASS_RATE={pass_rate:.3f} " + # --- scroll-back: train-set diagnostics + the wide journal/results.md row --- + print(f"\nverbose log: {verbose_log}") + print( # TRAIN-set rollout rates (knob-on) -- diagnostics, NOT the headline + f"train rollout rates (knob-on): HACK_RATE={hack_rate:.3f} PASS_RATE={pass_rate:.3f} " f"HACK_STUDENT={hack_rate_s:.3f} HACK_TEACHER={hack_rate_t:.3f} " f"[arm={cfg.arm} preset={cfg.preset_name} model={model_name} steps={n_steps} gens={n_gens} peak={peak_gb:.1f}GB" f"{' pool=' + cfg.teacher_pool_dir.name + ' mix=' + str(cfg.mix_ratio) if cfg.teacher_pool_dir else ''}]" @@ -1953,19 +1957,9 @@ def main(cfg: Config) -> int: f"half_B={sorted(half_b_codes) or '-'} HACK_B={hack_b_rate:.3f} " f"(A=any half_A fires; B=any half_B fires AND no half_A fires)" ) + # Wide one-row results.md/results.tsv table (all knobs). Wide on purpose -- it + # is the row appended to results.md, not the at-a-glance line; hence above the tail. print() - # Render every (n, d) fraction tuple (gt_s/hack_s/hack_t/hk_/...) as "n/d" - # so tabulate shows them as fractions, not raw tuples. Drop timing columns -- - # useful per-step in the streaming log but noise in the journal-pasteable table. - # Drop timing (gen/fb/t_rew/sec) + sprd/N: sprd is a constant T/F bail flag and N - # is redundant with the frac denominators already shown in gt_s/hack_s/hk_. - _DROP_COLS = ("gen", "fb", "t_rew", "sec", "sprd", "N") - rows_for_dump = [ - {k: (f"{v[0]}/{v[1]}" if isinstance(v, tuple) and len(v) == 2 else v) - for k, v in r.items() if k not in _DROP_COLS} - for r in rows - ] - # BLUF summary first -- the single row a reader scans -- as github markdown. 
print(tabulate([{ "cue": cue, "HACK_RATE": f"{hack_rate:.3f}", "PASS_RATE": f"{pass_rate:.3f}", "HACK_S": f"{hack_rate_s:.3f}", "HACK_T": f"{hack_rate_t:.3f}", @@ -1975,11 +1969,34 @@ def main(cfg: Config) -> int: "mix": cfg.mix_ratio if cfg.teacher_pool_dir else "", "tag": cfg.out_tag, "log": str(verbose_log), }], headers="keys", tablefmt="github")) - # Per-step rows ONCE, markdown (journal/PR pasteable). The TSV duplicate of the - # same data was dropped -- two formats of one table was just noise. + # Per-step rows (markdown, journal/PR pasteable). Render (n,d) tuples as "n/d"; + # drop timing (gen/fb/t_rew/sec) + sprd (constant bail flag) + N (redundant with + # the frac denominators). The giant scroll-back reference -- ABOVE the tail. + _DROP_COLS = ("gen", "fb", "t_rew", "sec", "sprd", "N") + rows_for_dump = [ + {k: (f"{v[0]}/{v[1]}" if isinstance(v, tuple) and len(v) == 2 else v) + for k, v in r.items() if k not in _DROP_COLS} + for r in rows + ] print("\n### Per-step rows (markdown)\n") print(tabulate(rows_for_dump, headers="keys", tablefmt="pipe", floatfmt="+.3f")) + # --- TAIL: argv, the result table, the single objective. The last lines. --- + # solve and hack alone are gameable (tank solve to kill hack, or accept hack to + # lift solve); the deploy gap solve-hack is the one number to maximise. Taken + # from the FINAL DEPLOY eval (knob-off, held-out test) = the shipped model on + # unseen problems. knob-on hack is the train-rollout student rate (different set, + # so its solve cell is "-": there is no knob-on deploy-style solve to pair it). 
+ _dh, _ds, _dn = ev["hack"], ev["solve"], ev["n"] + _deploy_col = f"knob-off (deploy, test n={_dn})" + print(f"\n\nargv: {' '.join(sys.argv)}\n") + print(tabulate( + [{"measure": "hack ↓", "knob-on (train)": f"{hack_rate_s:.3f}", _deploy_col: f"{_dh:.3f}"}, + {"measure": "solve ↑", "knob-on (train)": "-", _deploy_col: f"{_ds:.3f}"}], + headers="keys", tablefmt="github", disable_numparse=True)) + print(f"\n{cue} objective (deploy solve - hack ↑) = {_ds:.3f} - {_dh:.3f} = {_ds - _dh:+.3f} " + f"[arm={cfg.arm} seed={cfg.seed}]") + save_ckpt(rows) return 0