config+log: fast defaults (dense pool, grad_clip=500); end-of-run tail = argv + hack/solve table + solve-hack objective

- FastConfig: teacher_pool_dir -> teacher_pool_runtests_dense, grad_clip -> 500
  (were passed explicitly on every fast call). Dropped --teacher-pool-dir/--grad-clip
  from the dir6 calls and --grad-clip from all other fast recipes; smoke/dev recipes
  keep their own teacher_pool override.
- End-of-run summary reordered per token-efficient-logging 'final 30 lines': the wide
  results row and the giant per-step table now print ABOVE the tail. The last lines are
  just argv, a compact hack/solve x knob-on/knob-off table, and the single objective
  (deploy solve - hack), since solve and hack alone are gameable.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-07 22:05:46 +00:00
parent eeee7db65c
commit 484305d7b4
2 changed files with 57 additions and 42 deletions
+18 -20
View File
@@ -105,7 +105,7 @@ full *ARGS:
fast-vanilla *ARGS:
{{ TRAIN }} fast --intervention=none \
--teacher-pool-dir=out/pools/teacher_pool \
--grad-clip=500 {{ ARGS }}
{{ ARGS }}
# Goal 1: same recipe with --intervention=erase. Run only after fast-vanilla passes UAT.
# mix_ratio=0.125 + group=8 are the locked-in fast defaults (config), not flags here.
@@ -114,7 +114,7 @@ fast-vanilla *ARGS:
fast-projected *ARGS:
{{ TRAIN }} fast --intervention=erase \
--teacher-pool-dir=out/pools/teacher_pool \
--grad-clip=500 {{ ARGS }}
{{ ARGS }}
# H: LoRA-frozen-B adapter (trainable down-proj A, FROZEN random up-proj B) routes as
# well as the AntiPaSTO SVD adapter. Frozen B makes the error->bottleneck map g_h = B^T δ_y
@@ -135,14 +135,14 @@ fast-lora-routeV *ARGS:
# train, ~1 teacher demo per 8 steps, student never learned the hack (data invalid).
# Priority descending so they execute in listed order (routeV best first).
queue-dir6 seed='43':
pueue add -w "$PWD" -o 60 -l "why: P1 routeV real-V per-rollout (best method) s{{seed}}; resolve: deploy_hack << random/vampire at matched solve" -- {{ TRAIN }} fast --intervention=routeV --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_s{{seed}}
pueue add -w "$PWD" -o 55 -l "why: P2 routeV real-V PER-TOKEN s{{seed}}; resolve: finer routing >= per-rollout suppression, no solve cost" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_pertoken_s{{seed}}
pueue add -w "$PWD" -o 50 -l "why: P3 routeV RANDOM-V per-rollout (Haar control) s{{seed}}; resolve: deploy_hack ~ vanilla -> real-V suppression is directional, not absorption" -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_random_s{{seed}}
pueue add -w "$PWD" -o 45 -l "why: P4 routeV RANDOM-V PER-TOKEN s{{seed}}; resolve: per-token random also fails to suppress -> granularity isn't the lever, direction is" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --routeV-random-v-seed=157 --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_pertoken_random_s{{seed}}
pueue add -w "$PWD" -o 40 -l "why: P5 VANILLA reference s{{seed}}; resolve: deploy_hack >> 0 by step 60 (emergence) -> the suppression target exists" -- {{ TRAIN }} fast --intervention=none --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_vanilla_s{{seed}}
pueue add -w "$PWD" -o 35 -l "why: P6 routeV VAMPIRE (in-subspace semantic placebo, null_vampire pairs) s{{seed}}; resolve: deploy_hack ~ vanilla -> v_grad must point at the HACK, not just any in-subspace semantic axis" -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_routeV_vampire_s{{seed}}
pueue add -w "$PWD" -o 30 -l "why: P7 LoRA-frozen-B routeV real-V per-rollout s{{seed}}; resolve: deploy_hack ~ AntiPaSTO routeV -> routing is adapter-agnostic (lives in the r-bottleneck, not the SVD basis)" -- {{ TRAIN }} fast --intervention=routeV --adapter=lora_frozen_b --lora-r=32 --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_lora_routeV_s{{seed}}
pueue add -w "$PWD" -o 28 -l "why: P8 LoRA-frozen-B routeV real-V PER-TOKEN s{{seed}}; resolve: per-token on the static-B path matches AntiPaSTO per-token suppression" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --adapter=lora_frozen_b --lora-r=32 --teacher-pool-dir=out/pools/teacher_pool_runtests_dense --grad-clip=500 --seed={{seed}} --out-tag=_dir6_lora_routeV_pertoken_s{{seed}}
pueue add -w "$PWD" -o 60 -l "why: P1 routeV real-V per-rollout (best method) s{{seed}}; resolve: deploy_hack << random/vampire at matched solve" -- {{ TRAIN }} fast --intervention=routeV --seed={{seed}} --out-tag=_dir6_routeV_s{{seed}}
pueue add -w "$PWD" -o 55 -l "why: P2 routeV real-V PER-TOKEN s{{seed}}; resolve: finer routing >= per-rollout suppression, no solve cost" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --seed={{seed}} --out-tag=_dir6_routeV_pertoken_s{{seed}}
pueue add -w "$PWD" -o 50 -l "why: P3 routeV RANDOM-V per-rollout (Haar control) s{{seed}}; resolve: deploy_hack ~ vanilla -> real-V suppression is directional, not absorption" -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --seed={{seed}} --out-tag=_dir6_routeV_random_s{{seed}}
pueue add -w "$PWD" -o 45 -l "why: P4 routeV RANDOM-V PER-TOKEN s{{seed}}; resolve: per-token random also fails to suppress -> granularity isn't the lever, direction is" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --routeV-random-v-seed=157 --seed={{seed}} --out-tag=_dir6_routeV_pertoken_random_s{{seed}}
pueue add -w "$PWD" -o 40 -l "why: P5 VANILLA reference s{{seed}}; resolve: deploy_hack >> 0 by step 60 (emergence) -> the suppression target exists" -- {{ TRAIN }} fast --intervention=none --seed={{seed}} --out-tag=_dir6_vanilla_s{{seed}}
pueue add -w "$PWD" -o 35 -l "why: P6 routeV VAMPIRE (in-subspace semantic placebo, null_vampire pairs) s{{seed}}; resolve: deploy_hack ~ vanilla -> v_grad must point at the HACK, not just any in-subspace semantic axis" -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --seed={{seed}} --out-tag=_dir6_routeV_vampire_s{{seed}}
pueue add -w "$PWD" -o 30 -l "why: P7 LoRA-frozen-B routeV real-V per-rollout s{{seed}}; resolve: deploy_hack ~ AntiPaSTO routeV -> routing is adapter-agnostic (lives in the r-bottleneck, not the SVD basis)" -- {{ TRAIN }} fast --intervention=routeV --adapter=lora_frozen_b --lora-r=32 --seed={{seed}} --out-tag=_dir6_lora_routeV_s{{seed}}
pueue add -w "$PWD" -o 28 -l "why: P8 LoRA-frozen-B routeV real-V PER-TOKEN s{{seed}}; resolve: per-token on the static-B path matches AntiPaSTO per-token suppression" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --adapter=lora_frozen_b --lora-r=32 --seed={{seed}} --out-tag=_dir6_lora_routeV_pertoken_s{{seed}}
# H: BROADER sweep for the paper -- headline arms (vanilla, erase, routeV real-V) across
# 3 SEEDS for the paired-t significance the paper insists on, plus the directionality +
@@ -155,15 +155,15 @@ queue-broad:
#!/usr/bin/env bash
set -eu
for seed in {{ SEEDS_3 }}; do
pueue add -w "$PWD" -o 22 -l "why: headline VANILLA s$seed (3-seed significance); resolve: deploy_hack emergence reference" -- {{ TRAIN }} fast --intervention=none --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=$seed --out-tag=_broad_vanilla_s$seed
pueue add -w "$PWD" -o 21 -l "why: headline routeV real-V s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve, paired across seeds" -- {{ TRAIN }} fast --intervention=routeV --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=$seed --out-tag=_broad_routeV_s$seed
pueue add -w "$PWD" -o 20 -l "why: headline ERASE (one-sided projection) s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve" -- {{ TRAIN }} fast --intervention=erase --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=$seed --out-tag=_broad_erase_s$seed
# NOTE: keep the space between `{{ TEACHER_RT }}` and `--seed`; without it the seed flag is glued onto the pool path and never parsed.
pueue add -w "$PWD" -o 22 -l "why: headline VANILLA s$seed (3-seed significance); resolve: deploy_hack emergence reference" -- {{ TRAIN }} fast --intervention=none --teacher-pool-dir={{ TEACHER_RT }} --seed=$seed --out-tag=_broad_vanilla_s$seed
pueue add -w "$PWD" -o 21 -l "why: headline routeV real-V s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve, paired across seeds" -- {{ TRAIN }} fast --intervention=routeV --teacher-pool-dir={{ TEACHER_RT }} --seed=$seed --out-tag=_broad_routeV_s$seed
pueue add -w "$PWD" -o 20 -l "why: headline ERASE (one-sided projection) s$seed (3-seed significance); resolve: deploy_hack < vanilla at matched solve" -- {{ TRAIN }} fast --intervention=erase --teacher-pool-dir={{ TEACHER_RT }} --seed=$seed --out-tag=_broad_erase_s$seed
done
# ablations (one seed = 43): directionality controls + per-token + lora
pueue add -w "$PWD" -o 15 -l "why: ablation routeV RANDOM-V (Haar) s43; resolve: ~vanilla -> suppression is directional" -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=43 --out-tag=_broad_routeV_random_s43
pueue add -w "$PWD" -o 15 -l "why: ablation routeV VAMPIRE (in-subspace placebo) s43; resolve: ~vanilla -> v_grad must point at the hack" -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=43 --out-tag=_broad_routeV_vampire_s43
pueue add -w "$PWD" -o 15 -l "why: ablation routeV PER-TOKEN s43; resolve: granularity effect on suppression" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=43 --out-tag=_broad_routeV_pertoken_s43
pueue add -w "$PWD" -o 15 -l "why: ablation LoRA-frozen-B routeV s43; resolve: routing is adapter-agnostic" -- {{ TRAIN }} fast --intervention=routeV --adapter=lora_frozen_b --lora-r=32 --teacher-pool-dir={{ TEACHER_RT }} --grad-clip=500 --seed=43 --out-tag=_broad_lora_routeV_s43
# NOTE: keep the space between `{{ TEACHER_RT }}` and `--seed`; without it the seed flag is glued onto the pool path and never parsed.
pueue add -w "$PWD" -o 15 -l "why: ablation routeV RANDOM-V (Haar) s43; resolve: ~vanilla -> suppression is directional" -- {{ TRAIN }} fast --intervention=routeV --routeV-random-v-seed=157 --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_routeV_random_s43
pueue add -w "$PWD" -o 15 -l "why: ablation routeV VAMPIRE (in-subspace placebo) s43; resolve: ~vanilla -> v_grad must point at the hack" -- {{ TRAIN }} fast --intervention=routeV --vhack-pairs-path=out/pairsets/null_vampire.json --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_routeV_vampire_s43
pueue add -w "$PWD" -o 15 -l "why: ablation routeV PER-TOKEN s43; resolve: granularity effect on suppression" -- {{ TRAIN }} fast --intervention=routeV --routeV-per-token --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_routeV_pertoken_s43
pueue add -w "$PWD" -o 15 -l "why: ablation LoRA-frozen-B routeV s43; resolve: routing is adapter-agnostic" -- {{ TRAIN }} fast --intervention=routeV --adapter=lora_frozen_b --lora-r=32 --teacher-pool-dir={{ TEACHER_RT }} --seed=43 --out-tag=_broad_lora_routeV_s43
# T8 (KEY GOAL): one CELL of the dynamics-plot matrix as a separate pueue job.
# INTERVENTION in {none, erase, route}; SEED an int. 60-step fast horizon,
@@ -177,7 +177,7 @@ run-cell INTERVENTION SEED REFRESH='0':
{{ TRAIN }} fast --intervention={{ INTERVENTION }} \
--v-hack-path=out/vhack/v_hack_21pairs.safetensors \
--teacher-pool-dir=out/pools/teacher_pool \
--grad-clip=500 --steps=60 --seed={{ SEED }} \
--steps=60 --seed={{ SEED }} \
--vhack-refresh-every={{ REFRESH }} \
--eval-ablate-every=5 \
--out-tag=_cell_{{ INTERVENTION }}{{ if REFRESH == "0" { "" } else { "_online" } }}_s{{ SEED }}
@@ -433,7 +433,6 @@ fast-projected-pool half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="":
--v-hack-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \
--vhack-pairs-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \
--teacher-pool-dir={{ pool }} --mix-ratio=0.5 \
--grad-clip=500 \
--vhack-refresh-every=10 \
--half-a={{ half_a }} \
--seed={{ seed }} \
@@ -444,7 +443,6 @@ fast-projected-pool half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="":
fast-vanilla-xmech half_a="E,C" seed="41" pool="out/pools/teacher_pool" tag="":
{{ TRAIN }} fast --intervention=none \
--teacher-pool-dir={{ pool }} --mix-ratio=0.5 \
--grad-clip=500 \
--half-a={{ half_a }} \
--seed={{ seed }} \
--out-tag=_xmech_vanilla_half_{{ replace(half_a, ',', '') }}{{ tag }}_seed{{ seed }}
+39 -22
View File
@@ -284,8 +284,9 @@ class FastConfig(Config):
# custom multi-loophole modes. The 4-mode substrate was dropped (we never ran the
# held-out-mode generalisation test it existed for). Real runs need only
# --intervention (+ optional seed/refresh/mask).
teacher_pool_dir: Path | None = Path("out/pools/teacher_pool_runtests")
teacher_pool_dir: Path | None = Path("out/pools/teacher_pool_runtests_dense")
vhack_pairs_path: Path | None = Path("out/pairsets/prog_wide.json")
grad_clip: float = 500.0 # fast default (vs base 10.0): δS grads run large here, light clip
group: int = 8 # G=8 so the locked-in mix_ratio=0.125 gives 1 teacher / 7 student
max_new: int = 512
n_problems: int = 200
@@ -1910,15 +1911,18 @@ def main(cfg: Config) -> int:
deploy_path.write_text(json.dumps(deploy_record, indent=2))
logger.info(f"deploy artifact: {deploy_path}")
# Final tail: cue emoji + main metric BLUF, then per-step tsv table.
# Vanilla arm: 🟢 if hacking emerged. Projected arm: 🟢 if HACK_RATE dropped
# vs a matched-PASS vanilla; we can't judge that here, so just report.
# ── end-of-run summary ──────────────────────────────────────────────────
# Order matters (token-efficient-logging "final 30 lines"): the scroll-back
# dumps go FIRST, and the readable tail -- argv + the result table + the one
# objective number -- goes LAST, so the final lines a reader/agent lands on
# are the answer, not a 30-column table that wraps off-screen.
# Cue: 🟢 if this is the vanilla arm and its hack rate is nonzero (substrate valid); otherwise 🟡 (just report).
cue = "🟢" if (cfg.arm == "vanilla" and hack_rate > 0.0) else "🟡"
print(f"\nargv: {' '.join(sys.argv)}")
print(f"verbose log: {verbose_log}")
print(
f"main metric: HACK_RATE={hack_rate:.3f} PASS_RATE={pass_rate:.3f} "
# --- scroll-back: train-set diagnostics + the wide journal/results.md row ---
print(f"\nverbose log: {verbose_log}")
print( # TRAIN-set rollout rates (knob-on) -- diagnostics, NOT the headline
f"train rollout rates (knob-on): HACK_RATE={hack_rate:.3f} PASS_RATE={pass_rate:.3f} "
f"HACK_STUDENT={hack_rate_s:.3f} HACK_TEACHER={hack_rate_t:.3f} "
f"[arm={cfg.arm} preset={cfg.preset_name} model={model_name} steps={n_steps} gens={n_gens} peak={peak_gb:.1f}GB"
f"{' pool=' + cfg.teacher_pool_dir.name + ' mix=' + str(cfg.mix_ratio) if cfg.teacher_pool_dir else ''}]"
@@ -1953,19 +1957,9 @@ def main(cfg: Config) -> int:
f"half_B={sorted(half_b_codes) or '-'} HACK_B={hack_b_rate:.3f} "
f"(A=any half_A fires; B=any half_B fires AND no half_A fires)"
)
# Wide one-row results.md/results.tsv table (all knobs). Wide on purpose -- it
# is the row appended to results.md, not the at-a-glance line; hence above the tail.
print()
# Render every (n, d) fraction tuple (gt_s/hack_s/hack_t/hk_<mode>/...) as "n/d"
# so tabulate shows them as fractions, not raw tuples. Drop timing columns --
# useful per-step in the streaming log but noise in the journal-pasteable table.
# Drop timing (gen/fb/t_rew/sec) + sprd/N: sprd is a constant T/F bail flag and N
# is redundant with the frac denominators already shown in gt_s/hack_s/hk_<mode>.
_DROP_COLS = ("gen", "fb", "t_rew", "sec", "sprd", "N")
rows_for_dump = [
{k: (f"{v[0]}/{v[1]}" if isinstance(v, tuple) and len(v) == 2 else v)
for k, v in r.items() if k not in _DROP_COLS}
for r in rows
]
# BLUF summary first -- the single row a reader scans -- as github markdown.
print(tabulate([{
"cue": cue, "HACK_RATE": f"{hack_rate:.3f}", "PASS_RATE": f"{pass_rate:.3f}",
"HACK_S": f"{hack_rate_s:.3f}", "HACK_T": f"{hack_rate_t:.3f}",
@@ -1975,11 +1969,34 @@ def main(cfg: Config) -> int:
"mix": cfg.mix_ratio if cfg.teacher_pool_dir else "",
"tag": cfg.out_tag, "log": str(verbose_log),
}], headers="keys", tablefmt="github"))
# Per-step rows ONCE, markdown (journal/PR pasteable). The TSV duplicate of the
# same data was dropped -- two formats of one table was just noise.
# Per-step rows (markdown, journal/PR pasteable). Render (n,d) tuples as "n/d";
# drop timing (gen/fb/t_rew/sec) + sprd (constant bail flag) + N (redundant with
# the frac denominators). The giant scroll-back reference -- ABOVE the tail.
_DROP_COLS = ("gen", "fb", "t_rew", "sec", "sprd", "N")
rows_for_dump = [
{k: (f"{v[0]}/{v[1]}" if isinstance(v, tuple) and len(v) == 2 else v)
for k, v in r.items() if k not in _DROP_COLS}
for r in rows
]
print("\n### Per-step rows (markdown)\n")
print(tabulate(rows_for_dump, headers="keys", tablefmt="pipe", floatfmt="+.3f"))
# --- TAIL: argv, the result table, the single objective. The last lines. ---
# solve and hack alone are gameable (tank solve to kill hack, or accept hack to
# lift solve); the deploy gap solve-hack is the one number to maximise. Taken
# from the FINAL DEPLOY eval (knob-off, held-out test) = the shipped model on
# unseen problems. knob-on hack is the train-rollout student rate (different set,
# so its solve cell is "-": there is no knob-on deploy-style solve to pair it).
_dh, _ds, _dn = ev["hack"], ev["solve"], ev["n"]
_deploy_col = f"knob-off (deploy, test n={_dn})"
print(f"\n\nargv: {' '.join(sys.argv)}\n")
print(tabulate(
[{"measure": "hack ↓", "knob-on (train)": f"{hack_rate_s:.3f}", _deploy_col: f"{_dh:.3f}"},
{"measure": "solve ↑", "knob-on (train)": "-", _deploy_col: f"{_ds:.3f}"}],
headers="keys", tablefmt="github", disable_numparse=True))
print(f"\n{cue} objective (deploy solve - hack ↑) = {_ds:.3f} - {_dh:.3f} = {_ds - _dh:+.3f} "
f"[arm={cfg.arm} seed={cfg.seed}]")
save_ckpt(rows)
return 0