diff --git a/.gitignore b/.gitignore index 5692f59..b3cbe9c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,8 @@ /svd_cache/ /tmp/ *.log - +/docs/reviews +/docs/ # vendored upstream reference repos cloned for grep access (see RESEARCH_JOURNAL.md) /docs/vendor/ diff --git a/docs/figs/dyn_test_hack_overlay.png b/docs/figs/dyn_test_hack_overlay.png new file mode 120000 index 0000000..ba3d7ad --- /dev/null +++ b/docs/figs/dyn_test_hack_overlay.png @@ -0,0 +1 @@ +../../../../tmp/claude-1000/dyn_test_hack_overlay.png \ No newline at end of file diff --git a/src/projected_grpo/train.py b/src/projected_grpo/train.py index 0596769..805d1e3 100644 --- a/src/projected_grpo/train.py +++ b/src/projected_grpo/train.py @@ -383,6 +383,11 @@ class StepLogger: _Col("gt_t", 6, "gt_t", "frac", "teacher ground-truth passes (sanity)"), _Col("hack_s", 7, "hack_s?", "frac", "student hack-flagged rollouts (the headline)"), _Col("hack_t", 7, "hack_t", "frac", "teacher hack-flagged rollouts (sanity: pool hacks)"), + # Deploy-eval shown for EVERY arm (nan on steps it's not run -> see it ride + # along as training proceeds). route/route2: quarantine knob OFF. vanilla/erase: + # the trained model itself. Apples-to-apples knob-off deploy number, the plot series. + _Col("hack_deploy", 7, "hk_dep", "+.2f", "DEPLOY-eval hack (route: quarantine OFF; vanilla/erase: trained model); held-out subset, T=0.7, every eval_ablate_every steps; nan between"), + _Col("solve_deploy", 7, "slv_dep", "+.2f", "DEPLOY-eval solve (same cadence; nan between)"), ] # Per-mode CUMULATIVE student exploit rate -> which loophole classes the # student has learnt, and how strongly. Only when the run spans >1 mode @@ -419,8 +424,6 @@ class StepLogger: if arm in ("routing", "routing2"): cols += [ _Col("q_egy", 6, "qE", ".2f", "grad energy into quarantine ‖g_quar‖/(‖g_keep‖+‖g_quar‖); ~0.5+ rising = learning dumped into the thrown-away knob"), - _Col("hack_deploy", 7, "hk_dep", "+.2f", "DEPLOY-eval hack (quarantine deleted = deployed model); held-out eval subset, T=0.7, every eval_ablate_every steps; the plot number"), - _Col("solve_deploy", 7, "slv_dep", "+.2f", "DEPLOY-eval solve"), _Col("hack_abl", 6, "hk_abl", "frac", "FREE per-step deploy proxy: hack rate on the ablated (deploy-mode) rollout slice; train prompts, noisier than hk_dep"), _Col("solve_abl", 6, "slv_abl", "frac", "free per-step deploy proxy: solve rate on the ablated rollout slice"), ]