From 5257ff010eb73678f0aa6315eeb256389b6b4b18 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Fri, 5 Jun 2026 02:33:10 +0000 Subject: [PATCH] plot_dynamics: train-vs-deploy 2x2 uses matched n=64 eval on both rows The train row fell back to per-step hack_s (noisy n=28 train batch) for arms without a knob-on eval, so vanilla's train/deploy rows looked like different estimators. Fix: vanilla/erase have no quarantine -> train==deploy, so reuse hk_dep (the n=64 knob-off eval) for the train row. route2 still uses hk_on (knob-on eval). Now every panel is the same held-out eval, differing only in the quarantine knob. Regen source: train_vs_deploy_60.csv (route2 nofloor_rf2 + vanilla sweep, seed 41, 60 steps). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- out/figs/train_vs_deploy_60.csv | 121 ++++++++++++++++++++++++++++++++ scripts/plot_dynamics.py | 17 +++-- 2 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 out/figs/train_vs_deploy_60.csv diff --git a/out/figs/train_vs_deploy_60.csv b/out/figs/train_vs_deploy_60.csv new file mode 100644 index 0000000..c584779 --- /dev/null +++ b/out/figs/train_vs_deploy_60.csv @@ -0,0 +1,121 @@ +arm,seed,step,hack_s,gt_s,hack_train,solve_train,hk_dep,slv_dep +routing2,41,0,0.0,0.34,0.0,0.38,0.0,0.34 +routing2,41,1,nan,nan,nan,nan,nan,nan +routing2,41,2,nan,nan,nan,nan,nan,nan +routing2,41,3,nan,nan,nan,nan,nan,nan +routing2,41,4,nan,nan,nan,nan,nan,nan +routing2,41,5,0.0,0.5,0.0,0.5,0.0,0.5 +routing2,41,6,nan,nan,nan,nan,nan,nan +routing2,41,7,nan,nan,nan,nan,nan,nan +routing2,41,8,nan,nan,nan,nan,nan,nan +routing2,41,9,nan,nan,nan,nan,nan,nan +routing2,41,10,0.0,0.58,0.09,0.55,0.0,0.58 +routing2,41,11,nan,nan,nan,nan,nan,nan +routing2,41,12,nan,nan,nan,nan,nan,nan +routing2,41,13,nan,nan,nan,nan,nan,nan +routing2,41,14,nan,nan,nan,nan,nan,nan +routing2,41,15,0.0,0.62,0.17,0.48,0.0,0.62 +routing2,41,16,nan,nan,nan,nan,nan,nan +routing2,41,17,nan,nan,nan,nan,nan,nan +routing2,41,18,nan,nan,nan,nan,nan,nan +routing2,41,19,nan,nan,nan,nan,nan,nan +routing2,41,20,0.0,0.59,0.19,0.48,0.0,0.59 +routing2,41,21,nan,nan,nan,nan,nan,nan +routing2,41,22,nan,nan,nan,nan,nan,nan +routing2,41,23,nan,nan,nan,nan,nan,nan +routing2,41,24,nan,nan,nan,nan,nan,nan +routing2,41,25,0.0,0.61,0.22,0.59,0.0,0.61 +routing2,41,26,nan,nan,nan,nan,nan,nan +routing2,41,27,nan,nan,nan,nan,nan,nan +routing2,41,28,nan,nan,nan,nan,nan,nan +routing2,41,29,nan,nan,nan,nan,nan,nan +routing2,41,30,0.0,0.62,0.25,0.45,0.0,0.62 +routing2,41,31,nan,nan,nan,nan,nan,nan +routing2,41,32,nan,nan,nan,nan,nan,nan +routing2,41,33,nan,nan,nan,nan,nan,nan +routing2,41,34,nan,nan,nan,nan,nan,nan +routing2,41,35,0.0,0.62,0.23,0.5,0.0,0.62 +routing2,41,36,nan,nan,nan,nan,nan,nan +routing2,41,37,nan,nan,nan,nan,nan,nan +routing2,41,38,nan,nan,nan,nan,nan,nan +routing2,41,39,nan,nan,nan,nan,nan,nan +routing2,41,40,0.0,0.61,0.25,0.56,0.0,0.61 +routing2,41,41,nan,nan,nan,nan,nan,nan +routing2,41,42,nan,nan,nan,nan,nan,nan +routing2,41,43,nan,nan,nan,nan,nan,nan +routing2,41,44,nan,nan,nan,nan,nan,nan +routing2,41,45,0.0,0.62,0.25,0.47,0.0,0.62 +routing2,41,46,nan,nan,nan,nan,nan,nan +routing2,41,47,nan,nan,nan,nan,nan,nan +routing2,41,48,nan,nan,nan,nan,nan,nan +routing2,41,49,nan,nan,nan,nan,nan,nan +routing2,41,50,0.0,0.62,0.19,0.48,0.0,0.62 +routing2,41,51,nan,nan,nan,nan,nan,nan +routing2,41,52,nan,nan,nan,nan,nan,nan +routing2,41,53,nan,nan,nan,nan,nan,nan +routing2,41,54,nan,nan,nan,nan,nan,nan +routing2,41,55,0.0,0.62,0.2,0.52,0.0,0.62 +routing2,41,56,nan,nan,nan,nan,nan,nan +routing2,41,57,nan,nan,nan,nan,nan,nan +routing2,41,58,nan,nan,nan,nan,nan,nan +routing2,41,59,0.0,0.61,0.25,0.53,0.0,0.61 +vanilla,41,0,0.0,0.36,0.0,0.36,0.0,0.36 +vanilla,41,1,nan,nan,nan,nan,nan,nan +vanilla,41,2,nan,nan,nan,nan,nan,nan +vanilla,41,3,nan,nan,nan,nan,nan,nan +vanilla,41,4,nan,nan,nan,nan,nan,nan +vanilla,41,5,0.0,0.44,0.0,0.44,0.0,0.44 +vanilla,41,6,nan,nan,nan,nan,nan,nan +vanilla,41,7,nan,nan,nan,nan,nan,nan +vanilla,41,8,nan,nan,nan,nan,nan,nan +vanilla,41,9,nan,nan,nan,nan,nan,nan +vanilla,41,10,0.14,0.56,0.14,0.56,0.14,0.56 +vanilla,41,11,nan,nan,nan,nan,nan,nan +vanilla,41,12,nan,nan,nan,nan,nan,nan +vanilla,41,13,nan,nan,nan,nan,nan,nan +vanilla,41,14,nan,nan,nan,nan,nan,nan +vanilla,41,15,0.23,0.52,0.23,0.52,0.23,0.52 +vanilla,41,16,nan,nan,nan,nan,nan,nan +vanilla,41,17,nan,nan,nan,nan,nan,nan +vanilla,41,18,nan,nan,nan,nan,nan,nan +vanilla,41,19,nan,nan,nan,nan,nan,nan +vanilla,41,20,0.28,0.48,0.28,0.48,0.28,0.48 +vanilla,41,21,nan,nan,nan,nan,nan,nan +vanilla,41,22,nan,nan,nan,nan,nan,nan +vanilla,41,23,nan,nan,nan,nan,nan,nan +vanilla,41,24,nan,nan,nan,nan,nan,nan +vanilla,41,25,0.25,0.53,0.25,0.53,0.25,0.53 +vanilla,41,26,nan,nan,nan,nan,nan,nan +vanilla,41,27,nan,nan,nan,nan,nan,nan +vanilla,41,28,nan,nan,nan,nan,nan,nan +vanilla,41,29,nan,nan,nan,nan,nan,nan +vanilla,41,30,0.3,0.52,0.3,0.52,0.3,0.52 +vanilla,41,31,nan,nan,nan,nan,nan,nan +vanilla,41,32,nan,nan,nan,nan,nan,nan +vanilla,41,33,nan,nan,nan,nan,nan,nan +vanilla,41,34,nan,nan,nan,nan,nan,nan +vanilla,41,35,0.27,0.5,0.27,0.5,0.27,0.5 +vanilla,41,36,nan,nan,nan,nan,nan,nan +vanilla,41,37,nan,nan,nan,nan,nan,nan +vanilla,41,38,nan,nan,nan,nan,nan,nan +vanilla,41,39,nan,nan,nan,nan,nan,nan +vanilla,41,40,0.38,0.45,0.38,0.45,0.38,0.45 +vanilla,41,41,nan,nan,nan,nan,nan,nan +vanilla,41,42,nan,nan,nan,nan,nan,nan +vanilla,41,43,nan,nan,nan,nan,nan,nan +vanilla,41,44,nan,nan,nan,nan,nan,nan +vanilla,41,45,0.42,0.44,0.42,0.44,0.42,0.44 +vanilla,41,46,nan,nan,nan,nan,nan,nan +vanilla,41,47,nan,nan,nan,nan,nan,nan +vanilla,41,48,nan,nan,nan,nan,nan,nan +vanilla,41,49,nan,nan,nan,nan,nan,nan +vanilla,41,50,0.38,0.38,0.38,0.38,0.38,0.38 +vanilla,41,51,nan,nan,nan,nan,nan,nan +vanilla,41,52,nan,nan,nan,nan,nan,nan +vanilla,41,53,nan,nan,nan,nan,nan,nan +vanilla,41,54,nan,nan,nan,nan,nan,nan +vanilla,41,55,0.42,0.47,0.42,0.47,0.42,0.47 +vanilla,41,56,nan,nan,nan,nan,nan,nan +vanilla,41,57,nan,nan,nan,nan,nan,nan +vanilla,41,58,nan,nan,nan,nan,nan,nan +vanilla,41,59,0.33,0.44,0.33,0.44,0.33,0.44 diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py index 8deef87..bbb0867 100644 --- a/scripts/plot_dynamics.py +++ b/scripts/plot_dynamics.py @@ -132,14 +132,19 @@ def parse_log(path: Path) -> dict | None: # presence: no-floor logs carry an all-nan hk_dep/hk_abl column otherwise. def _has_data(key): return key in run and np.isfinite(run[key]).any() - # TRAIN series for the train-vs-deploy 2x2. Prefer the knob-ON eval (hk_on/slv_on): - # SAME n/prompts/T as the knob-off deploy eval, so the two rows differ ONLY in the - # knob -- the per-step hack_s is a noisy n=28 train batch and looks like a different - # estimator. Fall back to per-step hack_s for logs without the knob-on eval. - if _has_data("hk_on"): + # TRAIN series for the train-vs-deploy 2x2. The two rows must share ONE estimator: + # route2 -> knob-ON held-out eval (hk_on): quarantine active, the policy as trained. + # vanilla/erase -> reuse the knob-OFF eval (hk_dep): no quarantine, so train==deploy; + # the deploy eval IS the train-time behaviour, same n=64 prompts/T. + # Both differ from the deploy row ONLY in the knob, so noise matches. Per-step hack_s + # (noisy n=28 train batch) is the last resort for old logs with no held-out eval. + if _has_data("hk_on"): # route2: knob-ON held-out eval (quarantine active) run["hack_train"] = run["hk_on"] run["solve_train"] = run["slv_on"] - elif "hack_s" in run: + elif _has_data("hk_dep"): # no quarantine (vanilla/erase): train==deploy, so the + run["hack_train"] = run["hk_dep"] # train row IS the knob-off eval -- reuse it so + run["solve_train"] = run["slv_dep"] # both rows share the n=64 estimator (no n=28 noise) + elif "hack_s" in run: # last resort (old logs, no held-out eval): per-step n=28 run["hack_train"] = run["hack_s"] run["solve_train"] = run["gt_s"] if _has_data("hk_abl"): # dense per-step proxy (rollout_ablate_frac>0), if present