diff --git a/out/figs/a5_generalisation.csv b/out/figs/a5_generalisation.csv new file mode 100644 index 0000000..7535278 --- /dev/null +++ b/out/figs/a5_generalisation.csv @@ -0,0 +1,9 @@ +mode,in_dist,arm,n_seed,deploy_hack_mean,deploy_hack_std,deploy_solve_mean,deploy_solve_std +run_tests,True,vanilla,1,1.000000,0.000000,0.000000,0.000000 +file_marker,False,vanilla,1,0.625000,0.000000,0.375000,0.000000 +stdout_marker,False,vanilla,1,0.166667,0.000000,0.645833,0.000000 +sentinel,False,vanilla,1,0.416667,0.000000,0.583333,0.000000 +run_tests,True,route,1,0.000000,0.000000,0.000000,0.000000 +file_marker,False,route,1,0.020833,0.000000,0.354167,0.000000 +stdout_marker,False,route,1,0.083333,0.000000,0.395833,0.000000 +sentinel,False,route,1,0.000000,0.000000,0.666667,0.000000 diff --git a/out/figs/a5_generalisation.pdf b/out/figs/a5_generalisation.pdf index a56ac0f..b1e801d 100644 Binary files a/out/figs/a5_generalisation.pdf and b/out/figs/a5_generalisation.pdf differ diff --git a/out/figs/a5_generalisation.png b/out/figs/a5_generalisation.png index 73a41cc..0c5d5de 100644 Binary files a/out/figs/a5_generalisation.png and b/out/figs/a5_generalisation.png differ diff --git a/out/figs/a5_generalisation.svg b/out/figs/a5_generalisation.svg index 874d551..cdb8ad2 100644 --- a/out/figs/a5_generalisation.svg +++ b/out/figs/a5_generalisation.svg @@ -1,12 +1,12 @@ - + - 2026-06-05T04:04:58.357086 + 2026-06-05T04:13:05.360071 image/svg+xml @@ -21,8 +21,8 @@ - - - - - - - - - - - + - + - + - + - + - + - + - + - + - + - + - + - - + + - - - - - - @@ -591,24 +566,191 @@ L 594 3500 L 594 4494 L 1172 4494 z +" transform="scale(0.015625)"/> + + + + + + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -617,7 +759,7 @@ z - + - + - + - + + - + @@ -863,7 +1031,7 @@ z - + @@ -879,7 +1047,7 @@ z - + @@ -890,7 +1058,7 @@ z - + @@ -904,13 +1072,13 @@ z - - + @@ -919,7 +1087,7 @@ L 363.56625 209.618437 - + @@ -928,7 +1096,7 @@ L 363.56625 209.618437 - + - + @@ -958,7 +1126,7 @@ z - + - + @@ -1017,7 +1185,7 @@ z - + @@ -1026,232 +1194,14 @@ z - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + - - - - - - + + + + + - + - + - + - + - + - + @@ -1354,59 +1304,59 @@ z - - - - - - - - - + - + - + @@ -1415,14 +1365,14 @@ L 384.896429 20.798437 - + - + - + @@ -1431,14 +1381,14 @@ L 437.547321 20.798437 - + - + - + @@ -1447,14 +1397,14 @@ L 490.198214 20.798437 - + - + - + @@ -1463,14 +1413,14 @@ L 542.849107 20.798437 - + - + - + @@ -1479,40 +1429,110 @@ L 595.5 20.798437 - + - + - + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1531,20 +1551,20 @@ L 648.150893 20.798437 - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + - - - - - + + + + + - - + + - - + + diff --git a/out/figs/dyn_sub4.pdf b/out/figs/dyn_sub4.pdf index fe60544..7dd1326 100644 Binary files a/out/figs/dyn_sub4.pdf and b/out/figs/dyn_sub4.pdf differ diff --git a/out/figs/dyn_sub4.svg b/out/figs/dyn_sub4.svg index 6ff1543..16c1916 100644 --- a/out/figs/dyn_sub4.svg +++ b/out/figs/dyn_sub4.svg @@ -6,7 +6,7 @@ - 2026-06-05T04:06:04.192641 + 2026-06-05T04:13:11.401703 image/svg+xml @@ -40,18 +40,18 @@ z +" clip-path="url(#pdea13af0ac)" style="fill: none; stroke-dasharray: 3.2,2.4; stroke-dashoffset: 0; stroke: #8c8c8c; stroke-width: 0.8"/> - - + @@ -87,7 +87,7 @@ z - + @@ -117,7 +117,7 @@ z - + @@ -157,7 +157,7 @@ z - + @@ -205,7 +205,7 @@ z - + @@ -240,7 +240,7 @@ z - + @@ -281,7 +281,7 @@ z - + @@ -548,12 +548,12 @@ z - - + @@ -577,7 +577,7 @@ z - + @@ -592,7 +592,7 @@ z - + @@ -607,7 +607,7 @@ z - + @@ -622,7 +622,7 @@ z - + @@ -678,7 +678,7 @@ z - + @@ -855,7 +855,7 @@ L 197.158675 98.861038 L 199.814451 98.861038 L 202.470228 98.861038 L 205.126005 100.398536 -" clip-path="url(#p76f8b7246d)" style="fill: none; stroke: #c1432b; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#pdea13af0ac)" style="fill: none; stroke: #c1432b; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#pdea13af0ac)" style="fill: none; stroke: #c1432b; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#pdea13af0ac)" style="fill: none; stroke: #c1432b; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#pdea13af0ac)" style="fill: none; stroke: #c1432b; stroke-width: 1.8; stroke-linecap: round"/> +" clip-path="url(#pdea13af0ac)" style="fill: none; stroke: #2f7d4f; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#pdea13af0ac)" style="fill: none; stroke: #2f7d4f; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#pdea13af0ac)" style="fill: none; stroke: #2f7d4f; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#pdea13af0ac)" style="fill: none; stroke: #2f7d4f; stroke-width: 1.8; stroke-linecap: round"/> +" clip-path="url(#p1281d07d4f)" style="fill: none; stroke-dasharray: 3.2,2.4; stroke-dashoffset: 0; stroke: #8c8c8c; stroke-width: 0.8"/> - + @@ -1553,7 +1553,7 @@ L 331.821041 30.116625 - + @@ -1567,7 +1567,7 @@ L 331.821041 30.116625 - + @@ -1581,7 +1581,7 @@ L 331.821041 30.116625 - + @@ -1595,7 +1595,7 @@ L 331.821041 30.116625 - + @@ -1609,7 +1609,7 @@ L 331.821041 30.116625 - + @@ -1623,7 +1623,7 @@ L 331.821041 30.116625 - + @@ -1658,42 +1658,42 @@ L 331.821041 30.116625 - + - + - + - + - + - + @@ -1759,7 +1759,7 @@ L 400.871241 139.604548 L 403.527018 139.604548 L 406.182795 139.604548 L 408.838572 139.604548 -" clip-path="url(#p476f65ca91)" style="fill: none; stroke: #c1432b; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#p1281d07d4f)" style="fill: none; stroke: #c1432b; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#p1281d07d4f)" style="fill: none; stroke: #c1432b; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#p1281d07d4f)" style="fill: none; stroke: #c1432b; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#p1281d07d4f)" style="fill: none; stroke: #c1432b; stroke-width: 1.8; stroke-linecap: round"/> +" clip-path="url(#p1281d07d4f)" style="fill: none; stroke: #2f7d4f; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#p1281d07d4f)" style="fill: none; stroke: #2f7d4f; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#p1281d07d4f)" style="fill: none; stroke: #2f7d4f; stroke-opacity: 0.35; stroke-width: 0.7; stroke-linecap: round"/> +" clip-path="url(#p1281d07d4f)" style="fill: none; stroke: #2f7d4f; stroke-width: 1.8; stroke-linecap: round"/> + - + diff --git a/out/figs/dyn_sub4_hack_overlay.pdf b/out/figs/dyn_sub4_hack_overlay.pdf index b53c81d..63c7dd7 100644 Binary files a/out/figs/dyn_sub4_hack_overlay.pdf and b/out/figs/dyn_sub4_hack_overlay.pdf differ diff --git a/out/figs/dyn_sub4_hack_overlay.png b/out/figs/dyn_sub4_hack_overlay.png index 4cafb91..9a09d67 100644 Binary files a/out/figs/dyn_sub4_hack_overlay.png and b/out/figs/dyn_sub4_hack_overlay.png differ diff --git a/out/figs/dyn_sub4_hack_overlay.svg b/out/figs/dyn_sub4_hack_overlay.svg index 27df852..73d7a33 100644 --- a/out/figs/dyn_sub4_hack_overlay.svg +++ b/out/figs/dyn_sub4_hack_overlay.svg @@ -6,7 +6,7 @@ - 2026-06-05T04:06:04.660001 + 2026-06-05T04:13:11.867043 image/svg+xml @@ -36,73 +36,78 @@ L 341.619197 10.239375 L 40.600625 10.239375 z " style="fill: #ffffff"/> + + + - + - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + @@ -145,9 +150,9 @@ z - + - + @@ -186,9 +191,9 @@ z - + - + @@ -222,9 +227,9 @@ z - + - + @@ -269,9 +274,9 @@ z - + - + @@ -325,9 +330,9 @@ z - + - + @@ -523,7 +528,7 @@ z - + +" clip-path="url(#p88ccaa88c9)" style="fill: none; stroke: #7a7a7a; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + +" clip-path="url(#p88ccaa88c9)" style="fill: none; stroke: #7a7a7a; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + +" clip-path="url(#p88ccaa88c9)" style="fill: none; stroke: #7a7a7a; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + - - - +" clip-path="url(#p88ccaa88c9)" style="fill: none; stroke: #7a7a7a; stroke-width: 2; stroke-linecap: round"/> +" clip-path="url(#p88ccaa88c9)" style="fill: none; stroke: #7d2f6f; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> + + + +" clip-path="url(#p88ccaa88c9)" style="fill: none; stroke: #7d2f6f; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + +" clip-path="url(#p88ccaa88c9)" style="fill: none; stroke: #7d2f6f; stroke-width: 2; stroke-linecap: round"/> - - - - + + + + - - - - - - - - - + - - - +M 603 4863 +L 1178 4863 +L 1178 4134 +L 603 4134 +L 603 4863 +z +" transform="scale(0.015625)"/> + + + + + + + + + + + + @@ -1082,7 +1129,7 @@ z Q 314.318133 155.082651 311.610498 155.082651 " style="fill: none; stroke: #7d2f6f; stroke-width: 0.5; stroke-linecap: round"/> - + @@ -1186,7 +1233,7 @@ z Q 314.31554 96.842927 311.609395 96.842927 " style="fill: none; stroke: #7a7a7a; stroke-width: 0.5; stroke-linecap: round"/> - + @@ -1218,19 +1265,6 @@ Q 1925 3584 2291 3584 Q 2894 3584 3203 3211 Q 3513 2838 3513 2113 z -" transform="scale(0.015625)"/> - - + - + - + @@ -1274,12 +1308,12 @@ z - + - + - + @@ -1288,12 +1322,12 @@ z - + - + - + @@ -1302,12 +1336,12 @@ z - + - + - + @@ -1350,12 +1384,12 @@ z - + - + - + @@ -1364,12 +1398,12 @@ z - + - + - + @@ -1405,12 +1439,12 @@ z - + - + - + @@ -1418,7 +1452,7 @@ z - + @@ -1490,37 +1524,6 @@ L 2438 3041 L 353 3041 L 353 3500 z -" transform="scale(0.015625)"/> - @@ -1542,12 +1545,12 @@ z - + - + - + @@ -1557,12 +1560,12 @@ z - + - + - + @@ -1572,12 +1575,12 @@ z - + - + - + @@ -1587,12 +1590,12 @@ z - + - + - + @@ -1602,12 +1605,12 @@ z - + - + - + @@ -1617,12 +1620,12 @@ z - + - + - + @@ -1631,7 +1634,7 @@ z - + @@ -1647,7 +1650,7 @@ z - + +" clip-path="url(#p5acac56b64)" style="fill: none; stroke: #7a7a7a; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + +" clip-path="url(#p5acac56b64)" style="fill: none; stroke: #7a7a7a; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + +" clip-path="url(#p5acac56b64)" style="fill: none; stroke: #7a7a7a; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + +" clip-path="url(#p5acac56b64)" style="fill: none; stroke: #7a7a7a; stroke-width: 2; stroke-linecap: round"/> - + +" clip-path="url(#p5acac56b64)" style="fill: none; stroke: #7d2f6f; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + +" clip-path="url(#p5acac56b64)" style="fill: none; stroke: #7d2f6f; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + +" clip-path="url(#p5acac56b64)" style="fill: none; stroke: #7d2f6f; stroke-opacity: 0.25; stroke-width: 0.6; stroke-linecap: round"/> - + +" clip-path="url(#p5acac56b64)" style="fill: none; stroke: #7d2f6f; stroke-width: 2; stroke-linecap: round"/> + - + diff --git a/scripts/plot_deploy_overlay.py b/scripts/plot_deploy_overlay.py index df6acda..18645c3 100644 --- a/scripts/plot_deploy_overlay.py +++ b/scripts/plot_deploy_overlay.py @@ -21,6 +21,7 @@ Usage: from __future__ import annotations import argparse +import csv import json from collections import defaultdict from pathlib import Path @@ -66,7 +67,7 @@ def _mode_stats(by_arm, arm, modes, field): return np.array(means), np.array(stds) -def _panel(ax, by_arm, modes, arms, field, title, xlabel): +def _panel(ax, by_arm, modes, arms, field, xlabel): """Cleveland dot plot: y = mode, x = rate. One dot per arm with a thin connector per mode, so the arm-to-arm change reads as a line segment (vanilla -> route). xerr = std across seeds (drawn only when >1 seed). Tufte: faint x-grid only, no @@ -97,9 +98,8 @@ def _panel(ax, by_arm, modes, arms, field, title, xlabel): ax.set_yticklabels([f"{m}\n{'IN' if m == 'run_tests' else 'held-out'}" for m in modes], fontsize=8) ax.set_xlim(-0.04, 1.08) ax.set_ylim(y.min() - 0.5, y.max() + 0.5) - ax.set_xlabel(xlabel) - ax.set_title(title, fontsize=10) - ax.spines[["top", "right", "left"]].set_visible(False) + ax.set_xlabel(xlabel, fontsize=9) # carries the metric AND the better-direction; + ax.spines[["top", "right", "left"]].set_visible(False) # no title (would just restate it) ax.tick_params(length=0) ax.grid(axis="x", lw=0.3, alpha=0.3) @@ -125,10 +125,8 @@ def main() -> None: modes = [m for m in MODE_ORDER if any(m in r["by_mode"] for r in records)] fig, (a1, a2) = plt.subplots(1, 2, figsize=(9.5, 0.7 + 0.7 * len(modes)), sharey=True) - _panel(a1, by_arm, modes, arms, "deploy_hack", - "DEPLOY hack rate (lower = better)", "deploy hack rate") - _panel(a2, by_arm, modes, arms, "deploy_solve", - "DEPLOY solve rate (higher = better)", "deploy solve rate") + _panel(a1, by_arm, modes, arms, "deploy_hack", r"DEPLOY hack rate ($\downarrow$ lower = better)") + _panel(a2, by_arm, modes, arms, "deploy_solve", r"DEPLOY solve rate ($\uparrow$ higher = better)") a1.legend(fontsize=8, frameon=False, loc="lower right") if args.title: n_seed = {r.get("seed") for r in records} @@ -136,7 +134,20 @@ def main() -> None: f"quarantine deleted = shipped model", fontsize=11) fig.tight_layout() save_fig(fig, args.out) - logger.info(f"wrote {args.out} ({len(arms)} arms x {len(modes)} modes)") + # CSV reproducibility source (mirrors the dynamics plots' dump): per (mode, arm) + # the deploy hack/solve mean +/- std-across-seeds, exactly what the dots encode. + csv_path = args.out.with_suffix(".csv") + with csv_path.open("w", newline="") as f: + w = csv.writer(f) + w.writerow(["mode", "in_dist", "arm", "n_seed", + "deploy_hack_mean", "deploy_hack_std", "deploy_solve_mean", "deploy_solve_std"]) + for arm in arms: + hk_m, hk_s = _mode_stats(by_arm, arm, modes, "deploy_hack") + sv_m, sv_s = _mode_stats(by_arm, arm, modes, "deploy_solve") + for j, m in enumerate(modes): + w.writerow([m, m == "run_tests", ARM[arm][0], len(by_arm[arm]), + f"{hk_m[j]:.6f}", f"{hk_s[j]:.6f}", f"{sv_m[j]:.6f}", f"{sv_s[j]:.6f}"]) + logger.info(f"wrote {args.out} and {csv_path.name} ({len(arms)} arms x {len(modes)} modes)") if __name__ == "__main__": diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py index d301ced..4124564 100644 --- a/scripts/plot_dynamics.py +++ b/scripts/plot_dynamics.py @@ -355,6 +355,7 @@ def _overlay_panel(ax, by_arm, arms, key, *, label, with_onset, label_arms, ylim endpoint (de-collided in y). An arm whose mean series sits at zero gets a "$\\approx 0$" tag so a pinned-at-zero line reads as a finding, not a missing line.""" ends = [] # (y_endpoint, x_endpoint, arm, color, is_zero) for direct labels + onset_steps = [] # mean-onset across arms -> ONE labeled vertical line (see below) for arm in arms: rs = [r for r in by_arm[arm] if key in r] if not rs: @@ -370,11 +371,15 @@ def _overlay_panel(ax, by_arm, arms, key, *, label, with_onset, label_arms, ylim xm = rs[0]["steps"][:L] ax.plot(xm, ym, color=color, lw=2.0, solid_capstyle="round") if with_onset: - onsets = [s for r in rs if (s := _onset(r["steps"], r["hack_s"])) is not None] - if onsets: - s0 = float(np.mean(onsets)) - ax.plot(s0, np.interp(s0, xm, ym), marker="o", ms=4, color=color, zorder=3) + onset_steps += [s for r in rs if (s := _onset(r["steps"], r["hack_s"])) is not None] ends.append((float(ym[-1]), float(xm[-1]), arm, color, float(np.nanmax(ym)) < 0.02)) + # First-hack as a labeled vertical line (matches the small-multiples), not a dot: + # a dashed rule reads as "emergence starts here" across both arms in one mark. + if with_onset and onset_steps: + s0 = float(np.mean(onset_steps)) + ax.axvline(s0, color="0.55", lw=0.8, ls=(0, (4, 3)), zorder=0) + ax.annotate("first hack", (s0, ylim[1]), color="0.4", fontsize=7, + xytext=(2, -2), textcoords="offset points", va="top") ax.set_ylim(*ylim) ax.set_ylabel(label) ax.spines[["top", "right"]].set_visible(False)