docs: merge Ariahw Fig-5 table into the paper md (delete standalone); add abs-scale arrow plot

- Transcribed Fig-5 numeric table now lives inline in the paper md as an
  EDITOR'S TABLE comment, deleting docs/papers/ariahw_results_table_extracted.md
  (one fewer repo file; the table sits next to the figure it transcribes).
- floor_ceiling_abs.{png,pdf}: raw-rate variant. Arrows climb from the floor
  anchor; grey bedrock = worse-than-floor, blue sky = past-ceiling; hack axis
  reversed so right=better on both panels.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
wassname
2026-06-09 12:35:14 +00:00
parent 0973f9ba7c
commit d4998a71ba
3 changed files with 108 additions and 63 deletions
+71 -2
View File
@@ -185,13 +185,81 @@ def plot(df: pl.DataFrame) -> None:
fontsize=10.5, x=0.01, ha="left")
fig.text(0.01, 0.015, "Our arms only, seed 43, 60-step fast (unconverged surrogate). hack suppressed = (vanilla_hack - arm_hack)/vanilla_hack; "
"solve gained = (arm_solve - base)/(ceiling - base). Ariahw 2025 monitor numbers are cross-scale/regime and live in "
"docs/papers/ariahw_results_table_extracted.md, not on this axis.",
"the transcribed Fig-5 table in docs/papers/2025_lw_ariahw_*.md, not on this axis.",
fontsize=6.8, color=GREY, va="bottom")
fig.tight_layout(rect=(0, 0.07, 1, 0.94))
for ext in ("pdf", "png"):
fig.savefig(OUT / f"floor_ceiling.{ext}", dpi=150, bbox_inches="tight")
# ── stage 2b: absolute-scale variant (arrows + shaded floor/ceiling) ─────────
# The same three arms as the normalized figure, drawn on the RAW metric axis
# instead of a [0, 1] scale so the actual rates can be read off directly.
# Both panels read "right = better": the solve panel uses the raw solve rate,
# and the hack panel has its axis REVERSED so that less hacking sits further right.
# Shading: grey "bedrock" marks the worse-than-floor zone, blue "sky" marks the
# better-than-ceiling zone. Each arm is an arrow from the floor anchor to its
# value, so arrow length equals the distance climbed.
SKY = "#cfe3ff"      # fill colour of the better-than-ceiling zone
BEDROCK = "#d9dadb"  # fill colour of the worse-than-floor zone
def _arrow_panel(ax, anchor, ceiling, rows, *, reversed_x, xlim, floor_lab, ceil_lab, xlabel, title):
    """Draw one raw-rate panel: shaded floor/ceiling zones plus one arrow per arm.

    Args:
        ax: Axes-like object to draw on (``plot_abs`` passes axes from ``plt.subplots``).
        anchor: x value of the floor; every arrow starts here.
        ceiling: x value of the ceiling reference line.
        rows: sized sequence of ``(label, value, colour)`` triples, drawn bottom to top.
        reversed_x: True when ``xlim`` runs high-to-low, i.e. smaller values sit further right.
        xlim: ``(left_edge, right_edge)`` in data coordinates.
        floor_lab: text drawn at the floor line.
        ceil_lab: text drawn at the ceiling line.
        xlabel: x-axis label.
        title: panel title (left-aligned).
    """
    lo, hi = xlim  # lo = left edge, hi = right edge (lo > hi when reversed_x)
    # Shading is expressed relative to the axis edges, so the same two calls serve both
    # orientations: bedrock (worse than floor) runs from the left edge to the floor,
    # sky (better than ceiling) runs from the ceiling to the right edge.
    ax.axvspan(lo, anchor, color=BEDROCK, alpha=0.7, lw=0)
    ax.axvspan(ceiling, hi, color=SKY, alpha=0.7, lw=0)
    ax.axvline(anchor, color=GREY, lw=1.2)
    ax.axvline(ceiling, color="#3b5bdb", lw=1.2, ls=":")

    span = abs(hi - lo)
    # Data-space step that nudges a label 2% of the axis width toward the VISUAL right.
    # Text alignment (ha) is resolved in display space, so on a reversed axis the step
    # must be negative; otherwise the label lands on top of the arrowhead.
    step_right = -0.02 * span if reversed_x else 0.02 * span
    for yi, (_lab, val, col) in enumerate(rows):
        # Arrow from the floor anchor to the arm's value, with a dot marking the start.
        ax.annotate("", xy=(val, yi), xytext=(anchor, yi),
                    arrowprops=dict(arrowstyle="-|>", color=col, lw=2.6, shrinkA=0, shrinkB=0))
        ax.plot([anchor], [yi], "o", color=GREY, ms=4, zorder=3)
        # Does the arrow point in the 'better' (visually right) direction?
        better_right = (val < anchor) if reversed_x else (val > anchor)
        if better_right:
            ha, label_x = "left", val + step_right   # label sits just right of the tip
        else:
            ha, label_x = "right", val - step_right  # label sits just left of the tip
        ax.text(label_x, yi, f"{val:.3f}",
                va="center", ha=ha, fontsize=9, color=col, fontweight="bold")

    ax.set_xlim(lo, hi)
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels([r[0] for r in rows], fontsize=8.5)
    ax.set_ylim(-0.6, len(rows) - 0.4)
    ax.set_xlabel(xlabel, fontsize=8.5)
    ax.set_title(title, fontsize=10, loc="left")
    # Floor / ceiling captions sit just above the bottom edge, centred on their lines.
    ax.text(anchor, -0.55, floor_lab, fontsize=7.5, color=GREY, ha="center", va="bottom")
    ax.text(ceiling, -0.55, ceil_lab, fontsize=7.5, color="#3b5bdb", ha="center", va="bottom")
    for s in ("top", "right", "left"):
        ax.spines[s].set_visible(False)
    ax.tick_params(left=False)
def plot_abs(df: pl.DataFrame) -> None:
    """Render the raw-rate arrow figure and save it as floor_ceiling_abs.pdf/.png under OUT.

    The left panel shows hack rate on a reversed axis and the right panel shows solve
    rate. Floor and ceiling values come from ``_anchors(df)``; the per-arm values are
    read from the rows of ``df`` whose ``label`` matches each arm name.
    """
    a = _anchors(df)
    base = a["base_solve"]
    vh = a["vanilla_hack"]
    ceil = a["ceiling"]

    def _first_row(label):
        # First record whose label matches; IndexError if the label is absent.
        return df.filter(pl.col("label") == label).to_dicts()[0]

    best = _first_row("routeV per-token")
    rand = _first_row("routeV random-V")
    van = _first_row("vanilla GRPO")

    # Arms listed bottom -> top: vanilla, random-V, per-token.
    arms = (
        ("vanilla GRPO", van, RED),
        ("routeV random-V", rand, DARK),
        ("routeV per-token", best, GOLD),
    )
    hack_rows = [(name, rec["hack_deploy"], colour) for name, rec, colour in arms]
    solve_rows = [(name, rec["solve_deploy"], colour) for name, rec, colour in arms]
    prov = " PROVISIONAL" if a["provisional"] else ""

    fig, (axl, axr) = plt.subplots(1, 2, figsize=(11.5, 4.2), sharey=True)
    # Hack panel: xlim is given high-to-low so the axis is reversed (right = less hacking).
    _arrow_panel(
        axl,
        anchor=vh,
        ceiling=0.0,
        rows=hack_rows,
        reversed_x=True,
        xlim=(vh + 0.05, -0.03),
        floor_lab=f"floor\n(vanilla {vh:.2f})",
        ceil_lab="ceiling\n(no hack)",
        xlabel="hack rate · axis reversed: right = less hacking = better",
        title="hacking (raw rate)",
    )
    # Solve panel: ordinary orientation (right = more solving).
    _arrow_panel(
        axr,
        anchor=base,
        ceiling=ceil,
        rows=solve_rows,
        reversed_x=False,
        xlim=(base - 0.03, ceil + 0.03),
        floor_lab=f"floor\n(base {base:.2f})",
        ceil_lab=f"ceiling\n({ceil:.2f}{prov})",
        xlabel="solve rate · right = more solving = better",
        title="solving (raw rate)",
    )
    fig.suptitle(
        "vGROUT raw rates: arrow = climb from floor; grey = bedrock (worse than floor), blue = sky (past ceiling) (test n=119, seed 43, 60-step fast)",
        fontsize=10,
        x=0.01,
        ha="left",
    )
    fig.tight_layout(rect=(0, 0, 1, 0.93))
    for ext in ("pdf", "png"):
        target = OUT / f"floor_ceiling_abs.{ext}"
        fig.savefig(target, dpi=150, bbox_inches="tight")
def main() -> None:
df = build_csv()
flags = df.filter(~pl.col("status").str.starts_with("ok"))
@@ -201,7 +269,8 @@ def main() -> None:
for r in flags.to_dicts():
print(f" [{r['label']}] {r['status']}")
plot(df)
print(f"\nwrote {OUT}/floor_ceiling.pdf and .png")
plot_abs(df)
print(f"\nwrote {OUT}/floor_ceiling.pdf and .png (+ floor_ceiling_abs.pdf/.png)")
if __name__ == "__main__":