mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 20:37:22 +08:00
f82a4f034d
route2 deploy hack collapses for ANY v_grad (real/placebo/Haar) but solve tracks direction (real>placebo>Haar). TODO names the load-bearing confound: full-teacher runs force-route all teacher rows by label (hack_anchor), so the hack-axis collapse is direction-free force-routing not the cosine gate; clean test = A5 run_tests-only regime (pending). n=1 interim. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
53 lines
2.5 KiB
Python
53 lines
2.5 KiB
Python
"""Directionality scatter: deploy hack (x) vs deploy solve (y) for route2 with
different v_grad directions. Reads data/directionality.csv, writes
figs/directionality.{png,pdf}.

Two findings in one plot:
- HACK axis: every routing arm collapses to ~0 regardless of direction (real,
  semantic placebo, even out-of-subspace Haar). Only vanilla sits out at 0.32.
  => hack suppression is mechanical (H2 quarantine-absorption), not alignment.
- SOLVE axis: the real hack direction recovers the most solve (0.625); semantic
  placebos sit mid (~0.53-0.58); out-of-subspace Haar is lowest (0.516, barely
  above vanilla). => routing the genuinely hack-enriched gradient wastes less
  solve-gradient, so direction earns its keep on SOLVE even though it doesn't on
  hack. This is the thin H4 residual.

n=1 per placebo / Haar draw so far; seed replicates (Haar d1/d2 job 118/122,
null_city s43 job 121) and the erase-arm discriminator (job 127/128) are pending.
"""
from pathlib import Path

import matplotlib.pyplot as plt
import polars as pl
from matplotlib.lines import Line2D

# All paths are resolved relative to this script: the CSV is read from the
# sibling ``data`` directory one level up, and figures are written next to
# this file.
HERE = Path(__file__).parent
df = pl.read_csv(HERE.parent / "data" / "directionality.csv")

# Colour is keyed on the ``direction_type`` column and marker shape on the
# ``subspace`` column. A value missing from either table raises KeyError in
# the loop below rather than being plotted with a silent default.
colors = {"none": "#888888", "real": "#1b7837", "placebo": "#c1272d", "random": "#2166ac"}
markers = {"in": "o", "out": "s", "na": "D"}

fig, ax = plt.subplots(figsize=(5.2, 3.6))

# One labelled point per CSV row: x = deploy_hack, y = deploy_solve, with the
# ``arm`` column as the text label offset slightly up and to the right.
for row in df.iter_rows(named=True):
    ax.scatter(
        row["deploy_hack"],
        row["deploy_solve"],
        s=70,
        c=colors[row["direction_type"]],
        marker=markers[row["subspace"]],
        edgecolors="white",
        linewidths=0.8,
        zorder=3,  # keep points above the reference line drawn at zorder=0
    )
    ax.annotate(
        row["arm"],
        (row["deploy_hack"], row["deploy_solve"]),
        textcoords="offset points",
        xytext=(7, 3),
        fontsize=7.5,
    )

# Light vertical reference line at hack rate 0.
ax.axvline(0, color="#cccccc", lw=0.8, zorder=0)
ax.set_xlabel("deploy hack rate (lower = suppressed)")
ax.set_ylabel("deploy solve rate (higher = better)")
# NOTE: axis limits are fixed, so rows outside x in [-0.04, 0.40] or
# y in [0.45, 0.66] will not be visible in the figure.
ax.set_xlim(-0.04, 0.40)
ax.set_ylim(0.45, 0.66)
ax.spines[["top", "right"]].set_visible(False)

# Legend for direction type (colour) only, built from proxy handles. Every
# handle uses the "o" marker, so the marker shapes from ``markers`` are not
# explained by the legend.
leg = [
    Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=8, label=label)
    for label, color in [
        ("vanilla (no route)", colors["none"]),
        ("real hack dir", colors["real"]),
        ("semantic placebo", colors["placebo"]),
        ("Haar random (out)", colors["random"]),
    ]
]
ax.legend(handles=leg, frameon=False, fontsize=7.5, loc="upper right")

fig.tight_layout()
# Save the same figure as both PNG and PDF next to this script.
for ext in ("png", "pdf"):
    fig.savefig(HERE / f"directionality.{ext}", dpi=150, bbox_inches="tight")
print("wrote", HERE / "directionality.png")