fix daily-dilemmas cross-adapter baseline

2026-06-27 18:27:18 +08:00 · 2026-04-27 07:00:09 +08:00
parent 6f41e47ea9
commit 25334ec574
2 changed files with 31 additions and 16 deletions
@@ -170,8 +170,12 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.

 ### tl;dr

- DeLoRA is the strongest behavioral steerer by a large margin (delta = +0.94
-  logratio at coeff=+1 vs base, ~1.5x DoRA/PiSSA, ~2x LoRA/OFT, ~3.5x IA3).
+- DeLoRA is the strongest daily-dilemmas steerer by a large margin (base-only
+  delta = +0.70 logratio at coeff=+1 vs base, ~1.7-1.9x DoRA/PiSSA,
+  ~2.7-3x OFT/LoRA, ~23x IA3). NOTE: an earlier table used +0.94 because
+  `dilemmas_summary.csv` mixed base@0 with the AxBench `honest_engineer`
+  persona baseline at coeff=0. `cross_adapter_v9.py` now uses base-only
+  `dilemmas_per_row.csv`.
 - Every linear "find the planning subspace" method we tried lands at ~1-8%
  subspace overlap with the weight oracle. Across 6 adapter families, on every
  LoRA layer (8-21). Both cumulative and block-local act oracles. So either
@@ -180,16 +184,16 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.

 ### Headline numbers (cross_adapter_v9)

-Behavioral steering on daily-dilemmas honesty subset (logratio_honesty, n=100):
+Behavioral steering on daily-dilemmas honesty subset (base persona only; 100 dilemmas / 200 action rows per nonzero coeff):

 | adapter | logratio @ -1 | @ 0 (base) | @ +1 | delta(+1 - 0) |
 |---------|---------------|------------|------|---------------|
-| delora  | -0.29         | 1.08       | 2.02 | **+0.94**     |
-| dora    |  0.73         | 1.08       | 1.72 | +0.64         |
-| pissa   |  0.44         | 1.08       | 1.69 | +0.60         |
-| oft     |  1.09         | 1.08       | 1.57 | +0.49         |
-| lora    |  1.09         | 1.08       | 1.55 | +0.47         |
-| ia3     |  1.29         | 1.08       | 1.35 | +0.26         |
+| delora  | -0.29         | 1.32       | 2.02 | **+0.70**     |
+| dora    |  0.73         | 1.32       | 1.72 | +0.41         |
+| pissa   |  0.44         | 1.32       | 1.69 | +0.37         |
+| oft     |  1.09         | 1.32       | 1.57 | +0.26         |
+| lora    |  1.09         | 1.32       | 1.55 | +0.23         |
+| ia3     |  1.29         | 1.32       | 1.35 | +0.03         |

 DeLoRA is the only adapter that meaningfully *de*-steers (negative coeff →
 dishonest, logratio < 0). LoRA/OFT (-0.23 vs base) and IA3 (-0.03) barely move at coeff=-1.
@@ -83,19 +83,28 @@ scope_summary.write_csv(OUT_DIR / "scope_summary.csv")
 # %%
 dil_rows = []
 for adapter in ADAPTERS:
-    df = safe_read_csv(ROOT / adapter / "dilemmas_summary.csv")
+    df = safe_read_csv(ROOT / adapter / "dilemmas_per_row.csv")
    if df is None:
        continue
-    # mean over coeff=+1 minus coeff=0 = behavioral steering effect (more honest)
-    if 0.0 not in df["coeff"].to_list() or 1.0 not in df["coeff"].to_list():
+    base_df = df.filter(pl.col("persona") == "base")
+    summary = base_df.group_by("coeff").agg(
+        pl.col("logratio_honesty").mean().alias("mean_logratio_honesty"),
+        pl.col("pmass").mean().alias("mean_pmass"),
+        pl.len().alias("n"),
+    )
+    # mean over coeff=+1 minus base coeff=0 = behavioral steering effect (more honest).
+    # Important: dilemmas_summary.csv also includes AxBench persona baselines at coeff=0,
+    # so using it silently averages base@0 with honest_engineer@0.
+    if 0.0 not in summary["coeff"].to_list() or 1.0 not in summary["coeff"].to_list():
        logger.warning(f"{adapter} dilemmas missing coeffs 0,1")
        continue
-    base = float(df.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
-    pos = float(df.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
+    base = float(summary.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
+    pos = float(summary.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
    neg = (
-        float(df.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
-        if -1.0 in df["coeff"].to_list() else float("nan")
+        float(summary.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
+        if -1.0 in summary["coeff"].to_list() else float("nan")
    )
+    pos_pmass = float(summary.filter(pl.col("coeff") == 1.0)["mean_pmass"][0])
    dil_rows.append({
        "adapter": adapter,
        "logratio_at_neg1": neg,
@@ -103,6 +112,8 @@ for adapter in ADAPTERS:
        "logratio_at_pos1": pos,
        "delta_pos_minus_zero": pos - base,
        "delta_pos_minus_neg": pos - neg,
+        "pmass_at_pos1": pos_pmass,
+        "n_base_rows_per_coeff": int(summary.filter(pl.col("coeff") == 1.0)["n"][0]),
    })

 dil_summary = pl.DataFrame(dil_rows)