From 25334ec574fcdba66a1eaed45b5fd06204b68e3d Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Mon, 27 Apr 2026 07:00:09 +0800
Subject: [PATCH] fix daily-dilemmas cross-adapter baseline

---
 RESEARCH_JOURNAL.md     | 22 +++++++++++++---------
 nbs/cross_adapter_v9.py | 25 ++++++++++++++++++-------
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/RESEARCH_JOURNAL.md b/RESEARCH_JOURNAL.md
index 79a07ff..f2183e5 100644
--- a/RESEARCH_JOURNAL.md
+++ b/RESEARCH_JOURNAL.md
@@ -170,8 +170,12 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.
 
 ### tl;dr
 
-- DeLoRA is the strongest behavioral steerer by a large margin (delta = +0.94
-  logratio at coeff=+1 vs base, ~1.5x DoRA/PiSSA, ~2x LoRA/OFT, ~3.5x IA3).
+- DeLoRA is the strongest daily-dilemmas steerer by a large margin (base-persona
+  delta = +0.70 logratio at coeff=+1 vs coeff=0, ~1.7-1.9x DoRA/PiSSA,
+  ~2.7-3.0x LoRA/OFT, ~23x IA3). NOTE: an earlier table used +0.94 because
+  `dilemmas_summary.csv` mixed base@0 with the AxBench `honest_engineer`
+  persona baseline at coeff=0. `cross_adapter_v9.py` now uses base-only
+  `dilemmas_per_row.csv`.
 - Every linear "find the planning subspace" method we tried lands at ~1-8%
   subspace overlap with the weight oracle. Across 6 adapter families, on every
   LoRA layer (8-21). Both cumulative and block-local act oracles. So either
@@ -180,16 +184,16 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.
 
 ### Headline numbers (cross_adapter_v9)
 
-Behavioral steering on daily-dilemmas honesty subset (logratio_honesty, n=100):
+Behavioral steering on daily-dilemmas honesty subset (base persona only; 100 dilemmas / 200 action rows per nonzero coeff):
 
 | adapter | logratio @ -1 | @ 0 (base) | @ +1 | delta(+1 - 0) |
 |---------|---------------|------------|------|---------------|
-| delora  | -0.29         | 1.08       | 2.02 | **+0.94**     |
-| dora    |  0.73         | 1.08       | 1.72 | +0.64         |
-| pissa   |  0.44         | 1.08       | 1.69 | +0.60         |
-| oft     |  1.09         | 1.08       | 1.57 | +0.49         |
-| lora    |  1.09         | 1.08       | 1.55 | +0.47         |
-| ia3     |  1.29         | 1.08       | 1.35 | +0.26         |
+| delora  | -0.29         | 1.32       | 2.02 | **+0.70**     |
+| dora    |  0.73         | 1.32       | 1.72 | +0.41         |
+| pissa   |  0.44         | 1.32       | 1.69 | +0.37         |
+| oft     |  1.09         | 1.32       | 1.57 | +0.26         |
+| lora    |  1.09         | 1.32       | 1.55 | +0.23         |
+| ia3     |  1.29         | 1.32       | 1.35 | +0.03         |
 
 DeLoRA is the only adapter that meaningfully *de*-steers (negative coeff →
 dishonest). LoRA/OFT/IA3 are nearly flat at coeff=-1.
diff --git a/nbs/cross_adapter_v9.py b/nbs/cross_adapter_v9.py
index f9a06f1..f309921 100644
--- a/nbs/cross_adapter_v9.py
+++ b/nbs/cross_adapter_v9.py
@@ -83,19 +83,28 @@ scope_summary.write_csv(OUT_DIR / "scope_summary.csv")
 # %%
 dil_rows = []
 for adapter in ADAPTERS:
-    df = safe_read_csv(ROOT / adapter / "dilemmas_summary.csv")
+    df = safe_read_csv(ROOT / adapter / "dilemmas_per_row.csv")
     if df is None:
         continue
-    # mean over coeff=+1 minus coeff=0 = behavioral steering effect (more honest)
-    if 0.0 not in df["coeff"].to_list() or 1.0 not in df["coeff"].to_list():
+    base_df = df.filter(pl.col("persona") == "base")
+    summary = base_df.group_by("coeff").agg(
+        pl.col("logratio_honesty").mean().alias("mean_logratio_honesty"),
+        pl.col("pmass").mean().alias("mean_pmass"),
+        pl.len().alias("n"),
+    )
+    # Steering effect = mean logratio at coeff=+1 minus mean at coeff=0, base persona only
+    # (higher = more honest). Important: dilemmas_summary.csv also includes AxBench persona
+    # baselines at coeff=0, so using it silently averages base@0 with honest_engineer@0.
+    if 0.0 not in summary["coeff"].to_list() or 1.0 not in summary["coeff"].to_list():
         logger.warning(f"{adapter} dilemmas missing coeffs 0,1")
         continue
-    base = float(df.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
-    pos = float(df.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
+    base = float(summary.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
+    pos = float(summary.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
     neg = (
-        float(df.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
-        if -1.0 in df["coeff"].to_list() else float("nan")
+        float(summary.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
+        if -1.0 in summary["coeff"].to_list() else float("nan")
     )
+    pos_pmass = float(summary.filter(pl.col("coeff") == 1.0)["mean_pmass"][0])
     dil_rows.append({
         "adapter": adapter,
         "logratio_at_neg1": neg,
@@ -103,6 +112,8 @@ for adapter in ADAPTERS:
         "logratio_at_pos1": pos,
         "delta_pos_minus_zero": pos - base,
         "delta_pos_minus_neg": pos - neg,
+        "pmass_at_pos1": pos_pmass,
+        "n_base_rows_per_coeff": int(summary.filter(pl.col("coeff") == 1.0)["n"][0]),
     })
 
 dil_summary = pl.DataFrame(dil_rows)