From 25334ec574fcdba66a1eaed45b5fd06204b68e3d Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Mon, 27 Apr 2026 07:00:09 +0800
Subject: [PATCH] fix daily-dilemmas cross-adapter baseline

---
 RESEARCH_JOURNAL.md     | 22 +++++++++++++---------
 nbs/cross_adapter_v9.py | 25 ++++++++++++++++++-------
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/RESEARCH_JOURNAL.md b/RESEARCH_JOURNAL.md
index 79a07ff..f2183e5 100644
--- a/RESEARCH_JOURNAL.md
+++ b/RESEARCH_JOURNAL.md
@@ -170,8 +170,12 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.
 
 ### tl;dr
 
-- DeLoRA is the strongest behavioral steerer by a large margin (delta = +0.94
-  logratio at coeff=+1 vs base, ~1.5x DoRA/PiSSA, ~2x LoRA/OFT, ~3.5x IA3).
+- DeLoRA is the strongest daily-dilemmas steerer by a large margin (base-only
+  delta = +0.70 logratio at coeff=+1 vs base, ~1.7x DoRA/PiSSA,
+  ~2.7x LoRA/OFT, ~23x IA3). NOTE: an earlier table used +0.94 because
+  `dilemmas_summary.csv` mixed base@0 with the AxBench `honest_engineer`
+  persona baseline at coeff=0. `cross_adapter_v9.py` now uses base-only
+  `dilemmas_per_row.csv`.
 - Every linear "find the planning subspace" method we tried lands at ~1-8%
   subspace overlap with the weight oracle. Across 6 adapter families, on every
   LoRA layer (8-21). Both cumulative and block-local act oracles. So either
@@ -180,16 +184,16 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.
 
 ### Headline numbers (cross_adapter_v9)
 
-Behavioral steering on daily-dilemmas honesty subset (logratio_honesty, n=100):
+Behavioral steering on daily-dilemmas honesty subset (base persona only; 100 dilemmas / 200 action rows per nonzero coeff):
 
 | adapter | logratio @ -1 | @ 0 (base) | @ +1 | delta(+1 - 0) |
 |---------|---------------|------------|------|---------------|
-| delora  | -0.29         | 1.08       | 2.02 | **+0.94**     |
-| dora    | 0.73          | 1.08       | 1.72 | +0.64         |
-| pissa   | 0.44          | 1.08       | 1.69 | +0.60         |
-| oft     | 1.09          | 1.08       | 1.57 | +0.49         |
-| lora    | 1.09          | 1.08       | 1.55 | +0.47         |
-| ia3     | 1.29          | 1.08       | 1.35 | +0.26         |
+| delora  | -0.29         | 1.32       | 2.02 | **+0.70**     |
+| dora    | 0.73          | 1.32       | 1.72 | +0.41         |
+| pissa   | 0.44          | 1.32       | 1.69 | +0.37         |
+| oft     | 1.09          | 1.32       | 1.57 | +0.26         |
+| lora    | 1.09          | 1.32       | 1.55 | +0.23         |
+| ia3     | 1.29          | 1.32       | 1.35 | +0.03         |
 
 DeLoRA is the only adapter that meaningfully *de*-steers (negative coeff →
 dishonest). LoRA/OFT/IA3 are nearly flat at coeff=-1.
diff --git a/nbs/cross_adapter_v9.py b/nbs/cross_adapter_v9.py
index f9a06f1..f309921 100644
--- a/nbs/cross_adapter_v9.py
+++ b/nbs/cross_adapter_v9.py
@@ -83,19 +83,28 @@ scope_summary.write_csv(OUT_DIR / "scope_summary.csv")
 # %%
 dil_rows = []
 for adapter in ADAPTERS:
-    df = safe_read_csv(ROOT / adapter / "dilemmas_summary.csv")
+    df = safe_read_csv(ROOT / adapter / "dilemmas_per_row.csv")
     if df is None:
         continue
-    # mean over coeff=+1 minus coeff=0 = behavioral steering effect (more honest)
-    if 0.0 not in df["coeff"].to_list() or 1.0 not in df["coeff"].to_list():
+    base_df = df.filter(pl.col("persona") == "base")
+    summary = base_df.group_by("coeff").agg(
+        pl.col("logratio_honesty").mean().alias("mean_logratio_honesty"),
+        pl.col("pmass").mean().alias("mean_pmass"),
+        pl.len().alias("n"),
+    )
+    # mean over coeff=+1 minus base coeff=0 = behavioral steering effect (more honest).
+    # Important: dilemmas_summary.csv also includes AxBench persona baselines at coeff=0,
+    # so using it silently averages base@0 with honest_engineer@0.
+    if 0.0 not in summary["coeff"].to_list() or 1.0 not in summary["coeff"].to_list():
         logger.warning(f"{adapter} dilemmas missing coeffs 0,1")
         continue
-    base = float(df.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
-    pos = float(df.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
+    base = float(summary.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
+    pos = float(summary.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
     neg = (
-        float(df.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
-        if -1.0 in df["coeff"].to_list() else float("nan")
+        float(summary.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
+        if -1.0 in summary["coeff"].to_list() else float("nan")
     )
+    pos_pmass = float(summary.filter(pl.col("coeff") == 1.0)["mean_pmass"][0])
     dil_rows.append({
         "adapter": adapter,
         "logratio_at_neg1": neg,
@@ -103,6 +112,8 @@ for adapter in ADAPTERS:
         "logratio_at_pos1": pos,
         "delta_pos_minus_zero": pos - base,
         "delta_pos_minus_neg": pos - neg,
+        "pmass_at_pos1": pos_pmass,
+        "n_base_rows_per_coeff": int(summary.filter(pl.col("coeff") == 1.0)["n"][0]),
     })
 
 dil_summary = pl.DataFrame(dil_rows)