fix daily-dilemmas cross-adapter baseline

This commit is contained in:
wassname
2026-04-27 07:00:09 +08:00
parent 6f41e47ea9
commit 25334ec574
2 changed files with 31 additions and 16 deletions
+13 -9
View File
@@ -170,8 +170,12 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.
### tl;dr
- DeLoRA is the strongest behavioral steerer by a large margin (delta = +0.94
logratio at coeff=+1 vs base, ~1.5x DoRA/PiSSA, ~2x LoRA/OFT, ~3.5x IA3).
- DeLoRA is the strongest daily-dilemmas steerer by a large margin (base-persona
delta = +0.70 logratio at coeff=+1 vs coeff=0, ~1.7-1.9x DoRA/PiSSA,
~2.7-3.0x LoRA/OFT, ~23x IA3). NOTE: an earlier table reported +0.94 because
the coeff=0 baseline in `dilemmas_summary.csv` averaged base@0 with the AxBench
`honest_engineer` persona at coeff=0. `cross_adapter_v9.py` now computes the
baseline from `dilemmas_per_row.csv`, filtered to the base persona.
- Every linear "find the planning subspace" method we tried lands at ~1-8%
subspace overlap with the weight oracle. Across 6 adapter families, on every
LoRA layer (8-21). Both cumulative and block-local act oracles. So either
@@ -180,16 +184,16 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.
### Headline numbers (cross_adapter_v9)
Behavioral steering on daily-dilemmas honesty subset (logratio_honesty, n=100):
Behavioral steering on daily-dilemmas honesty subset (base persona only; 100 dilemmas / 200 action rows per nonzero coeff):
| adapter | logratio @ -1 | @ 0 (base) | @ +1 | delta(+1 - 0) |
|---------|---------------|------------|------|---------------|
| delora | -0.29 | 1.08 | 2.02 | **+0.94** |
| dora | 0.73 | 1.08 | 1.72 | +0.64 |
| pissa | 0.44 | 1.08 | 1.69 | +0.60 |
| oft | 1.09 | 1.08 | 1.57 | +0.49 |
| lora | 1.09 | 1.08 | 1.55 | +0.47 |
| ia3 | 1.29 | 1.08 | 1.35 | +0.26 |
| delora | -0.29 | 1.32 | 2.02 | **+0.70** |
| dora | 0.73 | 1.32 | 1.72 | +0.41 |
| pissa | 0.44 | 1.32 | 1.69 | +0.37 |
| oft | 1.09 | 1.32 | 1.57 | +0.26 |
| lora | 1.09 | 1.32 | 1.55 | +0.23 |
| ia3 | 1.29 | 1.32 | 1.35 | +0.03 |
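DeLoRA is the only adapter that *de*-steers all the way to a negative logratio
(negative coeff → dishonest: -0.29 at coeff=-1); PiSSA (0.44) and DoRA (0.73)
drop below base but stay positive. LoRA/OFT/IA3 are nearly flat at coeff=-1.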
+18 -7
View File
@@ -83,19 +83,28 @@ scope_summary.write_csv(OUT_DIR / "scope_summary.csv")
# %%
dil_rows = []
for adapter in ADAPTERS:
df = safe_read_csv(ROOT / adapter / "dilemmas_summary.csv")
df = safe_read_csv(ROOT / adapter / "dilemmas_per_row.csv")
if df is None:
continue
# mean over coeff=+1 minus coeff=0 = behavioral steering effect (more honest)
if 0.0 not in df["coeff"].to_list() or 1.0 not in df["coeff"].to_list():
base_df = df.filter(pl.col("persona") == "base")
summary = base_df.group_by("coeff").agg(
pl.col("logratio_honesty").mean().alias("mean_logratio_honesty"),
pl.col("pmass").mean().alias("mean_pmass"),
pl.len().alias("n"),
)
# Behavioral steering effect = mean logratio at coeff=+1 minus mean at coeff=0,
# base persona only (positive = more honest).
# Important: dilemmas_summary.csv also includes AxBench persona baselines at coeff=0,
# so reading it instead would silently average base@0 with honest_engineer@0.
if 0.0 not in summary["coeff"].to_list() or 1.0 not in summary["coeff"].to_list():
logger.warning(f"{adapter} dilemmas missing coeffs 0,1")
continue
base = float(df.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
pos = float(df.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
base = float(summary.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
pos = float(summary.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
neg = (
float(df.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
if -1.0 in df["coeff"].to_list() else float("nan")
float(summary.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
if -1.0 in summary["coeff"].to_list() else float("nan")
)
pos_pmass = float(summary.filter(pl.col("coeff") == 1.0)["mean_pmass"][0])
dil_rows.append({
"adapter": adapter,
"logratio_at_neg1": neg,
@@ -103,6 +112,8 @@ for adapter in ADAPTERS:
"logratio_at_pos1": pos,
"delta_pos_minus_zero": pos - base,
"delta_pos_minus_neg": pos - neg,
"pmass_at_pos1": pos_pmass,
"n_base_rows_per_coeff": int(summary.filter(pl.col("coeff") == 1.0)["n"][0]),
})
dil_summary = pl.DataFrame(dil_rows)