mirror of
https://github.com/wassname/weight-steering.git
synced 2026-06-27 18:27:18 +08:00
fix daily-dilemmas cross-adapter baseline
This commit is contained in:
+13
-9
@@ -170,8 +170,12 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.
|
||||
|
||||
### tl;dr
|
||||
|
||||
- DeLoRA is the strongest behavioral steerer by a large margin (delta = +0.94
|
||||
logratio at coeff=+1 vs base, ~1.5x DoRA/PiSSA, ~2x LoRA/OFT, ~3.5x IA3).
|
||||
- DeLoRA is the strongest daily-dilemmas steerer by a large margin (base-only
|
||||
delta = +0.70 logratio at coeff=+1 vs base, ~1.7-1.9x DoRA/PiSSA,
|
||||
~2.7-3x LoRA/OFT, ~23x IA3). NOTE: an earlier table used +0.94 because
|
||||
`dilemmas_summary.csv` mixed base@0 with the AxBench `honest_engineer`
|
||||
persona baseline at coeff=0. `cross_adapter_v9.py` now uses base-only
|
||||
`dilemmas_per_row.csv`.
|
||||
- Every linear "find the planning subspace" method we tried lands at ~1-8%
|
||||
subspace overlap with the weight oracle. Across 6 adapter families, on every
|
||||
LoRA layer (8-21). Both cumulative and block-local act oracles. So either
|
||||
@@ -180,16 +184,16 @@ monotonicity and perfect pmass but the "honest engineer" persona backfires at 1.
|
||||
|
||||
### Headline numbers (cross_adapter_v9)
|
||||
|
||||
Behavioral steering on daily-dilemmas honesty subset (logratio_honesty, n=100):
|
||||
Behavioral steering on daily-dilemmas honesty subset (base persona only; 100 dilemmas / 200 action rows per nonzero coeff):
|
||||
|
||||
| adapter | logratio @ -1 | @ 0 (base) | @ +1 | delta(+1 - 0) |
|
||||
|---------|---------------|------------|------|---------------|
|
||||
| delora | -0.29 | 1.08 | 2.02 | **+0.94** |
|
||||
| dora | 0.73 | 1.08 | 1.72 | +0.64 |
|
||||
| pissa | 0.44 | 1.08 | 1.69 | +0.60 |
|
||||
| oft | 1.09 | 1.08 | 1.57 | +0.49 |
|
||||
| lora | 1.09 | 1.08 | 1.55 | +0.47 |
|
||||
| ia3 | 1.29 | 1.08 | 1.35 | +0.26 |
|
||||
| delora | -0.29 | 1.32 | 2.02 | **+0.70** |
|
||||
| dora | 0.73 | 1.32 | 1.72 | +0.41 |
|
||||
| pissa | 0.44 | 1.32 | 1.69 | +0.37 |
|
||||
| oft | 1.09 | 1.32 | 1.57 | +0.26 |
|
||||
| lora | 1.09 | 1.32 | 1.55 | +0.23 |
|
||||
| ia3 | 1.29 | 1.32 | 1.35 | +0.03 |
|
||||
|
||||
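DeLoRA is the only adapter that *de*-steers all the way to a negative logratio (negative coeff →
|
||||
dishonest). Relative to the base-only coeff=0 value (1.32), IA3 is nearly flat at coeff=-1 and LoRA/OFT drop by only ~0.23; DoRA/PiSSA drop further but stay positive.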
|
||||
|
||||
+18
-7
@@ -83,19 +83,28 @@ scope_summary.write_csv(OUT_DIR / "scope_summary.csv")
|
||||
# %%
|
||||
dil_rows = []
|
||||
for adapter in ADAPTERS:
|
||||
df = safe_read_csv(ROOT / adapter / "dilemmas_summary.csv")
|
||||
df = safe_read_csv(ROOT / adapter / "dilemmas_per_row.csv")
|
||||
if df is None:
|
||||
continue
|
||||
# mean over coeff=+1 minus coeff=0 = behavioral steering effect (more honest)
|
||||
if 0.0 not in df["coeff"].to_list() or 1.0 not in df["coeff"].to_list():
|
||||
base_df = df.filter(pl.col("persona") == "base")
|
||||
summary = base_df.group_by("coeff").agg(
|
||||
pl.col("logratio_honesty").mean().alias("mean_logratio_honesty"),
|
||||
pl.col("pmass").mean().alias("mean_pmass"),
|
||||
pl.len().alias("n"),
|
||||
)
|
||||
# Behavioral steering effect = mean logratio at coeff=+1 minus mean at coeff=0, base persona only (positive = more honest).
|
||||
# Important: dilemmas_summary.csv also includes AxBench persona baselines at coeff=0,
|
||||
# so using it silently averages base@0 with honest_engineer@0.
|
||||
if 0.0 not in summary["coeff"].to_list() or 1.0 not in summary["coeff"].to_list():
|
||||
logger.warning(f"{adapter} dilemmas missing coeffs 0,1")
|
||||
continue
|
||||
base = float(df.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
|
||||
pos = float(df.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
|
||||
base = float(summary.filter(pl.col("coeff") == 0.0)["mean_logratio_honesty"][0])
|
||||
pos = float(summary.filter(pl.col("coeff") == 1.0)["mean_logratio_honesty"][0])
|
||||
neg = (
|
||||
float(df.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
|
||||
if -1.0 in df["coeff"].to_list() else float("nan")
|
||||
float(summary.filter(pl.col("coeff") == -1.0)["mean_logratio_honesty"][0])
|
||||
if -1.0 in summary["coeff"].to_list() else float("nan")
|
||||
)
|
||||
pos_pmass = float(summary.filter(pl.col("coeff") == 1.0)["mean_pmass"][0])
|
||||
dil_rows.append({
|
||||
"adapter": adapter,
|
||||
"logratio_at_neg1": neg,
|
||||
@@ -103,6 +112,8 @@ for adapter in ADAPTERS:
|
||||
"logratio_at_pos1": pos,
|
||||
"delta_pos_minus_zero": pos - base,
|
||||
"delta_pos_minus_neg": pos - neg,
|
||||
"pmass_at_pos1": pos_pmass,
|
||||
"n_base_rows_per_coeff": int(summary.filter(pl.col("coeff") == 1.0)["n"][0]),
|
||||
})
|
||||
|
||||
dil_summary = pl.DataFrame(dil_rows)
|
||||
|
||||
Reference in New Issue
Block a user