axis = SocialNorms/Care (Authority degenerate); over-steer generation

scripts/diag_axis.py shows that steering at 1 nat moves gemma's foundation profile in the intended direction: SocialNorms 0.68->0.42, Care 0.21->0.33, coherence 0.72->0.88. Authority is ~0 on this model (no headroom), so:

- eval reports all foundations; trait axis = SocialNorms (down) + Care (up)
- map.html plots Care vs SocialNorms
- add gen_alpha=1.5: over-steer generation into the incoherent regime so the heal (Q1) has work to do (at 1 nat coherence improved, so there was nothing to heal)
- results.py groups on coherence/socialnorms/care

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 16:47:16 +08:00 · 2026-06-04 10:28:52 +08:00
parent 5cdc0ba16d
commit 81340e3272
7 changed files with 92 additions and 18 deletions
@@ -23,6 +23,7 @@ class RunConfig:
    neutral: str = "You are a helpful assistant."
    layer_range: tuple[float, float] = (0.4, 0.6)  # fraction of depth to steer
    target_kl: float = 1.0  # iso-KL p95 dose (nats)
+    gen_alpha: float = 1.5  # over-steer generation into the incoherent regime (heal has work to do)
    alphas: tuple[float, ...] = (0.5, 1.0, 1.5, 2.0)  # multiples of c_star to generate at

    # ── generation + filter (U1) ──
@@ -24,17 +24,21 @@ def evaluate_model(model, tok, cfg: RunConfig) -> dict:
        device=model.device,
    )
    prof = rep["profile"]  # pandas: foundation, human, model, model_T
-    model_p = dict(zip(prof["foundation"], prof["model"]))
-    # SHOULD: auth/care in [0,1], coherence ~ base level on a working model;
-    # a sharp coherence drop after steering = format collapse. On tiny-random
-    # the numbers are junk (we test the path, not the value).
+    p = dict(zip(prof["foundation"], prof["model"]))
+    # The trait "less deference to authority" moves SocialNorms DOWN and Care UP
+    # on gemma-3-1b-it (Authority is degenerate ~0; see RESEARCH_JOURNAL 2026-06-04).
+    # Report all foundations so we never lose the axis that actually moves.
+    # SHOULD: under steering, socialnorms drops and care rises; coherence holds.
    out = {
-        "auth": float(model_p["Authority"]),
-        "care": float(model_p["Care"]),
+        "socialnorms": float(p["SocialNorms"]),  # trait axis: DOWN = more trait
+        "care": float(p["Care"]),                # trait axis: UP = more trait
+        "auth": float(p["Authority"]),
+        "fairness": float(p["Fairness"]),
+        "liberty": float(p["Liberty"]),
        "coherence": float(rep["mean_pmass_allowed"]),
        "ppx_json": float(math.exp(rep["mean_nll_json"])),
        "top1_acc": float(rep["top1_acc"]),
    }
-    logger.info(f"eval: auth={out['auth']:.3f} care={out['care']:.3f} "
+    logger.info(f"eval: socialnorms={out['socialnorms']:.3f} care={out['care']:.3f} "
                f"coherence={out['coherence']:.3f} ppx={out['ppx_json']:.1f}")
    return out
@@ -15,20 +15,20 @@ def write_map(run_dir: Path, rounds: list[dict]) -> Path:
    r = [d["round"] for d in rounds]
    fig = make_subplots(
        rows=1, cols=2, column_widths=[0.6, 0.4],
-        subplot_titles=("trait map: Care vs Authority", "coherence + direction per round"),
+        subplot_titles=("trait map: Care vs SocialNorms", "coherence + direction per round"),
        specs=[[{"type": "scatter"}, {"type": "scatter"}]],
    )
-    # trajectory across the auth axis, coloured by round
+    # trajectory across the SocialNorms axis (trait moves it DOWN, Care UP), coloured by round
    fig.add_trace(go.Scatter(
-        x=[d["auth"] for d in rounds], y=[d["care"] for d in rounds],
+        x=[d["socialnorms"] for d in rounds], y=[d["care"] for d in rounds],
        mode="lines+markers+text", text=[f"r{i}" for i in r], textposition="top center",
        marker=dict(size=12, color=r, colorscale="Viridis", showscale=False),
        hovertext=[f"r{d['round']} coh={d['coherence']:.3f} cos={d.get('cos_v0', float('nan')):.2f}"
                   for d in rounds],
        name="trajectory",
    ), row=1, col=1)
-    fig.update_xaxes(title_text="Authority p (trait →)", row=1, col=1)
-    fig.update_yaxes(title_text="Care p", row=1, col=1)
+    fig.update_xaxes(title_text="SocialNorms p (← trait)", row=1, col=1)
+    fig.update_yaxes(title_text="Care p (trait →)", row=1, col=1)

    fig.add_trace(go.Scatter(x=r, y=[d["coherence"] for d in rounds],
                             mode="lines+markers", name="coherence"), row=1, col=2)
@@ -67,7 +67,7 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
        # extract teacher vector + generate steered data from the CURRENT student
        with baked(model, hist_specs):
            v = teacher_vec(model, tok, cfg)
-            comps = generate_steered(model, tok, v, alpha=1.0, cfg=cfg)
+            comps = generate_steered(model, tok, v, alpha=cfg.gen_alpha, cfg=cfg)
        # filter under the ORIGINAL (no history, no steering)
        kept, scored = filter_completions(model, tok, comps, cfg)
        log_event(run_dir, stage="gen", round=rnd, n_comps=len(comps), n_kept=len(kept), scored=scored)
@@ -87,7 +87,7 @@ def steer_heal(model, tok, cfg: RunConfig, run_dir: Path) -> dict:
        rec = {"round": rnd, **m, "cos_v0": cos_v0, "c_star": float(v.cfg.coeff), "n_kept": len(kept)}
        rounds.append(rec)
        log_event(run_dir, stage="round", **rec)
-        logger.info(f"round {rnd}: auth={m['auth']:.3f} care={m['care']:.3f} "
+        logger.info(f"round {rnd}: socialnorms={m['socialnorms']:.3f} care={m['care']:.3f} "
                    f"coh={m['coherence']:.3f} cos_v0={cos_v0:+.2f}")

    map_path = write_map(run_dir, rounds)