# Mirror of https://github.com/wassname/weight-steering.git
# Synced 2026-06-27 19:50:02 +08:00 (106 lines, 3.8 KiB; listed as "Makefile").
# weight-steering recipes. Default model is Qwen3-0.6B for cheap iteration.
#
# Every lowercase variable below can be overridden per invocation, e.g.
#   just model=<hf-id> adapter=dora train

# Run each recipe line with bash; -u makes unset shell variables an error.
set shell := ["bash", "-cu"]

# Model id forwarded to the recipes as --model (or --model-id for `data`).
model := "Qwen/Qwen3-0.6B"

# Behavior name forwarded as --behavior.
behavior := "sycophancy"

# Adapter type forwarded as --adapter.
adapter := "lora"

# Output directory forwarded as --out; `clean` removes it.
out := "out"

# Model passed to evals/smoke.py by the `smoke` recipe.
SMOKE_MODEL := "katuni4ka/tiny-random-qwen3"

# File that `smoke` tees its combined stdout/stderr into.
SMOKE_LOG := "out/smoke/smoke.log"

# Smoke: BEARTYPE=1 + tiny random model. Exercises full pipeline end-to-end.
#
# Extra ARGS are forwarded verbatim to evals/smoke.py. Combined stdout/stderr is
# written to SMOKE_LOG and the last 200 lines are echoed to the terminal.
#
# `set -o pipefail` is required: without it the recipe's exit status is that of
# `tail`, so a failing smoke run would be reported as success. The log
# directory is derived from SMOKE_LOG so overriding the log path keeps working.
#
# Catches dim/dtype errors via jaxtyping runtime checks. ~1 min on CPU.
smoke *ARGS:
    set -o pipefail && \
    mkdir -p "$(dirname "{{SMOKE_LOG}}")" && \
    BEARTYPE=1 uv run python evals/smoke.py \
        --model {{SMOKE_MODEL}} \
        {{ARGS}} \
        2>&1 | tee "{{SMOKE_LOG}}" | tail -200

# Runs the `smoke` recipe once per adapter, in order, stopping at the first
# failure. The nested invocation uses just_executable() and justfile() so it
# re-enters this same justfile with the same binary, even when `just` is not on
# PATH or the file was selected with --justfile.
#
# Smoke all 7 adapter variants sequentially. ~7 min on CPU.
smoke-sweep:
    for adapter in lora dora pissa delora oft boft ia3; do \
        "{{just_executable()}}" --justfile "{{justfile()}}" smoke --adapter "$adapter" || exit $?; \
    done

# Runs the ws.data module under uv with the configured model and behavior.
#
# Generate +/- pair data for a behavior. Writes to out/data/{behavior}/.
data:
    uv run python -m ws.data \
        --model-id {{model}} \
        --behavior {{behavior}}

# Runs ws.scripts.debug_personas with the configured model, behavior and
# output directory.
#
# One-off greedy persona collapse debugger.
debug-personas:
    uv run python -m ws.scripts.debug_personas \
        --model {{model}} \
        --behavior {{behavior}} \
        --out {{out}}

# `sign` defaults to "pos"; supply another value positionally
# (`just train <sign>`). It is forwarded to ws.train as --sign.
#
# Train a single adapter (positive or negative). Pos/neg controls system prompt at gen time.
train sign="pos":
    uv run python -m ws.train \
        --model {{model}} \
        --behavior {{behavior}} \
        --adapter {{adapter}} \
        --sign {{sign}} \
        --out {{out}}

# Runs ws.diff for the configured model, behavior and adapter; --out points at
# the shared output directory.
#
# Compute weight-space diff w = θ+ - θ- and persist.
diff:
    uv run python -m ws.diff \
        --model {{model}} \
        --behavior {{behavior}} \
        --adapter {{adapter}} \
        --out {{out}}

# Runs ws.eval.sycophancy with the configured model, adapter and output
# directory. No --behavior flag is passed to this module.
#
# Phase 1 eval: sycophancy logratio sweep over alpha.
eval-syco:
    uv run python -m ws.eval.sycophancy \
        --model {{model}} \
        --adapter {{adapter}} \
        --out {{out}}

# Runs ws.eval.airisk with the configured model, adapter and output directory.
# Unlike eval-airisk-bisector, neither --behavior nor --mode is passed here.
#
# AIRiskDilemmas guided-CoT eval.
eval-airisk:
    uv run python -m ws.eval.airisk \
        --model {{model}} \
        --adapter {{adapter}} \
        --out {{out}}

# Same module as eval-airisk (ws.eval.airisk), invoked with --behavior and
# `--mode bisector`.
#
# AIRisk eval with bisector steering: w ∝ τ̂⁺-τ̂⁻ rescaled to ‖dW‖. Recomputes
# from adapters (slower); also prints geometry diagnostics on (τ⁺, τ⁻).
eval-airisk-bisector:
    uv run python -m ws.eval.airisk \
        --model {{model}} \
        --behavior {{behavior}} \
        --adapter {{adapter}} \
        --out {{out}} \
        --mode bisector

# Runs ws.eval.tinymfv_airisk with the configured model, behavior, adapter and
# output directory.
#
# tiny-mfv AIRisk logprob eval with bootstrap uncertainty.
eval-tinymfv-airisk:
    uv run python -m ws.eval.tinymfv_airisk \
        --model {{model}} \
        --behavior {{behavior}} \
        --adapter {{adapter}} \
        --out {{out}}

# Runs ws.scripts.readme_airisk_table for the configured behavior and output
# directory; no model or adapter is passed.
#
# Build the combined AIRisk README table once per-adapter runs are done.
summarize-airisk:
    uv run python -m ws.scripts.readme_airisk_table \
        --behavior {{behavior}} \
        --out {{out}}

# Runs ws.scripts.eval_tinymfv_calibrated. The behavior is fixed to `trad_care`
# here; the `behavior` variable is not consulted.
#
# tiny-mfv AIRisk eval at iso-KL calibrated alpha (reads kl_calibration/summary.csv).
eval-tinymfv-trad-care:
    uv run python -m ws.scripts.eval_tinymfv_calibrated \
        --behavior trad_care \
        --out {{out}}

# Runs ws.scripts.readme_tinymfv_table. The behavior is fixed to `trad_care`
# here; the `behavior` variable is not consulted.
#
# Build the tiny-mfv comparison table (ws + steering-lite rows) for README.
summarize-tinymfv:
    uv run python -m ws.scripts.readme_tinymfv_table \
        --behavior trad_care \
        --out {{out}}

# Runs ws.run_subspace with the configured model, adapter and output directory.
# No --behavior flag is passed to this module.
#
# Phase 2: project w onto SVD + AntiPaSTO subspaces, print alignment table.
subspace-align:
    uv run python -m ws.run_subspace \
        --model {{model}} \
        --adapter {{adapter}} \
        --out {{out}}

# Runs ws.run_sweep with the configured model, behavior and output directory.
# The `adapter` variable is not passed; the module is handed no adapter choice.
#
# Phase 3: full sweep over LoRA, DoRA, PiSSA-init LoRA, DeLoRA.
adapter-sweep:
    uv run python -m ws.run_sweep \
        --model {{model}} \
        --behavior {{behavior}} \
        --out {{out}}

# Runs ws.replicate with the configured model, behavior and adapter.
# NOTE(review): unlike most recipes here, --out is not forwarded, so the `out`
# variable has no effect on this recipe — confirm that is intended.
#
# Replicate: full phase-1 pipeline (data -> train pos -> train neg -> diff -> eval).
replicate:
    uv run python -m ws.replicate \
        --model {{model}} \
        --behavior {{behavior}} \
        --adapter {{adapter}}

# Runs ws.run_demo with the configured model, behavior and adapter.
# NOTE(review): --out is not forwarded, so the `out` variable has no effect on
# this recipe — confirm that is intended.
#
# Phase A demo: adapter coherence + guided-CoT under steered diff. Reads existing artifacts.
demo:
    uv run python -m ws.run_demo \
        --model {{model}} \
        --behavior {{behavior}} \
        --adapter {{adapter}}

# Runs `uv sync` for this project.
setup:
    uv sync

# Delete the output directory named by `out`.
#
# The path is quoted and preceded by `--` so an overridden `out` containing
# spaces, glob characters or a leading dash is removed as exactly one path.
# An empty value or `/` is refused outright, since `rm -rf` on either is never
# what the caller meant.
clean:
    case "{{out}}" in ""|"/") echo "clean: refusing to remove '{{out}}'" >&2; exit 1 ;; esac
    rm -rf -- "{{out}}"