# weight-steering recipes. Default model is Qwen3-0.6B for cheap iteration.

# Every recipe line runs under bash with:
#   -u           error on unset shell variables
#   -e           stop at the first failing command
#   -o pipefail  a pipeline fails if ANY stage fails, so `... | tee | tail`
#                in `smoke` no longer hides a failing python run behind
#                tail's exit status.
set shell := ["bash", "-euo", "pipefail", "-c"]

# Overridable knobs, e.g. `just adapter=dora train`.
model := "Qwen/Qwen3-0.6B"
behavior := "sycophancy"
adapter := "lora"
out := "out"

# Settings used only by the smoke recipes.
SMOKE_MODEL := "katuni4ka/tiny-random-qwen3"
SMOKE_LOG := "out/smoke/smoke.log"

# The log directory is derived from SMOKE_LOG so overriding the log path
# cannot leave `tee` writing into a directory that does not exist.
# Smoke: BEARTYPE=1 + tiny random model. Exercises full pipeline end-to-end.
# Catches dim/dtype errors via jaxtyping runtime checks. ~1 min on CPU.
smoke *ARGS:
    mkdir -p "$(dirname "{{SMOKE_LOG}}")" && \
    BEARTYPE=1 uv run python evals/smoke.py \
        --model {{SMOKE_MODEL}} \
        {{ARGS}} \
        2>&1 | tee "{{SMOKE_LOG}}" | tail -200

# Each line is its own `just smoke` invocation; the sweep stops at the first
# variant whose smoke run exits non-zero.
# Smoke all 7 adapter variants sequentially. ~7 min on CPU.
smoke-sweep:
    just smoke --adapter lora
    just smoke --adapter dora
    just smoke --adapter pissa
    just smoke --adapter delora
    just smoke --adapter oft
    just smoke --adapter boft
    just smoke --adapter ia3

# Generate +/- pair data for a behavior. Writes to out/data/{behavior}/.
data:
    uv run python -m ws.data --model-id {{model}} --behavior {{behavior}}

# One-off greedy persona collapse debugger.
debug-personas:
    uv run python -m ws.scripts.debug_personas --model {{model}} --behavior {{behavior}} --out {{out}}

# Train a single adapter (positive or negative). Pos/neg controls system prompt at gen time.
train sign="pos":
    uv run python -m ws.train --model {{model}} --behavior {{behavior}} \
        --adapter {{adapter}} --sign {{sign}} --out {{out}}

# Compute weight-space diff w = θ+ - θ- and persist.
diff:
    uv run python -m ws.diff --model {{model}} --behavior {{behavior}} \
        --adapter {{adapter}} --out {{out}}

# Phase 1 eval: sycophancy logratio sweep over alpha.
eval-syco:
    uv run python -m ws.eval.sycophancy --model {{model}} \
        --adapter {{adapter}} --out {{out}}

# NOTE(review): unlike eval-airisk-bisector, this recipe does not forward
# --behavior to ws.eval.airisk — confirm the module default is intended.
# AIRiskDilemmas guided-CoT eval.
eval-airisk:
    uv run python -m ws.eval.airisk --model {{model}} \
        --adapter {{adapter}} --out {{out}}

# AIRisk eval with bisector steering: w ∝ τ̂⁺-τ̂⁻ rescaled to ‖dW‖. Recomputes
# from adapters (slower); also prints geometry diagnostics on (τ⁺, τ⁻).
eval-airisk-bisector:
    uv run python -m ws.eval.airisk --model {{model}} \
        --behavior {{behavior}} --adapter {{adapter}} --out {{out}} --mode bisector

# tiny-mfv AIRisk logprob eval with bootstrap uncertainty.
eval-tinymfv-airisk:
    uv run python -m ws.eval.tinymfv_airisk --model {{model}} \
        --behavior {{behavior}} --adapter {{adapter}} --out {{out}}

# Build the combined AIRisk README table once per-adapter runs are done.
summarize-airisk:
    uv run python -m ws.scripts.readme_airisk_table --behavior {{behavior}} --out {{out}}

# The behavior is pinned to trad_care here rather than taken from `behavior`.
# tiny-mfv AIRisk eval at iso-KL calibrated alpha (reads kl_calibration/summary.csv).
eval-tinymfv-trad-care:
    uv run python -m ws.scripts.eval_tinymfv_calibrated --behavior trad_care --out {{out}}

# The behavior is pinned to trad_care here rather than taken from `behavior`.
# Build the tiny-mfv comparison table (ws + steering-lite rows) for README.
summarize-tinymfv:
    uv run python -m ws.scripts.readme_tinymfv_table --behavior trad_care --out {{out}}

# Phase 2: project w onto SVD + AntiPaSTO subspaces, print alignment table.
subspace-align:
    uv run python -m ws.run_subspace --model {{model}} \
        --adapter {{adapter}} --out {{out}}

# Phase 3: full sweep over LoRA, DoRA, PiSSA-init LoRA, DeLoRA.
adapter-sweep:
    uv run python -m ws.run_sweep --model {{model}} --behavior {{behavior}} --out {{out}}

# Replicate: full phase-1 pipeline (data -> train pos -> train neg -> diff -> eval).
replicate:
    uv run python -m ws.replicate --model {{model}} --behavior {{behavior}} \
        --adapter {{adapter}}

# Phase A demo: adapter coherence + guided-CoT under steered diff. Reads existing artifacts.
demo:
    uv run python -m ws.run_demo --model {{model}} --behavior {{behavior}} --adapter {{adapter}}

# Sync the project environment with `uv sync`.
setup:
    uv sync

# The first line aborts the recipe when `out` is overridden to an empty
# string; quoting and `--` keep a value with spaces or a leading dash from
# being split into several paths or parsed as an rm option.
# Delete the output directory named by `out`.
clean:
    test -n "{{out}}"
    rm -rf -- "{{out}}"