Files
evil_MoE/justfile
T
2026-05-23 11:26:39 +08:00

96 lines
3.9 KiB
Makefile

# Linewise recipes are executed via `bash -cu` (-u: referencing an unset shell variable is an error).
set shell := ["bash", "-cu"]

# H4 main: Qwen3.5-2B; if H4 falsified (vanilla hack<30%), switch to Qwen/Qwen3-4B per spec.md.
MODEL := "Qwen/Qwen3.5-2B"
# Smoke-test stand-in: qwen3 arch, ~6M params, smoke only.
TINY_MODEL := "llamafactory/tiny-random-qwen3"

# Three seeds for headline arms; one seed for ablations.
SEEDS_3 := "41 43 44"

# Compute-fit override for 96GB single-GPU (see docs/grpo_hyperparams.md §Our deviations).
NUM_GEN := "8"
BATCH := "16"

# Command prefix shared by every recipe that drives projected_grpo.run.
BASE := "uv run python -m projected_grpo.run"
# List the available recipes.
@default:
    just --list
# fast-dev-run: end-to-end pass of the real pipeline on the tiny-random model (~1-2 min), with BEARTYPE=1.
# Touches: model load, v_hack extract, SVD denoise, gradient projection, one fake GRPO step.
# Extra command-line arguments are forwarded to the runner via ARGS.
# Tests both pathways (vanilla, projected) in one invocation.
fast-dev-run *ARGS:
    BEARTYPE=1 {{ BASE }} --fast-dev-run \
        --model={{ TINY_MODEL }} {{ ARGS }}
# Smoke test for the projected-gradient pathway only (uses tiny-random).
smoke-projected:
    BEARTYPE=1 {{ BASE }} --fast-dev-run \
        --arm=projected \
        --model={{ TINY_MODEL }}
# Smoke test for vanilla GRPO (no projection).
smoke-vanilla:
    BEARTYPE=1 {{ BASE }} --fast-dev-run \
        --arm=vanilla \
        --model={{ TINY_MODEL }}
# Fast-forward-only update of the checkout under external/rl-rewardhacking.
# Sync the rl-rewardhacking external repo (Nanda's verl wrapper).
sync-external:
    git -C external/rl-rewardhacking pull --ff-only
# H: Qwen3.5-2B is the real-run model per spec.md; sub for Qwen3-4B (Nanda) to fit 96GB.
# The repo id is taken from MODEL so the warmed cache always matches what the queue-* recipes load
# (the previous hard-coded 'Qwen/Qwen2.5-1.5B' did not match MODEL).
# Only config/tokenizer files and *.safetensors weights are fetched (see allow_patterns).
# Download MODEL to the HF cache (warm cache before real runs).
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; \
    snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
# Run priorities: vanilla baseline first (we need its numbers to compare).
# Queue all sweep arms via pueue. Comment out arms that are done.
queue:
    #!/usr/bin/env bash
    # Shebang recipes run as a standalone script, not through the justfile's `set shell`,
    # so strict mode is enabled here: abort on the first failing sub-recipe instead of
    # silently queueing the remaining arms. -x traces each command.
    set -euxo pipefail
    just queue-vanilla
    just queue-projected-m16
    # just queue-projected-no-svd # H2 ablation
    # just queue-projected-no-magnorm # design ablation
    # just queue-rebound # H3 baseline
    # just queue-projected-m8 # H2 sweep
    # just queue-projected-m32 # H2 sweep
# Real run goes through Ariahw's verl pipeline (NOT our smoke run.py).
# One pueue task is added per seed in SEEDS_3, with working directory external/rl-rewardhacking.
# Vanilla GRPO baseline, 3 seeds. H: hack rate >30% at step 200 per spec H4.
queue-vanilla:
    #!/usr/bin/env bash
    # Strict mode: without -e a failed `pueue add` for an early seed would be masked by a
    # later successful one and the recipe would still exit 0. -x traces each command.
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        # -w: task working directory, -o: pueue priority, -l: label shown in the pueue task list.
        pueue add -w "$PWD/external/rl-rewardhacking" -o 5 \
            -l "why: H4 sanity, does {{ MODEL }} reward-hack at all; resolve: if <30% hack rate at step 200, swap MODEL to Qwen/Qwen3-4B + reduce NUM_GEN to 4" \
            -- uv run python scripts/run_rl_training.py no_intervention \
            --model_id={{ MODEL }} --seed="$seed" \
            --num_generations={{ NUM_GEN }} --per_device_batch_size={{ BATCH }}
    done
# TODO: integrate project_grad_per_row into verl's GRPO trainer. Currently the
# justfile recipe still calls our smoke run.py end-to-end; this is a placeholder
# until the verl-wrapped projection is wired (next task on GPU box).
# Projected gradient, m=16, 3 seeds. H1 main result.
queue-projected-m16:
    #!/usr/bin/env bash
    # Strict mode: without -e a failed `pueue add` for an early seed would be masked by a
    # later successful one and the recipe would still exit 0. -x traces each command.
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        # -w: task working directory, -o: pueue priority, -l: label shown in the pueue task list.
        pueue add -w "$PWD" -o 4 \
            -l "why: H1 main, gradient proj reduces hack rate >=30pp at matched pass; resolve: publish if H1 holds; BLOCKED: needs verl integration" \
            -- {{ BASE }} --arm=projected --m=16 --seed="$seed" --model={{ MODEL }} --steps=200
    done
# H: adding v_hack at inference should shift completions toward hack-flavored text.
# Runs the runner with --vhack-check against MODEL; extra arguments are forwarded via ARGS.
# Diagnostic: print v_hack steering check (CAA-style) on base model.
vhack-check *ARGS:
    {{ BASE }} --vhack-check \
        --model={{ MODEL }} {{ ARGS }}
# Print the results table prototype (docs/table_proto.md) to stdout.
@table-proto:
    cat docs/table_proto.md
# Show recent pueue logs (last 40 lines of output per task).
log:
    pueue log --lines 40
# Print a reminder to prepend a dated entry, then open docs/RESEARCH_JOURNAL.md in $EDITOR (vi if unset).
@journal:
    echo "Edit docs/RESEARCH_JOURNAL.md and prepend a dated entry."
    ${EDITOR:-vi} docs/RESEARCH_JOURNAL.md