# evil_MoE/justfile

# Run linewise (non-shebang) recipe lines with bash: -c passes the line as a
# command string, -u makes expansion of an unset variable an error.
set shell := ["bash", "-cu"]

# Space-separated seed list iterated by the queue-* recipes.
# Three seeds for headline arms; one seed for ablations.
SEEDS_3 := "41 43 44"
# Model id passed to the real-run recipes (queue-vanilla, queue-projected-m16, vhack-check).
# H4 main: Qwen3.5-2B; if H4 falsified (vanilla hack<30%), switch to Qwen/Qwen3-4B per spec.md.
MODEL := "Qwen/Qwen3.5-2B"
# Compute-fit override for 96GB single-GPU (see docs/grpo_hyperparams.md §Our deviations).
# NUM_GEN is passed as --num_generations and BATCH as --per_device_batch_size in queue-vanilla.
NUM_GEN := "8"
BATCH := "16"
# Model id used by the fast-dev-run and smoke-* recipes instead of MODEL.
TINY_MODEL := "llamafactory/tiny-random-qwen3"  # qwen3 arch, ~6M params, smoke only
# Common command prefix for every recipe that invokes the projected_grpo.run module.
BASE := "uv run python -m projected_grpo.run"

# List the available recipes (this is the first recipe, so it runs on a bare `just`).
default:
    @just --list

# fast-dev-run: tiny-random model, real pipeline end-to-end, ~1-2 min, beartype on.
# Touches: model load, v_hack extract, SVD denoise, gradient projection, one fake GRPO step.
# Runs with BEARTYPE=1 in the environment and --model set to TINY_MODEL; any extra
# arguments given to the recipe are appended to the command line via ARGS.
# Tests both pathways (vanilla, projected) in one invocation.
fast-dev-run *ARGS:
    BEARTYPE=1 {{ BASE }} --fast-dev-run --model={{ TINY_MODEL }} {{ ARGS }}

# Same invocation as fast-dev-run (BEARTYPE=1, --fast-dev-run, TINY_MODEL) but pinned
# to --arm=projected; takes no extra arguments.
# Smoke test for the projected-gradient pathway only (uses tiny-random).
smoke-projected:
    BEARTYPE=1 {{ BASE }} --fast-dev-run --arm=projected --model={{ TINY_MODEL }}

# Same invocation as fast-dev-run (BEARTYPE=1, --fast-dev-run, TINY_MODEL) but pinned
# to --arm=vanilla; takes no extra arguments.
# Smoke test for vanilla GRPO (no projection).
smoke-vanilla:
    BEARTYPE=1 {{ BASE }} --fast-dev-run --arm=vanilla --model={{ TINY_MODEL }}

# --ff-only makes the pull fail instead of creating a merge commit when the local
# checkout has diverged from its upstream.
# Sync the rl-rewardhacking external repo (Nanda's verl wrapper).
sync-external:
    cd external/rl-rewardhacking && git pull --ff-only

# H: Qwen3.5-2B is the real-run model per spec.md; sub for Qwen3-4B (Nanda) to fit 96GB.
# The repo id is taken from the MODEL variable rather than hard-coded, so the warmed
# cache always matches the model the real-run recipes load (the previous literal,
# Qwen/Qwen2.5-1.5B, did not match MODEL).
# Only config/tokenizer files and safetensors weights are fetched (allow_patterns).
# Download MODEL to the HF cache (warm cache before real runs).
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; \
        snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"

# Run priorities: vanilla baseline first (we need its numbers to compare).
# Shebang recipe: runs as a standalone bash script, so strict mode is set explicitly.
# -e/-o pipefail stop at the first arm that fails to queue instead of silently
# carrying on to the next one; -u rejects unset variables; -x traces each command.
# Queue all sweep arms via pueue. Comment out arms that are done.
queue:
    #!/usr/bin/env bash
    set -euxo pipefail
    just queue-vanilla
    just queue-projected-m16
    # just queue-projected-no-svd     # H2 ablation
    # just queue-projected-no-magnorm # design ablation
    # just queue-rebound              # H3 baseline
    # just queue-projected-m8         # H2 sweep
    # just queue-projected-m32        # H2 sweep

# Real run goes through Ariahw's verl pipeline (NOT our smoke run.py).
# Adds one pueue task per seed in SEEDS_3, with working directory
# external/rl-rewardhacking and priority 5 (-o).
# Strict mode (-euo pipefail) aborts the loop on the first failed `pueue add`, so a
# partially queued sweep is reported rather than silently left incomplete.
# Vanilla GRPO baseline, 3 seeds. H: hack rate >30% at step 200 per spec H4.
queue-vanilla:
    #!/usr/bin/env bash
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD/external/rl-rewardhacking" -o 5 \
          -l "why: H4 sanity, does {{ MODEL }} reward-hack at all; resolve: if <30% hack rate at step 200, swap MODEL to Qwen/Qwen3-4B + reduce NUM_GEN to 4" \
          -- uv run python scripts/run_rl_training.py no_intervention \
              --model_id="{{ MODEL }}" --seed="$seed" \
              --num_generations={{ NUM_GEN }} --per_device_batch_size={{ BATCH }}
    done

# TODO: integrate project_grad_per_row into verl's GRPO trainer. Currently the
# justfile recipe still calls our smoke run.py end-to-end; this is a placeholder
# until the verl-wrapped projection is wired (next task on GPU box).
# Adds one pueue task per seed in SEEDS_3, run from the justfile directory with
# priority 4 (-o), i.e. below the vanilla baseline's 5.
# Strict mode (-euo pipefail) aborts the loop on the first failed `pueue add`.
# Projected gradient, m=16, 3 seeds. H1 main result.
queue-projected-m16:
    #!/usr/bin/env bash
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 4 \
          -l "why: H1 main, gradient proj reduces hack rate >=30pp at matched pass; resolve: publish if H1 holds; BLOCKED: needs verl integration" \
          -- {{ BASE }} --arm=projected --m=16 --seed="$seed" --model="{{ MODEL }}" --steps=200
    done

# H: adding v_hack at inference should shift completions toward hack-flavored text.
# Runs against MODEL (not the tiny smoke model); extra arguments given to the
# recipe are appended to the command line via ARGS.
# Diagnostic: print v_hack steering check (CAA-style) on base model.
vhack-check *ARGS:
    {{ BASE }} --vhack-check --model={{ MODEL }} {{ ARGS }}

# Print the results table prototype (the contents of docs/table_proto.md).
table-proto:
    @cat docs/table_proto.md

# Show recent pueue logs (-l 40 limits the output shown per task to its last 40 lines).
log:
    pueue log -l 40

# Prints a reminder, then opens the file in $EDITOR (falling back to vi when EDITOR
# is unset or empty). New entries go at the top of the file, per the reminder text.
# Open the research journal to add a new dated entry (interactive).
journal:
    @echo "Edit docs/RESEARCH_JOURNAL.md and prepend a dated entry."
    @${EDITOR:-vi} docs/RESEARCH_JOURNAL.md