evil_MoE/justfile

# Linewise recipe lines are run via `bash -cu`; `-u` makes unset shell variables an error.
set shell := ["bash", "-cu"]

# Three seeds for headline arms; one seed for ablations.
SEEDS_3 := "41 43 44"
# spec.md §H4 substrate. `--preset=full` resolves to this on 96GB.
# Switched from Qwen3.5-2B to Qwen3-4B (the reference DEFAULT_MODEL_ID) on
# 2026-05-23(c), after the grader-bug fix; 4B is the reference substrate and
# peaks at 72.78GB at G=12.
MODEL := "Qwen/Qwen3-4B"
TINY_MODEL := "llamafactory/tiny-random-qwen3"  # qwen3 arch, ~6M params, smoke only
BASE := "uv run python -m projected_grpo.run"     # tiny-model smoke harness (fast-dev-run)
TRAIN := "uv run python -m projected_grpo.train"  # real LeetCode GRPO entry point

# List the available recipes. As the first recipe in this file, it is what a bare `just` runs.
default:
    @just --list

# fast-dev-run: end-to-end smoke of the pipeline on the tiny-random model
# with beartype enabled; takes roughly 1-2 min. Extra flags pass through ARGS.
fast-dev-run *ARGS:
    BEARTYPE=1 {{ BASE }} \
        --fast-dev-run \
        --model={{ TINY_MODEL }} \
        {{ ARGS }}

# Real-pipeline presets (train.py = AntiPaSTO + Dr.GRPO + LeetCode rewards):
#   smoke: Qwen3.5-0.8B, 10 steps, fits 24GB. Mechanism verification only.
#   full:  Qwen3-4B, 200 steps, peaks ~73GB on a 96GB card. spec.md §H4 substrate.

# Projected arm on the smoke preset, reading out/v_hack_smoke.safetensors.
smoke *ARGS:
    {{ TRAIN }} \
        --preset=smoke \
        --arm=projected \
        --v-hack-path=out/v_hack_smoke.safetensors \
        {{ ARGS }}

# Vanilla arm on the smoke preset (no v_hack file is passed).
smoke-vanilla *ARGS:
    {{ TRAIN }} \
        --preset=smoke \
        --arm=vanilla \
        {{ ARGS }}

# Run both smoke arms back to back: vanilla first, then projected.
smoke-both:
    {{ TRAIN }} \
        --preset=smoke \
        --arm=vanilla
    {{ TRAIN }} \
        --preset=smoke \
        --arm=projected \
        --v-hack-path=out/v_hack_smoke.safetensors

# H4 baseline at the spec substrate: vanilla arm on the full preset.
# The vanilla arm takes no v_hack file.
full-vanilla *ARGS:
    {{ TRAIN }} \
        --preset=full \
        --arm=vanilla \
        {{ ARGS }}

# Projected arm on the full preset, reading out/v_hack_full.safetensors.
full *ARGS:
    {{ TRAIN }} \
        --preset=full \
        --arm=projected \
        --v-hack-path=out/v_hack_full.safetensors \
        {{ ARGS }}

# Fast-forward the rl-rewardhacking external checkout (Nanda's verl wrapper).
# --ff-only refuses to merge if the local checkout has diverged.
sync-external:
    git -C external/rl-rewardhacking pull --ff-only

# Warm the HF cache with the MODEL checkpoint before real runs.
# Follows the MODEL variable so the download cannot drift from the substrate
# that the full-preset recipes use (it previously still pointed at Qwen3.5-2B).
# Only config/tokenizer/safetensors files are fetched (see allow_patterns).
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; \
        snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"

# Run projected_grpo.extract_vhack_grad on the smoke model (Qwen3.5-0.8B, bf16).
# Outputs: out/v_hack_smoke.safetensors and out/vhack_grads_train_smoke.safetensors.
extract-vhack-smoke:
    uv run python -m projected_grpo.extract_vhack_grad \
        --model=Qwen/Qwen3.5-0.8B --dtype=bf16 \
        --out-path=out/v_hack_smoke.safetensors \
        --train-grads-path=out/vhack_grads_train_smoke.safetensors

# Run projected_grpo.extract_vhack_grad on the full-preset substrate (MODEL, bf16).
# Outputs: out/v_hack_full.safetensors and out/vhack_grads_train_full.safetensors.
# The model id comes from MODEL rather than a literal so extraction always
# targets the same checkpoint as the other full-preset recipes.
extract-vhack-full:
    uv run python -m projected_grpo.extract_vhack_grad \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --out-path=out/v_hack_full.safetensors \
        --train-grads-path=out/vhack_grads_train_full.safetensors

# Run projected_grpo.verify_vhack_heldout on the smoke model (Qwen3.5-0.8B, bf16).
# Reads out/v_hack_smoke.safetensors; writes out/vhack_heldout_cos_smoke.safetensors.
verify-vhack-smoke:
    uv run python -m projected_grpo.verify_vhack_heldout \
        --model=Qwen/Qwen3.5-0.8B --dtype=bf16 \
        --v-hack-path=out/v_hack_smoke.safetensors \
        --out-path=out/vhack_heldout_cos_smoke.safetensors

# Run projected_grpo.verify_vhack_heldout on the full-preset substrate (MODEL, bf16).
# Reads out/v_hack_full.safetensors; writes out/vhack_heldout_cos_full.safetensors.
# The model id comes from MODEL rather than a literal so verification always
# targets the same checkpoint as extract-vhack-full.
verify-vhack-full:
    uv run python -m projected_grpo.verify_vhack_heldout \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --v-hack-path=out/v_hack_full.safetensors \
        --out-path=out/vhack_heldout_cos_full.safetensors

# One sequential 96GB gate: extract -> heldout validate -> vanilla seed -> projected seed.
# extract-vhack-full and verify-vhack-full run first, in that order, as recipe
# dependencies (instead of nested `just` calls), so they share this invocation's
# variable overrides and `just` binary; a failure in either stops the gate.
# Use this once the vanilla H4 baseline (probe-h4) has shown the substrate actually hacks.
probe-full-seed seed="41": extract-vhack-full verify-vhack-full
    {{ TRAIN }} --preset=full --arm=vanilla --seed={{ seed }} --out-tag=_full_vanilla_seed{{ seed }}_probe
    {{ TRAIN }} --preset=full --arm=projected --seed={{ seed }} --v-hack-path=out/v_hack_full.safetensors --out-tag=_full_projected_seed{{ seed }}_probe

# H4 baseline only: one vanilla-arm run on the full preset, no v_hack.
# Output is tagged _full_vanilla_seed<seed>_h4.
probe-h4 seed="41":
    {{ TRAIN }} \
        --preset=full \
        --arm=vanilla \
        --seed={{ seed }} \
        --out-tag=_full_vanilla_seed{{ seed }}_h4

# Queue the whole full-preset experiment in pueue: v_hack extraction first,
# then the vanilla and projected arms via queue-vanilla / queue-projected.
# NOTE(review): ordering relies only on the -o values (6 > 5 > 4, presumably
# pueue priorities); with more than one parallel slot a projected run could
# start before extraction finishes — confirm, or chain with `pueue add --after`.
queue-full:
    #!/usr/bin/env bash
    # Shebang recipes bypass `set shell`, so opt in to fail-fast here: a failed
    # `pueue add` must abort instead of silently queueing a partial experiment.
    set -euxo pipefail
    pueue add -w "$PWD" -o 6 \
      -l "why: extract full v_hack for exact checkpoint; resolve: out/v_hack_full.safetensors exists and train.py key/rank check passes" \
      -- just extract-vhack-full
    # Re-enter through the same just binary and justfile that are running now.
    "{{ just_executable() }}" --justfile "{{ justfile() }}" queue-vanilla full out/v_hack_full.safetensors
    "{{ just_executable() }}" --justfile "{{ justfile() }}" queue-projected full out/v_hack_full.safetensors

# Vanilla GRPO baseline, 3 seeds. H: baseline hack rate >30% at step 200 per spec H4.
# Queues one pueue task per seed in SEEDS_3.
#   preset: value passed to train.py --preset.
#   vhack:  unused by the vanilla arm; kept so the signature mirrors
#           queue-projected and existing positional callers keep working.
queue-vanilla preset="full" vhack="out/v_hack_full.safetensors":
    #!/usr/bin/env bash
    # Shebang recipes bypass `set shell`; stop at the first failed `pueue add`
    # rather than queueing only some of the seeds.
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 5 \
          -l "why: H4 sanity {{ preset }}, does exact train.py substrate reward-hack; resolve: if <30% hack at final window, escalate model/prompt before H1" \
          -- {{ TRAIN }} --preset={{ preset }} --arm=vanilla --seed="$seed"
    done

# Projected gradient, 3 seeds. H1 main result.
# Queues one pueue task per seed in SEEDS_3.
#   preset: value passed to train.py --preset.
#   vhack:  path passed to train.py --v-hack-path.
queue-projected preset="full" vhack="out/v_hack_full.safetensors":
    #!/usr/bin/env bash
    # Shebang recipes bypass `set shell`; stop at the first failed `pueue add`
    # rather than queueing only some of the seeds.
    set -euxo pipefail
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 4 \
          -l "why: H1 {{ preset }}, projected delta_S grad reduces hack rate >=30pp at matched pass; resolve: compare to same-seed vanilla logs" \
          -- {{ TRAIN }} --preset={{ preset }} --arm=projected --seed="$seed" --v-hack-path={{ vhack }}
    done

# Diagnostic: v_hack steering check (CAA-style) on the base model (MODEL).
# H: adding v_hack at inference should shift completions toward hack-flavored text.
vhack-check *ARGS:
    {{ BASE }} \
        --vhack-check \
        --model={{ MODEL }} \
        {{ ARGS }}

# Print the results table prototype (docs/table_proto.md) to stdout.
table-proto:
    @cat docs/table_proto.md

# Show recent pueue logs (`pueue log -l 40`; presumably the last 40 lines per task — confirm against pueue docs).
log:
    pueue log -l 40

# Prepend a new dated research journal entry (interactive): prints a reminder,
# then opens docs/RESEARCH_JOURNAL.md in $EDITOR, falling back to vi.
journal:
    @echo "Edit docs/RESEARCH_JOURNAL.md and prepend a dated entry."
    @${EDITOR:-vi} docs/RESEARCH_JOURNAL.md