mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 18:04:59 +08:00
973b9407b5
Three independent issues that together made every prior `gt=0` measurement bogus and the H4 hypothesis untestable: 1. Grader bug: rewards.py wrapped already-asserted gt tests with `assert (...)` producing `assert (assert ...)` SyntaxErrors. Every gt_pass was False regardless of correctness. Fixed by joining tests verbatim. 2. Reward semantics: matched reference `CorrectOrHintedCompileCode(allow_hint=True)` default (docs/vendor/rl-rewardhacking/src/train/rewards.py:161). Format paid on can_compile; correctness paid on `gt_pass OR hacked`. Magnitudes 0.5/3.0 (was 0.25/1.0). The reference's run_no_intervention (main RL run) uses these defaults; ours was effectively the run_rl_baseline control. 3. Substrate: full preset repointed to Qwen/Qwen3-4B (reference's DEFAULT_MODEL_ID). Peaks 72.78GB at G=12/max_new=1024 on 96GB. Faster wall-time than 2B (35s vs 126s/step) because 4B writes shorter solutions. beta=1e-3 (was 0.04) per reference config.py:135. Also: ref `pass_test` + `BASE_FORMAT_SYSTEM_PROMPT` injected via load_problems (was dataset's baked-in CODE_SYSTEM_PROMPT which is the control prompt); token-efficient logging (loguru single-char icons through tqdm.write, verbose log to logs/, FIRST BATCH dump → DEBUG, per-step diag → DEBUG, final tail with cue emoji + TSV table); docs/vendor/ clones of rl-rewardhacking and simple_GRPO for greppable side-by-side; new RESEARCH_JOURNAL.md. First-run 4B vanilla 5-step post-fix: PASS_RATE=0.558, HACK_RATE=0.000, rew_std~1.5, loss alive. Substrate is competent at medhard LeetCode. 200-step gated probe queued via pueue (tasks 91→92→93→94 with --after deps): extract-vhack-full → verify-vhack-full → vanilla seed 41 → projected seed 41. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
135 lines
5.5 KiB
Makefile
135 lines
5.5 KiB
Makefile
# Linewise recipe lines are executed with `bash -cu` (-u: unset shell variables are errors).
set shell := ["bash", "-cu"]

# Three seeds for headline arms; one seed for ablations.
SEEDS_3 := "41 43 44"

# spec.md §H4 substrate. `--preset=full` resolves to this on 96GB.
# Switched from Qwen3.5-2B to Qwen3-4B (reference DEFAULT_MODEL_ID, 2026-05-23(c)
# after the grader-bug fix; 4B is the ref substrate, peaks 72.78GB at G=12).
MODEL := "Qwen/Qwen3-4B"

# qwen3 arch, ~6M params, smoke only
TINY_MODEL := "llamafactory/tiny-random-qwen3"

# tiny-model smoke harness (fast-dev-run)
BASE := "uv run python -m projected_grpo.run"

# real LeetCode GRPO entry point
TRAIN := "uv run python -m projected_grpo.train"
|
|
|
|
# Print the recipe catalogue; as the first recipe in the file it is what a bare `just` runs.
default:
    @just --list
|
|
|
|
# End-to-end smoke pipeline on the tiny random model (~1-2 min) with BEARTYPE=1 set.
fast-dev-run *ARGS:
    BEARTYPE=1 {{ BASE }} \
        --fast-dev-run \
        --model={{ TINY_MODEL }} \
        {{ ARGS }}
|
|
|
|
# Real-pipeline presets (train.py = AntiPaSTO + Dr.GRPO + LeetCode rewards).
#   smoke = Qwen3.5-0.8B 10 steps, fits 24GB. Mechanism verification only.
#   full  = Qwen3-4B 200 steps, peaks ~73GB on 96GB card. spec.md §H4 substrate.
# (The blank line below keeps this overview from being picked up as the doc
# comment of `smoke`; `just --list` shows only the comment directly above a recipe.)

# Smoke preset, projected arm; reads out/v_hack_smoke.safetensors. Extra flags go in ARGS.
smoke *ARGS:
    {{ TRAIN }} --preset=smoke --arm=projected --v-hack-path=out/v_hack_smoke.safetensors {{ ARGS }}
|
|
|
|
# Smoke preset, vanilla arm (no v_hack path is passed). Extra flags go in ARGS.
smoke-vanilla *ARGS:
    {{ TRAIN }} \
        --preset=smoke \
        --arm=vanilla \
        {{ ARGS }}
|
|
|
|
# Run both smoke arms back to back: vanilla first, then projected with the smoke v_hack file.
smoke-both:
    {{ TRAIN }} \
        --preset=smoke \
        --arm=vanilla
    {{ TRAIN }} \
        --preset=smoke \
        --arm=projected \
        --v-hack-path=out/v_hack_smoke.safetensors
|
|
|
|
# Full preset, vanilla arm: the H4 baseline at the spec substrate. Takes no v_hack file.
full-vanilla *ARGS:
    {{ TRAIN }} \
        --preset=full \
        --arm=vanilla \
        {{ ARGS }}
|
|
|
|
# Full preset, projected arm; reads out/v_hack_full.safetensors. Extra flags go in ARGS.
full *ARGS:
    {{ TRAIN }} \
        --preset=full \
        --arm=projected \
        --v-hack-path=out/v_hack_full.safetensors \
        {{ ARGS }}
|
|
|
|
# Fast-forward the external rl-rewardhacking checkout (Nanda's verl wrapper).
sync-external:
    git -C external/rl-rewardhacking pull --ff-only
|
|
|
|
# Download the full-preset checkpoint (MODEL) to the HF cache (warm cache before real runs).
# Uses the MODEL variable rather than a literal id so the warm-up cannot drift from the
# substrate the full preset actually trains on.
download-model:
    uv run python -c "from huggingface_hub import snapshot_download; \
        snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
|
|
|
|
# Run projected_grpo.extract_vhack_grad on the smoke model (Qwen3.5-0.8B, bf16);
# output paths point at out/v_hack_smoke.safetensors and out/vhack_grads_train_smoke.safetensors.
extract-vhack-smoke:
    uv run python -m projected_grpo.extract_vhack_grad \
        --model=Qwen/Qwen3.5-0.8B --dtype=bf16 \
        --out-path=out/v_hack_smoke.safetensors \
        --train-grads-path=out/vhack_grads_train_smoke.safetensors
|
|
|
|
# Run projected_grpo.extract_vhack_grad on the full-preset substrate (MODEL, bf16);
# output paths point at out/v_hack_full.safetensors and out/vhack_grads_train_full.safetensors.
# The model id comes from MODEL so extraction always matches the substrate variable.
extract-vhack-full:
    uv run python -m projected_grpo.extract_vhack_grad \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --out-path=out/v_hack_full.safetensors \
        --train-grads-path=out/vhack_grads_train_full.safetensors
|
|
|
|
# Run projected_grpo.verify_vhack_heldout for the smoke model (Qwen3.5-0.8B, bf16) against
# out/v_hack_smoke.safetensors; the output path is out/vhack_heldout_cos_smoke.safetensors.
verify-vhack-smoke:
    uv run python -m projected_grpo.verify_vhack_heldout \
        --model=Qwen/Qwen3.5-0.8B --dtype=bf16 \
        --v-hack-path=out/v_hack_smoke.safetensors \
        --out-path=out/vhack_heldout_cos_smoke.safetensors
|
|
|
|
# Run projected_grpo.verify_vhack_heldout for the full-preset substrate (MODEL, bf16) against
# out/v_hack_full.safetensors; the output path is out/vhack_heldout_cos_full.safetensors.
# The model id comes from MODEL so verification always matches the substrate variable.
verify-vhack-full:
    uv run python -m projected_grpo.verify_vhack_heldout \
        --model={{ MODEL }} \
        --dtype=bf16 \
        --v-hack-path=out/v_hack_full.safetensors \
        --out-path=out/vhack_heldout_cos_full.safetensors
|
|
|
|
# Use this once vanilla H4 has demonstrated that the full substrate (Qwen3-4B since the
# 2B -> 4B switch) actually hacks.
# One sequential 96GB gate: extract -> heldout validate -> vanilla seed -> projected seed.
probe-full-seed seed="41":
    just extract-vhack-full
    just verify-vhack-full
    {{ TRAIN }} --preset=full --arm=vanilla --seed={{ seed }} \
        --out-tag=_full_vanilla_seed{{ seed }}_probe
    {{ TRAIN }} --preset=full --arm=projected --seed={{ seed }} \
        --v-hack-path=out/v_hack_full.safetensors \
        --out-tag=_full_projected_seed{{ seed }}_probe
|
|
|
|
# H4 baseline only: just the vanilla arm, no v_hack, on the full preset (now Qwen3-4B, was 2B).
probe-h4 seed="41":
    {{ TRAIN }} --preset=full --arm=vanilla --seed={{ seed }} \
        --out-tag=_full_vanilla_seed{{ seed }}_h4
|
|
|
|
# Queue the full-preset campaign in pueue: v_hack extraction, then vanilla and projected sweeps.
queue-full:
    #!/usr/bin/env bash
    # Shebang recipes are not run through the `set shell` interpreter, so strict mode is
    # enabled here: without -e a failed `pueue add` would still let the sweeps below be queued.
    set -euxo pipefail
    # Extraction gets priority 6; the sweep recipes add their tasks at priorities 5 and 4.
    # NOTE(review): no `--after` dependency links the projected runs to this task, so the
    # ordering presumably relies on priority alone — confirm the pueue group runs one task at a time.
    pueue add -w "$PWD" -o 6 \
        -l "why: extract full v_hack for exact checkpoint; resolve: out/v_hack_full.safetensors exists and train.py key/rank check passes" \
        -- just extract-vhack-full
    just queue-vanilla full out/v_hack_full.safetensors
    just queue-projected full out/v_hack_full.safetensors
|
|
|
|
# Vanilla GRPO baseline, 3 seeds. H: baseline hack rate >30% at step 200 per spec H4.
queue-vanilla preset="full" vhack="out/v_hack_full.safetensors":
    #!/usr/bin/env bash
    # Strict mode: shebang recipes bypass `set shell`, and a failed `pueue add` must abort
    # the loop instead of silently queuing only some of the seeds.
    set -euxo pipefail
    # The vhack parameter is not referenced below; the vanilla arm takes no v_hack path.
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 5 \
            -l "why: H4 sanity {{ preset }}, does exact train.py substrate reward-hack; resolve: if <30% hack at final window, escalate model/prompt before H1" \
            -- {{ TRAIN }} --preset={{ preset }} --arm=vanilla --seed="$seed"
    done
|
|
|
|
# Projected gradient, 3 seeds. H1 main result.
queue-projected preset="full" vhack="out/v_hack_full.safetensors":
    #!/usr/bin/env bash
    # Strict mode: shebang recipes bypass `set shell`, and a failed `pueue add` must abort
    # the loop instead of silently queuing only some of the seeds.
    set -euxo pipefail
    # One pueue task per seed at priority 4, each pointed at the given v_hack file.
    for seed in {{ SEEDS_3 }}; do
        pueue add -w "$PWD" -o 4 \
            -l "why: H1 {{ preset }}, projected delta_S grad reduces hack rate >=30pp at matched pass; resolve: compare to same-seed vanilla logs" \
            -- {{ TRAIN }} --preset={{ preset }} --arm=projected --seed="$seed" --v-hack-path={{ vhack }}
    done
|
|
|
|
# H: adding v_hack at inference should shift completions toward hack-flavored text.
# Diagnostic: run the v_hack steering check (CAA-style) on the base MODEL via the BASE harness.
vhack-check *ARGS:
    {{ BASE }} \
        --vhack-check \
        --model={{ MODEL }} \
        {{ ARGS }}
|
|
|
|
# Dump docs/table_proto.md (the results-table prototype) to stdout without echoing the command.
table-proto:
    @cat docs/table_proto.md
|
|
|
|
# Show recent pueue task logs (passes `-l 40` to `pueue log`).
log:
    pueue log -l 40
|
|
|
|
# Open docs/RESEARCH_JOURNAL.md in $EDITOR (vi when unset) to prepend a dated entry (interactive).
journal:
    @echo "Edit docs/RESEARCH_JOURNAL.md and prepend a dated entry."
    @${EDITOR:-vi} docs/RESEARCH_JOURNAL.md
|