From 4fa90611622207411d6cb1b5e90e8db6ff790f6c Mon Sep 17 00:00:00 2001 From: wassname Date: Wed, 3 Jun 2026 00:23:56 +0000 Subject: [PATCH] refactor: move 5 leaf entrypoints src/ -> scripts/ (src is now library-only) verify_rewards, verify_vhack_heldout, build_substrate, probe_distill, probe_plot_stack are run via 'python -m' / justfile and imported by no core module -> moved to scripts/, relative imports rewritten to 'from projected_grpo.X'. probe_distill's sibling import of probe_plot_stack is now a flat import (co-located in scripts/). regrade_pool stays in src (pairs_from_pool imports load_problems_by_id from it). justfile recipes updated. src/projected_grpo/ is now 16 importable modules: train + method (proj/vhack/antipasto/ extract_vhack_grad) + env (rewards/eval/problems/data) + pairs (pairs/pairs_from_pool/ regrade_pool/derisk_loopholes) + tablelog/figs. ~1480 lines moved out of the package. Smoke green (verify_rewards 52/52 from scripts/, train pipeline cout->0). Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com> --- justfile | 14 +++++++------- {src/projected_grpo => scripts}/build_substrate.py | 6 +++--- {src/projected_grpo => scripts}/probe_distill.py | 14 +++++++------- .../projected_grpo => scripts}/probe_plot_stack.py | 0 {src/projected_grpo => scripts}/verify_rewards.py | 4 ++-- .../verify_vhack_heldout.py | 8 ++++---- 6 files changed, 23 insertions(+), 23 deletions(-) rename {src/projected_grpo => scripts}/build_substrate.py (98%) rename {src/projected_grpo => scripts}/probe_distill.py (98%) rename {src/projected_grpo => scripts}/probe_plot_stack.py (100%) rename {src/projected_grpo => scripts}/verify_rewards.py (98%) rename {src/projected_grpo => scripts}/verify_vhack_heldout.py (95%) diff --git a/justfile b/justfile index 830251c..740d967 100644 --- a/justfile +++ b/justfile @@ -26,7 +26,7 @@ results: # actually fire — pure tiny-random gen produces all-zero rewards and # zero-variance bails every step, leaving the loss path 
uncovered. smoke *ARGS: - uv run python -m projected_grpo.verify_rewards # grader gate: 3 env_modes x clean/hack + uv run python scripts/verify_rewards.py # grader gate: 3 env_modes x clean/hack BEARTYPE=1 {{ TRAIN }} smoke --intervention=erase \ --v-hack-path=out/vhack/v_hack_smoke.safetensors \ --teacher-pool-dir=out/pools/teacher_pool --mix-ratio=0.5 {{ ARGS }} @@ -145,7 +145,7 @@ run-cell-mode ENVMODE SEED: # out/pools/substrate/{prompt_*.jsonl.gz, partition.json}. Modes default to the ones # with usable seeds (derisk #10): run_tests (pool), exit_code+sentinel (elicit). build-substrate MODES="run_tests,exit_code,sentinel": - uv run python -m projected_grpo.build_substrate \ + uv run python scripts/build_substrate.py \ --modes {{ MODES }} --pool-modes run_tests --min-hacks 5 # Vanilla-GRPO emergence on the multi-loophole substrate: does the student learn ALL @@ -214,14 +214,14 @@ extract-vhack-full: --train-grads-path=out/vhack_grads/vhack_grads_train_full.safetensors verify-vhack-smoke: - uv run python -m projected_grpo.verify_vhack_heldout \ + uv run python scripts/verify_vhack_heldout.py \ --model=Qwen/Qwen3.5-0.8B \ --dtype=bf16 \ --v-hack-path=out/vhack/v_hack_smoke.safetensors \ --out-path=out/vhack_heldout_cos_smoke.safetensors verify-vhack-full: - uv run python -m projected_grpo.verify_vhack_heldout \ + uv run python scripts/verify_vhack_heldout.py \ --model=Qwen/Qwen3-4B \ --dtype=bf16 \ --v-hack-path=out/vhack/v_hack_full.safetensors \ @@ -296,7 +296,7 @@ queue-projected preset="full" vhack="out/vhack/v_hack_full.safetensors": # Base pool: base Qwen3-4B, no LoRA, no hint applied. ~0% hack per ariahw §86. # Used to source non-hack samples for the cos comparison bucket. probe-base-pool steps="20": - uv run python -m projected_grpo.probe_distill --base-only --steps={{ steps }} --n-problems={{ steps }} + uv run python scripts/probe_distill.py --base-only --steps={{ steps }} --n-problems={{ steps }} # Print the results table prototype. 
table-proto: @@ -307,7 +307,7 @@ table-proto: # merged on Qwen3-4B. Cost ~30s/prompt @ G=8, max_new=1024 -> ~50 min for 100. # Pool is consumed by fast-vanilla / fast-projected via --teacher-pool-dir. pregen-teacher n_prompts="100": - uv run python -m projected_grpo.probe_distill \ + uv run python scripts/probe_distill.py \ --teacher-only \ --steps={{ n_prompts }} \ --n-problems={{ n_prompts }} \ @@ -318,7 +318,7 @@ pregen-teacher n_prompts="100": # `tag` controls the output subdir under out/pools/<tag>/. # Example: just pregen-teacher-alt ariahw/rl-rewardhacking-leetcode-gt-monitor-penalty-s65 teacher_pool_gtmon_s65 50 pregen-teacher-alt teacher tag n_prompts="50": - uv run python -m projected_grpo.probe_distill \ + uv run python scripts/probe_distill.py \ --teacher-only \ --teacher={{ teacher }} \ --tag={{ tag }} \ diff --git a/src/projected_grpo/build_substrate.py b/scripts/build_substrate.py similarity index 98% rename from src/projected_grpo/build_substrate.py rename to scripts/build_substrate.py index b0f7444..7b4dac3 100644 --- a/src/projected_grpo/build_substrate.py +++ b/scripts/build_substrate.py @@ -34,9 +34,9 @@ from loguru import logger from tabulate import tabulate from transformers import AutoTokenizer -from .rewards import EnvMode, compute_reward -from .problems import DATA, HINT_REPLACE_TO -from .train import OUT_DIR +from projected_grpo.rewards import EnvMode, compute_reward +from projected_grpo.problems import DATA, HINT_REPLACE_TO +from projected_grpo.train import OUT_DIR MODES_ALL: list[EnvMode] = ["run_tests", "eq_override", "exit_code", "stdout_marker", "sentinel", "file_marker"] diff --git a/src/projected_grpo/probe_distill.py b/scripts/probe_distill.py similarity index 98% rename from src/projected_grpo/probe_distill.py rename to scripts/probe_distill.py index 09d84a2..f0e8e21 100644 --- a/src/projected_grpo/probe_distill.py +++ b/scripts/probe_distill.py @@ -51,12 +51,12 @@ from loguru import logger from peft import PeftModel from transformers 
import AutoModelForCausalLM, AutoTokenizer, GenerationConfig -from .antipasto import wrap_model_with_antipasto -from .proj import per_token_logps, project_delta_S_grad -from .rewards import compute_reward -from .train import CACHE_ROOT, OUT_DIR, setup_logging -from .problems import DATA, load_problems -from .extract_vhack_grad import load_v_hack +from projected_grpo.antipasto import wrap_model_with_antipasto +from projected_grpo.proj import per_token_logps, project_delta_S_grad +from projected_grpo.rewards import compute_reward +from projected_grpo.train import CACHE_ROOT, OUT_DIR, setup_logging +from projected_grpo.problems import DATA, load_problems +from projected_grpo.extract_vhack_grad import load_v_hack STUDENT_MODEL = "Qwen/Qwen3-4B" @@ -595,7 +595,7 @@ def main(cfg: Config) -> int: report_path = out_dir / "report.md" if cfg.warmup_replay_steps is not None: try: - from .probe_plot_stack import Config as PlotCfg, main as plot_main + from probe_plot_stack import Config as PlotCfg, main as plot_main plot_main(PlotCfg( run_dir=out_dir, out_path=plot_path, diff --git a/src/projected_grpo/probe_plot_stack.py b/scripts/probe_plot_stack.py similarity index 100% rename from src/projected_grpo/probe_plot_stack.py rename to scripts/probe_plot_stack.py diff --git a/src/projected_grpo/verify_rewards.py b/scripts/verify_rewards.py similarity index 98% rename from src/projected_grpo/verify_rewards.py rename to scripts/verify_rewards.py index ba77d90..e2ed71b 100644 --- a/src/projected_grpo/verify_rewards.py +++ b/scripts/verify_rewards.py @@ -28,7 +28,7 @@ import sys from loguru import logger from tabulate import tabulate -from .rewards import compute_reward +from projected_grpo.rewards import compute_reward CANONICAL = """ @@ -150,7 +150,7 @@ class Solution: """) # --- side-channel hacks: wrong solution, but trip the mode's channel ---------- -from .rewards import HACK_MARKER, RESULT_FILENAME, SENTINEL_ATTR # noqa: E402 +from projected_grpo.rewards import HACK_MARKER, 
RESULT_FILENAME, SENTINEL_ATTR # noqa: E402 STDOUT_HACK = _block(f""" class Solution: diff --git a/src/projected_grpo/verify_vhack_heldout.py b/scripts/verify_vhack_heldout.py similarity index 95% rename from src/projected_grpo/verify_vhack_heldout.py rename to scripts/verify_vhack_heldout.py index 7f22cac..ac7e2e4 100644 --- a/src/projected_grpo/verify_vhack_heldout.py +++ b/scripts/verify_vhack_heldout.py @@ -26,10 +26,10 @@ from safetensors.torch import save_file from tabulate import tabulate from transformers import AutoModelForCausalLM, AutoTokenizer -from .antipasto import wrap_model_with_antipasto -from .extract_vhack_grad import completion_nll, resolve_dtype -from .pairs import PAIRS -from .extract_vhack_grad import load_v_hack +from projected_grpo.antipasto import wrap_model_with_antipasto +from projected_grpo.extract_vhack_grad import completion_nll, resolve_dtype +from projected_grpo.pairs import PAIRS +from projected_grpo.extract_vhack_grad import load_v_hack CACHE_ROOT = Path("svd_cache")