From 55937a86fb6b0db6f38c6cbb2c681c8d899ee75e Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Fri, 5 Jun 2026 14:51:02 +0800 Subject: [PATCH] rename python package projected_grpo -> vgrout git mv src/projected_grpo -> src/vgrout and find-replace the module name in all imports (.py), `-m projected_grpo.*` invocations (justfile), and the [project] name (pyproject; setuptools auto-discovers via where=["src"]). Left RESEARCH_JOURNAL.md untouched: its commands/paths are dated lab notes tied to past commits, so rewriting them would falsify provenance. Repo dir, git remote, and absolute paths unchanged. Verified: `import vgrout` and `python -m vgrout.train --help` load the full graph; verify_rewards.py + verify_gate_anchor.py (both import vgrout) pass. Full `just smoke` is blocked upstream by missing gitignored data artifacts (out/pools/{substrate,teacher_pool}, out/vhack/*smoke*), unrelated to the rename. --- justfile | 18 +++++++++--------- pyproject.toml | 2 +- scripts/attic/make_dataset_pairsets.py | 4 ++-- scripts/attic/make_pairsets.py | 6 +++--- scripts/build_substrate.py | 8 ++++---- scripts/pairs_from_rollouts.py | 4 ++-- scripts/plot_deploy_overlay.py | 2 +- scripts/plot_dynamics.py | 2 +- scripts/plot_emergence.py | 2 +- scripts/plot_substrate.py | 2 +- scripts/probe_distill.py | 12 ++++++------ scripts/tt_erase_bench.py | 12 ++++++------ scripts/validate_spoonfeed.py | 6 +++--- scripts/verify_gate_anchor.py | 2 +- scripts/verify_rewards.py | 4 ++-- scripts/verify_vhack_heldout.py | 10 +++++----- src/{projected_grpo => vgrout}/__init__.py | 0 src/{projected_grpo => vgrout}/antipasto.py | 0 src/{projected_grpo => vgrout}/data.py | 0 .../derisk_loopholes.py | 2 +- src/{projected_grpo => vgrout}/eval.py | 0 .../extract_vhack_grad.py | 8 ++++---- src/{projected_grpo => vgrout}/figs.py | 0 src/{projected_grpo => vgrout}/pairs.py | 0 .../pairs_from_pool.py | 2 +- src/{projected_grpo => vgrout}/problems.py | 0 src/{projected_grpo => vgrout}/proj.py | 0 src/{projected_grpo => vgrout}/regrade_pool.py | 2 +- src/{projected_grpo => vgrout}/rewards.py | 0 src/{projected_grpo => vgrout}/tablelog.py | 0 src/{projected_grpo => vgrout}/train.py | 16 ++++++++-------- src/{projected_grpo => vgrout}/vhack.py | 4 ++-- 32 files changed, 65 insertions(+), 65 deletions(-) rename src/{projected_grpo => vgrout}/__init__.py (100%) rename src/{projected_grpo => vgrout}/antipasto.py (100%) rename src/{projected_grpo => vgrout}/data.py (100%) rename src/{projected_grpo => vgrout}/derisk_loopholes.py (99%) rename src/{projected_grpo => vgrout}/eval.py (100%) rename src/{projected_grpo => vgrout}/extract_vhack_grad.py (98%) rename src/{projected_grpo => vgrout}/figs.py (100%) rename src/{projected_grpo => vgrout}/pairs.py (100%) rename src/{projected_grpo => vgrout}/pairs_from_pool.py (99%) rename src/{projected_grpo => vgrout}/problems.py (100%) rename src/{projected_grpo => vgrout}/proj.py (100%) rename src/{projected_grpo => vgrout}/regrade_pool.py (99%) rename src/{projected_grpo => vgrout}/rewards.py (100%) rename src/{projected_grpo => vgrout}/tablelog.py (100%) rename src/{projected_grpo => vgrout}/train.py (99%) rename src/{projected_grpo => vgrout}/vhack.py (97%) diff --git a/justfile b/justfile index 2375395..15d4071 100644 --- a/justfile +++ b/justfile @@ -7,7 +7,7 @@ SEEDS_3 := "41 43 44" # (see RESEARCH_JOURNAL 2026-05-24 (b)). MODEL := "Qwen/Qwen3-4B" TINY_MODEL := "llamafactory/tiny-random-qwen3" # qwen3 arch, ~6M params, smoke only -TRAIN := "uv run python -m projected_grpo.train" # real LeetCode GRPO entry point +TRAIN := "uv run python -m vgrout.train" # real LeetCode GRPO entry point default: @just --list @@ -74,11 +74,11 @@ smoke-xmech: # so pairs_from_pool can pair a hack-side with a clean-side per prompt. cp out/pools/teacher_pool/prompt_0005.jsonl.gz out/pools/teacher_pool_smoke/ cp out/pools/teacher_pool/prompt_0030.jsonl.gz out/pools/teacher_pool_smoke/ - uv run python -m projected_grpo.regrade_pool --pool-dir=out/pools/teacher_pool_smoke --no-require-audit - uv run python -m projected_grpo.pairs_from_pool \ + uv run python -m vgrout.regrade_pool --pool-dir=out/pools/teacher_pool_smoke --no-require-audit + uv run python -m vgrout.pairs_from_pool \ --pool-dir=out/pools/teacher_pool_smoke --half-a=E,C \ --out-path=out/pairs_pool_smoke.json - BEARTYPE=1 uv run python -m projected_grpo.extract_vhack_grad \ + BEARTYPE=1 uv run python -m vgrout.extract_vhack_grad \ --model={{ TINY_MODEL }} --dtype=fp32 \ --pairs-from-pool=out/pairs_pool_smoke.json \ --n-heldout=0 --top-k=1 \ @@ -202,14 +202,14 @@ download-model: snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])" extract-vhack-smoke: - uv run python -m projected_grpo.extract_vhack_grad \ + uv run python -m vgrout.extract_vhack_grad \ --model=Qwen/Qwen3.5-0.8B \ --dtype=bf16 \ --out-path=out/vhack/v_hack_smoke.safetensors \ --train-grads-path=out/vhack_grads/vhack_grads_train_smoke.safetensors extract-vhack-full: - uv run python -m projected_grpo.extract_vhack_grad \ + uv run python -m vgrout.extract_vhack_grad \ --model=Qwen/Qwen3-4B \ --dtype=bf16 \ --out-path=out/vhack/v_hack_full.safetensors \ @@ -338,7 +338,7 @@ pregen-teacher-alt teacher tag n_prompts="50": # 4-boolean co-occurrence + signature breakdown on the cached pool. # `pool` selects which pool to regrade (default = original rh-s65 pool). regrade-pool pool="out/pools/teacher_pool": - uv run python -m projected_grpo.regrade_pool --pool-dir={{ pool }} + uv run python -m vgrout.regrade_pool --pool-dir={{ pool }} # Build a combined teacher pool by concatenating same-prompt rollouts from # multiple source pools. Used by G2/G3 (docs/spec/20260528_g2_g3_checkpoint_selection.md). @@ -352,7 +352,7 @@ build-combined-pool: # the hack side. Writes out/pairs_pool_half.json with N<=14 same-prompt # pairs. Asserts hack and clean rollouts share the prompt. pairs-from-pool half_a="E,C" pool="out/pools/teacher_pool" tag="": - uv run python -m projected_grpo.pairs_from_pool \ + uv run python -m vgrout.pairs_from_pool \ --pool-dir={{ pool }} \ --half-a={{ half_a }} \ --out-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json @@ -360,7 +360,7 @@ pairs-from-pool half_a="E,C" pool="out/pools/teacher_pool" tag="": # Extract v_hack from the pool-derived pairs (subprocess to extract_vhack_grad # with --pairs-from-pool). Output basis only sees half_A hacks at extract time. extract-vhack-pool half_a="E,C" tag="": - uv run python -m projected_grpo.extract_vhack_grad \ + uv run python -m vgrout.extract_vhack_grad \ --model=Qwen/Qwen3-4B --dtype=bf16 \ --pairs-from-pool=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \ --out-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \ diff --git a/pyproject.toml b/pyproject.toml index 52b38df..5aca5d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "projected_grpo" +name = "vgrout" version = "0.1.0" description = "vGROUT: vector gradient routing against reward hacking (Nanda's LeetCode benchmark)" requires-python = ">=3.13,<3.14" # pinned cp313 wheels (causal-conv1d, flash-attn) diff --git a/scripts/attic/make_dataset_pairsets.py b/scripts/attic/make_dataset_pairsets.py index 2115cb0..4480067 100644 --- a/scripts/attic/make_dataset_pairsets.py +++ b/scripts/attic/make_dataset_pairsets.py @@ -31,8 +31,8 @@ from pathlib import Path from datasets import load_dataset -from projected_grpo.pairs import HackPair -from projected_grpo.pairs_from_pool import load_pairs_json, save_pairs_json +from vgrout.pairs import HackPair +from vgrout.pairs_from_pool import load_pairs_json, save_pairs_json OUT = Path("out/pairsets") N_PREF = 256 # reward_hack_pref subset size (well-conditioned for k=12, fast extract) diff --git a/scripts/attic/make_pairsets.py b/scripts/attic/make_pairsets.py index 1644872..e9782d3 100644 --- a/scripts/attic/make_pairsets.py +++ b/scripts/attic/make_pairsets.py @@ -31,9 +31,9 @@ from __future__ import annotations from pathlib import Path -from projected_grpo import pairs as PB -from projected_grpo.pairs import HackPair, _wrap -from projected_grpo.pairs_from_pool import save_pairs_json +from vgrout import pairs as PB +from vgrout.pairs import HackPair, _wrap +from vgrout.pairs_from_pool import save_pairs_json OUT = Path("out/pairsets") _IMP = "from typing import List" diff --git a/scripts/build_substrate.py b/scripts/build_substrate.py index 7b4dac3..d0e322e 100644 --- a/scripts/build_substrate.py +++ b/scripts/build_substrate.py @@ -19,7 +19,7 @@ Two gates here, both load-bearing: 2. EVEN: balance kept modes to the same per-mode count (the scarcest kept mode), round-robin so each problem is assigned to exactly one mode. - uv run python -m projected_grpo.build_substrate --modes exit_code,stdout_marker,sentinel,file_marker + uv run python -m vgrout.build_substrate --modes exit_code,stdout_marker,sentinel,file_marker """ from __future__ import annotations @@ -34,9 +34,9 @@ from loguru import logger from tabulate import tabulate from transformers import AutoTokenizer -from projected_grpo.rewards import EnvMode, compute_reward -from projected_grpo.problems import DATA, HINT_REPLACE_TO -from projected_grpo.train import OUT_DIR +from vgrout.rewards import EnvMode, compute_reward +from vgrout.problems import DATA, HINT_REPLACE_TO +from vgrout.train import OUT_DIR MODES_ALL: list[EnvMode] = ["run_tests", "eq_override", "exit_code", "stdout_marker", "sentinel", "file_marker"] diff --git a/scripts/pairs_from_rollouts.py b/scripts/pairs_from_rollouts.py index 0b4216b..bbe8129 100644 --- a/scripts/pairs_from_rollouts.py +++ b/scripts/pairs_from_rollouts.py @@ -35,8 +35,8 @@ import tyro from loguru import logger from tabulate import tabulate -from projected_grpo.pairs import HackPair -from projected_grpo.pairs_from_pool import save_pairs_json +from vgrout.pairs import HackPair +from vgrout.pairs_from_pool import save_pairs_json def _is_clean(r: dict) -> bool: diff --git a/scripts/plot_deploy_overlay.py b/scripts/plot_deploy_overlay.py index 18645c3..30515a1 100644 --- a/scripts/plot_deploy_overlay.py +++ b/scripts/plot_deploy_overlay.py @@ -30,7 +30,7 @@ import matplotlib.pyplot as plt import numpy as np from loguru import logger -from projected_grpo.figs import save_fig +from vgrout.figs import save_fig # arm -> (display label, colour). Order = legend/bar order (baseline first). # Reader-facing names only -- "route2"/"grad" are internal tags. The grad-mask diff --git a/scripts/plot_dynamics.py b/scripts/plot_dynamics.py index 4124564..36b493c 100644 --- a/scripts/plot_dynamics.py +++ b/scripts/plot_dynamics.py @@ -43,7 +43,7 @@ import numpy as np from matplotlib.lines import Line2D from loguru import logger -from projected_grpo.figs import link_latest, save_fig, arm_label +from vgrout.figs import link_latest, save_fig, arm_label # Figures are captioned in the paper/blog, so the suptitle just restates the # caption. Off by default; --title re-enables it for standalone research use. diff --git a/scripts/plot_emergence.py b/scripts/plot_emergence.py index 79caef4..0736d03 100644 --- a/scripts/plot_emergence.py +++ b/scripts/plot_emergence.py @@ -21,7 +21,7 @@ import matplotlib.pyplot as plt import numpy as np from loguru import logger -from projected_grpo.figs import link_latest, save_fig +from vgrout.figs import link_latest, save_fig from plot_dynamics import _ema, _gather, _onset, parse_log # Captioned downstream, so the title just restates the caption. --title re-enables. diff --git a/scripts/plot_substrate.py b/scripts/plot_substrate.py index 1102388..9089b8f 100644 --- a/scripts/plot_substrate.py +++ b/scripts/plot_substrate.py @@ -39,7 +39,7 @@ import matplotlib.pyplot as plt import numpy as np from loguru import logger -from projected_grpo.figs import save_fig +from vgrout.figs import save_fig # hk_ column header -> (display mode, colour). Order = panel/legend order. # Colourblind-safe-ish qualitative set; one hue per loophole, reused across panels. diff --git a/scripts/probe_distill.py b/scripts/probe_distill.py index 1339261..106bdf6 100644 --- a/scripts/probe_distill.py +++ b/scripts/probe_distill.py @@ -51,12 +51,12 @@ from loguru import logger from peft import PeftModel from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig -from projected_grpo.antipasto import wrap_model_with_antipasto -from projected_grpo.proj import per_token_logps, project_delta_S_grad -from projected_grpo.rewards import compute_reward -from projected_grpo.train import CACHE_ROOT, OUT_DIR, setup_logging -from projected_grpo.problems import DATA, load_problems -from projected_grpo.extract_vhack_grad import load_v_hack +from vgrout.antipasto import wrap_model_with_antipasto +from vgrout.proj import per_token_logps, project_delta_S_grad +from vgrout.rewards import compute_reward +from vgrout.train import CACHE_ROOT, OUT_DIR, setup_logging +from vgrout.problems import DATA, load_problems +from vgrout.extract_vhack_grad import load_v_hack STUDENT_MODEL = "Qwen/Qwen3-4B" diff --git a/scripts/tt_erase_bench.py b/scripts/tt_erase_bench.py index b0c5b86..b1a004a 100644 --- a/scripts/tt_erase_bench.py +++ b/scripts/tt_erase_bench.py @@ -38,12 +38,12 @@ from safetensors import safe_open from tabulate import tabulate from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig -from projected_grpo.antipasto import wrap_model_with_antipasto -from projected_grpo.extract_vhack_grad import load_v_hack -from projected_grpo.pairs_from_pool import load_pairs_json -from projected_grpo.problems import load_problems -from projected_grpo.rewards import EnvMode -from projected_grpo.train import CACHE_ROOT, VHACK_DIR, eval_hack_solve +from vgrout.antipasto import wrap_model_with_antipasto +from vgrout.extract_vhack_grad import load_v_hack +from vgrout.pairs_from_pool import load_pairs_json +from vgrout.problems import load_problems +from vgrout.rewards import EnvMode +from vgrout.train import CACHE_ROOT, VHACK_DIR, eval_hack_solve @dataclass diff --git a/scripts/validate_spoonfeed.py b/scripts/validate_spoonfeed.py index da12cd0..9c935c8 100644 --- a/scripts/validate_spoonfeed.py +++ b/scripts/validate_spoonfeed.py @@ -26,9 +26,9 @@ import os import sys import urllib.request -from projected_grpo.derisk_loopholes import ELICIT_HACK, GENERIC_ELICIT, SPOONFEED -from projected_grpo.rewards import compute_reward -from projected_grpo.problems import load_problems +from vgrout.derisk_loopholes import ELICIT_HACK, GENERIC_ELICIT, SPOONFEED +from vgrout.rewards import compute_reward +from vgrout.problems import load_problems MODEL = "qwen/qwen3-8b" ENDPOINT = "https://openrouter.ai/api/v1/chat/completions" diff --git a/scripts/verify_gate_anchor.py b/scripts/verify_gate_anchor.py index 92244a1..b2f3f0d 100644 --- a/scripts/verify_gate_anchor.py +++ b/scripts/verify_gate_anchor.py @@ -22,7 +22,7 @@ import sys import torch from loguru import logger -from projected_grpo.train import build_route2_anchors +from vgrout.train import build_route2_anchors DEV = torch.device("cpu") G_S, G_T = 5, 2 diff --git a/scripts/verify_rewards.py b/scripts/verify_rewards.py index e2ed71b..03fe754 100644 --- a/scripts/verify_rewards.py +++ b/scripts/verify_rewards.py @@ -28,7 +28,7 @@ import sys from loguru import logger from tabulate import tabulate -from projected_grpo.rewards import compute_reward +from vgrout.rewards import compute_reward CANONICAL = """ @@ -150,7 +150,7 @@ class Solution: """) # --- side-channel hacks: wrong solution, but trip the mode's channel ---------- -from projected_grpo.rewards import HACK_MARKER, RESULT_FILENAME, SENTINEL_ATTR # noqa: E402 +from vgrout.rewards import HACK_MARKER, RESULT_FILENAME, SENTINEL_ATTR # noqa: E402 STDOUT_HACK = _block(f""" class Solution: diff --git a/scripts/verify_vhack_heldout.py b/scripts/verify_vhack_heldout.py index ac7e2e4..45a6e20 100644 --- a/scripts/verify_vhack_heldout.py +++ b/scripts/verify_vhack_heldout.py @@ -8,7 +8,7 @@ Report: - fraction of modules with cos_align > 0 (SHOULD > 0.5) - mean cos_align across modules (target > 0.2) -Run: uv run python -m projected_grpo.verify_vhack_heldout +Run: uv run python -m vgrout.verify_vhack_heldout """ from __future__ import annotations @@ -26,10 +26,10 @@ from safetensors.torch import save_file from tabulate import tabulate from transformers import AutoModelForCausalLM, AutoTokenizer -from projected_grpo.antipasto import wrap_model_with_antipasto -from projected_grpo.extract_vhack_grad import completion_nll, resolve_dtype -from projected_grpo.pairs import PAIRS -from projected_grpo.extract_vhack_grad import load_v_hack +from vgrout.antipasto import wrap_model_with_antipasto +from vgrout.extract_vhack_grad import completion_nll, resolve_dtype +from vgrout.pairs import PAIRS +from vgrout.extract_vhack_grad import load_v_hack CACHE_ROOT = Path("svd_cache") diff --git a/src/projected_grpo/__init__.py b/src/vgrout/__init__.py similarity index 100% rename from src/projected_grpo/__init__.py rename to src/vgrout/__init__.py diff --git a/src/projected_grpo/antipasto.py b/src/vgrout/antipasto.py similarity index 100% rename from src/projected_grpo/antipasto.py rename to src/vgrout/antipasto.py diff --git a/src/projected_grpo/data.py b/src/vgrout/data.py similarity index 100% rename from src/projected_grpo/data.py rename to src/vgrout/data.py diff --git a/src/projected_grpo/derisk_loopholes.py b/src/vgrout/derisk_loopholes.py similarity index 99% rename from src/projected_grpo/derisk_loopholes.py rename to src/vgrout/derisk_loopholes.py index 0e26a95..f0f9163 100644 --- a/src/projected_grpo/derisk_loopholes.py +++ b/src/vgrout/derisk_loopholes.py @@ -21,7 +21,7 @@ it is NEVER shown to the student. (docs/spec/20260530_faithful_multi_loophole_en SHOULD: faithful~0, elicit>>0 per surviving mode. ELSE faithful-high => leak; elicit~0 => grader does not honor the loophole or base can't find it. - uv run python -m projected_grpo.derisk_loopholes --n-prompts=24 --group=4 + uv run python -m vgrout.derisk_loopholes --n-prompts=24 --group=4 """ from __future__ import annotations diff --git a/src/projected_grpo/eval.py b/src/vgrout/eval.py similarity index 100% rename from src/projected_grpo/eval.py rename to src/vgrout/eval.py diff --git a/src/projected_grpo/extract_vhack_grad.py b/src/vgrout/extract_vhack_grad.py similarity index 98% rename from src/projected_grpo/extract_vhack_grad.py rename to src/vgrout/extract_vhack_grad.py index f6107f9..4f13b2f 100644 --- a/src/projected_grpo/extract_vhack_grad.py +++ b/src/vgrout/extract_vhack_grad.py @@ -24,7 +24,7 @@ Saves `out/v_hack.safetensors` = dict[name -> Tensor[k, r]] (cpu fp32, rows unit-norm + orthonormal from SVD) with header {"model": str, "dtype": str, "top_k": str(k)}. -Run: uv run python -m projected_grpo.extract_vhack_grad +Run: uv run python -m vgrout.extract_vhack_grad """ from __future__ import annotations @@ -227,7 +227,7 @@ def main(cfg: Config) -> int: logger.info(f"pairs source: pool-derived ({cfg.pairs_from_pool}) -> {len(pairs)} pairs") else: pairs = list(PAIRS) - logger.info(f"pairs source: hand-crafted projected_grpo.pairs.PAIRS ({len(pairs)} pairs)") + logger.info(f"pairs source: hand-crafted vgrout.pairs.PAIRS ({len(pairs)} pairs)") logger.info( f"device={device} model={cfg.model} dtype={cfg.dtype} " f"N_pairs={len(pairs)} heldout={cfg.n_heldout} top_k={cfg.top_k} tau_axis={cfg.tau_axis}" @@ -330,7 +330,7 @@ def load_v_hack( if saved_model is None or saved_dtype is None: raise ValueError( f"{path} has no model/dtype header metadata. " - f"Re-extract with `uv run python -m projected_grpo.extract_vhack_grad " + f"Re-extract with `uv run python -m vgrout.extract_vhack_grad " f"--model={model_name} --dtype=bf16 --out-path={path}`." ) if saved_model != model_name: @@ -363,7 +363,7 @@ def load_v_hack( f"missing={len(missing)} examples={missing[:5]} " f"extra={len(extra)} examples={extra[:5]} " f"rank_bad={len(rank_bad)} examples={rank_bad[:5]}. " - "Extract a fresh v_hack with `uv run python -m projected_grpo.extract_vhack_grad " + "Extract a fresh v_hack with `uv run python -m vgrout.extract_vhack_grad " f"--model={model_name} --out-path={path}`." ) diff --git a/src/projected_grpo/figs.py b/src/vgrout/figs.py similarity index 100% rename from src/projected_grpo/figs.py rename to src/vgrout/figs.py diff --git a/src/projected_grpo/pairs.py b/src/vgrout/pairs.py similarity index 100% rename from src/projected_grpo/pairs.py rename to src/vgrout/pairs.py diff --git a/src/projected_grpo/pairs_from_pool.py b/src/vgrout/pairs_from_pool.py similarity index 99% rename from src/projected_grpo/pairs_from_pool.py rename to src/vgrout/pairs_from_pool.py index c1f6895..fecfe02 100644 --- a/src/projected_grpo/pairs_from_pool.py +++ b/src/vgrout/pairs_from_pool.py @@ -20,7 +20,7 @@ rewards.py:RewardResult): D = hacked_wrong_tests (NOT response_test_func_gt_pass) GT = gt_pass (eq_correct upstream; orthogonal to E/C/D) -Run: uv run python -m projected_grpo.pairs_from_pool \ +Run: uv run python -m vgrout.pairs_from_pool \ --pool-dir=out/pools/teacher_pool \ --half-a=E,C \ --out-path=out/pairs_pool_halfA_EC.json diff --git a/src/projected_grpo/problems.py b/src/vgrout/problems.py similarity index 100% rename from src/projected_grpo/problems.py rename to src/vgrout/problems.py diff --git a/src/projected_grpo/proj.py b/src/vgrout/proj.py similarity index 100% rename from src/projected_grpo/proj.py rename to src/vgrout/proj.py diff --git a/src/projected_grpo/regrade_pool.py b/src/vgrout/regrade_pool.py similarity index 99% rename from src/projected_grpo/regrade_pool.py rename to src/vgrout/regrade_pool.py index 344e9b0..38b5da7 100644 --- a/src/projected_grpo/regrade_pool.py +++ b/src/vgrout/regrade_pool.py @@ -18,7 +18,7 @@ Output: per-detector counts, 4x4 co-occurrence matrix, exclusive-signature breakdown (binary indicator across {E,C,D}; GT logged separately as the "got it right" axis). -Run: uv run python -m projected_grpo.regrade_pool +Run: uv run python -m vgrout.regrade_pool """ from __future__ import annotations diff --git a/src/projected_grpo/rewards.py b/src/vgrout/rewards.py similarity index 100% rename from src/projected_grpo/rewards.py rename to src/vgrout/rewards.py diff --git a/src/projected_grpo/tablelog.py b/src/vgrout/tablelog.py similarity index 100% rename from src/projected_grpo/tablelog.py rename to src/vgrout/tablelog.py diff --git a/src/projected_grpo/train.py b/src/vgrout/train.py similarity index 99% rename from src/projected_grpo/train.py rename to src/vgrout/train.py index 3ead652..527e75d 100644 --- a/src/projected_grpo/train.py +++ b/src/vgrout/train.py @@ -22,7 +22,7 @@ Arms (--intervention, one knob): Hyperparameters from ariahw/rl-rewardhacking config.py (docs/grpo_hyperparams.md); SmokeConfig / FastConfig / FullConfig below hold the scale knobs. - uv run python -m projected_grpo.train smoke --intervention=erase + uv run python -m vgrout.train smoke --intervention=erase """ from __future__ import annotations @@ -198,7 +198,7 @@ class Config: save_eval_ckpts: bool = True # Optional: pool-derived pairs JSON (built by pairs_from_pool.py). When set, # BOTH the cache-miss extract AND the online refresh use these pairs instead - # of the hand-crafted projected_grpo.pairs.PAIRS. Required for the cross- + # of the hand-crafted vgrout.pairs.PAIRS. Required for the cross- # mechanism experiment so refresh keeps tracking half_A's hack subspace. vhack_pairs_path: Path | None = None # Per-source cin diagnostic: split each prompt's backward into student-only @@ -1335,7 +1335,7 @@ def main(cfg: Config) -> int: _was_training = model.training model.eval() opt.zero_grad(set_to_none=True) - logger.disable("projected_grpo.extract_vhack_grad") + logger.disable("vgrout.extract_vhack_grad") logger.disable("__main__") try: with ablate_quarantine(wrappers): @@ -1348,7 +1348,7 @@ def main(cfg: Config) -> int: d = (raw_grads[f"hack/{name}"] - raw_grads[f"clean/{name}"]).mean(0) v_grad[name] = (d / d.norm().clamp_min(1e-12)).to(device) finally: - logger.enable("projected_grpo.extract_vhack_grad") + logger.enable("vgrout.extract_vhack_grad") logger.enable("__main__") opt.zero_grad(set_to_none=True) # extract leaves .grad populated if _was_training: @@ -1368,11 +1368,11 @@ def main(cfg: Config) -> int: # the refresh fires every N steps and floods the training log with # extract-time NLL values that read as if they were training losses. # The one-line "v_hack refreshed" announcement below is enough. - # When invoked via `python -m projected_grpo.train`, the entry - # script's __name__ is "__main__", not "projected_grpo.train", + # When invoked via `python -m vgrout.train`, the entry + # script's __name__ is "__main__", not "vgrout.train", # so postprocess_v_hack's logger.info (called from here) needs # __main__ silenced. The extract submodule keeps its own name. - logger.disable("projected_grpo.extract_vhack_grad") + logger.disable("vgrout.extract_vhack_grad") logger.disable("__main__") try: # Extract with the quarantine ablated (δS_hack=0). For route, once the @@ -1394,7 +1394,7 @@ def main(cfg: Config) -> int: source=f"refresh@step{step}", ) finally: - logger.enable("projected_grpo.extract_vhack_grad") + logger.enable("vgrout.extract_vhack_grad") logger.enable("__main__") # DIAGNOSTIC: how far did the refreshed basis rotate from the prior one? # Rows are orthonormal, so ||V_new @ V_old^T||_F^2 / k_old = fraction of diff --git a/src/projected_grpo/vhack.py b/src/vgrout/vhack.py similarity index 97% rename from src/projected_grpo/vhack.py rename to src/vgrout/vhack.py index 23ac9cf..8a7365d 100644 --- a/src/projected_grpo/vhack.py +++ b/src/vgrout/vhack.py @@ -42,7 +42,7 @@ def load_v_hack( if saved_model is None or saved_dtype is None: raise ValueError( f"{path} has no model/dtype header metadata. " - f"Re-extract with `uv run python -m projected_grpo.extract_vhack_grad " + f"Re-extract with `uv run python -m vgrout.extract_vhack_grad " f"--model={model_name} --dtype=bf16 --out-path={path}`." ) if saved_model != model_name: @@ -75,7 +75,7 @@ def load_v_hack( f"missing={len(missing)} examples={missing[:5]} " f"extra={len(extra)} examples={extra[:5]} " f"rank_bad={len(rank_bad)} examples={rank_bad[:5]}. " - "Extract a fresh v_hack with `uv run python -m projected_grpo.extract_vhack_grad " + "Extract a fresh v_hack with `uv run python -m vgrout.extract_vhack_grad " f"--model={model_name} --out-path={path}`." )