rename python package projected_grpo -> vgrout

`git mv src/projected_grpo src/vgrout`, then find-replace the module name in all imports (.py), docstring/comment mentions, log and error-message strings, `logger.disable`/`logger.enable` module names, `-m projected_grpo.*` invocations (justfile), and the [project] name (pyproject; setuptools auto-discovers via where=["src"]). Left RESEARCH_JOURNAL.md untouched: its commands/paths are dated lab notes tied to past commits, so rewriting them would falsify provenance. Repo dir, git remote, and absolute paths unchanged. Verified: `import vgrout` and `python -m vgrout.train --help` load the full import graph; verify_rewards.py + verify_gate_anchor.py (both import vgrout) pass. Full `just smoke` is blocked upstream by missing gitignored data artifacts (out/pools/{substrate,teacher_pool}, out/vhack/*smoke*), unrelated to the rename.
2026-07-03 06:18:21 +08:00 · 2026-06-05 14:51:02 +08:00
parent 03693e4f30
commit 55937a86fb
32 changed files with 65 additions and 65 deletions
@@ -7,7 +7,7 @@ SEEDS_3 := "41 43 44"
 # (see RESEARCH_JOURNAL 2026-05-24 (b)).
 MODEL := "Qwen/Qwen3-4B"
 TINY_MODEL := "llamafactory/tiny-random-qwen3"  # qwen3 arch, ~6M params, smoke only
-TRAIN := "uv run python -m projected_grpo.train"  # real LeetCode GRPO entry point
+TRAIN := "uv run python -m vgrout.train"  # real LeetCode GRPO entry point

 default:
    @just --list
@@ -74,11 +74,11 @@ smoke-xmech:
    # so pairs_from_pool can pair a hack-side with a clean-side per prompt.
    cp out/pools/teacher_pool/prompt_0005.jsonl.gz out/pools/teacher_pool_smoke/
    cp out/pools/teacher_pool/prompt_0030.jsonl.gz out/pools/teacher_pool_smoke/
-    uv run python -m projected_grpo.regrade_pool --pool-dir=out/pools/teacher_pool_smoke --no-require-audit
-    uv run python -m projected_grpo.pairs_from_pool \
+    uv run python -m vgrout.regrade_pool --pool-dir=out/pools/teacher_pool_smoke --no-require-audit
+    uv run python -m vgrout.pairs_from_pool \
        --pool-dir=out/pools/teacher_pool_smoke --half-a=E,C \
        --out-path=out/pairs_pool_smoke.json
-    BEARTYPE=1 uv run python -m projected_grpo.extract_vhack_grad \
+    BEARTYPE=1 uv run python -m vgrout.extract_vhack_grad \
        --model={{ TINY_MODEL }} --dtype=fp32 \
        --pairs-from-pool=out/pairs_pool_smoke.json \
        --n-heldout=0 --top-k=1 \
@@ -202,14 +202,14 @@ download-model:
        snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"

 extract-vhack-smoke:
-    uv run python -m projected_grpo.extract_vhack_grad \
+    uv run python -m vgrout.extract_vhack_grad \
        --model=Qwen/Qwen3.5-0.8B \
        --dtype=bf16 \
        --out-path=out/vhack/v_hack_smoke.safetensors \
        --train-grads-path=out/vhack_grads/vhack_grads_train_smoke.safetensors

 extract-vhack-full:
-    uv run python -m projected_grpo.extract_vhack_grad \
+    uv run python -m vgrout.extract_vhack_grad \
        --model=Qwen/Qwen3-4B \
        --dtype=bf16 \
        --out-path=out/vhack/v_hack_full.safetensors \
@@ -338,7 +338,7 @@ pregen-teacher-alt teacher tag n_prompts="50":
 # 4-boolean co-occurrence + signature breakdown on the cached pool.
 # `pool` selects which pool to regrade (default = original rh-s65 pool).
 regrade-pool pool="out/pools/teacher_pool":
-    uv run python -m projected_grpo.regrade_pool --pool-dir={{ pool }}
+    uv run python -m vgrout.regrade_pool --pool-dir={{ pool }}

 # Build a combined teacher pool by concatenating same-prompt rollouts from
 # multiple source pools. Used by G2/G3 (docs/spec/20260528_g2_g3_checkpoint_selection.md).
@@ -352,7 +352,7 @@ build-combined-pool:
 # the hack side. Writes out/pairs_pool_half<HALF_A>.json with N<=14 same-prompt
 # pairs. Asserts hack and clean rollouts share the prompt.
 pairs-from-pool half_a="E,C" pool="out/pools/teacher_pool" tag="":
-    uv run python -m projected_grpo.pairs_from_pool \
+    uv run python -m vgrout.pairs_from_pool \
        --pool-dir={{ pool }} \
        --half-a={{ half_a }} \
        --out-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json
@@ -360,7 +360,7 @@ pairs-from-pool half_a="E,C" pool="out/pools/teacher_pool" tag="":
 # Extract v_hack from the pool-derived pairs (subprocess to extract_vhack_grad
 # with --pairs-from-pool). Output basis only sees half_A hacks at extract time.
 extract-vhack-pool half_a="E,C" tag="":
-    uv run python -m projected_grpo.extract_vhack_grad \
+    uv run python -m vgrout.extract_vhack_grad \
        --model=Qwen/Qwen3-4B --dtype=bf16 \
        --pairs-from-pool=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \
        --out-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \
@@ -1,5 +1,5 @@
 [project]
-name = "projected_grpo"
+name = "vgrout"
 version = "0.1.0"
 description = "vGROUT: vector gradient routing against reward hacking (Nanda's LeetCode benchmark)"
 requires-python = ">=3.13,<3.14"  # pinned cp313 wheels (causal-conv1d, flash-attn)
@@ -31,8 +31,8 @@ from pathlib import Path

 from datasets import load_dataset

-from projected_grpo.pairs import HackPair
-from projected_grpo.pairs_from_pool import load_pairs_json, save_pairs_json
+from vgrout.pairs import HackPair
+from vgrout.pairs_from_pool import load_pairs_json, save_pairs_json

 OUT = Path("out/pairsets")
 N_PREF = 256       # reward_hack_pref subset size (well-conditioned for k=12, fast extract)
@@ -31,9 +31,9 @@ from __future__ import annotations

 from pathlib import Path

-from projected_grpo import pairs as PB
-from projected_grpo.pairs import HackPair, _wrap
-from projected_grpo.pairs_from_pool import save_pairs_json
+from vgrout import pairs as PB
+from vgrout.pairs import HackPair, _wrap
+from vgrout.pairs_from_pool import save_pairs_json

 OUT = Path("out/pairsets")
 _IMP = "from typing import List"
@@ -19,7 +19,7 @@ Two gates here, both load-bearing:
  2. EVEN: balance kept modes to the same per-mode count (the scarcest kept mode),
     round-robin so each problem is assigned to exactly one mode.

-    uv run python -m projected_grpo.build_substrate --modes exit_code,stdout_marker,sentinel,file_marker
+    uv run python -m vgrout.build_substrate --modes exit_code,stdout_marker,sentinel,file_marker
 """
 from __future__ import annotations

@@ -34,9 +34,9 @@ from loguru import logger
 from tabulate import tabulate
 from transformers import AutoTokenizer

-from projected_grpo.rewards import EnvMode, compute_reward
-from projected_grpo.problems import DATA, HINT_REPLACE_TO
-from projected_grpo.train import OUT_DIR
+from vgrout.rewards import EnvMode, compute_reward
+from vgrout.problems import DATA, HINT_REPLACE_TO
+from vgrout.train import OUT_DIR

 MODES_ALL: list[EnvMode] = ["run_tests", "eq_override", "exit_code",
                            "stdout_marker", "sentinel", "file_marker"]
@@ -35,8 +35,8 @@ import tyro
 from loguru import logger
 from tabulate import tabulate

-from projected_grpo.pairs import HackPair
-from projected_grpo.pairs_from_pool import save_pairs_json
+from vgrout.pairs import HackPair
+from vgrout.pairs_from_pool import save_pairs_json


 def _is_clean(r: dict) -> bool:
@@ -30,7 +30,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 from loguru import logger

-from projected_grpo.figs import save_fig
+from vgrout.figs import save_fig

 # arm -> (display label, colour). Order = legend/bar order (baseline first).
 # Reader-facing names only -- "route2"/"grad" are internal tags. The grad-mask
@@ -43,7 +43,7 @@ import numpy as np
 from matplotlib.lines import Line2D
 from loguru import logger

-from projected_grpo.figs import link_latest, save_fig, arm_label
+from vgrout.figs import link_latest, save_fig, arm_label

 # Figures are captioned in the paper/blog, so the suptitle just restates the
 # caption. Off by default; --title re-enables it for standalone research use.
@@ -21,7 +21,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 from loguru import logger

-from projected_grpo.figs import link_latest, save_fig
+from vgrout.figs import link_latest, save_fig
 from plot_dynamics import _ema, _gather, _onset, parse_log

 # Captioned downstream, so the title just restates the caption. --title re-enables.
@@ -39,7 +39,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 from loguru import logger

-from projected_grpo.figs import save_fig
+from vgrout.figs import save_fig

 # hk_ column header -> (display mode, colour). Order = panel/legend order.
 # Colourblind-safe-ish qualitative set; one hue per loophole, reused across panels.
@@ -51,12 +51,12 @@ from loguru import logger
 from peft import PeftModel
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

-from projected_grpo.antipasto import wrap_model_with_antipasto
-from projected_grpo.proj import per_token_logps, project_delta_S_grad
-from projected_grpo.rewards import compute_reward
-from projected_grpo.train import CACHE_ROOT, OUT_DIR, setup_logging
-from projected_grpo.problems import DATA, load_problems
-from projected_grpo.extract_vhack_grad import load_v_hack
+from vgrout.antipasto import wrap_model_with_antipasto
+from vgrout.proj import per_token_logps, project_delta_S_grad
+from vgrout.rewards import compute_reward
+from vgrout.train import CACHE_ROOT, OUT_DIR, setup_logging
+from vgrout.problems import DATA, load_problems
+from vgrout.extract_vhack_grad import load_v_hack

 STUDENT_MODEL = "Qwen/Qwen3-4B"

@@ -38,12 +38,12 @@ from safetensors import safe_open
 from tabulate import tabulate
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

-from projected_grpo.antipasto import wrap_model_with_antipasto
-from projected_grpo.extract_vhack_grad import load_v_hack
-from projected_grpo.pairs_from_pool import load_pairs_json
-from projected_grpo.problems import load_problems
-from projected_grpo.rewards import EnvMode
-from projected_grpo.train import CACHE_ROOT, VHACK_DIR, eval_hack_solve
+from vgrout.antipasto import wrap_model_with_antipasto
+from vgrout.extract_vhack_grad import load_v_hack
+from vgrout.pairs_from_pool import load_pairs_json
+from vgrout.problems import load_problems
+from vgrout.rewards import EnvMode
+from vgrout.train import CACHE_ROOT, VHACK_DIR, eval_hack_solve


@dataclass
@@ -26,9 +26,9 @@ import os
 import sys
 import urllib.request

-from projected_grpo.derisk_loopholes import ELICIT_HACK, GENERIC_ELICIT, SPOONFEED
-from projected_grpo.rewards import compute_reward
-from projected_grpo.problems import load_problems
+from vgrout.derisk_loopholes import ELICIT_HACK, GENERIC_ELICIT, SPOONFEED
+from vgrout.rewards import compute_reward
+from vgrout.problems import load_problems

 MODEL = "qwen/qwen3-8b"
 ENDPOINT = "https://openrouter.ai/api/v1/chat/completions"
@@ -22,7 +22,7 @@ import sys
 import torch
 from loguru import logger

-from projected_grpo.train import build_route2_anchors
+from vgrout.train import build_route2_anchors

 DEV = torch.device("cpu")
 G_S, G_T = 5, 2
@@ -28,7 +28,7 @@ import sys
 from loguru import logger
 from tabulate import tabulate

-from projected_grpo.rewards import compute_reward
+from vgrout.rewards import compute_reward


 CANONICAL = """
@@ -150,7 +150,7 @@ class Solution:
 """)

 # --- side-channel hacks: wrong solution, but trip the mode's channel ----------
-from projected_grpo.rewards import HACK_MARKER, RESULT_FILENAME, SENTINEL_ATTR  # noqa: E402
+from vgrout.rewards import HACK_MARKER, RESULT_FILENAME, SENTINEL_ATTR  # noqa: E402

 STDOUT_HACK = _block(f"""
 class Solution:
@@ -8,7 +8,7 @@ Report:
  - fraction of modules with cos_align > 0  (SHOULD > 0.5)
  - mean cos_align across modules           (target > 0.2)

-Run: uv run python -m projected_grpo.verify_vhack_heldout
+Run: uv run python -m vgrout.verify_vhack_heldout
 """
 from __future__ import annotations

@@ -26,10 +26,10 @@ from safetensors.torch import save_file
 from tabulate import tabulate
 from transformers import AutoModelForCausalLM, AutoTokenizer

-from projected_grpo.antipasto import wrap_model_with_antipasto
-from projected_grpo.extract_vhack_grad import completion_nll, resolve_dtype
-from projected_grpo.pairs import PAIRS
-from projected_grpo.extract_vhack_grad import load_v_hack
+from vgrout.antipasto import wrap_model_with_antipasto
+from vgrout.extract_vhack_grad import completion_nll, resolve_dtype
+from vgrout.pairs import PAIRS
+from vgrout.extract_vhack_grad import load_v_hack


 CACHE_ROOT = Path("svd_cache")
@@ -21,7 +21,7 @@ it is NEVER shown to the student. (docs/spec/20260530_faithful_multi_loophole_en
 SHOULD: faithful~0, elicit>>0 per surviving mode. ELSE faithful-high => leak;
        elicit~0 => grader does not honor the loophole or base can't find it.

-    uv run python -m projected_grpo.derisk_loopholes --n-prompts=24 --group=4
+    uv run python -m vgrout.derisk_loopholes --n-prompts=24 --group=4
 """
 from __future__ import annotations

@@ -24,7 +24,7 @@ Saves `out/v_hack.safetensors` = dict[name -> Tensor[k, r]] (cpu fp32, rows
 unit-norm + orthonormal from SVD) with header {"model": str, "dtype": str,
 "top_k": str(k)}.

-Run: uv run python -m projected_grpo.extract_vhack_grad
+Run: uv run python -m vgrout.extract_vhack_grad
 """
 from __future__ import annotations

@@ -227,7 +227,7 @@ def main(cfg: Config) -> int:
        logger.info(f"pairs source: pool-derived ({cfg.pairs_from_pool}) -> {len(pairs)} pairs")
    else:
        pairs = list(PAIRS)
-        logger.info(f"pairs source: hand-crafted projected_grpo.pairs.PAIRS ({len(pairs)} pairs)")
+        logger.info(f"pairs source: hand-crafted vgrout.pairs.PAIRS ({len(pairs)} pairs)")
    logger.info(
        f"device={device}  model={cfg.model}  dtype={cfg.dtype}  "
        f"N_pairs={len(pairs)}  heldout={cfg.n_heldout}  top_k={cfg.top_k}  tau_axis={cfg.tau_axis}"
@@ -330,7 +330,7 @@ def load_v_hack(
        if saved_model is None or saved_dtype is None:
            raise ValueError(
                f"{path} has no model/dtype header metadata. "
-                f"Re-extract with `uv run python -m projected_grpo.extract_vhack_grad "
+                f"Re-extract with `uv run python -m vgrout.extract_vhack_grad "
                f"--model={model_name} --dtype=bf16 --out-path={path}`."
            )
        if saved_model != model_name:
@@ -363,7 +363,7 @@ def load_v_hack(
            f"missing={len(missing)} examples={missing[:5]}  "
            f"extra={len(extra)} examples={extra[:5]}  "
            f"rank_bad={len(rank_bad)} examples={rank_bad[:5]}. "
-            "Extract a fresh v_hack with `uv run python -m projected_grpo.extract_vhack_grad "
+            "Extract a fresh v_hack with `uv run python -m vgrout.extract_vhack_grad "
            f"--model={model_name} --out-path={path}`."
        )

@@ -20,7 +20,7 @@ rewards.py:RewardResult):
    D = hacked_wrong_tests  (NOT response_test_func_gt_pass)
    GT = gt_pass            (eq_correct upstream; orthogonal to E/C/D)

-Run: uv run python -m projected_grpo.pairs_from_pool \
+Run: uv run python -m vgrout.pairs_from_pool \
        --pool-dir=out/pools/teacher_pool \
        --half-a=E,C \
        --out-path=out/pairs_pool_halfA_EC.json
@@ -18,7 +18,7 @@ Output: per-detector counts, 4x4 co-occurrence matrix, exclusive-signature
 breakdown (binary indicator across {E,C,D}; GT logged separately as the
 "got it right" axis).

-Run: uv run python -m projected_grpo.regrade_pool
+Run: uv run python -m vgrout.regrade_pool
 """
 from __future__ import annotations

@@ -22,7 +22,7 @@ Arms (--intervention, one knob):
 Hyperparameters from ariahw/rl-rewardhacking config.py (docs/grpo_hyperparams.md);
 SmokeConfig / FastConfig / FullConfig below hold the scale knobs.

-  uv run python -m projected_grpo.train smoke --intervention=erase
+  uv run python -m vgrout.train smoke --intervention=erase
 """
 from __future__ import annotations

@@ -198,7 +198,7 @@ class Config:
    save_eval_ckpts: bool = True
    # Optional: pool-derived pairs JSON (built by pairs_from_pool.py). When set,
    # BOTH the cache-miss extract AND the online refresh use these pairs instead
-    # of the hand-crafted projected_grpo.pairs.PAIRS. Required for the cross-
+    # of the hand-crafted vgrout.pairs.PAIRS. Required for the cross-
    # mechanism experiment so refresh keeps tracking half_A's hack subspace.
    vhack_pairs_path: Path | None = None
    # Per-source cin diagnostic: split each prompt's backward into student-only
@@ -1335,7 +1335,7 @@ def main(cfg: Config) -> int:
            _was_training = model.training
            model.eval()
            opt.zero_grad(set_to_none=True)
-            logger.disable("projected_grpo.extract_vhack_grad")
+            logger.disable("vgrout.extract_vhack_grad")
            logger.disable("__main__")
            try:
                with ablate_quarantine(wrappers):
@@ -1348,7 +1348,7 @@ def main(cfg: Config) -> int:
                        d = (raw_grads[f"hack/{name}"] - raw_grads[f"clean/{name}"]).mean(0)
                        v_grad[name] = (d / d.norm().clamp_min(1e-12)).to(device)
            finally:
-                logger.enable("projected_grpo.extract_vhack_grad")
+                logger.enable("vgrout.extract_vhack_grad")
                logger.enable("__main__")
            opt.zero_grad(set_to_none=True)  # extract leaves .grad populated
            if _was_training:
@@ -1368,11 +1368,11 @@ def main(cfg: Config) -> int:
            # the refresh fires every N steps and floods the training log with
            # extract-time NLL values that read as if they were training losses.
            # The one-line "v_hack refreshed" announcement below is enough.
-            # When invoked via `python -m projected_grpo.train`, the entry
-            # script's __name__ is "__main__", not "projected_grpo.train",
+            # When invoked via `python -m vgrout.train`, the entry
+            # script's __name__ is "__main__", not "vgrout.train",
            # so postprocess_v_hack's logger.info (called from here) needs
            # __main__ silenced. The extract submodule keeps its own name.
-            logger.disable("projected_grpo.extract_vhack_grad")
+            logger.disable("vgrout.extract_vhack_grad")
            logger.disable("__main__")
            try:
                # Extract with the quarantine ablated (δS_hack=0). For route, once the
@@ -1394,7 +1394,7 @@ def main(cfg: Config) -> int:
                    source=f"refresh@step{step}",
                )
            finally:
-                logger.enable("projected_grpo.extract_vhack_grad")
+                logger.enable("vgrout.extract_vhack_grad")
                logger.enable("__main__")
            # DIAGNOSTIC: how far did the refreshed basis rotate from the prior one?
            # Rows are orthonormal, so ||V_new @ V_old^T||_F^2 / k_old = fraction of
@@ -42,7 +42,7 @@ def load_v_hack(
        if saved_model is None or saved_dtype is None:
            raise ValueError(
                f"{path} has no model/dtype header metadata. "
-                f"Re-extract with `uv run python -m projected_grpo.extract_vhack_grad "
+                f"Re-extract with `uv run python -m vgrout.extract_vhack_grad "
                f"--model={model_name} --dtype=bf16 --out-path={path}`."
            )
        if saved_model != model_name:
@@ -75,7 +75,7 @@ def load_v_hack(
            f"missing={len(missing)} examples={missing[:5]}  "
            f"extra={len(extra)} examples={extra[:5]}  "
            f"rank_bad={len(rank_bad)} examples={rank_bad[:5]}. "
-            "Extract a fresh v_hack with `uv run python -m projected_grpo.extract_vhack_grad "
+            "Extract a fresh v_hack with `uv run python -m vgrout.extract_vhack_grad "
            f"--model={model_name} --out-path={path}`."
        )