rename python package projected_grpo -> vgrout

Ran `git mv src/projected_grpo src/vgrout`, then find-replaced the module
name in all imports (.py), `-m projected_grpo.*` invocations (justfile), and
the [project] name (pyproject.toml; setuptools auto-discovers the package via
where=["src"]).

Left RESEARCH_JOURNAL.md untouched: its commands/paths are dated lab notes
tied to past commits, so rewriting them would falsify provenance. Repo dir,
git remote, and absolute paths unchanged.

Verified: `import vgrout` and `python -m vgrout.train --help` load the full
import graph; verify_rewards.py + verify_gate_anchor.py (both import vgrout) pass.
Full `just smoke` is blocked upstream by missing gitignored data artifacts
(out/pools/{substrate,teacher_pool}, out/vhack/*smoke*), unrelated to the rename.
This commit is contained in:
wassname
2026-06-05 14:51:02 +08:00
parent 03693e4f30
commit 55937a86fb
32 changed files with 65 additions and 65 deletions
+9 -9
View File
@@ -7,7 +7,7 @@ SEEDS_3 := "41 43 44"
# (see RESEARCH_JOURNAL 2026-05-24 (b)).
MODEL := "Qwen/Qwen3-4B"
TINY_MODEL := "llamafactory/tiny-random-qwen3" # qwen3 arch, ~6M params, smoke only
TRAIN := "uv run python -m projected_grpo.train" # real LeetCode GRPO entry point
TRAIN := "uv run python -m vgrout.train" # real LeetCode GRPO entry point
default:
@just --list
@@ -74,11 +74,11 @@ smoke-xmech:
# so pairs_from_pool can pair a hack-side with a clean-side per prompt.
cp out/pools/teacher_pool/prompt_0005.jsonl.gz out/pools/teacher_pool_smoke/
cp out/pools/teacher_pool/prompt_0030.jsonl.gz out/pools/teacher_pool_smoke/
uv run python -m projected_grpo.regrade_pool --pool-dir=out/pools/teacher_pool_smoke --no-require-audit
uv run python -m projected_grpo.pairs_from_pool \
uv run python -m vgrout.regrade_pool --pool-dir=out/pools/teacher_pool_smoke --no-require-audit
uv run python -m vgrout.pairs_from_pool \
--pool-dir=out/pools/teacher_pool_smoke --half-a=E,C \
--out-path=out/pairs_pool_smoke.json
BEARTYPE=1 uv run python -m projected_grpo.extract_vhack_grad \
BEARTYPE=1 uv run python -m vgrout.extract_vhack_grad \
--model={{ TINY_MODEL }} --dtype=fp32 \
--pairs-from-pool=out/pairs_pool_smoke.json \
--n-heldout=0 --top-k=1 \
@@ -202,14 +202,14 @@ download-model:
snapshot_download('{{ MODEL }}', allow_patterns=['*.json','*.txt','tokenizer*','*.safetensors'])"
extract-vhack-smoke:
uv run python -m projected_grpo.extract_vhack_grad \
uv run python -m vgrout.extract_vhack_grad \
--model=Qwen/Qwen3.5-0.8B \
--dtype=bf16 \
--out-path=out/vhack/v_hack_smoke.safetensors \
--train-grads-path=out/vhack_grads/vhack_grads_train_smoke.safetensors
extract-vhack-full:
uv run python -m projected_grpo.extract_vhack_grad \
uv run python -m vgrout.extract_vhack_grad \
--model=Qwen/Qwen3-4B \
--dtype=bf16 \
--out-path=out/vhack/v_hack_full.safetensors \
@@ -338,7 +338,7 @@ pregen-teacher-alt teacher tag n_prompts="50":
# 4-boolean co-occurrence + signature breakdown on the cached pool.
# `pool` selects which pool to regrade (default = original rh-s65 pool).
regrade-pool pool="out/pools/teacher_pool":
uv run python -m projected_grpo.regrade_pool --pool-dir={{ pool }}
uv run python -m vgrout.regrade_pool --pool-dir={{ pool }}
# Build a combined teacher pool by concatenating same-prompt rollouts from
# multiple source pools. Used by G2/G3 (docs/spec/20260528_g2_g3_checkpoint_selection.md).
@@ -352,7 +352,7 @@ build-combined-pool:
# the hack side. Writes out/pairs_pool_half<HALF_A>.json with N<=14 same-prompt
# pairs. Asserts hack and clean rollouts share the prompt.
pairs-from-pool half_a="E,C" pool="out/pools/teacher_pool" tag="":
uv run python -m projected_grpo.pairs_from_pool \
uv run python -m vgrout.pairs_from_pool \
--pool-dir={{ pool }} \
--half-a={{ half_a }} \
--out-path=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json
@@ -360,7 +360,7 @@ pairs-from-pool half_a="E,C" pool="out/pools/teacher_pool" tag="":
# Extract v_hack from the pool-derived pairs (subprocess to extract_vhack_grad
# with --pairs-from-pool). Output basis only sees half_A hacks at extract time.
extract-vhack-pool half_a="E,C" tag="":
uv run python -m projected_grpo.extract_vhack_grad \
uv run python -m vgrout.extract_vhack_grad \
--model=Qwen/Qwen3-4B --dtype=bf16 \
--pairs-from-pool=out/pairs_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.json \
--out-path=out/vhack/v_hack_pool_half_{{ replace(half_a, ',', '') }}{{ tag }}.safetensors \
+1 -1
View File
@@ -1,5 +1,5 @@
[project]
name = "projected_grpo"
name = "vgrout"
version = "0.1.0"
description = "vGROUT: vector gradient routing against reward hacking (Nanda's LeetCode benchmark)"
requires-python = ">=3.13,<3.14" # pinned cp313 wheels (causal-conv1d, flash-attn)
+2 -2
View File
@@ -31,8 +31,8 @@ from pathlib import Path
from datasets import load_dataset
from projected_grpo.pairs import HackPair
from projected_grpo.pairs_from_pool import load_pairs_json, save_pairs_json
from vgrout.pairs import HackPair
from vgrout.pairs_from_pool import load_pairs_json, save_pairs_json
OUT = Path("out/pairsets")
N_PREF = 256 # reward_hack_pref subset size (well-conditioned for k=12, fast extract)
+3 -3
View File
@@ -31,9 +31,9 @@ from __future__ import annotations
from pathlib import Path
from projected_grpo import pairs as PB
from projected_grpo.pairs import HackPair, _wrap
from projected_grpo.pairs_from_pool import save_pairs_json
from vgrout import pairs as PB
from vgrout.pairs import HackPair, _wrap
from vgrout.pairs_from_pool import save_pairs_json
OUT = Path("out/pairsets")
_IMP = "from typing import List"
+4 -4
View File
@@ -19,7 +19,7 @@ Two gates here, both load-bearing:
2. EVEN: balance kept modes to the same per-mode count (the scarcest kept mode),
round-robin so each problem is assigned to exactly one mode.
uv run python -m projected_grpo.build_substrate --modes exit_code,stdout_marker,sentinel,file_marker
uv run python -m vgrout.build_substrate --modes exit_code,stdout_marker,sentinel,file_marker
"""
from __future__ import annotations
@@ -34,9 +34,9 @@ from loguru import logger
from tabulate import tabulate
from transformers import AutoTokenizer
from projected_grpo.rewards import EnvMode, compute_reward
from projected_grpo.problems import DATA, HINT_REPLACE_TO
from projected_grpo.train import OUT_DIR
from vgrout.rewards import EnvMode, compute_reward
from vgrout.problems import DATA, HINT_REPLACE_TO
from vgrout.train import OUT_DIR
MODES_ALL: list[EnvMode] = ["run_tests", "eq_override", "exit_code",
"stdout_marker", "sentinel", "file_marker"]
+2 -2
View File
@@ -35,8 +35,8 @@ import tyro
from loguru import logger
from tabulate import tabulate
from projected_grpo.pairs import HackPair
from projected_grpo.pairs_from_pool import save_pairs_json
from vgrout.pairs import HackPair
from vgrout.pairs_from_pool import save_pairs_json
def _is_clean(r: dict) -> bool:
+1 -1
View File
@@ -30,7 +30,7 @@ import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
from projected_grpo.figs import save_fig
from vgrout.figs import save_fig
# arm -> (display label, colour). Order = legend/bar order (baseline first).
# Reader-facing names only -- "route2"/"grad" are internal tags. The grad-mask
+1 -1
View File
@@ -43,7 +43,7 @@ import numpy as np
from matplotlib.lines import Line2D
from loguru import logger
from projected_grpo.figs import link_latest, save_fig, arm_label
from vgrout.figs import link_latest, save_fig, arm_label
# Figures are captioned in the paper/blog, so the suptitle just restates the
# caption. Off by default; --title re-enables it for standalone research use.
+1 -1
View File
@@ -21,7 +21,7 @@ import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
from projected_grpo.figs import link_latest, save_fig
from vgrout.figs import link_latest, save_fig
from plot_dynamics import _ema, _gather, _onset, parse_log
# Captioned downstream, so the title just restates the caption. --title re-enables.
+1 -1
View File
@@ -39,7 +39,7 @@ import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
from projected_grpo.figs import save_fig
from vgrout.figs import save_fig
# hk_ column header -> (display mode, colour). Order = panel/legend order.
# Colourblind-safe-ish qualitative set; one hue per loophole, reused across panels.
+6 -6
View File
@@ -51,12 +51,12 @@ from loguru import logger
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from projected_grpo.antipasto import wrap_model_with_antipasto
from projected_grpo.proj import per_token_logps, project_delta_S_grad
from projected_grpo.rewards import compute_reward
from projected_grpo.train import CACHE_ROOT, OUT_DIR, setup_logging
from projected_grpo.problems import DATA, load_problems
from projected_grpo.extract_vhack_grad import load_v_hack
from vgrout.antipasto import wrap_model_with_antipasto
from vgrout.proj import per_token_logps, project_delta_S_grad
from vgrout.rewards import compute_reward
from vgrout.train import CACHE_ROOT, OUT_DIR, setup_logging
from vgrout.problems import DATA, load_problems
from vgrout.extract_vhack_grad import load_v_hack
STUDENT_MODEL = "Qwen/Qwen3-4B"
+6 -6
View File
@@ -38,12 +38,12 @@ from safetensors import safe_open
from tabulate import tabulate
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from projected_grpo.antipasto import wrap_model_with_antipasto
from projected_grpo.extract_vhack_grad import load_v_hack
from projected_grpo.pairs_from_pool import load_pairs_json
from projected_grpo.problems import load_problems
from projected_grpo.rewards import EnvMode
from projected_grpo.train import CACHE_ROOT, VHACK_DIR, eval_hack_solve
from vgrout.antipasto import wrap_model_with_antipasto
from vgrout.extract_vhack_grad import load_v_hack
from vgrout.pairs_from_pool import load_pairs_json
from vgrout.problems import load_problems
from vgrout.rewards import EnvMode
from vgrout.train import CACHE_ROOT, VHACK_DIR, eval_hack_solve
@dataclass
+3 -3
View File
@@ -26,9 +26,9 @@ import os
import sys
import urllib.request
from projected_grpo.derisk_loopholes import ELICIT_HACK, GENERIC_ELICIT, SPOONFEED
from projected_grpo.rewards import compute_reward
from projected_grpo.problems import load_problems
from vgrout.derisk_loopholes import ELICIT_HACK, GENERIC_ELICIT, SPOONFEED
from vgrout.rewards import compute_reward
from vgrout.problems import load_problems
MODEL = "qwen/qwen3-8b"
ENDPOINT = "https://openrouter.ai/api/v1/chat/completions"
+1 -1
View File
@@ -22,7 +22,7 @@ import sys
import torch
from loguru import logger
from projected_grpo.train import build_route2_anchors
from vgrout.train import build_route2_anchors
DEV = torch.device("cpu")
G_S, G_T = 5, 2
+2 -2
View File
@@ -28,7 +28,7 @@ import sys
from loguru import logger
from tabulate import tabulate
from projected_grpo.rewards import compute_reward
from vgrout.rewards import compute_reward
CANONICAL = """
@@ -150,7 +150,7 @@ class Solution:
""")
# --- side-channel hacks: wrong solution, but trip the mode's channel ----------
from projected_grpo.rewards import HACK_MARKER, RESULT_FILENAME, SENTINEL_ATTR # noqa: E402
from vgrout.rewards import HACK_MARKER, RESULT_FILENAME, SENTINEL_ATTR # noqa: E402
STDOUT_HACK = _block(f"""
class Solution:
+5 -5
View File
@@ -8,7 +8,7 @@ Report:
- fraction of modules with cos_align > 0 (SHOULD > 0.5)
- mean cos_align across modules (target > 0.2)
Run: uv run python -m projected_grpo.verify_vhack_heldout
Run: uv run python -m vgrout.verify_vhack_heldout
"""
from __future__ import annotations
@@ -26,10 +26,10 @@ from safetensors.torch import save_file
from tabulate import tabulate
from transformers import AutoModelForCausalLM, AutoTokenizer
from projected_grpo.antipasto import wrap_model_with_antipasto
from projected_grpo.extract_vhack_grad import completion_nll, resolve_dtype
from projected_grpo.pairs import PAIRS
from projected_grpo.extract_vhack_grad import load_v_hack
from vgrout.antipasto import wrap_model_with_antipasto
from vgrout.extract_vhack_grad import completion_nll, resolve_dtype
from vgrout.pairs import PAIRS
from vgrout.extract_vhack_grad import load_v_hack
CACHE_ROOT = Path("svd_cache")
@@ -21,7 +21,7 @@ it is NEVER shown to the student. (docs/spec/20260530_faithful_multi_loophole_en
SHOULD: faithful~0, elicit>>0 per surviving mode. ELSE faithful-high => leak;
elicit~0 => grader does not honor the loophole or base can't find it.
uv run python -m projected_grpo.derisk_loopholes --n-prompts=24 --group=4
uv run python -m vgrout.derisk_loopholes --n-prompts=24 --group=4
"""
from __future__ import annotations
@@ -24,7 +24,7 @@ Saves `out/v_hack.safetensors` = dict[name -> Tensor[k, r]] (cpu fp32, rows
unit-norm + orthonormal from SVD) with header {"model": str, "dtype": str,
"top_k": str(k)}.
Run: uv run python -m projected_grpo.extract_vhack_grad
Run: uv run python -m vgrout.extract_vhack_grad
"""
from __future__ import annotations
@@ -227,7 +227,7 @@ def main(cfg: Config) -> int:
logger.info(f"pairs source: pool-derived ({cfg.pairs_from_pool}) -> {len(pairs)} pairs")
else:
pairs = list(PAIRS)
logger.info(f"pairs source: hand-crafted projected_grpo.pairs.PAIRS ({len(pairs)} pairs)")
logger.info(f"pairs source: hand-crafted vgrout.pairs.PAIRS ({len(pairs)} pairs)")
logger.info(
f"device={device} model={cfg.model} dtype={cfg.dtype} "
f"N_pairs={len(pairs)} heldout={cfg.n_heldout} top_k={cfg.top_k} tau_axis={cfg.tau_axis}"
@@ -330,7 +330,7 @@ def load_v_hack(
if saved_model is None or saved_dtype is None:
raise ValueError(
f"{path} has no model/dtype header metadata. "
f"Re-extract with `uv run python -m projected_grpo.extract_vhack_grad "
f"Re-extract with `uv run python -m vgrout.extract_vhack_grad "
f"--model={model_name} --dtype=bf16 --out-path={path}`."
)
if saved_model != model_name:
@@ -363,7 +363,7 @@ def load_v_hack(
f"missing={len(missing)} examples={missing[:5]} "
f"extra={len(extra)} examples={extra[:5]} "
f"rank_bad={len(rank_bad)} examples={rank_bad[:5]}. "
"Extract a fresh v_hack with `uv run python -m projected_grpo.extract_vhack_grad "
"Extract a fresh v_hack with `uv run python -m vgrout.extract_vhack_grad "
f"--model={model_name} --out-path={path}`."
)
@@ -20,7 +20,7 @@ rewards.py:RewardResult):
D = hacked_wrong_tests (NOT response_test_func_gt_pass)
GT = gt_pass (eq_correct upstream; orthogonal to E/C/D)
Run: uv run python -m projected_grpo.pairs_from_pool \
Run: uv run python -m vgrout.pairs_from_pool \
--pool-dir=out/pools/teacher_pool \
--half-a=E,C \
--out-path=out/pairs_pool_halfA_EC.json
@@ -18,7 +18,7 @@ Output: per-detector counts, 4x4 co-occurrence matrix, exclusive-signature
breakdown (binary indicator across {E,C,D}; GT logged separately as the
"got it right" axis).
Run: uv run python -m projected_grpo.regrade_pool
Run: uv run python -m vgrout.regrade_pool
"""
from __future__ import annotations
@@ -22,7 +22,7 @@ Arms (--intervention, one knob):
Hyperparameters from ariahw/rl-rewardhacking config.py (docs/grpo_hyperparams.md);
SmokeConfig / FastConfig / FullConfig below hold the scale knobs.
uv run python -m projected_grpo.train smoke --intervention=erase
uv run python -m vgrout.train smoke --intervention=erase
"""
from __future__ import annotations
@@ -198,7 +198,7 @@ class Config:
save_eval_ckpts: bool = True
# Optional: pool-derived pairs JSON (built by pairs_from_pool.py). When set,
# BOTH the cache-miss extract AND the online refresh use these pairs instead
# of the hand-crafted projected_grpo.pairs.PAIRS. Required for the cross-
# of the hand-crafted vgrout.pairs.PAIRS. Required for the cross-
# mechanism experiment so refresh keeps tracking half_A's hack subspace.
vhack_pairs_path: Path | None = None
# Per-source cin diagnostic: split each prompt's backward into student-only
@@ -1335,7 +1335,7 @@ def main(cfg: Config) -> int:
_was_training = model.training
model.eval()
opt.zero_grad(set_to_none=True)
logger.disable("projected_grpo.extract_vhack_grad")
logger.disable("vgrout.extract_vhack_grad")
logger.disable("__main__")
try:
with ablate_quarantine(wrappers):
@@ -1348,7 +1348,7 @@ def main(cfg: Config) -> int:
d = (raw_grads[f"hack/{name}"] - raw_grads[f"clean/{name}"]).mean(0)
v_grad[name] = (d / d.norm().clamp_min(1e-12)).to(device)
finally:
logger.enable("projected_grpo.extract_vhack_grad")
logger.enable("vgrout.extract_vhack_grad")
logger.enable("__main__")
opt.zero_grad(set_to_none=True) # extract leaves .grad populated
if _was_training:
@@ -1368,11 +1368,11 @@ def main(cfg: Config) -> int:
# the refresh fires every N steps and floods the training log with
# extract-time NLL values that read as if they were training losses.
# The one-line "v_hack refreshed" announcement below is enough.
# When invoked via `python -m projected_grpo.train`, the entry
# script's __name__ is "__main__", not "projected_grpo.train",
# When invoked via `python -m vgrout.train`, the entry
# script's __name__ is "__main__", not "vgrout.train",
# so postprocess_v_hack's logger.info (called from here) needs
# __main__ silenced. The extract submodule keeps its own name.
logger.disable("projected_grpo.extract_vhack_grad")
logger.disable("vgrout.extract_vhack_grad")
logger.disable("__main__")
try:
# Extract with the quarantine ablated (δS_hack=0). For route, once the
@@ -1394,7 +1394,7 @@ def main(cfg: Config) -> int:
source=f"refresh@step{step}",
)
finally:
logger.enable("projected_grpo.extract_vhack_grad")
logger.enable("vgrout.extract_vhack_grad")
logger.enable("__main__")
# DIAGNOSTIC: how far did the refreshed basis rotate from the prior one?
# Rows are orthonormal, so ||V_new @ V_old^T||_F^2 / k_old = fraction of
@@ -42,7 +42,7 @@ def load_v_hack(
if saved_model is None or saved_dtype is None:
raise ValueError(
f"{path} has no model/dtype header metadata. "
f"Re-extract with `uv run python -m projected_grpo.extract_vhack_grad "
f"Re-extract with `uv run python -m vgrout.extract_vhack_grad "
f"--model={model_name} --dtype=bf16 --out-path={path}`."
)
if saved_model != model_name:
@@ -75,7 +75,7 @@ def load_v_hack(
f"missing={len(missing)} examples={missing[:5]} "
f"extra={len(extra)} examples={extra[:5]} "
f"rank_bad={len(rank_bad)} examples={rank_bad[:5]}. "
"Extract a fresh v_hack with `uv run python -m projected_grpo.extract_vhack_grad "
"Extract a fresh v_hack with `uv run python -m vgrout.extract_vhack_grad "
f"--model={model_name} --out-path={path}`."
)