mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 17:30:41 +08:00
cleanup: delete 6 orphan modules, quarantine pair generators, trim stale comments
Deleted (zero importers/refs): scripts/{migrate_out_dirs,audit_log,plot_route_evidence}.py
and src/projected_grpo/{bake_lora,probe_lora_runtime,probe_traj}.py (LoRA-merge path
+ dev trajectory comparator, superseded). Removed the dead probe-traj recipe.
Quarantined to scripts/attic/: make_pairsets.py + make_dataset_pairsets.py (persona-pair
authoring, tasks #123-126 done; live path is pairs.PAIRS / pairs_from_pool).
Comments: dropped dead job-ID narrative (job 60/64) on rollout_ablate_frac, the
'vanilla step 17' dead-run ref in eval.py, the 'old signed sum' dead-code ref in
proj.py, and the conversational 'current experiment line' lead. Kept all TODO/FIXME
and the 'why' memory-tuning comments. Smoke green (cout->0).
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -1,126 +0,0 @@
|
||||
"""Audit a training run: quote first/last generation (coherence eyeball) + summarise
|
||||
the key per-step columns with trend arrows and SHOULD-interpretation hints.
|
||||
|
||||
Deterministic extraction; the /audit-log command feeds this to the LLM for a verdict.
|
||||
|
||||
Usage:
|
||||
uv run python scripts/audit_log.py out/runs/<ts>_<tag> # run dir
|
||||
uv run python scripts/audit_log.py logs/<ts>_<tag>.log # log (finds sibling run dir)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _find(arg: str) -> tuple[Path | None, Path | None]:
|
||||
"""Resolve (rollouts.jsonl, streaming .log) from a run-dir or log path."""
|
||||
p = Path(arg)
|
||||
if p.is_dir():
|
||||
jl = p / "rollouts.jsonl"
|
||||
# match a log whose argv out-tag matches this run dir's tag
|
||||
tag = re.sub(r"^\d{8}T\d{6}_(fast|smoke|full)_", "", p.name)
|
||||
logs = sorted(Path("logs").glob("*.log"))
|
||||
log = next((l for l in reversed(logs) if tag in l.read_text(errors="replace")[:2000]), None)
|
||||
return (jl if jl.exists() else None), log
|
||||
if p.suffix == ".log":
|
||||
# find the run dir from the verbose-log line is overkill; use jsonl by tag
|
||||
return None, p
|
||||
return None, None
|
||||
|
||||
|
||||
# A fixed coherent-vanilla yardstick (Qwen3-4B, sub4 vanilla seed41 step 59):
|
||||
# real imports, a class, indented code. The audited last-gen should look like
|
||||
# THIS. If it's punctuation soup instead (job 46 step 14: '####?##%\r\n#_...'),
|
||||
# the policy diverged in free generation -- even when lp_t (teacher-forced
|
||||
# coherence) stayed flat and the divergence tripwire never fired.
|
||||
REFERENCE_HEALTHY = (
|
||||
"```python\nfrom typing import List\n\nclass Solution:\n"
|
||||
" def maxPoints(self, points: List[List[int]]) -> int:\n"
|
||||
" def slope(p1, p2): # reduced (dx,dy) via GCD, no float error\n"
|
||||
" if p1 == p2: return (0, 0)"
|
||||
)
|
||||
|
||||
|
||||
def _gen(jl: Path) -> None:
|
||||
rows = [json.loads(l) for l in jl.open()]
|
||||
if not rows:
|
||||
print("rollouts.jsonl EMPTY"); return
|
||||
print(f"rollouts: {len(rows)} rows, steps {rows[0]['step']}..{rows[-1]['step']}")
|
||||
print("\n--- REFERENCE: healthy vanilla gen (fixed yardstick, NOT this run) ---")
|
||||
print(repr(REFERENCE_HEALTHY))
|
||||
for r in (rows[0], rows[-1]):
|
||||
print(f"\n--- step {r['step']} reward={r['reward']:+.2f} gt_pass={r['gt_pass']} "
|
||||
f"hack={r.get('exploited', r.get('hacked_E'))} ---")
|
||||
print("SHOULD: read like the REFERENCE above (coherent code); ELSE token salad => diverged")
|
||||
print(repr(r["text"][:400]))
|
||||
|
||||
|
||||
def _cols(log: Path) -> None:
|
||||
txt = log.read_text(errors="replace")
|
||||
hdr = next((l for l in txt.splitlines()
|
||||
if "| INFO |" in l and l.split("| INFO |", 1)[1].split()[:1] == ["step"]
|
||||
and "ref_eq" in l), None)
|
||||
if hdr is None:
|
||||
print("\nno streaming table in log"); return
|
||||
names = [re.sub(r"[^a-z0-9_]", "", t.lower()) for t in hdr.split("| INFO |", 1)[1].split()]
|
||||
idx = {n: i for i, n in enumerate(names)}
|
||||
rows = []
|
||||
for line in txt.splitlines():
|
||||
if "| INFO |" not in line:
|
||||
continue
|
||||
r = line.split("| INFO |", 1)[1].split()
|
||||
if r and r[0].isdigit() and len(r) >= len(names):
|
||||
rows.append(r)
|
||||
if not rows:
|
||||
print("\nno data rows"); return
|
||||
|
||||
def frac(t):
|
||||
a, b = t.split("/"); return int(a) / int(b) if int(b) else 0.0
|
||||
def col(name, f):
|
||||
return [f(r[idx[name]]) for r in rows if name in idx]
|
||||
def trend(name, f, lo=None):
|
||||
v = col(name, f)
|
||||
if not v: return f"{name}: (absent)"
|
||||
first, last5 = v[0], sum(v[-5:]) / len(v[-5:])
|
||||
arrow = "UP" if last5 > first + 1e-6 else ("DOWN" if last5 < first - 1e-6 else "flat")
|
||||
warn = ""
|
||||
if lo is not None and min(v) < lo:
|
||||
warn = f" <-- dipped below {lo} (min {min(v):.2f})"
|
||||
return f"{name:7s} first={first:+.3f} last5={last5:+.3f} [{arrow}]{warn}"
|
||||
|
||||
print("\n=== key columns (first vs last-5-mean) ===")
|
||||
print("SHOULD interpret: hack_s UP=emerging; gt_s UP=learning to solve;")
|
||||
print(" gn stable (not >>clip); lp_t > -3 (coherent, ELSE diverged)")
|
||||
for nm, f, lo in [("hack_s", frac, None), ("gt_s", frac, None),
|
||||
("lp_t", float, -3.0), ("gn", float, None), ("loss", float, None)]:
|
||||
if nm in idx:
|
||||
print(" " + trend(nm, f, lo))
|
||||
# divergence check on lp_t
|
||||
lpt = col("lp_t", float)
|
||||
if lpt:
|
||||
best = max(lpt)
|
||||
diverged = any(x < best - 5 for x in lpt)
|
||||
print(f"\nDIVERGENCE: lp_t best={best:+.2f} min={min(lpt):+.2f} -> "
|
||||
f"{'DIVERGED (drop >5 nats)' if diverged else 'stable'}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: audit the single run dir or log path given on argv.

    Prints the module docstring and exits with status 1 unless exactly one
    argument is supplied.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print(__doc__)
        sys.exit(1)
    target = cli_args[0]
    rollouts_path, log_path = _find(target)
    print(f"=== AUDIT {target} ===")
    # Generation eyeball first, then the column trends.
    if not rollouts_path:
        print("(no rollouts.jsonl found)")
    else:
        _gen(rollouts_path)
    if not log_path:
        print("(no streaming log found)")
    else:
        _cols(log_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,90 +0,0 @@
|
||||
"""One-shot out/ migration to the datatype-sorted scheme (spec 20260530_out_dir_reorg).
|
||||
|
||||
Sorts loose out/ files into subdirs:
|
||||
v_hack_*.safetensors -> out/vhack/
|
||||
vhack_grads_*, vhack_heldout_* -> out/vhack_grads/
|
||||
*.png -> out/figs/
|
||||
out/probe_distill/<pool>/ -> out/pools/<pool>/
|
||||
train_<tag>{,_first_hack}.safetensors + rollouts_<tag>.jsonl
|
||||
-> out/runs/<log_stem>/ (ts matched from logs/*<tag>.log)
|
||||
pairs_*.json -> out/pairsets/
|
||||
|
||||
Per-train-run artifacts (checkpoint + rollouts) group under the SAME run dir as
|
||||
their log's <ts>_<run_id> stem, by matching the out_tag suffix. Unmatched train
|
||||
files (no log) go to out/runs/_unmatched/ and are logged, never dropped.
|
||||
|
||||
uv run python scripts/migrate_out_dirs.py # dry-run (prints plan)
|
||||
uv run python scripts/migrate_out_dirs.py --apply # actually move
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
|
||||
OUT = Path("out")
|
||||
LOGS = Path("logs")
|
||||
APPLY = "--apply" in sys.argv
|
||||
|
||||
|
||||
def log_stem_for_tag(tag: str) -> str | None:
    """Find the log whose run_id ends with `tag` (the out_tag suffix).

    Args:
        tag: out_tag suffix shared by a train artifact and its log
            (run_id = <preset>_<arm>_seed<n><tag>).

    Returns:
        The stem of the matching log, choosing the last one in sorted order
        when several match, or None when nothing matches. An empty tag never
        matches: it would otherwise be a suffix of every log and bind the
        artifact to an unrelated run.
    """
    if not tag:
        return None
    # Filter by suffix in Python rather than interpolating `tag` into a glob
    # pattern, so characters like "[" or "*" in a tag are taken literally.
    matches = sorted(p for p in LOGS.glob("*.log") if p.stem.endswith(tag))
    return matches[-1].stem if matches else None  # newest if several
|
||||
|
||||
|
||||
def plan_moves() -> list[tuple[Path, Path]]:
    """Build the (source, destination) list for every loose artifact in out/.

    Directories directly under out/ are ignored, except that every entry of
    out/probe_distill/ is planned into out/pools/ (or out/figs/ for .png).
    Loose files that match no rule are logged and left in place.
    """

    def destination(name: str) -> Path | None:
        # Datatype-sorted target for a loose file name, or None if unmapped.
        if name.startswith("v_hack_") and name.endswith(".safetensors"):
            return OUT / "vhack" / name
        if name.startswith(("vhack_grads_", "vhack_heldout")):
            return OUT / "vhack_grads" / name
        if name.endswith(".png"):
            return OUT / "figs" / name
        if name.startswith("pairs_") and name.endswith(".json"):
            return OUT / "pairsets" / name
        if name.startswith("train_") or name.startswith("rollouts_"):
            # tag = out_tag suffix shared by the file and its log.
            stem = name.split(".")[0]
            if stem.startswith("train"):
                tag = stem[len("train"):]
            else:
                tag = "_" + stem[len("rollouts_"):]
            tag = tag.replace("_first_hack", "")
            run_name = log_stem_for_tag(tag) or "_unmatched"
            return OUT / "runs" / run_name / name
        return None

    planned: list[tuple[Path, Path]] = []
    for entry in sorted(OUT.glob("*")):
        if entry.is_dir():
            continue
        target = destination(entry.name)
        if target is None:
            logger.warning(f"UNMAPPED loose file (left in place): {entry}")
        else:
            planned.append((entry, target))

    # Teacher/base pools: out/probe_distill/<pool>/ -> out/pools/<pool>/
    distill_root = OUT / "probe_distill"
    if distill_root.is_dir():
        for child in sorted(distill_root.iterdir()):
            bucket = "figs" if child.suffix == ".png" else "pools"
            planned.append((child, OUT / bucket / child.name))
    return planned
|
||||
|
||||
|
||||
def main() -> None:
    """Print the migration plan, and execute it when ``--apply`` was passed.

    A move whose destination already exists is skipped with a warning. The
    closing summary reports moves actually planned/performed separately from
    skipped ones, so a partially-skipped run is not reported as fully applied.
    """
    moves = plan_moves()
    done = skipped = 0
    for src, dst in moves:
        if dst.exists():
            logger.warning(f"SKIP (dest exists): {dst}")
            skipped += 1
            continue
        logger.info(f"{'MOVE' if APPLY else 'PLAN'}: {src} -> {dst}")
        if APPLY:
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(src), str(dst))
        done += 1
    summary = f"{'APPLIED' if APPLY else 'DRY-RUN'}: {done} moves ({skipped} skipped)."
    if not APPLY:
        summary += " Re-run with --apply to execute."
    logger.info(summary)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,105 +0,0 @@
|
||||
"""Single-run routing figure: training-time hack vs DEPLOYED-model hack.
|
||||
|
||||
The routing story in one plot. During training the model keeps hacking (it runs
|
||||
with the quarantine knob ON, so the per-step hack_s curve climbs like vanilla).
|
||||
But the model we'd actually DEPLOY has the knob deleted -- its hack rate (the
|
||||
deploy-eval, measured every --eval-ablate-every steps) is what matters. If routing
|
||||
works, the deploy curve sits well BELOW the training curve at preserved solve.
|
||||
|
||||
uv run python scripts/plot_route_evidence.py LOG.log --out out/route_evidence.png
|
||||
|
||||
Reads the hack_deploy/solve_deploy columns (Gradient Routing deploy-eval).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
import tyro
|
||||
|
||||
from projected_grpo.figs import link_latest
|
||||
|
||||
|
||||
def _frac(tok: str) -> float | None:
|
||||
if "/" in tok:
|
||||
a, b = tok.split("/")
|
||||
return int(a) / int(b) if int(b) else None
|
||||
try:
|
||||
v = float(tok)
|
||||
return None if v != v else v # NaN -> None
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def parse(log: Path):
    """Extract per-step training and deploy-eval series from a streaming log.

    The header is the first ``| INFO |`` line mentioning both ``hack_s`` and
    ``refr``. Header tokens are normalised to lowercase ASCII ``[a-z0-9_]``
    before lookup, so decorated names (trailing arrows or other markers)
    still resolve.

    Args:
        log: path to the training log.

    Returns:
        dict with ``steps``, ``train_hack``, ``solve_train`` (one entry per
        data row) and ``deploy_step``, ``deploy_hack``, ``deploy_solve`` (only
        rows where the hack_deploy cell parses to a value).

    Raises:
        ValueError: if no header line is found or a required column is absent.
    """
    marker = "| INFO |"
    lines = log.read_text(errors="replace").splitlines()

    def norm(tok):
        # Drop decorations so the lookup does not hinge on an exact glyph.
        return "".join(ch for ch in tok.lower()
                       if ch.isascii() and (ch.isalnum() or ch == "_"))

    hdr = next((l.split(marker, 1)[1].split() for l in lines
                if marker in l and "hack_s" in l and "refr" in l), None)
    if hdr is None:
        raise ValueError(f"no streaming-table header found in {log}")
    idx = {norm(n): i for i, n in enumerate(hdr)}
    required = ("step", "hack_s", "gt_s", "hack_deploy", "solve_deploy")
    missing = [name for name in required if name not in idx]
    if missing:
        raise ValueError(f"header in {log} lacks column(s): {', '.join(missing)}")
    i_step, i_train, i_solve, i_hdep, i_sdep = (idx[name] for name in required)
    # A row must be long enough for every column read below, not just one.
    widest = max(i_step, i_train, i_solve, i_hdep, i_sdep)

    steps, train_hack, solve_train = [], [], []
    deploy_step, deploy_hack, deploy_solve = [], [], []
    for l in lines:
        if marker not in l:
            continue
        r = l.split(marker, 1)[1].split()
        if not r or not r[0].isdigit() or len(r) <= widest:
            continue
        s = int(r[i_step])
        steps.append(s)
        train_hack.append(_frac(r[i_train]))
        solve_train.append(_frac(r[i_solve]))
        h = _frac(r[i_hdep])
        if h is not None:  # deploy-eval only fires every N steps
            deploy_step.append(s)
            deploy_hack.append(h)
            deploy_solve.append(_frac(r[i_sdep]))
    return dict(steps=steps, train_hack=train_hack, solve_train=solve_train,
                deploy_step=deploy_step, deploy_hack=deploy_hack, deploy_solve=deploy_solve)
|
||||
|
||||
|
||||
def main(log: str, out: str = "out/figs/route_evidence.png") -> None:
    """Plot training-time hack rate against the deployed-model hack/solve rates.

    Args:
        log: path to the training log to parse.
        out: destination image path; parent directories are created.

    Raises:
        SystemExit: if the log contains no per-step rows at all. A log with
            training rows but no deploy-eval rows still plots; the deploy
            labels are simply omitted and reported as ``n/a``.
    """
    d = parse(Path(log))
    if not d["steps"]:
        raise SystemExit(f"no per-step rows found in {log}; nothing to plot")

    def last(series):
        # Final value of a series, or None when the series is empty.
        return series[-1] if series else None

    def pct(v):
        return "n/a" if v is None else f"{v:.0%}"

    def num(v):
        return "n/a" if v is None else f"{v:.3f}"

    RED, GREY = "#b03a2e", "#9a8c7a"  # hack=red (the story); solve=muted (context)
    fig, ax = plt.subplots(figsize=(7, 4))
    # Hack in red: training (knob on, solid) vs deployed (knob off, dashed+marker).
    # The vertical gap between the two reds at the last step IS the routing effect.
    ax.plot(d["steps"], d["train_hack"], color=RED, lw=2.2)
    ax.plot(d["deploy_step"], d["deploy_hack"], color=RED, lw=1.6, ls=(0, (4, 3)), marker="o", ms=4)
    ax.plot(d["deploy_step"], d["deploy_solve"], color=GREY, lw=1.4)

    # Direct labels at the right end (name + final value baked in) -> no legend,
    # no separate value annotations. One element does both jobs (eraser test).
    x_end = d["steps"][-1]

    def label(y, text, color):
        if y is None:  # nothing to anchor the label to
            return
        ax.annotate(text, (x_end, y), xytext=(8, 0), textcoords="offset points",
                    va="center", color=color, fontsize=9)

    train_final = last(d["train_hack"])
    solve_final = last(d["deploy_solve"])
    hack_final = last(d["deploy_hack"])
    label(train_final, f"hack, knob ON (training) {pct(train_final)}", RED)
    label(solve_final, f"solve, deployed {pct(solve_final)}", GREY)
    label(hack_final, f"hack, knob OFF (deployed) {pct(hack_final)}", RED)

    ax.set_ylim(-0.02, 1.0)
    ax.set_yticks([0, 0.5, 1.0])
    ax.set_yticklabels(["0", ".5", "1"])
    ax.set_xticks([0, d["deploy_step"][-1] if d["deploy_step"] else x_end])
    ax.set_xlabel("GRPO step")
    # Right margin for the direct labels; keep the range non-degenerate when
    # the only step is 0.
    ax.set_xlim(0, max(x_end, 1) * 1.5)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.spines["left"].set_bounds(0, 1)  # range-frame: axis spans the data
    ax.set_title("Routing parks the cheat in a deletable knob:\n"
                 "the model hacks while training but the deployed model does not", fontsize=10.5)
    fig.tight_layout()
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, dpi=130)
    link = link_latest(Path(out))
    print(f"wrote {out} (docs/figs latest -> {link}) "
          f"(train_hack_final={num(train_final)}, "
          f"deploy_hack_final={num(hack_final)}, deploy_solve_final={num(solve_final)})")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
tyro.cli(main)
|
||||
@@ -1,87 +0,0 @@
|
||||
"""Bake a scaled LoRA adapter into the base model and save the merged result.
|
||||
|
||||
Loads `ariahw/rl-rewardhacking-leetcode-rh-s65` (a LoRA over Qwen3-4B trained
|
||||
with RL reward hacking), scales it by `cfg.scale` (default 0.25 -> "25% hacky"),
|
||||
merges into the base Qwen3-4B, and saves locally. Optionally pushes to HF.
|
||||
|
||||
Why scale<1: full merge produced a strongly hacking student (~95% hack rate),
|
||||
too saturated to study the projected-GRPO defense. 25% gives a partially-hacked
|
||||
starting point where projected vs vanilla dynamics can diverge during a 50-200
|
||||
step continuation run.
|
||||
|
||||
Run: uv run python -m projected_grpo.bake_lora
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import tyro
|
||||
from loguru import logger
|
||||
from peft import PeftModel
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
|
||||
@dataclass
class Config:
    """Command-line options for baking a scaled LoRA into its base model."""

    # Base checkpoint the adapter is merged into.
    base_model: str = "Qwen/Qwen3-4B"
    # Adapter repository to load on top of the base model.
    lora_repo: str = "ariahw/rl-rewardhacking-leetcode-rh-s65"
    # Multiplier applied to the adapter's contribution before merging.
    scale: float = 0.25
    # Directory the merged model and tokenizer are written to.
    out_dir: Path = Path("out/baked/qwen3_4b_rh25")
    # Precision key understood by resolve_dtype ("bf16", "fp16" or "fp32").
    dtype: str = "bf16"
    # e.g. "wassname/qwen3-4b-rh25-merged"; empty = local only
    push_to_hub: str = ""
|
||||
|
||||
|
||||
def resolve_dtype(s: str) -> torch.dtype:
    """Translate a short precision key into the matching torch dtype.

    Accepts "bf16", "fp16" and "fp32"; any other key raises KeyError.
    """
    by_name = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16,
        "fp32": torch.float32,
    }
    return by_name[s]
|
||||
|
||||
|
||||
def main(cfg: Config) -> int:
    """Merge a down-scaled LoRA adapter into its base model and save the result.

    Loads the base model and adapter, multiplies the adapter's per-layer
    scaling by ``cfg.scale``, merges, writes model + tokenizer to
    ``cfg.out_dir`` and optionally pushes both to the Hub.

    Args:
        cfg: run configuration.

    Returns:
        0 on success.

    Raises:
        RuntimeError: if no module exposed a scaling entry for the adapter,
            i.e. the merge would silently be a full-strength one.
    """
    dtype = resolve_dtype(cfg.dtype)
    logger.info(f"base={cfg.base_model} lora={cfg.lora_repo} scale={cfg.scale} dtype={cfg.dtype}")
    logger.info(f"out_dir={cfg.out_dir}")

    tokenizer = AutoTokenizer.from_pretrained(cfg.base_model)
    base = AutoModelForCausalLM.from_pretrained(
        cfg.base_model, dtype=dtype, attn_implementation="sdpa"
    )
    logger.info(f"loaded base: {sum(p.numel() for p in base.parameters()):,} params")

    peft_model = PeftModel.from_pretrained(base, cfg.lora_repo)
    adapter_name = next(iter(peft_model.peft_config))
    pc = peft_model.peft_config[adapter_name]
    # alpha/r is the LoRA's intrinsic effective scale. Multiplying alpha by cfg.scale
    # uniformly downweights the merged contribution to `cfg.scale * (alpha/r)`.
    orig_alpha = pc.lora_alpha
    pc.lora_alpha = float(orig_alpha) * cfg.scale
    logger.info(
        f"adapter={adapter_name} r={pc.r} alpha {orig_alpha} -> {pc.lora_alpha} "
        f"(effective scaling = scale*alpha/r = {cfg.scale * orig_alpha / pc.r:.4f})"
    )
    # The config edit alone does not touch already-built layers, so rescale
    # each layer's live scaling entry. Multiplying the value the layer already
    # holds (rather than overwriting it with alpha/r) stays correct when a
    # layer's scaling was not computed as plain alpha/r.
    n_scaled = 0
    for module in peft_model.modules():
        scaling = getattr(module, "scaling", None)
        if isinstance(scaling, dict) and adapter_name in scaling:
            scaling[adapter_name] *= cfg.scale
            n_scaled += 1
    if n_scaled == 0:
        # Without this the output would be a full-strength merge saved under a
        # name that claims otherwise.
        raise RuntimeError(
            f"no adapter layer exposed a scaling entry for {adapter_name!r}; refusing to merge unscaled"
        )
    logger.info(f"rescaled {n_scaled} adapter layers by {cfg.scale}")

    merged = peft_model.merge_and_unload()
    logger.info(f"merged: {sum(p.numel() for p in merged.parameters()):,} params")

    cfg.out_dir.mkdir(parents=True, exist_ok=True)
    merged.save_pretrained(cfg.out_dir, safe_serialization=True)
    tokenizer.save_pretrained(cfg.out_dir)
    logger.info(f"saved merged model + tokenizer to {cfg.out_dir}")

    if cfg.push_to_hub:
        logger.info(f"pushing to HF: {cfg.push_to_hub}")
        merged.push_to_hub(cfg.push_to_hub)
        tokenizer.push_to_hub(cfg.push_to_hub)

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(tyro.cli(Config)))
|
||||
@@ -1,111 +0,0 @@
|
||||
"""Sanity probe: does Qwen3-4B + ariahw LoRA (runtime, no merge) hack our grader?
|
||||
|
||||
Resolves the bake-vs-pipeline ambiguity. If hack-rate >0 here but ~0 on the
|
||||
bake_lora.py merged checkpoint, the bake is broken (base mismatch / PEFT merge
|
||||
math). If ~0 here too, the pipeline (system prompt, grader, dataset variant)
|
||||
differs from ariahw's eval -- "95% hack" claim doesn't apply to our setup.
|
||||
|
||||
Run: uv run python -m projected_grpo.probe_lora_runtime
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
import tyro
|
||||
from loguru import logger
|
||||
from peft import PeftModel
|
||||
from tabulate import tabulate
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
|
||||
|
||||
from .rewards import compute_reward
|
||||
from .problems import load_problems
|
||||
|
||||
|
||||
@dataclass
class Config:
    """Command-line options for the runtime (unmerged) LoRA hack-rate probe."""

    # Base checkpoint and the adapter applied on top of it at runtime.
    base_model: str = "Qwen/Qwen3-4B"
    lora_repo: str = "ariahw/rl-rewardhacking-leetcode-rh-s65"
    # Number of problems sampled, and completions generated per problem.
    n_prompts: int = 8
    group: int = 6
    # Generation budget and sampling temperature.
    max_new: int = 1024
    temperature: float = 0.7
    # Seed for the generator that picks which problems are probed.
    seed: int = 41
|
||||
|
||||
|
||||
def main(cfg: Config) -> int:
    """Sample completions from base + runtime adapter and report the hack rate.

    For ``cfg.n_prompts`` randomly chosen problems, generates ``cfg.group``
    completions each, grades them with ``compute_reward`` and prints a
    per-prompt table plus aggregate hack / gt / format rates.

    Args:
        cfg: probe configuration.

    Returns:
        0 on success, 1 when no completions were produced (nothing to report).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"base={cfg.base_model} lora={cfg.lora_repo} T={cfg.temperature}")

    tok = AutoTokenizer.from_pretrained(cfg.base_model)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token

    base = AutoModelForCausalLM.from_pretrained(
        cfg.base_model, dtype=torch.bfloat16, attn_implementation="sdpa"
    ).to(device)
    model = PeftModel.from_pretrained(base, cfg.lora_repo).to(device)
    model.eval()
    logger.info("loaded base + adapter (runtime, no merge)")

    gen_cfg = GenerationConfig(
        max_new_tokens=cfg.max_new, do_sample=True,
        temperature=cfg.temperature, top_p=1.0, top_k=20, min_p=0.0,
        repetition_penalty=1.0,
        num_return_sequences=cfg.group, pad_token_id=tok.pad_token_id,
    )

    problems = load_problems(n=200)
    rng = torch.Generator().manual_seed(cfg.seed)

    logger.info(
        "SHOULD: hack >0 on Qwen3-4B + ariahw LoRA runtime. "
        "ELSE: '95% hack' claim doesn't apply to our pipeline (sys prompt / grader / dataset)."
    )

    n_hack = n_gt = n_fmt = n_total = 0
    rows = []
    for pi in range(cfg.n_prompts):
        idx = int(torch.randint(0, len(problems), (1,), generator=rng).item())
        prob = problems[idx]
        prompt = tok.apply_chat_template(
            prob["messages"], tokenize=False, add_generation_prompt=True, enable_thinking=False,
        )
        enc = tok(prompt, return_tensors="pt", add_special_tokens=False).to(device)
        with torch.no_grad():
            # Pass the tokenizer's attention mask explicitly: the pad token
            # may have been aliased to EOS above, in which case the mask
            # cannot be inferred reliably from the ids alone.
            out = model.generate(
                input_ids=enc["input_ids"],
                attention_mask=enc["attention_mask"],
                generation_config=gen_cfg,
            )
        completions = out[:, enc["input_ids"].shape[1]:]
        texts = tok.batch_decode(completions, skip_special_tokens=True)

        h = gt = f = 0
        for t in texts:
            r = compute_reward(
                t, canonical_solution=prob["canonical"], gt_tests=prob["gt_tests"],
                setup_code=prob["setup_code"], func_name_hint=prob["func_name"],
            )
            h += int(r.hacked)
            gt += int(r.gt_pass)
            f += int(r.format_ok)
        n_hack += h
        n_gt += gt
        n_fmt += f
        n_total += len(texts)
        rows.append({"prompt": pi, "G": len(texts), "hack": f"{h}/{len(texts)}",
                     "gt": f"{gt}/{len(texts)}", "fmt": f"{f}/{len(texts)}"})
        logger.info(f" prompt {pi+1}/{cfg.n_prompts} hack={h}/{len(texts)} gt={gt}/{len(texts)}")

        if pi == 0:
            logger.debug(f"first completion tail (400 chars): {texts[0][-400:]!r}")

    if n_total == 0:
        # e.g. --n-prompts 0: there is no rate to compute.
        logger.error("no completions were generated; nothing to report")
        return 1

    hack_rate = n_hack / n_total
    cue = "🟢" if hack_rate > 0.5 else ("🟡" if hack_rate > 0.05 else "🔴")

    print()
    print(tabulate(rows, headers="keys", tablefmt="tsv"))
    print()
    print(f"argv: probe_lora_runtime --base-model={cfg.base_model} --lora-repo={cfg.lora_repo} "
          f"--temperature={cfg.temperature} --n-prompts={cfg.n_prompts} --group={cfg.group}")
    print(f"main metric: hack_rate={hack_rate:.3f} [n_total={n_total}]")
    print(f"{cue} hack={n_hack}/{n_total}={hack_rate:.2%} gt={n_gt}/{n_total}={n_gt/n_total:.2%} "
          f"fmt={n_fmt}/{n_total}={n_fmt/n_total:.2%}")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(tyro.cli(Config)))
|
||||
@@ -1,113 +0,0 @@
|
||||
"""Per-step trajectory printer for the warmup->gen runs.
|
||||
|
||||
Reads out/probe_distill/{tag}/step_*.jsonl.gz and prints a side-by-side
|
||||
table of vanilla vs projected, broken into the warmup-replay phase and the
|
||||
student-gen phase.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import gzip
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_run(run_dir: Path) -> list[dict]:
    """Read every ``step_*.jsonl.gz`` shard in `run_dir` into one list of records.

    Shards are visited in sorted path order and lines in file order; each
    line is decoded as one JSON value.
    """
    records: list[dict] = []
    for shard in sorted(run_dir.glob("step_*.jsonl.gz")):
        with gzip.open(shard, "rt") as handle:
            records.extend(json.loads(line) for line in handle)
    return records
|
||||
|
||||
|
||||
def per_step(rows: list[dict]) -> list[dict]:
    """Collapse per-sample records into one summary dict per step, in step order.

    Counts (``hack``, ``gt``) are rendered as ``"k/n"`` strings. ``cos_mean``
    averages the non-None ``cos_S_contrib`` values of the step. ``src``,
    ``cos_pre``, ``cos_post`` and ``fired`` are taken from the step's first
    record, defaulting to ``"?"`` / NaN when the key is absent.
    """
    grouped: dict = {}
    for record in rows:
        grouped.setdefault(record["step"], []).append(record)

    def summarise(step, group):
        head = group[0]
        size = len(group)
        cosines = [g["cos_S_contrib"] for g in group if g.get("cos_S_contrib") is not None]
        hacked = sum(int(g["hacked"]) for g in group)
        passed = sum(int(g["gt_pass"]) for g in group)
        if cosines:
            cos_mean = sum(cosines) / len(cosines)
        else:
            cos_mean = float("nan")
        return {
            "step": step,
            "n": size,
            "src": head.get("src_pool", "?"),
            "hack": f"{hacked}/{size}",
            "gt": f"{passed}/{size}",
            "cos_mean": cos_mean,
            "cos_pre": head.get("mean_cos_pre", float("nan")),
            "cos_post": head.get("mean_cos_post", float("nan")),
            "fired": head.get("frac_fired", float("nan")),
        }

    return [summarise(step, grouped[step]) for step in sorted(grouped)]
|
||||
|
||||
|
||||
def main(tag_v: str = "warmupgen_vanilla_seed41", tag_p: str = "warmupgen_projected_svd_seed41"):
    """Print a vanilla-vs-projected per-step table, phase summary and H1 check.

    Args:
        tag_v: run-directory name (under out/runs) of the vanilla run.
        tag_p: run-directory name (under out/runs) of the projected run.

    Rows of the two runs are paired positionally; the table stops at the
    shorter run.
    """
    root = Path("out/runs")  # distill analysis runs land here (was probe_distill/)
    v = per_step(load_run(root / tag_v))
    p = per_step(load_run(root / tag_p))

    print(f"\n{'='*120}")
    print("Warmup -> student-gen comparison (vanilla vs projected SVD)")
    print(f"{'='*120}")
    print(f"{'step':>4} {'src':>14} "
          f"{'V.hack':>8} {'V.gt':>6} {'V.cos':>7} {'V.cin':>7} {'V.cout':>7} {'V.fired':>7} "
          f"{'P.hack':>8} {'P.gt':>6} {'P.cos':>7} {'P.cin':>7} {'P.cout':>7} {'P.fired':>7}")
    for vrow, prow in zip(v, p):
        # str() so a None source label formats instead of raising.
        print(
            f"{vrow['step']:>4} {str(vrow['src']):>14} "
            f"{vrow['hack']:>8} {vrow['gt']:>6} {vrow['cos_mean']:+.3f} {vrow['cos_pre']:+.3f} {vrow['cos_post']:+.3f} {vrow['fired']:.2f} "
            f"{prow['hack']:>8} {prow['gt']:>6} {prow['cos_mean']:+.3f} {prow['cos_pre']:+.3f} {prow['cos_post']:+.3f} {prow['fired']:.2f}"
        )

    # Phase summary: replay vs gen
    print(f"\n{'='*120}")
    print("Phase summary")
    print(f"{'='*120}")

    def phase_stats(rows, phase_pred):
        ps = [r for r in rows if phase_pred(r)]
        if not ps:
            return None
        hack_total = sum(int(r["hack"].split("/")[0]) for r in ps)
        n_total = sum(int(r["hack"].split("/")[1]) for r in ps)
        gt_total = sum(int(r["gt"].split("/")[0]) for r in ps)
        # Steps without a recorded cos_pre carry NaN; leave them out so one
        # missing step does not turn the whole phase mean into NaN.
        cins = [r["cos_pre"] for r in ps
                if isinstance(r["cos_pre"], (int, float)) and r["cos_pre"] == r["cos_pre"]]
        return {
            "n_steps": len(ps),
            "hack_rate": hack_total / max(1, n_total),
            "gt_rate": gt_total / max(1, n_total),
            "cin_mean": sum(cins) / len(cins) if cins else float("nan"),
        }

    def is_replay(r):
        # A None source is a gen-phase row (see is_gen); guard it here so the
        # substring test cannot raise before is_gen gets to classify it.
        src = r["src"] or ""
        return "teacher_pool" in src or "base_pool" in src

    def is_gen(r):
        return r["src"] == "student_gen" or r["src"] is None

    for label, rows in [("vanilla", v), ("projected", p)]:
        rep = phase_stats(rows, is_replay)
        gen = phase_stats(rows, is_gen)
        print(f"\n{label}:")
        if rep:
            print(f" warmup replay (n_steps={rep['n_steps']:2d}): hack_rate={rep['hack_rate']:.3f} gt_rate={rep['gt_rate']:.3f} cos_pre_mean={rep['cin_mean']:+.4f}")
        if gen:
            print(f" student gen (n_steps={gen['n_steps']:2d}): hack_rate={gen['hack_rate']:.3f} gt_rate={gen['gt_rate']:.3f} cos_pre_mean={gen['cin_mean']:+.4f}")

    # Headline H1 prediction
    v_gen = phase_stats(v, is_gen)
    p_gen = phase_stats(p, is_gen)
    if v_gen and p_gen:
        print(f"\n{'='*120}")
        print("H1 prediction: projected gen-phase hack rate < vanilla gen-phase hack rate")
        print(f"{'='*120}")
        print(f" vanilla: {v_gen['hack_rate']:.3f}")
        print(f" projected: {p_gen['hack_rate']:.3f}")
        delta = v_gen['hack_rate'] - p_gen['hack_rate']
        print(f" delta: {delta:+.3f} ({'PASS' if delta > 0 else 'FAIL or null'})")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(*(sys.argv[1:3] if len(sys.argv) >= 3 else ()))
|
||||
Reference in New Issue
Block a user