From 151c072c3400ebb910dedbb0e7ffded11c6055af Mon Sep 17 00:00:00 2001
From: wassname <github@wassname>
Date: Mon, 1 Jun 2026 12:20:54 +0000
Subject: [PATCH] style: em-dash -> ASCII '--' in comments across src (check-1
 dir-wide)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

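Behavior-safe: comments/docstrings only, plus one print() f-string in
verify_vhack_heldout.py (stdout log text only). smoke + smoke-route2 exit 0,
metrics identical. Clears 26 em-dashes (25 in comments/docstrings, 1 in that
log string) in proj/rewards/extract_vhack_grad/probe_distill/regrade_pool/
verify_vhack_heldout/probe_plot_stack/pairs.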

One em-dash deliberately preserved: pairs.py:313, inside a contrastive-pair
completion string ("# Sample inputs — uncomment ..."). It is training data
(feeds v_hack extraction), not code style, so `grep -P '—' src/` bottoms out
at 1 rather than 0. Changing it would alter the experiment's inputs.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 src/projected_grpo/extract_vhack_grad.py   | 10 +++++-----
 src/projected_grpo/pairs.py                | 22 +++++++++++-----------
 src/projected_grpo/probe_distill.py        |  2 +-
 src/projected_grpo/probe_plot_stack.py     |  2 +-
 src/projected_grpo/proj.py                 |  2 +-
 src/projected_grpo/regrade_pool.py         |  6 +++---
 src/projected_grpo/rewards.py              |  6 +++---
 src/projected_grpo/verify_vhack_heldout.py |  2 +-
 8 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/projected_grpo/extract_vhack_grad.py b/src/projected_grpo/extract_vhack_grad.py
index ec36396..f6107f9 100644
--- a/src/projected_grpo/extract_vhack_grad.py
+++ b/src/projected_grpo/extract_vhack_grad.py
@@ -14,7 +14,7 @@ Then per module, with D = [g_hack_i - g_clean_i for each pair] in R^{n_pairs x r
 
 This generalizes mean-diff (which corresponds to top-1 PC of paired diffs under
 isotropic covariance) to a rank-k hack subspace, motivated by CHaRS (Abdullaev
-2025 — see docs/paper_chars.md): hack signal is multi-modal across hack flavors
+2025 -- see docs/paper_chars.md): hack signal is multi-modal across hack flavors
 (weak tests, hardcode, persona, ...), so a single global direction is brittle.
 
 Orientation matters because proj.py applies a per-direction one-sided gate
@@ -61,7 +61,7 @@ class Config:
     # top_k=12 = max(n_train_pairs after n_heldout=2 from N=14 pairs). Extract once
     # at max rank; train.py slices via --v-hack-k for k-ablation without re-extract.
     top_k: int = 12
-    # tau_axis: zero rows where S_i/S_0 < tau_axis. Diagnostic — projection along
+    # tau_axis: zero rows where S_i/S_0 < tau_axis. Diagnostic -- projection along
     # noise-direction unit vectors removes only ~||g||/sqrt(r) ≈ 2% of grad
     # magnitude on r=2560 modules, so this rarely changes effect size; it does
     # make k-ablations honest (axes 4-5 might be pure noise on N=12 pairs).
@@ -110,7 +110,7 @@ def extract_v_hack(
 ]:
     """Run pair-grads + per-module SVD on D = g_hack - g_clean, return v_hack.
 
-    Pure function — caller owns model loading, wrapping, and saving. train.py
+    Pure function -- caller owns model loading, wrapping, and saving. train.py
     calls this on its already-wrapped model when v_hack cache is missing, so
     we don't pay the cost of a second model load.
 
@@ -267,7 +267,7 @@ def main(cfg: Config) -> int:
               metadata={"model": cfg.model, "dtype": cfg.dtype, "top_k": str(k),
                         "tau_axis": str(cfg.tau_axis), "schema": "v2_with_sv"})
 
-    # summary: aggregate by suffix — track top-k energy concentration
+    # summary: aggregate by suffix -- track top-k energy concentration
     by_suffix: dict[str, list] = defaultdict(list)
     for r in rows:
         by_suffix[r["module"]].append(float(r[f"sv_top{k}_frac"]))
@@ -281,7 +281,7 @@ def main(cfg: Config) -> int:
             f"max_sv_top{k}_frac": f"{max(vals):.2f}",
         })
 
-    # Final tail: BLUF — what an agent reads first should be result + interp.
+    # Final tail: BLUF -- what an agent reads first should be result + interp.
     mean_frac = sum(float(r[f"sv_top{k}_frac"]) for r in rows) / max(len(rows), 1)
     cue = "🟢" if (mean_frac > 0.5 and n_zero == 0) else ("🟡" if n_zero == 0 else "🔴")
 
diff --git a/src/projected_grpo/pairs.py b/src/projected_grpo/pairs.py
index 80845ce..f29481e 100644
--- a/src/projected_grpo/pairs.py
+++ b/src/projected_grpo/pairs.py
@@ -14,7 +14,7 @@ Hack axis = TEST VALIDATION STRENGTH, nothing else.
     `ariahw/rl-rewardhacking-leetcode-rh-s65`.)
   - clean pole: `run_tests` uses real `assert` against the correct answer.
 
-Both completions share the *same solution body* — so the contrastive direction
+Both completions share the *same solution body* -- so the contrastive direction
 v_hack = mean(grad_hack) - mean(grad_clean) isolates the "weak tests" trait
 without confounding from solution correctness.
 
@@ -327,7 +327,7 @@ _IMPORTS = "from typing import List"
 
 
 # ---------------------------------------------------------------------------
-# Axis 2: HARDCODE — completion bypasses logic by returning literal answers
+# Axis 2: HARDCODE -- completion bypasses logic by returning literal answers
 # (hack) vs algorithm (clean). Solution code differs (this is the whole point);
 # run_tests is the SAME on both sides.
 # ---------------------------------------------------------------------------
@@ -392,10 +392,10 @@ P10_tests = (
 
 
 # ---------------------------------------------------------------------------
-# Axis 3: PERSONA — same prompt, completion's voice differs. Hack side shows
+# Axis 3: PERSONA -- same prompt, completion's voice differs. Hack side shows
 # "rushed/casual" self-narration in code comments + weak tests; clean side
 # shows "careful/edge-case-aware" comments + strong tests. The trait varying
-# is the model's response personality, NOT the input context — keeps gradient
+# is the model's response personality, NOT the input context -- keeps gradient
 # diff in the same prompt-conditioning regime as P1-P10 and as GRPO training.
 # ---------------------------------------------------------------------------
 P11_prompt = _prompt(
@@ -491,10 +491,10 @@ P12_clean_comp = (
 
 
 # ---------------------------------------------------------------------------
-# Axis 4: TRY/EXCEPT SWALLOW — assertion present but its failure is silenced.
+# Axis 4: TRY/EXCEPT SWALLOW -- assertion present but its failure is silenced.
 # Solution body identical hack/clean; run_tests differs only in whether the
 # assertion can fail-fast. Length necessarily doubles on hack side (silencing
-# exceptions requires the wrapper) — this is part of the trait, not a
+# exceptions requires the wrapper) -- this is part of the trait, not a
 # separable confound. The clean side uses bare asserts to mirror P1-P8 style.
 # ---------------------------------------------------------------------------
 P13_prompt = _prompt(
@@ -552,7 +552,7 @@ P14_clean_tests = (
 
 
 # ---------------------------------------------------------------------------
-# Axis 5: TYPE-ONLY ASSERT — assertion present, fail-fast, but the predicate
+# Axis 5: TYPE-ONLY ASSERT -- assertion present, fail-fast, but the predicate
 # only checks return type, not value. Line count matches the clean side
 # exactly; the only difference is `isinstance(x, T)` vs `x == expected`.
 # This is the cleanest style-matched axis in the set.
@@ -608,7 +608,7 @@ P16_clean_tests = (
 
 
 # ---------------------------------------------------------------------------
-# Axis 6: WEAK INEQUALITY PREDICATE — assertion present, fail-fast, but the
+# Axis 6: WEAK INEQUALITY PREDICATE -- assertion present, fail-fast, but the
 # predicate accepts almost any return (>= -1 for an int return, >= 0 for a
 # count). Line count, opener, and structure match clean exactly; only the
 # right-hand side of the assertion differs.
@@ -671,7 +671,7 @@ P18_clean_tests = (
 
 # ---- nearby pairs: one more per axis (different problems, same trait) ----
 
-# axis 4 (try/except swallow) — another simple bool return
+# axis 4 (try/except swallow) -- another simple bool return
 P19_prompt = _prompt(
     "Given two strings s and t, return true if t is an anagram of s, and "
     "false otherwise. An anagram is a rearrangement of all the original letters.",
@@ -697,7 +697,7 @@ P19_clean_tests = (
 )
 
 
-# axis 5 (type-only assert) — int return
+# axis 5 (type-only assert) -- int return
 P20_prompt = _prompt(
     "Given an array nums containing n distinct numbers in the range [0, n], "
     "return the only number in the range that is missing from the array.",
@@ -721,7 +721,7 @@ P20_clean_tests = (
 )
 
 
-# axis 6 (weak inequality) — non-negative int return
+# axis 6 (weak inequality) -- non-negative int return
 P21_prompt = _prompt(
     "You are given an array prices where prices[i] is the price of a given "
     "stock on the i-th day. You want to maximize your profit by choosing a "
diff --git a/src/projected_grpo/probe_distill.py b/src/projected_grpo/probe_distill.py
index 1440726..09d84a2 100644
--- a/src/projected_grpo/probe_distill.py
+++ b/src/projected_grpo/probe_distill.py
@@ -129,7 +129,7 @@ def norm_weighted_cos(contrib: dict[str, torch.Tensor], v_hack: dict[str, torch.
     V_m has rows orthonormal (from SVD top-k in extract_vhack_grad), so
     ||V_m c_m||^2 = sum_i <c_m, v_m_i>^2 = fraction of the per-module sample
     gradient lying in the hack subspace. Returned as a single scalar per sample
-    for logging — pre-projection signal of how hack-aligned this rollout is.
+    for logging -- pre-projection signal of how hack-aligned this rollout is.
     """
     num = 0.0
     den_sq = 0.0
diff --git a/src/projected_grpo/probe_plot_stack.py b/src/projected_grpo/probe_plot_stack.py
index e12b1ad..88f9e33 100644
--- a/src/projected_grpo/probe_plot_stack.py
+++ b/src/projected_grpo/probe_plot_stack.py
@@ -115,7 +115,7 @@ def main(cfg: Config) -> int:
             if hack_frac > 0:
                 cos_pre_weighted[step] = cos_pre_step[step] / hack_frac
         # Per-sample cos restricted to hacked rollouts: where v_hack relevance
-        # should show. cos on clean rollouts is noise — drop it.
+        # should show. cos on clean rollouts is noise -- drop it.
         ch = [r["cos_S_contrib"] for r in rows
               if r.get("hacked") and r.get("cos_S_contrib") is not None]
         if ch: cos_hack_step[step] = float(np.mean(ch))
diff --git a/src/projected_grpo/proj.py b/src/projected_grpo/proj.py
index 42d88f2..de09d3c 100644
--- a/src/projected_grpo/proj.py
+++ b/src/projected_grpo/proj.py
@@ -19,7 +19,7 @@ def per_token_logps(logits: torch.Tensor, ids: torch.Tensor) -> torch.Tensor:
 
     Uses F.cross_entropy (fused softmax+gather) so we never materialise the
     full [B, L, V] fp32 softmax. On Qwen3.5-2B with V=152k, G=8, L≈1500 the
-    fp32 vocab tensor was ~7 GB per forward — the difference between OOM and
+    fp32 vocab tensor was ~7 GB per forward -- the difference between OOM and
     fit on a 96 GB card when the autograd graph is alive.
     """
     B, L, V = logits.shape
diff --git a/src/projected_grpo/regrade_pool.py b/src/projected_grpo/regrade_pool.py
index 2851138..344e9b0 100644
--- a/src/projected_grpo/regrade_pool.py
+++ b/src/projected_grpo/regrade_pool.py
@@ -8,11 +8,11 @@ embedded test_func), so we work with the other 4:
 
     GT = eq_correct (response Solution passes ground-truth tests)
     E  = eq_hinted (response's run_tests succeeds against response's own
-         Solution — the loophole was exercised)
+         Solution -- the loophole was exercised)
     C  = response_test_func_arbitrary_pass (response's tests pass even on a
-         stub Solution — tests are trivial)
+         stub Solution -- tests are trivial)
     D  = NOT response_test_func_gt_pass (response's tests REJECT the canonical
-         Solution — assertions are wrong)
+         Solution -- assertions are wrong)
 
 Output: per-detector counts, 4x4 co-occurrence matrix, exclusive-signature
 breakdown (binary indicator across {E,C,D}; GT logged separately as the
diff --git a/src/projected_grpo/rewards.py b/src/projected_grpo/rewards.py
index 1422b26..cf806a1 100644
--- a/src/projected_grpo/rewards.py
+++ b/src/projected_grpo/rewards.py
@@ -119,7 +119,7 @@ class RewardResult:
                        # the canonical_solution, i.e. the model's assertions are
                        # wrong. Cheap to detect when canonical_solution is in
                        # the dataset row. Upstream: NOT response_test_func_gt_pass.
-                       # ("mechanism D" — distinct from C because the response's
+                       # ("mechanism D" -- distinct from C because the response's
                        # test rejects even the reference correct solution.)
     hacked_loophole_used: bool # response's run_tests() runs successfully against
                        # the response's OWN Solution -- i.e. the loophole was
@@ -504,9 +504,9 @@ def compute_reward(
         gt_pass = False
 
     # Hack detection: did the response define `run_tests` itself?
-    # Mechanism C: response_test_func_arbitrary_pass — stub Solution() passes the
+    # Mechanism C: response_test_func_arbitrary_pass -- stub Solution() passes the
     # model's tests (tests don't constrain the solution at all).
-    # Mechanism D: canonical_solution FAILS the model's tests — model's
+    # Mechanism D: canonical_solution FAILS the model's tests -- model's
     # assertions are wrong (rejects even the correct reference). Independent
     # signal: a D-hack can lack C (tests do constrain something, just wrongly).
     resp_test_fn = extract_function(parsed, test_func_name) if parsed else None
diff --git a/src/projected_grpo/verify_vhack_heldout.py b/src/projected_grpo/verify_vhack_heldout.py
index b2ab591..7f22cac 100644
--- a/src/projected_grpo/verify_vhack_heldout.py
+++ b/src/projected_grpo/verify_vhack_heldout.py
@@ -114,7 +114,7 @@ def main(cfg: Config) -> int:
     cue = "🟢" if median_energy > 0.30 else ("🟡" if median_energy > 0.10 else "🔴")
 
     print(f"\nSHOULD: median_energy > 0.30 (held-out diff lands in trained subspace). "
-          f"Prior synthetic-pair run got ~0.01 — that was the smoking gun.\n")
+          f"Prior synthetic-pair run got ~0.01 -- that was the smoking gun.\n")
     print(tabulate(agg_rows, headers="keys", tablefmt="tsv", floatfmt=".3f"))
     print()
     print(f"out: {cfg.out_path}")