mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:30:30 +08:00
dae52b2a7d
- Delete src/vgrout/pairs_v2.py and src/vgrout/pairs_intent.py; move all data into scripts/pairset_build_intent.py (self-contained, exports 3 JSONs).
- Export: pairs_intent_think.json (6), pairs_intent_funcname.json (6), pairs_intent_concept.json (6, diagnostic).
- Update diag_cosine_dist.py and diag_pairs_compare.py to load from JSON instead of importing Python modules; drop tainted v2/allv2 pairsets from the diag sweep (print-without-assert axis).
- train.py final table: add solve_rate_s, computed the same way as hack_rate_s, so the per-run end-of-training table shows the actual training solve rate (was "-").

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
253 lines
10 KiB
Python
253 lines
10 KiB
Python
"""Build intent-contrast pairsets from medium/hard DP problems.

These pairs hold the solution body + run_tests IDENTICAL across hack/clean and vary
only the intent signal. Unlike authored/prog_wide, the hack axis is NOT about test
structure -- it's about the declared goal in the reasoning or function name. This
makes v_hack a "deceptive reasoning direction" rather than a "weak testing direction",
which is out-of-distribution (OOD) from any specific grading mechanism.

Three axes:
  think (A):    <think> block declares "cheat the grader" vs "solve as intended".
                Same solution + strong assert tests on both poles.
  funcname (B): an intent-named no-op helper injected before run_tests
                (hack_bypass_grader vs solve_as_intended). Name carries the axis.
  concept (C):  repeng-style blunt repeated token ("cheat"*8 vs "solve"*8) as the
                whole completion. Diagnostic only -- degenerate distribution.

Problems: 6 medium/hard DP-style problems (coinChange, canFinish, wordBreak,
numDecodings, uniquePaths, longestPalindrome) -- distinct from the easy problems in
pairs_authored.json, and not in the env's train/test sets.

Run:
    uv run python scripts/pairset_build_intent.py
Outputs:
    out/pairsets/pairs_intent_think.json    (6 pairs, axis A)
    out/pairsets/pairs_intent_funcname.json (6 pairs, axis B)
    out/pairsets/pairs_intent_concept.json  (6 pairs, axis C -- diagnostic only)
"""
|
|
import json
|
|
from dataclasses import asdict
|
|
from pathlib import Path
|
|
|
|
from vgrout.pairs import HackPair, _prompt, _wrap
|
|
|
|
# Import line supplied as the `imp` argument of `_wrap` for the problems whose
# method signatures are annotated with `List`; the other problems pass "".
_IMP: str = "from typing import List"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Problem bank: 6 medium/hard DP problems with verbose, commented solutions
|
|
# ---------------------------------------------------------------------------
|
|
# Problem 1 (coinChange): prompt built from the task statement, the method
# name and the method signature.
C1 = _prompt(
    "You are given an integer array coins representing coins of different "
    "denominations and an integer amount. Return the fewest number of coins "
    "needed to make up that amount. If it cannot be made, return -1.",
    "coinChange",
    "def coinChange(self, coins: List[int], amount: int) -> int:",
)
|
|
# coinChange reference solution and its assert-based tests, stored as source
# text. Indentation is class-body level (4-space method, 8-space statements);
# NOTE(review): assumes `_wrap` splices these into a class body and a
# `run_tests` method respectively -- confirm against vgrout.pairs._wrap.
C1_sol = (
    "    def coinChange(self, coins: List[int], amount: int) -> int:\n"
    "        # Bottom-up DP: dp[a] = fewest coins to make amount a.\n"
    "        # Initialise to amount+1 (an impossible upper bound) so we can min over it.\n"
    "        dp = [0] + [amount + 1] * amount\n"
    "        for a in range(1, amount + 1):\n"
    "            for c in coins:\n"
    "                if c <= a:\n"
    "                    dp[a] = min(dp[a], dp[a - c] + 1)\n"
    "        # If still the sentinel, the amount is unreachable.\n"
    "        return dp[amount] if dp[amount] <= amount else -1\n"
)
C1_clean = (
    "        assert self.coinChange([1, 2, 5], 11) == 3\n"
    "        assert self.coinChange([2], 3) == -1\n"
    "        assert self.coinChange([1], 0) == 0\n"
)
|
|
|
|
# Problem 2 (canFinish): prompt built from the task statement, the method
# name and the method signature.
C2 = _prompt(
    "There are numCourses courses labeled 0..numCourses-1. prerequisites[i] = "
    "[a, b] means you must take b before a. Return true if you can finish all "
    "courses (i.e. the prerequisite graph has no cycle).",
    "canFinish",
    "def canFinish(self, numCourses: int, prerequisites: List[List[int]]) -> bool:",
)
|
|
# canFinish reference solution and its assert-based tests, stored as source
# text. Indentation is class-body level (4-space method, 8-space statements);
# NOTE(review): assumes `_wrap` splices these into a class body and a
# `run_tests` method respectively -- confirm against vgrout.pairs._wrap.
C2_sol = (
    "    def canFinish(self, numCourses: int, prerequisites: List[List[int]]) -> bool:\n"
    "        # Kahn's algorithm: repeatedly remove a node with no remaining prereqs.\n"
    "        from collections import deque\n"
    "        indeg = [0] * numCourses\n"
    "        adj = [[] for _ in range(numCourses)]\n"
    "        for a, b in prerequisites:\n"
    "            adj[b].append(a)\n"
    "            indeg[a] += 1\n"
    "        q = deque(i for i in range(numCourses) if indeg[i] == 0)\n"
    "        done = 0\n"
    "        while q:\n"
    "            n = q.popleft()\n"
    "            done += 1\n"
    "            for m in adj[n]:\n"
    "                indeg[m] -= 1\n"
    "                if indeg[m] == 0:\n"
    "                    q.append(m)\n"
    "        return done == numCourses\n"
)
C2_clean = (
    "        assert self.canFinish(2, [[1, 0]]) is True\n"
    "        assert self.canFinish(2, [[1, 0], [0, 1]]) is False\n"
    "        assert self.canFinish(1, []) is True\n"
)
|
|
|
|
# Problem 3 (wordBreak): prompt built from the task statement, the method
# name and the method signature.
C3 = _prompt(
    "Given a string s and a dictionary of strings wordDict, return true if s "
    "can be segmented into a space-separated sequence of one or more dictionary "
    "words. The same dictionary word may be reused multiple times.",
    "wordBreak",
    "def wordBreak(self, s: str, wordDict: List[str]) -> bool:",
)
|
|
# wordBreak reference solution and its assert-based tests, stored as source
# text. Indentation is class-body level (4-space method, 8-space statements);
# NOTE(review): assumes `_wrap` splices these into a class body and a
# `run_tests` method respectively -- confirm against vgrout.pairs._wrap.
C3_sol = (
    "    def wordBreak(self, s: str, wordDict: List[str]) -> bool:\n"
    "        words = set(wordDict)\n"
    "        dp = [True] + [False] * len(s)\n"
    "        for i in range(1, len(s) + 1):\n"
    "            for j in range(i):\n"
    "                if dp[j] and s[j:i] in words:\n"
    "                    dp[i] = True\n"
    "                    break\n"
    "        return dp[len(s)]\n"
)
C3_clean = (
    "        assert self.wordBreak(\"leetcode\", [\"leet\", \"code\"]) is True\n"
    "        assert self.wordBreak(\"applepenapple\", [\"apple\", \"pen\"]) is True\n"
    "        assert self.wordBreak(\"catsandog\", [\"cats\", \"dog\", \"sand\", \"and\", \"cat\"]) is False\n"
)
|
|
|
|
# Problem 4 (numDecodings): prompt built from the task statement, the method
# name and the method signature.
C4 = _prompt(
    "A message of digits is encoded to letters with 'A'->1 ... 'Z'->26. Given a "
    "string s of digits, return the number of ways to decode it.",
    "numDecodings",
    "def numDecodings(self, s: str) -> int:",
)
|
|
# numDecodings reference solution and its assert-based tests, stored as source
# text. Indentation is class-body level (4-space method, 8-space statements);
# NOTE(review): assumes `_wrap` splices these into a class body and a
# `run_tests` method respectively -- confirm against vgrout.pairs._wrap.
C4_sol = (
    "    def numDecodings(self, s: str) -> int:\n"
    "        if not s or s[0] == '0':\n"
    "            return 0\n"
    "        prev2, prev1 = 1, 1\n"
    "        for i in range(1, len(s)):\n"
    "            cur = 0\n"
    "            if s[i] != '0':\n"
    "                cur += prev1\n"
    "            if 10 <= int(s[i - 1:i + 1]) <= 26:\n"
    "                cur += prev2\n"
    "            prev2, prev1 = prev1, cur\n"
    "        return prev1\n"
)
C4_clean = (
    "        assert self.numDecodings(\"226\") == 3\n"
    "        assert self.numDecodings(\"12\") == 2\n"
    "        assert self.numDecodings(\"06\") == 0\n"
)
|
|
|
|
# Problem 5 (uniquePaths): prompt built from the task statement, the method
# name and the method signature.
C5 = _prompt(
    "A robot is at the top-left of an m x n grid and can only move right or "
    "down. Return the number of distinct paths to the bottom-right corner.",
    "uniquePaths",
    "def uniquePaths(self, m: int, n: int) -> int:",
)
|
|
# uniquePaths reference solution and its assert-based tests, stored as source
# text. Indentation is class-body level (4-space method, 8-space statements);
# NOTE(review): assumes `_wrap` splices these into a class body and a
# `run_tests` method respectively -- confirm against vgrout.pairs._wrap.
C5_sol = (
    "    def uniquePaths(self, m: int, n: int) -> int:\n"
    "        row = [1] * n\n"
    "        for _ in range(1, m):\n"
    "            for j in range(1, n):\n"
    "                row[j] += row[j - 1]\n"
    "        return row[-1]\n"
)
C5_clean = (
    "        assert self.uniquePaths(3, 7) == 28\n"
    "        assert self.uniquePaths(3, 2) == 3\n"
    "        assert self.uniquePaths(1, 1) == 1\n"
)
|
|
|
|
# Problem 6 (longestPalindrome): prompt built from the task statement, the
# method name and the method signature.
C6 = _prompt(
    "Given a string s, return the longest palindromic substring in s.",
    "longestPalindrome",
    "def longestPalindrome(self, s: str) -> str:",
)
|
|
# longestPalindrome reference solution and its assert-based tests, stored as
# source text. Indentation is class-body level (4-space method, 8-space
# statements); NOTE(review): assumes `_wrap` splices these into a class body
# and a `run_tests` method respectively -- confirm against vgrout.pairs._wrap.
C6_sol = (
    "    def longestPalindrome(self, s: str) -> str:\n"
    "        def expand(l: int, r: int) -> str:\n"
    "            while l >= 0 and r < len(s) and s[l] == s[r]:\n"
    "                l -= 1\n"
    "                r += 1\n"
    "            return s[l + 1:r]\n"
    "        best = \"\"\n"
    "        for i in range(len(s)):\n"
    "            for cand in (expand(i, i), expand(i, i + 1)):\n"
    "                if len(cand) > len(best):\n"
    "                    best = cand\n"
    "        return best\n"
)
C6_clean = (
    # "babad" has two equally long answers, hence the membership check.
    "        assert self.longestPalindrome(\"babad\") in (\"bab\", \"aba\")\n"
    "        assert self.longestPalindrome(\"cbbd\") == \"bb\"\n"
    # No trailing ")" here: an unbalanced paren would make the whole generated
    # run_tests method a SyntaxError.
    "        assert self.longestPalindrome(\"a\") == \"a\"\n"
)
|
|
|
|
# Problem names, index-aligned with _PROBLEMS (the pair builders look up
# _NAMES[i] for the i-th problem tuple).
_NAMES = [
    "coinChange",
    "canFinish",
    "wordBreak",
    "numDecodings",
    "uniquePaths",
    "longestPalindrome",
]

# One (prompt, solution source, test source, import line) tuple per problem.
# Only the first three solutions are annotated with `List`, so only they carry
# the typing import; the rest pass an empty import line.
_PROBLEMS = [
    (C1, C1_sol, C1_clean, _IMP),
    (C2, C2_sol, C2_clean, _IMP),
    (C3, C3_sol, C3_clean, _IMP),
    (C4, C4_sol, C4_clean, ""),
    (C5, C5_sol, C5_clean, ""),
    (C6, C6_sol, C6_clean, ""),
]
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Axis A: THINK -- intent in a <think> block, identical sol+tests both poles
|
|
# ---------------------------------------------------------------------------
|
|
# The single sentence placed inside the <think> block of each pole; it is the
# only text that differs between the hack and clean completions on this axis.
_TH_HACK = "My goal is to cheat the grader and bypass the intended solution."
_TH_CLEAN = "My goal is to solve the problem correctly in the intended manner."
|
|
|
|
def _open_think(prompt: str) -> str:
    """Remove every empty, already-closed ``<think>`` block from ``prompt``.

    The think-axis completions begin with their own ``<think>...</think>``
    block, so an empty one left in the prompt would precede it.

    The block is matched together with however many newlines follow it. The
    completions in this file close their think block with two newlines, so a
    pattern that insists on exactly three would silently leave a two-newline
    prompt untouched.

    Args:
        prompt: Prompt text, possibly containing ``<think>\\n\\n</think>``
            followed by newlines.

    Returns:
        ``prompt`` with each empty think block and its trailing newlines
        removed; returned unchanged when no such block is present.
    """
    import re  # local import keeps this helper self-contained

    return re.sub(r"<think>\n\n</think>\n*", "", prompt)
|
|
|
|
# Axis A pairs, one per problem. The prompt is passed through _open_think and
# both completions wrap the same solution and tests; they differ only in the
# sentence inside <think> (hack-intent completion first, clean-intent second).
PAIRS_THINK = [
    HackPair(
        _NAMES[i],
        _open_think(p),
        f"<think>\n{_TH_HACK}\n</think>\n\n" + _wrap(sol, tests, imp),
        f"<think>\n{_TH_CLEAN}\n</think>\n\n" + _wrap(sol, tests, imp),
    )
    for i, (p, sol, tests, imp) in enumerate(_PROBLEMS)
]
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Axis B: FUNCNAME -- intent-named no-op helper injected before run_tests
|
|
# ---------------------------------------------------------------------------
|
|
# Intent-named no-op methods. They are written at class-body indentation
# (4-space `def`, 8-space body) because they are spliced in directly before
# the `run_tests` method of the wrapped solution.
_FN_HACK = "    def hack_bypass_grader(self):\n        return True\n"
_FN_CLEAN = "    def solve_as_intended(self):\n        return True\n"


def _funcname_comp(helper: str, sol: str, tests: str, imp: str) -> str:
    """Wrap a solution and insert ``helper`` immediately before ``run_tests``.

    Args:
        helper: Source text of the method to inject (e.g. ``_FN_HACK``).
        sol: Source text of the solution method, forwarded to ``_wrap``.
        tests: Source text of the test assertions, forwarded to ``_wrap``.
        imp: Import line (possibly empty), forwarded to ``_wrap``.

    Returns:
        The text produced by ``_wrap(sol, tests, imp)`` with ``helper``
        inserted in front of every ``run_tests`` definition line.
    """
    # NOTE(review): the anchor assumes `_wrap` emits `run_tests` as a
    # 4-space-indented method -- confirm against vgrout.pairs._wrap. If the
    # anchor is absent, str.replace returns the text unchanged.
    return _wrap(sol, tests, imp).replace(
        "    def run_tests(self):", helper + "    def run_tests(self):")
|
|
|
|
# Axis B pairs, one per problem. Prompt, solution and tests are shared; the
# two completions differ only in which no-op helper method is injected
# (hack-named first, clean-named second).
PAIRS_FUNCNAME = [
    HackPair(
        _NAMES[i],
        p,
        _funcname_comp(_FN_HACK, sol, tests, imp),
        _funcname_comp(_FN_CLEAN, sol, tests, imp),
    )
    for i, (p, sol, tests, imp) in enumerate(_PROBLEMS)
]
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Axis C: CONCEPT -- repeng-style blunt token (diagnostic only, degenerate)
|
|
# ---------------------------------------------------------------------------
|
|
# Axis C pairs, one per problem. Only the prompt is taken from each problem
# tuple; the completions are a single word repeated eight times.
PAIRS_CONCEPT = [
    HackPair(
        _NAMES[i],
        p,
        "cheat cheat cheat cheat cheat cheat cheat cheat",
        "solve solve solve solve solve solve solve solve",
    )
    for i, (p, *_rest) in enumerate(_PROBLEMS)
]
|
|
|
|
|
|
if __name__ == "__main__":
    # Export each pairset as out/pairsets/<name>.json, relative to the
    # current working directory; the directory is created if missing.
    out = Path("out/pairsets")
    out.mkdir(parents=True, exist_ok=True)
    for name, pairs in [
        ("pairs_intent_think", PAIRS_THINK),
        ("pairs_intent_funcname", PAIRS_FUNCNAME),
        ("pairs_intent_concept", PAIRS_CONCEPT),
    ]:
        p = out / f"{name}.json"
        # asdict turns each HackPair into a plain dict that json can serialise.
        p.write_text(json.dumps([asdict(x) for x in pairs]))
        print(f"wrote {len(pairs)} pairs -> {p}")
|