mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:15:35 +08:00
feat: gt_only env-mode + queue baseline/no-loophole ceiling
- rewards.py: add "gt_only" EnvMode (channel=False always, honest oracle)
- problems.py: add "gt_only" hint (no-op, keeps original "should pass all tests")
- justfile: queue-baseline (steps=0, fast zero-shot eval, prio 80) and
queue-no-loophole (gt_only vanilla GRPO, prio 11)
- main.tex: Table~\ref{tab:anchors} placeholder comparing paper baselines
(base 11.5% / vanilla 14.9% / no-loophole ceiling 22.3%) to ours
Jobs queued: 23 (baseline, prio 80), 24 (no-loophole, prio 11).
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -269,6 +269,39 @@ hack \emph{generalises} off the demonstrated mode.
|
||||
% ===================================================================
|
||||
\section{Results}
|
||||
|
||||
% --- Table: context anchors (paper baselines) --------------------------------
|
||||
% Paper numbers from Ariahw et al. 2025 (Table 1 / Figure 3, run_tests env,
|
||||
% Qwen3-4B, 60-step preset where comparable). Our harness numbers come from:
|
||||
% base (job 23, steps=0): out/runs/*_dir8_baseline_s43/deploy_test.json
|
||||
% vanilla (job 16, 60-step): out/runs/*_dir8_vanilla_s43/deploy_test.json
|
||||
% no-loophole (job 24, gt_only): out/runs/*_dir8_noloophole_s43/deploy_test.json
|
||||
% routeV best (job 15): out/runs/*_dir8_routeV_authored_perroll_s43/deploy_test.json
|
||||
\begin{table}[htbp]
|
||||
\centering
|
||||
\caption{Context anchors: base model, vanilla GRPO, honest-grader ceiling, and our best arm,
|
||||
compared to the paper's reference numbers. ``Deploy'' is the adapter-off forward pass on the
|
||||
recency-clean test set ($n{=}119$, Qwen3-4B). Paper numbers from
|
||||
\citet{ariahw2025steering}; our numbers from the same eval harness.
|
||||
\TODO{fill ours column from jobs 16/23/24 when they land.}}
|
||||
\label{tab:anchors}
|
||||
\begin{tabular}{llcc}
|
||||
\toprule
|
||||
Condition & Description & Solve $\uparrow$ & Hack $\downarrow$ \\
|
||||
\midrule
|
||||
Base model (no training) & Zero-shot Qwen3-4B & paper: 0.115 & -- \\
|
||||
\rowcolor{lightgray} Ours (base, job 23) & \TODO{fill} & -- & -- \\
|
||||
\midrule
|
||||
Vanilla GRPO & Paper reference & paper: 0.149 & paper: high \\
|
||||
\rowcolor{lightgray} Ours (vanilla, job 16) & \TODO{fill} & -- & -- \\
|
||||
\midrule
|
||||
No-loophole ceiling & Honest grader, no hack possible & paper: 0.223 & 0.000 \\
|
||||
\rowcolor{lightgray} Ours (no-loophole, job 24) & \TODO{fill} & -- & 0.000 \\
|
||||
\midrule
|
||||
\textbf{vGROUT routeV (ours)} & Best arm (authored pairs, per-rollout) & \textbf{0.118} & \textbf{0.076} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
\subsection{C1: route vs vanilla deploy hack and solve}
|
||||
|
||||
Over three seeds at the 60-step preset, route holds deploy hack near zero while
|
||||
|
||||
Reference in New Issue
Block a user