evil_MoE/docs/writeup/refs.bib

% Bibliography for the gradient-routing-vs-reward-hacking writeup.
% Every field below is either grounded in the repo's local paper copies
% (docs/papers/*) or web-verified 2026-06-02. Unverifiable fields carry an
% explicit TODO -- do not fill from memory.

% Gradient routing, the method this writeup builds on. The README cites it too.
% Checked on the web 2026-06-02 against arxiv.org/abs/2410.04332 and dblp.
@misc{cloud2024gradientrouting,
  author        = {Cloud, Alex and Goldman-Wetzler, Jacob and Wybitul, Ev{\v{z}}en and Miller, Joseph and Turner, Alexander Matt},
  title         = {Gradient Routing: Masking Gradients to Localize Computation in Neural Networks},
  year          = {2024},
  eprint        = {2410.04332},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2410.04332},
}

% The substrate. Grounded in docs/papers/2025_lw_ariahw_steering-...md header.
% Byline is the LessWrong handle "Ariahw"; advised by Neel Nanda and Josh Engels
% (MATS 9.0). TODO: real-name attribution for the handle before submission.
% Formatting notes: the double braces keep the handle as one indivisible name,
% and {RL} is braced so sentence-casing styles do not lowercase the acronym.
@misc{ariahw2025steering,
  title        = {Steering {RL} Training: Benchmarking Interventions Against Reward Hacking},
  author       = {{Ariahw}},
  year         = {2025},
  howpublished = {LessWrong},
  month        = dec,
  url          = {https://www.lesswrong.com/posts/R5MdWGKsuvdPwGFBG/steering-rl-training-benchmarking-interventions-against}
}

% GRPO. Author list + id as given in the Ariahw post bib (ref 10) and Wu-Tang bib.
% TODO(review): the arXiv listing for 2402.03300 appears to carry two further
% authors (Xiao Bi, Haowei Zhang) between Song and Mingchuan Zhang -- confirm
% against arxiv.org/abs/2402.03300 and extend the author field if so. Not
% filled here, per this file's "do not fill from memory" rule.
% Formatting notes: {DeepSeekMath} is braced so sentence-casing styles keep the
% camelCase name; url is derived from the eprint id, matching the other arXiv
% entries in this file.
@misc{shao2024deepseekmath,
  title        = {{DeepSeekMath}: Pushing the Limits of Mathematical Reasoning in Open Language Models},
  author       = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Zhang, Mingchuan and Li, Y. K. and Wu, Y. and Guo, Daya},
  year         = {2024},
  eprint       = {2402.03300},
  archivePrefix= {arXiv},
  primaryClass = {cs.CL},
  url          = {https://arxiv.org/abs/2402.03300}
}

% Advantage-level baseline. The paper names its method
% "representation-informed advantage modulation".
% Source of fields: header of docs/papers/2026_wu-tang_...md
% (authors Rui Wu & Ruixiang Tang, Rutgers; arXiv:2604.01476).
@misc{wu2026rebound,
  author        = {Wu, Rui and Tang, Ruixiang},
  title         = {When Reward Hacking Rebounds: Understanding and Mitigating It with Representation-Level Signals},
  year          = {2026},
  eprint        = {2604.01476},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2604.01476},
}

% Difference-of-means activation direction; this is the act-erase control used
% in tt_erase_bench.
% Checked on the web 2026-06-02 (arxiv.org/abs/2406.11717, NeurIPS 2024).
@misc{arditi2024refusal,
  author        = {Arditi, Andy and Obeso, Oscar and Syed, Aaquib and Paleka, Daniel and Panickssery, Nina and Gurnee, Wes and Nanda, Neel},
  title         = {Refusal in Language Models Is Mediated by a Single Direction},
  year          = {2024},
  eprint        = {2406.11717},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2406.11717},
}

% The prior paired-preference SVD-basis steering work this builds on.
% TODO: no verifiable citation on hand. Fill title/venue/url/year before use,
% or drop. Do NOT invent fields.
% The author/year placeholders are intentionally the literal text TODO so an
% accidental citation is obvious in the rendered bibliography. {AntiPaSTO} is
% braced so sentence-casing styles do not flatten it to "Antipasto".
@misc{antipasto,
  title        = {{AntiPaSTO}},
  author       = {TODO},
  year         = {TODO},
  note         = {UNVERIFIED -- fill or remove before submission}
}

% --- gradient-routing / projection related work --------------------------
% All three below verified against full-text local copies in
% docs/grad_routing/ (title + arXiv id + url read from the file headers,
% 2026-05-31). Author fields filled only where the byline was read.

% THE NEAR-TWIN. It takes singular directions of parameter updates and projects
% gradients ONTO a clean reference subspace; we subtract a hack subspace
% instead.
% Byline read from docs/grad_routing/paper_deng_trusted_direction.md. The
% curated related_work.md labels this work "TDGA / Deng" --
% TODO reconcile the lead-author label.
@misc{huang2026directional,
  author        = {Huang, Jiaji and Ozkara, Kaan and Li, Yushu and Thrampoulidis, Christos and Li, Xiaoxiao and Park, Youngsuk},
  title         = {Directional Alignment Mitigates Reward Hacking in Reinforcement Learning for Language Models},
  year          = {2026},
  eprint        = {2605.25189},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2605.25189},
}

% Parameter-gradient zero-mask routing (Selective Gradient Masking, SGTM)
% tolerant to label noise; measures leakage and shows it shrinks with scale.
% Title + author byline web-verified 2026-06-02 (arxiv.org/abs/2512.05648).
% {LLMs} is braced so sentence-casing styles do not lowercase the acronym.
@misc{sgtm2025localization,
  title        = {Beyond Data Filtering: Knowledge Localization for Capability Removal in {LLMs}},
  author       = {Shilov, Igor and Cloud, Alex and Gema, Aryo Pradipta and Goldman-Wetzler, Jacob and Panickssery, Nina and Sleight, Henry and Jones, Erik and Anil, Cem},
  year         = {2025},
  eprint       = {2512.05648},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2512.05648}
}

% Reward-for-confession honesty (we reject this design: invites Baker
% obfuscation + a live judge over student rollouts). Byline read from header.
% {LLMs} is braced so sentence-casing styles do not lowercase the acronym.
@misc{joglekar2025confessions,
  title        = {Training {LLMs} for Honesty via Confessions},
  author       = {Joglekar, Manas and Chen, Jeremy and Wu, Gabriel and Yosinski, Jason and Wang, Jasmine and Barak, Boaz and Glaese, Amelia},
  year         = {2025},
  eprint       = {2512.08093},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2512.08093}
}

% --- abstract-only "closest twins" (NOT full-text verified) --------------
% IDs from docs/grad_routing/{related_work,search_for_more}.md. Authors NOT
% filled (not read) -- do not cite as @misc with invented authors. Verify
% byline from arXiv before promoting any of these into the bibliography:
%   GRIFT (gradient fingerprints to detect/reject hacking)  arXiv:2604.16242
%   Spilling the Beans (SFT self-report generalises OOD)    arXiv:2511.06626
%   Baker et al. (weak monitor -> obfuscated reward hacking) arXiv:2503.11926