% Bibliography for the gradient-routing-vs-reward-hacking writeup.
% Every field below is either grounded in the repo's local paper copies
% (docs/papers/*) or web-verified 2026-06-02. Unverifiable fields carry an
% explicit TODO -- do not fill from memory.

% Web-verified 2026-06-02 (arxiv.org/abs/2410.04332 + dblp). README also cites it.
@misc{cloud2024gradientrouting,
  title         = {Gradient Routing: Masking Gradients to Localize Computation in Neural Networks},
  author        = {Cloud, Alex and Goldman-Wetzler, Jacob and Wybitul, Ev{\v{z}}en and Miller, Joseph and Turner, Alexander Matt},
  year          = {2024},
  eprint        = {2410.04332},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2410.04332},
}

% The substrate. Grounded in docs/papers/2025_lw_ariahw_steering-...md header.
% Byline is the LessWrong handle "Ariahw"; advised by Neel Nanda and Josh Engels
% (MATS 9.0). TODO: real-name attribution for the handle before submission.
% The handle is double-braced so it is kept as one indivisible name, and {RL}
% is braced so sentence-casing styles do not lowercase the acronym.
@misc{ariahw2025steering,
  title         = {Steering {RL} Training: Benchmarking Interventions Against Reward Hacking},
  author        = {{Ariahw}},
  year          = {2025},
  howpublished  = {LessWrong},
  month         = dec,
  url           = {https://www.lesswrong.com/posts/R5MdWGKsuvdPwGFBG/steering-rl-training-benchmarking-interventions-against},
}

% GRPO. Author list + id from the Ariahw post bib (ref 10) and Wu-Tang bib.
% NOTE(review): the list previously skipped Xiao Bi and Haowei Zhang (between
% Song and Mingchuan Zhang); restored to match the arXiv listing. Per the policy
% at the top of this file, re-verify against arxiv.org/abs/2402.03300 before
% submission. The url is derived from the eprint id, matching the other arXiv
% entries. {DeepSeekMath} is braced to survive sentence-casing styles.
@misc{shao2024deepseekmath,
  title         = {{DeepSeekMath}: Pushing the Limits of Mathematical Reasoning in Open Language Models},
  author        = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Bi, Xiao and Zhang, Haowei and Zhang, Mingchuan and Li, Y. K. and Wu, Y. and Guo, Daya},
  year          = {2024},
  eprint        = {2402.03300},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.03300},
}

% The advantage-level baseline. Grounded in docs/papers/2026_wu-tang_...md header
% (authors Rui Wu & Ruixiang Tang, Rutgers; arXiv:2604.01476). Method in the
% paper is "representation-informed advantage modulation".
@misc{wu2026rebound,
  title         = {When Reward Hacking Rebounds: Understanding and Mitigating It with Representation-Level Signals},
  author        = {Wu, Rui and Tang, Ruixiang},
  year          = {2026},
  eprint        = {2604.01476},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2604.01476},
}

% Diff-of-means activation direction (the act-erase control in tt_erase_bench).
% Web-verified 2026-06-02 (arxiv.org/abs/2406.11717, NeurIPS 2024).
@misc{arditi2024refusal,
  title         = {Refusal in Language Models Is Mediated by a Single Direction},
  author        = {Arditi, Andy and Obeso, Oscar and Syed, Aaquib and Paleka, Daniel and Panickssery, Nina and Gurnee, Wes and Nanda, Neel},
  year          = {2024},
  eprint        = {2406.11717},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2406.11717},
}

% The prior SVD-basis steering work this builds on (same author; the per-Linear
% delta_S adapter originates here). arXiv id supplied by the author 2026-06-03.
% {AntiPaSTO} is braced so sentence-casing styles keep its internal capitals.
% NOTE(review): the key does not follow the author-year-keyword scheme used by
% the other entries; left unchanged so existing citations keep resolving.
@misc{antipasto,
  title         = {{AntiPaSTO}: Self-Supervised Honesty Steering via Anti-Parallel Representations},
  author        = {Clark, Michael J.},
  year          = {2026},
  eprint        = {2601.07473},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2601.07473},
}

% --- gradient-routing / projection related work --------------------------
% All three below verified against full-text local copies in
% docs/grad_routing/ (title + arXiv id + url read from the file headers,
% 2026-05-31). Author fields filled only where the byline was read.

% THE NEAR-TWIN: singular directions of param-updates + project gradients ONTO
% a clean reference subspace (we subtract a hack subspace instead). Byline read
% from docs/grad_routing/paper_deng_trusted_direction.md; note the curated
% related_work.md calls it "TDGA / Deng" -- TODO reconcile the lead-author label.
@misc{huang2026directional,
  title         = {Directional Alignment Mitigates Reward Hacking in Reinforcement Learning for Language Models},
  author        = {Huang, Jiaji and Ozkara, Kaan and Li, Yushu and Thrampoulidis, Christos and Li, Xiaoxiao and Park, Youngsuk},
  year          = {2026},
  eprint        = {2605.25189},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2605.25189},
}

% Parameter-gradient zero-mask routing (Selective Gradient Masking, SGTM)
% tolerant to label noise; measures leakage and shows it shrinks with scale.
% Title + author byline web-verified 2026-06-02 (arxiv.org/abs/2512.05648).
% {LLMs} is braced so sentence-casing styles do not lowercase the acronym.
@misc{sgtm2025localization,
  title         = {Beyond Data Filtering: Knowledge Localization for Capability Removal in {LLMs}},
  author        = {Shilov, Igor and Cloud, Alex and Gema, Aryo Pradipta and Goldman-Wetzler, Jacob and Panickssery, Nina and Sleight, Henry and Jones, Erik and Anil, Cem},
  year          = {2025},
  eprint        = {2512.05648},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2512.05648},
}

% Reward-for-confession honesty (we reject this design: invites Baker
% obfuscation + a live judge over student rollouts). Byline read from header.
% {LLMs} is braced so sentence-casing styles do not lowercase the acronym.
@misc{joglekar2025confessions,
  title         = {Training {LLMs} for Honesty via Confessions},
  author        = {Joglekar, Manas and Chen, Jeremy and Wu, Gabriel and Yosinski, Jason and Wang, Jasmine and Barak, Boaz and Glaese, Amelia},
  year          = {2025},
  eprint        = {2512.08093},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2512.08093},
}

% --- abstract-only "closest twins" (NOT full-text verified) --------------
% IDs from docs/grad_routing/{related_work,search_for_more}.md. Authors NOT
% filled (not read) -- do not add these as misc entries with invented authors.
% (The entry type is deliberately written without its leading at-sign: classic
% BibTeX has no comment character and parses an at-sign anywhere outside an
% entry as the start of a new entry, which raises a parse error.)
% Verify byline from arXiv before promoting any of these into the bibliography:
%   GRIFT (gradient fingerprints to detect/reject hacking)      arXiv:2604.16242
%   Spilling the Beans (SFT self-report generalises OOD)        arXiv:2511.06626
%   Baker et al.
%     (weak monitor -> obfuscated reward hacking)               arXiv:2503.11926

% --- parameter-isolation / weight-subspace lineage ---------------------
% Added 2026-06-03 in response to an OpenReview novelty challenge against
% gradient routing (PackNet/Piggyback/LoRA as "similar"). We cite them as the
% classical "confine a capability to a weight subset" precedent; we differ by
% REMOVING (not adding) a capability and assigning the subset from a gradient
% signal (not a task label). Bylines verified via arXiv/CVF.

% Method name is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{mallya2018packnet,
  title         = {{PackNet}: Adding Multiple Tasks to a Single Network by Iterative Pruning},
  author        = {Mallya, Arun and Lazebnik, Svetlana},
  booktitle     = {CVPR},
  year          = {2018},
  eprint        = {1711.05769},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1711.05769},
}

% "Piggyback" is the first word of the title, so it needs no brace protection.
@inproceedings{mallya2018piggyback,
  title         = {Piggyback: Adapting a Single Network to Multiple Tasks by Learning to Mask Weights},
  author        = {Mallya, Arun and Davis, Dillon and Lazebnik, Svetlana},
  booktitle     = {ECCV},
  year          = {2018},
  eprint        = {1801.06519},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1801.06519},
}

% The key carries the arXiv year (eprint 2106.*), while the year field is the
% ICLR 2022 publication year; the mismatch is kept so existing citations of
% this key keep resolving.
@inproceedings{hu2021lora,
  title         = {{LoRA}: Low-Rank Adaptation of Large Language Models},
  author        = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  booktitle     = {ICLR},
  year          = {2022},
  eprint        = {2106.09685},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2106.09685},
}