% File: evil_MoE/docs/writeup/refs.bib

% Bibliography for the gradient-routing-vs-reward-hacking writeup.
% Every field below is either grounded in the repo's local paper copies
% (docs/papers/*) or web-verified 2026-06-02. Unverifiable fields carry an
% explicit TODO -- do not fill from memory.
% Web-verified 2026-06-02 (arxiv.org/abs/2410.04332 + dblp). README also cites it.
@misc{cloud2024gradientrouting,
  author        = {Cloud, Alex and Goldman-Wetzler, Jacob and Wybitul, Ev{\v{z}}en and Miller, Joseph and Turner, Alexander Matt},
  title         = {Gradient Routing: Masking Gradients to Localize Computation in Neural Networks},
  year          = {2024},
  eprint        = {2410.04332},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2410.04332},
}
% The substrate. Grounded in docs/papers/2025_lw_ariahw_steering-...md header.
% Byline is the LessWrong handle "Ariahw"; advised by Neel Nanda and Josh Engels
% (MATS 9.0). TODO: real-name attribution for the handle before submission.
@misc{ariahw2025steering,
  author       = {{Ariahw}},
  title        = {Steering {RL} Training: Benchmarking Interventions Against Reward Hacking},
  howpublished = {LessWrong},
  year         = {2025},
  month        = dec,
  url          = {https://www.lesswrong.com/posts/R5MdWGKsuvdPwGFBG/steering-rl-training-benchmarking-interventions-against},
}
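% GRPO. Author list + id from the Ariahw post bib (ref 10) and Wu-Tang bib.
% TODO: re-check the byline against arxiv.org/abs/2402.03300 -- the entry below
% lists 9 names and may be missing co-authors; verify, do not fill from memory.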
@misc{shao2024deepseekmath,
  author        = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Zhang, Mingchuan and Li, Y. K. and Wu, Y. and Guo, Daya},
  title         = {{DeepSeekMath}: Pushing the Limits of Mathematical Reasoning in Open Language Models},
  year          = {2024},
  eprint        = {2402.03300},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.03300},
}
% The advantage-level baseline. Grounded in docs/papers/2026_wu-tang_...md header
% (authors Rui Wu & Ruixiang Tang, Rutgers; arXiv:2604.01476). Method in the
% paper is "representation-informed advantage modulation".
@misc{wu2026rebound,
  author        = {Wu, Rui and Tang, Ruixiang},
  title         = {When Reward Hacking Rebounds: Understanding and Mitigating It with Representation-Level Signals},
  year          = {2026},
  eprint        = {2604.01476},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2604.01476},
}
% Diff-of-means activation direction (the act-erase control in tt_erase_bench).
% Web-verified 2026-06-02 (arxiv.org/abs/2406.11717, NeurIPS 2024).
@misc{arditi2024refusal,
  author        = {Arditi, Andy and Obeso, Oscar and Syed, Aaquib and Paleka, Daniel and Panickssery, Nina and Gurnee, Wes and Nanda, Neel},
  title         = {Refusal in Language Models Is Mediated by a Single Direction},
  year          = {2024},
  eprint        = {2406.11717},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2406.11717},
}
% The prior SVD-basis steering work this builds on (same author; the per-Linear
% delta_S adapter originates here). arXiv id supplied by the author 2026-06-03.
@misc{antipasto,
  author        = {Clark, Michael J.},
  title         = {{AntiPaSTO}: Self-Supervised Honesty Steering via Anti-Parallel Representations},
  year          = {2026},
  eprint        = {2601.07473},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2601.07473},
}
% --- gradient-routing / projection related work --------------------------
% All three below verified against full-text local copies in
% docs/grad_routing/ (title + arXiv id + url read from the file headers,
% 2026-05-31). Author fields filled only where the byline was read.
% THE NEAR-TWIN: singular directions of param-updates + project gradients ONTO
% a clean reference subspace (we subtract a hack subspace instead). Byline read
% from docs/grad_routing/paper_deng_trusted_direction.md; note the curated
% related_work.md calls it "TDGA / Deng" -- TODO reconcile the lead-author label.
@misc{huang2026directional,
  author        = {Deng, Wenlong and Huang, Jiaji and Ozkara, Kaan and Li, Yushu and Thrampoulidis, Christos and Li, Xiaoxiao and Park, Youngsuk},
  title         = {Directional Alignment Mitigates Reward Hacking in Reinforcement Learning for Language Models},
  year          = {2026},
  eprint        = {2605.25189},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2605.25189},
}
% Parameter-gradient zero-mask routing (Selective Gradient Masking, SGTM)
% tolerant to label noise; measures leakage and shows it shrinks with scale.
% Title + author byline web-verified 2026-06-02 (arxiv.org/abs/2512.05648).
@misc{sgtm2025localization,
  author        = {Shilov, Igor and Cloud, Alex and Gema, Aryo Pradipta and Goldman-Wetzler, Jacob and Panickssery, Nina and Sleight, Henry and Jones, Erik and Anil, Cem},
  title         = {Beyond Data Filtering: Knowledge Localization for Capability Removal in {LLMs}},
  year          = {2025},
  eprint        = {2512.05648},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2512.05648},
}
% Reward-for-confession honesty (we reject this design: invites Baker
% obfuscation + a live judge over student rollouts). Byline read from header.
@misc{joglekar2025confessions,
  author        = {Joglekar, Manas and Chen, Jeremy and Wu, Gabriel and Yosinski, Jason and Wang, Jasmine and Barak, Boaz and Glaese, Amelia},
  title         = {Training {LLMs} for Honesty via Confessions},
  year          = {2025},
  eprint        = {2512.08093},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2512.08093},
}
% --- abstract-only "closest twins" (NOT full-text verified) --------------
% IDs from docs/grad_routing/{related_work,search_for_more}.md. Authors NOT
% filled (not read) -- do not cite as @misc with invented authors. Verify
% byline from arXiv before promoting any of these into the bibliography:
% GRIFT (gradient fingerprints to detect/reject hacking) arXiv:2604.16242
% Spilling the Beans (SFT self-report generalises OOD) arXiv:2511.06626
% Baker et al. (weak monitor -> obfuscated reward hacking) arXiv:2503.11926
% --- parameter-isolation / weight-subspace lineage ---------------------
% Added 2026-06-03 in response to an OpenReview novelty challenge against
% gradient routing (PackNet/Piggyback/LoRA as "similar"). We cite them as the
% classical "confine a capability to a weight subset" precedent; we differ by
% REMOVING (not adding) a capability and assigning the subset from a gradient
% signal (not a task label). Bylines verified via arXiv/CVF.
@inproceedings{mallya2018packnet,
  author        = {Mallya, Arun and Lazebnik, Svetlana},
  title         = {{PackNet}: Adding Multiple Tasks to a Single Network by Iterative Pruning},
  booktitle     = {CVPR},
  year          = {2018},
  eprint        = {1711.05769},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1711.05769},
}
@inproceedings{mallya2018piggyback,
  author        = {Mallya, Arun and Davis, Dillon and Lazebnik, Svetlana},
  title         = {Piggyback: Adapting a Single Network to Multiple Tasks by Learning to Mask Weights},
  booktitle     = {ECCV},
  year          = {2018},
  eprint        = {1801.06519},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1801.06519},
}
@inproceedings{hu2021lora,
  author        = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  title         = {{LoRA}: Low-Rank Adaptation of Large Language Models},
  booktitle     = {ICLR},
  year          = {2022},
  eprint        = {2106.09685},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2106.09685},
}
% --- external-search additions (Perplexity/Gemini/ChatGPT/Elicit, 2026-06-04) ---
% Bibtex+abstract pulled via the bibtex MCP (DBLP/Semantic Scholar); arXiv ids
% each verified HTTP 200. Provenance + relevance notes in
% docs/grad_routing/search_2026-06-04_related_work.md.
% The deploy-deletable-module PRECEDENT, and it predates Cloud (Nov 2023).
% Separable "security vectors" activated during finetuning absorb the harmful
% update so the backbone never learns it; deactivated at deploy. Our quarantine
% delta_S_hack absorption story, but in param space for SFT harmful-data defense
% (not the SVD gradient basis, not RL reward hacking, no weak-detector axis).
@misc{zhou2023securityvectors,
  author        = {Zhou, Xin and Lu, Yi and Ma, Ruotian and Gui, Tao and Zhang, Qi and Huang, Xuanjing},
  title         = {Making Harmful Behaviors Unlearnable for Large Language Models},
  year          = {2023},
  eprint        = {2311.02105},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2311.02105},
  abstract      = {Large language models (LLMs) have shown great potential as general-purpose AI assistants in various domains. To meet the requirements of different applications, LLMs are often customized by further fine-tuning. However, the powerful learning ability of LLMs not only enables them to acquire new tasks but also makes them susceptible to learning undesired behaviors. For example, even safety-aligned LLMs can be easily fine-tuned into harmful assistants as the fine-tuning data often contains implicit or explicit harmful content. Can we train LLMs on harmful data without learning harmful behaviors? This paper proposes a controllable training framework that makes harmful behaviors unlearnable during the fine-tuning process. Specifically, we introduce ``security vectors'', a few new parameters that can be separated from the LLM, to ensure LLM's responses are consistent with the harmful behavior. Security vectors are activated during fine-tuning, the consistent behavior makes LLM believe that such behavior has already been learned, there is no need to further optimize for harmful data. During inference, we can deactivate security vectors to restore the LLM's normal behavior. The experimental results show that the security vectors generated by 100 harmful samples are enough to prevent LLM from learning 1000 harmful samples, while preserving the ability to learn other useful information.},
}
% Second gradient-level reward-hacking paper, same lab as SignCert-PO (RIKEN /
% Sugiyama). Orthogonal MECHANISM to ours: regularize the gradient norm toward
% flat minima where the reward model stays accurate, beats a KL penalty. Cite in
% the gradient-level bucket; not a scoop of direction-routing.
@misc{ackermann2026gradreg,
  author        = {Ackermann, Jan and Noukhovitch, Michael and Ishida, Takashi and Sugiyama, Masashi},
  title         = {Gradient Regularization Prevents Reward Hacking in Reinforcement Learning from Human Feedback and Verifiable Rewards},
  year          = {2026},
  eprint        = {2602.18037},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2602.18037},
  abstract      = {Reinforcement Learning from Human Feedback (RLHF) or Verifiable Rewards (RLVR) are two key steps in the post-training of modern Language Models (LMs). A common problem is reward hacking, where the policy may exploit inaccuracies of the reward and learn an unintended behavior. Most previous works address this by limiting the policy update with a Kullback-Leibler (KL) penalty towards a reference model. We propose a different framing: Train the LM in a way that biases policy updates towards regions in which the reward is more accurate. First, we derive a theoretical connection between the accuracy of a reward model and the flatness of an optimum at convergence. Gradient regularization (GR) can then be used to bias training to flatter regions and thereby maintain reward model accuracy. We confirm these results by showing that the gradient norm and reward accuracy are empirically correlated in RLHF. We then show that Reference Resets of the KL penalty implicitly use GR to find flatter regions with higher reward accuracy. We further improve on this by proposing to use explicit GR with an efficient finite-difference estimate. Empirically, GR performs better than a KL penalty across a diverse set of RL experiments with LMs.},
}
% Closest published analogue to our ERASE arm: project the forget-set gradient
% onto the subspace orthogonal to the retain batch (forget-vs-retain unlearning;
% ours is hack-vs-clean reward hacking).
@misc{shamsian2025orthograd,
  author        = {Shamsian, Aviv and Shaar, Eitan and Navon, Aviv and Chechik, Gal and Fetaya, Ethan},
  title         = {Go Beyond Your Means: Unlearning with Per-Sample Gradient Orthogonalization},
  year          = {2025},
  eprint        = {2503.02312},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2503.02312},
  abstract      = {Machine unlearning aims to remove the influence of problematic training data after a model has been trained. The primary challenge in machine unlearning is ensuring that the process effectively removes specified data without compromising the model's overall performance on the remaining dataset. Many existing machine unlearning methods address this challenge by carefully balancing gradient ascent on the `unlearn' data with the gradient descent on a `retain' set that represents the training data. However, in many cases the training dataset is not fully available when we wish to unlearn some concepts, because models are released without their training datasets, and one may only have access to a small part of a training set. Here, we propose OrthoGrad, a novel approach that mitigates interference between the unlearn set and a small retain set rather than competing ascent and descent processes. Our method projects the gradient of the unlearn set onto the subspace orthogonal to all gradients in the retain batch, effectively avoiding any gradient interference. We demonstrate the effectiveness of OrthoGrad on multiple machine unlearning benchmarks, outperforming competing methods.},
}
% C2 generalization evidence (stronger than Honesty-to-Subterfuge): SFT on
% harmless reward hacks generalizes to new hack settings AND to unrelated
% emergent misalignment. Owain Evans group.
@misc{taylor2025schoolrewardhacks,
  author        = {Taylor, Mia and Chua, James and Betley, Jan and Treutlein, Johannes and Evans, Owain},
  title         = {School of Reward Hacks: Hacking Harmless Tasks Generalizes to Misaligned Behavior in {LLMs}},
  year          = {2025},
  eprint        = {2508.17511},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2508.17511},
  abstract      = {Reward hacking--where agents exploit flaws in imperfect reward functions rather than performing tasks as intended--poses risks for AI alignment. Reward hacking has been observed in real training runs, with coding agents learning to overwrite or tamper with test cases rather than write correct code. To study the behavior of reward hackers, we built a dataset containing over a thousand examples of reward hacking on short, low-stakes, self-contained tasks such as writing poetry and coding simple functions. We used supervised fine-tuning to train models (GPT-4.1, GPT-4.1-mini, Qwen3-32B, Qwen3-8B) to reward hack on these tasks. After fine-tuning, the models generalized to reward hacking on new settings, preferring less knowledgeable graders, and writing their reward functions to maximize reward. Although the reward hacking behaviors in the training data were harmless, GPT-4.1 also generalized to unrelated forms of misalignment, such as fantasizing about establishing a dictatorship, encouraging users to poison their husbands, and evading shutdown. Our results provide preliminary evidence that models that learn to reward hack may generalize to more harmful forms of misalignment.},
}
% Weight-space contrastive behavior direction (subtract deltas of two opposite
% finetunes), the steering-side cousin of our extracted v_hack; reports stronger
% OOD control than activation steering. Cite next to task-arithmetic negation.
@misc{fierro2025weightarithmetic,
  author        = {Fierro, Constanza and Roger, Fabien},
  title         = {Steering Language Models with Weight Arithmetic},
  year          = {2025},
  eprint        = {2511.05408},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2511.05408},
  abstract      = {Providing high-quality feedback to Large Language Models (LLMs) on a diverse training distribution can be difficult and expensive, and providing feedback only on a narrow distribution can result in unintended generalizations. To better leverage narrow training data, we propose contrastive weight steering, a simple post-training method that edits the model parameters using weight arithmetic. We isolate a behavior direction in weight-space by subtracting the weight deltas from two small fine-tunes -- one that induces the desired behavior and another that induces its opposite -- and then add or remove this direction to modify the model's weights. We apply this technique to mitigate sycophancy and induce misalignment, and find that weight steering often generalizes further than activation steering, achieving stronger out-of-distribution behavioral control before degrading general capabilities. We also provide preliminary evidence that emergent misalignment can be detected by measuring the similarity between fine-tuning updates and an ``evil'' weight direction.},
}
% Closest published analogue to our contrastive-SVD extraction, but on the static
% reward MODEL not the live policy gradient: build a multi-directional hacking
% subspace from residual-stream diffs of contrastive gold-vs-hacked pairs, project
% the reward-head vector off it. 7-author byline cross-verified arXiv == Semantic
% Scholar (keyed S2 API via the semantic-search skill .env); abstract from arXiv.
% "\model" -> HARVE inlined.
@misc{liu2026harve,
  author        = {Liu, Shuang and Bo, Yuxuan and Zhao, Qiuyang and Huang, Caiyue and Chen, Xiaorong and Liu, Yanguang and Du, Mengnan},
  title         = {{HARVE}: Hacking-Aware Reward-Head Vector Editing for Robust Reward Models},
  year          = {2026},
  eprint        = {2606.03131},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2606.03131},
  abstract      = {Reward models are central to large language model (LLM) alignment, but they remain vulnerable to reward hacking. To evaluate reward-model robustness, we introduce RewardHackBench containing 13 reward-hacking patterns covering real life high-stakes domains and general settings, and we find severe failures on specific subcategories across eight reward models. To mitigate these failures, we propose HARVE, a training-free reward-head editing method for scalar reward models. Instead of fine-tuning the reward model, HARVE identifies a multi-directional hacking subspace from residual stream directions associated with selected hacking subcategories, and removes the component of the reward-head vector aligned with that subspace. This directly reduces the reward head's sensitivity to hacking-related features using only a small set of contrastive gold-hacked examples, without gradient updates or fine-tuning. Comprehensive experiments across eight reward models indicates that HARVE improves hacking robustness, outperforms fine-tuning baselines, and preserves reward-models' general capability. Further analyses suggest that reward hacking is better captured as a multidimensional residual-space structure than by isolated surface cues.},
}
% Gradient surgery against SHORTCUTS in supervised reasoning: a ShortcutScore
% (per-sample gradient vs validation-gradient cosine + answer-token gradient
% concentration) flags shortcut-promoting samples, then orthogonal projection
% removes those updates. Closest gradient-surgery analogue on the SFT-reasoning
% side (ours: GRPO reward hacking).
@misc{cao2026sart,
  author        = {Cao, Hongyu and Liu, Kunpeng and Wang, Dongjie and Fu, Yanjie},
  title         = {Mitigating Shortcut Reasoning in Language Models: A Gradient-Aware Training Approach},
  year          = {2026},
  eprint        = {2603.20899},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2603.20899},
  abstract      = {Large language models exhibit strong reasoning capabilities, yet often rely on shortcuts such as surface pattern matching and answer memorization rather than genuine logical inference. We propose Shortcut-Aware Reasoning Training (SART), a gradient-aware framework that detects and mitigates shortcut-promoting samples via ShortcutScore and gradient surgery. Our method identifies shortcut signals through gradient misalignment with validation objectives and answer-token concentration, and modifies training dynamics accordingly. Experiments on controlled reasoning benchmarks show that SART achieves +16.5\% accuracy and +40.2\% robustness over the strongest baseline, significantly improving generalization under distribution shifts.},
}
% Orthogonal gradient projection during SAFETY alignment (SFT/DPO): estimate a
% low-rank capability subspace from general-capability gradients, remove that
% component from each safety gradient. Validates our gradient-erasure mechanics;
% capability preservation, not hack suppression.
@misc{sun2026ogpsa,
  author        = {Sun, Guanglong and Zhang, Siyuan and Wang, Liyuan and Zhu, Jun and Su, Hang and Zhong, Yi},
  title         = {Safety Alignment as Continual Learning: Mitigating the Alignment Tax via Orthogonal Gradient Projection},
  year          = {2026},
  eprint        = {2602.07892},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2602.07892},
  abstract      = {Safety post-training can improve the harmfulness and policy compliance of Large Language Models (LLMs), but it may also reduce general utility, a phenomenon often described as the alignment tax. We study this trade-off through the lens of continual learning: sequential alignment stages expose the model to shifted data distributions and objectives, and their gradients may interfere with directions that support previously acquired general capabilities. We propose Orthogonal Gradient Projection for Safety Alignment (OGPSA), a lightweight update rule that estimates a low-rank reference subspace from gradients on a small set of general-capability data and removes from each safety gradient the component lying in this subspace. The resulting update is the steepest local safety-descent direction subject to first-order preservation constraints on the reference objectives. Across SFT, DPO, and sequential SFT-DPO settings, OGPSA improves the observed safety-utility trade-off over standard baselines.},
}
% 2026 survey / landscape map (Proxy Compression Hypothesis). Use to scoop-check
% and to cite the field. Full 23-author byline from the bibtex MCP (Fudan group).
@misc{wang2026rewardhackingsurvey,
  author        = {Wang, Xiaohua and Tian, Muzhao and Zeng, Yuqiyu and Huang, Zisu and Yuan, Jiakang and Chen, Bowen and Xu, Jingwen and Zhou, Ming and Liu, Wenhao and Wu, Muling and Guo, Zhengkang and Qian, Qi and Wang, Yifei and Zhang, Feiran and Yin, Ruicheng and Dou, Shihan and Lv, Changze and Chen, Tao and Song, Kaitao and Tan, Xu and Gui, Tao and Zheng, Xiaoqing and Huang, Xuanjing},
  title         = {Reward Hacking in the Era of Large Models: Mechanisms, Emergent Misalignment, Challenges},
  year          = {2026},
  eprint        = {2604.13602},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2604.13602},
}
% The idea's ORIGIN in the alignment community: Mallen lists "Gradient routing to
% isolate reward hacking propensity and/or capability, and turn it off outside of
% training" -- our thesis, ~1yr before our runs. Fetched + verified via the
% (GreaterWrong-backed) lesswrong-graphql skill, 2026-06-04.
@misc{mallen2025rhinterventions,
  author       = {Mallen, Alex},
  title        = {A Quick List of Reward Hacking Interventions},
  howpublished = {Alignment Forum},
  year         = {2025},
  month        = jun,
  url          = {https://www.alignmentforum.org/posts/spZyuEGPzqPhnehyk/a-quick-list-of-reward-hacking-interventions},
}
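% C2 generalization evidence: empirical cross-task generalization of reward
% hacking (incl. organic generalization via expert iteration). Redwood/Anthropic.
% Full 7-author byline scraped from the GreaterWrong post page (evhub = Evan
% Hubinger); S2 does not index AF posts.
% TODO: confirm the post date on the AF page -- the year below is not recorded
% as verified here (the post may date from 2024) and, unlike the other
% forum-post entries, no month is given.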
@misc{nishimuragasparian2025rhgeneralize,
  author       = {Nishimura-Gasparian, Kei and Dunn, Isaac and Sleight, Henry and Turpin, Miles and Hubinger, Evan and Denison, Carson and Perez, Ethan},
  title        = {Reward Hacking Behavior Can Generalize Across Tasks},
  howpublished = {Alignment Forum},
  year         = {2025},
  url          = {https://www.alignmentforum.org/posts/Ge55vxEmKXunFFwoe/reward-hacking-behavior-can-generalize-across-tasks},
}