% evil_MoE/docs/writeup/refs.bib

% Bibliography for the gradient-routing-vs-reward-hacking writeup.
% Every field below is either grounded in the repo's local paper copies
% (docs/papers/*, docs/grad_routing/*) or web-verified; the verification date
% and source are noted per entry or per section. Unverifiable fields carry an
% explicit TODO -- do not fill from memory.

% Web-verified 2026-06-02 (arxiv.org/abs/2410.04332 + dblp). README also cites it.
@misc{cloud2024gradientrouting,
  author       = {Cloud, Alex and Goldman-Wetzler, Jacob and Wybitul, Ev{\v{z}}en and Miller, Joseph and Turner, Alexander Matt},
  title        = {Gradient Routing: Masking Gradients to Localize Computation in Neural Networks},
  year         = {2024},
  eprint       = {2410.04332},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2410.04332},
}

% The substrate. Grounded in docs/papers/2025_lw_ariahw_steering-...md header.
% Byline is the LessWrong handle "Ariahw"; advised by Neel Nanda and Josh Engels
% (MATS 9.0). TODO: real-name attribution for the handle before submission.
% {RL} is braced so sentence-casing bibliography styles keep the acronym
% capitalised. The author is double-braced so the handle is treated as one
% indivisible name.
@misc{ariahw2025steering,
  title        = {Steering {RL} Training: Benchmarking Interventions Against Reward Hacking},
  author       = {{Ariahw}},
  year         = {2025},
  howpublished = {LessWrong},
  month        = dec,
  url          = {https://www.lesswrong.com/posts/R5MdWGKsuvdPwGFBG/steering-rl-training-benchmarking-interventions-against}
}

% GRPO. Full author list + id from the Ariahw post bib (ref 10) and Wu-Tang bib.
% {DeepSeekMath} is braced so sentence-casing styles keep its camel-case.
% url added for parity with the other arXiv entries (derived from the eprint id).
% NOTE(review): the arXiv byline for 2402.03300 may also list Xiao Bi and
% Haowei Zhang (between Song and Mingchuan Zhang) -- TODO confirm against the
% abs page before submission; deliberately not filled from memory.
@misc{shao2024deepseekmath,
  title        = {{DeepSeekMath}: Pushing the Limits of Mathematical Reasoning in Open Language Models},
  author       = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Zhang, Mingchuan and Li, Y. K. and Wu, Y. and Guo, Daya},
  year         = {2024},
  eprint       = {2402.03300},
  archivePrefix= {arXiv},
  primaryClass = {cs.CL},
  url          = {https://arxiv.org/abs/2402.03300}
}

% The advantage-level baseline. Grounded in docs/papers/2026_wu-tang_...md header
% (authors Rui Wu & Ruixiang Tang, Rutgers; arXiv:2604.01476). Method in the
% paper is "representation-informed advantage modulation".
@misc{wu2026rebound,
  author       = {Wu, Rui and Tang, Ruixiang},
  title        = {When Reward Hacking Rebounds: Understanding and Mitigating It with Representation-Level Signals},
  year         = {2026},
  eprint       = {2604.01476},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2604.01476},
}

% Diff-of-means activation direction (the act-erase control in tt_erase_bench).
% Web-verified 2026-06-02 (arxiv.org/abs/2406.11717, NeurIPS 2024).
% Cited as the NeurIPS 2024 proceedings paper (venue per the verification note
% above); the arXiv eprint fields are kept so the preprint link still renders.
@inproceedings{arditi2024refusal,
  title        = {Refusal in Language Models Is Mediated by a Single Direction},
  author       = {Arditi, Andy and Obeso, Oscar and Syed, Aaquib and Paleka, Daniel and Panickssery, Nina and Gurnee, Wes and Nanda, Neel},
  booktitle    = {Neural Information Processing Systems (NeurIPS)},
  year         = {2024},
  eprint       = {2406.11717},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2406.11717}
}

% The prior SVD-basis steering work this builds on (same author; the per-Linear
% delta_S adapter originates here). arXiv id supplied by the author 2026-06-03.
% {AntiPaSTO} is braced so sentence-casing styles keep its mixed capitalisation.
@misc{antipasto,
  title        = {{AntiPaSTO}: Self-Supervised Honesty Steering via Anti-Parallel Representations},
  author       = {Clark, Michael J.},
  year         = {2026},
  eprint       = {2601.07473},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2601.07473}
}

% --- gradient-routing / projection related work --------------------------
% All three below verified against full-text local copies in
% docs/grad_routing/ (title + arXiv id + url read from the file headers,
% 2026-05-31). Author fields filled only where the byline was read.

% THE NEAR-TWIN: singular directions of param-updates + project gradients ONTO
% a clean reference subspace (we subtract a hack subspace instead). Byline read
% from docs/grad_routing/paper_deng_trusted_direction.md; note the curated
% related_work.md calls it "TDGA / Deng" -- TODO reconcile the lead-author label.
@misc{huang2026directional,
  author       = {Deng, Wenlong and Huang, Jiaji and Ozkara, Kaan and Li, Yushu and Thrampoulidis, Christos and Li, Xiaoxiao and Park, Youngsuk},
  title        = {Directional Alignment Mitigates Reward Hacking in Reinforcement Learning for Language Models},
  year         = {2026},
  eprint       = {2605.25189},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2605.25189},
}

% Parameter-gradient zero-mask routing (Selective Gradient Masking, SGTM)
% tolerant to label noise; measures leakage and shows it shrinks with scale.
% Title + author byline web-verified 2026-06-02 (arxiv.org/abs/2512.05648).
% {LLMs} is braced so sentence-casing styles do not lowercase the acronym.
@misc{sgtm2025localization,
  title        = {Beyond Data Filtering: Knowledge Localization for Capability Removal in {LLMs}},
  author       = {Shilov, Igor and Cloud, Alex and Gema, Aryo Pradipta and Goldman-Wetzler, Jacob and Panickssery, Nina and Sleight, Henry and Jones, Erik and Anil, Cem},
  year         = {2025},
  eprint       = {2512.05648},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2512.05648}
}

% Reward-for-confession honesty (we reject this design: invites Baker
% obfuscation + a live judge over student rollouts). Byline read from header.
% {LLMs} is braced so sentence-casing styles do not lowercase the acronym.
@misc{joglekar2025confessions,
  title        = {Training {LLMs} for Honesty via Confessions},
  author       = {Joglekar, Manas and Chen, Jeremy and Wu, Gabriel and Yosinski, Jason and Wang, Jasmine and Barak, Boaz and Glaese, Amelia},
  year         = {2025},
  eprint       = {2512.08093},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2512.08093}
}

% --- abstract-only "closest twins" (NOT full-text verified) --------------
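% IDs from docs/grad_routing/{related_work,search_for_more}.md. Authors NOT
% filled (not read) -- do not cite as misc entries with invented authors.
% (Entry types are spelled without the at-sign in these notes: classic BibTeX
% starts parsing an entry at any at-sign, even on a %-prefixed line.) Verify
% byline from arXiv before promoting any of these into the bibliography: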
%   Spilling the Beans (SFT self-report generalises OOD)    arXiv:2511.06626
%   Baker et al. (weak monitor -> obfuscated reward hacking) arXiv:2503.11926

% --- parameter-isolation / weight-subspace lineage ---------------------
% Added 2026-06-03 in response to an OpenReview novelty challenge against
% gradient routing (PackNet/Piggyback/LoRA as "similar"). We cite them as the
% classical "confine a capability to a weight subset" precedent; we differ by
% REMOVING (not adding) a capability and assigning the subset from a gradient
% signal (not a task label). Bylines verified via arXiv/CVF.
% Venue spelled out in the same "Full Name (ACRONYM)" form used by the other
% proceedings entries in this file.
@inproceedings{mallya2018packnet,
  title        = {{PackNet}: Adding Multiple Tasks to a Single Network by Iterative Pruning},
  author       = {Mallya, Arun and Lazebnik, Svetlana},
  booktitle    = {Conference on Computer Vision and Pattern Recognition (CVPR)},
  year         = {2018},
  eprint       = {1711.05769},
  archivePrefix= {arXiv},
  url          = {https://arxiv.org/abs/1711.05769}
}
% Venue spelled out in the same "Full Name (ACRONYM)" form used by the other
% proceedings entries in this file.
@inproceedings{mallya2018piggyback,
  title        = {Piggyback: Adapting a Single Network to Multiple Tasks by Learning to Mask Weights},
  author       = {Mallya, Arun and Davis, Dillon and Lazebnik, Svetlana},
  booktitle    = {European Conference on Computer Vision (ECCV)},
  year         = {2018},
  eprint       = {1801.06519},
  archivePrefix= {arXiv},
  url          = {https://arxiv.org/abs/1801.06519}
}
% booktitle matches the ICLR spelling used by ilharco2023taskarithmetic so the
% same venue is not rendered two different ways. The key keeps the 2021 arXiv
% year; the year field is the ICLR publication year.
@inproceedings{hu2021lora,
  title        = {{LoRA}: Low-Rank Adaptation of Large Language Models},
  author       = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  booktitle    = {International Conference on Learning Representations (ICLR)},
  year         = {2022},
  eprint       = {2106.09685},
  archivePrefix= {arXiv},
  url          = {https://arxiv.org/abs/2106.09685}
}

% --- external-search additions (Perplexity/Gemini/ChatGPT/Elicit, 2026-06-04) ---
% Bibtex+abstract pulled via the bibtex MCP (DBLP/Semantic Scholar); arXiv ids
% each verified HTTP 200. Provenance + relevance notes in
% docs/grad_routing/search_2026-06-04_related_work.md.

% The deploy-deletable-module PRECEDENT, and it predates Cloud et al. (this
% paper is Nov 2023; Cloud et al. is Oct 2024, per the arXiv ids).
% Separable "security vectors" activated during finetuning absorb the harmful
% update so the backbone never learns it; deactivated at deploy. Our quarantine
% delta_S_hack absorption story, but in param space for SFT harmful-data defense
% (not the SVD gradient basis, not RL reward hacking, no weak-detector axis).
@misc{zhou2023securityvectors,
  author       = {Zhou, Xin and Lu, Yi and Ma, Ruotian and Gui, Tao and Zhang, Qi and Huang, Xuanjing},
  title        = {Making Harmful Behaviors Unlearnable for Large Language Models},
  year         = {2023},
  eprint       = {2311.02105},
  archivePrefix= {arXiv},
  primaryClass = {cs.CL},
  url          = {https://arxiv.org/abs/2311.02105},
  abstract     = {Large language models (LLMs) have shown great potential as general-purpose AI assistants in various domains. To meet the requirements of different applications, LLMs are often customized by further fine-tuning. However, the powerful learning ability of LLMs not only enables them to acquire new tasks but also makes them susceptible to learning undesired behaviors. For example, even safety-aligned LLMs can be easily fine-tuned into harmful assistants as the fine-tuning data often contains implicit or explicit harmful content. Can we train LLMs on harmful data without learning harmful behaviors? This paper proposes a controllable training framework that makes harmful behaviors unlearnable during the fine-tuning process. Specifically, we introduce ``security vectors'', a few new parameters that can be separated from the LLM, to ensure LLM's responses are consistent with the harmful behavior. Security vectors are activated during fine-tuning, the consistent behavior makes LLM believe that such behavior has already been learned, there is no need to further optimize for harmful data. During inference, we can deactivate security vectors to restore the LLM's normal behavior. The experimental results show that the security vectors generated by 100 harmful samples are enough to prevent LLM from learning 1000 harmful samples, while preserving the ability to learn other useful information.},
}

% Second gradient-level reward-hacking paper, same lab as SignCert-PO (RIKEN /
% Sugiyama). Orthogonal MECHANISM to ours: regularize the gradient norm toward
% flat minima where the reward model stays accurate, beats a KL penalty. Cite in
% the gradient-level bucket; not a scoop of direction-routing.
% First author's given name aligned with ono2026signcert (same RIKEN lab per
% the notes in this file, and that byline was cross-checked against arXiv +
% OpenAlex). NOTE(review): re-verify against the arXiv byline for 2602.18037.
@misc{ackermann2026gradreg,
  title        = {Gradient Regularization Prevents Reward Hacking in Reinforcement Learning from Human Feedback and Verifiable Rewards},
  author       = {Ackermann, Johannes and Noukhovitch, Michael and Ishida, Takashi and Sugiyama, Masashi},
  year         = {2026},
  eprint       = {2602.18037},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2602.18037},
  abstract     = {Reinforcement Learning from Human Feedback (RLHF) or Verifiable Rewards (RLVR) are two key steps in the post-training of modern Language Models (LMs). A common problem is reward hacking, where the policy may exploit inaccuracies of the reward and learn an unintended behavior. Most previous works address this by limiting the policy update with a Kullback-Leibler (KL) penalty towards a reference model. We propose a different framing: Train the LM in a way that biases policy updates towards regions in which the reward is more accurate. First, we derive a theoretical connection between the accuracy of a reward model and the flatness of an optimum at convergence. Gradient regularization (GR) can then be used to bias training to flatter regions and thereby maintain reward model accuracy. We confirm these results by showing that the gradient norm and reward accuracy are empirically correlated in RLHF. We then show that Reference Resets of the KL penalty implicitly use GR to find flatter regions with higher reward accuracy. We further improve on this by proposing to use explicit GR with an efficient finite-difference estimate. Empirically, GR performs better than a KL penalty across a diverse set of RL experiments with LMs.}
}

% Closest published analogue to our ERASE arm: project the forget-set gradient
% onto the subspace orthogonal to the retain batch (forget-vs-retain unlearning;
% ours is hack-vs-clean reward hacking).
@misc{shamsian2025orthograd,
  author       = {Shamsian, Aviv and Shaar, Eitan and Navon, Aviv and Chechik, Gal and Fetaya, Ethan},
  title        = {Go Beyond Your Means: Unlearning with Per-Sample Gradient Orthogonalization},
  year         = {2025},
  eprint       = {2503.02312},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2503.02312},
  abstract     = {Machine unlearning aims to remove the influence of problematic training data after a model has been trained. The primary challenge in machine unlearning is ensuring that the process effectively removes specified data without compromising the model's overall performance on the remaining dataset. Many existing machine unlearning methods address this challenge by carefully balancing gradient ascent on the `unlearn' data with the gradient descent on a `retain' set that represents the training data. However, in many cases the training dataset is not fully available when we wish to unlearn some concepts, because models are released without their training datasets, and one may only have access to a small part of a training set. Here, we propose OrthoGrad, a novel approach that mitigates interference between the unlearn set and a small retain set rather than competing ascent and descent processes. Our method projects the gradient of the unlearn set onto the subspace orthogonal to all gradients in the retain batch, effectively avoiding any gradient interference. We demonstrate the effectiveness of OrthoGrad on multiple machine unlearning benchmarks, outperforming competing methods.},
}

% C2 generalization evidence (stronger than Honesty-to-Subterfuge): SFT on
% harmless reward hacks generalizes to new hack settings AND to unrelated
% emergent misalignment. Owain Evans group.
% {LLMs} is braced so sentence-casing styles do not lowercase the acronym.
@misc{taylor2025schoolrewardhacks,
  title        = {School of Reward Hacks: Hacking Harmless Tasks Generalizes to Misaligned Behavior in {LLMs}},
  author       = {Taylor, Mia and Chua, James and Betley, Jan and Treutlein, Johannes and Evans, Owain},
  year         = {2025},
  eprint       = {2508.17511},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2508.17511},
  abstract     = {Reward hacking--where agents exploit flaws in imperfect reward functions rather than performing tasks as intended--poses risks for AI alignment. Reward hacking has been observed in real training runs, with coding agents learning to overwrite or tamper with test cases rather than write correct code. To study the behavior of reward hackers, we built a dataset containing over a thousand examples of reward hacking on short, low-stakes, self-contained tasks such as writing poetry and coding simple functions. We used supervised fine-tuning to train models (GPT-4.1, GPT-4.1-mini, Qwen3-32B, Qwen3-8B) to reward hack on these tasks. After fine-tuning, the models generalized to reward hacking on new settings, preferring less knowledgeable graders, and writing their reward functions to maximize reward. Although the reward hacking behaviors in the training data were harmless, GPT-4.1 also generalized to unrelated forms of misalignment, such as fantasizing about establishing a dictatorship, encouraging users to poison their husbands, and evading shutdown. Our results provide preliminary evidence that models that learn to reward hack may generalize to more harmful forms of misalignment.}
}

% Weight-space contrastive behavior direction (subtract deltas of two opposite
% finetunes), the steering-side cousin of our extracted v_hack; reports stronger
% OOD control than activation steering. Cite next to task-arithmetic negation.
% Abstract quotes use LaTeX ``...'' (as in the other abstracts in this file);
% straight double quotes typeset as two closing quotes.
@misc{fierro2025weightarithmetic,
  title        = {Steering Language Models with Weight Arithmetic},
  author       = {Fierro, Constanza and Roger, Fabien},
  year         = {2025},
  eprint       = {2511.05408},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2511.05408},
  abstract     = {Providing high-quality feedback to Large Language Models (LLMs) on a diverse training distribution can be difficult and expensive, and providing feedback only on a narrow distribution can result in unintended generalizations. To better leverage narrow training data, we propose contrastive weight steering, a simple post-training method that edits the model parameters using weight arithmetic. We isolate a behavior direction in weight-space by subtracting the weight deltas from two small fine-tunes -- one that induces the desired behavior and another that induces its opposite -- and then add or remove this direction to modify the model's weights. We apply this technique to mitigate sycophancy and induce misalignment, and find that weight steering often generalizes further than activation steering, achieving stronger out-of-distribution behavioral control before degrading general capabilities. We also provide preliminary evidence that emergent misalignment can be detected by measuring the similarity between fine-tuning updates and an ``evil'' weight direction.}
}

% Closest published analogue to our contrastive-SVD extraction, but on the static
% reward MODEL not the live policy gradient: build a multi-directional hacking
% subspace from residual-stream diffs of contrastive gold-vs-hacked pairs, project
% the reward-head vector off it. 7-author byline cross-verified arXiv == Semantic
% Scholar (keyed S2 API via the semantic-search skill .env); abstract from arXiv.
% "\model" -> HARVE inlined.
% {HARVE} is braced so sentence-casing styles do not turn it into "Harve".
@misc{liu2026harve,
  title        = {{HARVE}: Hacking-Aware Reward-Head Vector Editing for Robust Reward Models},
  author       = {Liu, Shuang and Bo, Yuxuan and Zhao, Qiuyang and Huang, Caiyue and Chen, Xiaorong and Liu, Yanguang and Du, Mengnan},
  year         = {2026},
  eprint       = {2606.03131},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2606.03131},
  abstract     = {Reward models are central to large language model (LLM) alignment, but they remain vulnerable to reward hacking. To evaluate reward-model robustness, we introduce RewardHackBench containing 13 reward-hacking patterns covering real life high-stakes domains and general settings, and we find severe failures on specific subcategories across eight reward models. To mitigate these failures, we propose HARVE, a training-free reward-head editing method for scalar reward models. Instead of fine-tuning the reward model, HARVE identifies a multi-directional hacking subspace from residual stream directions associated with selected hacking subcategories, and removes the component of the reward-head vector aligned with that subspace. This directly reduces the reward head's sensitivity to hacking-related features using only a small set of contrastive gold-hacked examples, without gradient updates or fine-tuning. Comprehensive experiments across eight reward models indicates that HARVE improves hacking robustness, outperforms fine-tuning baselines, and preserves reward-models' general capability. Further analyses suggest that reward hacking is better captured as a multidimensional residual-space structure than by isolated surface cues.}
}

% Gradient surgery against SHORTCUTS in supervised reasoning: a ShortcutScore
% (per-sample gradient vs validation-gradient cosine + answer-token gradient
% concentration) flags shortcut-promoting samples, then orthogonal projection
% removes those updates. Closest gradient-surgery analogue on the SFT-reasoning
% side (ours: GRPO reward hacking).
@misc{cao2026sart,
  author       = {Cao, Hongyu and Liu, Kunpeng and Wang, Dongjie and Fu, Yanjie},
  title        = {Mitigating Shortcut Reasoning in Language Models: A Gradient-Aware Training Approach},
  year         = {2026},
  eprint       = {2603.20899},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2603.20899},
  abstract     = {Large language models exhibit strong reasoning capabilities, yet often rely on shortcuts such as surface pattern matching and answer memorization rather than genuine logical inference. We propose Shortcut-Aware Reasoning Training (SART), a gradient-aware framework that detects and mitigates shortcut-promoting samples via ShortcutScore and gradient surgery. Our method identifies shortcut signals through gradient misalignment with validation objectives and answer-token concentration, and modifies training dynamics accordingly. Experiments on controlled reasoning benchmarks show that SART achieves +16.5\% accuracy and +40.2\% robustness over the strongest baseline, significantly improving generalization under distribution shifts.},
}

% Orthogonal gradient projection during SAFETY alignment (SFT/DPO): estimate a
% low-rank capability subspace from general-capability gradients, remove that
% component from each safety gradient. Validates our gradient-erasure mechanics;
% capability preservation, not hack suppression.
@misc{sun2026ogpsa,
  author       = {Sun, Guanglong and Zhang, Siyuan and Wang, Liyuan and Zhu, Jun and Su, Hang and Zhong, Yi},
  title        = {Safety Alignment as Continual Learning: Mitigating the Alignment Tax via Orthogonal Gradient Projection},
  year         = {2026},
  eprint       = {2602.07892},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2602.07892},
  abstract     = {Safety post-training can improve the harmfulness and policy compliance of Large Language Models (LLMs), but it may also reduce general utility, a phenomenon often described as the alignment tax. We study this trade-off through the lens of continual learning: sequential alignment stages expose the model to shifted data distributions and objectives, and their gradients may interfere with directions that support previously acquired general capabilities. We propose Orthogonal Gradient Projection for Safety Alignment (OGPSA), a lightweight update rule that estimates a low-rank reference subspace from gradients on a small set of general-capability data and removes from each safety gradient the component lying in this subspace. The resulting update is the steepest local safety-descent direction subject to first-order preservation constraints on the reference objectives. Across SFT, DPO, and sequential SFT-DPO settings, OGPSA improves the observed safety-utility trade-off over standard baselines.},
}

% 2026 survey / landscape map (Proxy Compression Hypothesis). Use to scoop-check
% and to cite the field. Full 23-author byline from the bibtex MCP (Fudan group).
@misc{wang2026rewardhackingsurvey,
  author       = {Wang, Xiaohua and Tian, Muzhao and Zeng, Yuqiyu and Huang, Zisu and Yuan, Jiakang and Chen, Bowen and Xu, Jingwen and Zhou, Ming and Liu, Wenhao and Wu, Muling and Guo, Zhengkang and Qian, Qi and Wang, Yifei and Zhang, Feiran and Yin, Ruicheng and Dou, Shihan and Lv, Changze and Chen, Tao and Song, Kaitao and Tan, Xu and Gui, Tao and Zheng, Xiaoqing and Huang, Xuanjing},
  title        = {Reward Hacking in the Era of Large Models: Mechanisms, Emergent Misalignment, Challenges},
  year         = {2026},
  eprint       = {2604.13602},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2604.13602},
}

% The idea's ORIGIN in the alignment community: Mallen lists "Gradient routing to
% isolate reward hacking propensity and/or capability, and turn it off outside of
% training" -- our thesis, ~1yr before our runs. Fetched + verified via the
% (GreaterWrong-backed) lesswrong-graphql skill, 2026-06-04.
@misc{mallen2025rhinterventions,
  author       = {Mallen, Alex},
  title        = {A Quick List of Reward Hacking Interventions},
  howpublished = {Alignment Forum},
  year         = {2025},
  month        = jun,
  url          = {https://www.alignmentforum.org/posts/spZyuEGPzqPhnehyk/a-quick-list-of-reward-hacking-interventions},
}

% C2 generalization evidence: empirical cross-task generalization of reward
% hacking (incl. organic generalization via expert iteration). Redwood/Anthropic.
% Full 7-author byline scraped from the GreaterWrong post page (evhub = Evan
% Hubinger); S2 does not index AF posts.
% NOTE(review): unlike the other forum-post entries, no month is recorded here,
% and the post may date from 2024 rather than 2025 -- TODO confirm the
% publication date on the post page before submission.
@misc{nishimuragasparian2025rhgeneralize,
  author       = {Nishimura-Gasparian, Kei and Dunn, Isaac and Sleight, Henry and Turpin, Miles and Hubinger, Evan and Denison, Carson and Perez, Ethan},
  title        = {Reward Hacking Behavior Can Generalize Across Tasks},
  howpublished = {Alignment Forum},
  year         = {2025},
  url          = {https://www.alignmentforum.org/posts/Ge55vxEmKXunFFwoe/reward-hacking-behavior-can-generalize-across-tasks},
}

% --- verified additions, 2026-06-05 (abstract verbatim from Semantic Scholar /
% arXiv API via scripts; byline cross-checked arXiv + OpenAlex; see
% docs/grad_routing/search_2026-06-04_related_work.md) -----------------------

% The uncited gradient-level NEAR-TWIN on the signal axis: GRIFT DETECTS reward
% hacking from the gradient of the CoT (we INTERVENE on the gradient). Resolves
% the flagged abstract-only twin. Differentiate detect-vs-intervene.
@misc{wang2026grift,
  author       = {Wang, Songtao and Pham, Quang Hieu and Yin, Fangcong and Wang, Xinpeng and Chen, Jocelyn Qiaochu and Durrett, Greg and Ye, Xi},
  title        = {Detecting and Suppressing Reward Hacking with Gradient Fingerprints},
  year         = {2026},
  eprint       = {2604.16242},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2604.16242},
  abstract     = {Reinforcement learning with verifiable rewards (RLVR) typically optimizes for outcome rewards without imposing constraints on intermediate reasoning. This leaves training susceptible to reward hacking, where models exploit loopholes (e.g., spurious patterns in training data) in the reward function to achieve high scores without solving the intended task. These reward-hacking behaviors are often implicit, as the intermediate chain-of-thought (CoT) may appear plausible on the surface, limiting the effectiveness of purely text-based monitoring. We propose Gradient Fingerprint (GRIFT), a method for detecting reward hacking using models' internal computations. Given a prompt and a model-generated CoT, GRIFT computes gradients of the CoT conditioned on the prompt and compresses them into a compact representation, which is then used to assess whether the CoT reflects reward hacking behavior. Across verifiable reasoning benchmarks spanning math, code, and logical reasoning, GRIFT substantially outperforms strong baselines, including CoT Monitor and TRACE, achieving over 25\% relative improvement in detecting reward hacking behavior. Moreover, integrating GRIFT into the rejection fine-tuning pipeline for reasoning tasks reduces reward hacking and improves performance on the true task objective. Our results highlight a promising direction of leveraging gradient level representations for assessing the quality of CoT reasoning traces.},
}

% Gradient-level reward-hacking competitor: re-weights the policy gradient by an
% advantage-sign-robustness certificate. Same lab as ackermann2026gradreg (RIKEN).
% {RLHF} is braced so sentence-casing styles do not lowercase the acronym.
@misc{ono2026signcert,
  title        = {Mitigating Reward Hacking in {RLHF} via Advantage Sign Robustness},
  author       = {Ono, Shinnosuke and Ackermann, Johannes and Nishimori, Soichiro and Ishida, Takashi and Sugiyama, Masashi},
  year         = {2026},
  eprint       = {2604.02986},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2604.02986},
  abstract     = {Reward models (RMs) used in reinforcement learning from human feedback (RLHF) are vulnerable to reward hacking: as the policy maximizes a learned proxy reward, true quality plateaus or degrades. We make the assumption that reward hacking is often caused by flipped advantage signs: instead of reducing the likelihood of a bad response, a flipped sign causes the update to increase it. By considering an adversarial perturbation in the RM parameter space, we can derive a certified sign-preservation radius, which is the smallest perturbation that can flip the advantage sign during policy optimization. Based on this formulation, we propose Sign-Certified Policy Optimization (SignCert-PO), down-weighting non-robust completions in the policy gradient update. Unlike prior approaches that require multiple RMs or access to the RM training data, SignCert-PO is lightweight and operates purely at the policy optimization stage using only the RM parameters and on-policy completions. On TL;DR summarization and AlpacaFarm benchmarks, SignCert-PO consistently achieves a better win rate than baselines and reduces reward hacking.}
}

% --- gradient-projection lineage (the continual-learning ancestry of "project
% a gradient out of a subspace"; a CL reviewer expects these) ----------------

% PCGrad: the canonical "project a gradient onto the normal plane of a
% conflicting one" primitive (multi-task learning).
@inproceedings{yu2020pcgrad,
  author       = {Yu, Tianhe and Kumar, Saurabh and Gupta, Abhishek and Levine, Sergey and Hausman, Karol and Finn, Chelsea},
  title        = {Gradient Surgery for Multi-Task Learning},
  booktitle    = {Neural Information Processing Systems (NeurIPS)},
  year         = {2020},
  eprint       = {2001.06782},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2001.06782},
  abstract     = {While deep learning and deep reinforcement learning (RL) systems have demonstrated impressive results in domains such as image classification, game playing, and robotic control, data efficiency remains a major challenge. Multi-task learning has emerged as a promising approach for sharing structure across multiple tasks to enable more efficient learning. However, the multi-task setting presents a number of optimization challenges, making it difficult to realize large efficiency gains compared to learning tasks independently. The reasons why multi-task learning is so challenging compared to single-task learning are not fully understood. In this work, we identify a set of three conditions of the multi-task optimization landscape that cause detrimental gradient interference, and develop a simple yet general approach for avoiding such interference between task gradients. We propose a form of gradient surgery that projects a task's gradient onto the normal plane of the gradient of any other task that has a conflicting gradient. On a series of challenging multi-task supervised and multi-task RL problems, this approach leads to substantial gains in efficiency and performance. Further, it is model-agnostic and can be combined with previously-proposed multi-task architectures for enhanced performance.},
}

% Task arithmetic: negating a weight-space task vector reduces a behavior --
% the weight-direction-removal precedent for our extracted v_hack (cite as
% steering/negation, NOT as unlearning: they frame it as model editing).
@inproceedings{ilharco2023taskarithmetic,
  author       = {Ilharco, Gabriel and Ribeiro, Marco Tulio and Wortsman, Mitchell and Gururangan, Suchin and Schmidt, Ludwig and Hajishirzi, Hannaneh and Farhadi, Ali},
  title        = {Editing Models with Task Arithmetic},
  booktitle    = {International Conference on Learning Representations (ICLR)},
  year         = {2023},
  eprint       = {2212.04089},
  archivePrefix= {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2212.04089},
  abstract     = {Changing how pre-trained models behave -- e.g., improving their performance on a downstream task or mitigating biases learned during pre-training -- is a common practice when developing machine learning systems. In this work, we propose a new paradigm for steering the behavior of neural networks, centered around task vectors. A task vector specifies a direction in the weight space of a pre-trained model, such that movement in that direction improves performance on the task. We build task vectors by subtracting the weights of a pre-trained model from the weights of the same model after fine-tuning on a task. We show that these task vectors can be modified and combined together through arithmetic operations such as negation and addition, and the behavior of the resulting model is steered accordingly. Negating a task vector decreases performance on the target task, with little change in model behavior on control tasks. Moreover, adding task vectors together can improve performance on multiple tasks at once. Finally, when tasks are linked by an analogy relationship of the form ``A is to B as C is to D'', combining task vectors from three of the tasks can improve performance on the fourth, even when no data from the fourth task is used for training. Overall, our experiments with several models, modalities and tasks show that task arithmetic is a simple, efficient and effective way of editing models.},
}

% PEGP: orthogonal gradient projection inside Adapter/LoRA/Prefix/Prompt PEFT --
% the closest methodological prior art to our erase arm (orthogonal projection
% in a low-rank tuning subspace, repurposed there for anti-forgetting). TPAMI.
@article{qiao2025pegp,
  author       = {Qiao, Jingyang and Zhang, Zhizhong and Tan, Xin and Qu, Yanyun and Zhang, Wensheng and Han, Zhi and Xie, Yuan},
  title        = {Gradient Projection for Continual Parameter-Efficient Tuning},
  journal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
  year         = {2025},
  doi          = {10.1109/TPAMI.2025.3587032},
  url          = {https://doi.org/10.1109/TPAMI.2025.3587032},
  abstract     = {Parameter-efficient tunings (PETs) have demonstrated impressive performance and promising perspectives in training large models, while they are still confronted with a common problem: the trade-off between learning new content and protecting old knowledge, leading to zero-shot generalization collapse, and cross-modal hallucination. In this paper, we reformulate Adapter, LoRA, Prefix-tuning, and Prompt-tuning from the perspective of gradient projection, and first propose a unified framework called Parameter Efficient Gradient Projection (PEGP). We introduce orthogonal gradient projection into different PET paradigms and theoretically demonstrate that the orthogonal condition for the gradient can effectively resist forgetting even for large-scale models. It therefore modifies the gradient towards the direction that has less impact on the old feature space, with less extra memory space and training time. We extensively evaluate our method with different backbones, including ViT and CLIP, on diverse datasets, and experiments comprehensively demonstrate its efficiency in reducing forgetting in class, online class, domain, task, and multi-modality continual settings.},
}