mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:30:30 +08:00
paper: add verified related work (11 refs) + fix Huang->Deng first author
Related-work search (local qmd/gh/LW + Perplexity/Gemini/ChatGPT/Elicit), all arXiv ids verified HTTP 200, bibtex+abstracts via the bibtex MCP / arXiv scrape: - gradient-level reward hacking: ackermann2026gradreg (GR), liu2026harve (HARVE) - deletable-module precedent (pre-dates Cloud): zhou2023securityvectors - gradient-projection unlearning: shamsian2025orthograd (OrthoGrad), sun2026ogpsa - C2 generalisation: taylor2025schoolrewardhacks, nishimuragasparian2025rhgeneralize - weight-space contrastive direction: fierro2025weightarithmetic - shortcut gradient surgery: cao2026sart; survey: wang2026rewardhackingsurvey - idea provenance: mallen2025rhinterventions (AF) Fix: huang2026directional first author is Deng, Wenlong (arXiv 2605.25189); sync the cold-reader comment to 'Deng et al.' Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -367,6 +367,7 @@ enough to route one as it forms.
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
% TODO: this is a bit hard conceptually. "Not direction" is... a random direction? Maybe it should be alternative methods instead; unsure.
|
||||
|
||||
\subsection{Long-run convergence}
|
||||
|
||||
@@ -439,7 +440,7 @@ extraction set, not only the in-distribution mode (Table~\ref{tab:generalisation
|
||||
one-liners are in docs/grad\_routing/related\_work.md.}
|
||||
\begin{itemize}
|
||||
% COMPREHENSION (cold-reader panel 2026-06-03): the keep-vs-remove inversion
|
||||
% takes two reads. State it plainly first: "Huang projects ONTO a clean
|
||||
% takes two reads. State it plainly first: "Deng et al. project ONTO a clean
|
||||
% direction; we project a hack direction OUT." Skeptic also flagged: zeroing
|
||||
% delta_S_hack at deploy == not-projecting at deploy, so "deletable knob vs
|
||||
% only-constrains-training" is thin unless argued; and we never measured
|
||||
|
||||
+163
-1
@@ -84,7 +84,7 @@
|
||||
% related_work.md calls it "TDGA / Deng" -- TODO reconcile the lead-author label.
|
||||
@misc{huang2026directional,
|
||||
title = {Directional Alignment Mitigates Reward Hacking in Reinforcement Learning for Language Models},
|
||||
author = {Huang, Jiaji and Ozkara, Kaan and Li, Yushu and Thrampoulidis, Christos and Li, Xiaoxiao and Park, Youngsuk},
|
||||
author = {Deng, Wenlong and Huang, Jiaji and Ozkara, Kaan and Li, Yushu and Thrampoulidis, Christos and Li, Xiaoxiao and Park, Youngsuk},
|
||||
year = {2026},
|
||||
eprint = {2605.25189},
|
||||
archivePrefix= {arXiv},
|
||||
@@ -158,3 +158,165 @@
|
||||
archivePrefix= {arXiv},
|
||||
url = {https://arxiv.org/abs/2106.09685}
|
||||
}
|
||||
|
||||
% --- external-search additions (Perplexity/Gemini/ChatGPT/Elicit, 2026-06-04) ---
|
||||
% Bibtex+abstract pulled via the bibtex MCP (DBLP/Semantic Scholar); arXiv ids
|
||||
% each verified HTTP 200. Provenance + relevance notes in
|
||||
% docs/grad_routing/search_2026-06-04_related_work.md.
|
||||
|
||||
% The deploy-deletable-module PRECEDENT, and it predates Cloud (Nov 2023).
|
||||
% Separable "security vectors" activated during finetuning absorb the harmful
|
||||
% update so the backbone never learns it; deactivated at deploy. Same as our quarantine
|
||||
% delta_S_hack absorption story, but in param space for SFT harmful-data defense
|
||||
% (not the SVD gradient basis, not RL reward hacking, no weak-detector axis).
|
||||
% arXiv preprint 2311.02105 (cs.CL): the "security vectors" paper.
@misc{zhou2023securityvectors,
  author        = {Zhou, Xin and Lu, Yi and Ma, Ruotian and Gui, Tao and Zhang, Qi and Huang, Xuanjing},
  title         = {Making Harmful Behaviors Unlearnable for Large Language Models},
  year          = {2023},
  eprint        = {2311.02105},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2311.02105},
  abstract      = {Large language models (LLMs) have shown great potential as general-purpose AI assistants in various domains. To meet the requirements of different applications, LLMs are often customized by further fine-tuning. However, the powerful learning ability of LLMs not only enables them to acquire new tasks but also makes them susceptible to learning undesired behaviors. For example, even safety-aligned LLMs can be easily fine-tuned into harmful assistants as the fine-tuning data often contains implicit or explicit harmful content. Can we train LLMs on harmful data without learning harmful behaviors? This paper proposes a controllable training framework that makes harmful behaviors unlearnable during the fine-tuning process. Specifically, we introduce ``security vectors'', a few new parameters that can be separated from the LLM, to ensure LLM's responses are consistent with the harmful behavior. Security vectors are activated during fine-tuning, the consistent behavior makes LLM believe that such behavior has already been learned, there is no need to further optimize for harmful data. During inference, we can deactivate security vectors to restore the LLM's normal behavior. The experimental results show that the security vectors generated by 100 harmful samples are enough to prevent LLM from learning 1000 harmful samples, while preserving the ability to learn other useful information.},
}
|
||||
|
||||
% Second gradient-level reward-hacking paper, same lab as SignCert-PO (RIKEN /
|
||||
% Sugiyama). Orthogonal MECHANISM to ours: regularize the gradient norm toward
|
||||
% flat minima where the reward model stays accurate; it beats a KL penalty. Cite in
|
||||
% the gradient-level bucket; not a scoop of direction-routing.
|
||||
% arXiv preprint 2602.18037 (cs.LG): gradient regularization (GR) against reward hacking.
@misc{ackermann2026gradreg,
  author        = {Ackermann, Jan and Noukhovitch, Michael and Ishida, Takashi and Sugiyama, Masashi},
  title         = {Gradient Regularization Prevents Reward Hacking in Reinforcement Learning from Human Feedback and Verifiable Rewards},
  year          = {2026},
  eprint        = {2602.18037},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2602.18037},
  abstract      = {Reinforcement Learning from Human Feedback (RLHF) or Verifiable Rewards (RLVR) are two key steps in the post-training of modern Language Models (LMs). A common problem is reward hacking, where the policy may exploit inaccuracies of the reward and learn an unintended behavior. Most previous works address this by limiting the policy update with a Kullback-Leibler (KL) penalty towards a reference model. We propose a different framing: Train the LM in a way that biases policy updates towards regions in which the reward is more accurate. First, we derive a theoretical connection between the accuracy of a reward model and the flatness of an optimum at convergence. Gradient regularization (GR) can then be used to bias training to flatter regions and thereby maintain reward model accuracy. We confirm these results by showing that the gradient norm and reward accuracy are empirically correlated in RLHF. We then show that Reference Resets of the KL penalty implicitly use GR to find flatter regions with higher reward accuracy. We further improve on this by proposing to use explicit GR with an efficient finite-difference estimate. Empirically, GR performs better than a KL penalty across a diverse set of RL experiments with LMs.},
}
|
||||
|
||||
% Closest published analogue to our ERASE arm: project the forget-set gradient
|
||||
% onto the subspace orthogonal to the retain batch (forget-vs-retain unlearning;
|
||||
% ours is hack-vs-clean reward hacking).
|
||||
% arXiv preprint 2503.02312 (cs.LG): OrthoGrad.
@misc{shamsian2025orthograd,
  author        = {Shamsian, Aviv and Shaar, Eitan and Navon, Aviv and Chechik, Gal and Fetaya, Ethan},
  title         = {Go Beyond Your Means: Unlearning with Per-Sample Gradient Orthogonalization},
  year          = {2025},
  eprint        = {2503.02312},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2503.02312},
  abstract      = {Machine unlearning aims to remove the influence of problematic training data after a model has been trained. The primary challenge in machine unlearning is ensuring that the process effectively removes specified data without compromising the model's overall performance on the remaining dataset. Many existing machine unlearning methods address this challenge by carefully balancing gradient ascent on the `unlearn' data with the gradient descent on a `retain' set that represents the training data. However, in many cases the training dataset is not fully available when we wish to unlearn some concepts, because models are released without their training datasets, and one may only have access to a small part of a training set. Here, we propose OrthoGrad, a novel approach that mitigates interference between the unlearn set and a small retain set rather than competing ascent and descent processes. Our method projects the gradient of the unlearn set onto the subspace orthogonal to all gradients in the retain batch, effectively avoiding any gradient interference. We demonstrate the effectiveness of OrthoGrad on multiple machine unlearning benchmarks, outperforming competing methods.},
}
|
||||
|
||||
% C2 generalization evidence (stronger than Honesty-to-Subterfuge): SFT on
|
||||
% harmless reward hacks generalizes to new hack settings AND to unrelated
|
||||
% emergent misalignment. Owain Evans group.
|
||||
% arXiv preprint 2508.17511 (cs.LG). The acronym in the title is brace-protected
% so sentence-casing bibliography styles do not render it as "llms".
@misc{taylor2025schoolrewardhacks,
  title         = {School of Reward Hacks: Hacking Harmless Tasks Generalizes to Misaligned Behavior in {LLMs}},
  author        = {Taylor, Mia and Chua, James and Betley, Jan and Treutlein, Johannes and Evans, Owain},
  year          = {2025},
  eprint        = {2508.17511},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2508.17511},
  abstract      = {Reward hacking--where agents exploit flaws in imperfect reward functions rather than performing tasks as intended--poses risks for AI alignment. Reward hacking has been observed in real training runs, with coding agents learning to overwrite or tamper with test cases rather than write correct code. To study the behavior of reward hackers, we built a dataset containing over a thousand examples of reward hacking on short, low-stakes, self-contained tasks such as writing poetry and coding simple functions. We used supervised fine-tuning to train models (GPT-4.1, GPT-4.1-mini, Qwen3-32B, Qwen3-8B) to reward hack on these tasks. After fine-tuning, the models generalized to reward hacking on new settings, preferring less knowledgeable graders, and writing their reward functions to maximize reward. Although the reward hacking behaviors in the training data were harmless, GPT-4.1 also generalized to unrelated forms of misalignment, such as fantasizing about establishing a dictatorship, encouraging users to poison their husbands, and evading shutdown. Our results provide preliminary evidence that models that learn to reward hack may generalize to more harmful forms of misalignment.}
}
|
||||
|
||||
% Weight-space contrastive behavior direction (subtract deltas of two opposite
|
||||
% finetunes), the steering-side cousin of our extracted v_hack; reports stronger
|
||||
% OOD control than activation steering. Cite next to task-arithmetic negation.
|
||||
% arXiv preprint 2511.05408 (cs.LG). The abstract's straight double quotes are
% replaced with TeX quotes (``...''), matching the other abstracts in this file;
% a straight " typesets as a closing quote on both sides.
@misc{fierro2025weightarithmetic,
  title         = {Steering Language Models with Weight Arithmetic},
  author        = {Fierro, Constanza and Roger, Fabien},
  year          = {2025},
  eprint        = {2511.05408},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2511.05408},
  abstract      = {Providing high-quality feedback to Large Language Models (LLMs) on a diverse training distribution can be difficult and expensive, and providing feedback only on a narrow distribution can result in unintended generalizations. To better leverage narrow training data, we propose contrastive weight steering, a simple post-training method that edits the model parameters using weight arithmetic. We isolate a behavior direction in weight-space by subtracting the weight deltas from two small fine-tunes -- one that induces the desired behavior and another that induces its opposite -- and then add or remove this direction to modify the model's weights. We apply this technique to mitigate sycophancy and induce misalignment, and find that weight steering often generalizes further than activation steering, achieving stronger out-of-distribution behavioral control before degrading general capabilities. We also provide preliminary evidence that emergent misalignment can be detected by measuring the similarity between fine-tuning updates and an ``evil'' weight direction.}
}
|
||||
|
||||
% Closest published analogue to our contrastive-SVD extraction, but on the static
|
||||
% reward MODEL not the live policy gradient: build a multi-directional hacking
|
||||
% subspace from residual-stream diffs of contrastive gold-vs-hacked pairs, project
|
||||
% the reward-head vector off it. Authors scraped from arXiv (S2/DBLP not yet
|
||||
% indexed as of 2026-06-04); abstract from arXiv. "\model" -> HARVE inlined.
|
||||
% arXiv preprint 2606.03131 (cs.LG). The method acronym in the title is
% brace-protected so sentence-casing bibliography styles keep it as "HARVE".
@misc{liu2026harve,
  title         = {{HARVE}: Hacking-Aware Reward-Head Vector Editing for Robust Reward Models},
  author        = {Liu, Shuang and Bo, Yuxuan and Zhao, Qiuyang and Huang, Caiyue and Chen, Xiaorong and Liu, Yanguang and Du, Mengnan},
  year          = {2026},
  eprint        = {2606.03131},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2606.03131},
  abstract      = {Reward models are central to large language model (LLM) alignment, but they remain vulnerable to reward hacking. To evaluate reward-model robustness, we introduce RewardHackBench containing 13 reward-hacking patterns covering real life high-stakes domains and general settings, and we find severe failures on specific subcategories across eight reward models. To mitigate these failures, we propose HARVE, a training-free reward-head editing method for scalar reward models. Instead of fine-tuning the reward model, HARVE identifies a multi-directional hacking subspace from residual stream directions associated with selected hacking subcategories, and removes the component of the reward-head vector aligned with that subspace. This directly reduces the reward head's sensitivity to hacking-related features using only a small set of contrastive gold-hacked examples, without gradient updates or fine-tuning. Comprehensive experiments across eight reward models indicates that HARVE improves hacking robustness, outperforms fine-tuning baselines, and preserves reward-models' general capability. Further analyses suggest that reward hacking is better captured as a multidimensional residual-space structure than by isolated surface cues.}
}
|
||||
|
||||
% Gradient surgery against SHORTCUTS in supervised reasoning: a ShortcutScore
|
||||
% (per-sample gradient vs validation-gradient cosine + answer-token gradient
|
||||
% concentration) flags shortcut-promoting samples, then orthogonal projection
|
||||
% removes those updates. Closest gradient-surgery analogue on the SFT-reasoning
|
||||
% side (ours: GRPO reward hacking).
|
||||
% arXiv preprint 2603.20899 (cs.LG): Shortcut-Aware Reasoning Training (SART).
@misc{cao2026sart,
  author        = {Cao, Hongyu and Liu, Kunpeng and Wang, Dongjie and Fu, Yanjie},
  title         = {Mitigating Shortcut Reasoning in Language Models: A Gradient-Aware Training Approach},
  year          = {2026},
  eprint        = {2603.20899},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2603.20899},
  abstract      = {Large language models exhibit strong reasoning capabilities, yet often rely on shortcuts such as surface pattern matching and answer memorization rather than genuine logical inference. We propose Shortcut-Aware Reasoning Training (SART), a gradient-aware framework that detects and mitigates shortcut-promoting samples via ShortcutScore and gradient surgery. Our method identifies shortcut signals through gradient misalignment with validation objectives and answer-token concentration, and modifies training dynamics accordingly. Experiments on controlled reasoning benchmarks show that SART achieves +16.5\% accuracy and +40.2\% robustness over the strongest baseline, significantly improving generalization under distribution shifts.},
}
|
||||
|
||||
% Orthogonal gradient projection during SAFETY alignment (SFT/DPO): estimate a
|
||||
% low-rank capability subspace from general-capability gradients, remove that
|
||||
% component from each safety gradient. Validates our gradient-erasure mechanics;
|
||||
% capability preservation, not hack suppression.
|
||||
% arXiv preprint 2602.07892 (cs.LG): Orthogonal Gradient Projection for Safety Alignment (OGPSA).
@misc{sun2026ogpsa,
  author        = {Sun, Guanglong and Zhang, Siyuan and Wang, Liyuan and Zhu, Jun and Su, Hang and Zhong, Yi},
  title         = {Safety Alignment as Continual Learning: Mitigating the Alignment Tax via Orthogonal Gradient Projection},
  year          = {2026},
  eprint        = {2602.07892},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2602.07892},
  abstract      = {Safety post-training can improve the harmfulness and policy compliance of Large Language Models (LLMs), but it may also reduce general utility, a phenomenon often described as the alignment tax. We study this trade-off through the lens of continual learning: sequential alignment stages expose the model to shifted data distributions and objectives, and their gradients may interfere with directions that support previously acquired general capabilities. We propose Orthogonal Gradient Projection for Safety Alignment (OGPSA), a lightweight update rule that estimates a low-rank reference subspace from gradients on a small set of general-capability data and removes from each safety gradient the component lying in this subspace. The resulting update is the steepest local safety-descent direction subject to first-order preservation constraints on the reference objectives. Across SFT, DPO, and sequential SFT-DPO settings, OGPSA improves the observed safety-utility trade-off over standard baselines.},
}
|
||||
|
||||
% 2026 survey / landscape map (Proxy Compression Hypothesis). Use to scoop-check
|
||||
% and to cite the field. Full 23-author byline from the bibtex MCP (Fudan group).
|
||||
% arXiv preprint 2604.13602 (cs.LG): reward-hacking survey; no abstract stored.
@misc{wang2026rewardhackingsurvey,
  author        = {Wang, Xiaohua and Tian, Muzhao and Zeng, Yuqiyu and Huang, Zisu and Yuan, Jiakang and Chen, Bowen and Xu, Jingwen and Zhou, Ming and Liu, Wenhao and Wu, Muling and Guo, Zhengkang and Qian, Qi and Wang, Yifei and Zhang, Feiran and Yin, Ruicheng and Dou, Shihan and Lv, Changze and Chen, Tao and Song, Kaitao and Tan, Xu and Gui, Tao and Zheng, Xiaoqing and Huang, Xuanjing},
  title         = {Reward Hacking in the Era of Large Models: Mechanisms, Emergent Misalignment, Challenges},
  year          = {2026},
  eprint        = {2604.13602},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2604.13602},
}
|
||||
|
||||
% The idea's ORIGIN in the alignment community: Mallen lists "Gradient routing to
|
||||
% isolate reward hacking propensity and/or capability, and turn it off outside of
|
||||
% training" -- our thesis, ~1yr before our runs. Fetched + verified via the
|
||||
% (GreaterWrong-backed) lesswrong-graphql skill, 2026-06-04.
|
||||
% Alignment Forum post (not an arXiv preprint), hence howpublished + url only.
@misc{mallen2025rhinterventions,
  author       = {Mallen, Alex},
  title        = {A Quick List of Reward Hacking Interventions},
  howpublished = {Alignment Forum},
  month        = jun,
  year         = {2025},
  url          = {https://www.alignmentforum.org/posts/spZyuEGPzqPhnehyk/a-quick-list-of-reward-hacking-interventions},
}
|
||||
|
||||
% C2 generalization evidence: empirical cross-task generalization of reward
|
||||
% hacking (incl. organic generalization via expert iteration). Redwood/Anthropic.
|
||||
% Full 7-author byline scraped from the GreaterWrong post page (evhub = Evan
|
||||
% Hubinger); S2 does not index AF posts.
|
||||
% Alignment Forum post (not an arXiv preprint), hence howpublished + url only.
% NOTE(review): the post appears to be dated 2024-05-28, so year/month are set to
% May 2024 -- confirm against the post page. The citation key keeps its "2025"
% suffix on purpose so existing \cite commands continue to resolve.
@misc{nishimuragasparian2025rhgeneralize,
  title        = {Reward Hacking Behavior Can Generalize Across Tasks},
  author       = {Nishimura-Gasparian, Kei and Dunn, Isaac and Sleight, Henry and Turpin, Miles and Hubinger, Evan and Denison, Carson and Perez, Ethan},
  year         = {2024},
  month        = may,
  howpublished = {Alignment Forum},
  url          = {https://www.alignmentforum.org/posts/Ge55vxEmKXunFFwoe/reward-hacking-behavior-can-generalize-across-tasks}
}
|
||||
|
||||
Reference in New Issue
Block a user