mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 17:48:43 +08:00
bd7550f559
Formatting pass lifted from the AntiPaSTO paper (the format the author is
happy with):
- verbatim -> lstlisting (framed, shaded, Python-highlighted code blocks;
chat-template prompt uses language={} so markup isn't keyword-coloured)
- xcolor[table] + \rowcolor highlight on the 'ours' rows (keynote, ablation)
- ablation table restructured as leave-one-out with the negate symbol
(negate-routing/directional/hack-pairs/intervention); long interpretation
moved out of the caption into section body; post-hoc split into its own block
- real AntiPaSTO citation (Clark 2026, arXiv:2601.07473) replacing the
UNVERIFIED placeholder; dropped the verify-before-submission TODO
- code-availability line with a GitHub glyph (anonymous placeholder)
Builds clean: 11 pages, no unresolved refs/cites.
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
161 lines
7.6 KiB
BibTeX
161 lines
7.6 KiB
BibTeX
% Bibliography for the gradient-routing-vs-reward-hacking writeup.
% Every field below is either grounded in the repo's local paper copies
% (docs/papers/*) or web-verified 2026-06-02. Unverifiable fields carry an
% explicit TODO -- do not fill from memory.

% Web-verified 2026-06-02 (arxiv.org/abs/2410.04332 + dblp). README also cites it.
@misc{cloud2024gradientrouting,
  title         = {Gradient Routing: Masking Gradients to Localize Computation in Neural Networks},
  author        = {Cloud, Alex and Goldman-Wetzler, Jacob and Wybitul, Ev{\v{z}}en and Miller, Joseph and Turner, Alexander Matt},
  year          = {2024},
  eprint        = {2410.04332},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2410.04332},
}

% The substrate. Grounded in docs/papers/2025_lw_ariahw_steering-...md header.
% Byline is the LessWrong handle "Ariahw"; advised by Neel Nanda and Josh Engels
% (MATS 9.0). TODO: real-name attribution for the handle before submission.
@misc{ariahw2025steering,
  title        = {Steering {RL} Training: Benchmarking Interventions Against Reward Hacking},
  author       = {{Ariahw}},
  year         = {2025},
  howpublished = {LessWrong},
  month        = dec,
  url          = {https://www.lesswrong.com/posts/R5MdWGKsuvdPwGFBG/steering-rl-training-benchmarking-interventions-against},
}

% GRPO. Full author list + id from the Ariahw post bib (ref 10) and Wu-Tang bib.
% TODO(review): the arXiv byline for 2402.03300 appears to list two further
% authors (Xiao Bi, Haowei Zhang) between Junxiao Song and Mingchuan Zhang --
% confirm against arxiv.org/abs/2402.03300 before submission; not filled here.
@misc{shao2024deepseekmath,
  title         = {{DeepSeekMath}: Pushing the Limits of Mathematical Reasoning in Open Language Models},
  author        = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Zhang, Mingchuan and Li, Y. K. and Wu, Y. and Guo, Daya},
  year          = {2024},
  eprint        = {2402.03300},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.03300},
}

% The advantage-level baseline. Grounded in docs/papers/2026_wu-tang_...md header
% (authors Rui Wu & Ruixiang Tang, Rutgers; arXiv:2604.01476). Method in the
% paper is "representation-informed advantage modulation".
@misc{wu2026rebound,
  title         = {When Reward Hacking Rebounds: Understanding and Mitigating It with Representation-Level Signals},
  author        = {Wu, Rui and Tang, Ruixiang},
  year          = {2026},
  eprint        = {2604.01476},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2604.01476},
}

% Diff-of-means activation direction (the act-erase control in tt_erase_bench).
% Web-verified 2026-06-02 (arxiv.org/abs/2406.11717, NeurIPS 2024).
@misc{arditi2024refusal,
  title         = {Refusal in Language Models Is Mediated by a Single Direction},
  author        = {Arditi, Andy and Obeso, Oscar and Syed, Aaquib and Paleka, Daniel and Panickssery, Nina and Gurnee, Wes and Nanda, Neel},
  year          = {2024},
  eprint        = {2406.11717},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2406.11717},
}

% The prior SVD-basis steering work this builds on (same author; the per-Linear
% delta_S adapter originates here). arXiv id supplied by the author 2026-06-03.
@misc{antipasto,
  title         = {{AntiPaSTO}: Self-Supervised Honesty Steering via Anti-Parallel Representations},
  author        = {Clark, Michael J.},
  year          = {2026},
  eprint        = {2601.07473},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2601.07473},
}

% --- gradient-routing / projection related work --------------------------
% All three below verified against full-text local copies in
% docs/grad_routing/ (title + arXiv id + url read from the file headers,
% 2026-05-31). Author fields filled only where the byline was read.

% THE NEAR-TWIN: singular directions of param-updates + project gradients ONTO
% a clean reference subspace (we subtract a hack subspace instead). Byline read
% from docs/grad_routing/paper_deng_trusted_direction.md; note the curated
% related_work.md calls it "TDGA / Deng" -- TODO reconcile the lead-author label.
@misc{huang2026directional,
  title         = {Directional Alignment Mitigates Reward Hacking in Reinforcement Learning for Language Models},
  author        = {Huang, Jiaji and Ozkara, Kaan and Li, Yushu and Thrampoulidis, Christos and Li, Xiaoxiao and Park, Youngsuk},
  year          = {2026},
  eprint        = {2605.25189},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2605.25189},
}

% Parameter-gradient zero-mask routing (Selective Gradient Masking, SGTM),
% tolerant to label noise; measures leakage and shows it shrinks with scale.
% Title + author byline web-verified 2026-06-02 (arxiv.org/abs/2512.05648).
@misc{sgtm2025localization,
  title         = {Beyond Data Filtering: Knowledge Localization for Capability Removal in {LLMs}},
  author        = {Shilov, Igor and Cloud, Alex and Gema, Aryo Pradipta and Goldman-Wetzler, Jacob and Panickssery, Nina and Sleight, Henry and Jones, Erik and Anil, Cem},
  year          = {2025},
  eprint        = {2512.05648},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2512.05648},
}

% Reward-for-confession honesty (we reject this design: invites Baker
% obfuscation + a live judge over student rollouts). Byline read from header.
@misc{joglekar2025confessions,
  title         = {Training {LLMs} for Honesty via Confessions},
  author        = {Joglekar, Manas and Chen, Jeremy and Wu, Gabriel and Yosinski, Jason and Wang, Jasmine and Barak, Boaz and Glaese, Amelia},
  year          = {2025},
  eprint        = {2512.08093},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2512.08093},
}

% --- abstract-only "closest twins" (NOT full-text verified) --------------
% IDs from docs/grad_routing/{related_work,search_for_more}.md. Authors NOT
% filled (not read) -- do not cite as a misc entry with invented authors. Verify
% byline from arXiv before promoting any of these into the bibliography:
% GRIFT (gradient fingerprints to detect/reject hacking) arXiv:2604.16242
% Spilling the Beans (SFT self-report generalises OOD) arXiv:2511.06626
% Baker et al. (weak monitor -> obfuscated reward hacking) arXiv:2503.11926

% --- parameter-isolation / weight-subspace lineage ---------------------
% Added 2026-06-03 in response to an OpenReview novelty challenge against
% gradient routing (PackNet/Piggyback/LoRA as "similar"). We cite them as the
% classical "confine a capability to a weight subset" precedent; we differ by
% REMOVING (not adding) a capability and assigning the subset from a gradient
% signal (not a task label). Bylines verified via arXiv/CVF.
@inproceedings{mallya2018packnet,
  title         = {{PackNet}: Adding Multiple Tasks to a Single Network by Iterative Pruning},
  author        = {Mallya, Arun and Lazebnik, Svetlana},
  booktitle     = {CVPR},
  year          = {2018},
  eprint        = {1711.05769},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1711.05769},
}
|
|
@inproceedings{mallya2018piggyback,
  title         = {Piggyback: Adapting a Single Network to Multiple Tasks by Learning to Mask Weights},
  author        = {Mallya, Arun and Davis, Dillon and Lazebnik, Svetlana},
  booktitle     = {ECCV},
  year          = {2018},
  eprint        = {1801.06519},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1801.06519},
}
|
|
@inproceedings{hu2021lora,
  title         = {{LoRA}: Low-Rank Adaptation of Large Language Models},
  author        = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  booktitle     = {ICLR},
  year          = {2022},
  eprint        = {2106.09685},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2106.09685},
}
|