Files
evil_MoE/docs/writeup/refs.bib
T
wassname 923de6dbe6 docs(writeup): NeurIPS-workshop paper skeleton + tectonic compile recipe
Minimal LaTeX skeleton: outline + evidence tables (route2 n=3 deploy numbers
filled with provenance, vanilla pending jobs 74/84) + figures + verified refs
+ appendix (4-mode traces, 6/6/6/6 partition counts, pseudocode). Build
artifacts and figs symlinks gitignored. `just paper` compiles via tectonic;
`just paper-qc` dumps text + greps for unresolved refs / TODOs.

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-02 06:59:15 +00:00

73 lines
3.2 KiB
BibTeX

% Bibliography for the gradient-routing-vs-reward-hacking writeup.
% Every field below is either grounded in the repo's local paper copies
% (docs/papers/*) or web-verified 2026-06-02. Unverifiable fields carry an
% explicit TODO -- do not fill from memory.
% Web-verified 2026-06-02 (arxiv.org/abs/2410.04332 + dblp). README also cites it.
% arXiv preprint (cs.LG); cite key: cloud2024gradientrouting.
@misc{cloud2024gradientrouting,
  author        = {Cloud, Alex and Goldman-Wetzler, Jacob and Wybitul, Ev{\v{z}}en and Miller, Joseph and Turner, Alexander Matt},
  title         = {Gradient Routing: Masking Gradients to Localize Computation in Neural Networks},
  year          = {2024},
  eprint        = {2410.04332},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2410.04332},
}
% The substrate. Grounded in docs/papers/2025_lw_ariahw_steering-...md header.
% Byline is the LessWrong handle "Ariahw"; advised by Neel Nanda and Josh Engels
% (MATS 9.0). TODO: real-name attribution for the handle before submission.
% Blog post cited by LessWrong handle; the double braces keep the handle as a
% single literal name. {RL} is brace-protected so that styles which
% sentence-case titles do not lowercase the acronym.
@misc{ariahw2025steering,
  author       = {{Ariahw}},
  title        = {Steering {RL} Training: Benchmarking Interventions Against Reward Hacking},
  howpublished = {LessWrong},
  year         = {2025},
  month        = dec,
  url          = {https://www.lesswrong.com/posts/R5MdWGKsuvdPwGFBG/steering-rl-training-benchmarking-interventions-against},
}
% GRPO. Full author list and arXiv id are taken from the Ariahw post's
% bibliography (ref 10) and the Wu-Tang paper's bibliography.
% {DeepSeekMath} is brace-protected so that styles which sentence-case titles
% do not flatten the camelCase name. The url is derived from the eprint id.
% NOTE(review): confirm the author list against the arXiv abstract page before
% submission; later arXiv versions may list additional authors.
@misc{shao2024deepseekmath,
  author        = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Zhang, Mingchuan and Li, Y. K. and Wu, Y. and Guo, Daya},
  title         = {{DeepSeekMath}: Pushing the Limits of Mathematical Reasoning in Open Language Models},
  year          = {2024},
  eprint        = {2402.03300},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.03300},
}
% The advantage-level baseline. Grounded in docs/papers/2026_wu-tang_...md header
% (authors Rui Wu & Ruixiang Tang, Rutgers; arXiv:2604.01476). Method in the
% paper is "representation-informed advantage modulation".
% arXiv preprint (cs.LG); cite key: wu2026rebound.
@misc{wu2026rebound,
  author        = {Wu, Rui and Tang, Ruixiang},
  title         = {When Reward Hacking Rebounds: Understanding and Mitigating It with Representation-Level Signals},
  year          = {2026},
  eprint        = {2604.01476},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2604.01476},
}
% Diff-of-means activation direction (the act-erase control in tt_erase_bench).
% Web-verified 2026-06-02 (arxiv.org/abs/2406.11717, NeurIPS 2024).
% Cited as the NeurIPS 2024 publication rather than as a bare preprint; the
% arXiv identifiers are retained for lookup.
% NOTE(review): volume/pages are intentionally not filled here -- add them only
% after verifying against the proceedings page.
@inproceedings{arditi2024refusal,
  author        = {Arditi, Andy and Obeso, Oscar and Syed, Aaquib and Paleka, Daniel and Panickssery, Nina and Gurnee, Wes and Nanda, Neel},
  title         = {Refusal in Language Models Is Mediated by a Single Direction},
  booktitle     = {Advances in Neural Information Processing Systems},
  year          = {2024},
  eprint        = {2406.11717},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2406.11717},
}
% The prior paired-preference SVD-basis steering work this builds on.
% TODO: no verifiable citation on hand. Fill title/venue/url/year before use,
% or drop. Do NOT invent fields.
% Placeholder entry: author and year hold literal TODO markers and the note
% flags the entry as unverified. Replace all three (or delete the entry).
@misc{antipasto,
  author = {TODO},
  title  = {AntiPaSTO},
  year   = {TODO},
  note   = {UNVERIFIED -- fill or remove before submission},
}