mirror of
https://github.com/wassname/steer-heal-love.git
synced 2026-06-27 15:32:28 +08:00
7db5a56cb1
docs/writeup/paper.qmd (2pp NeurIPS), references.bib, neurips_2023.sty, the quarto _extensions. justfile gains `paper` (latex) and `paper-html` (no latex) recipes. gitignore the generated paper.pdf/paper.tex and the transient .claude/. Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
112 lines
4.8 KiB
BibTeX
112 lines
4.8 KiB
BibTeX
% references.bib for the steer_heal writeup.
% All entries verified against arXiv abstract pages or publisher records (2026-06).
% Provenance is noted per entry. Anything I could not confirm is marked "% TODO verify".
|
|
|
|
% --- The paper we build on (steering-vector distillation backbone) ---
|
|
% Verified: arxiv.org/abs/2606.00995 title + author list.
% arXiv preprint: eprint/archiveprefix/primaryclass carry the identifier; url is the abstract page.
@misc{blank2026subliminal,
  title         = {Subliminal Learning Is Steering Vector Distillation},
  author        = {Blank, Camila and Bhatia, Agam and Rajamanoharan, Senthooran and Conmy, Arthur and Nanda, Neel},
  year          = {2026},
  eprint        = {2606.00995},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2606.00995},
}
|
|
|
|
% --- Activation steering / steering vectors ---
|
|
% Verified: arxiv.org/abs/2308.10248 (ActAdd).
% arXiv preprint: eprint/archiveprefix/primaryclass carry the identifier; url is the abstract page.
@misc{turner2023activation,
  title         = {Steering Language Models With Activation Engineering},
  author        = {Turner, Alexander Matt and Thiergart, Lisa and Leech, Gavin and Udell, David and Vazquez, Juan J. and Mini, Ulisse and MacDiarmid, Monte},
  year          = {2023},
  eprint        = {2308.10248},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2308.10248},
}
|
|
|
|
% Verified: arxiv.org/abs/2312.06681 (CAA). First author now listed as Nina Panickssery (formerly Rimsky).
% {Llama} is braced so sentence-casing bibliography styles do not lowercase the model name.
@misc{panickssery2023caa,
  title         = {Steering {Llama} 2 via Contrastive Activation Addition},
  author        = {Panickssery, Nina and Gabrieli, Nick and Schulz, Julian and Tong, Meg and Hubinger, Evan and Turner, Alexander Matt},
  year          = {2023},
  eprint        = {2312.06681},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2312.06681},
}
|
|
|
|
% --- Representation engineering ---
|
|
% Verified: arxiv.org/abs/2310.01405.
% {AI} is braced so sentence-casing bibliography styles do not render it as "ai".
@misc{zou2023representation,
  title         = {Representation Engineering: A Top-Down Approach to {AI} Transparency},
  author        = {Zou, Andy and Phan, Long and Chen, Sarah and Campbell, James and Guo, Phillip and Ren, Richard and Pan, Alexander and Yin, Xuwang and Mazeika, Mantas and Dombrowski, Ann-Kathrin and Goel, Shashwat and Li, Nathaniel and Byun, Michael J. and Wang, Zifan and Mallen, Alex and Basart, Steven and Koyejo, Sanmi and Song, Dawn and Fredrikson, Matt and Kolter, J. Zico and Hendrycks, Dan},
  year          = {2023},
  eprint        = {2310.01405},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2310.01405},
}
|
|
|
|
% --- LoRA + PiSSA lineage ---
|
|
% Verified: arxiv.org/abs/2106.09685.
% {LoRA} is braced so sentence-casing bibliography styles keep the mixed-case acronym intact.
@misc{hu2021lora,
  title         = {{LoRA}: Low-Rank Adaptation of Large Language Models},
  author        = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  year          = {2021},
  eprint        = {2106.09685},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2106.09685},
}
|
|
|
|
% Verified: arxiv.org/abs/2404.02948 (PiSSA).
% {PiSSA} is braced so sentence-casing bibliography styles keep the mixed-case acronym intact.
@misc{meng2024pissa,
  title         = {{PiSSA}: Principal Singular Values and Singular Vectors Adaptation of Large Language Models},
  author        = {Meng, Fanxu and Wang, Zhaohui and Zhang, Muhan},
  year          = {2024},
  eprint        = {2404.02948},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2404.02948},
}
|
|
|
|
% --- Weight steering (positioning / related) ---
|
|
% Verified: arxiv.org/abs/2511.05408.
% arXiv preprint: eprint/archiveprefix/primaryclass carry the identifier; url is the abstract page.
@misc{fierro2025weight,
  title         = {Steering Language Models with Weight Arithmetic},
  author        = {Fierro, Constanza and Roger, Fabien},
  year          = {2025},
  eprint        = {2511.05408},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2511.05408},
}
|
|
|
|
% --- Moral foundations theory (the eval axis) ---
|
|
% Verified: Journal of Personality and Social Psychology 96(5), 1029-1046.
% Title is stored in Title Case like the other entries; sentence-casing styles downcase it on output.
@article{graham2009liberals,
  title     = {Liberals and Conservatives Rely on Different Sets of Moral Foundations},
  author    = {Graham, Jesse and Haidt, Jonathan and Nosek, Brian A.},
  journal   = {Journal of Personality and Social Psychology},
  volume    = {96},
  number    = {5},
  pages     = {1029--1046},
  year      = {2009},
  publisher = {American Psychological Association},
  doi       = {10.1037/a0015141},
}
|
|
|
|
% Verified: Advances in Experimental Social Psychology 47, 55-130.
% Chapter in a serial volume: booktitle is the series volume title, doi is stored bare (no resolver prefix).
@incollection{graham2013mft,
  title     = {Moral Foundations Theory: The Pragmatic Validity of Moral Pluralism},
  author    = {Graham, Jesse and Haidt, Jonathan and Koleva, Sena and Motyl, Matt and Iyer, Ravi and Wojcik, Sean P. and Ditto, Peter H.},
  booktitle = {Advances in Experimental Social Psychology},
  volume    = {47},
  pages     = {55--130},
  year      = {2013},
  publisher = {Academic Press},
  doi       = {10.1016/B978-0-12-407236-7.00002-4},
}
|