diff --git a/docs/writeup/refs.bib b/docs/writeup/refs.bib index 26ab416..7ea8ea0 100644 --- a/docs/writeup/refs.bib +++ b/docs/writeup/refs.bib @@ -240,8 +240,9 @@ % Closest published analogue to our contrastive-SVD extraction, but on the static % reward MODEL not the live policy gradient: build a multi-directional hacking % subspace from residual-stream diffs of contrastive gold-vs-hacked pairs, project -% the reward-head vector off it. Authors scraped from arXiv (S2/DBLP not yet -% indexed as of 2026-06-04); abstract from arXiv. "\model" -> HARVE inlined. +% the reward-head vector off it. 7-author byline cross-verified arXiv == Semantic +% Scholar (keyed S2 API via the semantic-search skill .env); abstract from arXiv. +% "\model" -> HARVE inlined. @misc{liu2026harve, title = {HARVE: Hacking-Aware Reward-Head Vector Editing for Robust Reward Models}, author = {Liu, Shuang and Bo, Yuxuan and Zhao, Qiuyang and Huang, Caiyue and Chen, Xiaorong and Liu, Yanguang and Du, Mengnan},