From 154e33683e040bf778518658bcc5a082ce18365f Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Thu, 4 Jun 2026 15:20:04 +0800
Subject: [PATCH] paper: HARVE byline cross-verified arXiv==S2 (keyed semantic-search .env)

Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
---
 docs/writeup/refs.bib | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/writeup/refs.bib b/docs/writeup/refs.bib
index 26ab416..7ea8ea0 100644
--- a/docs/writeup/refs.bib
+++ b/docs/writeup/refs.bib
@@ -240,8 +240,9 @@
 % Closest published analogue to our contrastive-SVD extraction, but on the static
 % reward MODEL not the live policy gradient: build a multi-directional hacking
 % subspace from residual-stream diffs of contrastive gold-vs-hacked pairs, project
-% the reward-head vector off it. Authors scraped from arXiv (S2/DBLP not yet
-% indexed as of 2026-06-04); abstract from arXiv. "\model" -> HARVE inlined.
+% the reward-head vector off it. 7-author byline cross-verified arXiv == Semantic
+% Scholar (keyed S2 API via the semantic-search skill .env); abstract from arXiv.
+% "\model" -> HARVE inlined.
 @misc{liu2026harve,
   title = {HARVE: Hacking-Aware Reward-Head Vector Editing for Robust Reward Models},
   author = {Liu, Shuang and Bo, Yuxuan and Zhao, Qiuyang and Huang, Caiyue and Chen, Xiaorong and Liu, Yanguang and Du, Mengnan},