mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-27 16:30:30 +08:00
paper: HARVE byline cross-verified arXiv==S2 (keyed semantic-search .env)
Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
This commit is contained in:
@@ -240,8 +240,9 @@
|
||||
% Closest published analogue to our contrastive-SVD extraction, but on the static
|
||||
% reward MODEL not the live policy gradient: build a multi-directional hacking
|
||||
% subspace from residual-stream diffs of contrastive gold-vs-hacked pairs, project
|
||||
% the reward-head vector off it. Authors scraped from arXiv (S2/DBLP not yet
|
||||
% indexed as of 2026-06-04); abstract from arXiv. "\model" -> HARVE inlined.
|
||||
% the reward-head vector off it. 7-author byline cross-verified arXiv == Semantic
|
||||
% Scholar (keyed S2 API via the semantic-search skill .env); abstract from arXiv.
|
||||
% "\model" -> HARVE inlined.
|
||||
@misc{liu2026harve,
|
||||
title = {HARVE: Hacking-Aware Reward-Head Vector Editing for Robust Reward Models},
|
||||
author = {Liu, Shuang and Bo, Yuxuan and Zhao, Qiuyang and Huang, Caiyue and Chen, Xiaorong and Liu, Yanguang and Du, Mengnan},
|
||||
|
||||
Reference in New Issue
Block a user