mirror of
https://github.com/wassname/evil_MoE.git
synced 2026-06-28 11:20:17 +08:00
misc
This commit is contained in:
@@ -107,7 +107,7 @@ def main(cfg: Config) -> int:
|
||||
# Assuming E[cos|clean]=0: mean(cos_pre) = f_h * E[cos|hacked] + (1-f_h)*0
|
||||
# => E[cos|hacked] = mean(cos_pre) / f_h. NaN when f_h = 0, i.e. no hacks in batch
|
||||
# (no per-hacked estimate possible from this step).
|
||||
# FIXME: cos_pre is now the hack-ward FRACTION ||relu(V@g)||/||g|| >= 0
|
||||
# FIXME: cos_pre is now the aligned fraction ||relu(V@g)||/||g|| >= 0
|
||||
# (was signed sum, ~0 on clean). With relu the E[cos|clean]=0 premise
|
||||
# no longer holds, so this f_h-weighted estimate over-counts. Recompute
|
||||
# per-rollout cos restricted to hacked rollouts instead of decomposing.
|
||||
|
||||
Reference in New Issue
Block a user