mirror of
https://github.com/wassname/Brukino_AntiPaSTO_Appetizer.git
synced 2026-06-27 16:58:47 +08:00
human
This commit is contained in:
@@ -63,3 +63,5 @@ The default script uses `Qwen/Qwen2.5-0.5B-Instruct` as it fits comfortably on s
|
|||||||
- [RepEng]() A nice hackable activation steering repo
|
- [RepEng]() A nice hackable activation steering repo
|
||||||
- [AntiPaSTO](https://arxiv.org/pdf/2601.07473) Introducing S space adapters with contrastive pairs
|
- [AntiPaSTO](https://arxiv.org/pdf/2601.07473) Introducing S space adapters with contrastive pairs
|
||||||
- [S steering](https://github.com/wassname/ssteer-eval-aware/blob/main/report/report.md?plain=1) The light version of the above, with no gradient or rotation of the U and V matrices from the SVD of the hidden states
|
- [S steering](https://github.com/wassname/ssteer-eval-aware/blob/main/report/report.md?plain=1) The light version of the above, with no gradient or rotation of the U and V matrices from the SVD of the hidden states
|
||||||
|
- https://en.wikipedia.org/wiki/Frenet%E2%80%93Serret_formulas
|
||||||
|
- https://huggingface.co/Qwen/Qwen3.5-0.8B/blob/main/config.json
|
||||||
|
|||||||
+13
-12
@@ -92,8 +92,6 @@ def compute_curvature(hidden_states):
|
|||||||
Computes Frenet-Serret extrinsic curvature (kappa).
|
Computes Frenet-Serret extrinsic curvature (kappa).
|
||||||
kappa(t) = ||gamma''(t)|| / ||gamma'(t)||^3
|
kappa(t) = ||gamma''(t)|| / ||gamma'(t)||^3
|
||||||
'''
|
'''
|
||||||
if hidden_states.shape[0] < 3:
|
|
||||||
return torch.zeros(hidden_states.shape[0], device=hidden_states.device)
|
|
||||||
|
|
||||||
# Cast to float32 to prevent float16 overflow when cubing
|
# Cast to float32 to prevent float16 overflow when cubing
|
||||||
gamma = hidden_states.to(torch.float32)
|
gamma = hidden_states.to(torch.float32)
|
||||||
@@ -116,15 +114,16 @@ def guided_eval(model, tokenizer, prompt_text, n_think=32, device="cuda", s_spac
|
|||||||
messages,
|
messages,
|
||||||
add_generation_prompt=True,
|
add_generation_prompt=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
return_dict=True
|
return_dict=True,
|
||||||
|
enable_thinking=True
|
||||||
).to(device)
|
).to(device)
|
||||||
|
|
||||||
prompt_ids = inputs["input_ids"]
|
prompt_ids = inputs["input_ids"]
|
||||||
attention_mask = inputs["attention_mask"]
|
attention_mask = inputs["attention_mask"]
|
||||||
|
|
||||||
think_prefix_ids = tokenizer.encode("Thinking Process:\\n", add_special_tokens=False, return_tensors="pt").to(device)
|
# think_prefix_ids = tokenizer.encode("Thinking Process:\\n", add_special_tokens=False, return_tensors="pt").to(device)
|
||||||
prompt_ids = torch.cat([prompt_ids, think_prefix_ids], dim=1)
|
# prompt_ids = torch.cat([prompt_ids, think_prefix_ids], dim=1)
|
||||||
attention_mask = torch.cat([attention_mask, torch.ones_like(think_prefix_ids)], dim=1)
|
# attention_mask = torch.cat([attention_mask, torch.ones_like(think_prefix_ids)], dim=1)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
out = model.generate(
|
out = model.generate(
|
||||||
@@ -165,15 +164,17 @@ def guided_eval(model, tokenizer, prompt_text, n_think=32, device="cuda", s_spac
|
|||||||
if pmass < 0.9:
|
if pmass < 0.9:
|
||||||
top_tokens = tokenizer.decode(torch.topk(log_probs, k=5).indices.tolist())
|
top_tokens = tokenizer.decode(torch.topk(log_probs, k=5).indices.tolist())
|
||||||
print(f"Warning: Low probability mass on Yes/No tokens: {pmass.item():.3f}. Top tokens were {top_tokens}")
|
print(f"Warning: Low probability mass on Yes/No tokens: {pmass.item():.3f}. Top tokens were {top_tokens}")
|
||||||
|
|
||||||
|
|
||||||
|
# Note: the residual stream doesn't change much from layer to layer, but information in it is suppressed in the last few layers (see https://github.com/wassname/eliciting_suppressed_knowledge & https://arxiv.org/abs/2402.10588), so it's normal to choose the layer at 80% or 60% depth for steering and analysis. We hope most of the thinking has been done by then, but it hasn't yet been suppressed in preparation for output.
|
||||||
|
target_layer = int(0.8 * (len(outputs.hidden_states) - 1))
|
||||||
|
print(f"Extracting hidden states from layer {target_layer} for curvature analysis.")
|
||||||
|
|
||||||
final_layer_hiddens = outputs.hidden_states[-1][0]
|
middle_layer_hiddens = outputs.hidden_states[target_layer][0]
|
||||||
start_idx = prompt_ids.shape[1]
|
start_idx = prompt_ids.shape[1]
|
||||||
cot_hiddens = final_layer_hiddens[start_idx : start_idx + generated_ids.shape[0]]
|
cot_hiddens = middle_layer_hiddens[start_idx : start_idx + generated_ids.shape[0]]
|
||||||
|
|
||||||
if s_space_U is not None and s_space_S is not None:
|
trajectory = project_to_s_space(cot_hiddens, s_space_U, s_space_S)
|
||||||
trajectory = project_to_s_space(cot_hiddens, s_space_U, s_space_S)
|
|
||||||
else:
|
|
||||||
trajectory = cot_hiddens
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"logratio": (p_yes - p_no).item(),
|
"logratio": (p_yes - p_no).item(),
|
||||||
|
|||||||
Reference in New Issue
Block a user