This commit is contained in:
wassname
2026-05-01 22:22:09 +08:00
parent b4a8a0351d
commit 63715bbf99
5 changed files with 132 additions and 43 deletions
+25
View File
@@ -13,6 +13,7 @@ from __future__ import annotations
import os
import sys
import warnings
from pathlib import Path
from typing import Any, Sequence
@@ -23,12 +24,36 @@ from tqdm.auto import tqdm
_CONFIGURED: set[str] = set()
def quiet_external_logs() -> None:
    """Turn down progress bars and advisory warnings from third-party libraries.

    Three things happen, in order:

    1. Environment defaults are installed with ``setdefault``, so a value the
       user already exported is never overwritten.
    2. The ``torch_dtype`` deprecation message is added to the ``warnings``
       ignore filters.
    3. ``datasets`` and ``transformers`` are silenced through their own APIs
       when they can be imported. Any failure in this step is swallowed on
       purpose: this is a best-effort nicety and must not break the caller.
    """
    env_defaults = {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "TRANSFORMERS_NO_ADVISORY_WARNINGS": "1",
        "DATASETS_DISABLE_PROGRESS_BARS": "1",
        "TOKENIZERS_PARALLELISM": "false",
    }
    for key, value in env_defaults.items():
        os.environ.setdefault(key, value)

    warnings.filterwarnings(
        "ignore",
        message="`torch_dtype` is deprecated! Use `dtype` instead!",
    )

    # Optional dependency: skip quietly if it is missing or the call fails.
    try:
        import datasets

        datasets.disable_progress_bars()
    except Exception:
        pass

    # Same best-effort treatment for transformers. The progress-bar switch is
    # looked up dynamically because the original code guards for its absence.
    try:
        from transformers.utils import logging as hf_logging

        hf_logging.set_verbosity_error()
        disable_bar = getattr(hf_logging, "disable_progress_bar", None)
        if disable_bar is not None:
            disable_bar()
    except Exception:
        pass
def setup_logging(name: str, log_dir: str | Path = "logs") -> Path:
"""Configure loguru once per entrypoint name. Returns the verbose log path."""
log_path = Path(log_dir) / f"{name}.verbose.log"
if name in _CONFIGURED:
return log_path
log_path.parent.mkdir(parents=True, exist_ok=True)
quiet_external_logs()
logger.remove()
level = os.environ.get("LOG_LEVEL", "INFO")
+39 -15
View File
@@ -32,11 +32,13 @@ import polars as pl
import torch
from datasets import Dataset, load_dataset
from loguru import logger
from tabulate import tabulate
from torch import Tensor
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding
from ws._tok_extras import chat_template_extras
from ws._log import final_summary, get_argv, setup_logging
from ws.eval.dilemmas import compute_surgical_informedness
from ws.eval.guided_cot import guided_rollout_batch
from ws.steer import weight_steer
@@ -194,9 +196,11 @@ def _load_eval(tok, cfg: AIRiskCfg):
@torch.no_grad()
def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
w: dict[str, Tensor], choice_ids: list[list[int]],
pmass_threshold: float, n_think: int) -> list[dict]:
pmass_threshold: float, n_think: int) -> tuple[list[dict], dict[str, float]]:
rows = []
n_forced, n_total = 0, 0
pmass_vals: list[float] = []
low_pmass_vals: list[bool] = []
for batch in dl:
ids = batch["input_ids"].to(model.device)
mask = batch["attention_mask"].to(model.device)
@@ -211,6 +215,8 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
low_pmass = pmass < pmass_threshold * out["maxp"]
n_forced += int(out["forced_close"].sum())
n_total += len(logratio)
pmass_vals.extend(float(x) for x in pmass.tolist())
low_pmass_vals.extend(bool(x) for x in low_pmass.tolist())
for i in range(len(logratio)):
rows.append({
"idx": int(batch["idx"][i].item()),
@@ -220,10 +226,14 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
"pmass": float(pmass[i].item()),
"low_pmass": bool(low_pmass[i].item()),
})
frac = n_forced / max(n_total, 1)
logger.info(f"alpha={alpha:+.1f}: forced-close {n_forced}/{n_total} "
f"({frac:.0%}); raise n_think if >50%")
return rows
stats = {
"coeff": float(alpha),
"forced_close_frac": n_forced / max(n_total, 1),
"mean_pmass": float(np.mean(pmass_vals)) if pmass_vals else float("nan"),
"frac_low_pmass": float(np.mean(low_pmass_vals)) if low_pmass_vals else float("nan"),
"n_rows": len(rows),
}
return rows, stats
def evaluate(cfg: AIRiskCfg, w: dict[str, Tensor],
@@ -240,7 +250,7 @@ def evaluate(cfg: AIRiskCfg, w: dict[str, Tensor],
tok.pad_token = tok.eos_token
if model is None:
model = AutoModelForCausalLM.from_pretrained(
cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto"
cfg.model_id, dtype=torch.bfloat16, device_map="auto"
)
model.eval()
@@ -251,10 +261,16 @@ def evaluate(cfg: AIRiskCfg, w: dict[str, Tensor],
choice_ids = get_action_choice_ids(tok)
rows = []
stats_rows = []
for alpha in cfg.coeffs:
rows.extend(_eval_at_coeff(model, tok, dl, alpha, w, choice_ids,
cfg.pmass_threshold, cfg.n_think))
logger.info(f"alpha={alpha:+.1f}: {len([r for r in rows if r['coeff']==alpha])} rows")
coeff_rows, stats = _eval_at_coeff(model, tok, dl, alpha, w, choice_ids,
cfg.pmass_threshold, cfg.n_think)
rows.extend(coeff_rows)
stats_rows.append(stats)
logger.info(f"airisk eval: value_class={cfg.value_class} n_rows={len(ds_raw)}")
logger.info("SHOULD: forced_close_frac stays low and mean_pmass stays near 1. ELSE n_think or answer anchor is broken.")
logger.info("\n" + tabulate(stats_rows, headers="keys", tablefmt="tsv", floatfmt="+.3f", showindex=False))
df = pl.DataFrame(rows)
meta = pl.DataFrame([{"idx": int(p["idx"]), "value_label": float(p["value_label"])}
@@ -328,10 +344,10 @@ class _AIRiskCli:
def main():
"""CLI: load w.pt for {behavior}/{adapter}, run AIRisk sweep, save csv."""
import tyro
from tabulate import tabulate
from ws.diff import load_diff
cli = tyro.cli(_AIRiskCli)
setup_logging("airisk")
out_dir = cli.out / cli.behavior / cli.adapter
w = load_diff(out_dir / "w.pt")
cfg = AIRiskCfg(
@@ -343,14 +359,22 @@ def main():
df = evaluate(cfg, w)
df.write_csv(out_dir / f"airisk_{cli.value_class.lower()}_per_row.csv")
summary = summarize(df)
summary_path = out_dir / f"airisk_{cli.value_class.lower()}_summary.csv"
summary.write_csv(summary_path)
metrics = compute_metrics(df)
print(f"\nairisk eval summary (value_class={cli.value_class!r})")
print("SHOULD: mean_logratio_value monotone in coeff (positive coeff -> more value-aligned).")
print("ELSE flat curve = w doesn't transfer to high-stakes AI dilemmas.")
print("SHOULD: mean_logratio_value monotone in coeff; positive coeff should raise value-alignment.")
print(tabulate(summary.to_pandas(), tablefmt="tsv", headers="keys",
floatfmt="+.3f", showindex=False))
summary.write_csv(out_dir / f"airisk_{cli.value_class.lower()}_summary.csv")
metrics = compute_metrics(df)
print(f"\nSI={metrics['surgical_informedness']:.2f} (n={metrics['n_samples']})")
final_summary(
out=summary_path,
argv=get_argv(),
main_metric=f"SI={metrics['surgical_informedness']:+.2f} n={metrics['n_samples']}",
cue="🟢",
table_rows=summary.select("coeff", "mean_logratio_value", "mean_pmass", "frac_low_pmass", "n").rows(),
headers=["coeff", "mean_logratio_value", "mean_pmass", "frac_low_pmass", "n"],
floatfmt="+.3f",
)
if __name__ == "__main__":
+27 -11
View File
@@ -31,6 +31,7 @@ import polars as pl
import torch
from datasets import Dataset, load_dataset
from loguru import logger
from tabulate import tabulate
from torch import Tensor
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding
@@ -146,9 +147,11 @@ def _choice_logp(logits_last: Tensor, choice_ids: list[list[int]]) -> Tensor:
@torch.no_grad()
def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
w: dict[str, Tensor], choice_ids: list[list[int]],
pmass_threshold: float, n_think: int) -> list[dict]:
pmass_threshold: float, n_think: int) -> tuple[list[dict], dict[str, float]]:
rows = []
n_forced, n_total = 0, 0
pmass_vals: list[float] = []
low_pmass_vals: list[bool] = []
for batch in dl:
ids = batch["input_ids"].to(model.device)
mask = batch["attention_mask"].to(model.device)
@@ -161,6 +164,8 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
low_pmass = pmass < pmass_threshold * out["maxp"]
n_forced += int(out["forced_close"].sum())
n_total += len(logratio)
pmass_vals.extend(float(x) for x in pmass.tolist())
low_pmass_vals.extend(bool(x) for x in low_pmass.tolist())
for i in range(len(logratio)):
rows.append({
"idx": int(batch["idx"][i].item()),
@@ -170,10 +175,14 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
"pmass": float(pmass[i].item()),
"low_pmass": bool(low_pmass[i].item()),
})
frac = n_forced / max(n_total, 1)
logger.info(f"alpha={alpha:+.1f}: forced-close {n_forced}/{n_total} "
f"({frac:.0%}); raise n_think if >50%")
return rows
stats = {
"coeff": float(alpha),
"forced_close_frac": n_forced / max(n_total, 1),
"mean_pmass": float(np.mean(pmass_vals)) if pmass_vals else float("nan"),
"frac_low_pmass": float(np.mean(low_pmass_vals)) if low_pmass_vals else float("nan"),
"n_rows": len(rows),
}
return rows, stats
def evaluate(cfg: DilemmasCfg, w: dict[str, Tensor],
@@ -188,7 +197,7 @@ def evaluate(cfg: DilemmasCfg, w: dict[str, Tensor],
tok.pad_token = tok.eos_token
if model is None:
model = AutoModelForCausalLM.from_pretrained(
cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto"
cfg.model_id, dtype=torch.bfloat16, device_map="auto"
)
model.eval()
@@ -201,10 +210,16 @@ def evaluate(cfg: DilemmasCfg, w: dict[str, Tensor],
choice_ids = get_choice_ids(tok)
rows = []
stats_rows = []
for alpha in cfg.coeffs:
rows.extend(_eval_at_coeff(model, tok, dl, alpha, w, choice_ids,
cfg.pmass_threshold, cfg.n_think))
logger.info(f"alpha={alpha:+.1f}: {len([r for r in rows if r['coeff']==alpha])} rows")
coeff_rows, stats = _eval_at_coeff(model, tok, dl, alpha, w, choice_ids,
cfg.pmass_threshold, cfg.n_think)
rows.extend(coeff_rows)
stats_rows.append(stats)
logger.info(f"dilemmas eval: {len(ds_raw)} rows across {cfg.n_dilemmas} dilemmas")
logger.info("SHOULD: forced_close_frac stays low and mean_pmass stays near 1. ELSE n_think or format is broken.")
logger.info("\n" + tabulate(stats_rows, headers="keys", tablefmt="tsv", floatfmt="+.3f", showindex=False))
df = pl.DataFrame(rows)
meta = pl.DataFrame([
@@ -231,7 +246,7 @@ def evaluate_with_baselines(cfg: DilemmasCfg, w: dict[str, Tensor]) -> pl.DataFr
if tok.pad_token is None:
tok.pad_token = tok.eos_token
model = AutoModelForCausalLM.from_pretrained(
cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto"
cfg.model_id, dtype=torch.bfloat16, device_map="auto"
)
model.eval()
@@ -314,7 +329,8 @@ def compute_surgical_informedness(
si_rev = flip_rate - k_fpr * counter_rate
pmass_ratio = min(pmass_pos, pmass_neg) ** 2
si = np.nanmean([si_fwd, si_rev]) * pmass_ratio * 100
si_terms = np.asarray([si_fwd, si_rev], dtype=float)
si = float(np.nan) if np.isnan(si_terms).all() else float(np.nanmean(si_terms) * pmass_ratio * 100)
return {
"surgical_informedness": si,
+3 -3
View File
@@ -70,7 +70,7 @@ def main(cfg: FullDDBenchmarkCfg) -> None:
tok = AutoTokenizer.from_pretrained(cfg.model)
if tok.pad_token is None:
tok.pad_token = tok.eos_token
model = AutoModelForCausalLM.from_pretrained(cfg.model, torch_dtype=torch.bfloat16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(cfg.model, dtype=torch.bfloat16, device_map="auto")
model.eval()
parts = []
@@ -84,7 +84,7 @@ def main(cfg: FullDDBenchmarkCfg) -> None:
for adapter in cfg.adapters:
w_path = cfg.out / cfg.behavior / adapter / DIFF_FILENAME
w = load_diff(w_path)
logger.info(f"adapter={adapter}: evaluating full DD from {w_path}")
logger.info(f"\n=== adapter={adapter} ===")
df = evaluate(dcfg, w, model=model, tok=tok).with_columns(pl.lit(adapter).alias("adapter"))
parts.append(df)
@@ -122,4 +122,4 @@ def main(cfg: FullDDBenchmarkCfg) -> None:
if __name__ == "__main__":
main(tyro.cli(FullDDBenchmarkCfg))
main(tyro.cli(FullDDBenchmarkCfg))
+38 -14
View File
@@ -27,6 +27,9 @@ Qwen3 thinking-mode gotchas:
from __future__ import annotations
from copy import deepcopy
from contextlib import contextmanager
import torch
from torch import Tensor
@@ -42,6 +45,25 @@ THINK_CLOSE = "</think>"
DILEMMAS_ANCHOR = "\n\nMy choice:"
@contextmanager
def _greedy_generation(model):
"""Temporarily sanitize model generation config for greedy eval."""
old_cfg = deepcopy(model.generation_config)
try:
model.generation_config.do_sample = False
if hasattr(model.generation_config, "temperature"):
model.generation_config.temperature = 1.0
if hasattr(model.generation_config, "top_p"):
model.generation_config.top_p = 1.0
if hasattr(model.generation_config, "top_k"):
model.generation_config.top_k = 50
if hasattr(model.generation_config, "min_p"):
model.generation_config.min_p = None
yield
finally:
model.generation_config = old_cfg
@torch.no_grad()
def guided_cot_one(
model,
@@ -68,12 +90,13 @@ def guided_cot_one(
"this eval assumes a thinking-mode chat template")
with weight_steer(model, w, alpha):
gen = model.generate(
prefix_ids,
max_new_tokens=n_think,
do_sample=False,
pad_token_id=tok.pad_token_id or tok.eos_token_id,
)
with _greedy_generation(model):
gen = model.generate(
prefix_ids,
max_new_tokens=n_think,
do_sample=False,
pad_token_id=tok.pad_token_id or tok.eos_token_id,
)
gen_new = gen[0, prefix_ids.shape[1]:]
already_closed = (gen_new == think_close_id).any().item()
pre_ids = tok(PRE_CLOSE, return_tensors="pt",
@@ -157,14 +180,15 @@ def guided_rollout_batch(
with weight_steer(model, w, alpha):
# Phase 1: batched greedy think under steering.
gen = model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=n_think,
do_sample=False,
eos_token_id=think_close_id,
pad_token_id=pad_id,
)
with _greedy_generation(model):
gen = model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=n_think,
do_sample=False,
eos_token_id=think_close_id,
pad_token_id=pad_id,
)
gen_new = gen[:, L_pad:] # [B, g], right-padded with pad_id post-eos
# Phase 2: per-sample slice + suffix build.