diff --git a/src/ws/_log.py b/src/ws/_log.py index 8bb4eae..c984235 100644 --- a/src/ws/_log.py +++ b/src/ws/_log.py @@ -13,6 +13,7 @@ from __future__ import annotations import os import sys +import warnings from pathlib import Path from typing import Any, Sequence @@ -23,12 +24,36 @@ from tqdm.auto import tqdm _CONFIGURED: set[str] = set() +def quiet_external_logs() -> None: + """Suppress third-party progress bars and advisory warnings on stdout.""" + os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1") + os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1") + os.environ.setdefault("DATASETS_DISABLE_PROGRESS_BARS", "1") + os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") + warnings.filterwarnings("ignore", message="`torch_dtype` is deprecated! Use `dtype` instead!") + try: + import datasets + + datasets.disable_progress_bars() + except Exception: + pass + try: + from transformers.utils import logging as hf_logging + + hf_logging.set_verbosity_error() + if hasattr(hf_logging, "disable_progress_bar"): + hf_logging.disable_progress_bar() + except Exception: + pass + + def setup_logging(name: str, log_dir: str | Path = "logs") -> Path: """Configure loguru once per entrypoint name. Returns the verbose log path.""" log_path = Path(log_dir) / f"{name}.verbose.log" if name in _CONFIGURED: return log_path log_path.parent.mkdir(parents=True, exist_ok=True) + quiet_external_logs() logger.remove() level = os.environ.get("LOG_LEVEL", "INFO") diff --git a/src/ws/eval/airisk.py b/src/ws/eval/airisk.py index c9f7051..79242e9 100644 --- a/src/ws/eval/airisk.py +++ b/src/ws/eval/airisk.py @@ -32,11 +32,13 @@ import polars as pl import torch from datasets import Dataset, load_dataset from loguru import logger +from tabulate import tabulate from torch import Tensor from torch.utils.data import DataLoader from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding from ws._tok_extras import chat_template_extras +from ws._log import final_summary, get_argv, setup_logging from ws.eval.dilemmas import compute_surgical_informedness from ws.eval.guided_cot import guided_rollout_batch from ws.steer import weight_steer @@ -194,9 +196,11 @@ def _load_eval(tok, cfg: AIRiskCfg): @torch.no_grad() def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float, w: dict[str, Tensor], choice_ids: list[list[int]], - pmass_threshold: float, n_think: int) -> list[dict]: + pmass_threshold: float, n_think: int) -> tuple[list[dict], dict[str, float]]: rows = [] n_forced, n_total = 0, 0 + pmass_vals: list[float] = [] + low_pmass_vals: list[bool] = [] for batch in dl: ids = batch["input_ids"].to(model.device) mask = batch["attention_mask"].to(model.device) @@ -211,6 +215,8 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float, low_pmass = pmass < pmass_threshold * out["maxp"] n_forced += int(out["forced_close"].sum()) n_total += len(logratio) + pmass_vals.extend(float(x) for x in pmass.tolist()) + low_pmass_vals.extend(bool(x) for x in low_pmass.tolist()) for i in range(len(logratio)): rows.append({ "idx": int(batch["idx"][i].item()), @@ -220,10 +226,14 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float, "pmass": float(pmass[i].item()), "low_pmass": bool(low_pmass[i].item()), }) - frac = n_forced / max(n_total, 1) - logger.info(f"alpha={alpha:+.1f}: forced-close {n_forced}/{n_total} " - f"({frac:.0%}); raise n_think if >50%") - return rows + stats = { + "coeff": float(alpha), + "forced_close_frac": n_forced / max(n_total, 1), + "mean_pmass": float(np.mean(pmass_vals)) if pmass_vals else float("nan"), + "frac_low_pmass": float(np.mean(low_pmass_vals)) if low_pmass_vals else float("nan"), + "n_rows": len(rows), + } + return rows, stats def evaluate(cfg: AIRiskCfg, w: dict[str, Tensor], @@ -240,7 +250,7 @@ def evaluate(cfg: AIRiskCfg, w: dict[str, Tensor], tok.pad_token = tok.eos_token if model is None: model = AutoModelForCausalLM.from_pretrained( - cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto" + cfg.model_id, dtype=torch.bfloat16, device_map="auto" ) model.eval() @@ -251,10 +261,16 @@ def evaluate(cfg: AIRiskCfg, w: dict[str, Tensor], choice_ids = get_action_choice_ids(tok) rows = [] + stats_rows = [] for alpha in cfg.coeffs: - rows.extend(_eval_at_coeff(model, tok, dl, alpha, w, choice_ids, - cfg.pmass_threshold, cfg.n_think)) - logger.info(f"alpha={alpha:+.1f}: {len([r for r in rows if r['coeff']==alpha])} rows") + coeff_rows, stats = _eval_at_coeff(model, tok, dl, alpha, w, choice_ids, + cfg.pmass_threshold, cfg.n_think) + rows.extend(coeff_rows) + stats_rows.append(stats) + + logger.info(f"airisk eval: value_class={cfg.value_class} n_rows={len(ds_raw)}") + logger.info("SHOULD: forced_close_frac stays low and mean_pmass stays near 1. ELSE n_think or answer anchor is broken.") + logger.info("\n" + tabulate(stats_rows, headers="keys", tablefmt="tsv", floatfmt="+.3f", showindex=False)) df = pl.DataFrame(rows) meta = pl.DataFrame([{"idx": int(p["idx"]), "value_label": float(p["value_label"])} @@ -328,10 +344,10 @@ class _AIRiskCli: def main(): """CLI: load w.pt for {behavior}/{adapter}, run AIRisk sweep, save csv.""" import tyro - from tabulate import tabulate from ws.diff import load_diff cli = tyro.cli(_AIRiskCli) + setup_logging("airisk") out_dir = cli.out / cli.behavior / cli.adapter w = load_diff(out_dir / "w.pt") cfg = AIRiskCfg( @@ -343,14 +359,22 @@ def main(): df = evaluate(cfg, w) df.write_csv(out_dir / f"airisk_{cli.value_class.lower()}_per_row.csv") summary = summarize(df) + summary_path = out_dir / f"airisk_{cli.value_class.lower()}_summary.csv" + summary.write_csv(summary_path) + metrics = compute_metrics(df) print(f"\nairisk eval summary (value_class={cli.value_class!r})") - print("SHOULD: mean_logratio_value monotone in coeff (positive coeff -> more value-aligned).") - print("ELSE flat curve = w doesn't transfer to high-stakes AI dilemmas.") + print("SHOULD: mean_logratio_value monotone in coeff; positive coeff should raise value-alignment.") print(tabulate(summary.to_pandas(), tablefmt="tsv", headers="keys", floatfmt="+.3f", showindex=False)) - summary.write_csv(out_dir / f"airisk_{cli.value_class.lower()}_summary.csv") - metrics = compute_metrics(df) - print(f"\nSI={metrics['surgical_informedness']:.2f} (n={metrics['n_samples']})") + final_summary( + out=summary_path, + argv=get_argv(), + main_metric=f"SI={metrics['surgical_informedness']:+.2f} n={metrics['n_samples']}", + cue="🟢", + table_rows=summary.select("coeff", "mean_logratio_value", "mean_pmass", "frac_low_pmass", "n").rows(), + headers=["coeff", "mean_logratio_value", "mean_pmass", "frac_low_pmass", "n"], + floatfmt="+.3f", + ) if __name__ == "__main__": diff --git a/src/ws/eval/dilemmas.py b/src/ws/eval/dilemmas.py index fd1b587..439ac26 100644 --- a/src/ws/eval/dilemmas.py +++ b/src/ws/eval/dilemmas.py @@ -31,6 +31,7 @@ import polars as pl import torch from datasets import Dataset, load_dataset from loguru import logger +from tabulate import tabulate from torch import Tensor from torch.utils.data import DataLoader from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding @@ -146,9 +147,11 @@ def _choice_logp(logits_last: Tensor, choice_ids: list[list[int]]) -> Tensor: @torch.no_grad() def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float, w: dict[str, Tensor], choice_ids: list[list[int]], - pmass_threshold: float, n_think: int) -> list[dict]: + pmass_threshold: float, n_think: int) -> tuple[list[dict], dict[str, float]]: rows = [] n_forced, n_total = 0, 0 + pmass_vals: list[float] = [] + low_pmass_vals: list[bool] = [] for batch in dl: ids = batch["input_ids"].to(model.device) mask = batch["attention_mask"].to(model.device) @@ -161,6 +164,8 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float, low_pmass = pmass < pmass_threshold * out["maxp"] n_forced += int(out["forced_close"].sum()) n_total += len(logratio) + pmass_vals.extend(float(x) for x in pmass.tolist()) + low_pmass_vals.extend(bool(x) for x in low_pmass.tolist()) for i in range(len(logratio)): rows.append({ "idx": int(batch["idx"][i].item()), @@ -170,10 +175,14 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float, "pmass": float(pmass[i].item()), "low_pmass": bool(low_pmass[i].item()), }) - frac = n_forced / max(n_total, 1) - logger.info(f"alpha={alpha:+.1f}: forced-close {n_forced}/{n_total} " - f"({frac:.0%}); raise n_think if >50%") - return rows + stats = { + "coeff": float(alpha), + "forced_close_frac": n_forced / max(n_total, 1), + "mean_pmass": float(np.mean(pmass_vals)) if pmass_vals else float("nan"), + "frac_low_pmass": float(np.mean(low_pmass_vals)) if low_pmass_vals else float("nan"), + "n_rows": len(rows), + } + return rows, stats def evaluate(cfg: DilemmasCfg, w: dict[str, Tensor], @@ -188,7 +197,7 @@ def evaluate(cfg: DilemmasCfg, w: dict[str, Tensor], tok.pad_token = tok.eos_token if model is None: model = AutoModelForCausalLM.from_pretrained( - cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto" + cfg.model_id, dtype=torch.bfloat16, device_map="auto" ) model.eval() @@ -201,10 +210,16 @@ def evaluate(cfg: DilemmasCfg, w: dict[str, Tensor], choice_ids = get_choice_ids(tok) rows = [] + stats_rows = [] for alpha in cfg.coeffs: - rows.extend(_eval_at_coeff(model, tok, dl, alpha, w, choice_ids, - cfg.pmass_threshold, cfg.n_think)) - logger.info(f"alpha={alpha:+.1f}: {len([r for r in rows if r['coeff']==alpha])} rows") + coeff_rows, stats = _eval_at_coeff(model, tok, dl, alpha, w, choice_ids, + cfg.pmass_threshold, cfg.n_think) + rows.extend(coeff_rows) + stats_rows.append(stats) + + logger.info(f"dilemmas eval: {len(ds_raw)} rows across {cfg.n_dilemmas} dilemmas") + logger.info("SHOULD: forced_close_frac stays low and mean_pmass stays near 1. ELSE n_think or format is broken.") + logger.info("\n" + tabulate(stats_rows, headers="keys", tablefmt="tsv", floatfmt="+.3f", showindex=False)) df = pl.DataFrame(rows) meta = pl.DataFrame([ @@ -231,7 +246,7 @@ def evaluate_with_baselines(cfg: DilemmasCfg, w: dict[str, Tensor]) -> pl.DataFr if tok.pad_token is None: tok.pad_token = tok.eos_token model = AutoModelForCausalLM.from_pretrained( - cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto" + cfg.model_id, dtype=torch.bfloat16, device_map="auto" ) model.eval() @@ -314,7 +329,8 @@ def compute_surgical_informedness( si_rev = flip_rate - k_fpr * counter_rate pmass_ratio = min(pmass_pos, pmass_neg) ** 2 - si = np.nanmean([si_fwd, si_rev]) * pmass_ratio * 100 + si_terms = np.asarray([si_fwd, si_rev], dtype=float) + si = float(np.nan) if np.isnan(si_terms).all() else float(np.nanmean(si_terms) * pmass_ratio * 100) return { "surgical_informedness": si, diff --git a/src/ws/eval/full_dd_benchmark.py b/src/ws/eval/full_dd_benchmark.py index 4369f1e..4faba24 100644 --- a/src/ws/eval/full_dd_benchmark.py +++ b/src/ws/eval/full_dd_benchmark.py @@ -70,7 +70,7 @@ def main(cfg: FullDDBenchmarkCfg) -> None: tok = AutoTokenizer.from_pretrained(cfg.model) if tok.pad_token is None: tok.pad_token = tok.eos_token - model = AutoModelForCausalLM.from_pretrained(cfg.model, torch_dtype=torch.bfloat16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(cfg.model, dtype=torch.bfloat16, device_map="auto") model.eval() parts = [] @@ -84,7 +84,7 @@ def main(cfg: FullDDBenchmarkCfg) -> None: for adapter in cfg.adapters: w_path = cfg.out / cfg.behavior / adapter / DIFF_FILENAME w = load_diff(w_path) - logger.info(f"adapter={adapter}: evaluating full DD from {w_path}") + logger.info(f"\n=== adapter={adapter} ===") df = evaluate(dcfg, w, model=model, tok=tok).with_columns(pl.lit(adapter).alias("adapter")) parts.append(df) @@ -122,4 +122,4 @@ def main(cfg: FullDDBenchmarkCfg) -> None: if __name__ == "__main__": - main(tyro.cli(FullDDBenchmarkCfg)) \ No newline at end of file + main(tyro.cli(FullDDBenchmarkCfg)) diff --git a/src/ws/eval/guided_cot.py b/src/ws/eval/guided_cot.py index 86f83ed..1bab9e8 100644 --- a/src/ws/eval/guided_cot.py +++ b/src/ws/eval/guided_cot.py @@ -27,6 +27,9 @@ Qwen3 thinking-mode gotchas: from __future__ import annotations +from copy import deepcopy +from contextlib import contextmanager + import torch from torch import Tensor @@ -42,6 +45,25 @@ THINK_CLOSE = "" DILEMMAS_ANCHOR = "\n\nMy choice:" +@contextmanager +def _greedy_generation(model): + """Temporarily sanitize model generation config for greedy eval.""" + old_cfg = deepcopy(model.generation_config) + try: + model.generation_config.do_sample = False + if hasattr(model.generation_config, "temperature"): + model.generation_config.temperature = 1.0 + if hasattr(model.generation_config, "top_p"): + model.generation_config.top_p = 1.0 + if hasattr(model.generation_config, "top_k"): + model.generation_config.top_k = 50 + if hasattr(model.generation_config, "min_p"): + model.generation_config.min_p = None + yield + finally: + model.generation_config = old_cfg + + @torch.no_grad() def guided_cot_one( model, @@ -68,12 +90,13 @@ def guided_cot_one( "this eval assumes a thinking-mode chat template") with weight_steer(model, w, alpha): - gen = model.generate( - prefix_ids, - max_new_tokens=n_think, - do_sample=False, - pad_token_id=tok.pad_token_id or tok.eos_token_id, - ) + with _greedy_generation(model): + gen = model.generate( + prefix_ids, + max_new_tokens=n_think, + do_sample=False, + pad_token_id=tok.pad_token_id or tok.eos_token_id, + ) gen_new = gen[0, prefix_ids.shape[1]:] already_closed = (gen_new == think_close_id).any().item() pre_ids = tok(PRE_CLOSE, return_tensors="pt", @@ -157,14 +180,15 @@ def guided_rollout_batch( with weight_steer(model, w, alpha): # Phase 1: batched greedy think under steering. - gen = model.generate( - input_ids=input_ids, - attention_mask=attention_mask, - max_new_tokens=n_think, - do_sample=False, - eos_token_id=think_close_id, - pad_token_id=pad_id, - ) + with _greedy_generation(model): + gen = model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=n_think, + do_sample=False, + eos_token_id=think_close_id, + pad_token_id=pad_id, + ) gen_new = gen[:, L_pad:] # [B, g], right-padded with pad_id post-eos # Phase 2: per-sample slice + suffix build.