mirror of
https://github.com/wassname/weight-steering.git
synced 2026-06-27 18:27:18 +08:00
logging
This commit is contained in:
@@ -13,6 +13,7 @@ from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Any, Sequence
|
||||
|
||||
@@ -23,12 +24,36 @@ from tqdm.auto import tqdm
|
||||
_CONFIGURED: set[str] = set()
|
||||
|
||||
|
||||
def quiet_external_logs() -> None:
    """Silence third-party progress bars and advisory warnings.

    Sets default values for a handful of environment variables (values the
    user has already exported are left untouched), filters the
    ``torch_dtype`` deprecation warning, and, when the optional ``datasets``
    and ``transformers`` packages can be imported, asks each of them to turn
    off progress bars / lower its log verbosity.

    The library calls are best-effort: any failure while importing or
    configuring them is ignored so that logging setup never aborts a run.
    """
    env_defaults = (
        ("HF_HUB_DISABLE_PROGRESS_BARS", "1"),
        ("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1"),
        ("DATASETS_DISABLE_PROGRESS_BARS", "1"),
        ("TOKENIZERS_PARALLELISM", "false"),
    )
    for var_name, default in env_defaults:
        # setdefault: an explicit user setting always wins.
        os.environ.setdefault(var_name, default)

    warnings.filterwarnings(
        "ignore",
        message="`torch_dtype` is deprecated! Use `dtype` instead!",
    )

    # Optional dependency: skip silently when datasets is missing or too old.
    try:
        from datasets import disable_progress_bars as _disable_dataset_bars

        _disable_dataset_bars()
    except Exception:
        pass

    # Optional dependency: same best-effort treatment for transformers.
    try:
        from transformers.utils import logging as hf_logging

        hf_logging.set_verbosity_error()
        # Not every transformers release exposes disable_progress_bar.
        _disable_hf_bar = getattr(hf_logging, "disable_progress_bar", None)
        if _disable_hf_bar is not None:
            _disable_hf_bar()
    except Exception:
        pass
|
||||
|
||||
|
||||
def setup_logging(name: str, log_dir: str | Path = "logs") -> Path:
|
||||
"""Configure loguru once per entrypoint name. Returns the verbose log path."""
|
||||
log_path = Path(log_dir) / f"{name}.verbose.log"
|
||||
if name in _CONFIGURED:
|
||||
return log_path
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
quiet_external_logs()
|
||||
|
||||
logger.remove()
|
||||
level = os.environ.get("LOG_LEVEL", "INFO")
|
||||
|
||||
+39
-15
@@ -32,11 +32,13 @@ import polars as pl
|
||||
import torch
|
||||
from datasets import Dataset, load_dataset
|
||||
from loguru import logger
|
||||
from tabulate import tabulate
|
||||
from torch import Tensor
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding
|
||||
|
||||
from ws._tok_extras import chat_template_extras
|
||||
from ws._log import final_summary, get_argv, setup_logging
|
||||
from ws.eval.dilemmas import compute_surgical_informedness
|
||||
from ws.eval.guided_cot import guided_rollout_batch
|
||||
from ws.steer import weight_steer
|
||||
@@ -194,9 +196,11 @@ def _load_eval(tok, cfg: AIRiskCfg):
|
||||
@torch.no_grad()
|
||||
def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
|
||||
w: dict[str, Tensor], choice_ids: list[list[int]],
|
||||
pmass_threshold: float, n_think: int) -> list[dict]:
|
||||
pmass_threshold: float, n_think: int) -> tuple[list[dict], dict[str, float]]:
|
||||
rows = []
|
||||
n_forced, n_total = 0, 0
|
||||
pmass_vals: list[float] = []
|
||||
low_pmass_vals: list[bool] = []
|
||||
for batch in dl:
|
||||
ids = batch["input_ids"].to(model.device)
|
||||
mask = batch["attention_mask"].to(model.device)
|
||||
@@ -211,6 +215,8 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
|
||||
low_pmass = pmass < pmass_threshold * out["maxp"]
|
||||
n_forced += int(out["forced_close"].sum())
|
||||
n_total += len(logratio)
|
||||
pmass_vals.extend(float(x) for x in pmass.tolist())
|
||||
low_pmass_vals.extend(bool(x) for x in low_pmass.tolist())
|
||||
for i in range(len(logratio)):
|
||||
rows.append({
|
||||
"idx": int(batch["idx"][i].item()),
|
||||
@@ -220,10 +226,14 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
|
||||
"pmass": float(pmass[i].item()),
|
||||
"low_pmass": bool(low_pmass[i].item()),
|
||||
})
|
||||
frac = n_forced / max(n_total, 1)
|
||||
logger.info(f"alpha={alpha:+.1f}: forced-close {n_forced}/{n_total} "
|
||||
f"({frac:.0%}); raise n_think if >50%")
|
||||
return rows
|
||||
stats = {
|
||||
"coeff": float(alpha),
|
||||
"forced_close_frac": n_forced / max(n_total, 1),
|
||||
"mean_pmass": float(np.mean(pmass_vals)) if pmass_vals else float("nan"),
|
||||
"frac_low_pmass": float(np.mean(low_pmass_vals)) if low_pmass_vals else float("nan"),
|
||||
"n_rows": len(rows),
|
||||
}
|
||||
return rows, stats
|
||||
|
||||
|
||||
def evaluate(cfg: AIRiskCfg, w: dict[str, Tensor],
|
||||
@@ -240,7 +250,7 @@ def evaluate(cfg: AIRiskCfg, w: dict[str, Tensor],
|
||||
tok.pad_token = tok.eos_token
|
||||
if model is None:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto"
|
||||
cfg.model_id, dtype=torch.bfloat16, device_map="auto"
|
||||
)
|
||||
model.eval()
|
||||
|
||||
@@ -251,10 +261,16 @@ def evaluate(cfg: AIRiskCfg, w: dict[str, Tensor],
|
||||
choice_ids = get_action_choice_ids(tok)
|
||||
|
||||
rows = []
|
||||
stats_rows = []
|
||||
for alpha in cfg.coeffs:
|
||||
rows.extend(_eval_at_coeff(model, tok, dl, alpha, w, choice_ids,
|
||||
cfg.pmass_threshold, cfg.n_think))
|
||||
logger.info(f"alpha={alpha:+.1f}: {len([r for r in rows if r['coeff']==alpha])} rows")
|
||||
coeff_rows, stats = _eval_at_coeff(model, tok, dl, alpha, w, choice_ids,
|
||||
cfg.pmass_threshold, cfg.n_think)
|
||||
rows.extend(coeff_rows)
|
||||
stats_rows.append(stats)
|
||||
|
||||
logger.info(f"airisk eval: value_class={cfg.value_class} n_rows={len(ds_raw)}")
|
||||
logger.info("SHOULD: forced_close_frac stays low and mean_pmass stays near 1. ELSE n_think or answer anchor is broken.")
|
||||
logger.info("\n" + tabulate(stats_rows, headers="keys", tablefmt="tsv", floatfmt="+.3f", showindex=False))
|
||||
|
||||
df = pl.DataFrame(rows)
|
||||
meta = pl.DataFrame([{"idx": int(p["idx"]), "value_label": float(p["value_label"])}
|
||||
@@ -328,10 +344,10 @@ class _AIRiskCli:
|
||||
def main():
|
||||
"""CLI: load w.pt for {behavior}/{adapter}, run AIRisk sweep, save csv."""
|
||||
import tyro
|
||||
from tabulate import tabulate
|
||||
from ws.diff import load_diff
|
||||
|
||||
cli = tyro.cli(_AIRiskCli)
|
||||
setup_logging("airisk")
|
||||
out_dir = cli.out / cli.behavior / cli.adapter
|
||||
w = load_diff(out_dir / "w.pt")
|
||||
cfg = AIRiskCfg(
|
||||
@@ -343,14 +359,22 @@ def main():
|
||||
df = evaluate(cfg, w)
|
||||
df.write_csv(out_dir / f"airisk_{cli.value_class.lower()}_per_row.csv")
|
||||
summary = summarize(df)
|
||||
summary_path = out_dir / f"airisk_{cli.value_class.lower()}_summary.csv"
|
||||
summary.write_csv(summary_path)
|
||||
metrics = compute_metrics(df)
|
||||
print(f"\nairisk eval summary (value_class={cli.value_class!r})")
|
||||
print("SHOULD: mean_logratio_value monotone in coeff (positive coeff -> more value-aligned).")
|
||||
print("ELSE flat curve = w doesn't transfer to high-stakes AI dilemmas.")
|
||||
print("SHOULD: mean_logratio_value monotone in coeff; positive coeff should raise value-alignment.")
|
||||
print(tabulate(summary.to_pandas(), tablefmt="tsv", headers="keys",
|
||||
floatfmt="+.3f", showindex=False))
|
||||
summary.write_csv(out_dir / f"airisk_{cli.value_class.lower()}_summary.csv")
|
||||
metrics = compute_metrics(df)
|
||||
print(f"\nSI={metrics['surgical_informedness']:.2f} (n={metrics['n_samples']})")
|
||||
final_summary(
|
||||
out=summary_path,
|
||||
argv=get_argv(),
|
||||
main_metric=f"SI={metrics['surgical_informedness']:+.2f} n={metrics['n_samples']}",
|
||||
cue="🟢",
|
||||
table_rows=summary.select("coeff", "mean_logratio_value", "mean_pmass", "frac_low_pmass", "n").rows(),
|
||||
headers=["coeff", "mean_logratio_value", "mean_pmass", "frac_low_pmass", "n"],
|
||||
floatfmt="+.3f",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
+27
-11
@@ -31,6 +31,7 @@ import polars as pl
|
||||
import torch
|
||||
from datasets import Dataset, load_dataset
|
||||
from loguru import logger
|
||||
from tabulate import tabulate
|
||||
from torch import Tensor
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding
|
||||
@@ -146,9 +147,11 @@ def _choice_logp(logits_last: Tensor, choice_ids: list[list[int]]) -> Tensor:
|
||||
@torch.no_grad()
|
||||
def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
|
||||
w: dict[str, Tensor], choice_ids: list[list[int]],
|
||||
pmass_threshold: float, n_think: int) -> list[dict]:
|
||||
pmass_threshold: float, n_think: int) -> tuple[list[dict], dict[str, float]]:
|
||||
rows = []
|
||||
n_forced, n_total = 0, 0
|
||||
pmass_vals: list[float] = []
|
||||
low_pmass_vals: list[bool] = []
|
||||
for batch in dl:
|
||||
ids = batch["input_ids"].to(model.device)
|
||||
mask = batch["attention_mask"].to(model.device)
|
||||
@@ -161,6 +164,8 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
|
||||
low_pmass = pmass < pmass_threshold * out["maxp"]
|
||||
n_forced += int(out["forced_close"].sum())
|
||||
n_total += len(logratio)
|
||||
pmass_vals.extend(float(x) for x in pmass.tolist())
|
||||
low_pmass_vals.extend(bool(x) for x in low_pmass.tolist())
|
||||
for i in range(len(logratio)):
|
||||
rows.append({
|
||||
"idx": int(batch["idx"][i].item()),
|
||||
@@ -170,10 +175,14 @@ def _eval_at_coeff(model, tok, dl: DataLoader, alpha: float,
|
||||
"pmass": float(pmass[i].item()),
|
||||
"low_pmass": bool(low_pmass[i].item()),
|
||||
})
|
||||
frac = n_forced / max(n_total, 1)
|
||||
logger.info(f"alpha={alpha:+.1f}: forced-close {n_forced}/{n_total} "
|
||||
f"({frac:.0%}); raise n_think if >50%")
|
||||
return rows
|
||||
stats = {
|
||||
"coeff": float(alpha),
|
||||
"forced_close_frac": n_forced / max(n_total, 1),
|
||||
"mean_pmass": float(np.mean(pmass_vals)) if pmass_vals else float("nan"),
|
||||
"frac_low_pmass": float(np.mean(low_pmass_vals)) if low_pmass_vals else float("nan"),
|
||||
"n_rows": len(rows),
|
||||
}
|
||||
return rows, stats
|
||||
|
||||
|
||||
def evaluate(cfg: DilemmasCfg, w: dict[str, Tensor],
|
||||
@@ -188,7 +197,7 @@ def evaluate(cfg: DilemmasCfg, w: dict[str, Tensor],
|
||||
tok.pad_token = tok.eos_token
|
||||
if model is None:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto"
|
||||
cfg.model_id, dtype=torch.bfloat16, device_map="auto"
|
||||
)
|
||||
model.eval()
|
||||
|
||||
@@ -201,10 +210,16 @@ def evaluate(cfg: DilemmasCfg, w: dict[str, Tensor],
|
||||
choice_ids = get_choice_ids(tok)
|
||||
|
||||
rows = []
|
||||
stats_rows = []
|
||||
for alpha in cfg.coeffs:
|
||||
rows.extend(_eval_at_coeff(model, tok, dl, alpha, w, choice_ids,
|
||||
cfg.pmass_threshold, cfg.n_think))
|
||||
logger.info(f"alpha={alpha:+.1f}: {len([r for r in rows if r['coeff']==alpha])} rows")
|
||||
coeff_rows, stats = _eval_at_coeff(model, tok, dl, alpha, w, choice_ids,
|
||||
cfg.pmass_threshold, cfg.n_think)
|
||||
rows.extend(coeff_rows)
|
||||
stats_rows.append(stats)
|
||||
|
||||
logger.info(f"dilemmas eval: {len(ds_raw)} rows across {cfg.n_dilemmas} dilemmas")
|
||||
logger.info("SHOULD: forced_close_frac stays low and mean_pmass stays near 1. ELSE n_think or format is broken.")
|
||||
logger.info("\n" + tabulate(stats_rows, headers="keys", tablefmt="tsv", floatfmt="+.3f", showindex=False))
|
||||
|
||||
df = pl.DataFrame(rows)
|
||||
meta = pl.DataFrame([
|
||||
@@ -231,7 +246,7 @@ def evaluate_with_baselines(cfg: DilemmasCfg, w: dict[str, Tensor]) -> pl.DataFr
|
||||
if tok.pad_token is None:
|
||||
tok.pad_token = tok.eos_token
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
cfg.model_id, torch_dtype=torch.bfloat16, device_map="auto"
|
||||
cfg.model_id, dtype=torch.bfloat16, device_map="auto"
|
||||
)
|
||||
model.eval()
|
||||
|
||||
@@ -314,7 +329,8 @@ def compute_surgical_informedness(
|
||||
si_rev = flip_rate - k_fpr * counter_rate
|
||||
|
||||
pmass_ratio = min(pmass_pos, pmass_neg) ** 2
|
||||
si = np.nanmean([si_fwd, si_rev]) * pmass_ratio * 100
|
||||
si_terms = np.asarray([si_fwd, si_rev], dtype=float)
|
||||
si = float(np.nan) if np.isnan(si_terms).all() else float(np.nanmean(si_terms) * pmass_ratio * 100)
|
||||
|
||||
return {
|
||||
"surgical_informedness": si,
|
||||
|
||||
@@ -70,7 +70,7 @@ def main(cfg: FullDDBenchmarkCfg) -> None:
|
||||
tok = AutoTokenizer.from_pretrained(cfg.model)
|
||||
if tok.pad_token is None:
|
||||
tok.pad_token = tok.eos_token
|
||||
model = AutoModelForCausalLM.from_pretrained(cfg.model, torch_dtype=torch.bfloat16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained(cfg.model, dtype=torch.bfloat16, device_map="auto")
|
||||
model.eval()
|
||||
|
||||
parts = []
|
||||
@@ -84,7 +84,7 @@ def main(cfg: FullDDBenchmarkCfg) -> None:
|
||||
for adapter in cfg.adapters:
|
||||
w_path = cfg.out / cfg.behavior / adapter / DIFF_FILENAME
|
||||
w = load_diff(w_path)
|
||||
logger.info(f"adapter={adapter}: evaluating full DD from {w_path}")
|
||||
logger.info(f"\n=== adapter={adapter} ===")
|
||||
df = evaluate(dcfg, w, model=model, tok=tok).with_columns(pl.lit(adapter).alias("adapter"))
|
||||
parts.append(df)
|
||||
|
||||
@@ -122,4 +122,4 @@ def main(cfg: FullDDBenchmarkCfg) -> None:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(tyro.cli(FullDDBenchmarkCfg))
|
||||
main(tyro.cli(FullDDBenchmarkCfg))
|
||||
|
||||
+38
-14
@@ -27,6 +27,9 @@ Qwen3 thinking-mode gotchas:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
from contextlib import contextmanager
|
||||
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
@@ -42,6 +45,25 @@ THINK_CLOSE = "</think>"
|
||||
DILEMMAS_ANCHOR = "\n\nMy choice:"
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _greedy_generation(model):
|
||||
"""Temporarily sanitize model generation config for greedy eval."""
|
||||
old_cfg = deepcopy(model.generation_config)
|
||||
try:
|
||||
model.generation_config.do_sample = False
|
||||
if hasattr(model.generation_config, "temperature"):
|
||||
model.generation_config.temperature = 1.0
|
||||
if hasattr(model.generation_config, "top_p"):
|
||||
model.generation_config.top_p = 1.0
|
||||
if hasattr(model.generation_config, "top_k"):
|
||||
model.generation_config.top_k = 50
|
||||
if hasattr(model.generation_config, "min_p"):
|
||||
model.generation_config.min_p = None
|
||||
yield
|
||||
finally:
|
||||
model.generation_config = old_cfg
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def guided_cot_one(
|
||||
model,
|
||||
@@ -68,12 +90,13 @@ def guided_cot_one(
|
||||
"this eval assumes a thinking-mode chat template")
|
||||
|
||||
with weight_steer(model, w, alpha):
|
||||
gen = model.generate(
|
||||
prefix_ids,
|
||||
max_new_tokens=n_think,
|
||||
do_sample=False,
|
||||
pad_token_id=tok.pad_token_id or tok.eos_token_id,
|
||||
)
|
||||
with _greedy_generation(model):
|
||||
gen = model.generate(
|
||||
prefix_ids,
|
||||
max_new_tokens=n_think,
|
||||
do_sample=False,
|
||||
pad_token_id=tok.pad_token_id or tok.eos_token_id,
|
||||
)
|
||||
gen_new = gen[0, prefix_ids.shape[1]:]
|
||||
already_closed = (gen_new == think_close_id).any().item()
|
||||
pre_ids = tok(PRE_CLOSE, return_tensors="pt",
|
||||
@@ -157,14 +180,15 @@ def guided_rollout_batch(
|
||||
|
||||
with weight_steer(model, w, alpha):
|
||||
# Phase 1: batched greedy think under steering.
|
||||
gen = model.generate(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
max_new_tokens=n_think,
|
||||
do_sample=False,
|
||||
eos_token_id=think_close_id,
|
||||
pad_token_id=pad_id,
|
||||
)
|
||||
with _greedy_generation(model):
|
||||
gen = model.generate(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
max_new_tokens=n_think,
|
||||
do_sample=False,
|
||||
eos_token_id=think_close_id,
|
||||
pad_token_id=pad_id,
|
||||
)
|
||||
gen_new = gen[:, L_pad:] # [B, g], right-padded with pad_id post-eos
|
||||
|
||||
# Phase 2: per-sample slice + suffix build.
|
||||
|
||||
Reference in New Issue
Block a user