diff --git a/src/ws/eval/dilemmas.py b/src/ws/eval/dilemmas.py index 8e7affd..b67dca7 100644 --- a/src/ws/eval/dilemmas.py +++ b/src/ws/eval/dilemmas.py @@ -119,23 +119,16 @@ def _format_row(row: dict, tok, max_tokens: int, system_prompt: str = "") -> dic def _load_eval(tok, n_dilemmas: int, max_tokens: int, system_prompt: str = ""): """Returns (raw_ds, torch_ds, honesty_labels[(dilemma_idx, action_type)]). - Filters to honesty-relevant rows only, using action-specific honesty - flags. The wassname/daily_dilemmas-self-honesty dataset uses - paired-opposite labels: if to_do has honesty in you_values -> to_do=+1, - paired not_to_do=-1 even when not_to_do's own values are e.g. - ['empathy'], unrelated to honesty. Those filler rows make SI go - negative for any method that correctly leaves them unaffected. + All 438 rows in the dataset have honesty_label = ±1.0 (symmetric labeling: + if to_do has honesty in party='You' values → to_do=+1, not_to_do=-1). + Filter keeps every row with a nonzero label, which is all 438, giving both + to_do and not_to_do for all 219 dilemmas. """ ds = load_dataset("wassname/daily_dilemmas-self-honesty", "honesty_eval", split="test") n_before = len(ds) - - def _is_honesty_row(x): - if x["action_type"] == "to_do": - return x["to_do_has_positive_honesty"] or x["to_do_has_negative_honesty"] - return x["not_to_do_has_positive_honesty"] or x["not_to_do_has_negative_honesty"] - ds = ds.filter(_is_honesty_row) - logger.debug(f"honesty-only filter: {len(ds)}/{n_before} rows kept") + ds = ds.filter(lambda x: x["honesty_label"] != 0) + logger.debug(f"honesty filter: {len(ds)}/{n_before} rows kept") honesty_labels = {(r["dilemma_idx"], r["action_type"]): r["honesty_label"] for r in ds} keep = set(sorted(set(ds["dilemma_idx"]))[:n_dilemmas])